• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 #include "libyuv/scale_row.h"
13 
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18 
19 // This module is for 32 bit Visual C x86
20 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
21     !defined(__clang__) && defined(_M_IX86)
22 
23 // Offsets for source bytes 0 to 9
24 static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
25                              128, 128, 128, 128, 128, 128, 128, 128};
26 
27 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
28 static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
29                              128, 128, 128, 128, 128, 128, 128, 128};
30 
31 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
32 static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
33                              128, 128, 128, 128, 128, 128, 128, 128};
34 
35 // Offsets for source bytes 0 to 10
36 static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
37 
38 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
39 static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
40                               8, 9, 9, 10, 10, 11, 12, 13};
41 
42 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
43 static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
44                               10, 11, 12, 13, 13, 14, 14, 15};
45 
46 // Coefficients for source bytes 0 to 10
47 static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
48 
49 // Coefficients for source bytes 10 to 21
50 static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
51 
52 // Coefficients for source bytes 21 to 31
53 static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
54 
55 // Coefficients for source bytes 21 to 31
56 static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
57 
58 static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
59                                128, 128, 128, 128, 128, 128, 128, 128};
60 
61 static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
62                                6,   8,   11,  14,  128, 128, 128, 128};
63 
64 // Arrange words 0,3,6 into 0,1,2
65 static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
66                               128, 128, 128, 128, 128, 128, 128, 128};
67 
68 // Arrange words 0,3,6 into 3,4,5
69 static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
70                                6,   7,   12,  13,  128, 128, 128, 128};
71 
72 // Scaling values for boxes of 3x3 and 2x3
73 static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
74                                   65536 / 9, 65536 / 6, 0,         0};
75 
76 // Arrange first value for pixels 0,1,2,3,4,5
77 static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
78                                11, 128, 14, 128, 128, 128, 128, 128};
79 
80 // Arrange second value for pixels 0,1,2,3,4,5
81 static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
82                                12, 128, 15, 128, 128, 128, 128, 128};
83 
84 // Arrange third value for pixels 0,1,2,3,4,5
85 static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
86                                13, 128, 128, 128, 128, 128, 128, 128};
87 
88 // Scaling values for boxes of 3x2 and 2x2
89 static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
90                                  65536 / 3, 65536 / 2, 0,         0};
91 
92 // Reads 32 pixels, throws half away and writes 16 pixels.
ScaleRowDown2_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)93 __declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
94                                            ptrdiff_t src_stride,
95                                            uint8_t* dst_ptr,
96                                            int dst_width) {
97   __asm {
98     mov        eax, [esp + 4]  // src_ptr
99     // src_stride ignored
100     mov        edx, [esp + 12]  // dst_ptr
101     mov        ecx, [esp + 16]  // dst_width
102 
103   wloop:
104     movdqu     xmm0, [eax]
105     movdqu     xmm1, [eax + 16]
106     lea        eax,  [eax + 32]
107     psrlw      xmm0, 8          // isolate odd pixels.
108     psrlw      xmm1, 8
109     packuswb   xmm0, xmm1
110     movdqu     [edx], xmm0
111     lea        edx, [edx + 16]
112     sub        ecx, 16
113     jg         wloop
114 
115     ret
116   }
117 }
118 
119 // Blends 32x1 rectangle to 16x1.
ScaleRowDown2Linear_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)120 __declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
121                                                  ptrdiff_t src_stride,
122                                                  uint8_t* dst_ptr,
123                                                  int dst_width) {
124   __asm {
125     mov        eax, [esp + 4]  // src_ptr
126     // src_stride
127     mov        edx, [esp + 12]  // dst_ptr
128     mov        ecx, [esp + 16]  // dst_width
129 
130     pcmpeqb    xmm4, xmm4  // constant 0x0101
131     psrlw      xmm4, 15
132     packuswb   xmm4, xmm4
133     pxor       xmm5, xmm5  // constant 0
134 
135   wloop:
136     movdqu     xmm0, [eax]
137     movdqu     xmm1, [eax + 16]
138     lea        eax,  [eax + 32]
139     pmaddubsw  xmm0, xmm4  // horizontal add
140     pmaddubsw  xmm1, xmm4
141     pavgw      xmm0, xmm5       // (x + 1) / 2
142     pavgw      xmm1, xmm5
143     packuswb   xmm0, xmm1
144     movdqu     [edx], xmm0
145     lea        edx, [edx + 16]
146     sub        ecx, 16
147     jg         wloop
148 
149     ret
150   }
151 }
152 
153 // Blends 32x2 rectangle to 16x1.
ScaleRowDown2Box_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)154 __declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
155                                               ptrdiff_t src_stride,
156                                               uint8_t* dst_ptr,
157                                               int dst_width) {
158   __asm {
159     push       esi
160     mov        eax, [esp + 4 + 4]  // src_ptr
161     mov        esi, [esp + 4 + 8]  // src_stride
162     mov        edx, [esp + 4 + 12]  // dst_ptr
163     mov        ecx, [esp + 4 + 16]  // dst_width
164 
165     pcmpeqb    xmm4, xmm4  // constant 0x0101
166     psrlw      xmm4, 15
167     packuswb   xmm4, xmm4
168     pxor       xmm5, xmm5  // constant 0
169 
170   wloop:
171     movdqu     xmm0, [eax]
172     movdqu     xmm1, [eax + 16]
173     movdqu     xmm2, [eax + esi]
174     movdqu     xmm3, [eax + esi + 16]
175     lea        eax,  [eax + 32]
176     pmaddubsw  xmm0, xmm4  // horizontal add
177     pmaddubsw  xmm1, xmm4
178     pmaddubsw  xmm2, xmm4
179     pmaddubsw  xmm3, xmm4
180     paddw      xmm0, xmm2  // vertical add
181     paddw      xmm1, xmm3
182     psrlw      xmm0, 1
183     psrlw      xmm1, 1
184     pavgw      xmm0, xmm5  // (x + 1) / 2
185     pavgw      xmm1, xmm5
186     packuswb   xmm0, xmm1
187     movdqu     [edx], xmm0
188     lea        edx, [edx + 16]
189     sub        ecx, 16
190     jg         wloop
191 
192     pop        esi
193     ret
194   }
195 }
196 
197 #ifdef HAS_SCALEROWDOWN2_AVX2
198 // Reads 64 pixels, throws half away and writes 32 pixels.
ScaleRowDown2_AVX2(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)199 __declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
200                                           ptrdiff_t src_stride,
201                                           uint8_t* dst_ptr,
202                                           int dst_width) {
203   __asm {
204     mov        eax, [esp + 4]  // src_ptr
205     // src_stride ignored
206     mov        edx, [esp + 12]  // dst_ptr
207     mov        ecx, [esp + 16]  // dst_width
208 
209   wloop:
210     vmovdqu     ymm0, [eax]
211     vmovdqu     ymm1, [eax + 32]
212     lea         eax,  [eax + 64]
213     vpsrlw      ymm0, ymm0, 8  // isolate odd pixels.
214     vpsrlw      ymm1, ymm1, 8
215     vpackuswb   ymm0, ymm0, ymm1
216     vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
217     vmovdqu     [edx], ymm0
218     lea         edx, [edx + 32]
219     sub         ecx, 32
220     jg          wloop
221 
222     vzeroupper
223     ret
224   }
225 }
226 
227 // Blends 64x1 rectangle to 32x1.
ScaleRowDown2Linear_AVX2(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)228 __declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
229                                                 ptrdiff_t src_stride,
230                                                 uint8_t* dst_ptr,
231                                                 int dst_width) {
232   __asm {
233     mov         eax, [esp + 4]  // src_ptr
234     // src_stride
235     mov         edx, [esp + 12]  // dst_ptr
236     mov         ecx, [esp + 16]  // dst_width
237 
238     vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
239     vpsrlw      ymm4, ymm4, 15
240     vpackuswb   ymm4, ymm4, ymm4
241     vpxor       ymm5, ymm5, ymm5  // constant 0
242 
243   wloop:
244     vmovdqu     ymm0, [eax]
245     vmovdqu     ymm1, [eax + 32]
246     lea         eax,  [eax + 64]
247     vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
248     vpmaddubsw  ymm1, ymm1, ymm4
249     vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
250     vpavgw      ymm1, ymm1, ymm5
251     vpackuswb   ymm0, ymm0, ymm1
252     vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
253     vmovdqu     [edx], ymm0
254     lea         edx, [edx + 32]
255     sub         ecx, 32
256     jg          wloop
257 
258     vzeroupper
259     ret
260   }
261 }
262 
263 // For rounding, average = (sum + 2) / 4
264 // becomes average((sum >> 1), 0)
265 // Blends 64x2 rectangle to 32x1.
ScaleRowDown2Box_AVX2(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)266 __declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
267                                              ptrdiff_t src_stride,
268                                              uint8_t* dst_ptr,
269                                              int dst_width) {
270   __asm {
271     push        esi
272     mov         eax, [esp + 4 + 4]  // src_ptr
273     mov         esi, [esp + 4 + 8]  // src_stride
274     mov         edx, [esp + 4 + 12]  // dst_ptr
275     mov         ecx, [esp + 4 + 16]  // dst_width
276 
277     vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
278     vpsrlw      ymm4, ymm4, 15
279     vpackuswb   ymm4, ymm4, ymm4
280     vpxor       ymm5, ymm5, ymm5  // constant 0
281 
282   wloop:
283     vmovdqu     ymm0, [eax]
284     vmovdqu     ymm1, [eax + 32]
285     vmovdqu     ymm2, [eax + esi]
286     vmovdqu     ymm3, [eax + esi + 32]
287     lea         eax,  [eax + 64]
288     vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
289     vpmaddubsw  ymm1, ymm1, ymm4
290     vpmaddubsw  ymm2, ymm2, ymm4
291     vpmaddubsw  ymm3, ymm3, ymm4
292     vpaddw      ymm0, ymm0, ymm2  // vertical add
293     vpaddw      ymm1, ymm1, ymm3
294     vpsrlw      ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
295     vpsrlw      ymm1, ymm1, 1
296     vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
297     vpavgw      ymm1, ymm1, ymm5
298     vpackuswb   ymm0, ymm0, ymm1
299     vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
300     vmovdqu     [edx], ymm0
301     lea         edx, [edx + 32]
302     sub         ecx, 32
303     jg          wloop
304 
305     pop         esi
306     vzeroupper
307     ret
308   }
309 }
310 #endif  // HAS_SCALEROWDOWN2_AVX2
311 
312 // Point samples 32 pixels to 8 pixels.
ScaleRowDown4_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)313 __declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
314                                            ptrdiff_t src_stride,
315                                            uint8_t* dst_ptr,
316                                            int dst_width) {
317   __asm {
318     mov        eax, [esp + 4]  // src_ptr
319     // src_stride ignored
320     mov        edx, [esp + 12]  // dst_ptr
321     mov        ecx, [esp + 16]  // dst_width
322     pcmpeqb    xmm5, xmm5       // generate mask 0x00ff0000
323     psrld      xmm5, 24
324     pslld      xmm5, 16
325 
326   wloop:
327     movdqu     xmm0, [eax]
328     movdqu     xmm1, [eax + 16]
329     lea        eax,  [eax + 32]
330     pand       xmm0, xmm5
331     pand       xmm1, xmm5
332     packuswb   xmm0, xmm1
333     psrlw      xmm0, 8
334     packuswb   xmm0, xmm0
335     movq       qword ptr [edx], xmm0
336     lea        edx, [edx + 8]
337     sub        ecx, 8
338     jg         wloop
339 
340     ret
341   }
342 }
343 
344 // Blends 32x4 rectangle to 8x1.
ScaleRowDown4Box_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)345 __declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
346                                               ptrdiff_t src_stride,
347                                               uint8_t* dst_ptr,
348                                               int dst_width) {
349   __asm {
350     push       esi
351     push       edi
352     mov        eax, [esp + 8 + 4]  // src_ptr
353     mov        esi, [esp + 8 + 8]  // src_stride
354     mov        edx, [esp + 8 + 12]  // dst_ptr
355     mov        ecx, [esp + 8 + 16]  // dst_width
356     lea        edi, [esi + esi * 2]  // src_stride * 3
357     pcmpeqb    xmm4, xmm4  // constant 0x0101
358     psrlw      xmm4, 15
359     movdqa     xmm5, xmm4
360     packuswb   xmm4, xmm4
361     psllw      xmm5, 3  // constant 0x0008
362 
363   wloop:
364     movdqu     xmm0, [eax]  // average rows
365     movdqu     xmm1, [eax + 16]
366     movdqu     xmm2, [eax + esi]
367     movdqu     xmm3, [eax + esi + 16]
368     pmaddubsw  xmm0, xmm4  // horizontal add
369     pmaddubsw  xmm1, xmm4
370     pmaddubsw  xmm2, xmm4
371     pmaddubsw  xmm3, xmm4
372     paddw      xmm0, xmm2  // vertical add rows 0, 1
373     paddw      xmm1, xmm3
374     movdqu     xmm2, [eax + esi * 2]
375     movdqu     xmm3, [eax + esi * 2 + 16]
376     pmaddubsw  xmm2, xmm4
377     pmaddubsw  xmm3, xmm4
378     paddw      xmm0, xmm2  // add row 2
379     paddw      xmm1, xmm3
380     movdqu     xmm2, [eax + edi]
381     movdqu     xmm3, [eax + edi + 16]
382     lea        eax, [eax + 32]
383     pmaddubsw  xmm2, xmm4
384     pmaddubsw  xmm3, xmm4
385     paddw      xmm0, xmm2  // add row 3
386     paddw      xmm1, xmm3
387     phaddw     xmm0, xmm1
388     paddw      xmm0, xmm5  // + 8 for round
389     psrlw      xmm0, 4  // /16 for average of 4 * 4
390     packuswb   xmm0, xmm0
391     movq       qword ptr [edx], xmm0
392     lea        edx, [edx + 8]
393     sub        ecx, 8
394     jg         wloop
395 
396     pop        edi
397     pop        esi
398     ret
399   }
400 }
401 
402 #ifdef HAS_SCALEROWDOWN4_AVX2
403 // Point samples 64 pixels to 16 pixels.
ScaleRowDown4_AVX2(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)404 __declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
405                                           ptrdiff_t src_stride,
406                                           uint8_t* dst_ptr,
407                                           int dst_width) {
408   __asm {
409     mov         eax, [esp + 4]  // src_ptr
410     // src_stride ignored
411     mov         edx, [esp + 12]  // dst_ptr
412     mov         ecx, [esp + 16]  // dst_width
413     vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0x00ff0000
414     vpsrld      ymm5, ymm5, 24
415     vpslld      ymm5, ymm5, 16
416 
417   wloop:
418     vmovdqu     ymm0, [eax]
419     vmovdqu     ymm1, [eax + 32]
420     lea         eax,  [eax + 64]
421     vpand       ymm0, ymm0, ymm5
422     vpand       ymm1, ymm1, ymm5
423     vpackuswb   ymm0, ymm0, ymm1
424     vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
425     vpsrlw      ymm0, ymm0, 8
426     vpackuswb   ymm0, ymm0, ymm0
427     vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
428     vmovdqu     [edx], xmm0
429     lea         edx, [edx + 16]
430     sub         ecx, 16
431     jg          wloop
432 
433     vzeroupper
434     ret
435   }
436 }
437 
438 // Blends 64x4 rectangle to 16x1.
ScaleRowDown4Box_AVX2(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)439 __declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
440                                              ptrdiff_t src_stride,
441                                              uint8_t* dst_ptr,
442                                              int dst_width) {
443   __asm {
444     push        esi
445     push        edi
446     mov         eax, [esp + 8 + 4]  // src_ptr
447     mov         esi, [esp + 8 + 8]  // src_stride
448     mov         edx, [esp + 8 + 12]  // dst_ptr
449     mov         ecx, [esp + 8 + 16]  // dst_width
450     lea         edi, [esi + esi * 2]  // src_stride * 3
451     vpcmpeqb    ymm4, ymm4, ymm4  // constant 0x0101
452     vpsrlw      ymm4, ymm4, 15
453     vpsllw      ymm5, ymm4, 3  // constant 0x0008
454     vpackuswb   ymm4, ymm4, ymm4
455 
456   wloop:
457     vmovdqu     ymm0, [eax]  // average rows
458     vmovdqu     ymm1, [eax + 32]
459     vmovdqu     ymm2, [eax + esi]
460     vmovdqu     ymm3, [eax + esi + 32]
461     vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
462     vpmaddubsw  ymm1, ymm1, ymm4
463     vpmaddubsw  ymm2, ymm2, ymm4
464     vpmaddubsw  ymm3, ymm3, ymm4
465     vpaddw      ymm0, ymm0, ymm2  // vertical add rows 0, 1
466     vpaddw      ymm1, ymm1, ymm3
467     vmovdqu     ymm2, [eax + esi * 2]
468     vmovdqu     ymm3, [eax + esi * 2 + 32]
469     vpmaddubsw  ymm2, ymm2, ymm4
470     vpmaddubsw  ymm3, ymm3, ymm4
471     vpaddw      ymm0, ymm0, ymm2  // add row 2
472     vpaddw      ymm1, ymm1, ymm3
473     vmovdqu     ymm2, [eax + edi]
474     vmovdqu     ymm3, [eax + edi + 32]
475     lea         eax,  [eax + 64]
476     vpmaddubsw  ymm2, ymm2, ymm4
477     vpmaddubsw  ymm3, ymm3, ymm4
478     vpaddw      ymm0, ymm0, ymm2  // add row 3
479     vpaddw      ymm1, ymm1, ymm3
480     vphaddw     ymm0, ymm0, ymm1  // mutates
481     vpermq      ymm0, ymm0, 0xd8  // unmutate vphaddw
482     vpaddw      ymm0, ymm0, ymm5  // + 8 for round
483     vpsrlw      ymm0, ymm0, 4  // /32 for average of 4 * 4
484     vpackuswb   ymm0, ymm0, ymm0
485     vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
486     vmovdqu     [edx], xmm0
487     lea         edx, [edx + 16]
488     sub         ecx, 16
489     jg          wloop
490 
491     pop        edi
492     pop        esi
493     vzeroupper
494     ret
495   }
496 }
497 #endif  // HAS_SCALEROWDOWN4_AVX2
498 
499 // Point samples 32 pixels to 24 pixels.
500 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
501 // Then shuffled to do the scaling.
502 
ScaleRowDown34_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)503 __declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
504                                             ptrdiff_t src_stride,
505                                             uint8_t* dst_ptr,
506                                             int dst_width) {
507   __asm {
508     mov        eax, [esp + 4]   // src_ptr
509     // src_stride ignored
510     mov        edx, [esp + 12]  // dst_ptr
511     mov        ecx, [esp + 16]  // dst_width
512     movdqa     xmm3, xmmword ptr kShuf0
513     movdqa     xmm4, xmmword ptr kShuf1
514     movdqa     xmm5, xmmword ptr kShuf2
515 
516   wloop:
517     movdqu     xmm0, [eax]
518     movdqu     xmm1, [eax + 16]
519     lea        eax,  [eax + 32]
520     movdqa     xmm2, xmm1
521     palignr    xmm1, xmm0, 8
522     pshufb     xmm0, xmm3
523     pshufb     xmm1, xmm4
524     pshufb     xmm2, xmm5
525     movq       qword ptr [edx], xmm0
526     movq       qword ptr [edx + 8], xmm1
527     movq       qword ptr [edx + 16], xmm2
528     lea        edx, [edx + 24]
529     sub        ecx, 24
530     jg         wloop
531 
532     ret
533   }
534 }
535 
536 // Blends 32x2 rectangle to 24x1
537 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
538 // Then shuffled to do the scaling.
539 
540 // Register usage:
541 // xmm0 src_row 0
542 // xmm1 src_row 1
543 // xmm2 shuf 0
544 // xmm3 shuf 1
545 // xmm4 shuf 2
546 // xmm5 madd 0
547 // xmm6 madd 1
548 // xmm7 kRound34
549 
550 // Note that movdqa+palign may be better than movdqu.
ScaleRowDown34_1_Box_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)551 __declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
552                                                   ptrdiff_t src_stride,
553                                                   uint8_t* dst_ptr,
554                                                   int dst_width) {
555   __asm {
556     push       esi
557     mov        eax, [esp + 4 + 4]  // src_ptr
558     mov        esi, [esp + 4 + 8]  // src_stride
559     mov        edx, [esp + 4 + 12]  // dst_ptr
560     mov        ecx, [esp + 4 + 16]  // dst_width
561     movdqa     xmm2, xmmword ptr kShuf01
562     movdqa     xmm3, xmmword ptr kShuf11
563     movdqa     xmm4, xmmword ptr kShuf21
564     movdqa     xmm5, xmmword ptr kMadd01
565     movdqa     xmm6, xmmword ptr kMadd11
566     movdqa     xmm7, xmmword ptr kRound34
567 
568   wloop:
569     movdqu     xmm0, [eax]  // pixels 0..7
570     movdqu     xmm1, [eax + esi]
571     pavgb      xmm0, xmm1
572     pshufb     xmm0, xmm2
573     pmaddubsw  xmm0, xmm5
574     paddsw     xmm0, xmm7
575     psrlw      xmm0, 2
576     packuswb   xmm0, xmm0
577     movq       qword ptr [edx], xmm0
578     movdqu     xmm0, [eax + 8]  // pixels 8..15
579     movdqu     xmm1, [eax + esi + 8]
580     pavgb      xmm0, xmm1
581     pshufb     xmm0, xmm3
582     pmaddubsw  xmm0, xmm6
583     paddsw     xmm0, xmm7
584     psrlw      xmm0, 2
585     packuswb   xmm0, xmm0
586     movq       qword ptr [edx + 8], xmm0
587     movdqu     xmm0, [eax + 16]  // pixels 16..23
588     movdqu     xmm1, [eax + esi + 16]
589     lea        eax, [eax + 32]
590     pavgb      xmm0, xmm1
591     pshufb     xmm0, xmm4
592     movdqa     xmm1, xmmword ptr kMadd21
593     pmaddubsw  xmm0, xmm1
594     paddsw     xmm0, xmm7
595     psrlw      xmm0, 2
596     packuswb   xmm0, xmm0
597     movq       qword ptr [edx + 16], xmm0
598     lea        edx, [edx + 24]
599     sub        ecx, 24
600     jg         wloop
601 
602     pop        esi
603     ret
604   }
605 }
606 
607 // Note that movdqa+palign may be better than movdqu.
ScaleRowDown34_0_Box_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)608 __declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
609                                                   ptrdiff_t src_stride,
610                                                   uint8_t* dst_ptr,
611                                                   int dst_width) {
612   __asm {
613     push       esi
614     mov        eax, [esp + 4 + 4]  // src_ptr
615     mov        esi, [esp + 4 + 8]  // src_stride
616     mov        edx, [esp + 4 + 12]  // dst_ptr
617     mov        ecx, [esp + 4 + 16]  // dst_width
618     movdqa     xmm2, xmmword ptr kShuf01
619     movdqa     xmm3, xmmword ptr kShuf11
620     movdqa     xmm4, xmmword ptr kShuf21
621     movdqa     xmm5, xmmword ptr kMadd01
622     movdqa     xmm6, xmmword ptr kMadd11
623     movdqa     xmm7, xmmword ptr kRound34
624 
625   wloop:
626     movdqu     xmm0, [eax]  // pixels 0..7
627     movdqu     xmm1, [eax + esi]
628     pavgb      xmm1, xmm0
629     pavgb      xmm0, xmm1
630     pshufb     xmm0, xmm2
631     pmaddubsw  xmm0, xmm5
632     paddsw     xmm0, xmm7
633     psrlw      xmm0, 2
634     packuswb   xmm0, xmm0
635     movq       qword ptr [edx], xmm0
636     movdqu     xmm0, [eax + 8]  // pixels 8..15
637     movdqu     xmm1, [eax + esi + 8]
638     pavgb      xmm1, xmm0
639     pavgb      xmm0, xmm1
640     pshufb     xmm0, xmm3
641     pmaddubsw  xmm0, xmm6
642     paddsw     xmm0, xmm7
643     psrlw      xmm0, 2
644     packuswb   xmm0, xmm0
645     movq       qword ptr [edx + 8], xmm0
646     movdqu     xmm0, [eax + 16]  // pixels 16..23
647     movdqu     xmm1, [eax + esi + 16]
648     lea        eax, [eax + 32]
649     pavgb      xmm1, xmm0
650     pavgb      xmm0, xmm1
651     pshufb     xmm0, xmm4
652     movdqa     xmm1, xmmword ptr kMadd21
653     pmaddubsw  xmm0, xmm1
654     paddsw     xmm0, xmm7
655     psrlw      xmm0, 2
656     packuswb   xmm0, xmm0
657     movq       qword ptr [edx + 16], xmm0
658     lea        edx, [edx+24]
659     sub        ecx, 24
660     jg         wloop
661 
662     pop        esi
663     ret
664   }
665 }
666 
667 // 3/8 point sampler
668 
669 // Scale 32 pixels to 12
ScaleRowDown38_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)670 __declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
671                                             ptrdiff_t src_stride,
672                                             uint8_t* dst_ptr,
673                                             int dst_width) {
674   __asm {
675     mov        eax, [esp + 4]  // src_ptr
676     // src_stride ignored
677     mov        edx, [esp + 12]  // dst_ptr
678     mov        ecx, [esp + 16]  // dst_width
679     movdqa     xmm4, xmmword ptr kShuf38a
680     movdqa     xmm5, xmmword ptr kShuf38b
681 
682   xloop:
683     movdqu     xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
684     movdqu     xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
685     lea        eax, [eax + 32]
686     pshufb     xmm0, xmm4
687     pshufb     xmm1, xmm5
688     paddusb    xmm0, xmm1
689 
690     movq       qword ptr [edx], xmm0       // write 12 pixels
691     movhlps    xmm1, xmm0
692     movd       [edx + 8], xmm1
693     lea        edx, [edx + 12]
694     sub        ecx, 12
695     jg         xloop
696 
697     ret
698   }
699 }
700 
701 // Scale 16x3 pixels to 6x1 with interpolation
ScaleRowDown38_3_Box_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)702 __declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
703                                                   ptrdiff_t src_stride,
704                                                   uint8_t* dst_ptr,
705                                                   int dst_width) {
706   __asm {
707     push       esi
708     mov        eax, [esp + 4 + 4]  // src_ptr
709     mov        esi, [esp + 4 + 8]  // src_stride
710     mov        edx, [esp + 4 + 12]  // dst_ptr
711     mov        ecx, [esp + 4 + 16]  // dst_width
712     movdqa     xmm2, xmmword ptr kShufAc
713     movdqa     xmm3, xmmword ptr kShufAc3
714     movdqa     xmm4, xmmword ptr kScaleAc33
715     pxor       xmm5, xmm5
716 
717   xloop:
718     movdqu     xmm0, [eax]  // sum up 3 rows into xmm0/1
719     movdqu     xmm6, [eax + esi]
720     movhlps    xmm1, xmm0
721     movhlps    xmm7, xmm6
722     punpcklbw  xmm0, xmm5
723     punpcklbw  xmm1, xmm5
724     punpcklbw  xmm6, xmm5
725     punpcklbw  xmm7, xmm5
726     paddusw    xmm0, xmm6
727     paddusw    xmm1, xmm7
728     movdqu     xmm6, [eax + esi * 2]
729     lea        eax, [eax + 16]
730     movhlps    xmm7, xmm6
731     punpcklbw  xmm6, xmm5
732     punpcklbw  xmm7, xmm5
733     paddusw    xmm0, xmm6
734     paddusw    xmm1, xmm7
735 
736     movdqa     xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
737     psrldq     xmm0, 2
738     paddusw    xmm6, xmm0
739     psrldq     xmm0, 2
740     paddusw    xmm6, xmm0
741     pshufb     xmm6, xmm2
742 
743     movdqa     xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
744     psrldq     xmm1, 2
745     paddusw    xmm7, xmm1
746     psrldq     xmm1, 2
747     paddusw    xmm7, xmm1
748     pshufb     xmm7, xmm3
749     paddusw    xmm6, xmm7
750 
751     pmulhuw    xmm6, xmm4  // divide by 9,9,6, 9,9,6
752     packuswb   xmm6, xmm6
753 
754     movd       [edx], xmm6  // write 6 pixels
755     psrlq      xmm6, 16
756     movd       [edx + 2], xmm6
757     lea        edx, [edx + 6]
758     sub        ecx, 6
759     jg         xloop
760 
761     pop        esi
762     ret
763   }
764 }
765 
766 // Scale 16x2 pixels to 6x1 with interpolation
ScaleRowDown38_2_Box_SSSE3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)767 __declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
768                                                   ptrdiff_t src_stride,
769                                                   uint8_t* dst_ptr,
770                                                   int dst_width) {
771   __asm {
772     push       esi
773     mov        eax, [esp + 4 + 4]  // src_ptr
774     mov        esi, [esp + 4 + 8]  // src_stride
775     mov        edx, [esp + 4 + 12]  // dst_ptr
776     mov        ecx, [esp + 4 + 16]  // dst_width
777     movdqa     xmm2, xmmword ptr kShufAb0
778     movdqa     xmm3, xmmword ptr kShufAb1
779     movdqa     xmm4, xmmword ptr kShufAb2
780     movdqa     xmm5, xmmword ptr kScaleAb2
781 
782   xloop:
783     movdqu     xmm0, [eax]  // average 2 rows into xmm0
784     movdqu     xmm1, [eax + esi]
785     lea        eax, [eax + 16]
786     pavgb      xmm0, xmm1
787 
788     movdqa     xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
789     pshufb     xmm1, xmm2
790     movdqa     xmm6, xmm0
791     pshufb     xmm6, xmm3
792     paddusw    xmm1, xmm6
793     pshufb     xmm0, xmm4
794     paddusw    xmm1, xmm0
795 
796     pmulhuw    xmm1, xmm5  // divide by 3,3,2, 3,3,2
797     packuswb   xmm1, xmm1
798 
799     movd       [edx], xmm1  // write 6 pixels
800     psrlq      xmm1, 16
801     movd       [edx + 2], xmm1
802     lea        edx, [edx + 6]
803     sub        ecx, 6
804     jg         xloop
805 
806     pop        esi
807     ret
808   }
809 }
810 
811 // Reads 16 bytes and accumulates to 16 shorts at a time.
ScaleAddRow_SSE2(const uint8_t * src_ptr,uint16_t * dst_ptr,int src_width)812 __declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr,
813                                         uint16_t* dst_ptr,
814                                         int src_width) {
815   __asm {
816     mov        eax, [esp + 4]  // src_ptr
817     mov        edx, [esp + 8]  // dst_ptr
818     mov        ecx, [esp + 12]  // src_width
819     pxor       xmm5, xmm5
820 
821         // sum rows
822   xloop:
823     movdqu     xmm3, [eax]  // read 16 bytes
824     lea        eax, [eax + 16]
825     movdqu     xmm0, [edx]  // read 16 words from destination
826     movdqu     xmm1, [edx + 16]
827     movdqa     xmm2, xmm3
828     punpcklbw  xmm2, xmm5
829     punpckhbw  xmm3, xmm5
830     paddusw    xmm0, xmm2  // sum 16 words
831     paddusw    xmm1, xmm3
832     movdqu     [edx], xmm0  // write 16 words to destination
833     movdqu     [edx + 16], xmm1
834     lea        edx, [edx + 32]
835     sub        ecx, 16
836     jg         xloop
837     ret
838   }
839 }
840 
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates them into 32 uint16_t destination words
// at a time (saturating add). dst_ptr must hold src_width words.
__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                                        uint16_t* dst_ptr,
                                        int src_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    mov         edx, [esp + 8]  // dst_ptr
    mov         ecx, [esp + 12]  // src_width
    vpxor       ymm5, ymm5, ymm5  // constant 0 for unpack

        // sum rows
  xloop:
    vmovdqu     ymm3, [eax]  // read 32 bytes
    lea         eax, [eax + 32]
    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw  ymm2, ymm3, ymm5  // widen bytes to words
    vpunpckhbw  ymm3, ymm3, ymm5
    vpaddusw    ymm0, ymm2, [edx]  // sum 16 words
    vpaddusw    ymm1, ymm3, [edx + 32]
    vmovdqu     [edx], ymm0  // write 32 words to destination
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 32
    jg          xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2
872 
// Constant for making pixels signed to avoid pmaddubsw
// saturation: subtracting 0x80 re-centers u8 pixels into signed range.
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
// 0x4040 = 0x80 * 128 (undoes the bias after the 7-bit weighted sum)
// plus 0x40, the half-ULP rounding term before the >> 7.
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                               0x4040, 0x4040, 0x4040, 0x4040};
881 
// Bilinear column filtering. SSSE3 version.
// Horizontally scales one row: for each destination pixel, blends the two
// source pixels at fixed-point position x (16.16 format, stepped by dx)
// using 7-bit fractional weights.  Processes 2 output pixels per loop
// iteration plus a 1-pixel remainder.
__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
                                             const uint8_t* src_ptr,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]  // dst_ptr
    mov        esi, [esp + 12 + 8]  // src_ptr
    mov        ecx, [esp + 12 + 12]  // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pcmpeqb    xmm7, xmm7  // generate 0x0001
    psrlw      xmm7, 15
    pextrw     eax, xmm2, 1  // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2  // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0  // x0 x1
    punpckldq  xmm3, xmm3  // dx dx
    paddd      xmm3, xmm3  // dx * 2, dx * 2
    pextrw     edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2  // x0, x1 fractions.
    paddd      xmm2, xmm3  // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9  // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5  // 0011 - replicate fractions per pixel pair
    punpcklwd  xmm0, xmm4
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm1, xmm6  // 0..7f and 7f..0
    paddusb    xmm1, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw  xmm1, xmm0  // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm1, 7  // 8.7 fixed point to low 8 bits.
    packuswb   xmm1, xmm1  // 8 bits, 2 pixels.
    movd       ebx, xmm1
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2  // 2 pixels
    jge        xloop2

 xloop29:
    add        ecx, 2 - 1
    jl         xloop99

            // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9  // 7 bit fractions.
    pshufb     xmm2, xmm5  // 0011
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm2, xmm6  // 0..7f and 7f..0
    paddusb    xmm2, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw  xmm2, xmm0  // 16 bit
    paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm2, 7  // 8.7 fixed point to low 8 bits.
    packuswb   xmm2, xmm2  // 8 bits
    movd       ebx, xmm2
    mov        [edi], bl

 xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
967 
// Reads 16 pixels, duplicates them and writes 32 pixels.
// 2x horizontal upscale by pixel doubling; x and dx are unused here (the
// step is implicitly 0.5).  dst_width is consumed 32 pixels at a time.
__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
                                         const uint8_t* src_ptr,
                                         int dst_width,
                                         int x,
                                         int dx) {
  __asm {
    mov        edx, [esp + 4]  // dst_ptr
    mov        eax, [esp + 8]  // src_ptr
    mov        ecx, [esp + 12]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0  // duplicate low 8 bytes: a a b b ...
    punpckhbw  xmm1, xmm1  // duplicate high 8 bytes
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    ret
  }
}
994 
// Reads 8 ARGB pixels, throws half away and writes 4 pixels.
// NOTE: shufps with 0xdd selects dwords 1 and 3 of each source, i.e. the
// odd pixels (1, 3, 5, 7) — the sample nearest the 2x2 box center.
__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_argb,
                                              int dst_width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_argb
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd  // keep odd pixels 1,3,5,7
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}
1019 
// Blends 8x1 rectangle to 4x1.
// Horizontal-only 2x downscale: each output ARGB pixel is the pavgb
// (rounded average) of a horizontally adjacent even/odd pixel pair.
__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
                                                    ptrdiff_t src_stride,
                                                    uint8_t* dst_argb,
                                                    int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_argb
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2  // rounded average of each pair
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}
1047 
// Blends 8x2 rectangle to 4x1.
// Full 2x2 box filter: averages vertically across src_stride first, then
// horizontally across even/odd pixel pairs.  pavgb rounds each stage.
__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                                                 ptrdiff_t src_stride,
                                                 uint8_t* dst_argb,
                                                 int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]  // next row
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2  // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        esi
    ret
  }
}
1081 
// Reads 4 pixels at a time.
// Point-samples every src_stepx-th ARGB pixel from the row (no filtering).
// ebx = src_stepx * 4 bytes; edi = src_stepx * 12 bytes (3rd sample offset).
__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
                                                 ptrdiff_t src_stride,
                                                 int src_stepx,
                                                 uint8_t* dst_argb,
                                                 int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    // src_stride ignored
    mov        ebx, [esp + 8 + 12]  // src_stepx
    mov        edx, [esp + 8 + 16]  // dst_argb
    mov        ecx, [esp + 8 + 20]  // dst_width
    lea        ebx, [ebx * 4]       // step in bytes (4 bytes per ARGB pixel)
    lea        edi, [ebx + ebx * 2] // 3 * step, for the 4th sample

  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2  // pack 4 sampled pixels
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}
1118 
// Blends four 2x2 to 4x1.
// For each of 4 output pixels, averages a 2x2 box of source ARGB pixels
// whose columns are src_stepx apart.  Row pairs are averaged first, then
// the horizontal pairs, both with rounding pavgb.
__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
                                                    ptrdiff_t src_stride,
                                                    int src_stepx,
                                                    uint8_t* dst_argb,
                                                    int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]  // src_argb
    mov        esi, [esp + 12 + 8]  // src_stride
    mov        ebx, [esp + 12 + 12]  // src_stepx
    mov        edx, [esp + 12 + 16]  // dst_argb
    mov        ecx, [esp + 12 + 20]  // dst_width
    lea        esi, [eax + esi]  // row1 pointer
    lea        ebx, [ebx * 4]  // step in bytes (4 bytes per ARGB pixel)
    lea        edi, [ebx + ebx * 2]  // 3 * step, for the 4th sample

  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2  // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
1166 
// Column scaling unfiltered. SSE2 version.
// Point-samples ARGB pixels at 16.16 fixed-point positions x, x+dx, ...
// xmm2 holds the 4 current positions (x0..x3); xmm3 holds the step dx*4.
// Integer parts are extracted with pextrw from the high word of each dword.
// Handles dst_width in groups of 4, then 2, then 1.
__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
                                          const uint8_t* src_argb,
                                          int dst_width,
                                          int x,
                                          int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]  // dst_argb
    mov        esi, [esp + 8 + 8]  // src_argb
    mov        ecx, [esp + 8 + 12]  // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11  // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 2
    pshufd     xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0  // x3 x2 x1 x0
    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 4
    pshufd     xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1  // get x0 integer.
    pextrw     edx, xmm2, 3  // get x1 integer.

    cmp        ecx, 0
    jle        xloop99  // nothing to do for dst_width <= 0
    sub        ecx, 4
    jl         xloop49

        // 4 Pixel loop.
 xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5  // get x2 integer.
    pextrw     edx, xmm2, 7  // get x3 integer.
    paddd      xmm2, xmm3  // x += dx
    punpckldq  xmm0, xmm1  // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4  // 4 pixels
    jge        xloop4

 xloop49:
    test       ecx, 2
    je         xloop29

        // 2 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5  // get x2 integer.
    punpckldq  xmm0, xmm1  // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

 xloop29:
    test       ecx, 1
    je         xloop99

        // 1 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
 xloop99:

    pop        esi
    pop        edi
    ret
  }
}
1246 
// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw:
// interleaves the two pixels channel-by-channel so each pmaddubsw lane
// multiplies matching channels by the fraction pair.
static const uvec8 kShuffleColARGB = {
    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
// (byte 0 of the fraction word -> low 8 lanes, byte 4 -> high 8 lanes).
static const uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
1260 
ScaleARGBFilterCols_SSSE3(uint8_t * dst_argb,const uint8_t * src_argb,int dst_width,int x,int dx)1261 __declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
1262                                                  const uint8_t* src_argb,
1263                                                  int dst_width,
1264                                                  int x,
1265                                                  int dx) {
1266   __asm {
1267     push       esi
1268     push       edi
1269     mov        edi, [esp + 8 + 4]  // dst_argb
1270     mov        esi, [esp + 8 + 8]  // src_argb
1271     mov        ecx, [esp + 8 + 12]  // dst_width
1272     movd       xmm2, [esp + 8 + 16]  // x
1273     movd       xmm3, [esp + 8 + 20]  // dx
1274     movdqa     xmm4, xmmword ptr kShuffleColARGB
1275     movdqa     xmm5, xmmword ptr kShuffleFractions
1276     pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
1277     psrlw      xmm6, 9
1278     pextrw     eax, xmm2, 1  // get x0 integer. preroll
1279     sub        ecx, 2
1280     jl         xloop29
1281 
1282     movdqa     xmm0, xmm2  // x1 = x0 + dx
1283     paddd      xmm0, xmm3
1284     punpckldq  xmm2, xmm0  // x0 x1
1285     punpckldq  xmm3, xmm3  // dx dx
1286     paddd      xmm3, xmm3  // dx * 2, dx * 2
1287     pextrw     edx, xmm2, 3  // get x1 integer. preroll
1288 
1289     // 2 Pixel loop.
1290   xloop2:
1291     movdqa     xmm1, xmm2  // x0, x1 fractions.
1292     paddd      xmm2, xmm3  // x += dx
1293     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
1294     psrlw      xmm1, 9  // 7 bit fractions.
1295     movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
1296     pshufb     xmm1, xmm5  // 0000000011111111
1297     pshufb     xmm0, xmm4  // arrange pixels into pairs
1298     pxor       xmm1, xmm6  // 0..7f and 7f..0
1299     pmaddubsw  xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
1300     pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
1301     pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
1302     psrlw      xmm0, 7  // argb 8.7 fixed point to low 8 bits.
1303     packuswb   xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
1304     movq       qword ptr [edi], xmm0
1305     lea        edi, [edi + 8]
1306     sub        ecx, 2  // 2 pixels
1307     jge        xloop2
1308 
1309  xloop29:
1310 
1311     add        ecx, 2 - 1
1312     jl         xloop99
1313 
1314             // 1 pixel remainder
1315     psrlw      xmm2, 9  // 7 bit fractions.
1316     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
1317     pshufb     xmm2, xmm5  // 00000000
1318     pshufb     xmm0, xmm4  // arrange pixels into pairs
1319     pxor       xmm2, xmm6  // 0..7f and 7f..0
1320     pmaddubsw  xmm0, xmm2  // argb 16 bit, 1 pixel.
1321     psrlw      xmm0, 7
1322     packuswb   xmm0, xmm0  // argb 8 bits, 1 pixel.
1323     movd       [edi], xmm0
1324 
1325  xloop99:
1326 
1327     pop        edi
1328     pop        esi
1329     ret
1330   }
1331 }
1332 
// Reads 4 pixels, duplicates them and writes 8 pixels.
// 2x horizontal ARGB upscale by pixel doubling (dword granularity); x and
// dx are unused.  dst_width is consumed 8 pixels per iteration.
__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
                                             const uint8_t* src_argb,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    mov        edx, [esp + 4]  // dst_argb
    mov        eax, [esp + 8]  // src_argb
    mov        ecx, [esp + 12]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0  // duplicate low 2 pixels
    punpckhdq  xmm1, xmm1  // duplicate high 2 pixels
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop

    ret
  }
}
1359 
// Divide num by div and return as 16.16 fixed point result.
// Computes (num << 16) / div using a 64-bit dividend in edx:eax.
// NOTE: idiv raises #DE if div is 0 or the quotient overflows 32 bits;
// callers must ensure div != 0.
__declspec(naked) int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld       edx, eax, 16  // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret  // quotient returned in eax
  }
}
1371 
// Divide num by div and return as 16.16 fixed point result.
// Variant of FixedDiv_X86 computing ((num << 16) - 0x00010001) / (div - 1),
// i.e. a bias of -1 on both halves of the dividend and -1 on the divisor.
// NOTE: idiv raises #DE if div is 1 (divisor becomes 0); callers must
// ensure div > 1.
__declspec(naked) int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]  // num
    mov        ecx, [esp + 8]  // denom
    cdq  // extend num to 64 bits
    shld       edx, eax, 16  // 32.16
    shl        eax, 16
    sub        eax, 0x00010001  // subtract bias from the 64-bit dividend
    sbb        edx, 0  // propagate borrow into high dword
    sub        ecx, 1
    idiv       ecx
    ret  // quotient returned in eax
  }
}
1387 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
1388 
1389 #ifdef __cplusplus
1390 }  // extern "C"
1391 }  // namespace libyuv
1392 #endif
1393