/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
                              8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
                              10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant for the 3/4 box filters; added before the >> 2.
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
                               128, 128, 128, 128, 128, 128, 128, 128};

static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
                               6,   8,   11,  14,  128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
                              128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
                               6,   7,   12,  13,  128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                                  65536 / 9, 65536 / 6, 0,         0};

// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
                               11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
                               12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
                               13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                                 65536 / 3, 65536 / 2, 0,         0};
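
// Editor's note: the kScaleAc33/kScaleAb2 entries above are 16.16
// fixed-point reciprocals consumed by pmulhuw, which keeps the high 16 bits
// of an unsigned 16x16 multiply. A minimal scalar sketch of the trick
// (illustrative only; the name is hypothetical, not part of the library):
#if 0
// (sum * (65536 / 9)) >> 16 approximates sum / 9. The truncated reciprocal
// can come out one low near the top of the range (sum = 2295 gives 254,
// not 255), a tradeoff these fast paths accept.
static uint8_t DivideBy9_Sketch(uint16_t sum) {
  return (uint8_t)(((uint32_t)sum * (65536 / 9)) >> 16);
}
#endif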

// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8_t* dst_ptr,
                                           int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8          // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}
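
// Editor's note: a scalar equivalent of the loop above, for reference
// (illustrative only; the name is hypothetical):
#if 0
static void ScaleRowDown2_Sketch(const uint8_t* src_ptr,
                                 uint8_t* dst_ptr,
                                 int dst_width) {
  // psrlw by 8 keeps the high byte of each 16-bit pair, i.e. the odd pixel.
  for (int i = 0; i < dst_width; ++i) {
    dst_ptr[i] = src_ptr[i * 2 + 1];
  }
}
#endif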

// Blends 32x1 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                                                 ptrdiff_t src_stride,
                                                 uint8_t* dst_ptr,
                                                 int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5  // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pavgw      xmm0, xmm5       // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}
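
// Editor's note: scalar sketch of the loop above (illustrative only).
// pmaddubsw against 0x0101 sums each horizontal pair; pavgw against zero
// then computes (x + 1) / 2, i.e. a rounded 2-tap average.
#if 0
static void ScaleRowDown2Linear_Sketch(const uint8_t* src_ptr,
                                       uint8_t* dst_ptr,
                                       int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    dst_ptr[i] = (uint8_t)((src_ptr[i * 2] + src_ptr[i * 2 + 1] + 1) >> 1);
  }
}
#endif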

// Blends 32x2 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_ptr,
                                              int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width

    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5  // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // vertical add
    paddw      xmm1, xmm3
    psrlw      xmm0, 1
    psrlw      xmm1, 1
    pavgw      xmm0, xmm5  // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        esi
    ret
  }
}
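
// Editor's note: scalar sketch of the 2x2 box above (illustrative only).
// The horizontal add, vertical add, shift and pavgw together compute
// (sum + 2) / 4.
#if 0
static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst_ptr,
                                    int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int i = 0; i < dst_width; ++i) {
    int sum = s[i * 2] + s[i * 2 + 1] + t[i * 2] + t[i * 2 + 1];
    dst_ptr[i] = (uint8_t)((sum + 2) >> 2);
  }
}
#endif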

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpsrlw      ymm0, ymm0, 8  // isolate odd pixels.
    vpsrlw      ymm1, ymm1, 8
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
                                                ptrdiff_t src_stride,
                                                uint8_t* dst_ptr,
                                                int dst_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    // src_stride
    mov         edx, [esp + 12]  // dst_ptr
    mov         ecx, [esp + 16]  // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
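// (pavgw against zero computes (a + 1) >> 1, and
// ((sum >> 1) + 1) >> 1 == (sum + 2) / 4 for every sum, so the shift plus
// average below performs two halvings with a single round at the end.)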
// Blends 64x2 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8_t* dst_ptr,
                                             int dst_width) {
  __asm {
    push        esi
    mov         eax, [esp + 4 + 4]  // src_ptr
    mov         esi, [esp + 4 + 8]  // src_stride
    mov         edx, [esp + 4 + 12]  // dst_ptr
    mov         ecx, [esp + 4 + 16]  // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + esi]
    vmovdqu     ymm3, [eax + esi + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // vertical add
    vpaddw      ymm1, ymm1, ymm3
    vpsrlw      ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw      ymm1, ymm1, 1
    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    pop         esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8_t* dst_ptr,
                                           int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    pcmpeqb    xmm5, xmm5       // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    ret
  }
}
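
// Editor's note: scalar sketch of the point sampler above (illustrative
// only). The 0x00ff0000 mask keeps byte 2 of each dword, i.e. pixel 2 of
// every group of 4.
#if 0
static void ScaleRowDown4_Sketch(const uint8_t* src_ptr,
                                 uint8_t* dst_ptr,
                                 int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    dst_ptr[i] = src_ptr[i * 4 + 2];
  }
}
#endif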

// Blends 32x4 rectangle to 8x1.
__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_ptr,
                                              int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_ptr
    mov        esi, [esp + 8 + 8]  // src_stride
    mov        edx, [esp + 8 + 12]  // dst_ptr
    mov        ecx, [esp + 8 + 16]  // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    movdqa     xmm5, xmm4
    packuswb   xmm4, xmm4
    psllw      xmm5, 3  // constant 0x0008

  wloop:
    movdqu     xmm0, [eax]  // average rows
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // vertical add rows 0, 1
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + esi * 2]
    movdqu     xmm3, [eax + esi * 2 + 16]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // add row 2
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + edi]
    movdqu     xmm3, [eax + edi + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // add row 3
    paddw      xmm1, xmm3
    phaddw     xmm0, xmm1
    paddw      xmm0, xmm5  // + 8 for round
    psrlw      xmm0, 4  // /16 for average of 4 * 4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}
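
// Editor's note: scalar sketch of the 4x4 box above (illustrative only).
// 16 samples are summed, then (+ 8) >> 4 produces a rounded average.
#if 0
static void ScaleRowDown4Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst_ptr,
                                    int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    int sum = 0;
    for (int y = 0; y < 4; ++y) {
      for (int x = 0; x < 4; ++x) {
        sum += src_ptr[y * src_stride + i * 4 + x];
      }
    }
    dst_ptr[i] = (uint8_t)((sum + 8) >> 4);
  }
}
#endif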

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          int dst_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov         edx, [esp + 12]  // dst_ptr
    mov         ecx, [esp + 16]  // dst_width
    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0x00ff0000
    vpsrld      ymm5, ymm5, 24
    vpslld      ymm5, ymm5, 16

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpand       ymm0, ymm0, ymm5
    vpand       ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vpsrlw      ymm0, ymm0, 8
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8_t* dst_ptr,
                                             int dst_width) {
  __asm {
    push        esi
    push        edi
    mov         eax, [esp + 8 + 4]  // src_ptr
    mov         esi, [esp + 8 + 8]  // src_stride
    mov         edx, [esp + 8 + 12]  // dst_ptr
    mov         ecx, [esp + 8 + 16]  // dst_width
    lea         edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb    ymm4, ymm4, ymm4  // constant 0x0101
    vpsrlw      ymm4, ymm4, 15
    vpsllw      ymm5, ymm4, 3  // constant 0x0008
    vpackuswb   ymm4, ymm4, ymm4

  wloop:
    vmovdqu     ymm0, [eax]  // average rows
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + esi]
    vmovdqu     ymm3, [eax + esi + 32]
    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // vertical add rows 0, 1
    vpaddw      ymm1, ymm1, ymm3
    vmovdqu     ymm2, [eax + esi * 2]
    vmovdqu     ymm3, [eax + esi * 2 + 32]
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // add row 2
    vpaddw      ymm1, ymm1, ymm3
    vmovdqu     ymm2, [eax + edi]
    vmovdqu     ymm3, [eax + edi + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // add row 3
    vpaddw      ymm1, ymm1, ymm3
    vphaddw     ymm0, ymm0, ymm1  // mutates
    vpermq      ymm0, ymm0, 0xd8  // unmutate vphaddw
    vpaddw      ymm0, ymm0, ymm5  // + 8 for round
    vpsrlw      ymm0, ymm0, 4  // /16 for average of 4 * 4
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst_ptr,
                                            int dst_width) {
  __asm {
    mov        eax, [esp + 4]   // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    movdqa     xmm3, xmmword ptr kShuf0
    movdqa     xmm4, xmmword ptr kShuf1
    movdqa     xmm5, xmmword ptr kShuf2

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}
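
// Editor's note: scalar sketch of the 3/4 point sampler above
// (illustrative only). The three shuffles select bytes 0, 1 and 3 of each
// group of 4 source pixels.
#if 0
static void ScaleRowDown34_Sketch(const uint8_t* src_ptr,
                                  uint8_t* dst_ptr,
                                  int dst_width) {
  for (int i = 0; i < dst_width / 3; ++i) {
    dst_ptr[i * 3 + 0] = src_ptr[i * 4 + 0];
    dst_ptr[i * 3 + 1] = src_ptr[i * 4 + 1];
    dst_ptr[i * 3 + 2] = src_ptr[i * 4 + 3];
  }
}
#endif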

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]  // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]  // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]  // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}
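
// Editor's note: scalar sketch of the filtering above (illustrative only).
// Rows are first averaged 1:1 with pavgb; the kShufNN/kMaddNN pairs then
// apply 2-tap weights of 3:1, 2:2 and 1:3 across each group of 4 source
// pixels, with kRound34 (+2) applied before the >> 2.
#if 0
static void ScaleRowDown34_1_Filter_Sketch(const uint8_t* s,  // averaged row
                                           uint8_t* d,
                                           int dst_width) {
  for (int i = 0; i < dst_width / 3; ++i) {
    d[i * 3 + 0] = (uint8_t)((s[i * 4 + 0] * 3 + s[i * 4 + 1] * 1 + 2) >> 2);
    d[i * 3 + 1] = (uint8_t)((s[i * 4 + 1] * 2 + s[i * 4 + 2] * 2 + 2) >> 2);
    d[i * 3 + 2] = (uint8_t)((s[i * 4 + 2] * 1 + s[i * 4 + 3] * 3 + 2) >> 2);
  }
}
#endif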

// Note that movdqa+palign may be better than movdqu.
__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]  // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]  // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]  // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}
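
// Editor's note: in the _0_Box variant above, the two pavgb steps weight
// the rows 3:1 rather than 1:1: avg(row0, avg(row1, row0)) is roughly
// (3 * row0 + row1) / 4, up to pavgb's per-step round-up. The horizontal
// 3:1 / 2:2 / 1:3 filter is then identical to the _1_Box case.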

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst_ptr,
                                            int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    movdqa     xmm4, xmmword ptr kShuf38a
    movdqa     xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu     xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
    movdqu     xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

    movq       qword ptr [edx], xmm0       // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    sub        ecx, 12
    jg         xloop

    ret
  }
}
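
// Editor's note: scalar sketch of the 3/8 point sampler above
// (illustrative only). kShuf38a/kShuf38b pick bytes 0, 3 and 6 of each
// group of 8 source pixels.
#if 0
static void ScaleRowDown38_Sketch(const uint8_t* src_ptr,
                                  uint8_t* dst_ptr,
                                  int dst_width) {
  for (int i = 0; i < dst_width / 3; ++i) {
    dst_ptr[i * 3 + 0] = src_ptr[i * 8 + 0];
    dst_ptr[i * 3 + 1] = src_ptr[i * 8 + 3];
    dst_ptr[i * 3 + 2] = src_ptr[i * 8 + 6];
  }
}
#endif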

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShufAc
    movdqa     xmm3, xmmword ptr kShufAc3
    movdqa     xmm4, xmmword ptr kScaleAc33
    pxor       xmm5, xmm5

  xloop:
    movdqu     xmm0, [eax]  // sum up 3 rows into xmm0/1
    movdqu     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqu     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4  // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    movd       [edx], xmm6  // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShufAb0
    movdqa     xmm3, xmmword ptr kShufAb1
    movdqa     xmm4, xmmword ptr kShufAb2
    movdqa     xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu     xmm0, [eax]  // average 2 rows into xmm0
    movdqu     xmm1, [eax + esi]
    lea        eax, [eax + 16]
    pavgb      xmm0, xmm1

    movdqa     xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5  // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    movd       [edx], xmm1  // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                                        uint16_t* dst_ptr,
                                        int src_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    mov        edx, [esp + 8]  // dst_ptr
    mov        ecx, [esp + 12]  // src_width
    pxor       xmm5, xmm5

        // sum rows
  xloop:
    movdqu     xmm3, [eax]  // read 16 bytes
    lea        eax, [eax + 16]
    movdqu     xmm0, [edx]  // read 16 words from destination
    movdqu     xmm1, [edx + 16]
    movdqa     xmm2, xmm3
    punpcklbw  xmm2, xmm5
    punpckhbw  xmm3, xmm5
    paddusw    xmm0, xmm2  // sum 16 words
    paddusw    xmm1, xmm3
    movdqu     [edx], xmm0  // write 16 words to destination
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
    ret
  }
}
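
// Editor's note: scalar sketch of the accumulate loop above (illustrative
// only). The asm uses paddusw, so the 16-bit accumulators saturate rather
// than wrap.
#if 0
static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
                               uint16_t* dst_ptr,
                               int src_width) {
  for (int i = 0; i < src_width; ++i) {
    uint32_t sum = (uint32_t)dst_ptr[i] + src_ptr[i];
    dst_ptr[i] = (uint16_t)(sum > 65535 ? 65535 : sum);
  }
}
#endif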

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                                        uint16_t* dst_ptr,
                                        int src_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    mov         edx, [esp + 8]  // dst_ptr
    mov         ecx, [esp + 12]  // src_width
    vpxor       ymm5, ymm5, ymm5

        // sum rows
  xloop:
    vmovdqu     ymm3, [eax]  // read 32 bytes
    lea         eax, [eax + 32]
    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw  ymm2, ymm3, ymm5
    vpunpckhbw  ymm3, ymm3, ymm5
    vpaddusw    ymm0, ymm2, [edx]  // sum 16 words
    vpaddusw    ymm1, ymm3, [edx + 32]
    vmovdqu     [edx], ymm0  // write 32 words to destination
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 32
    jg          xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                               0x4040, 0x4040, 0x4040, 0x4040};
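
// Editor's note on the bias math below: pmaddubsw multiplies unsigned bytes
// by signed bytes. The fraction pair f and 128 - f stays in the unsigned
// operand, while pixels are biased by -128 (kFsub80) to act as the signed
// operand. With f0 + f1 = 128:
//   f0 * (p0 - 128) + f1 * (p1 - 128) = (f0 * p0 + f1 * p1) - 16384,
// which cannot saturate 16 bits. Adding kFadd40 (0x4040 = 16384 + 64)
// removes the bias and adds half of the 7-bit fraction step, so the final
// >> 7 rounds to nearest.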

// Bilinear column filtering. SSSE3 version.
__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
                                             const uint8_t* src_ptr,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]  // dst_ptr
    mov        esi, [esp + 12 + 8]  // src_ptr
    mov        ecx, [esp + 12 + 12]  // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pcmpeqb    xmm7, xmm7  // generate 0x0001
    psrlw      xmm7, 15
    pextrw     eax, xmm2, 1  // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2  // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0  // x0 x1
    punpckldq  xmm3, xmm3  // dx dx
    paddd      xmm3, xmm3  // dx * 2, dx * 2
    pextrw     edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2  // x0, x1 fractions.
    paddd      xmm2, xmm3  // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9  // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5  // 0011
    punpcklwd  xmm0, xmm4
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm1, xmm6  // 0..7f and 7f..0
    paddusb    xmm1, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw  xmm1, xmm0  // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm1, 7  // 8.7 fixed point to low 8 bits.
    packuswb   xmm1, xmm1  // 8 bits, 2 pixels.
    movd       ebx, xmm1
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2  // 2 pixels
    jge        xloop2

 xloop29:
    add        ecx, 2 - 1
    jl         xloop99

            // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9  // 7 bit fractions.
    pshufb     xmm2, xmm5  // 0011
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm2, xmm6  // 0..7f and 7f..0
    paddusb    xmm2, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw  xmm2, xmm0  // 16 bit
    paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm2, 7  // 8.7 fixed point to low 8 bits.
    packuswb   xmm2, xmm2  // 8 bits
    movd       ebx, xmm2
    mov        [edi], bl

 xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
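
// Editor's note: scalar sketch of the bilinear column filter above
// (illustrative only). x and dx are 16.16 fixed point; the asm keeps a
// 7-bit fraction, pairing f with 128 - f for pmaddubsw.
#if 0
static void ScaleFilterCols_Sketch(uint8_t* dst_ptr,
                                   const uint8_t* src_ptr,
                                   int dst_width,
                                   int x,
                                   int dx) {
  for (int i = 0; i < dst_width; ++i) {
    int xi = x >> 16;         // integer source position
    int f = (x >> 9) & 0x7f;  // top 7 bits of the fraction
    dst_ptr[i] = (uint8_t)(
        (src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7);
    x += dx;
  }
}
#endif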

// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
                                         const uint8_t* src_ptr,
                                         int dst_width,
                                         int x,
                                         int dx) {
  __asm {
    mov        edx, [esp + 4]  // dst_ptr
    mov        eax, [esp + 8]  // src_ptr
    mov        ecx, [esp + 12]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0
    punpckhbw  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    ret
  }
}
// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)
__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_argb,
                                              int dst_width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_argb
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
                                                    ptrdiff_t src_stride,
                                                    uint8_t* dst_argb,
                                                    int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_argb
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                                                 ptrdiff_t src_stride,
                                                 uint8_t* dst_argb,
                                                 int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2  // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
                                                 ptrdiff_t src_stride,
                                                 int src_stepx,
                                                 uint8_t* dst_argb,
                                                 int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    // src_stride ignored
    mov        ebx, [esp + 8 + 12]  // src_stepx
    mov        edx, [esp + 8 + 16]  // dst_argb
    mov        ecx, [esp + 8 + 20]  // dst_width
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
                                                    ptrdiff_t src_stride,
                                                    int src_stepx,
                                                    uint8_t* dst_argb,
                                                    int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]  // src_argb
    mov        esi, [esp + 12 + 8]  // src_stride
    mov        ebx, [esp + 12 + 12]  // src_stepx
    mov        edx, [esp + 12 + 16]  // dst_argb
    mov        ecx, [esp + 12 + 20]  // dst_width
    lea        esi, [eax + esi]  // row1 pointer
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2  // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
                                          const uint8_t* src_argb,
                                          int dst_width,
                                          int x,
                                          int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]  // dst_argb
    mov        esi, [esp + 8 + 8]  // src_argb
    mov        ecx, [esp + 8 + 12]  // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11  // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 2
    pshufd     xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0  // x3 x2 x1 x0
    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 4
    pshufd     xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1  // get x0 integer.
    pextrw     edx, xmm2, 3  // get x1 integer.

    cmp        ecx, 0
    jle        xloop99
    sub        ecx, 4
    jl         xloop49

        // 4 Pixel loop.
 xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5  // get x2 integer.
    pextrw     edx, xmm2, 7  // get x3 integer.
    paddd      xmm2, xmm3  // x += dx
    punpckldq  xmm0, xmm1  // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4  // 4 pixels
    jge        xloop4

 xloop49:
    test       ecx, 2
    je         xloop29

        // 2 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5  // get x2 integer.
    punpckldq  xmm0, xmm1  // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

 xloop29:
    test       ecx, 1
    je         xloop99

        // 1 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
 xloop99:

    pop        esi
    pop        edi
    ret
  }
}
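
// Editor's note: scalar sketch of the unfiltered ARGB column scaler above
// (illustrative only; assumes 4-byte aligned pixels). Each step copies the
// whole 4-byte pixel at x >> 16 and advances x by dx (16.16 fixed point).
#if 0
static void ScaleARGBCols_Sketch(uint8_t* dst_argb,
                                 const uint8_t* src_argb,
                                 int dst_width,
                                 int x,
                                 int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  for (int i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];
    x += dx;
  }
}
#endif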

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static const uvec8 kShuffleColARGB = {
    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static const uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
                                                 const uint8_t* src_argb,
                                                 int dst_width,
                                                 int x,
                                                 int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]  // dst_argb
    mov        esi, [esp + 8 + 8]  // src_argb
    mov        ecx, [esp + 8 + 12]  // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, xmmword ptr kShuffleColARGB
    movdqa     xmm5, xmmword ptr kShuffleFractions
    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1  // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2  // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0  // x0 x1
    punpckldq  xmm3, xmm3  // dx dx
    paddd      xmm3, xmm3  // dx * 2, dx * 2
    pextrw     edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2  // x0, x1 fractions.
    paddd      xmm2, xmm3  // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9  // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5  // 0000000011111111
    pshufb     xmm0, xmm4  // arrange pixels into pairs
    pxor       xmm1, xmm6  // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    psrlw      xmm0, 7  // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2  // 2 pixels
    jge        xloop2

 xloop29:

    add        ecx, 2 - 1
    jl         xloop99

            // 1 pixel remainder
    psrlw      xmm2, 9  // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5  // 00000000
    pshufb     xmm0, xmm4  // arrange pixels into pairs
    pxor       xmm2, xmm6  // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2  // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0  // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

 xloop99:

    pop        edi
    pop        esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
                                             const uint8_t* src_argb,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    mov        edx, [esp + 4]  // dst_argb
    mov        eax, [esp + 8]  // src_argb
    mov        ecx, [esp + 12]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0
    punpckhdq  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld       edx, eax, 16  // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}

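// Editor's note: portable equivalent of FixedDiv_X86 above (illustrative
// only). The asm sign-extends num, shifts the 64-bit value left 16, and
// divides.
#if 0
static int FixedDiv_Sketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}
#endif
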
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]  // num
    mov        ecx, [esp + 8]  // denom
    cdq  // extend num to 64 bits
    shld       edx, eax, 16  // 32.16
    shl        eax, 16
    sub        eax, 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif