// (removed: code-browser navigation residue from the page this file was scraped from)
/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
22 // Offsets for source bytes 0 to 9
23 static uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
24                        128, 128, 128, 128, 128, 128, 128, 128};
25 
26 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
27 static uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
28                        128, 128, 128, 128, 128, 128, 128, 128};
29 
30 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
31 static uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
32                        128, 128, 128, 128, 128, 128, 128, 128};
33 
34 // Offsets for source bytes 0 to 10
35 static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
36 
37 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
38 static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};
39 
40 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
41 static uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
42                         10, 11, 12, 13, 13, 14, 14, 15};
43 
44 // Coefficients for source bytes 0 to 10
45 static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
46 
47 // Coefficients for source bytes 10 to 21
48 static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
49 
50 // Coefficients for source bytes 21 to 31
51 static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
52 
53 // Coefficients for source bytes 21 to 31
54 static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
55 
56 static uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
57                          128, 128, 128, 128, 128, 128, 128, 128};
58 
59 static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
60                          6,   8,   11,  14,  128, 128, 128, 128};
61 
62 // Arrange words 0,3,6 into 0,1,2
63 static uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
64                         128, 128, 128, 128, 128, 128, 128, 128};
65 
66 // Arrange words 0,3,6 into 3,4,5
67 static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
68                          6,   7,   12,  13,  128, 128, 128, 128};
69 
70 // Scaling values for boxes of 3x3 and 2x3
71 static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
72                             65536 / 9, 65536 / 6, 0,         0};
73 
74 // Arrange first value for pixels 0,1,2,3,4,5
75 static uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
76                          11, 128, 14, 128, 128, 128, 128, 128};
77 
78 // Arrange second value for pixels 0,1,2,3,4,5
79 static uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
80                          12, 128, 15, 128, 128, 128, 128, 128};
81 
82 // Arrange third value for pixels 0,1,2,3,4,5
83 static uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
84                          13, 128, 128, 128, 128, 128, 128, 128};
85 
86 // Scaling values for boxes of 3x2 and 2x2
87 static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
88                            65536 / 3, 65536 / 2, 0,         0};
89 
90 // Reads 32 pixels, throws half away and writes 16 pixels.
ScaleRowDown2_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)91 __declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr,
92                                            ptrdiff_t src_stride,
93                                            uint8* dst_ptr,
94                                            int dst_width) {
95   __asm {
96     mov        eax, [esp + 4]  // src_ptr
97     // src_stride ignored
98     mov        edx, [esp + 12]  // dst_ptr
99     mov        ecx, [esp + 16]  // dst_width
100 
101   wloop:
102     movdqu     xmm0, [eax]
103     movdqu     xmm1, [eax + 16]
104     lea        eax,  [eax + 32]
105     psrlw      xmm0, 8          // isolate odd pixels.
106     psrlw      xmm1, 8
107     packuswb   xmm0, xmm1
108     movdqu     [edx], xmm0
109     lea        edx, [edx + 16]
110     sub        ecx, 16
111     jg         wloop
112 
113     ret
114   }
115 }
116 
117 // Blends 32x1 rectangle to 16x1.
ScaleRowDown2Linear_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)118 __declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
119                                                  ptrdiff_t src_stride,
120                                                  uint8* dst_ptr,
121                                                  int dst_width) {
122   __asm {
123     mov        eax, [esp + 4]  // src_ptr
124     // src_stride
125     mov        edx, [esp + 12]  // dst_ptr
126     mov        ecx, [esp + 16]  // dst_width
127 
128     pcmpeqb    xmm4, xmm4  // constant 0x0101
129     psrlw      xmm4, 15
130     packuswb   xmm4, xmm4
131     pxor       xmm5, xmm5  // constant 0
132 
133   wloop:
134     movdqu     xmm0, [eax]
135     movdqu     xmm1, [eax + 16]
136     lea        eax,  [eax + 32]
137     pmaddubsw  xmm0, xmm4  // horizontal add
138     pmaddubsw  xmm1, xmm4
139     pavgw      xmm0, xmm5       // (x + 1) / 2
140     pavgw      xmm1, xmm5
141     packuswb   xmm0, xmm1
142     movdqu     [edx], xmm0
143     lea        edx, [edx + 16]
144     sub        ecx, 16
145     jg         wloop
146 
147     ret
148   }
149 }
150 
151 // Blends 32x2 rectangle to 16x1.
ScaleRowDown2Box_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)152 __declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
153                                               ptrdiff_t src_stride,
154                                               uint8* dst_ptr,
155                                               int dst_width) {
156   __asm {
157     push       esi
158     mov        eax, [esp + 4 + 4]  // src_ptr
159     mov        esi, [esp + 4 + 8]  // src_stride
160     mov        edx, [esp + 4 + 12]  // dst_ptr
161     mov        ecx, [esp + 4 + 16]  // dst_width
162 
163     pcmpeqb    xmm4, xmm4  // constant 0x0101
164     psrlw      xmm4, 15
165     packuswb   xmm4, xmm4
166     pxor       xmm5, xmm5  // constant 0
167 
168   wloop:
169     movdqu     xmm0, [eax]
170     movdqu     xmm1, [eax + 16]
171     movdqu     xmm2, [eax + esi]
172     movdqu     xmm3, [eax + esi + 16]
173     lea        eax,  [eax + 32]
174     pmaddubsw  xmm0, xmm4  // horizontal add
175     pmaddubsw  xmm1, xmm4
176     pmaddubsw  xmm2, xmm4
177     pmaddubsw  xmm3, xmm4
178     paddw      xmm0, xmm2  // vertical add
179     paddw      xmm1, xmm3
180     psrlw      xmm0, 1
181     psrlw      xmm1, 1
182     pavgw      xmm0, xmm5  // (x + 1) / 2
183     pavgw      xmm1, xmm5
184     packuswb   xmm0, xmm1
185     movdqu     [edx], xmm0
186     lea        edx, [edx + 16]
187     sub        ecx, 16
188     jg         wloop
189 
190     pop        esi
191     ret
192   }
193 }
194 
195 #ifdef HAS_SCALEROWDOWN2_AVX2
196 // Reads 64 pixels, throws half away and writes 32 pixels.
ScaleRowDown2_AVX2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)197 __declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr,
198                                           ptrdiff_t src_stride,
199                                           uint8* dst_ptr,
200                                           int dst_width) {
201   __asm {
202     mov        eax, [esp + 4]  // src_ptr
203     // src_stride ignored
204     mov        edx, [esp + 12]  // dst_ptr
205     mov        ecx, [esp + 16]  // dst_width
206 
207   wloop:
208     vmovdqu     ymm0, [eax]
209     vmovdqu     ymm1, [eax + 32]
210     lea         eax,  [eax + 64]
211     vpsrlw      ymm0, ymm0, 8  // isolate odd pixels.
212     vpsrlw      ymm1, ymm1, 8
213     vpackuswb   ymm0, ymm0, ymm1
214     vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
215     vmovdqu     [edx], ymm0
216     lea         edx, [edx + 32]
217     sub         ecx, 32
218     jg          wloop
219 
220     vzeroupper
221     ret
222   }
223 }
224 
225 // Blends 64x1 rectangle to 32x1.
ScaleRowDown2Linear_AVX2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)226 __declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
227                                                 ptrdiff_t src_stride,
228                                                 uint8* dst_ptr,
229                                                 int dst_width) {
230   __asm {
231     mov         eax, [esp + 4]  // src_ptr
232     // src_stride
233     mov         edx, [esp + 12]  // dst_ptr
234     mov         ecx, [esp + 16]  // dst_width
235 
236     vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
237     vpsrlw      ymm4, ymm4, 15
238     vpackuswb   ymm4, ymm4, ymm4
239     vpxor       ymm5, ymm5, ymm5  // constant 0
240 
241   wloop:
242     vmovdqu     ymm0, [eax]
243     vmovdqu     ymm1, [eax + 32]
244     lea         eax,  [eax + 64]
245     vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
246     vpmaddubsw  ymm1, ymm1, ymm4
247     vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
248     vpavgw      ymm1, ymm1, ymm5
249     vpackuswb   ymm0, ymm0, ymm1
250     vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
251     vmovdqu     [edx], ymm0
252     lea         edx, [edx + 32]
253     sub         ecx, 32
254     jg          wloop
255 
256     vzeroupper
257     ret
258   }
259 }
260 
261 // For rounding, average = (sum + 2) / 4
262 // becomes average((sum >> 1), 0)
263 // Blends 64x2 rectangle to 32x1.
ScaleRowDown2Box_AVX2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)264 __declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
265                                              ptrdiff_t src_stride,
266                                              uint8* dst_ptr,
267                                              int dst_width) {
268   __asm {
269     push        esi
270     mov         eax, [esp + 4 + 4]  // src_ptr
271     mov         esi, [esp + 4 + 8]  // src_stride
272     mov         edx, [esp + 4 + 12]  // dst_ptr
273     mov         ecx, [esp + 4 + 16]  // dst_width
274 
275     vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
276     vpsrlw      ymm4, ymm4, 15
277     vpackuswb   ymm4, ymm4, ymm4
278     vpxor       ymm5, ymm5, ymm5  // constant 0
279 
280   wloop:
281     vmovdqu     ymm0, [eax]
282     vmovdqu     ymm1, [eax + 32]
283     vmovdqu     ymm2, [eax + esi]
284     vmovdqu     ymm3, [eax + esi + 32]
285     lea         eax,  [eax + 64]
286     vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
287     vpmaddubsw  ymm1, ymm1, ymm4
288     vpmaddubsw  ymm2, ymm2, ymm4
289     vpmaddubsw  ymm3, ymm3, ymm4
290     vpaddw      ymm0, ymm0, ymm2  // vertical add
291     vpaddw      ymm1, ymm1, ymm3
292     vpsrlw      ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
293     vpsrlw      ymm1, ymm1, 1
294     vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
295     vpavgw      ymm1, ymm1, ymm5
296     vpackuswb   ymm0, ymm0, ymm1
297     vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
298     vmovdqu     [edx], ymm0
299     lea         edx, [edx + 32]
300     sub         ecx, 32
301     jg          wloop
302 
303     pop         esi
304     vzeroupper
305     ret
306   }
307 }
308 #endif  // HAS_SCALEROWDOWN2_AVX2
309 
310 // Point samples 32 pixels to 8 pixels.
ScaleRowDown4_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)311 __declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr,
312                                            ptrdiff_t src_stride,
313                                            uint8* dst_ptr,
314                                            int dst_width) {
315   __asm {
316     mov        eax, [esp + 4]  // src_ptr
317     // src_stride ignored
318     mov        edx, [esp + 12]  // dst_ptr
319     mov        ecx, [esp + 16]  // dst_width
320     pcmpeqb    xmm5, xmm5       // generate mask 0x00ff0000
321     psrld      xmm5, 24
322     pslld      xmm5, 16
323 
324   wloop:
325     movdqu     xmm0, [eax]
326     movdqu     xmm1, [eax + 16]
327     lea        eax,  [eax + 32]
328     pand       xmm0, xmm5
329     pand       xmm1, xmm5
330     packuswb   xmm0, xmm1
331     psrlw      xmm0, 8
332     packuswb   xmm0, xmm0
333     movq       qword ptr [edx], xmm0
334     lea        edx, [edx + 8]
335     sub        ecx, 8
336     jg         wloop
337 
338     ret
339   }
340 }
341 
342 // Blends 32x4 rectangle to 8x1.
ScaleRowDown4Box_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)343 __declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
344                                               ptrdiff_t src_stride,
345                                               uint8* dst_ptr,
346                                               int dst_width) {
347   __asm {
348     push       esi
349     push       edi
350     mov        eax, [esp + 8 + 4]  // src_ptr
351     mov        esi, [esp + 8 + 8]  // src_stride
352     mov        edx, [esp + 8 + 12]  // dst_ptr
353     mov        ecx, [esp + 8 + 16]  // dst_width
354     lea        edi, [esi + esi * 2]  // src_stride * 3
355     pcmpeqb    xmm4, xmm4  // constant 0x0101
356     psrlw      xmm4, 15
357     movdqa     xmm5, xmm4
358     packuswb   xmm4, xmm4
359     psllw      xmm5, 3  // constant 0x0008
360 
361   wloop:
362     movdqu     xmm0, [eax]  // average rows
363     movdqu     xmm1, [eax + 16]
364     movdqu     xmm2, [eax + esi]
365     movdqu     xmm3, [eax + esi + 16]
366     pmaddubsw  xmm0, xmm4  // horizontal add
367     pmaddubsw  xmm1, xmm4
368     pmaddubsw  xmm2, xmm4
369     pmaddubsw  xmm3, xmm4
370     paddw      xmm0, xmm2  // vertical add rows 0, 1
371     paddw      xmm1, xmm3
372     movdqu     xmm2, [eax + esi * 2]
373     movdqu     xmm3, [eax + esi * 2 + 16]
374     pmaddubsw  xmm2, xmm4
375     pmaddubsw  xmm3, xmm4
376     paddw      xmm0, xmm2  // add row 2
377     paddw      xmm1, xmm3
378     movdqu     xmm2, [eax + edi]
379     movdqu     xmm3, [eax + edi + 16]
380     lea        eax, [eax + 32]
381     pmaddubsw  xmm2, xmm4
382     pmaddubsw  xmm3, xmm4
383     paddw      xmm0, xmm2  // add row 3
384     paddw      xmm1, xmm3
385     phaddw     xmm0, xmm1
386     paddw      xmm0, xmm5  // + 8 for round
387     psrlw      xmm0, 4  // /16 for average of 4 * 4
388     packuswb   xmm0, xmm0
389     movq       qword ptr [edx], xmm0
390     lea        edx, [edx + 8]
391     sub        ecx, 8
392     jg         wloop
393 
394     pop        edi
395     pop        esi
396     ret
397   }
398 }
399 
400 #ifdef HAS_SCALEROWDOWN4_AVX2
401 // Point samples 64 pixels to 16 pixels.
ScaleRowDown4_AVX2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)402 __declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr,
403                                           ptrdiff_t src_stride,
404                                           uint8* dst_ptr,
405                                           int dst_width) {
406   __asm {
407     mov         eax, [esp + 4]  // src_ptr
408     // src_stride ignored
409     mov         edx, [esp + 12]  // dst_ptr
410     mov         ecx, [esp + 16]  // dst_width
411     vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0x00ff0000
412     vpsrld      ymm5, ymm5, 24
413     vpslld      ymm5, ymm5, 16
414 
415   wloop:
416     vmovdqu     ymm0, [eax]
417     vmovdqu     ymm1, [eax + 32]
418     lea         eax,  [eax + 64]
419     vpand       ymm0, ymm0, ymm5
420     vpand       ymm1, ymm1, ymm5
421     vpackuswb   ymm0, ymm0, ymm1
422     vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
423     vpsrlw      ymm0, ymm0, 8
424     vpackuswb   ymm0, ymm0, ymm0
425     vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
426     vmovdqu     [edx], xmm0
427     lea         edx, [edx + 16]
428     sub         ecx, 16
429     jg          wloop
430 
431     vzeroupper
432     ret
433   }
434 }
435 
436 // Blends 64x4 rectangle to 16x1.
ScaleRowDown4Box_AVX2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)437 __declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
438                                              ptrdiff_t src_stride,
439                                              uint8* dst_ptr,
440                                              int dst_width) {
441   __asm {
442     push        esi
443     push        edi
444     mov         eax, [esp + 8 + 4]  // src_ptr
445     mov         esi, [esp + 8 + 8]  // src_stride
446     mov         edx, [esp + 8 + 12]  // dst_ptr
447     mov         ecx, [esp + 8 + 16]  // dst_width
448     lea         edi, [esi + esi * 2]  // src_stride * 3
449     vpcmpeqb    ymm4, ymm4, ymm4  // constant 0x0101
450     vpsrlw      ymm4, ymm4, 15
451     vpsllw      ymm5, ymm4, 3  // constant 0x0008
452     vpackuswb   ymm4, ymm4, ymm4
453 
454   wloop:
455     vmovdqu     ymm0, [eax]  // average rows
456     vmovdqu     ymm1, [eax + 32]
457     vmovdqu     ymm2, [eax + esi]
458     vmovdqu     ymm3, [eax + esi + 32]
459     vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
460     vpmaddubsw  ymm1, ymm1, ymm4
461     vpmaddubsw  ymm2, ymm2, ymm4
462     vpmaddubsw  ymm3, ymm3, ymm4
463     vpaddw      ymm0, ymm0, ymm2  // vertical add rows 0, 1
464     vpaddw      ymm1, ymm1, ymm3
465     vmovdqu     ymm2, [eax + esi * 2]
466     vmovdqu     ymm3, [eax + esi * 2 + 32]
467     vpmaddubsw  ymm2, ymm2, ymm4
468     vpmaddubsw  ymm3, ymm3, ymm4
469     vpaddw      ymm0, ymm0, ymm2  // add row 2
470     vpaddw      ymm1, ymm1, ymm3
471     vmovdqu     ymm2, [eax + edi]
472     vmovdqu     ymm3, [eax + edi + 32]
473     lea         eax,  [eax + 64]
474     vpmaddubsw  ymm2, ymm2, ymm4
475     vpmaddubsw  ymm3, ymm3, ymm4
476     vpaddw      ymm0, ymm0, ymm2  // add row 3
477     vpaddw      ymm1, ymm1, ymm3
478     vphaddw     ymm0, ymm0, ymm1  // mutates
479     vpermq      ymm0, ymm0, 0xd8  // unmutate vphaddw
480     vpaddw      ymm0, ymm0, ymm5  // + 8 for round
481     vpsrlw      ymm0, ymm0, 4  // /32 for average of 4 * 4
482     vpackuswb   ymm0, ymm0, ymm0
483     vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
484     vmovdqu     [edx], xmm0
485     lea         edx, [edx + 16]
486     sub         ecx, 16
487     jg          wloop
488 
489     pop        edi
490     pop        esi
491     vzeroupper
492     ret
493   }
494 }
495 #endif  // HAS_SCALEROWDOWN4_AVX2
496 
497 // Point samples 32 pixels to 24 pixels.
498 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
499 // Then shuffled to do the scaling.
500 
ScaleRowDown34_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)501 __declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr,
502                                             ptrdiff_t src_stride,
503                                             uint8* dst_ptr,
504                                             int dst_width) {
505   __asm {
506     mov        eax, [esp + 4]   // src_ptr
507     // src_stride ignored
508     mov        edx, [esp + 12]  // dst_ptr
509     mov        ecx, [esp + 16]  // dst_width
510     movdqa     xmm3, xmmword ptr kShuf0
511     movdqa     xmm4, xmmword ptr kShuf1
512     movdqa     xmm5, xmmword ptr kShuf2
513 
514   wloop:
515     movdqu     xmm0, [eax]
516     movdqu     xmm1, [eax + 16]
517     lea        eax,  [eax + 32]
518     movdqa     xmm2, xmm1
519     palignr    xmm1, xmm0, 8
520     pshufb     xmm0, xmm3
521     pshufb     xmm1, xmm4
522     pshufb     xmm2, xmm5
523     movq       qword ptr [edx], xmm0
524     movq       qword ptr [edx + 8], xmm1
525     movq       qword ptr [edx + 16], xmm2
526     lea        edx, [edx + 24]
527     sub        ecx, 24
528     jg         wloop
529 
530     ret
531   }
532 }
533 
534 // Blends 32x2 rectangle to 24x1
535 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
536 // Then shuffled to do the scaling.
537 
538 // Register usage:
539 // xmm0 src_row 0
540 // xmm1 src_row 1
541 // xmm2 shuf 0
542 // xmm3 shuf 1
543 // xmm4 shuf 2
544 // xmm5 madd 0
545 // xmm6 madd 1
546 // xmm7 kRound34
547 
548 // Note that movdqa+palign may be better than movdqu.
ScaleRowDown34_1_Box_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)549 __declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
550                                                   ptrdiff_t src_stride,
551                                                   uint8* dst_ptr,
552                                                   int dst_width) {
553   __asm {
554     push       esi
555     mov        eax, [esp + 4 + 4]  // src_ptr
556     mov        esi, [esp + 4 + 8]  // src_stride
557     mov        edx, [esp + 4 + 12]  // dst_ptr
558     mov        ecx, [esp + 4 + 16]  // dst_width
559     movdqa     xmm2, xmmword ptr kShuf01
560     movdqa     xmm3, xmmword ptr kShuf11
561     movdqa     xmm4, xmmword ptr kShuf21
562     movdqa     xmm5, xmmword ptr kMadd01
563     movdqa     xmm6, xmmword ptr kMadd11
564     movdqa     xmm7, xmmword ptr kRound34
565 
566   wloop:
567     movdqu     xmm0, [eax]  // pixels 0..7
568     movdqu     xmm1, [eax + esi]
569     pavgb      xmm0, xmm1
570     pshufb     xmm0, xmm2
571     pmaddubsw  xmm0, xmm5
572     paddsw     xmm0, xmm7
573     psrlw      xmm0, 2
574     packuswb   xmm0, xmm0
575     movq       qword ptr [edx], xmm0
576     movdqu     xmm0, [eax + 8]  // pixels 8..15
577     movdqu     xmm1, [eax + esi + 8]
578     pavgb      xmm0, xmm1
579     pshufb     xmm0, xmm3
580     pmaddubsw  xmm0, xmm6
581     paddsw     xmm0, xmm7
582     psrlw      xmm0, 2
583     packuswb   xmm0, xmm0
584     movq       qword ptr [edx + 8], xmm0
585     movdqu     xmm0, [eax + 16]  // pixels 16..23
586     movdqu     xmm1, [eax + esi + 16]
587     lea        eax, [eax + 32]
588     pavgb      xmm0, xmm1
589     pshufb     xmm0, xmm4
590     movdqa     xmm1, xmmword ptr kMadd21
591     pmaddubsw  xmm0, xmm1
592     paddsw     xmm0, xmm7
593     psrlw      xmm0, 2
594     packuswb   xmm0, xmm0
595     movq       qword ptr [edx + 16], xmm0
596     lea        edx, [edx + 24]
597     sub        ecx, 24
598     jg         wloop
599 
600     pop        esi
601     ret
602   }
603 }
604 
605 // Note that movdqa+palign may be better than movdqu.
ScaleRowDown34_0_Box_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)606 __declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
607                                                   ptrdiff_t src_stride,
608                                                   uint8* dst_ptr,
609                                                   int dst_width) {
610   __asm {
611     push       esi
612     mov        eax, [esp + 4 + 4]  // src_ptr
613     mov        esi, [esp + 4 + 8]  // src_stride
614     mov        edx, [esp + 4 + 12]  // dst_ptr
615     mov        ecx, [esp + 4 + 16]  // dst_width
616     movdqa     xmm2, xmmword ptr kShuf01
617     movdqa     xmm3, xmmword ptr kShuf11
618     movdqa     xmm4, xmmword ptr kShuf21
619     movdqa     xmm5, xmmword ptr kMadd01
620     movdqa     xmm6, xmmword ptr kMadd11
621     movdqa     xmm7, xmmword ptr kRound34
622 
623   wloop:
624     movdqu     xmm0, [eax]  // pixels 0..7
625     movdqu     xmm1, [eax + esi]
626     pavgb      xmm1, xmm0
627     pavgb      xmm0, xmm1
628     pshufb     xmm0, xmm2
629     pmaddubsw  xmm0, xmm5
630     paddsw     xmm0, xmm7
631     psrlw      xmm0, 2
632     packuswb   xmm0, xmm0
633     movq       qword ptr [edx], xmm0
634     movdqu     xmm0, [eax + 8]  // pixels 8..15
635     movdqu     xmm1, [eax + esi + 8]
636     pavgb      xmm1, xmm0
637     pavgb      xmm0, xmm1
638     pshufb     xmm0, xmm3
639     pmaddubsw  xmm0, xmm6
640     paddsw     xmm0, xmm7
641     psrlw      xmm0, 2
642     packuswb   xmm0, xmm0
643     movq       qword ptr [edx + 8], xmm0
644     movdqu     xmm0, [eax + 16]  // pixels 16..23
645     movdqu     xmm1, [eax + esi + 16]
646     lea        eax, [eax + 32]
647     pavgb      xmm1, xmm0
648     pavgb      xmm0, xmm1
649     pshufb     xmm0, xmm4
650     movdqa     xmm1, xmmword ptr kMadd21
651     pmaddubsw  xmm0, xmm1
652     paddsw     xmm0, xmm7
653     psrlw      xmm0, 2
654     packuswb   xmm0, xmm0
655     movq       qword ptr [edx + 16], xmm0
656     lea        edx, [edx+24]
657     sub        ecx, 24
658     jg         wloop
659 
660     pop        esi
661     ret
662   }
663 }
664 
665 // 3/8 point sampler
666 
667 // Scale 32 pixels to 12
ScaleRowDown38_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)668 __declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr,
669                                             ptrdiff_t src_stride,
670                                             uint8* dst_ptr,
671                                             int dst_width) {
672   __asm {
673     mov        eax, [esp + 4]  // src_ptr
674     // src_stride ignored
675     mov        edx, [esp + 12]  // dst_ptr
676     mov        ecx, [esp + 16]  // dst_width
677     movdqa     xmm4, xmmword ptr kShuf38a
678     movdqa     xmm5, xmmword ptr kShuf38b
679 
680   xloop:
681     movdqu     xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
682     movdqu     xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
683     lea        eax, [eax + 32]
684     pshufb     xmm0, xmm4
685     pshufb     xmm1, xmm5
686     paddusb    xmm0, xmm1
687 
688     movq       qword ptr [edx], xmm0       // write 12 pixels
689     movhlps    xmm1, xmm0
690     movd       [edx + 8], xmm1
691     lea        edx, [edx + 12]
692     sub        ecx, 12
693     jg         xloop
694 
695     ret
696   }
697 }
698 
699 // Scale 16x3 pixels to 6x1 with interpolation
ScaleRowDown38_3_Box_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)700 __declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
701                                                   ptrdiff_t src_stride,
702                                                   uint8* dst_ptr,
703                                                   int dst_width) {
704   __asm {
705     push       esi
706     mov        eax, [esp + 4 + 4]  // src_ptr
707     mov        esi, [esp + 4 + 8]  // src_stride
708     mov        edx, [esp + 4 + 12]  // dst_ptr
709     mov        ecx, [esp + 4 + 16]  // dst_width
710     movdqa     xmm2, xmmword ptr kShufAc
711     movdqa     xmm3, xmmword ptr kShufAc3
712     movdqa     xmm4, xmmword ptr kScaleAc33
713     pxor       xmm5, xmm5
714 
715   xloop:
716     movdqu     xmm0, [eax]  // sum up 3 rows into xmm0/1
717     movdqu     xmm6, [eax + esi]
718     movhlps    xmm1, xmm0
719     movhlps    xmm7, xmm6
720     punpcklbw  xmm0, xmm5
721     punpcklbw  xmm1, xmm5
722     punpcklbw  xmm6, xmm5
723     punpcklbw  xmm7, xmm5
724     paddusw    xmm0, xmm6
725     paddusw    xmm1, xmm7
726     movdqu     xmm6, [eax + esi * 2]
727     lea        eax, [eax + 16]
728     movhlps    xmm7, xmm6
729     punpcklbw  xmm6, xmm5
730     punpcklbw  xmm7, xmm5
731     paddusw    xmm0, xmm6
732     paddusw    xmm1, xmm7
733 
734     movdqa     xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
735     psrldq     xmm0, 2
736     paddusw    xmm6, xmm0
737     psrldq     xmm0, 2
738     paddusw    xmm6, xmm0
739     pshufb     xmm6, xmm2
740 
741     movdqa     xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
742     psrldq     xmm1, 2
743     paddusw    xmm7, xmm1
744     psrldq     xmm1, 2
745     paddusw    xmm7, xmm1
746     pshufb     xmm7, xmm3
747     paddusw    xmm6, xmm7
748 
749     pmulhuw    xmm6, xmm4  // divide by 9,9,6, 9,9,6
750     packuswb   xmm6, xmm6
751 
752     movd       [edx], xmm6  // write 6 pixels
753     psrlq      xmm6, 16
754     movd       [edx + 2], xmm6
755     lea        edx, [edx + 6]
756     sub        ecx, 6
757     jg         xloop
758 
759     pop        esi
760     ret
761   }
762 }
763 
764 // Scale 16x2 pixels to 6x1 with interpolation
ScaleRowDown38_2_Box_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)765 __declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
766                                                   ptrdiff_t src_stride,
767                                                   uint8* dst_ptr,
768                                                   int dst_width) {
769   __asm {
770     push       esi
771     mov        eax, [esp + 4 + 4]  // src_ptr
772     mov        esi, [esp + 4 + 8]  // src_stride
773     mov        edx, [esp + 4 + 12]  // dst_ptr
774     mov        ecx, [esp + 4 + 16]  // dst_width
775     movdqa     xmm2, xmmword ptr kShufAb0
776     movdqa     xmm3, xmmword ptr kShufAb1
777     movdqa     xmm4, xmmword ptr kShufAb2
778     movdqa     xmm5, xmmword ptr kScaleAb2
779 
780   xloop:
781     movdqu     xmm0, [eax]  // average 2 rows into xmm0
782     movdqu     xmm1, [eax + esi]
783     lea        eax, [eax + 16]
784     pavgb      xmm0, xmm1
785 
786     movdqa     xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
787     pshufb     xmm1, xmm2
788     movdqa     xmm6, xmm0
789     pshufb     xmm6, xmm3
790     paddusw    xmm1, xmm6
791     pshufb     xmm0, xmm4
792     paddusw    xmm1, xmm0
793 
794     pmulhuw    xmm1, xmm5  // divide by 3,3,2, 3,3,2
795     packuswb   xmm1, xmm1
796 
797     movd       [edx], xmm1  // write 6 pixels
798     psrlq      xmm1, 16
799     movd       [edx + 2], xmm1
800     lea        edx, [edx + 6]
801     sub        ecx, 6
802     jg         xloop
803 
804     pop        esi
805     ret
806   }
807 }
808 
809 // Reads 16 bytes and accumulates to 16 shorts at a time.
ScaleAddRow_SSE2(const uint8 * src_ptr,uint16 * dst_ptr,int src_width)810 __declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
811                                         uint16* dst_ptr,
812                                         int src_width) {
813   __asm {
814     mov        eax, [esp + 4]  // src_ptr
815     mov        edx, [esp + 8]  // dst_ptr
816     mov        ecx, [esp + 12]  // src_width
817     pxor       xmm5, xmm5
818 
819     // sum rows
820   xloop:
821     movdqu     xmm3, [eax]  // read 16 bytes
822     lea        eax, [eax + 16]
823     movdqu     xmm0, [edx]  // read 16 words from destination
824     movdqu     xmm1, [edx + 16]
825     movdqa     xmm2, xmm3
826     punpcklbw  xmm2, xmm5
827     punpckhbw  xmm3, xmm5
828     paddusw    xmm0, xmm2  // sum 16 words
829     paddusw    xmm1, xmm3
830     movdqu     [edx], xmm0  // write 16 words to destination
831     movdqu     [edx + 16], xmm1
832     lea        edx, [edx + 32]
833     sub        ecx, 16
834     jg         xloop
835     ret
836   }
837 }
838 
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
// vpermq pre-swizzles so the lane-split vpunpck produces in-order words.
__declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
                                        uint16* dst_ptr,
                                        int src_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    mov         edx, [esp + 8]  // dst_ptr
    mov         ecx, [esp + 12]  // src_width
    vpxor       ymm5, ymm5, ymm5

    // sum rows
  xloop:
    vmovdqu     ymm3, [eax]  // read 32 bytes
    lea         eax, [eax + 32]
    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw  ymm2, ymm3, ymm5
    vpunpckhbw  ymm3, ymm3, ymm5
    vpaddusw    ymm0, ymm2, [edx]  // sum 16 words
    vpaddusw    ymm1, ymm3, [edx + 32]
    vmovdqu     [edx], ymm0  // write 32 words to destination
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 32
    jg          xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2
870 
// Constant subtracted from pixels to make them signed (-128..127) so that
// pmaddubsw cannot saturate (its signed operand stays in range).
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant added after pmaddubsw to undo the 0x80 bias (0x80 << 7 = 0x4000)
// and add .5 (0x40) in 8.7 fixed point for rounding: 0x4000 + 0x40 = 0x4040.
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                         0x4040, 0x4040, 0x4040, 0x4040};
879 
// Bilinear column filtering. SSSE3 version.
// Writes dst_width bytes, sampling src_ptr at positions given by x (16.16
// fixed point start) advancing by dx (16.16 fixed point step).  Each output
// pixel blends the two neighboring source pixels using the top 7 bits of the
// fraction.  Processes 2 pixels per loop iteration plus a 1-pixel remainder.
// Uses the kFsub80/kFadd40 signed-bias trick so pmaddubsw cannot saturate.
__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
                                             const uint8* src_ptr,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]  // dst_ptr
    mov        esi, [esp + 12 + 8]  // src_ptr
    mov        ecx, [esp + 12 + 12]  // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pcmpeqb    xmm7, xmm7  // generate 0x0001
    psrlw      xmm7, 15
    pextrw     eax, xmm2, 1  // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29  // fewer than 2 pixels: skip to remainder

    movdqa     xmm0, xmm2  // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0  // x0 x1
    punpckldq  xmm3, xmm3  // dx dx
    paddd      xmm3, xmm3  // dx * 2, dx * 2
    pextrw     edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2  // x0, x1 fractions.
    paddd      xmm2, xmm3  // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9  // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5  // 0011 - duplicate fractions next to their pixels
    punpcklwd  xmm0, xmm4
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm1, xmm6  // 0..7f and 7f..0 - inverse fraction for 1st pixel
    paddusb    xmm1, xmm7  // +1 so 0..7f and 80..1 - weights sum to 128
    pmaddubsw  xmm1, xmm0  // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm1, 7  // 8.7 fixed point to low 8 bits.
    packuswb   xmm1, xmm1  // 8 bits, 2 pixels.
    movd       ebx, xmm1
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2  // 2 pixels
    jge        xloop2

 xloop29:
    add        ecx, 2 - 1  // restore count; test for 1 remaining pixel
    jl         xloop99

        // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9  // 7 bit fractions.
    pshufb     xmm2, xmm5  // 0011
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm2, xmm6  // 0..7f and 7f..0
    paddusb    xmm2, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw  xmm2, xmm0  // 16 bit
    paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm2, 7  // 8.7 fixed point to low 8 bits.
    packuswb   xmm2, xmm2  // 8 bits
    movd       ebx, xmm2
    mov        [edi], bl

 xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
965 
// Reads 16 pixels, duplicates them and writes 32 pixels (2x point upscale).
// x and dx are accepted for signature compatibility but unused.
// dst_width is assumed to be a multiple of 32.
__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr,
                                         const uint8* src_ptr,
                                         int dst_width,
                                         int x,
                                         int dx) {
  __asm {
    mov        edx, [esp + 4]  // dst_ptr
    mov        eax, [esp + 8]  // src_ptr
    mov        ecx, [esp + 12]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0  // duplicate low 8 bytes: a a b b ...
    punpckhbw  xmm1, xmm1  // duplicate high 8 bytes
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    ret
  }
}
992 
// Reads 8 ARGB pixels, throws half away and writes 4 pixels.
// The 0xdd shuffle selects the odd-indexed dwords (pixels 1, 3, 5, 7).
// dst_width is assumed to be a multiple of 4.
__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                                              ptrdiff_t src_stride,
                                              uint8* dst_argb,
                                              int dst_width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_argb
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd  // keep odd pixels (dwords 1,3 of each reg)
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}
1017 
// Blends 8x1 ARGB rectangle to 4x1: averages each even/odd pixel pair
// horizontally.  src_stride is ignored (single-row filter).
// dst_width is assumed to be a multiple of 4.
__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                                    ptrdiff_t src_stride,
                                                    uint8* dst_argb,
                                                    int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_argb
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2  // rounded average of each pair
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}
1045 
// Blends 8x2 ARGB rectangle to 4x1: averages two rows, then each horizontal
// pixel pair (2x2 box filter).  src_stride is the byte offset to the second
// row.  dst_width is assumed to be a multiple of 4.
__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                                 ptrdiff_t src_stride,
                                                 uint8* dst_argb,
                                                 int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]  // second row
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2  // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        esi
    ret
  }
}
1079 
// Point-samples 4 ARGB pixels at a time, stepping src_stepx pixels between
// samples.  src_stride is ignored.  dst_width is assumed to be a multiple
// of 4.
__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
                                                 ptrdiff_t src_stride,
                                                 int src_stepx,
                                                 uint8* dst_argb,
                                                 int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    // src_stride ignored
    mov        ebx, [esp + 8 + 12]  // src_stepx
    mov        edx, [esp + 8 + 16]  // dst_argb
    mov        ecx, [esp + 8 + 20]  // dst_width
    lea        ebx, [ebx * 4]  // step in bytes (4 bytes per ARGB pixel)
    lea        edi, [ebx + ebx * 2]  // 3 * step, for the 4th sample

  wloop:
    movd       xmm0, [eax]  // sample 4 pixels at 0, step, 2*step, 3*step
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2  // pack 4 sampled pixels into one register
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}
1116 
// Blends four 2x2 ARGB blocks to 4x1: for each output pixel, averages a
// horizontal pair from row0 with the matching pair from row1 (at
// src_stride), stepping src_stepx pixels between blocks.
// dst_width is assumed to be a multiple of 4.
__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                                    ptrdiff_t src_stride,
                                                    int src_stepx,
                                                    uint8* dst_argb,
                                                    int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]  // src_argb
    mov        esi, [esp + 12 + 8]  // src_stride
    mov        ebx, [esp + 12 + 12]  // src_stepx
    mov        edx, [esp + 12 + 16]  // dst_argb
    mov        ecx, [esp + 12 + 20]  // dst_width
    lea        esi, [eax + esi]  // row1 pointer
    lea        ebx, [ebx * 4]  // step in bytes (4 bytes per ARGB pixel)
    lea        edi, [ebx + ebx * 2]  // 3 * step, for the 4th block

  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2  // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
1164 
// Column scaling unfiltered (point sampling). SSE2 version.
// Writes dst_width ARGB pixels, sampling src_argb at positions given by x
// (16.16 fixed point start) advancing by dx (16.16 fixed point step).
// Processes 4 pixels per loop, then handles 2- and 1-pixel remainders.
__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
                                          const uint8* src_argb,
                                          int dst_width,
                                          int x,
                                          int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]  // dst_argb
    mov        esi, [esp + 8 + 8]  // src_argb
    mov        ecx, [esp + 8 + 12]  // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    // Build xmm2 = [x3 x2 x1 x0] and xmm3 = dx*4 broadcast.
    pshufd     xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11  // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 2
    pshufd     xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0  // x3 x2 x1 x0
    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 4
    pshufd     xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1  // get x0 integer.
    pextrw     edx, xmm2, 3  // get x1 integer.

    cmp        ecx, 0
    jle        xloop99  // nothing to do
    sub        ecx, 4
    jl         xloop49  // fewer than 4 pixels: handle remainders

    // 4 Pixel loop.
 xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5  // get x2 integer.
    pextrw     edx, xmm2, 7  // get x3 integer.
    paddd      xmm2, xmm3  // x += dx
    punpckldq  xmm0, xmm1  // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4  // 4 pixels
    jge        xloop4

 xloop49:
    test       ecx, 2
    je         xloop29

    // 2 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5  // get x2 integer.
    punpckldq  xmm0, xmm1  // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

 xloop29:
    test       ecx, 1
    je         xloop99

    // 1 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
 xloop99:

    pop        esi
    pop        edi
    ret
  }
}
1244 
// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table interleaving 2 ARGB pixels channel-by-channel so each
// byte pair (b0 b1, g0 g1, ...) lines up for pmaddubsw weighting.
static uvec8 kShuffleColARGB = {
    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table broadcasting 2 fraction bytes into 8 bytes each
// (one fraction per output pixel, replicated across its 8 channel bytes).
static uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
1258 
ScaleARGBFilterCols_SSSE3(uint8 * dst_argb,const uint8 * src_argb,int dst_width,int x,int dx)1259 __declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
1260                                                  const uint8* src_argb,
1261                                                  int dst_width,
1262                                                  int x,
1263                                                  int dx) {
1264   __asm {
1265     push       esi
1266     push       edi
1267     mov        edi, [esp + 8 + 4]  // dst_argb
1268     mov        esi, [esp + 8 + 8]  // src_argb
1269     mov        ecx, [esp + 8 + 12]  // dst_width
1270     movd       xmm2, [esp + 8 + 16]  // x
1271     movd       xmm3, [esp + 8 + 20]  // dx
1272     movdqa     xmm4, xmmword ptr kShuffleColARGB
1273     movdqa     xmm5, xmmword ptr kShuffleFractions
1274     pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
1275     psrlw      xmm6, 9
1276     pextrw     eax, xmm2, 1  // get x0 integer. preroll
1277     sub        ecx, 2
1278     jl         xloop29
1279 
1280     movdqa     xmm0, xmm2  // x1 = x0 + dx
1281     paddd      xmm0, xmm3
1282     punpckldq  xmm2, xmm0  // x0 x1
1283     punpckldq  xmm3, xmm3  // dx dx
1284     paddd      xmm3, xmm3  // dx * 2, dx * 2
1285     pextrw     edx, xmm2, 3  // get x1 integer. preroll
1286 
1287     // 2 Pixel loop.
1288   xloop2:
1289     movdqa     xmm1, xmm2  // x0, x1 fractions.
1290     paddd      xmm2, xmm3  // x += dx
1291     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
1292     psrlw      xmm1, 9  // 7 bit fractions.
1293     movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
1294     pshufb     xmm1, xmm5  // 0000000011111111
1295     pshufb     xmm0, xmm4  // arrange pixels into pairs
1296     pxor       xmm1, xmm6  // 0..7f and 7f..0
1297     pmaddubsw  xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
1298     pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
1299     pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
1300     psrlw      xmm0, 7  // argb 8.7 fixed point to low 8 bits.
1301     packuswb   xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
1302     movq       qword ptr [edi], xmm0
1303     lea        edi, [edi + 8]
1304     sub        ecx, 2  // 2 pixels
1305     jge        xloop2
1306 
1307  xloop29:
1308 
1309     add        ecx, 2 - 1
1310     jl         xloop99
1311 
1312         // 1 pixel remainder
1313     psrlw      xmm2, 9  // 7 bit fractions.
1314     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
1315     pshufb     xmm2, xmm5  // 00000000
1316     pshufb     xmm0, xmm4  // arrange pixels into pairs
1317     pxor       xmm2, xmm6  // 0..7f and 7f..0
1318     pmaddubsw  xmm0, xmm2  // argb 16 bit, 1 pixel.
1319     psrlw      xmm0, 7
1320     packuswb   xmm0, xmm0  // argb 8 bits, 1 pixel.
1321     movd       [edi], xmm0
1322 
1323  xloop99:
1324 
1325     pop        edi
1326     pop        esi
1327     ret
1328   }
1329 }
1330 
// Reads 4 ARGB pixels, duplicates them and writes 8 pixels (2x point
// upscale).  x and dx are accepted for signature compatibility but unused.
// dst_width is assumed to be a multiple of 8.
__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
                                             const uint8* src_argb,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    mov        edx, [esp + 4]  // dst_argb
    mov        eax, [esp + 8]  // src_argb
    mov        ecx, [esp + 12]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0  // duplicate low 2 pixels: a a b b
    punpckhdq  xmm1, xmm1  // duplicate high 2 pixels
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop

    ret
  }
}
1357 
// Divide num by div and return as 16.16 fixed point result,
// i.e. (num << 16) / div using a full 64/32-bit signed divide.
// Behavior is undefined if div is 0 or the quotient overflows 32 bits
// (idiv faults), matching x86 division semantics.
__declspec(naked) int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld       edx, eax, 16  // 32.16
    shl        eax, 16  // edx:eax = num << 16 as signed 64-bit
    idiv       dword ptr [esp + 8]  // quotient returned in eax
    ret
  }
}
1369 
// Divide num by div and return as 16.16 fixed point result, biased variant:
// computes ((num << 16) - 0x00010001) / (div - 1).
// Behavior is undefined if div is 1 or the quotient overflows 32 bits
// (idiv faults), matching x86 division semantics.
__declspec(naked) int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]  // num
    mov        ecx, [esp + 8]  // denom
    cdq  // extend num to 64 bits
    shld       edx, eax, 16  // 32.16
    shl        eax, 16  // edx:eax = num << 16
    sub        eax, 0x00010001  // 64-bit subtract of bias...
    sbb        edx, 0  // ...propagate borrow into high dword
    sub        ecx, 1  // denom - 1
    idiv       ecx  // quotient returned in eax
    ret
  }
}
1385 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
1386 
1387 #ifdef __cplusplus
1388 }  // extern "C"
1389 }  // namespace libyuv
1390 #endif
1391