• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 #include "libyuv/scale_row.h"
13 
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18 
19 // This module is for Visual C x86.
20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
21     defined(_MSC_VER) && !defined(__clang__)
22 
// Shuffle/multiply constants for the SSSE3 down-scalers below.
// NOTE(review): in pshufb masks, index 128 (0x80, high bit set) writes a
// zero byte to that destination lane.
23 // Offsets for source bytes 0 to 9
24 static uvec8 kShuf0 =
25   { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
26 
27 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
28 static uvec8 kShuf1 =
29   { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
30 
31 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
32 static uvec8 kShuf2 =
33   { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
34 
35 // Offsets for source bytes 0 to 10
36 static uvec8 kShuf01 =
37   { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
38 
39 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
40 static uvec8 kShuf11 =
41   { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
42 
43 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
44 static uvec8 kShuf21 =
45   { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
46 
47 // Coefficients for source bytes 0 to 10
48 static uvec8 kMadd01 =
49   { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
50 
51 // Coefficients for source bytes 10 to 21
52 static uvec8 kMadd11 =
53   { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
54 
55 // Coefficients for source bytes 21 to 31
56 static uvec8 kMadd21 =
57   { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
58 
59 // Rounding constant: +2 added before the >>2 in the 3/4 box filters.
60 static vec16 kRound34 =
61   { 2, 2, 2, 2, 2, 2, 2, 2 };
62 
63 static uvec8 kShuf38a =
64   { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
65 
66 static uvec8 kShuf38b =
67   { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
68 
69 // Arrange words 0,3,6 into 0,1,2
70 static uvec8 kShufAc =
71   { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
72 
73 // Arrange words 0,3,6 into 3,4,5
74 static uvec8 kShufAc3 =
75   { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
76 
77 // Scaling values for boxes of 3x3 and 2x3
78 static uvec16 kScaleAc33 =
79   { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
80 
81 // Arrange first value for pixels 0,1,2,3,4,5
82 static uvec8 kShufAb0 =
83   { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
84 
85 // Arrange second value for pixels 0,1,2,3,4,5
86 static uvec8 kShufAb1 =
87   { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
88 
89 // Arrange third value for pixels 0,1,2,3,4,5
90 static uvec8 kShufAb2 =
91   { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
92 
93 // Scaling values for boxes of 3x2 and 2x2
94 static uvec16 kScaleAb2 =
95   { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
96 
97 // Reads 32 pixels, throws half away and writes 16 pixels.
// Naked cdecl: args are read at fixed [esp] offsets; the compiler emits no
// prologue/epilogue, so only caller-saved eax/ecx/edx are used.
98 __declspec(naked)
ScaleRowDown2_SSE2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)99 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
100                         uint8* dst_ptr, int dst_width) {
101   __asm {
102     mov        eax, [esp + 4]        // src_ptr
103                                      // src_stride ignored
104     mov        edx, [esp + 12]       // dst_ptr
105     mov        ecx, [esp + 16]       // dst_width
106 
107   wloop:
108     movdqu     xmm0, [eax]           // load 32 source pixels
109     movdqu     xmm1, [eax + 16]
110     lea        eax,  [eax + 32]
111     psrlw      xmm0, 8               // isolate odd pixels.
112     psrlw      xmm1, 8
113     packuswb   xmm0, xmm1            // pack 16 words back to 16 bytes
114     movdqu     [edx], xmm0
115     lea        edx, [edx + 16]
116     sub        ecx, 16               // 16 output pixels per iteration
117     jg         wloop
118 
119     ret
120   }
121 }
122 
123 // Blends 32x1 rectangle to 16x1.
// Naked cdecl: args at fixed [esp] offsets; only caller-saved registers used.
124 __declspec(naked)
ScaleRowDown2Linear_SSE2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)125 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
126                               uint8* dst_ptr, int dst_width) {
127   __asm {
128     mov        eax, [esp + 4]        // src_ptr
129                                      // src_stride
130     mov        edx, [esp + 12]       // dst_ptr
131     mov        ecx, [esp + 16]       // dst_width
132     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
133     psrlw      xmm5, 8
134 
135   wloop:
136     movdqu     xmm0, [eax]
137     movdqu     xmm1, [eax + 16]
138     lea        eax,  [eax + 32]
139 
140     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
141     psrlw      xmm0, 8               // xmm0/1 = odd bytes as words
142     movdqa     xmm3, xmm1
143     psrlw      xmm1, 8
144     pand       xmm2, xmm5            // xmm2/3 = even bytes as words
145     pand       xmm3, xmm5
146     pavgw      xmm0, xmm2            // rounded average of even/odd pairs
147     pavgw      xmm1, xmm3
148     packuswb   xmm0, xmm1
149 
150     movdqu     [edx], xmm0
151     lea        edx, [edx + 16]
152     sub        ecx, 16
153     jg         wloop
154 
155     ret
156   }
157 }
158 
159 // Blends 32x2 rectangle to 16x1.
// Naked cdecl; esi (callee-saved) is pushed, so arg offsets are +4.
160 __declspec(naked)
ScaleRowDown2Box_SSE2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)161 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
162                            uint8* dst_ptr, int dst_width) {
163   __asm {
164     push       esi
165     mov        eax, [esp + 4 + 4]    // src_ptr
166     mov        esi, [esp + 4 + 8]    // src_stride
167     mov        edx, [esp + 4 + 12]   // dst_ptr
168     mov        ecx, [esp + 4 + 16]   // dst_width
169     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
170     psrlw      xmm5, 8
171 
172   wloop:
173     movdqu     xmm0, [eax]
174     movdqu     xmm1, [eax + 16]
175     movdqu     xmm2, [eax + esi]     // second row at src_ptr + src_stride
176     movdqu     xmm3, [eax + esi + 16]
177     lea        eax,  [eax + 32]
178     pavgb      xmm0, xmm2            // average rows
179     pavgb      xmm1, xmm3
180 
181     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
182     psrlw      xmm0, 8
183     movdqa     xmm3, xmm1
184     psrlw      xmm1, 8
185     pand       xmm2, xmm5
186     pand       xmm3, xmm5
187     pavgw      xmm0, xmm2
188     pavgw      xmm1, xmm3
189     packuswb   xmm0, xmm1
190 
191     movdqu     [edx], xmm0
192     lea        edx, [edx + 16]
193     sub        ecx, 16
194     jg         wloop
195 
196     pop        esi
197     ret
198   }
199 }
200 
201 #ifdef HAS_SCALEROWDOWN2_AVX2
202 // Reads 64 pixels, throws half away and writes 32 pixels.
// Naked cdecl; vzeroupper before ret avoids AVX->SSE transition penalties.
203 __declspec(naked)
ScaleRowDown2_AVX2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)204 void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
205                         uint8* dst_ptr, int dst_width) {
206   __asm {
207     mov        eax, [esp + 4]        // src_ptr
208                                      // src_stride ignored
209     mov        edx, [esp + 12]       // dst_ptr
210     mov        ecx, [esp + 16]       // dst_width
211 
212   wloop:
213     vmovdqu     ymm0, [eax]
214     vmovdqu     ymm1, [eax + 32]
215     lea         eax,  [eax + 64]
216     vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
217     vpsrlw      ymm1, ymm1, 8
218     vpackuswb   ymm0, ymm0, ymm1     // packs per 128-bit lane...
219     vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
220     vmovdqu     [edx], ymm0
221     lea         edx, [edx + 32]
222     sub         ecx, 32
223     jg          wloop
224 
225     vzeroupper
226     ret
227   }
228 }
229 
230 // Blends 64x1 rectangle to 32x1.
// Naked cdecl. Uses pmaddubsw with an all-ones byte vector to sum adjacent
// byte pairs, then pavgw with zero to compute (sum + 1) / 2.
231 __declspec(naked)
ScaleRowDown2Linear_AVX2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)232 void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
233                               uint8* dst_ptr, int dst_width) {
234   __asm {
235     mov         eax, [esp + 4]        // src_ptr
236                                       // src_stride
237     mov         edx, [esp + 12]       // dst_ptr
238     mov         ecx, [esp + 16]       // dst_width
239 
240     vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
241     vpsrlw      ymm4, ymm4, 15
242     vpackuswb   ymm4, ymm4, ymm4
243     vpxor       ymm5, ymm5, ymm5      // constant 0
244 
245   wloop:
246     vmovdqu     ymm0, [eax]
247     vmovdqu     ymm1, [eax + 32]
248     lea         eax,  [eax + 64]
249 
250     vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
251     vpmaddubsw  ymm1, ymm1, ymm4
252     vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
253     vpavgw      ymm1, ymm1, ymm5
254     vpackuswb   ymm0, ymm0, ymm1
255     vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
256 
257     vmovdqu     [edx], ymm0
258     lea         edx, [edx + 32]
259     sub         ecx, 32
260     jg          wloop
261 
262     vzeroupper
263     ret
264   }
265 }
266 
267 // Blends 64x2 rectangle to 32x1.
// Naked cdecl; esi pushed, so arg offsets are +4. Rows averaged with pavgb,
// columns with pmaddubsw(1,1) + pavgw(0) for rounded (a+b+1)/2.
268 __declspec(naked)
ScaleRowDown2Box_AVX2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)269 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
270                            uint8* dst_ptr, int dst_width) {
271   __asm {
272     push        esi
273     mov         eax, [esp + 4 + 4]    // src_ptr
274     mov         esi, [esp + 4 + 8]    // src_stride
275     mov         edx, [esp + 4 + 12]   // dst_ptr
276     mov         ecx, [esp + 4 + 16]   // dst_width
277 
278     vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
279     vpsrlw      ymm4, ymm4, 15
280     vpackuswb   ymm4, ymm4, ymm4
281     vpxor       ymm5, ymm5, ymm5      // constant 0
282 
283   wloop:
284     vmovdqu     ymm0, [eax]           // average rows
285     vmovdqu     ymm1, [eax + 32]
286     vpavgb      ymm0, ymm0, [eax + esi]
287     vpavgb      ymm1, ymm1, [eax + esi + 32]
288     lea         eax,  [eax + 64]
289 
290     vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
291     vpmaddubsw  ymm1, ymm1, ymm4
292     vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
293     vpavgw      ymm1, ymm1, ymm5
294     vpackuswb   ymm0, ymm0, ymm1
295     vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
296 
297     vmovdqu     [edx], ymm0
298     lea         edx, [edx + 32]
299     sub         ecx, 32
300     jg          wloop
301 
302     pop         esi
303     vzeroupper
304     ret
305   }
306 }
307 #endif  // HAS_SCALEROWDOWN2_AVX2
308 
309 // Point samples 32 pixels to 8 pixels.
// Naked cdecl. Keeps every 4th byte (byte 2 of each dword) via mask+pack.
310 __declspec(naked)
ScaleRowDown4_SSE2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)311 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
312                         uint8* dst_ptr, int dst_width) {
313   __asm {
314     mov        eax, [esp + 4]        // src_ptr
315                                      // src_stride ignored
316     mov        edx, [esp + 12]       // dst_ptr
317     mov        ecx, [esp + 16]       // dst_width
318     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
319     psrld      xmm5, 24
320     pslld      xmm5, 16
321 
322   wloop:
323     movdqu     xmm0, [eax]
324     movdqu     xmm1, [eax + 16]
325     lea        eax,  [eax + 32]
326     pand       xmm0, xmm5            // keep byte 2 of each dword
327     pand       xmm1, xmm5
328     packuswb   xmm0, xmm1
329     psrlw      xmm0, 8
330     packuswb   xmm0, xmm0
331     movq       qword ptr [edx], xmm0 // 8 output pixels
332     lea        edx, [edx + 8]
333     sub        ecx, 8
334     jg         wloop
335 
336     ret
337   }
338 }
339 
340 // Blends 32x4 rectangle to 8x1.
// Naked cdecl; esi and edi pushed, so arg offsets are +8. Averages 4 rows
// pairwise with pavgb, then averages columns twice (32->16->8 pixels).
341 __declspec(naked)
ScaleRowDown4Box_SSE2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)342 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
343                            uint8* dst_ptr, int dst_width) {
344   __asm {
345     push       esi
346     push       edi
347     mov        eax, [esp + 8 + 4]    // src_ptr
348     mov        esi, [esp + 8 + 8]    // src_stride
349     mov        edx, [esp + 8 + 12]   // dst_ptr
350     mov        ecx, [esp + 8 + 16]   // dst_width
351     lea        edi, [esi + esi * 2]  // src_stride * 3
352     pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
353     psrlw      xmm7, 8
354 
355   wloop:
356     movdqu     xmm0, [eax]           // average rows
357     movdqu     xmm1, [eax + 16]
358     movdqu     xmm2, [eax + esi]
359     movdqu     xmm3, [eax + esi + 16]
360     pavgb      xmm0, xmm2            // rows 0+1
361     pavgb      xmm1, xmm3
362     movdqu     xmm2, [eax + esi * 2]
363     movdqu     xmm3, [eax + esi * 2 + 16]
364     movdqu     xmm4, [eax + edi]
365     movdqu     xmm5, [eax + edi + 16]
366     lea        eax, [eax + 32]
367     pavgb      xmm2, xmm4            // rows 2+3
368     pavgb      xmm3, xmm5
369     pavgb      xmm0, xmm2            // (rows01 + rows23) / 2
370     pavgb      xmm1, xmm3
371 
372     movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
373     psrlw      xmm0, 8
374     movdqa     xmm3, xmm1
375     psrlw      xmm1, 8
376     pand       xmm2, xmm7
377     pand       xmm3, xmm7
378     pavgw      xmm0, xmm2
379     pavgw      xmm1, xmm3
380     packuswb   xmm0, xmm1
381 
382     movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
383     psrlw      xmm0, 8
384     pand       xmm2, xmm7
385     pavgw      xmm0, xmm2
386     packuswb   xmm0, xmm0
387 
388     movq       qword ptr [edx], xmm0
389     lea        edx, [edx + 8]
390     sub        ecx, 8
391     jg         wloop
392 
393     pop        edi
394     pop        esi
395     ret
396   }
397 }
398 
399 #ifdef HAS_SCALEROWDOWN4_AVX2
400 // Point samples 64 pixels to 16 pixels.
// Naked cdecl. Keeps byte 2 of each dword; two pack steps each need a
// vpermq to undo the per-lane ordering of vpackuswb.
401 __declspec(naked)
ScaleRowDown4_AVX2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)402 void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
403                         uint8* dst_ptr, int dst_width) {
404   __asm {
405     mov         eax, [esp + 4]        // src_ptr
406                                       // src_stride ignored
407     mov         edx, [esp + 12]       // dst_ptr
408     mov         ecx, [esp + 16]       // dst_width
409     vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
410     vpsrld      ymm5, ymm5, 24
411     vpslld      ymm5, ymm5, 16
412 
413   wloop:
414     vmovdqu     ymm0, [eax]
415     vmovdqu     ymm1, [eax + 32]
416     lea         eax,  [eax + 64]
417     vpand       ymm0, ymm0, ymm5
418     vpand       ymm1, ymm1, ymm5
419     vpackuswb   ymm0, ymm0, ymm1
420     vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
421     vpsrlw      ymm0, ymm0, 8
422     vpackuswb   ymm0, ymm0, ymm0
423     vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
424     vmovdqu     [edx], xmm0           // 16 output pixels (low 128 bits)
425     lea         edx, [edx + 16]
426     sub         ecx, 16
427     jg          wloop
428 
429     vzeroupper
430     ret
431   }
432 }
433 
434 // Blends 64x4 rectangle to 16x1.
// Naked cdecl; esi/edi pushed, so arg offsets are +8. Four rows averaged
// pairwise, then columns averaged twice (64->32->16 pixels).
435 __declspec(naked)
ScaleRowDown4Box_AVX2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)436 void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
437                            uint8* dst_ptr, int dst_width) {
438   __asm {
439     push        esi
440     push        edi
441     mov         eax, [esp + 8 + 4]    // src_ptr
442     mov         esi, [esp + 8 + 8]    // src_stride
443     mov         edx, [esp + 8 + 12]   // dst_ptr
444     mov         ecx, [esp + 8 + 16]   // dst_width
445     lea         edi, [esi + esi * 2]  // src_stride * 3
446     vpcmpeqb    ymm7, ymm7, ymm7      // generate mask 0x00ff00ff
447     vpsrlw      ymm7, ymm7, 8
448 
449   wloop:
450     vmovdqu     ymm0, [eax]           // average rows
451     vmovdqu     ymm1, [eax + 32]
452     vpavgb      ymm0, ymm0, [eax + esi]
453     vpavgb      ymm1, ymm1, [eax + esi + 32]
454     vmovdqu     ymm2, [eax + esi * 2]
455     vmovdqu     ymm3, [eax + esi * 2 + 32]
456     vpavgb      ymm2, ymm2, [eax + edi]
457     vpavgb      ymm3, ymm3, [eax + edi + 32]
458     lea         eax, [eax + 64]
459     vpavgb      ymm0, ymm0, ymm2
460     vpavgb      ymm1, ymm1, ymm3
461 
462     vpand       ymm2, ymm0, ymm7      // average columns (64 to 32 pixels)
463     vpand       ymm3, ymm1, ymm7
464     vpsrlw      ymm0, ymm0, 8
465     vpsrlw      ymm1, ymm1, 8
466     vpavgw      ymm0, ymm0, ymm2
467     vpavgw      ymm1, ymm1, ymm3
468     vpackuswb   ymm0, ymm0, ymm1
469     vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
470 
471     vpand       ymm2, ymm0, ymm7      // average columns (32 to 16 pixels)
472     vpsrlw      ymm0, ymm0, 8
473     vpavgw      ymm0, ymm0, ymm2
474     vpackuswb   ymm0, ymm0, ymm0
475     vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
476 
477     vmovdqu     [edx], xmm0           // 16 output pixels (low 128 bits)
478     lea         edx, [edx + 16]
479     sub         ecx, 16
480     jg          wloop
481 
482     pop        edi
483     pop        esi
484     vzeroupper
485     ret
486   }
487 }
488 #endif  // HAS_SCALEROWDOWN4_AVX2
489 
490 // Point samples 32 pixels to 24 pixels.
491 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
492 // Then shuffled to do the scaling.
493 
// Naked cdecl. palignr forms [eax+8..23] so each pshufb sees a fresh
// 16-byte window; kShuf0/1/2 pick the 3-of-4 sample pattern.
494 __declspec(naked)
ScaleRowDown34_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)495 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
496                           uint8* dst_ptr, int dst_width) {
497   __asm {
498     mov        eax, [esp + 4]        // src_ptr
499                                      // src_stride ignored
500     mov        edx, [esp + 12]       // dst_ptr
501     mov        ecx, [esp + 16]       // dst_width
502     movdqa     xmm3, kShuf0
503     movdqa     xmm4, kShuf1
504     movdqa     xmm5, kShuf2
505 
506   wloop:
507     movdqu     xmm0, [eax]
508     movdqu     xmm1, [eax + 16]
509     lea        eax,  [eax + 32]
510     movdqa     xmm2, xmm1
511     palignr    xmm1, xmm0, 8         // xmm1 = source bytes 8..23
512     pshufb     xmm0, xmm3
513     pshufb     xmm1, xmm4
514     pshufb     xmm2, xmm5
515     movq       qword ptr [edx], xmm0
516     movq       qword ptr [edx + 8], xmm1
517     movq       qword ptr [edx + 16], xmm2
518     lea        edx, [edx + 24]
519     sub        ecx, 24
520     jg         wloop
521 
522     ret
523   }
524 }
525 
526 // Blends 32x2 rectangle to 24x1
527 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
528 // Then shuffled to do the scaling.
529 
530 // Register usage:
531 // xmm0 src_row 0
532 // xmm1 src_row 1
533 // xmm2 shuf 0
534 // xmm3 shuf 1
535 // xmm4 shuf 2
536 // xmm5 madd 0
537 // xmm6 madd 1
538 // xmm7 kRound34
539 
540 // Note that movdqa+palign may be better than movdqu.
// Naked cdecl; esi pushed, so arg offsets are +4. Rows are blended 1:1
// with pavgb; each 8-pixel group is shuffled, weighted with pmaddubsw,
// rounded (+2) and divided by 4.
541 __declspec(naked)
ScaleRowDown34_1_Box_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)542 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
543                                 ptrdiff_t src_stride,
544                                 uint8* dst_ptr, int dst_width) {
545   __asm {
546     push       esi
547     mov        eax, [esp + 4 + 4]    // src_ptr
548     mov        esi, [esp + 4 + 8]    // src_stride
549     mov        edx, [esp + 4 + 12]   // dst_ptr
550     mov        ecx, [esp + 4 + 16]   // dst_width
551     movdqa     xmm2, kShuf01
552     movdqa     xmm3, kShuf11
553     movdqa     xmm4, kShuf21
554     movdqa     xmm5, kMadd01
555     movdqa     xmm6, kMadd11
556     movdqa     xmm7, kRound34
557 
558   wloop:
559     movdqu     xmm0, [eax]           // pixels 0..7
560     movdqu     xmm1, [eax + esi]
561     pavgb      xmm0, xmm1            // blend 2 rows evenly
562     pshufb     xmm0, xmm2
563     pmaddubsw  xmm0, xmm5
564     paddsw     xmm0, xmm7            // + kRound34 for rounding
565     psrlw      xmm0, 2               // / 4
566     packuswb   xmm0, xmm0
567     movq       qword ptr [edx], xmm0
568     movdqu     xmm0, [eax + 8]       // pixels 8..15
569     movdqu     xmm1, [eax + esi + 8]
570     pavgb      xmm0, xmm1
571     pshufb     xmm0, xmm3
572     pmaddubsw  xmm0, xmm6
573     paddsw     xmm0, xmm7
574     psrlw      xmm0, 2
575     packuswb   xmm0, xmm0
576     movq       qword ptr [edx + 8], xmm0
577     movdqu     xmm0, [eax + 16]      // pixels 16..23
578     movdqu     xmm1, [eax + esi + 16]
579     lea        eax, [eax + 32]
580     pavgb      xmm0, xmm1
581     pshufb     xmm0, xmm4
582     movdqa     xmm1, kMadd21         // no free xmm reg; load per iteration
583     pmaddubsw  xmm0, xmm1
584     paddsw     xmm0, xmm7
585     psrlw      xmm0, 2
586     packuswb   xmm0, xmm0
587     movq       qword ptr [edx + 16], xmm0
588     lea        edx, [edx + 24]
589     sub        ecx, 24
590     jg         wloop
591 
592     pop        esi
593     ret
594   }
595 }
596 
597 // Note that movdqa+palign may be better than movdqu.
// Same as ScaleRowDown34_1_Box_SSSE3, but rows are blended 3:1 instead of
// 1:1: pavgb(r1,r0) then pavgb(r0,that) approximates (3*row0 + row1) / 4.
// Naked cdecl; esi pushed, so arg offsets are +4.
598 __declspec(naked)
ScaleRowDown34_0_Box_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)599 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
600                                 ptrdiff_t src_stride,
601                                 uint8* dst_ptr, int dst_width) {
602   __asm {
603     push       esi
604     mov        eax, [esp + 4 + 4]    // src_ptr
605     mov        esi, [esp + 4 + 8]    // src_stride
606     mov        edx, [esp + 4 + 12]   // dst_ptr
607     mov        ecx, [esp + 4 + 16]   // dst_width
608     movdqa     xmm2, kShuf01
609     movdqa     xmm3, kShuf11
610     movdqa     xmm4, kShuf21
611     movdqa     xmm5, kMadd01
612     movdqa     xmm6, kMadd11
613     movdqa     xmm7, kRound34
614 
615   wloop:
616     movdqu     xmm0, [eax]           // pixels 0..7
617     movdqu     xmm1, [eax + esi]
618     pavgb      xmm1, xmm0            // 3:1 row blend (two averages)
619     pavgb      xmm0, xmm1
620     pshufb     xmm0, xmm2
621     pmaddubsw  xmm0, xmm5
622     paddsw     xmm0, xmm7
623     psrlw      xmm0, 2
624     packuswb   xmm0, xmm0
625     movq       qword ptr [edx], xmm0
626     movdqu     xmm0, [eax + 8]       // pixels 8..15
627     movdqu     xmm1, [eax + esi + 8]
628     pavgb      xmm1, xmm0
629     pavgb      xmm0, xmm1
630     pshufb     xmm0, xmm3
631     pmaddubsw  xmm0, xmm6
632     paddsw     xmm0, xmm7
633     psrlw      xmm0, 2
634     packuswb   xmm0, xmm0
635     movq       qword ptr [edx + 8], xmm0
636     movdqu     xmm0, [eax + 16]      // pixels 16..23
637     movdqu     xmm1, [eax + esi + 16]
638     lea        eax, [eax + 32]
639     pavgb      xmm1, xmm0
640     pavgb      xmm0, xmm1
641     pshufb     xmm0, xmm4
642     movdqa     xmm1, kMadd21
643     pmaddubsw  xmm0, xmm1
644     paddsw     xmm0, xmm7
645     psrlw      xmm0, 2
646     packuswb   xmm0, xmm0
647     movq       qword ptr [edx + 16], xmm0
648     lea        edx, [edx+24]
649     sub        ecx, 24
650     jg         wloop
651 
652     pop        esi
653     ret
654   }
655 }
656 
657 // 3/8 point sampler
658 
659 // Scale 32 pixels to 12
// Naked cdecl. Each 16-byte half contributes 6 point samples via pshufb
// (kShuf38a/b place them in non-overlapping lanes); paddusb merges them.
660 __declspec(naked)
ScaleRowDown38_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)661 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
662                           uint8* dst_ptr, int dst_width) {
663   __asm {
664     mov        eax, [esp + 4]        // src_ptr
665                                      // src_stride ignored
666     mov        edx, [esp + 12]       // dst_ptr
667     mov        ecx, [esp + 16]       // dst_width
668     movdqa     xmm4, kShuf38a
669     movdqa     xmm5, kShuf38b
670 
671   xloop:
672     movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
673     movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
674     lea        eax, [eax + 32]
675     pshufb     xmm0, xmm4
676     pshufb     xmm1, xmm5
677     paddusb    xmm0, xmm1            // merge: other lanes are zero
678 
679     movq       qword ptr [edx], xmm0  // write 12 pixels
680     movhlps    xmm1, xmm0
681     movd       [edx + 8], xmm1       // last 4 of the 12 bytes
682     lea        edx, [edx + 12]
683     sub        ecx, 12
684     jg         xloop
685 
686     ret
687   }
688 }
689 
690 // Scale 16x3 pixels to 6x1 with interpolation
// Naked cdecl; esi pushed, so arg offsets are +4. Sums 3 rows as words,
// sums each horizontal triple via shifted adds, then scales by
// kScaleAc33 (1/9 or 1/6) with pmulhuw.
691 __declspec(naked)
ScaleRowDown38_3_Box_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)692 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
693                                 ptrdiff_t src_stride,
694                                 uint8* dst_ptr, int dst_width) {
695   __asm {
696     push       esi
697     mov        eax, [esp + 4 + 4]    // src_ptr
698     mov        esi, [esp + 4 + 8]    // src_stride
699     mov        edx, [esp + 4 + 12]   // dst_ptr
700     mov        ecx, [esp + 4 + 16]   // dst_width
701     movdqa     xmm2, kShufAc
702     movdqa     xmm3, kShufAc3
703     movdqa     xmm4, kScaleAc33
704     pxor       xmm5, xmm5            // zero for punpcklbw widening
705 
706   xloop:
707     movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
708     movdqu     xmm6, [eax + esi]
709     movhlps    xmm1, xmm0
710     movhlps    xmm7, xmm6
711     punpcklbw  xmm0, xmm5            // widen bytes to words
712     punpcklbw  xmm1, xmm5
713     punpcklbw  xmm6, xmm5
714     punpcklbw  xmm7, xmm5
715     paddusw    xmm0, xmm6
716     paddusw    xmm1, xmm7
717     movdqu     xmm6, [eax + esi * 2]
718     lea        eax, [eax + 16]
719     movhlps    xmm7, xmm6
720     punpcklbw  xmm6, xmm5
721     punpcklbw  xmm7, xmm5
722     paddusw    xmm0, xmm6
723     paddusw    xmm1, xmm7
724 
725     movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
726     psrldq     xmm0, 2               // shift one word; add = horizontal sum
727     paddusw    xmm6, xmm0
728     psrldq     xmm0, 2
729     paddusw    xmm6, xmm0
730     pshufb     xmm6, xmm2
731 
732     movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
733     psrldq     xmm1, 2
734     paddusw    xmm7, xmm1
735     psrldq     xmm1, 2
736     paddusw    xmm7, xmm1
737     pshufb     xmm7, xmm3
738     paddusw    xmm6, xmm7
739 
740     pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
741     packuswb   xmm6, xmm6
742 
743     movd       [edx], xmm6           // write 6 pixels
744     psrlq      xmm6, 16
745     movd       [edx + 2], xmm6
746     lea        edx, [edx + 6]
747     sub        ecx, 6
748     jg         xloop
749 
750     pop        esi
751     ret
752   }
753 }
754 
755 // Scale 16x2 pixels to 6x1 with interpolation
// Naked cdecl; esi pushed, so arg offsets are +4. Averages 2 rows, gathers
// each horizontal triple with kShufAb0/1/2 and scales by kScaleAb2
// (1/3 or 1/2) with pmulhuw.
756 __declspec(naked)
ScaleRowDown38_2_Box_SSSE3(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)757 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
758                                 ptrdiff_t src_stride,
759                                 uint8* dst_ptr, int dst_width) {
760   __asm {
761     push       esi
762     mov        eax, [esp + 4 + 4]    // src_ptr
763     mov        esi, [esp + 4 + 8]    // src_stride
764     mov        edx, [esp + 4 + 12]   // dst_ptr
765     mov        ecx, [esp + 4 + 16]   // dst_width
766     movdqa     xmm2, kShufAb0
767     movdqa     xmm3, kShufAb1
768     movdqa     xmm4, kShufAb2
769     movdqa     xmm5, kScaleAb2
770 
771   xloop:
772     movdqu     xmm0, [eax]           // average 2 rows into xmm0
773     movdqu     xmm1, [eax + esi]
774     lea        eax, [eax + 16]
775     pavgb      xmm0, xmm1
776 
777     movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
778     pshufb     xmm1, xmm2            // first byte of each triple, as words
779     movdqa     xmm6, xmm0
780     pshufb     xmm6, xmm3            // second byte of each triple
781     paddusw    xmm1, xmm6
782     pshufb     xmm0, xmm4            // third byte of each triple
783     paddusw    xmm1, xmm0
784 
785     pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
786     packuswb   xmm1, xmm1
787 
788     movd       [edx], xmm1           // write 6 pixels
789     psrlq      xmm1, 16
790     movd       [edx + 2], xmm1
791     lea        edx, [edx + 6]
792     sub        ecx, 6
793     jg         xloop
794 
795     pop        esi
796     ret
797   }
798 }
799 
800 // Reads 16 bytes and accumulates to 16 shorts at a time.
// Naked cdecl. dst_ptr is read-modify-write: each source byte is widened
// to a word and added (saturating) to the existing accumulator.
801 __declspec(naked)
ScaleAddRow_SSE2(const uint8 * src_ptr,uint16 * dst_ptr,int src_width)802 void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
803   __asm {
804     mov        eax, [esp + 4]   // src_ptr
805     mov        edx, [esp + 8]   // dst_ptr
806     mov        ecx, [esp + 12]  // src_width
807     pxor       xmm5, xmm5       // zero for punpck widening
808 
809   // sum rows
810   xloop:
811     movdqu     xmm3, [eax]       // read 16 bytes
812     lea        eax, [eax + 16]
813     movdqu     xmm0, [edx]       // read 16 words from destination
814     movdqu     xmm1, [edx + 16]
815     movdqa     xmm2, xmm3
816     punpcklbw  xmm2, xmm5        // low 8 bytes -> words
817     punpckhbw  xmm3, xmm5        // high 8 bytes -> words
818     paddusw    xmm0, xmm2        // sum 16 words
819     paddusw    xmm1, xmm3
820     movdqu     [edx], xmm0       // write 16 words to destination
821     movdqu     [edx + 16], xmm1
822     lea        edx, [edx + 32]
823     sub        ecx, 16
824     jg         xloop
825     ret
826   }
827 }
828 
829 #ifdef HAS_SCALEADDROW_AVX2
830 // Reads 32 bytes and accumulates to 32 shorts at a time.
// Naked cdecl. AVX2 version of ScaleAddRow_SSE2: widens 32 source bytes to
// words and adds (saturating) into the 32-word accumulator at dst_ptr.
831 __declspec(naked)
ScaleAddRow_AVX2(const uint8 * src_ptr,uint16 * dst_ptr,int src_width)832 void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
833   __asm {
834     mov         eax, [esp + 4]   // src_ptr
835     mov         edx, [esp + 8]   // dst_ptr
836     mov         ecx, [esp + 12]  // src_width
837     vpxor       ymm5, ymm5, ymm5 // zero for unpack widening
838 
839   // sum rows
840   xloop:
841     vmovdqu     ymm3, [eax]       // read 32 bytes
842     lea         eax, [eax + 32]
843     vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
844     vpunpcklbw  ymm2, ymm3, ymm5
845     vpunpckhbw  ymm3, ymm3, ymm5
846     vpaddusw    ymm0, ymm2, [edx] // sum 16 words
847     vpaddusw    ymm1, ymm3, [edx + 32]
848     vmovdqu     [edx], ymm0       // write 32 words to destination
849     vmovdqu     [edx + 32], ymm1
850     lea         edx, [edx + 64]
851     sub         ecx, 32
852     jg          xloop
853 
854     vzeroupper
855     ret
856   }
857 }
858 #endif  // HAS_SCALEADDROW_AVX2
859 
860 // Bilinear column filtering. SSSE3 version.
// Naked cdecl; ebx/esi/edi pushed, so arg offsets are +12. x and dx are
// 16.16 fixed point. Two output pixels per iteration: for each, the two
// neighboring source pixels are weighted by the 7-bit fraction (and its
// complement) with pmaddubsw, then >>7 back to 8 bits.
861 __declspec(naked)
ScaleFilterCols_SSSE3(uint8 * dst_ptr,const uint8 * src_ptr,int dst_width,int x,int dx)862 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
863                            int dst_width, int x, int dx) {
864   __asm {
865     push       ebx
866     push       esi
867     push       edi
868     mov        edi, [esp + 12 + 4]    // dst_ptr
869     mov        esi, [esp + 12 + 8]    // src_ptr
870     mov        ecx, [esp + 12 + 12]   // dst_width
871     movd       xmm2, [esp + 12 + 16]  // x
872     movd       xmm3, [esp + 12 + 20]  // dx
873     mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
874     movd       xmm5, eax
875     pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
876     psrlw      xmm6, 9
877     pextrw     eax, xmm2, 1         // get x0 integer. preroll
878     sub        ecx, 2
879     jl         xloop29              // fewer than 2 pixels: remainder only
880 
881     movdqa     xmm0, xmm2           // x1 = x0 + dx
882     paddd      xmm0, xmm3
883     punpckldq  xmm2, xmm0           // x0 x1
884     punpckldq  xmm3, xmm3           // dx dx
885     paddd      xmm3, xmm3           // dx * 2, dx * 2
886     pextrw     edx, xmm2, 3         // get x1 integer. preroll
887 
888     // 2 Pixel loop.
889   xloop2:
890     movdqa     xmm1, xmm2           // x0, x1 fractions.
891     paddd      xmm2, xmm3           // x += dx
892     movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
893     movd       xmm0, ebx
894     psrlw      xmm1, 9              // 7 bit fractions.
895     movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
896     movd       xmm4, ebx
897     pshufb     xmm1, xmm5           // 0011
898     punpcklwd  xmm0, xmm4
899     pxor       xmm1, xmm6           // 0..7f and 7f..0
900     pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
901     pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
902     pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
903     psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
904     packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
905     movd       ebx, xmm0
906     mov        [edi], bx
907     lea        edi, [edi + 2]
908     sub        ecx, 2               // 2 pixels
909     jge        xloop2
910 
911  xloop29:
912 
913     add        ecx, 2 - 1           // restore count; test for 1 remaining
914     jl         xloop99
915 
916     // 1 pixel remainder
917     movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
918     movd       xmm0, ebx
919     psrlw      xmm2, 9              // 7 bit fractions.
920     pshufb     xmm2, xmm5           // 0011
921     pxor       xmm2, xmm6           // 0..7f and 7f..0
922     pmaddubsw  xmm0, xmm2           // 16 bit
923     psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
924     packuswb   xmm0, xmm0           // 8 bits
925     movd       ebx, xmm0
926     mov        [edi], bl
927 
928  xloop99:
929 
930     pop        edi
931     pop        esi
932     pop        ebx
933     ret
934   }
935 }
936 
// Reads 16 pixels, duplicates them and writes 32 pixels.
// 2x nearest-neighbor horizontal upscale of an 8-bit row. The x and dx
// parameters are accepted for signature compatibility but not read here;
// the kernel hard-codes the 1:2 ratio. dst_width is consumed 32 at a time,
// so the caller is expected to provide appropriately sized buffers.
__declspec(naked)
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_ptr
    mov        eax, [esp + 8]    // src_ptr
    mov        ecx, [esp + 12]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0        // duplicate each of the low 8 bytes
    punpckhbw  xmm1, xmm1        // duplicate each of the high 8 bytes
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32           // 32 destination pixels per iteration
    jg         wloop

    ret
  }
}
961 
// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
// ARGB 2x horizontal downsample by point sampling: shufps 0xdd keeps the
// odd-indexed dwords of each 8-pixel group (the comment's "even pixels"
// numbering counts the surviving output positions). src_stride is unused
// since only one row is read.
__declspec(naked)
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd      // keep pixels 1,3 of each register
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4                // 4 ARGB pixels written per iteration
    jg         wloop

    ret
  }
}
986 
// Blends 8x1 rectangle to 4x1.
// ARGB 2x horizontal downsample averaging each even/odd pixel pair with
// pavgb (rounded average per channel). Only one row is read, so
// src_stride is unused.
__declspec(naked)
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2            // rounded per-byte average of each pair
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4                // 4 ARGB pixels written per iteration
    jg         wloop

    ret
  }
}
1014 
// Blends 8x2 rectangle to 4x1.
// ARGB 2x2 box-filter downsample: averages each pixel with the pixel one
// src_stride below, then averages horizontally adjacent results.
// Note pavgb rounds at each stage, so the result can differ by 1 LSB from
// a true 4-sample average. Saves/restores esi.
__declspec(naked)
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_argb
    mov        ecx, [esp + 4 + 16]   // dst_width

  wloop:
    movdqu     xmm0, [eax]           // row0, pixels 0-3
    movdqu     xmm1, [eax + 16]      // row0, pixels 4-7
    movdqu     xmm2, [eax + esi]     // row1, pixels 0-3
    movdqu     xmm3, [eax + esi + 16]  // row1, pixels 4-7
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4                // 4 ARGB pixels written per iteration
    jg         wloop

    pop        esi
    ret
  }
}
1048 
// Reads 4 pixels at a time.
// ARGB point-sample downscale by an arbitrary integer step: gathers every
// src_stepx-th pixel from one row. src_stride is unused. The step is
// converted from pixels to bytes (x4) up front; edi holds step*3 so one
// iteration can address pixels at offsets 0, step, 2*step, 3*step.
__declspec(naked)
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]    // src_argb
                                     // src_stride ignored
    mov        ebx, [esp + 8 + 12]   // src_stepx
    mov        edx, [esp + 8 + 16]   // dst_argb
    mov        ecx, [esp + 8 + 20]   // dst_width
    lea        ebx, [ebx * 4]        // step in bytes (4 bytes per ARGB pixel)
    lea        edi, [ebx + ebx * 2]  // step * 3 in bytes

  wloop:
    movd       xmm0, [eax]           // pixel 0
    movd       xmm1, [eax + ebx]     // pixel at step
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2] // pixel at 2*step
    movd       xmm3, [eax + edi]     // pixel at 3*step
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2            // pack 4 gathered pixels
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4                // 4 ARGB pixels written per iteration
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}
1084 
// Blends four 2x2 to 4x1.
// ARGB box-filter downscale by an arbitrary integer step: for each output
// pixel, averages a 2x2 block (horizontal neighbor + the row src_stride
// below). Pixel step is converted to a byte step (x4); esi tracks row1.
// As with the 2x2 box above, pavgb rounds at each stage.
__declspec(naked)
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]    // src_argb
    mov        esi, [esp + 12 + 8]    // src_stride
    mov        ebx, [esp + 12 + 12]   // src_stepx
    mov        edx, [esp + 12 + 16]   // dst_argb
    mov        ecx, [esp + 12 + 20]   // dst_width
    lea        esi, [eax + esi]       // row1 pointer
    lea        ebx, [ebx * 4]         // step in bytes
    lea        edi, [ebx + ebx * 2]   // step * 3 in bytes

  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4                // 4 ARGB pixels written per iteration
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
1132 
// Column scaling unfiltered. SSE2 version.
// ARGB nearest-neighbor horizontal scale. x and dx are 16.16 fixed point;
// xmm2 is built up to hold four consecutive x positions (x0..x3) and xmm3
// the per-iteration advance (4*dx), so the main loop gathers 4 source
// pixels per pass. Remainders of 2 and 1 pixels are handled after the loop.
// Clobbers eax, ecx, edx, xmm0-xmm4; esi/edi are saved and restored.
__declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0            // x3 x2 x1 x0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1          // get x0 integer.
    pextrw     edx, xmm2, 3          // get x1 integer.

    cmp        ecx, 0
    jle        xloop99                // nothing to do for dst_width <= 0
    sub        ecx, 4
    jl         xloop49                // fewer than 4 pixels: remainder only

    // 4 Pixel loop.
 xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    pextrw     edx, xmm2, 7           // get x3 integer.
    paddd      xmm2, xmm3             // x += dx
    punpckldq  xmm0, xmm1             // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4             // x2 x3
    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4                 // 4 pixels
    jge        xloop4

 xloop49:
    test       ecx, 2                 // 2 or 3 pixels remaining?
    je         xloop29

    // 2 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    punpckldq  xmm0, xmm1             // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

 xloop29:
    test       ecx, 1                 // 1 pixel remaining?
    je         xloop99

    // 1 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
 xloop99:

    pop        esi
    pop        edi
    ret
  }
}
1210 
// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw.
// Interleaves the channels of two ARGB pixels (b0 b1, g0 g1, r0 r1, a0 a1)
// so pmaddubsw can blend each channel pair with a fraction/inverse-fraction
// weight in a single multiply-add.
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
// (byte 0 -> low 8 lanes, byte 4 -> high 8 lanes).
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
1224 
// Bilinear column filtering for ARGB. SSSE3 version.
// Horizontally scales one ARGB row with bilinear filtering. x and dx are
// 16.16 fixed point; the top 7 fraction bits weight each pair of adjacent
// source pixels via pmaddubsw, using the kShuffleColARGB/kShuffleFractions
// tables above. Processes 2 destination pixels per loop plus a 1-pixel tail.
// Clobbers eax, ecx, edx and xmm0-xmm6; esi/edi are saved and restored.
__declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, kShuffleColARGB
    movdqa     xmm5, kShuffleFractions
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29              // fewer than 2 pixels: skip to remainder

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9              // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5           // 0000000011111111
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

 xloop29:

    add        ecx, 2 - 1           // restore count; test for 1 leftover pixel
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9              // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5           // 00000000
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

 xloop99:

    pop        edi
    pop        esi
    ret
  }
}
1294 
// Reads 4 pixels, duplicates them and writes 8 pixels.
// 2x nearest-neighbor horizontal upscale for ARGB. x and dx are accepted
// for signature compatibility but not read; the kernel hard-codes the
// 1:2 ratio and consumes dst_width 8 pixels at a time.
__declspec(naked)
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_argb
    mov        eax, [esp + 8]    // src_argb
    mov        ecx, [esp + 12]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0        // duplicate low 2 ARGB pixels
    punpckhdq  xmm1, xmm1        // duplicate high 2 ARGB pixels
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8            // 8 destination pixels per iteration
    jg         wloop

    ret
  }
}
1319 
// Divide num by div and return as 16.16 fixed point result.
// Computes (num << 16) / div with a full 64-bit dividend: cdq sign-extends
// num into edx:eax, then shld/shl form the 48.16 value before idiv.
// Quotient is returned in eax per __cdecl. No overflow/div-by-zero guard:
// behavior follows the hardware idiv (faults on divide error).
__declspec(naked)
int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}
1332 
// Divide num by div and return as 16.16 fixed point result.
// Variant used for inclusive ranges: computes
// ((num << 16) - 0x00010001) / (div - 1) with 64-bit arithmetic.
// The sub/sbb pair performs the 64-bit subtraction across edx:eax.
// Quotient is returned in eax per __cdecl; no guard against div == 1 or
// divide overflow (hardware idiv semantics apply).
__declspec(naked)
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    mov        ecx, [esp + 8]    // denom
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    sub        eax, 0x00010001   // 64-bit subtract: low dword...
    sbb        edx, 0            // ...then propagate borrow to high dword
    sub        ecx, 1            // divisor - 1
    idiv       ecx
    ret
  }
}
1349 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
1350 
1351 #ifdef __cplusplus
1352 }  // extern "C"
1353 }  // namespace libyuv
1354 #endif
1355