/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
                              8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
                              10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant added before the >> 2 in the 3/4 filters.
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
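
// Taken together, the tables above implement the 3/4 filters: pshufb pairs
// neighboring bytes per kShuf*, pmaddubsw weights each pair per kMadd* (the
// weights in a pair sum to 4), and kRound34 rounds before the shift, so in
// effect each output byte is (w0 * a + w1 * b + 2) >> 2 with (w0, w1)
// cycling through (3, 1), (2, 2) and (1, 3).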

static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
                               128, 128, 128, 128, 128, 128, 128, 128};

static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
                               6,   8,   11,  14,  128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
                              128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
                               6,   7,   12,  13,  128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                                  65536 / 9, 65536 / 6, 0,         0};

// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
                               11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
                               12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
                               13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                                 65536 / 3, 65536 / 2, 0,         0};
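
// The kScale* tables divide by the pixel count of a box in 16.16 fixed
// point: pmulhuw computes (sum * (65536 / n)) >> 16, which is approximately
// sum / n. kScaleAc33 divides sums of 9 (3x3) or 6 (2x3) pixels; kScaleAb2
// divides by 3 or 2 because the two rows are pre-averaged with pavgb, so
// only the horizontal count remains.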

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1");
}
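
// A scalar sketch of the loop above, for reference; the guard macro is
// hypothetical and never defined. psrlw $0x8 keeps the high byte of each
// word (the odd source pixel) and packuswb repacks the words to bytes, so
// the asm point-samples every second pixel.
#if defined(LIBYUV_SCALE_GCC_SKETCHES)
static void ScaleRowDown2_C_Sketch(const uint8_t* src_ptr,
                                   uint8_t* dst_ptr,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];  // keep the odd pixel of each pair.
  }
}
#endif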

void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}

void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "psrlw       $0x1,%%xmm0                   \n"
      "psrlw       $0x1,%%xmm1                   \n"
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
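
// In the box loop above, pmaddubsw with the 0x01 byte multiplier sums
// horizontal byte pairs into words, paddw adds the two rows, psrlw $0x1
// halves, and pavgw against zero rounds the second halving, so each output
// byte is effectively (a + b + c + d + 2) >> 2 over a 2x2 box.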

#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1");
}

void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}

void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrld       $0x18,%%xmm5                  \n"
      "pslld       $0x10,%%xmm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}

void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "psllw       $0x3,%%xmm5                   \n"
      "lea         0x00(%4,%4,2),%3              \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "movdqu      0x00(%0,%4,2),%%xmm2          \n"
      "movdqu      0x10(%0,%4,2),%%xmm3          \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "phaddw      %%xmm1,%%xmm0                 \n"
      "paddw       %%xmm5,%%xmm0                 \n"
      "psrlw       $0x4,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width),             // %2
        "=&r"(stridex3)              // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
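
// The 4x4 box above accumulates pmaddubsw pair sums from four rows, then
// phaddw folds adjacent words so each word holds the sum of 16 source
// pixels; xmm5 holds 8 (1 << 3) per word, giving a rounded (sum + 8) >> 4.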

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrld      $0x18,%%ymm5,%%ymm5           \n"
      "vpslld      $0x10,%%ymm5,%%ymm5           \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}

void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpsllw      $0x3,%%ymm4,%%ymm5            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x00(%0,%3,2),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,2),%%ymm3          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x00(%0,%4,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%4,1),%%ymm3          \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpaddw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpsrlw      $0x4,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),                   // %0
        "+r"(dst_ptr),                   // %1
        "+r"(dst_width)                  // %2
      : "r"((intptr_t)(src_stride)),     // %3
        "r"((intptr_t)(src_stride * 3))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa      %0,%%xmm3                     \n"
      "movdqa      %1,%%xmm4                     \n"
      "movdqa      %2,%%xmm5                     \n"
      :
      : "m"(kShuf0),  // %0
        "m"(kShuf1),  // %1
        "m"(kShuf2)   // %2
  );
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm2               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "palignr     $0x8,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm3,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm2                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x8(%1)                \n"
      "movq        %%xmm2,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"  // kShuf01
      "movdqa      %1,%%xmm3                     \n"  // kShuf11
      "movdqa      %2,%%xmm4                     \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa      %0,%%xmm5                     \n"  // kMadd01
      "movdqa      %1,%%xmm0                     \n"  // kMadd11
      "movdqa      %2,%%xmm1                     \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,(%1)                   \n"
      "movdqu      0x8(%0),%%xmm6                \n"
      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x8(%1)                \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %4,%%xmm6                     \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "m"(kMadd21)                  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"  // kShuf01
      "movdqa      %1,%%xmm3                     \n"  // kShuf11
      "movdqa      %2,%%xmm4                     \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa      %0,%%xmm5                     \n"  // kMadd01
      "movdqa      %1,%%xmm0                     \n"  // kMadd11
      "movdqa      %2,%%xmm1                     \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );

  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,(%1)                   \n"
      "movdqu      0x8(%0),%%xmm6                \n"
      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x8(%1)                \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %4,%%xmm6                     \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "m"(kMadd21)                  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movhlps     %%xmm0,%%xmm1                 \n"
      "movd        %%xmm1,0x8(%1)                \n"
      "lea         0xc(%1),%1                    \n"
      "sub         $0xc,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "m"(kShuf38a),   // %3
        "m"(kShuf38b)    // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
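
// kShuf38a and kShuf38b each pick bytes 0,3,6,8,11,14 out of one half of
// the 32-byte read, with unpicked lanes zeroed (index 128), and paddusb
// merges the halves: 12 output pixels per 32 input pixels, i.e. 3/8.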

void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"
      "movdqa      %1,%%xmm3                     \n"
      "movdqa      %2,%%xmm4                     \n"
      "movdqa      %3,%%xmm5                     \n"
      :
      : "m"(kShufAb0),  // %0
        "m"(kShufAb1),  // %1
        "m"(kShufAb2),  // %2
        "m"(kScaleAb2)  // %3
  );
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "pavgb       %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "paddusw     %%xmm6,%%xmm1                 \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "paddusw     %%xmm0,%%xmm1                 \n"
      "pmulhuw     %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movd        %%xmm1,(%1)                   \n"
      "psrlq       $0x10,%%xmm1                  \n"
      "movd        %%xmm1,0x2(%1)                \n"
      "lea         0x6(%1),%1                    \n"
      "sub         $0x6,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"
      "movdqa      %1,%%xmm3                     \n"
      "movdqa      %2,%%xmm4                     \n"
      "pxor        %%xmm5,%%xmm5                 \n"
      :
      : "m"(kShufAc),    // %0
        "m"(kShufAc3),   // %1
        "m"(kScaleAc33)  // %2
  );
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm6          \n"
      "movhlps     %%xmm0,%%xmm1                 \n"
      "movhlps     %%xmm6,%%xmm7                 \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm6                 \n"
      "punpcklbw   %%xmm5,%%xmm7                 \n"
      "paddusw     %%xmm6,%%xmm0                 \n"
      "paddusw     %%xmm7,%%xmm1                 \n"
      "movdqu      0x00(%0,%3,2),%%xmm6          \n"
      "lea         0x10(%0),%0                   \n"
      "movhlps     %%xmm6,%%xmm7                 \n"
      "punpcklbw   %%xmm5,%%xmm6                 \n"
      "punpcklbw   %%xmm5,%%xmm7                 \n"
      "paddusw     %%xmm6,%%xmm0                 \n"
      "paddusw     %%xmm7,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm6                 \n"
      "psrldq      $0x2,%%xmm0                   \n"
      "paddusw     %%xmm0,%%xmm6                 \n"
      "psrldq      $0x2,%%xmm0                   \n"
      "paddusw     %%xmm0,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "movdqa      %%xmm1,%%xmm7                 \n"
      "psrldq      $0x2,%%xmm1                   \n"
      "paddusw     %%xmm1,%%xmm7                 \n"
      "psrldq      $0x2,%%xmm1                   \n"
      "paddusw     %%xmm1,%%xmm7                 \n"
      "pshufb      %%xmm3,%%xmm7                 \n"
      "paddusw     %%xmm7,%%xmm6                 \n"
      "pmulhuw     %%xmm4,%%xmm6                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movd        %%xmm6,(%1)                   \n"
      "psrlq       $0x10,%%xmm6                  \n"
      "movd        %%xmm6,0x2(%1)                \n"
      "lea         0x6(%1),%1                    \n"
      "sub         $0x6,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(

      "pxor        %%xmm5,%%xmm5                 \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm3                   \n"
      "lea         0x10(%0),%0                   \n"  // src_ptr += 16
      "movdqu      (%1),%%xmm0                   \n"
      "movdqu      0x10(%1),%%xmm1               \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "punpckhbw   %%xmm5,%%xmm3                 \n"
      "paddusw     %%xmm2,%%xmm0                 \n"
      "paddusw     %%xmm3,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
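
// A scalar sketch of the accumulation above; the guard macro is
// hypothetical and never defined. Each source byte is widened and added
// into the 16-bit accumulator row with unsigned saturation, matching
// paddusw.
#if defined(LIBYUV_SCALE_GCC_SKETCHES)
static void ScaleAddRow_C_Sketch(const uint8_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    uint32_t sum = (uint32_t)dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = sum > 65535 ? 65535 : (uint16_t)sum;  // saturating add.
  }
}
#endif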

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(

      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm3                   \n"
      "lea         0x20(%0),%0                   \n"  // src_ptr += 32
      "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
      "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
      "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
      "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
      "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                               0x4040, 0x4040, 0x4040, 0x4040};
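
// How the two constants cooperate in ScaleFilterCols_SSSE3 below: with
// weights w0 + w1 = 128, pmaddubsw on the biased pixels computes
//   w0 * (p0 - 128) + w1 * (p1 - 128) = w0 * p0 + w1 * p1 - 16384,
// which stays within signed 16-bit range. Adding kFadd40
// (0x4040 = 16384 + 64) removes the bias and supplies the rounding term
// for the final >> 7.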

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
                           const uint8_t* src_ptr,
                           int dst_width,
                           int x,
                           int dx) {
  intptr_t x0, x1, temp_pixel;
  asm volatile(
      "movd        %6,%%xmm2                     \n"
      "movd        %7,%%xmm3                     \n"
      "movl        $0x04040000,%k2               \n"
      "movd        %k2,%%xmm5                    \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "psrlw       $0x9,%%xmm6                   \n"  // 0x007f007f
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $15,%%xmm7                    \n"  // 0x00010001

      "pextrw      $0x1,%%xmm2,%k3               \n"
      "subl        $0x2,%5                       \n"
      "jl          29f                           \n"
      "movdqa      %%xmm2,%%xmm0                 \n"
      "paddd       %%xmm3,%%xmm0                 \n"
      "punpckldq   %%xmm0,%%xmm2                 \n"
      "punpckldq   %%xmm3,%%xmm3                 \n"
      "paddd       %%xmm3,%%xmm3                 \n"
      "pextrw      $0x3,%%xmm2,%k4               \n"

      LABELALIGN
      "2:                                        \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "paddd       %%xmm3,%%xmm2                 \n"
      "movzwl      0x00(%1,%3,1),%k2             \n"
      "movd        %k2,%%xmm0                    \n"
      "psrlw       $0x9,%%xmm1                   \n"
      "movzwl      0x00(%1,%4,1),%k2             \n"
      "movd        %k2,%%xmm4                    \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "punpcklwd   %%xmm4,%%xmm0                 \n"
      "psubb       %8,%%xmm0                     \n"  // make pixels signed.
      "pxor        %%xmm6,%%xmm1                 \n"  // 128 - f = (f ^ 127) + 1
      "paddusb     %%xmm7,%%xmm1                 \n"
      "pmaddubsw   %%xmm0,%%xmm1                 \n"
      "pextrw      $0x1,%%xmm2,%k3               \n"
      "pextrw      $0x3,%%xmm2,%k4               \n"
      "paddw       %9,%%xmm1                     \n"  // make pixels unsigned.
      "psrlw       $0x7,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movd        %%xmm1,%k2                    \n"
      "mov         %w2,(%0)                      \n"
      "lea         0x2(%0),%0                    \n"
      "subl        $0x2,%5                       \n"
      "jge         2b                            \n"

      LABELALIGN
      "29:                                       \n"
      "addl        $0x1,%5                       \n"
      "jl          99f                           \n"
      "movzwl      0x00(%1,%3,1),%k2             \n"
      "movd        %k2,%%xmm0                    \n"
      "psrlw       $0x9,%%xmm2                   \n"
      "pshufb      %%xmm5,%%xmm2                 \n"
      "psubb       %8,%%xmm0                     \n"  // make pixels signed.
      "pxor        %%xmm6,%%xmm2                 \n"
      "paddusb     %%xmm7,%%xmm2                 \n"
      "pmaddubsw   %%xmm0,%%xmm2                 \n"
      "paddw       %9,%%xmm2                     \n"  // make pixels unsigned.
      "psrlw       $0x7,%%xmm2                   \n"
      "packuswb    %%xmm2,%%xmm2                 \n"
      "movd        %%xmm2,%k2                    \n"
      "mov         %b2,(%0)                      \n"
      "99:                                       \n"
      : "+r"(dst_ptr),      // %0
        "+r"(src_ptr),      // %1
        "=&a"(temp_pixel),  // %2
        "=&r"(x0),          // %3
        "=&r"(x1),          // %4
#if defined(__x86_64__)
        "+rm"(dst_width)  // %5
#else
        "+m"(dst_width)  // %5
#endif
      : "rm"(x),   // %6
        "rm"(dx),  // %7
#if defined(__x86_64__)
        "x"(kFsub80),  // %8
        "x"(kFadd40)   // %9
#else
        "m"(kFsub80),    // %8
        "m"(kFadd40)     // %9
#endif
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
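
// A scalar sketch of the bilinear filter above; the guard macro is
// hypothetical and never defined. x and dx are 16.16 fixed point, and the
// asm keeps the top 7 fraction bits, weighting the two taps by 128 - f
// and f with a +64 rounding term.
#if defined(LIBYUV_SCALE_GCC_SKETCHES)
static void ScaleFilterCols_C_Sketch(uint8_t* dst_ptr,
                                     const uint8_t* src_ptr,
                                     int dst_width,
                                     int x,
                                     int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer source position.
    int f = (x >> 9) & 0x7f;  // top 7 bits of the fraction.
    dst_ptr[j] =
        (uint8_t)((src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7);
    x += dx;
  }
}
#endif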

// Reads 16 pixels, duplicates them and writes 32 pixels per loop iteration.
// movdqu is used throughout, so no alignment is required.
void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
                       const uint8_t* src_ptr,
                       int dst_width,
                       int x,
                       int dx) {
  (void)x;
  (void)dx;
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%1),%%xmm0                   \n"
      "lea         0x10(%1),%1                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "movdqu      %%xmm0,(%0)                   \n"
      "movdqu      %%xmm1,0x10(%0)               \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"

      : "+r"(dst_ptr),   // %0
        "+r"(src_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1");
}

void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
                            ptrdiff_t src_stride,
                            uint8_t* dst_argb,
                            int dst_width) {
  (void)src_stride;
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "shufps      $0xdd,%%xmm1,%%xmm0           \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1");
}

void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm2           \n"
      "pavgb       %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2");
}

void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               uint8_t* dst_argb,
                               int dst_width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm2,%%xmm0                 \n"
      "pavgb       %%xmm3,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm2           \n"
      "pavgb       %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),              // %0
        "+r"(dst_argb),              // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}

// Reads 4 pixels at a time.
// movdqu is used for the store, so dst_argb does not need to be aligned.
void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  (void)src_stride;
  asm volatile(
      "lea         0x00(,%1,4),%1                \n"
      "lea         0x00(%1,%1,2),%4              \n"

      LABELALIGN
      "1:                                        \n"
      "movd        (%0),%%xmm0                   \n"
      "movd        0x00(%0,%1,1),%%xmm1          \n"
      "punpckldq   %%xmm1,%%xmm0                 \n"
      "movd        0x00(%0,%1,2),%%xmm2          \n"
      "movd        0x00(%0,%4,1),%%xmm3          \n"
      "lea         0x00(%0,%1,4),%0              \n"
      "punpckldq   %%xmm3,%%xmm2                 \n"
      "punpcklqdq  %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),       // %0
        "+r"(src_stepx_x4),   // %1
        "+r"(dst_argb),       // %2
        "+r"(dst_width),      // %3
        "=&r"(src_stepx_x12)  // %4
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}

// Blends four 2x2 boxes to 4x1.
// movdqu is used for the store, so dst_argb does not need to be aligned.
void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile(
      "lea         0x00(,%1,4),%1                \n"
      "lea         0x00(%1,%1,2),%4              \n"
      "lea         0x00(%0,%5,1),%5              \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "movhps      0x00(%0,%1,1),%%xmm0          \n"
      "movq        0x00(%0,%1,2),%%xmm1          \n"
      "movhps      0x00(%0,%4,1),%%xmm1          \n"
      "lea         0x00(%0,%1,4),%0              \n"
      "movq        (%5),%%xmm2                   \n"
      "movhps      0x00(%5,%1,1),%%xmm2          \n"
      "movq        0x00(%5,%1,2),%%xmm3          \n"
      "movhps      0x00(%5,%4,1),%%xmm3          \n"
      "lea         0x00(%5,%1,4),%5              \n"
      "pavgb       %%xmm2,%%xmm0                 \n"
      "pavgb       %%xmm3,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm2           \n"
      "pavgb       %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),        // %0
        "+r"(src_stepx_x4),    // %1
        "+r"(dst_argb),        // %2
        "+rm"(dst_width),      // %3
        "=&r"(src_stepx_x12),  // %4
        "+r"(row1)             // %5
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}

void ScaleARGBCols_SSE2(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  intptr_t x0, x1;
  asm volatile(
      "movd        %5,%%xmm2                     \n"
      "movd        %6,%%xmm3                     \n"
      "pshufd      $0x0,%%xmm2,%%xmm2            \n"
      "pshufd      $0x11,%%xmm3,%%xmm0           \n"
      "paddd       %%xmm0,%%xmm2                 \n"
      "paddd       %%xmm3,%%xmm3                 \n"
      "pshufd      $0x5,%%xmm3,%%xmm0            \n"
      "paddd       %%xmm0,%%xmm2                 \n"
      "paddd       %%xmm3,%%xmm3                 \n"
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
      "pextrw      $0x1,%%xmm2,%k0               \n"
      "pextrw      $0x3,%%xmm2,%k1               \n"
      "cmp         $0x0,%4                       \n"
      "jl          99f                           \n"
      "sub         $0x4,%4                       \n"
      "jl          49f                           \n"

      LABELALIGN
      "40:                                       \n"
      "movd        0x00(%3,%0,4),%%xmm0          \n"
      "movd        0x00(%3,%1,4),%%xmm1          \n"
      "pextrw      $0x5,%%xmm2,%k0               \n"
      "pextrw      $0x7,%%xmm2,%k1               \n"
      "paddd       %%xmm3,%%xmm2                 \n"
      "punpckldq   %%xmm1,%%xmm0                 \n"
      "movd        0x00(%3,%0,4),%%xmm1          \n"
      "movd        0x00(%3,%1,4),%%xmm4          \n"
      "pextrw      $0x1,%%xmm2,%k0               \n"
      "pextrw      $0x3,%%xmm2,%k1               \n"
      "punpckldq   %%xmm4,%%xmm1                 \n"
      "punpcklqdq  %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%4                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "test        $0x2,%4                       \n"
      "je          29f                           \n"
      "movd        0x00(%3,%0,4),%%xmm0          \n"
      "movd        0x00(%3,%1,4),%%xmm1          \n"
      "pextrw      $0x5,%%xmm2,%k0               \n"
      "punpckldq   %%xmm1,%%xmm0                 \n"
      "movq        %%xmm0,(%2)                   \n"
      "lea         0x8(%2),%2                    \n"
      "29:                                       \n"
      "test        $0x1,%4                       \n"
      "je          99f                           \n"
      "movd        0x00(%3,%0,4),%%xmm0          \n"
      "movd        %%xmm0,(%2)                   \n"
      "99:                                       \n"
      : "=&a"(x0),       // %0
        "=&d"(x1),       // %1
        "+r"(dst_argb),  // %2
        "+r"(src_argb),  // %3
        "+r"(dst_width)  // %4
      : "rm"(x),         // %5
        "rm"(dx)         // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
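
// A scalar sketch (illustrative only, not part of the original file) of the
// addressing performed above: x and dx are 16.16 fixed-point, and each output
// is a nearest-neighbour fetch of the 32-bit ARGB word at position x >> 16.
// The asm unrolls this 4x and keeps two sample indices live in eax/edx via
// pextrw.
static void ScaleARGBCols_Sketch(uint8_t* dst_argb,
                                 const uint8_t* src_argb,
                                 int dst_width,
                                 int x,
                                 int dx) {
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}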

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
                           const uint8_t* src_argb,
                           int dst_width,
                           int x,
                           int dx) {
  (void)x;
  (void)dx;
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%1),%%xmm0                   \n"
      "lea         0x10(%1),%1                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpckldq   %%xmm0,%%xmm0                 \n"
      "punpckhdq   %%xmm1,%%xmm1                 \n"
      "movdqu      %%xmm0,(%0)                   \n"
      "movdqu      %%xmm1,0x10(%0)               \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"

      : "+r"(dst_argb),  // %0
        "+r"(src_argb),  // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1");
}
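
// Scalar sketch (illustrative only, not part of the original file): the loop
// above ignores x and dx and simply writes each source pixel twice, so 4
// input pixels become 8 output pixels per iteration.
static void ScaleARGBColsUp2_Sketch(uint8_t* dst_argb,
                                    const uint8_t* src_argb,
                                    int dst_width,
                                    int x,
                                    int dx) {
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int j;
  (void)x;
  (void)dx;
  for (j = 0; j < dst_width; j += 2) {
    dst[j] = dst[j + 1] = src[j >> 1];  // duplicate each source pixel
  }
}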

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static const uvec8 kShuffleColARGB = {
    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static const uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
                               const uint8_t* src_argb,
                               int dst_width,
                               int x,
                               int dx) {
  intptr_t x0, x1;
  asm volatile(
      "movdqa      %0,%%xmm4                     \n"
      "movdqa      %1,%%xmm5                     \n"
      :
      : "m"(kShuffleColARGB),   // %0
        "m"(kShuffleFractions)  // %1
  );

  asm volatile(
      "movd        %5,%%xmm2                     \n"
      "movd        %6,%%xmm3                     \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "psrlw       $0x9,%%xmm6                   \n"
      "pextrw      $0x1,%%xmm2,%k3               \n"
      "sub         $0x2,%2                       \n"
      "jl          29f                           \n"
      "movdqa      %%xmm2,%%xmm0                 \n"
      "paddd       %%xmm3,%%xmm0                 \n"
      "punpckldq   %%xmm0,%%xmm2                 \n"
      "punpckldq   %%xmm3,%%xmm3                 \n"
      "paddd       %%xmm3,%%xmm3                 \n"
      "pextrw      $0x3,%%xmm2,%k4               \n"

      LABELALIGN
      "2:                                        \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "paddd       %%xmm3,%%xmm2                 \n"
      "movq        0x00(%1,%3,4),%%xmm0          \n"
      "psrlw       $0x9,%%xmm1                   \n"
      "movhps      0x00(%1,%4,4),%%xmm0          \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "pxor        %%xmm6,%%xmm1                 \n"
      "pmaddubsw   %%xmm1,%%xmm0                 \n"
      "psrlw       $0x7,%%xmm0                   \n"
      "pextrw      $0x1,%%xmm2,%k3               \n"
      "pextrw      $0x3,%%xmm2,%k4               \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%0)                   \n"
      "lea         0x8(%0),%0                    \n"
      "sub         $0x2,%2                       \n"
      "jge         2b                            \n"

      LABELALIGN
      "29:                                       \n"
      "add         $0x1,%2                       \n"
      "jl          99f                           \n"
      "psrlw       $0x9,%%xmm2                   \n"
      "movq        0x00(%1,%3,4),%%xmm0          \n"
      "pshufb      %%xmm5,%%xmm2                 \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "pxor        %%xmm6,%%xmm2                 \n"
      "pmaddubsw   %%xmm2,%%xmm0                 \n"
      "psrlw       $0x7,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movd        %%xmm0,(%0)                   \n"

      LABELALIGN
      "99:                                       \n"  // clang-format error.

      : "+r"(dst_argb),    // %0
        "+r"(src_argb),    // %1
        "+rm"(dst_width),  // %2
        "=&r"(x0),         // %3
        "=&r"(x1)          // %4
      : "rm"(x),           // %5
        "rm"(dx)           // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
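
// A scalar sketch (illustrative only, not part of the original file) of the
// blend above.  The asm keeps a 7-bit fraction f = (x >> 9) & 0x7f, expands
// it with kShuffleFractions, xors alternate bytes with 0x7f (the
// pcmpeqb/psrlw constant) to form the 127 - f weights, and computes
// (a * (127 - f) + b * f) >> 7 per byte with pmaddubsw.  Like the asm, this
// reads one pixel beyond the last sampled position.
static void ScaleARGBFilterCols_Sketch(uint8_t* dst_argb,
                                       const uint8_t* src_argb,
                                       int dst_width,
                                       int x,
                                       int dx) {
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int j, s;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer source pixel
    int f = (x >> 9) & 0x7f;  // 7-bit fraction, as produced by psrlw $0x9
    uint32_t a = src[xi];
    uint32_t b = src[xi + 1];
    uint32_t out = 0;
    for (s = 0; s < 32; s += 8) {  // blend B, G, R, A channels independently
      uint32_t ca = (a >> s) & 0xff;
      uint32_t cb = (b >> s) & 0xff;
      out |= (((ca * (uint32_t)(0x7f ^ f) + cb * (uint32_t)f) >> 7) & 0xff)
             << s;
    }
    dst[j] = out;
    x += dx;
  }
}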

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile(
      "cdq                                       \n"
      "shld        $0x10,%%eax,%%edx             \n"
      "shl         $0x10,%%eax                   \n"
      "idiv        %1                            \n"
      "mov         %0, %%eax                     \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}
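
// Portable sketch (illustrative only, not part of the original file) of the
// same computation: the cdq/shld/shl sequence builds the 64-bit dividend
// num << 16 in edx:eax, which idiv then divides by div.
static int FixedDiv_Sketch(int num, int div) {
  return (int)((((int64_t)(num)) << 16) / div);
}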

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile(
      "cdq                                       \n"
      "shld        $0x10,%%eax,%%edx             \n"
      "shl         $0x10,%%eax                   \n"
      "sub         $0x10001,%%eax                \n"
      "sbb         $0x0,%%edx                    \n"
      "sub         $0x1,%1                       \n"
      "idiv        %1                            \n"
      "mov         %0, %%eax                     \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}
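
// Portable sketch (illustrative only, not part of the original file): the
// sub/sbb pair subtracts 0x10001 from the 64-bit dividend, i.e.
// (num << 16) - 0x10001 == ((num - 1) << 16) - 1, before dividing by div - 1.
static int FixedDiv1_Sketch(int num, int div) {
  return (int)(((((int64_t)(num)) << 16) - 0x00010001) / (div - 1));
}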

#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
// Shuffle table for splitting UV into upper and lower part of register.
static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
                                      1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
static const uvec8 kShuffleMergeUV = {0u,   8u,   2u,   10u,  4u,   12u,
                                      6u,   14u,  0x80, 0x80, 0x80, 0x80,
                                      0x80, 0x80, 0x80, 0x80};

void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 01010101
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5, %%xmm5                \n"  // zero
      "movdqa      %4,%%xmm1                     \n"  // split shuffler
      "movdqa      %5,%%xmm3                     \n"  // merge shuffler

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // 8 UV row 0
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"  // 8 UV row 1
      "lea         0x10(%0),%0                   \n"
      "pshufb      %%xmm1,%%xmm0                 \n"  // uuuuvvvv
      "pshufb      %%xmm1,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"  // horizontal add
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"  // vertical add
      "psrlw       $0x1,%%xmm0                   \n"  // round
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm3,%%xmm0                 \n"  // merge uv
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"  // 4 UV
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "m"(kShuffleSplitUV),         // %4
        "m"(kShuffleMergeUV)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEUVROWDOWN2BOX_SSSE3
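
// A scalar sketch (illustrative only, not part of the original file) of the
// box filter implemented by the SSSE3 version above and the AVX2 version
// below: each output UV pair is the average of a 2x2 block of interleaved UV
// samples.  The psrlw $0x1 followed by pavgw against zero computes exactly
// (sum + 2) >> 2.
static void ScaleUVRowDown2Box_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr,
                                      int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[0] = (uint8_t)((src_ptr[0] + src_ptr[2] + src_ptr[src_stride] +
                            src_ptr[src_stride + 2] + 2) >>
                           2);  // U
    dst_ptr[1] = (uint8_t)((src_ptr[1] + src_ptr[3] + src_ptr[src_stride + 1] +
                            src_ptr[src_stride + 3] + 2) >>
                           2);  // V
    src_ptr += 4;  // consume 2 source UV pairs
    dst_ptr += 2;  // produce 1 output UV pair
  }
}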

#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst_ptr,
                             int dst_width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 01010101
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"  // zero
      "vbroadcastf128 %4,%%ymm1                  \n"  // split shuffler
      "vbroadcastf128 %5,%%ymm3                  \n"  // merge shuffler

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // 16 UV row 0
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"  // 16 UV row 1
      "lea         0x20(%0),%0                   \n"
      "vpshufb     %%ymm1,%%ymm0,%%ymm0          \n"  // uuuuvvvv
      "vpshufb     %%ymm1,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // horizontal add
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"  // vertical add
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"  // round
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm3,%%ymm0,%%ymm0          \n"  // merge uv
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // combine qwords
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"  // 8 UV
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "m"(kShuffleSplitUV),         // %4
        "m"(kShuffleMergeUV)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEUVROWDOWN2BOX_AVX2

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif