/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
                              8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
                              10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant added before the >>2 in the 3/4 box filters.
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
                               128, 128, 128, 128, 128, 128, 128, 128};

static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
                               6,   8,   11,  14,  128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
                              128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
                               6,   7,   12,  13,  128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                                  65536 / 9, 65536 / 6, 0,         0};

// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
                               11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
                               12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
                               13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                                 65536 / 3, 65536 / 2, 0,         0};

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

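// Point-samples 32 source pixels down to 16 by keeping the odd-indexed pixel
// of each pair (psrlw by 8, then packuswb).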
void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1");
}

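// Averages each horizontal pair of source pixels: pmaddubsw with all-ones
// coefficients sums the pair, then pavgw against zero halves it with rounding,
// giving (a + b + 1) >> 1 per destination pixel.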
void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}

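// 2x2 box filter over two source rows; each destination pixel is the rounded
// average (a + b + c + d + 2) >> 2.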
void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "psrlw       $0x1,%%xmm0                   \n"
      "psrlw       $0x1,%%xmm1                   \n"
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

#ifdef HAS_SCALEROWDOWN2_AVX2
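// The AVX2 /2 scalers below mirror the SSSE3 versions but process 64 source
// bytes (32 destination pixels) per loop iteration.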
void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
               : "+r"(src_ptr),   // %0
                 "+r"(dst_ptr),   // %1
                 "+r"(dst_width)  // %2
                 ::"memory",
                 "cc", "xmm0", "xmm1");
}

void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}

void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEROWDOWN2_AVX2

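// Point-samples 1 of every 4 source pixels (32 in, 8 out) by masking one byte
// per dword and packing twice.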
void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrld       $0x18,%%xmm5                  \n"
      "pslld       $0x10,%%xmm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}

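// 4x4 box filter: sums four source rows with pmaddubsw and phaddw, adds a
// rounding constant of 8, and shifts right by 4 (divide by 16).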
void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "psllw       $0x3,%%xmm5                   \n"
      "lea         0x00(%4,%4,2),%3              \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "movdqu      0x00(%0,%4,2),%%xmm2          \n"
      "movdqu      0x10(%0,%4,2),%%xmm3          \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "phaddw      %%xmm1,%%xmm0                 \n"
      "paddw       %%xmm5,%%xmm0                 \n"
      "psrlw       $0x4,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width),             // %2
        "=&r"(stridex3)              // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrld      $0x18,%%ymm5,%%ymm5           \n"
      "vpslld      $0x10,%%ymm5,%%ymm5           \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}

void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpsllw      $0x3,%%ymm4,%%ymm5            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x00(%0,%3,2),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,2),%%ymm3          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x00(%0,%4,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%4,1),%%ymm3          \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpaddw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpsrlw      $0x4,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),                   // %0
        "+r"(dst_ptr),                   // %1
        "+r"(dst_width)                  // %2
      : "r"((intptr_t)(src_stride)),     // %3
        "r"((intptr_t)(src_stride * 3))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEROWDOWN4_AVX2

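// 3/4 point sampler: pshufb drops one pixel from every group of 4, writing
// 24 destination pixels per 32 source pixels.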
void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa      %0,%%xmm3                     \n"
      "movdqa      %1,%%xmm4                     \n"
      "movdqa      %2,%%xmm5                     \n"
      :
      : "m"(kShuf0),  // %0
        "m"(kShuf1),  // %1
        "m"(kShuf2)   // %2
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm2               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "palignr     $0x8,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm3,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm2                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x8(%1)                \n"
      "movq        %%xmm2,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),   // %0
                 "+r"(dst_ptr),   // %1
                 "+r"(dst_width)  // %2
                 ::"memory",
                 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

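// 3/4 box filter: the two source rows are averaged 1:1 (single pavgb), then
// kShuf01/11/21 and kMadd01/11/21 apply the horizontal 3:1 / 2:2 / 1:3
// weights, with kRound34 rounding before the >>2.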
void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"  // kShuf01
      "movdqa      %1,%%xmm3                     \n"  // kShuf11
      "movdqa      %2,%%xmm4                     \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa      %0,%%xmm5                     \n"  // kMadd01
      "movdqa      %1,%%xmm0                     \n"  // kMadd11
      "movdqa      %2,%%xmm1                     \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,(%1)                   \n"
      "movdqu      0x8(%0),%%xmm6                \n"
      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x8(%1)                \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %4,%%xmm6                     \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),                // %0
                 "+r"(dst_ptr),                // %1
                 "+r"(dst_width)               // %2
               : "r"((intptr_t)(src_stride)),  // %3
                 "m"(kMadd21)                  // %4
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6", "xmm7");
}

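// Same 3/4 horizontal filter, but the two source rows are blended roughly 3:1
// by the pair of pavgb instructions before the weighted sum.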
void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"  // kShuf01
      "movdqa      %1,%%xmm3                     \n"  // kShuf11
      "movdqa      %2,%%xmm4                     \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa      %0,%%xmm5                     \n"  // kMadd01
      "movdqa      %1,%%xmm0                     \n"  // kMadd11
      "movdqa      %2,%%xmm1                     \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );

  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,(%1)                   \n"
      "movdqu      0x8(%0),%%xmm6                \n"
      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x8(%1)                \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %4,%%xmm6                     \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),                // %0
                 "+r"(dst_ptr),                // %1
                 "+r"(dst_width)               // %2
               : "r"((intptr_t)(src_stride)),  // %3
                 "m"(kMadd21)                  // %4
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6", "xmm7");
}

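// 3/8 point sampler: kShuf38a/kShuf38b pick 6 bytes from each 16, writing
// 12 destination pixels per 32 source pixels.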
void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movhlps     %%xmm0,%%xmm1                 \n"
      "movd        %%xmm1,0x8(%1)                \n"
      "lea         0xc(%1),%1                    \n"
      "sub         $0xc,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "m"(kShuf38a),   // %3
        "m"(kShuf38b)    // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}

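// 3/8 box filter over 2 rows: the rows are averaged, then horizontal groups
// are summed and scaled by kScaleAb2 (65536/3 for 3-pixel groups, 65536/2 for
// the 2-pixel remainder).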
void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"
      "movdqa      %1,%%xmm3                     \n"
      "movdqa      %2,%%xmm4                     \n"
      "movdqa      %3,%%xmm5                     \n"
      :
      : "m"(kShufAb0),  // %0
        "m"(kShufAb1),  // %1
        "m"(kShufAb2),  // %2
        "m"(kScaleAb2)  // %3
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "pavgb       %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "paddusw     %%xmm6,%%xmm1                 \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "paddusw     %%xmm0,%%xmm1                 \n"
      "pmulhuw     %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movd        %%xmm1,(%1)                   \n"
      "psrlq       $0x10,%%xmm1                  \n"
      "movd        %%xmm1,0x2(%1)                \n"
      "lea         0x6(%1),%1                    \n"
      "sub         $0x6,%2                       \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),               // %0
                 "+r"(dst_ptr),               // %1
                 "+r"(dst_width)              // %2
               : "r"((intptr_t)(src_stride))  // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6");
}

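// 3/8 box filter over 3 rows, scaled by kScaleAc33 (65536/9 for 3x3 boxes,
// 65536/6 for the 2x3 remainder).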
void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"
      "movdqa      %1,%%xmm3                     \n"
      "movdqa      %2,%%xmm4                     \n"
      "pxor        %%xmm5,%%xmm5                 \n"
      :
      : "m"(kShufAc),    // %0
        "m"(kShufAc3),   // %1
        "m"(kScaleAc33)  // %2
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm6          \n"
      "movhlps     %%xmm0,%%xmm1                 \n"
      "movhlps     %%xmm6,%%xmm7                 \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm6                 \n"
      "punpcklbw   %%xmm5,%%xmm7                 \n"
      "paddusw     %%xmm6,%%xmm0                 \n"
      "paddusw     %%xmm7,%%xmm1                 \n"
      "movdqu      0x00(%0,%3,2),%%xmm6          \n"
      "lea         0x10(%0),%0                   \n"
      "movhlps     %%xmm6,%%xmm7                 \n"
      "punpcklbw   %%xmm5,%%xmm6                 \n"
      "punpcklbw   %%xmm5,%%xmm7                 \n"
      "paddusw     %%xmm6,%%xmm0                 \n"
      "paddusw     %%xmm7,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm6                 \n"
      "psrldq      $0x2,%%xmm0                   \n"
      "paddusw     %%xmm0,%%xmm6                 \n"
      "psrldq      $0x2,%%xmm0                   \n"
      "paddusw     %%xmm0,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "movdqa      %%xmm1,%%xmm7                 \n"
      "psrldq      $0x2,%%xmm1                   \n"
      "paddusw     %%xmm1,%%xmm7                 \n"
      "psrldq      $0x2,%%xmm1                   \n"
      "paddusw     %%xmm1,%%xmm7                 \n"
      "pshufb      %%xmm3,%%xmm7                 \n"
      "paddusw     %%xmm7,%%xmm6                 \n"
      "pmulhuw     %%xmm4,%%xmm6                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movd        %%xmm6,(%1)                   \n"
      "psrlq       $0x10,%%xmm6                  \n"
      "movd        %%xmm6,0x2(%1)                \n"
      "lea         0x6(%1),%1                    \n"
      "sub         $0x6,%2                       \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),               // %0
                 "+r"(dst_ptr),               // %1
                 "+r"(dst_width)              // %2
               : "r"((intptr_t)(src_stride))  // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6", "xmm7");
}

static const uvec8 kLinearShuffleFar = {2,  3,  0, 1, 6,  7,  4,  5,
                                        10, 11, 8, 9, 14, 15, 12, 13};

static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
                                    3, 1, 1, 3, 3, 1, 1, 3};

#ifdef HAS_SCALEROWUP2LINEAR_SSE2
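// 2x horizontal upsample: each output pixel is (3*near + far + 2) >> 2, i.e. a
// linear interpolation at the 1/4 and 3/4 phases.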
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int dst_width) {
  asm volatile(
      "pxor        %%xmm0,%%xmm0                 \n"  // 0
      "pcmpeqw     %%xmm6,%%xmm6                 \n"
      "psrlw       $15,%%xmm6                    \n"
      "psllw       $1,%%xmm6                     \n"  // all 2

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm1                   \n"  // 01234567
      "movq        1(%0),%%xmm2                  \n"  // 12345678
      "movdqa      %%xmm1,%%xmm3                 \n"
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
      "punpcklbw   %%xmm1,%%xmm1                 \n"  // 0011223344556677
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788
      "movdqa      %%xmm1,%%xmm4                 \n"
      "punpcklbw   %%xmm0,%%xmm4                 \n"  // 00112233 (16)
      "movdqa      %%xmm2,%%xmm5                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 11223344 (16)
      "paddw       %%xmm5,%%xmm4                 \n"
      "movdqa      %%xmm3,%%xmm5                 \n"
      "paddw       %%xmm6,%%xmm4                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 01122334 (16)
      "paddw       %%xmm5,%%xmm5                 \n"
      "paddw       %%xmm4,%%xmm5                 \n"  // 3*near+far+2 (lo)
      "psrlw       $2,%%xmm5                     \n"  // 3/4*near+1/4*far (lo)

      "punpckhbw   %%xmm0,%%xmm1                 \n"  // 44556677 (16)
      "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
      "paddw       %%xmm2,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
      "paddw       %%xmm6,%%xmm1                 \n"
      "paddw       %%xmm3,%%xmm3                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)
      "psrlw       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)

      "packuswb    %%xmm1,%%xmm5                 \n"
      "movdqu      %%xmm5,(%1)                   \n"

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
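// 2x bilinear upsample: combines the (3*near + far) horizontal sums from two
// source rows with 9:3:3:1 weights, adds 8 and shifts right by 4, writing two
// destination rows per pass.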
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               ptrdiff_t dst_stride,
                               int dst_width) {
  asm volatile(
      LABELALIGN
      "1:                                        \n"
      "pxor        %%xmm0,%%xmm0                 \n"  // 0
      // above line
      "movq        (%0),%%xmm1                   \n"  // 01234567
      "movq        1(%0),%%xmm2                  \n"  // 12345678
      "movdqa      %%xmm1,%%xmm3                 \n"
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
      "punpcklbw   %%xmm1,%%xmm1                 \n"  // 0011223344556677
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788

      "movdqa      %%xmm1,%%xmm4                 \n"
      "punpcklbw   %%xmm0,%%xmm4                 \n"  // 00112233 (16)
      "movdqa      %%xmm2,%%xmm5                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 11223344 (16)
      "paddw       %%xmm5,%%xmm4                 \n"  // near+far
      "movdqa      %%xmm3,%%xmm5                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 01122334 (16)
      "paddw       %%xmm5,%%xmm5                 \n"  // 2*near
      "paddw       %%xmm5,%%xmm4                 \n"  // 3*near+far (1, lo)

      "punpckhbw   %%xmm0,%%xmm1                 \n"  // 44556677 (16)
      "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
      "paddw       %%xmm2,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
      "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
      "paddw       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)

      // below line
      "movq        (%0,%3),%%xmm6                \n"  // 01234567
      "movq        1(%0,%3),%%xmm2               \n"  // 12345678
      "movdqa      %%xmm6,%%xmm3                 \n"
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
      "punpcklbw   %%xmm6,%%xmm6                 \n"  // 0011223344556677
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788

      "movdqa      %%xmm6,%%xmm5                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 00112233 (16)
      "movdqa      %%xmm2,%%xmm7                 \n"
      "punpcklbw   %%xmm0,%%xmm7                 \n"  // 11223344 (16)
      "paddw       %%xmm7,%%xmm5                 \n"  // near+far
      "movdqa      %%xmm3,%%xmm7                 \n"
      "punpcklbw   %%xmm0,%%xmm7                 \n"  // 01122334 (16)
      "paddw       %%xmm7,%%xmm7                 \n"  // 2*near
      "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far (2, lo)

      "punpckhbw   %%xmm0,%%xmm6                 \n"  // 44556677 (16)
      "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
      "paddw       %%xmm6,%%xmm2                 \n"  // near+far
      "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
      "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
      "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far (2, hi)

      // xmm4 xmm1
      // xmm5 xmm2
      "pcmpeqw     %%xmm0,%%xmm0                 \n"
      "psrlw       $15,%%xmm0                    \n"
      "psllw       $3,%%xmm0                     \n"  // all 8

      "movdqa      %%xmm4,%%xmm3                 \n"
      "movdqa      %%xmm5,%%xmm6                 \n"
      "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (1, lo)
      "paddw       %%xmm0,%%xmm6                 \n"  // 3*near+far+8 (2, lo)
      "paddw       %%xmm4,%%xmm3                 \n"  // 9*near+3*far (1, lo)
      "paddw       %%xmm6,%%xmm3                 \n"  // 9 3 3 1 + 8 (1, lo)
      "psrlw       $4,%%xmm3                     \n"  // ^ div by 16

      "movdqa      %%xmm1,%%xmm7                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "paddw       %%xmm7,%%xmm7                 \n"  // 6*near+2*far (1, hi)
      "paddw       %%xmm0,%%xmm6                 \n"  // 3*near+far+8 (2, hi)
      "paddw       %%xmm1,%%xmm7                 \n"  // 9*near+3*far (1, hi)
      "paddw       %%xmm6,%%xmm7                 \n"  // 9 3 3 1 + 8 (1, hi)
      "psrlw       $4,%%xmm7                     \n"  // ^ div by 16

      "packuswb    %%xmm7,%%xmm3                 \n"
      "movdqu      %%xmm3,(%1)                   \n"  // save above line

      "movdqa      %%xmm5,%%xmm3                 \n"
      "paddw       %%xmm0,%%xmm4                 \n"  // 3*near+far+8 (1, lo)
      "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (2, lo)
      "paddw       %%xmm3,%%xmm5                 \n"  // 9*near+3*far (2, lo)
      "paddw       %%xmm4,%%xmm5                 \n"  // 9 3 3 1 + 8 (lo)
      "psrlw       $4,%%xmm5                     \n"  // ^ div by 16

      "movdqa      %%xmm2,%%xmm3                 \n"
      "paddw       %%xmm0,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
      "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (2, hi)
      "paddw       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
      "paddw       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (hi)
      "psrlw       $4,%%xmm2                     \n"  // ^ div by 16

      "packuswb    %%xmm2,%%xmm5                 \n"
      "movdqu      %%xmm5,(%1,%4)                \n"  // save below line

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3
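// 12-bit (16-bit storage) variant of the linear upsampler: kLinearShuffleFar
// swaps each pair of words so the "far" neighbor lines up for the 3:1 blend.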
void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 int dst_width) {
  asm volatile(
      "movdqa      %3,%%xmm5                     \n"
      "pcmpeqw     %%xmm4,%%xmm4                 \n"
      "psrlw       $15,%%xmm4                    \n"
      "psllw       $1,%%xmm4                     \n"  // all 2

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // 01234567 (16)
      "movdqu      2(%0),%%xmm1                  \n"  // 12345678 (16)

      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpckhwd   %%xmm1,%%xmm2                 \n"  // 45566778 (16)
      "punpcklwd   %%xmm1,%%xmm0                 \n"  // 01122334 (16)

      "movdqa      %%xmm2,%%xmm3                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm3                 \n"  // 54657687 (far)
      "pshufb      %%xmm5,%%xmm1                 \n"  // 10213243 (far)

      "paddw       %%xmm4,%%xmm1                 \n"  // far+2
      "paddw       %%xmm4,%%xmm3                 \n"  // far+2
      "paddw       %%xmm0,%%xmm1                 \n"  // near+far+2
      "paddw       %%xmm2,%%xmm3                 \n"  // near+far+2
      "paddw       %%xmm0,%%xmm0                 \n"  // 2*near
      "paddw       %%xmm2,%%xmm2                 \n"  // 2*near
      "paddw       %%xmm1,%%xmm0                 \n"  // 3*near+far+2 (lo)
      "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far+2 (hi)

      "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far
      "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm2,16(%1)                 \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),          // %0
        "+r"(dst_ptr),          // %1
        "+r"(dst_width)         // %2
      : "m"(kLinearShuffleFar)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif

#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3
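// 12-bit bilinear upsampler: same 9:3:3:1 weighting as the 8-bit version, but
// kept in 16-bit words throughout (no pack/unpack to bytes).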
ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst_ptr,ptrdiff_t dst_stride,int dst_width)1003 void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
1004                                    ptrdiff_t src_stride,
1005                                    uint16_t* dst_ptr,
1006                                    ptrdiff_t dst_stride,
1007                                    int dst_width) {
1008   asm volatile(
1009       "pcmpeqw     %%xmm7,%%xmm7                 \n"
1010       "psrlw       $15,%%xmm7                    \n"
1011       "psllw       $3,%%xmm7                     \n"  // all 8
1012       "movdqa      %5,%%xmm6                     \n"
1013 
1014       LABELALIGN
1015       "1:                                        \n"
1016       // above line
1017       "movdqu      (%0),%%xmm0                   \n"  // 01234567 (16)
1018       "movdqu      2(%0),%%xmm1                  \n"  // 12345678 (16)
1019       "movdqa      %%xmm0,%%xmm2                 \n"
1020       "punpckhwd   %%xmm1,%%xmm2                 \n"  // 45566778 (16)
1021       "punpcklwd   %%xmm1,%%xmm0                 \n"  // 01122334 (16)
1022       "movdqa      %%xmm2,%%xmm3                 \n"
1023       "movdqa      %%xmm0,%%xmm1                 \n"
1024       "pshufb      %%xmm6,%%xmm3                 \n"  // 54657687 (far)
1025       "pshufb      %%xmm6,%%xmm1                 \n"  // 10213243 (far)
1026       "paddw       %%xmm0,%%xmm1                 \n"  // near+far
1027       "paddw       %%xmm2,%%xmm3                 \n"  // near+far
1028       "paddw       %%xmm0,%%xmm0                 \n"  // 2*near
1029       "paddw       %%xmm2,%%xmm2                 \n"  // 2*near
1030       "paddw       %%xmm1,%%xmm0                 \n"  // 3*near+far (1, lo)
1031       "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far (1, hi)
1032 
1033       // below line
1034       "movdqu      (%0,%3,2),%%xmm1              \n"  // 01234567 (16)
1035       "movdqu      2(%0,%3,2),%%xmm4             \n"  // 12345678 (16)
1036       "movdqa      %%xmm1,%%xmm3                 \n"
1037       "punpckhwd   %%xmm4,%%xmm3                 \n"  // 45566778 (16)
1038       "punpcklwd   %%xmm4,%%xmm1                 \n"  // 01122334 (16)
1039       "movdqa      %%xmm3,%%xmm5                 \n"
1040       "movdqa      %%xmm1,%%xmm4                 \n"
1041       "pshufb      %%xmm6,%%xmm5                 \n"  // 54657687 (far)
1042       "pshufb      %%xmm6,%%xmm4                 \n"  // 10213243 (far)
1043       "paddw       %%xmm1,%%xmm4                 \n"  // near+far
1044       "paddw       %%xmm3,%%xmm5                 \n"  // near+far
1045       "paddw       %%xmm1,%%xmm1                 \n"  // 2*near
1046       "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
1047       "paddw       %%xmm4,%%xmm1                 \n"  // 3*near+far (2, lo)
1048       "paddw       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)
1049 
1050       // xmm0 xmm2
1051       // xmm1 xmm3
1052 
1053       "movdqa      %%xmm0,%%xmm4                 \n"
1054       "movdqa      %%xmm1,%%xmm5                 \n"
1055       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1056       "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1057       "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1058       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1059       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16
1060       "movdqu      %%xmm4,(%1)                   \n"
1061 
1062       "movdqa      %%xmm2,%%xmm4                 \n"
1063       "movdqa      %%xmm3,%%xmm5                 \n"
1064       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (1, hi)
1065       "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far+8 (2, hi)
1066       "paddw       %%xmm2,%%xmm4                 \n"  // 9*near+3*far (1, hi)
1067       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, hi)
1068       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16
1069       "movdqu      %%xmm4,0x10(%1)               \n"
1070 
1071       "movdqa      %%xmm1,%%xmm4                 \n"
1072       "paddw       %%xmm7,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1073       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (2, lo)
1074       "paddw       %%xmm4,%%xmm1                 \n"  // 9*near+3*far (2, lo)
1075       "paddw       %%xmm0,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, lo)
1076       "psrlw       $4,%%xmm1                     \n"  // ^ div by 16
1077       "movdqu      %%xmm1,(%1,%4,2)              \n"
1078 
1079       "movdqa      %%xmm3,%%xmm4                 \n"
1080       "paddw       %%xmm7,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
1081       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (2, hi)
1082       "paddw       %%xmm4,%%xmm3                 \n"  // 9*near+3*far (2, hi)
1083       "paddw       %%xmm2,%%xmm3                 \n"  // 9 3 3 1 + 8 (2, hi)
1084       "psrlw       $4,%%xmm3                     \n"  // ^ div by 16
1085       "movdqu      %%xmm3,0x10(%1,%4,2)          \n"
1086 
1087       "lea         0x10(%0),%0                   \n"
1088       "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
1089       "sub         $0x10,%2                      \n"
1090       "jg          1b                            \n"
1091       : "+r"(src_ptr),                // %0
1092         "+r"(dst_ptr),                // %1
1093         "+r"(dst_width)               // %2
1094       : "r"((intptr_t)(src_stride)),  // %3
1095         "r"((intptr_t)(dst_stride)),  // %4
1096         "m"(kLinearShuffleFar)        // %5
1097       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
1098 }
1099 #endif
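
// Note on the _12_ vs _16_ variants: with at most 12 significant bits per
// sample, 3*near+far (at most 4*4095 = 16380) and the full 9+3+3+1 = 16
// weighted sum plus the rounding 8 (at most 16*4095 + 8 = 65528) still fit
// in unsigned 16-bit lanes, so the 12-bit kernels can filter directly with
// paddw/psrlw.  The _16_ kernels below must first widen samples to 32-bit
// lanes (punpcklwd/vpmovzxwd) before filtering.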
1100 
1101 #ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
1102 void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
1103                                 uint16_t* dst_ptr,
1104                                 int dst_width) {
1105   asm volatile(
1106       "pxor        %%xmm5,%%xmm5                 \n"
1107       "pcmpeqd     %%xmm4,%%xmm4                 \n"
1108       "psrld       $31,%%xmm4                    \n"
1109       "pslld       $1,%%xmm4                     \n"  // all 2
1110 
1111       LABELALIGN
1112       "1:                                        \n"
1113       "movq        (%0),%%xmm0                   \n"  // 0123 (16b)
1114       "movq        2(%0),%%xmm1                  \n"  // 1234 (16b)
1115 
1116       "punpcklwd   %%xmm5,%%xmm0                 \n"  // 0123 (32b)
1117       "punpcklwd   %%xmm5,%%xmm1                 \n"  // 1234 (32b)
1118 
1119       "movdqa      %%xmm0,%%xmm2                 \n"
1120       "movdqa      %%xmm1,%%xmm3                 \n"
1121 
1122       "pshufd      $0b10110001,%%xmm2,%%xmm2     \n"  // 1032 (even, far)
1123       "pshufd      $0b10110001,%%xmm3,%%xmm3     \n"  // 2143 (odd, far)
1124 
1125       "paddd       %%xmm4,%%xmm2                 \n"  // far+2 (lo)
1126       "paddd       %%xmm4,%%xmm3                 \n"  // far+2 (hi)
1127       "paddd       %%xmm0,%%xmm2                 \n"  // near+far+2 (lo)
1128       "paddd       %%xmm1,%%xmm3                 \n"  // near+far+2 (hi)
1129       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
1130       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
1131       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far+2 (lo)
1132       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)
1133 
1134       "psrld       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
1135       "psrld       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)
1136       "packssdw    %%xmm1,%%xmm0                 \n"
1137       "pshufd      $0b11011000,%%xmm0,%%xmm0     \n"
1138       "movdqu      %%xmm0,(%1)                   \n"
1139 
1140       "lea         0x8(%0),%0                    \n"
1141       "lea         0x10(%1),%1                   \n"  // 4 pixel to 8 pixel
1142       "sub         $0x8,%2                       \n"
1143       "jg          1b                            \n"
1144       : "+r"(src_ptr),   // %0
1145         "+r"(dst_ptr),   // %1
1146         "+r"(dst_width)  // %2
1147       :
1148       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1149 }
1150 #endif
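
// The loop above computes each pair of outputs from one source interval with
// 3/4 : 1/4 weights: (3*near + far + 2) >> 2.  A plain-C sketch of the same
// arithmetic (illustrative only; the helper below is not a libyuv API):
static void ScaleRowUp2Linear16Sketch(const uint16_t* src_ptr,
                                      uint16_t* dst_ptr,
                                      int dst_width) {
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    uint32_t near_px = src_ptr[x + 0];
    uint32_t far_px = src_ptr[x + 1];
    // Each source interval produces two outputs with swapped weights.
    dst_ptr[2 * x + 0] = (uint16_t)((3 * near_px + far_px + 2) >> 2);
    dst_ptr[2 * x + 1] = (uint16_t)((near_px + 3 * far_px + 2) >> 2);
  }
}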
1151 
1152 #ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
1153 void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
1154                                   ptrdiff_t src_stride,
1155                                   uint16_t* dst_ptr,
1156                                   ptrdiff_t dst_stride,
1157                                   int dst_width) {
1158   asm volatile(
1159       "pxor        %%xmm7,%%xmm7                 \n"
1160       "pcmpeqd     %%xmm6,%%xmm6                 \n"
1161       "psrld       $31,%%xmm6                    \n"
1162       "pslld       $3,%%xmm6                     \n"  // all 8
1163 
1164       LABELALIGN
1165       "1:                                        \n"
1181       "movq        (%0),%%xmm0                   \n"  // 0123 (16b)
1182       "movq        2(%0),%%xmm1                  \n"  // 1234 (16b)
1183       "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0123 (32b)
1184       "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1234 (32b)
1185       "movdqa      %%xmm0,%%xmm2                 \n"
1186       "movdqa      %%xmm1,%%xmm3                 \n"
1187       "pshufd      $0b10110001,%%xmm2,%%xmm2     \n"  // 1032 (even, far)
1188       "pshufd      $0b10110001,%%xmm3,%%xmm3     \n"  // 2143 (odd, far)
1189       "paddd       %%xmm0,%%xmm2                 \n"  // near+far (lo)
1190       "paddd       %%xmm1,%%xmm3                 \n"  // near+far (hi)
1191       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
1192       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
1193       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
1194       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
1195 
1196       "movq        (%0,%3,2),%%xmm2              \n"
1197       "movq        2(%0,%3,2),%%xmm3             \n"
1198       "punpcklwd   %%xmm7,%%xmm2                 \n"  // 0123 (32b)
1199       "punpcklwd   %%xmm7,%%xmm3                 \n"  // 1234 (32b)
1200       "movdqa      %%xmm2,%%xmm4                 \n"
1201       "movdqa      %%xmm3,%%xmm5                 \n"
1202       "pshufd      $0b10110001,%%xmm4,%%xmm4     \n"  // 1032 (even, far)
1203       "pshufd      $0b10110001,%%xmm5,%%xmm5     \n"  // 2143 (odd, far)
1204       "paddd       %%xmm2,%%xmm4                 \n"  // near+far (lo)
1205       "paddd       %%xmm3,%%xmm5                 \n"  // near+far (hi)
1206       "paddd       %%xmm2,%%xmm2                 \n"  // 2*near (lo)
1207       "paddd       %%xmm3,%%xmm3                 \n"  // 2*near (hi)
1208       "paddd       %%xmm4,%%xmm2                 \n"  // 3*near+far (2, lo)
1209       "paddd       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)
1210 
1211       "movdqa      %%xmm0,%%xmm4                 \n"
1212       "movdqa      %%xmm2,%%xmm5                 \n"
1213       "paddd       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1214       "paddd       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1215       "paddd       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1216       "paddd       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1217       "psrld       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
1218 
1219       "movdqa      %%xmm2,%%xmm5                 \n"
1220       "paddd       %%xmm2,%%xmm5                 \n"  // 6*near+2*far (2, lo)
1221       "paddd       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1222       "paddd       %%xmm2,%%xmm5                 \n"  // 9*near+3*far (2, lo)
1223       "paddd       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
1224       "psrld       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
1225 
1226       "movdqa      %%xmm1,%%xmm0                 \n"
1227       "movdqa      %%xmm3,%%xmm2                 \n"
1228       "paddd       %%xmm1,%%xmm0                 \n"  // 6*near+2*far (1, hi)
1229       "paddd       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (2, hi)
1230       "paddd       %%xmm1,%%xmm0                 \n"  // 9*near+3*far (1, hi)
1231       "paddd       %%xmm2,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
1232       "psrld       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
1233 
1234       "movdqa      %%xmm3,%%xmm2                 \n"
1235       "paddd       %%xmm3,%%xmm2                 \n"  // 6*near+2*far (2, hi)
1236       "paddd       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
1237       "paddd       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
1238       "paddd       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (2, hi)
1239       "psrld       $4,%%xmm2                     \n"  // ^ div by 16 (2, hi)
1240 
1241       "packssdw    %%xmm0,%%xmm4                 \n"
1242       "pshufd      $0b11011000,%%xmm4,%%xmm4     \n"
1243       "movdqu      %%xmm4,(%1)                   \n"  // store above
1244       "packssdw    %%xmm2,%%xmm5                 \n"
1245       "pshufd      $0b11011000,%%xmm5,%%xmm5     \n"
1246       "movdqu      %%xmm5,(%1,%4,2)              \n"  // store below
1247 
1248       "lea         0x8(%0),%0                    \n"
1249       "lea         0x10(%1),%1                   \n"  // 4 pixel to 8 pixel
1250       "sub         $0x8,%2                       \n"
1251       "jg          1b                            \n"
1252       : "+r"(src_ptr),                // %0
1253         "+r"(dst_ptr),                // %1
1254         "+r"(dst_width)               // %2
1255       : "r"((intptr_t)(src_stride)),  // %3
1256         "r"((intptr_t)(dst_stride))   // %4
1257       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
1258 }
1259 #endif
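
// The bilinear kernel above applies the 2D weights (9,3,3,1)/16 noted in the
// comments: each output is 9*nearest + 3*(horizontal far) + 3*(vertical far)
// + 1*(diagonal far), plus 8 for rounding, shifted right by 4.  A plain-C
// sketch (illustrative only; not a libyuv API):
static void ScaleRowUp2Bilinear16Sketch(const uint16_t* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint16_t* dst_ptr,
                                        ptrdiff_t dst_stride,
                                        int dst_width) {
  const uint16_t* s = src_ptr;               // source row 1
  const uint16_t* t = src_ptr + src_stride;  // source row 2
  uint16_t* d = dst_ptr;                     // output row stored above
  uint16_t* e = dst_ptr + dst_stride;        // output row stored below
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    d[2 * x + 0] =
        (uint16_t)((9 * s[x] + 3 * s[x + 1] + 3 * t[x] + t[x + 1] + 8) >> 4);
    d[2 * x + 1] =
        (uint16_t)((3 * s[x] + 9 * s[x + 1] + t[x] + 3 * t[x + 1] + 8) >> 4);
    e[2 * x + 0] =
        (uint16_t)((3 * s[x] + s[x + 1] + 9 * t[x] + 3 * t[x + 1] + 8) >> 4);
    e[2 * x + 1] =
        (uint16_t)((s[x] + 3 * s[x + 1] + 3 * t[x] + 9 * t[x + 1] + 8) >> 4);
  }
}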
1260 
1261 #ifdef HAS_SCALEROWUP2LINEAR_SSSE3
1262 void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
1263                               uint8_t* dst_ptr,
1264                               int dst_width) {
1265   asm volatile(
1266       "pcmpeqw     %%xmm4,%%xmm4                 \n"
1267       "psrlw       $15,%%xmm4                    \n"
1268       "psllw       $1,%%xmm4                     \n"  // all 2
1269       "movdqa      %3,%%xmm3                     \n"
1270 
1271       LABELALIGN
1272       "1:                                        \n"
1273       "movq        (%0),%%xmm0                   \n"  // 01234567
1274       "movq        1(%0),%%xmm1                  \n"  // 12345678
1275       "punpcklwd   %%xmm0,%%xmm0                 \n"  // 0101232345456767
1276       "punpcklwd   %%xmm1,%%xmm1                 \n"  // 1212343456567878
1277       "movdqa      %%xmm0,%%xmm2                 \n"
1278       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 4545565667677878
1279       "punpckldq   %%xmm1,%%xmm0                 \n"  // 0101121223233434
1280       "pmaddubsw   %%xmm3,%%xmm2                 \n"  // 3*near+far (hi)
1281       "pmaddubsw   %%xmm3,%%xmm0                 \n"  // 3*near+far (lo)
1282       "paddw       %%xmm4,%%xmm0                 \n"  // 3*near+far+2 (lo)
1283       "paddw       %%xmm4,%%xmm2                 \n"  // 3*near+far+2 (hi)
1284       "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
1285       "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far (hi)
1286       "packuswb    %%xmm2,%%xmm0                 \n"
1287       "movdqu      %%xmm0,(%1)                   \n"
1288       "lea         0x8(%0),%0                    \n"
1289       "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
1290       "sub         $0x10,%2                      \n"
1291       "jg          1b                            \n"
1292       : "+r"(src_ptr),      // %0
1293         "+r"(dst_ptr),      // %1
1294         "+r"(dst_width)     // %2
1295       : "m"(kLinearMadd31)  // %3
1296       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1297 }
1298 #endif
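
// pmaddubsw multiplies unsigned bytes (here the source pixels) by signed
// bytes (here the weights) and sums horizontal pairs into 16-bit lanes.
// After the punpck steps arrange each output's {near, far} byte pair, one
// pmaddubsw against the alternating {3,1}/{1,3} weights of kLinearMadd31
// yields 3*near+far (or near+3*far) per lane.  For example, with near = 200
// and far = 40: 3*200 + 40 = 640, then (640 + 2) >> 2 = 160, i.e. 3/4 of the
// way from 40 toward 200.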
1299 
1300 #ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
1301 void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
1302                                 ptrdiff_t src_stride,
1303                                 uint8_t* dst_ptr,
1304                                 ptrdiff_t dst_stride,
1305                                 int dst_width) {
1306   asm volatile(
1307       "pcmpeqw     %%xmm6,%%xmm6                 \n"
1308       "psrlw       $15,%%xmm6                    \n"
1309       "psllw       $3,%%xmm6                     \n"  // all 8
1310       "movdqa      %5,%%xmm7                     \n"
1311 
1312       LABELALIGN
1313       "1:                                        \n"
1314       "movq        (%0),%%xmm0                   \n"  // 01234567
1315       "movq        1(%0),%%xmm1                  \n"  // 12345678
1316       "punpcklwd   %%xmm0,%%xmm0                 \n"  // 0101232345456767
1317       "punpcklwd   %%xmm1,%%xmm1                 \n"  // 1212343456567878
1318       "movdqa      %%xmm0,%%xmm2                 \n"
1319       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 4545565667677878
1320       "punpckldq   %%xmm1,%%xmm0                 \n"  // 0101121223233434
1321       "pmaddubsw   %%xmm7,%%xmm2                 \n"  // 3*near+far (1, hi)
1322       "pmaddubsw   %%xmm7,%%xmm0                 \n"  // 3*near+far (1, lo)
1323 
1324       "movq        (%0,%3),%%xmm1                \n"
1325       "movq        1(%0,%3),%%xmm4               \n"
1326       "punpcklwd   %%xmm1,%%xmm1                 \n"
1327       "punpcklwd   %%xmm4,%%xmm4                 \n"
1328       "movdqa      %%xmm1,%%xmm3                 \n"
1329       "punpckhdq   %%xmm4,%%xmm3                 \n"
1330       "punpckldq   %%xmm4,%%xmm1                 \n"
1331       "pmaddubsw   %%xmm7,%%xmm3                 \n"  // 3*near+far (2, hi)
1332       "pmaddubsw   %%xmm7,%%xmm1                 \n"  // 3*near+far (2, lo)
1333 
1334       // xmm0 xmm2
1335       // xmm1 xmm3
1336 
1337       "movdqa      %%xmm0,%%xmm4                 \n"
1338       "movdqa      %%xmm1,%%xmm5                 \n"
1339       "paddw       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1340       "paddw       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1341       "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1342       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1343       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
1344 
1345       "movdqa      %%xmm1,%%xmm5                 \n"
1346       "paddw       %%xmm1,%%xmm5                 \n"  // 6*near+2*far (2, lo)
1347       "paddw       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1348       "paddw       %%xmm1,%%xmm5                 \n"  // 9*near+3*far (2, lo)
1349       "paddw       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
1350       "psrlw       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
1351 
1352       "movdqa      %%xmm2,%%xmm0                 \n"
1353       "movdqa      %%xmm3,%%xmm1                 \n"
1354       "paddw       %%xmm2,%%xmm0                 \n"  // 6*near+2*far (1, hi)
1355       "paddw       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (2, hi)
1356       "paddw       %%xmm2,%%xmm0                 \n"  // 9*near+3*far (1, hi)
1357       "paddw       %%xmm1,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
1358       "psrlw       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
1359 
1360       "movdqa      %%xmm3,%%xmm1                 \n"
1361       "paddw       %%xmm3,%%xmm1                 \n"  // 6*near+2*far (2, hi)
1362       "paddw       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
1363       "paddw       %%xmm3,%%xmm1                 \n"  // 9*near+3*far (2, hi)
1364       "paddw       %%xmm2,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, hi)
1365       "psrlw       $4,%%xmm1                     \n"  // ^ div by 16 (2, hi)
1366 
1367       "packuswb    %%xmm0,%%xmm4                 \n"
1368       "movdqu      %%xmm4,(%1)                   \n"  // store above
1369       "packuswb    %%xmm1,%%xmm5                 \n"
1370       "movdqu      %%xmm5,(%1,%4)                \n"  // store below
1371 
1372       "lea         0x8(%0),%0                    \n"
1373       "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
1374       "sub         $0x10,%2                      \n"
1375       "jg          1b                            \n"
1376       : "+r"(src_ptr),                // %0
1377         "+r"(dst_ptr),                // %1
1378         "+r"(dst_width)               // %2
1379       : "r"((intptr_t)(src_stride)),  // %3
1380         "r"((intptr_t)(dst_stride)),  // %4
1381         "m"(kLinearMadd31)            // %5
1382       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1383         "xmm7");
1384 }
1385 #endif
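
// The bilinear kernel above is separable: each of the two source rows is
// first filtered horizontally to 3*near+far with pmaddubsw, and the two
// filtered rows are then combined vertically with the same 3:1 weighting.
// Expanding one "above" output shows where the 9/3/3/1 weights and the +8
// rounding term come from:
//   out = (3*(3*s_near + s_far) + (3*t_near + t_far) + 8) >> 4
//       = (9*s_near + 3*s_far + 3*t_near + 1*t_far + 8) >> 4
// The weights sum to 16, so the shift by 4 renormalizes the result.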
1386 
1387 #ifdef HAS_SCALEROWUP2LINEAR_AVX2
1388 void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
1389                              uint8_t* dst_ptr,
1390                              int dst_width) {
1391   asm volatile(
1392       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1393       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1394       "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
1395       "vbroadcastf128 %3,%%ymm3                  \n"
1396 
1397       LABELALIGN
1398       "1:                                        \n"
1399       "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
1400       "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
1401       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
1402       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
1403       "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
1404       "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
1405       "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
1406       "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
1407       "vpmaddubsw  %%ymm3,%%ymm2,%%ymm1          \n"  // 3*near+far (hi)
1408       "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"  // 3*near+far (lo)
1409       "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"  // 3*near+far+2 (lo)
1410       "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // 3*near+far+2 (hi)
1411       "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
1412       "vpsrlw      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
1413       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
1414       "vmovdqu     %%ymm0,(%1)                   \n"
1415 
1416       "lea         0x10(%0),%0                   \n"
1417       "lea         0x20(%1),%1                   \n"  // 16 sample to 32 sample
1418       "sub         $0x20,%2                      \n"
1419       "jg          1b                            \n"
1420       "vzeroupper                                \n"
1421       : "+r"(src_ptr),      // %0
1422         "+r"(dst_ptr),      // %1
1423         "+r"(dst_width)     // %2
1424       : "m"(kLinearMadd31)  // %3
1425       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1426 }
1427 #endif
1428 
1429 #ifdef HAS_SCALEROWUP2BILINEAR_AVX2
1430 void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
1431                                ptrdiff_t src_stride,
1432                                uint8_t* dst_ptr,
1433                                ptrdiff_t dst_stride,
1434                                int dst_width) {
1435   asm volatile(
1436       "vpcmpeqw    %%ymm6,%%ymm6,%%ymm6          \n"
1437       "vpsrlw      $15,%%ymm6,%%ymm6             \n"
1438       "vpsllw      $3,%%ymm6,%%ymm6              \n"  // all 8
1439       "vbroadcastf128 %5,%%ymm7                  \n"
1440 
1441       LABELALIGN
1442       "1:                                        \n"
1443       "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
1444       "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
1445       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
1446       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
1447       "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
1448       "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
1449       "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
1450       "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
1451       "vpmaddubsw  %%ymm7,%%ymm2,%%ymm1          \n"  // 3*near+far (1, hi)
1452       "vpmaddubsw  %%ymm7,%%ymm0,%%ymm0          \n"  // 3*near+far (1, lo)
1453 
1454       "vmovdqu     (%0,%3),%%xmm2                \n"  // 0123456789ABCDEF
1455       "vmovdqu     1(%0,%3),%%xmm3               \n"  // 123456789ABCDEF0
1456       "vpermq      $0b11011000,%%ymm2,%%ymm2     \n"
1457       "vpermq      $0b11011000,%%ymm3,%%ymm3     \n"
1458       "vpunpcklwd  %%ymm2,%%ymm2,%%ymm2          \n"
1459       "vpunpcklwd  %%ymm3,%%ymm3,%%ymm3          \n"
1460       "vpunpckhdq  %%ymm3,%%ymm2,%%ymm4          \n"
1461       "vpunpckldq  %%ymm3,%%ymm2,%%ymm2          \n"
1462       "vpmaddubsw  %%ymm7,%%ymm4,%%ymm3          \n"  // 3*near+far (2, hi)
1463       "vpmaddubsw  %%ymm7,%%ymm2,%%ymm2          \n"  // 3*near+far (2, lo)
1464 
1465       // ymm0 ymm1
1466       // ymm2 ymm3
1467 
1468       "vpaddw      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
1469       "vpaddw      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
1470       "vpaddw      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
1471       "vpaddw      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
1472       "vpsrlw      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
1473 
1474       "vpaddw      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
1475       "vpaddw      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
1476       "vpaddw      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
1477       "vpaddw      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
1478       "vpsrlw      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
1479 
1480       "vpaddw      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
1481       "vpaddw      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
1482       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
1483       "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
1484       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
1485 
1486       "vpaddw      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
1487       "vpaddw      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
1488       "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
1489       "vpaddw      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
1490       "vpsrlw      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
1491 
1492       "vpackuswb   %%ymm0,%%ymm4,%%ymm4          \n"
1493       "vmovdqu     %%ymm4,(%1)                   \n"  // store above
1494       "vpackuswb   %%ymm2,%%ymm5,%%ymm5          \n"
1495       "vmovdqu     %%ymm5,(%1,%4)                \n"  // store below
1496 
1497       "lea         0x10(%0),%0                   \n"
1498       "lea         0x20(%1),%1                   \n"  // 16 sample to 32 sample
1499       "sub         $0x20,%2                      \n"
1500       "jg          1b                            \n"
1501       "vzeroupper                                \n"
1502       : "+r"(src_ptr),                // %0
1503         "+r"(dst_ptr),                // %1
1504         "+r"(dst_width)               // %2
1505       : "r"((intptr_t)(src_stride)),  // %3
1506         "r"((intptr_t)(dst_stride)),  // %4
1507         "m"(kLinearMadd31)            // %5
1508       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1509         "xmm7");
1510 }
1511 #endif
1512 
1513 #ifdef HAS_SCALEROWUP2LINEAR_12_AVX2
1514 void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
1515                                 uint16_t* dst_ptr,
1516                                 int dst_width) {
1517   asm volatile(
1518       "vbroadcastf128 %3,%%ymm5                  \n"
1519       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1520       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1521       "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
1522 
1523       LABELALIGN
1524       "1:                                        \n"
1525       "vmovdqu     (%0),%%ymm0                   \n"  // 0123456789ABCDEF (16b)
1526       "vmovdqu     2(%0),%%ymm1                  \n"  // 123456789ABCDEF0 (16b)
1527 
1528       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 012389AB4567CDEF
1529       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 12349ABC5678DEF0
1530 
1531       "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"  // 899AABBCCDDEEFF0 (near)
1532       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1533       "vpshufb     %%ymm5,%%ymm2,%%ymm3          \n"  // 98A9BACBDCEDFE0F (far)
1534       "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1535 
1536       "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // far+2
1537       "vpaddw      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2
1538       "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far+2
1539       "vpaddw      %%ymm2,%%ymm3,%%ymm3          \n"  // near+far+2
1540       "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1541       "vpaddw      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near
1542       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 3*near+far+2
1543       "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 3*near+far+2
1544 
1545       "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far
1546       "vpsrlw      $2,%%ymm2,%%ymm2              \n"  // 3/4*near+1/4*far
1547       "vmovdqu     %%ymm0,(%1)                   \n"
1548       "vmovdqu     %%ymm2,32(%1)                 \n"
1549 
1550       "lea         0x20(%0),%0                   \n"
1551       "lea         0x40(%1),%1                   \n"  // 16 sample to 32 sample
1552       "sub         $0x20,%2                      \n"
1553       "jg          1b                            \n"
1554       "vzeroupper                                \n"
1555       : "+r"(src_ptr),          // %0
1556         "+r"(dst_ptr),          // %1
1557         "+r"(dst_width)         // %2
1558       : "m"(kLinearShuffleFar)  // %3
1559       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1560 }
1561 #endif
1562 
1563 #ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2
1564 void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
1565                                   ptrdiff_t src_stride,
1566                                   uint16_t* dst_ptr,
1567                                   ptrdiff_t dst_stride,
1568                                   int dst_width) {
1569   asm volatile(
1570       "vbroadcastf128 %5,%%ymm5                  \n"
1571       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1572       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1573       "vpsllw      $3,%%ymm4,%%ymm4              \n"  // all 8
1574 
1575       LABELALIGN
1576       "1:                                        \n"
1577 
1578       "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b)
1579       "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b)
1580       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
1581       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
1582       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1583       "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1584       "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
1585       "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1586       "vpaddw      %%ymm0,%%ymm1,%%ymm2          \n"  // 3*near+far (1)
1587 
1588       "vmovdqu     (%0,%3,2),%%xmm0              \n"  // 01234567 (16b)
1589       "vmovdqu     2(%0,%3,2),%%xmm1             \n"  // 12345678 (16b)
1590       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
1591       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
1592       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1593       "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1594       "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
1595       "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1596       "vpaddw      %%ymm0,%%ymm1,%%ymm3          \n"  // 3*near+far (2)
1597 
1598       "vpaddw      %%ymm2,%%ymm2,%%ymm0          \n"  // 6*near+2*far (1)
1599       "vpaddw      %%ymm4,%%ymm3,%%ymm1          \n"  // 3*near+far+8 (2)
1600       "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9*near+3*far (1)
1601       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8 (1)
1602       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
1603       "vmovdqu     %%ymm0,(%1)                   \n"  // store above
1604 
1605       "vpaddw      %%ymm3,%%ymm3,%%ymm0          \n"  // 6*near+2*far (2)
1606       "vpaddw      %%ymm4,%%ymm2,%%ymm1          \n"  // 3*near+far+8 (1)
1607       "vpaddw      %%ymm0,%%ymm3,%%ymm0          \n"  // 9*near+3*far (2)
1608       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8 (2)
1609       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
1610       "vmovdqu     %%ymm0,(%1,%4,2)              \n"  // store below
1611 
1612       "lea         0x10(%0),%0                   \n"
1613       "lea         0x20(%1),%1                   \n"  // 8 sample to 16 sample
1614       "sub         $0x10,%2                      \n"
1615       "jg          1b                            \n"
1616       "vzeroupper                                \n"
1617       : "+r"(src_ptr),                // %0
1618         "+r"(dst_ptr),                // %1
1619         "+r"(dst_width)               // %2
1620       : "r"((intptr_t)(src_stride)),  // %3
1621         "r"((intptr_t)(dst_stride)),  // %4
1622         "m"(kLinearShuffleFar)        // %5
1623       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1624 }
1625 #endif
1626 
1627 #ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
1628 void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
1629                                 uint16_t* dst_ptr,
1630                                 int dst_width) {
1631   asm volatile(
1632       "vpcmpeqd    %%ymm4,%%ymm4,%%ymm4          \n"
1633       "vpsrld      $31,%%ymm4,%%ymm4             \n"
1634       "vpslld      $1,%%ymm4,%%ymm4              \n"  // all 2
1635 
1636       LABELALIGN
1637       "1:                                        \n"
1638       "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b, 1u1v)
1639       "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b, 1u1v)
1640 
1641       "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
1642       "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
1643 
1644       "vpshufd     $0b10110001,%%ymm0,%%ymm2     \n"  // 10325476 (lo, far)
1645       "vpshufd     $0b10110001,%%ymm1,%%ymm3     \n"  // 21436587 (hi, far)
1646 
1647       "vpaddd      %%ymm4,%%ymm2,%%ymm2          \n"  // far+2 (lo)
1648       "vpaddd      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2 (hi)
1649       "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far+2 (lo)
1650       "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far+2 (hi)
1651       "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
1652       "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
1653       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far+2 (lo)
1654       "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far+2 (hi)
1655 
1656       "vpsrld      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
1657       "vpsrld      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
1658       "vpackusdw   %%ymm1,%%ymm0,%%ymm0          \n"
1659       "vpshufd     $0b11011000,%%ymm0,%%ymm0     \n"
1660       "vmovdqu     %%ymm0,(%1)                   \n"
1661 
1662       "lea         0x10(%0),%0                   \n"
1663       "lea         0x20(%1),%1                   \n"  // 8 pixel to 16 pixel
1664       "sub         $0x10,%2                      \n"
1665       "jg          1b                            \n"
1666       "vzeroupper                                \n"
1667       : "+r"(src_ptr),   // %0
1668         "+r"(dst_ptr),   // %1
1669         "+r"(dst_width)  // %2
1670       :
1671       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1672 }
1673 #endif
1674 
1675 #ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
1676 void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
1677                                   ptrdiff_t src_stride,
1678                                   uint16_t* dst_ptr,
1679                                   ptrdiff_t dst_stride,
1680                                   int dst_width) {
1681   asm volatile(
1682       "vpcmpeqd    %%ymm6,%%ymm6,%%ymm6          \n"
1683       "vpsrld      $31,%%ymm6,%%ymm6             \n"
1684       "vpslld      $3,%%ymm6,%%ymm6              \n"  // all 8
1685 
1686       LABELALIGN
1687       "1:                                        \n"
1688 
1689       "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b, 1u1v)
1690       "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b, 1u1v)
1691       "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
1692       "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
1693       "vpshufd     $0b10110001,%%ymm0,%%ymm2     \n"  // 10325476 (lo, far)
1694       "vpshufd     $0b10110001,%%ymm1,%%ymm3     \n"  // 21436587 (hi, far)
1695       "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far (lo)
1696       "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far (hi)
1697       "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
1698       "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
1699       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far (1, lo)
1700       "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far (1, hi)
1701 
1702       "vmovdqu     (%0,%3,2),%%xmm2              \n"  // 01234567 (16b, 1u1v)
1703       "vmovdqu     2(%0,%3,2),%%xmm3             \n"  // 12345678 (16b, 1u1v)
1704       "vpmovzxwd   %%xmm2,%%ymm2                 \n"  // 01234567 (32b, 1u1v)
1705       "vpmovzxwd   %%xmm3,%%ymm3                 \n"  // 12345678 (32b, 1u1v)
1706       "vpshufd     $0b10110001,%%ymm2,%%ymm4     \n"  // 10325476 (lo, far)
1707       "vpshufd     $0b10110001,%%ymm3,%%ymm5     \n"  // 21436587 (hi, far)
1708       "vpaddd      %%ymm2,%%ymm4,%%ymm4          \n"  // near+far (lo)
1709       "vpaddd      %%ymm3,%%ymm5,%%ymm5          \n"  // near+far (hi)
1710       "vpaddd      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near (lo)
1711       "vpaddd      %%ymm3,%%ymm3,%%ymm3          \n"  // 2*near (hi)
1712       "vpaddd      %%ymm2,%%ymm4,%%ymm2          \n"  // 3*near+far (2, lo)
1713       "vpaddd      %%ymm3,%%ymm5,%%ymm3          \n"  // 3*near+far (2, hi)
1714 
1715       "vpaddd      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
1716       "vpaddd      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
1717       "vpaddd      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
1718       "vpaddd      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
1719       "vpsrld      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
1720 
1721       "vpaddd      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
1722       "vpaddd      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
1723       "vpaddd      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
1724       "vpaddd      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
1725       "vpsrld      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
1726 
1727       "vpaddd      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
1728       "vpaddd      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
1729       "vpaddd      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
1730       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
1731       "vpsrld      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
1732 
1733       "vpaddd      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
1734       "vpaddd      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
1735       "vpaddd      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
1736       "vpaddd      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
1737       "vpsrld      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
1738 
1739       "vpackusdw   %%ymm0,%%ymm4,%%ymm4          \n"
1740       "vpshufd     $0b11011000,%%ymm4,%%ymm4     \n"
1741       "vmovdqu     %%ymm4,(%1)                   \n"  // store above
1742       "vpackusdw   %%ymm2,%%ymm5,%%ymm5          \n"
1743       "vpshufd     $0b11011000,%%ymm5,%%ymm5     \n"
1744       "vmovdqu     %%ymm5,(%1,%4,2)              \n"  // store below
1745 
1746       "lea         0x10(%0),%0                   \n"
1747       "lea         0x20(%1),%1                   \n"  // 8 pixel to 16 pixel
1748       "sub         $0x10,%2                      \n"
1749       "jg          1b                            \n"
1750       "vzeroupper                                \n"
1751       : "+r"(src_ptr),                // %0
1752         "+r"(dst_ptr),                // %1
1753         "+r"(dst_width)               // %2
1754       : "r"((intptr_t)(src_stride)),  // %3
1755         "r"((intptr_t)(dst_stride))   // %4
1756       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1757 }
1758 #endif
1759 
1760 // Reads 16xN bytes and produces 16 shorts at a time.
1761 void ScaleAddRow_SSE2(const uint8_t* src_ptr,
1762                       uint16_t* dst_ptr,
1763                       int src_width) {
1764       asm volatile("pxor        %%xmm5,%%xmm5                 \n"
1765 
1766                // 16 pixel loop.
1767                LABELALIGN
1768       "1:                                        \n"
1769       "movdqu      (%0),%%xmm3                   \n"
1770       "lea         0x10(%0),%0                   \n"  // src_ptr += 16
1771       "movdqu      (%1),%%xmm0                   \n"
1772       "movdqu      0x10(%1),%%xmm1               \n"
1773       "movdqa      %%xmm3,%%xmm2                 \n"
1774       "punpcklbw   %%xmm5,%%xmm2                 \n"
1775       "punpckhbw   %%xmm5,%%xmm3                 \n"
1776       "paddusw     %%xmm2,%%xmm0                 \n"
1777       "paddusw     %%xmm3,%%xmm1                 \n"
1778       "movdqu      %%xmm0,(%1)                   \n"
1779       "movdqu      %%xmm1,0x10(%1)               \n"
1780       "lea         0x20(%1),%1                   \n"
1781       "sub         $0x10,%2                      \n"
1782       "jg          1b                            \n"
1783                : "+r"(src_ptr),   // %0
1784                  "+r"(dst_ptr),   // %1
1785                  "+r"(src_width)  // %2
1786                :
1787                : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1788 }
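
// ScaleAddRow_SSE2 widens 16 source bytes to 16-bit lanes and accumulates
// them into dst_ptr with saturating adds (paddusw).  A scalar sketch of one
// row's accumulation (illustrative only; not a libyuv API):
static void ScaleAddRowSketch(const uint8_t* src_ptr,
                              uint16_t* dst_ptr,
                              int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    uint32_t sum = (uint32_t)dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);  // paddusw saturates
  }
}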
1789 
1790 #ifdef HAS_SCALEADDROW_AVX2
1791 // Reads 32 bytes and accumulates to 32 shorts at a time.
1792 void ScaleAddRow_AVX2(const uint8_t* src_ptr,
1793                       uint16_t* dst_ptr,
1794                       int src_width) {
1795       asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
1796 
1797                LABELALIGN
1798       "1:                                        \n"
1799       "vmovdqu     (%0),%%ymm3                   \n"
1800       "lea         0x20(%0),%0                   \n"  // src_ptr += 32
1801       "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
1802       "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
1803       "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
1804       "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
1805       "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
1806       "vmovdqu     %%ymm0,(%1)                   \n"
1807       "vmovdqu     %%ymm1,0x20(%1)               \n"
1808       "lea         0x40(%1),%1                   \n"
1809       "sub         $0x20,%2                      \n"
1810       "jg          1b                            \n"
1811       "vzeroupper                                \n"
1812                : "+r"(src_ptr),   // %0
1813                  "+r"(dst_ptr),   // %1
1814                  "+r"(src_width)  // %2
1815                :
1816                : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1817 }
1818 #endif  // HAS_SCALEADDROW_AVX2
1819 
1820 // Constant for making pixels signed to avoid pmaddubsw
1821 // saturation.
1822 static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1823                               0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
1824 
1825 // Constant for making pixels unsigned and adding .5 for rounding.
1826 static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
1827                                0x4040, 0x4040, 0x4040, 0x4040};
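
// ScaleFilterCols_SSSE3 below blends two source pixels with 7-bit weights
// 128-f and f via pmaddubsw, which treats the pixel operand as signed bytes.
// Biasing the pixels by -128 (kFsub80) keeps them in signed range; since the
// weights always sum to 128, the bias shifts the result by exactly
// 128*128 = 16384, and kFadd40 (0x4040 = 16384 + 64) adds it back together
// with the 0.5 rounding term for the final ">> 7":
//   ((128 - f) * (p0 - 128) + f * (p1 - 128) + 0x4040) >> 7
//     == ((128 - f) * p0 + f * p1 + 64) >> 7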
1828 
1829 // Bilinear column filtering. SSSE3 version.
1830 void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
1831                            const uint8_t* src_ptr,
1832                            int dst_width,
1833                            int x,
1834                            int dx) {
1835   intptr_t x0, x1, temp_pixel;
1836   asm volatile(
1837       "movd        %6,%%xmm2                     \n"
1838       "movd        %7,%%xmm3                     \n"
1839       "movl        $0x04040000,%k2               \n"
1840       "movd        %k2,%%xmm5                    \n"
1841       "pcmpeqb     %%xmm6,%%xmm6                 \n"
1842       "psrlw       $0x9,%%xmm6                   \n"  // 0x007f007f
1843       "pcmpeqb     %%xmm7,%%xmm7                 \n"
1844       "psrlw       $15,%%xmm7                    \n"  // 0x00010001
1845 
1846       "pextrw      $0x1,%%xmm2,%k3               \n"
1847       "subl        $0x2,%5                       \n"
1848       "jl          29f                           \n"
1849       "movdqa      %%xmm2,%%xmm0                 \n"
1850       "paddd       %%xmm3,%%xmm0                 \n"
1851       "punpckldq   %%xmm0,%%xmm2                 \n"
1852       "punpckldq   %%xmm3,%%xmm3                 \n"
1853       "paddd       %%xmm3,%%xmm3                 \n"
1854       "pextrw      $0x3,%%xmm2,%k4               \n"
1855 
1856       LABELALIGN
1857       "2:                                        \n"
1858       "movdqa      %%xmm2,%%xmm1                 \n"
1859       "paddd       %%xmm3,%%xmm2                 \n"
1860       "movzwl      0x00(%1,%3,1),%k2             \n"
1861       "movd        %k2,%%xmm0                    \n"
1862       "psrlw       $0x9,%%xmm1                   \n"
1863       "movzwl      0x00(%1,%4,1),%k2             \n"
1864       "movd        %k2,%%xmm4                    \n"
1865       "pshufb      %%xmm5,%%xmm1                 \n"
1866       "punpcklwd   %%xmm4,%%xmm0                 \n"
1867       "psubb       %8,%%xmm0                     \n"  // make pixels signed.
1868       "pxor        %%xmm6,%%xmm1                 \n"  // 128 - f = (f ^ 127 ) +
1869                                                       // 1
1870       "paddusb     %%xmm7,%%xmm1                 \n"
1871       "pmaddubsw   %%xmm0,%%xmm1                 \n"
1872       "pextrw      $0x1,%%xmm2,%k3               \n"
1873       "pextrw      $0x3,%%xmm2,%k4               \n"
1874       "paddw       %9,%%xmm1                     \n"  // make pixels unsigned.
1875       "psrlw       $0x7,%%xmm1                   \n"
1876       "packuswb    %%xmm1,%%xmm1                 \n"
1877       "movd        %%xmm1,%k2                    \n"
1878       "mov         %w2,(%0)                      \n"
1879       "lea         0x2(%0),%0                    \n"
1880       "subl        $0x2,%5                       \n"
1881       "jge         2b                            \n"
1882 
1883       LABELALIGN
1884       "29:                                       \n"
1885       "addl        $0x1,%5                       \n"
1886       "jl          99f                           \n"
1887       "movzwl      0x00(%1,%3,1),%k2             \n"
1888       "movd        %k2,%%xmm0                    \n"
1889       "psrlw       $0x9,%%xmm2                   \n"
1890       "pshufb      %%xmm5,%%xmm2                 \n"
1891       "psubb       %8,%%xmm0                     \n"  // make pixels signed.
1892       "pxor        %%xmm6,%%xmm2                 \n"
1893       "paddusb     %%xmm7,%%xmm2                 \n"
1894       "pmaddubsw   %%xmm0,%%xmm2                 \n"
1895       "paddw       %9,%%xmm2                     \n"  // make pixels unsigned.
1896       "psrlw       $0x7,%%xmm2                   \n"
1897       "packuswb    %%xmm2,%%xmm2                 \n"
1898       "movd        %%xmm2,%k2                    \n"
1899       "mov         %b2,(%0)                      \n"
1900       "99:                                       \n"
1901       : "+r"(dst_ptr),      // %0
1902         "+r"(src_ptr),      // %1
1903         "=&a"(temp_pixel),  // %2
1904         "=&r"(x0),          // %3
1905         "=&r"(x1),          // %4
1906 #if defined(__x86_64__)
1907         "+rm"(dst_width)  // %5
1908 #else
1909         "+m"(dst_width)  // %5
1910 #endif
1911       : "rm"(x),   // %6
1912         "rm"(dx),  // %7
1913 #if defined(__x86_64__)
1914         "x"(kFsub80),  // %8
1915         "x"(kFadd40)   // %9
1916 #else
1917         "m"(kFsub80),    // %8
1918         "m"(kFadd40)     // %9
1919 #endif
1920       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1921         "xmm7");
1922 }
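
// A scalar sketch of the column filter above (illustrative only; not a
// libyuv API).  x and dx are 16.16 fixed point; the blend uses the top
// 7 bits of the fraction:
static void ScaleFilterColsSketch(uint8_t* dst_ptr,
                                  const uint8_t* src_ptr,
                                  int dst_width,
                                  int x,
                                  int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer source index
    int f = (x >> 9) & 0x7f;  // 7-bit fraction, 0..127
    int p0 = src_ptr[xi];
    int p1 = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)(((128 - f) * p0 + f * p1 + 64) >> 7);
    x += dx;
  }
}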
1923 
1924 // Reads 16 pixels, duplicates them and writes 32 pixels.
1925 // No alignment requirement: unaligned (movdqu) loads and stores are used.
1926 void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
1927                        const uint8_t* src_ptr,
1928                        int dst_width,
1929                        int x,
1930                        int dx) {
1931   (void)x;
1932   (void)dx;
1933   asm volatile(LABELALIGN
1934       "1:                                        \n"
1935       "movdqu      (%1),%%xmm0                   \n"
1936       "lea         0x10(%1),%1                   \n"
1937       "movdqa      %%xmm0,%%xmm1                 \n"
1938       "punpcklbw   %%xmm0,%%xmm0                 \n"
1939       "punpckhbw   %%xmm1,%%xmm1                 \n"
1940       "movdqu      %%xmm0,(%0)                   \n"
1941       "movdqu      %%xmm1,0x10(%0)               \n"
1942       "lea         0x20(%0),%0                   \n"
1943       "sub         $0x20,%2                      \n"
1944       "jg          1b                            \n"
1945 
1946                : "+r"(dst_ptr),   // %0
1947                  "+r"(src_ptr),   // %1
1948                  "+r"(dst_width)  // %2
1949                  ::"memory",
1950                  "cc", "xmm0", "xmm1");
1951 }
1952 
1953 void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
1954                             ptrdiff_t src_stride,
1955                             uint8_t* dst_argb,
1956                             int dst_width) {
1957   (void)src_stride;
1958   asm volatile(LABELALIGN
1959       "1:                                        \n"
1960       "movdqu      (%0),%%xmm0                   \n"
1961       "movdqu      0x10(%0),%%xmm1               \n"
1962       "lea         0x20(%0),%0                   \n"
1963       "shufps      $0xdd,%%xmm1,%%xmm0           \n"
1964       "movdqu      %%xmm0,(%1)                   \n"
1965       "lea         0x10(%1),%1                   \n"
1966       "sub         $0x4,%2                       \n"
1967       "jg          1b                            \n"
1968                : "+r"(src_argb),  // %0
1969                  "+r"(dst_argb),  // %1
1970                  "+r"(dst_width)  // %2
1971                  ::"memory",
1972                  "cc", "xmm0", "xmm1");
1973 }
1974 
1975 void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
1976                                   ptrdiff_t src_stride,
1977                                   uint8_t* dst_argb,
1978                                   int dst_width) {
1979   (void)src_stride;
1980   asm volatile(LABELALIGN
1981       "1:                                        \n"
1982       "movdqu      (%0),%%xmm0                   \n"
1983       "movdqu      0x10(%0),%%xmm1               \n"
1984       "lea         0x20(%0),%0                   \n"
1985       "movdqa      %%xmm0,%%xmm2                 \n"
1986       "shufps      $0x88,%%xmm1,%%xmm0           \n"
1987       "shufps      $0xdd,%%xmm1,%%xmm2           \n"
1988       "pavgb       %%xmm2,%%xmm0                 \n"
1989       "movdqu      %%xmm0,(%1)                   \n"
1990       "lea         0x10(%1),%1                   \n"
1991       "sub         $0x4,%2                       \n"
1992       "jg          1b                            \n"
1993                : "+r"(src_argb),  // %0
1994                  "+r"(dst_argb),  // %1
1995                  "+r"(dst_width)  // %2
1996                  ::"memory",
1997                  "cc", "xmm0", "xmm1");
1998 }
1999 
2000 void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
2001                                ptrdiff_t src_stride,
2002                                uint8_t* dst_argb,
2003                                int dst_width) {
2004   asm volatile(LABELALIGN
2005       "1:                                        \n"
2006       "movdqu      (%0),%%xmm0                   \n"
2007       "movdqu      0x10(%0),%%xmm1               \n"
2008       "movdqu      0x00(%0,%3,1),%%xmm2          \n"
2009       "movdqu      0x10(%0,%3,1),%%xmm3          \n"
2010       "lea         0x20(%0),%0                   \n"
2011       "pavgb       %%xmm2,%%xmm0                 \n"
2012       "pavgb       %%xmm3,%%xmm1                 \n"
2013       "movdqa      %%xmm0,%%xmm2                 \n"
2014       "shufps      $0x88,%%xmm1,%%xmm0           \n"
2015       "shufps      $0xdd,%%xmm1,%%xmm2           \n"
2016       "pavgb       %%xmm2,%%xmm0                 \n"
2017       "movdqu      %%xmm0,(%1)                   \n"
2018       "lea         0x10(%1),%1                   \n"
2019       "sub         $0x4,%2                       \n"
2020       "jg          1b                            \n"
2021                : "+r"(src_argb),              // %0
2022                  "+r"(dst_argb),              // %1
2023                  "+r"(dst_width)              // %2
2024                : "r"((intptr_t)(src_stride))  // %3
2025                : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2026 }
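
// The 2x2 box reduction above uses pavgb, the rounding byte average
// (a + b + 1) >> 1, twice: first vertically across the two rows, then
// horizontally across adjacent pixels.  Per channel (illustrative helper,
// not a libyuv API):
static uint8_t ScaleBox2x2Sketch(uint8_t tl, uint8_t tr,
                                 uint8_t bl, uint8_t br) {
  uint8_t v0 = (uint8_t)((tl + bl + 1) >> 1);  // vertical pavgb
  uint8_t v1 = (uint8_t)((tr + br + 1) >> 1);
  return (uint8_t)((v0 + v1 + 1) >> 1);        // horizontal pavgb
}
// This matches the exact (tl + tr + bl + br + 2) >> 2 box average to within
// one rounding step.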
2027 
2028 // Reads 4 pixels at a time.
2029 // Alignment requirement: dst_argb 16 byte aligned.
2030 void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
2031                                ptrdiff_t src_stride,
2032                                int src_stepx,
2033                                uint8_t* dst_argb,
2034                                int dst_width) {
2035   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2036   intptr_t src_stepx_x12;
2037   (void)src_stride;
2038   asm volatile(
2039       "lea         0x00(,%1,4),%1                \n"
2040       "lea         0x00(%1,%1,2),%4              \n"
2041 
2042       LABELALIGN
2043       "1:                                        \n"
2044       "movd        (%0),%%xmm0                   \n"
2045       "movd        0x00(%0,%1,1),%%xmm1          \n"
2046       "punpckldq   %%xmm1,%%xmm0                 \n"
2047       "movd        0x00(%0,%1,2),%%xmm2          \n"
2048       "movd        0x00(%0,%4,1),%%xmm3          \n"
2049       "lea         0x00(%0,%1,4),%0              \n"
2050       "punpckldq   %%xmm3,%%xmm2                 \n"
2051       "punpcklqdq  %%xmm2,%%xmm0                 \n"
2052       "movdqu      %%xmm0,(%2)                   \n"
2053       "lea         0x10(%2),%2                   \n"
2054       "sub         $0x4,%3                       \n"
2055       "jg          1b                            \n"
2056       : "+r"(src_argb),       // %0
2057         "+r"(src_stepx_x4),   // %1
2058         "+r"(dst_argb),       // %2
2059         "+r"(dst_width),      // %3
2060         "=&r"(src_stepx_x12)  // %4
2061         ::"memory",
2062         "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2063 }
2064 
2065 // Blends four 2x2 to 4x1.
2066 // Alignment requirement: dst_argb 16 byte aligned.
2067 void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
2068                                   ptrdiff_t src_stride,
2069                                   int src_stepx,
2070                                   uint8_t* dst_argb,
2071                                   int dst_width) {
2072   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2073   intptr_t src_stepx_x12;
2074   intptr_t row1 = (intptr_t)(src_stride);
2075   asm volatile(
2076       "lea         0x00(,%1,4),%1                \n"
2077       "lea         0x00(%1,%1,2),%4              \n"
2078       "lea         0x00(%0,%5,1),%5              \n"
2079 
2080       LABELALIGN
2081       "1:                                        \n"
2082       "movq        (%0),%%xmm0                   \n"
2083       "movhps      0x00(%0,%1,1),%%xmm0          \n"
2084       "movq        0x00(%0,%1,2),%%xmm1          \n"
2085       "movhps      0x00(%0,%4,1),%%xmm1          \n"
2086       "lea         0x00(%0,%1,4),%0              \n"
2087       "movq        (%5),%%xmm2                   \n"
2088       "movhps      0x00(%5,%1,1),%%xmm2          \n"
2089       "movq        0x00(%5,%1,2),%%xmm3          \n"
2090       "movhps      0x00(%5,%4,1),%%xmm3          \n"
2091       "lea         0x00(%5,%1,4),%5              \n"
2092       "pavgb       %%xmm2,%%xmm0                 \n"
2093       "pavgb       %%xmm3,%%xmm1                 \n"
2094       "movdqa      %%xmm0,%%xmm2                 \n"
2095       "shufps      $0x88,%%xmm1,%%xmm0           \n"
2096       "shufps      $0xdd,%%xmm1,%%xmm2           \n"
2097       "pavgb       %%xmm2,%%xmm0                 \n"
2098       "movdqu      %%xmm0,(%2)                   \n"
2099       "lea         0x10(%2),%2                   \n"
2100       "sub         $0x4,%3                       \n"
2101       "jg          1b                            \n"
2102       : "+r"(src_argb),        // %0
2103         "+r"(src_stepx_x4),    // %1
2104         "+r"(dst_argb),        // %2
2105         "+rm"(dst_width),      // %3
2106         "=&r"(src_stepx_x12),  // %4
2107         "+r"(row1)             // %5
2108         ::"memory",
2109         "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2110 }
2111 
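// Editor's note: illustrative scalar model of the box kernel above, not part
// of the original source. Each output pixel averages a 2x2 block whose left
// column is at pixel i * src_stepx in rows src_argb and src_argb + src_stride.
// The SIMD path uses cascaded pavgb, so its rounding can differ from this
// (a + b + c + d + 2) >> 2 form by at most one. The name is hypothetical.
static void ScaleARGBRowDownEvenBox_Sketch(const uint8_t* src_argb,
                                           ptrdiff_t src_stride,
                                           int src_stepx,
                                           uint8_t* dst_argb,
                                           int dst_width) {
  int i, c;
  for (i = 0; i < dst_width; ++i) {
    const uint8_t* p0 = src_argb + (intptr_t)i * src_stepx * 4;
    const uint8_t* p1 = p0 + src_stride;
    for (c = 0; c < 4; ++c) {  // B, G, R, A channels
      dst_argb[i * 4 + c] =
          (uint8_t)((p0[c] + p0[4 + c] + p1[c] + p1[4 + c] + 2) >> 2);
    }
  }
}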
2112 void ScaleARGBCols_SSE2(uint8_t* dst_argb,
2113                         const uint8_t* src_argb,
2114                         int dst_width,
2115                         int x,
2116                         int dx) {
2117   intptr_t x0, x1;
2118   asm volatile(
2119       "movd        %5,%%xmm2                     \n"
2120       "movd        %6,%%xmm3                     \n"
2121       "pshufd      $0x0,%%xmm2,%%xmm2            \n"
2122       "pshufd      $0x11,%%xmm3,%%xmm0           \n"
2123       "paddd       %%xmm0,%%xmm2                 \n"
2124       "paddd       %%xmm3,%%xmm3                 \n"
2125       "pshufd      $0x5,%%xmm3,%%xmm0            \n"
2126       "paddd       %%xmm0,%%xmm2                 \n"
2127       "paddd       %%xmm3,%%xmm3                 \n"
2128       "pshufd      $0x0,%%xmm3,%%xmm3            \n"
2129       "pextrw      $0x1,%%xmm2,%k0               \n"
2130       "pextrw      $0x3,%%xmm2,%k1               \n"
2131       "cmp         $0x0,%4                       \n"
2132       "jl          99f                           \n"
2133       "sub         $0x4,%4                       \n"
2134       "jl          49f                           \n"
2135 
2136       LABELALIGN
2137       "40:                                       \n"
2138       "movd        0x00(%3,%0,4),%%xmm0          \n"
2139       "movd        0x00(%3,%1,4),%%xmm1          \n"
2140       "pextrw      $0x5,%%xmm2,%k0               \n"
2141       "pextrw      $0x7,%%xmm2,%k1               \n"
2142       "paddd       %%xmm3,%%xmm2                 \n"
2143       "punpckldq   %%xmm1,%%xmm0                 \n"
2144       "movd        0x00(%3,%0,4),%%xmm1          \n"
2145       "movd        0x00(%3,%1,4),%%xmm4          \n"
2146       "pextrw      $0x1,%%xmm2,%k0               \n"
2147       "pextrw      $0x3,%%xmm2,%k1               \n"
2148       "punpckldq   %%xmm4,%%xmm1                 \n"
2149       "punpcklqdq  %%xmm1,%%xmm0                 \n"
2150       "movdqu      %%xmm0,(%2)                   \n"
2151       "lea         0x10(%2),%2                   \n"
2152       "sub         $0x4,%4                       \n"
2153       "jge         40b                           \n"
2154 
2155       "49:                                       \n"
2156       "test        $0x2,%4                       \n"
2157       "je          29f                           \n"
2158       "movd        0x00(%3,%0,4),%%xmm0          \n"
2159       "movd        0x00(%3,%1,4),%%xmm1          \n"
2160       "pextrw      $0x5,%%xmm2,%k0               \n"
2161       "punpckldq   %%xmm1,%%xmm0                 \n"
2162       "movq        %%xmm0,(%2)                   \n"
2163       "lea         0x8(%2),%2                    \n"
2164       "29:                                       \n"
2165       "test        $0x1,%4                       \n"
2166       "je          99f                           \n"
2167       "movd        0x00(%3,%0,4),%%xmm0          \n"
2168       "movd        %%xmm0,(%2)                   \n"
2169       "99:                                       \n"
2170       : "=&a"(x0),       // %0
2171         "=&d"(x1),       // %1
2172         "+r"(dst_argb),  // %2
2173         "+r"(src_argb),  // %3
2174         "+r"(dst_width)  // %4
2175       : "rm"(x),         // %5
2176         "rm"(dx)         // %6
2177       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2178 }
2179 
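// Editor's note: illustrative scalar model of the column sampler above, not
// part of the original source. x and dx are 16.16 fixed point; each output
// pixel copies the source pixel at x >> 16 and then advances x by dx (the
// SSE2 path keeps four x values in xmm2 and extracts indices with pextrw).
// The name is hypothetical.
static void ScaleARGBCols_Sketch(uint8_t* dst_argb,
                                 const uint8_t* src_argb,
                                 int dst_width,
                                 int x,
                                 int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;  // 1 ARGB pixel per word
  uint32_t* dst = (uint32_t*)dst_argb;
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];  // integer part of the 16.16 coordinate
    x += dx;
  }
}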
2180 // Reads 4 pixels, duplicates them and writes 8 pixels.
2181 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
2182 void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
2183                            const uint8_t* src_argb,
2184                            int dst_width,
2185                            int x,
2186                            int dx) {
2187   (void)x;
2188   (void)dx;
2189   asm volatile(LABELALIGN
2190       "1:                                        \n"
2191       "movdqu      (%1),%%xmm0                   \n"
2192       "lea         0x10(%1),%1                   \n"
2193       "movdqa      %%xmm0,%%xmm1                 \n"
2194       "punpckldq   %%xmm0,%%xmm0                 \n"
2195       "punpckhdq   %%xmm1,%%xmm1                 \n"
2196       "movdqu      %%xmm0,(%0)                   \n"
2197       "movdqu      %%xmm1,0x10(%0)               \n"
2198       "lea         0x20(%0),%0                   \n"
2199       "sub         $0x8,%2                       \n"
2200       "jg          1b                            \n"
2201 
2202                : "+r"(dst_argb),  // %0
2203                  "+r"(src_argb),  // %1
2204                  "+r"(dst_width)  // %2
2205                  ::"memory",
2206                  "cc", "xmm0", "xmm1");
2207 }
2208 
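// Editor's note: illustrative scalar model of the 2x column duplication
// above, not part of the original source: every source ARGB pixel is simply
// written twice. The name is hypothetical.
static void ScaleARGBColsUp2_Sketch(uint8_t* dst_argb,
                                    const uint8_t* src_argb,
                                    int dst_width) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  int i;
  for (i = 0; i < dst_width; i += 2) {
    dst[i] = dst[i + 1] = src[i >> 1];
  }
}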
2209 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
2210 static const uvec8 kShuffleColARGB = {
2211     0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
2212     8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
2213 };
2214 
2215 // Shuffle table for duplicating 2 fractions into 8 bytes each
2216 static const uvec8 kShuffleFractions = {
2217     0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
2218 };
2219 
2220 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
2221 void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
2222                                const uint8_t* src_argb,
2223                                int dst_width,
2224                                int x,
2225                                int dx) {
2226   intptr_t x0, x1;
2227   asm volatile(
2228       "movdqa      %0,%%xmm4                     \n"
2229       "movdqa      %1,%%xmm5                     \n"
2230       :
2231       : "m"(kShuffleColARGB),   // %0
2232         "m"(kShuffleFractions)  // %1
2233   );
2234 
2235   asm volatile(
2236       "movd        %5,%%xmm2                     \n"
2237       "movd        %6,%%xmm3                     \n"
2238       "pcmpeqb     %%xmm6,%%xmm6                 \n"
2239       "psrlw       $0x9,%%xmm6                   \n"
2240       "pextrw      $0x1,%%xmm2,%k3               \n"
2241       "sub         $0x2,%2                       \n"
2242       "jl          29f                           \n"
2243       "movdqa      %%xmm2,%%xmm0                 \n"
2244       "paddd       %%xmm3,%%xmm0                 \n"
2245       "punpckldq   %%xmm0,%%xmm2                 \n"
2246       "punpckldq   %%xmm3,%%xmm3                 \n"
2247       "paddd       %%xmm3,%%xmm3                 \n"
2248       "pextrw      $0x3,%%xmm2,%k4               \n"
2249 
2250       LABELALIGN
2251       "2:                                        \n"
2252       "movdqa      %%xmm2,%%xmm1                 \n"
2253       "paddd       %%xmm3,%%xmm2                 \n"
2254       "movq        0x00(%1,%3,4),%%xmm0          \n"
2255       "psrlw       $0x9,%%xmm1                   \n"
2256       "movhps      0x00(%1,%4,4),%%xmm0          \n"
2257       "pshufb      %%xmm5,%%xmm1                 \n"
2258       "pshufb      %%xmm4,%%xmm0                 \n"
2259       "pxor        %%xmm6,%%xmm1                 \n"
2260       "pmaddubsw   %%xmm1,%%xmm0                 \n"
2261       "psrlw       $0x7,%%xmm0                   \n"
2262       "pextrw      $0x1,%%xmm2,%k3               \n"
2263       "pextrw      $0x3,%%xmm2,%k4               \n"
2264       "packuswb    %%xmm0,%%xmm0                 \n"
2265       "movq        %%xmm0,(%0)                   \n"
2266       "lea         0x8(%0),%0                    \n"
2267       "sub         $0x2,%2                       \n"
2268       "jge         2b                            \n"
2269 
2270       LABELALIGN
2271       "29:                                       \n"
2272       "add         $0x1,%2                       \n"
2273       "jl          99f                           \n"
2274       "psrlw       $0x9,%%xmm2                   \n"
2275       "movq        0x00(%1,%3,4),%%xmm0          \n"
2276       "pshufb      %%xmm5,%%xmm2                 \n"
2277       "pshufb      %%xmm4,%%xmm0                 \n"
2278       "pxor        %%xmm6,%%xmm2                 \n"
2279       "pmaddubsw   %%xmm2,%%xmm0                 \n"
2280       "psrlw       $0x7,%%xmm0                   \n"
2281       "packuswb    %%xmm0,%%xmm0                 \n"
2282       "movd        %%xmm0,(%0)                   \n"
2283 
2284       LABELALIGN
2285       "99:                                       \n"  // clang-format error.
2286 
2287       : "+r"(dst_argb),    // %0
2288         "+r"(src_argb),    // %1
2289         "+rm"(dst_width),  // %2
2290         "=&r"(x0),         // %3
2291         "=&r"(x1)          // %4
2292       : "rm"(x),           // %5
2293         "rm"(dx)           // %6
2294       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2295 }
2296 
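// Editor's note: illustrative scalar model of the bilinear column filter
// above, not part of the original source. The SSSE3 path keeps a 7 bit
// fraction ((x >> 9) & 0x7f) per output pixel and blends each byte channel of
// the two neighboring source pixels with pmaddubsw; the weights below mirror
// that arithmetic (0x7f ^ f is 127 - f). The name is hypothetical.
static void ScaleARGBFilterCols_Sketch(uint8_t* dst_argb,
                                       const uint8_t* src_argb,
                                       int dst_width,
                                       int x,
                                       int dx) {
  int i, c;
  for (i = 0; i < dst_width; ++i) {
    int xi = x >> 16;         // integer pixel index
    int f = (x >> 9) & 0x7f;  // 7 bit fraction toward the next pixel
    const uint8_t* a = src_argb + xi * 4;
    const uint8_t* b = a + 4;
    for (c = 0; c < 4; ++c) {
      dst_argb[i * 4 + c] = (uint8_t)((a[c] * (0x7f ^ f) + b[c] * f) >> 7);
    }
    x += dx;
  }
}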
2297 // Divide num by div and return as 16.16 fixed point result.
2298 int FixedDiv_X86(int num, int div) {
2299   asm volatile(
2300       "cdq                                       \n"
2301       "shld        $0x10,%%eax,%%edx             \n"
2302       "shl         $0x10,%%eax                   \n"
2303       "idiv        %1                            \n"
2304       "mov         %0, %%eax                     \n"
2305       : "+a"(num)  // %0
2306       : "c"(div)   // %1
2307       : "memory", "cc", "edx");
2308   return num;
2309 }
2310 
2311 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
2312 int FixedDiv1_X86(int num, int div) {
2313   asm volatile(
2314       "cdq                                       \n"
2315       "shld        $0x10,%%eax,%%edx             \n"
2316       "shl         $0x10,%%eax                   \n"
2317       "sub         $0x10001,%%eax                \n"
2318       "sbb         $0x0,%%edx                    \n"
2319       "sub         $0x1,%1                       \n"
2320       "idiv        %1                            \n"
2321       "mov         %0, %%eax                     \n"
2322       : "+a"(num)  // %0
2323       : "c"(div)   // %1
2324       : "memory", "cc", "edx");
2325   return num;
2326 }
2327 
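// Editor's note: illustrative C equivalents of the two fixed point helpers
// above, not part of the original source. Both shift the numerator into
// 16.16 fixed point before the signed divide, which is what the
// shld/shl/idiv sequences compute in edx:eax; FixedDiv1 additionally
// subtracts 0x00010001 (i.e. 1.0 plus one ulp, giving num - 1 in 16.16)
// from the numerator and 1 from the divisor. The names are hypothetical.
static int FixedDiv_Sketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}

static int FixedDiv1_Sketch(int num, int div) {
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}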
2328 #ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
2329 // Shuffle table for splitting UV into upper and lower part of register.
2330 static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
2331                                       1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
2332 static const uvec8 kShuffleMergeUV = {0u,   8u,   2u,   10u,  4u,   12u,
2333                                       6u,   14u,  0x80, 0x80, 0x80, 0x80,
2334                                       0x80, 0x80, 0x80, 0x80};
2335 
2336 void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
2337                               ptrdiff_t src_stride,
2338                               uint8_t* dst_ptr,
2339                               int dst_width) {
2340   asm volatile(
2341       "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 01010101
2342       "psrlw       $0xf,%%xmm4                   \n"
2343       "packuswb    %%xmm4,%%xmm4                 \n"
2344       "pxor        %%xmm5, %%xmm5                \n"  // zero
2345       "movdqa      %4,%%xmm1                     \n"  // split shuffler
2346       "movdqa      %5,%%xmm3                     \n"  // merge shuffler
2347 
2348       LABELALIGN
2349       "1:                                        \n"
2350       "movdqu      (%0),%%xmm0                   \n"  // 8 UV row 0
2351       "movdqu      0x00(%0,%3,1),%%xmm2          \n"  // 8 UV row 1
2352       "lea         0x10(%0),%0                   \n"
2353       "pshufb      %%xmm1,%%xmm0                 \n"  // uuuuvvvv
2354       "pshufb      %%xmm1,%%xmm2                 \n"
2355       "pmaddubsw   %%xmm4,%%xmm0                 \n"  // horizontal add
2356       "pmaddubsw   %%xmm4,%%xmm2                 \n"
2357       "paddw       %%xmm2,%%xmm0                 \n"  // vertical add
2358       "psrlw       $0x1,%%xmm0                   \n"  // round
2359       "pavgw       %%xmm5,%%xmm0                 \n"
2360       "pshufb      %%xmm3,%%xmm0                 \n"  // merge uv
2361       "movq        %%xmm0,(%1)                   \n"
2362       "lea         0x8(%1),%1                    \n"  // 4 UV
2363       "sub         $0x4,%2                       \n"
2364       "jg          1b                            \n"
2365       : "+r"(src_ptr),                // %0
2366         "+r"(dst_ptr),                // %1
2367         "+r"(dst_width)               // %2
2368       : "r"((intptr_t)(src_stride)),  // %3
2369         "m"(kShuffleSplitUV),         // %4
2370         "m"(kShuffleMergeUV)          // %5
2371       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2372 }
2373 #endif  // HAS_SCALEUVROWDOWN2BOX_SSSE3
2374 
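// Editor's note: illustrative scalar model of the UV 2x2 box filter above,
// not part of the original source. Each output UV pair averages a 2x2 block
// of interleaved UV samples with rounding; the pmaddubsw/paddw/psrlw/pavgw
// sequence computes the same (sum + 2) >> 2 result. The name is hypothetical.
static void ScaleUVRowDown2Box_Sketch(const uint8_t* src_uv,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_uv,
                                      int dst_width) {
  const uint8_t* s = src_uv;               // row 0
  const uint8_t* t = src_uv + src_stride;  // row 1
  int i;
  for (i = 0; i < dst_width; ++i) {  // dst_width counts output UV pairs
    dst_uv[0] = (uint8_t)((s[0] + s[2] + t[0] + t[2] + 2) >> 2);  // U
    dst_uv[1] = (uint8_t)((s[1] + s[3] + t[1] + t[3] + 2) >> 2);  // V
    s += 4;
    t += 4;
    dst_uv += 2;
  }
}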
2375 #ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
2376 void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
2377                              ptrdiff_t src_stride,
2378                              uint8_t* dst_ptr,
2379                              int dst_width) {
2380   asm volatile(
2381       "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 01010101
2382       "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
2383       "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
2384       "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"  // zero
2385       "vbroadcastf128 %4,%%ymm1                  \n"  // split shuffler
2386       "vbroadcastf128 %5,%%ymm3                  \n"  // merge shuffler
2387 
2388       LABELALIGN
2389       "1:                                        \n"
2390       "vmovdqu     (%0),%%ymm0                   \n"  // 16 UV row 0
2391       "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"  // 16 UV row 1
2392       "lea         0x20(%0),%0                   \n"
2393       "vpshufb     %%ymm1,%%ymm0,%%ymm0          \n"  // uuuuvvvv
2394       "vpshufb     %%ymm1,%%ymm2,%%ymm2          \n"
2395       "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // horizontal add
2396       "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
2397       "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"  // vertical add
2398       "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"  // round
2399       "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
2400       "vpshufb     %%ymm3,%%ymm0,%%ymm0          \n"  // merge uv
2401       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // combine qwords
2402       "vmovdqu     %%xmm0,(%1)                   \n"
2403       "lea         0x10(%1),%1                   \n"  // 8 UV
2404       "sub         $0x8,%2                       \n"
2405       "jg          1b                            \n"
2406       "vzeroupper                                \n"
2407       : "+r"(src_ptr),                // %0
2408         "+r"(dst_ptr),                // %1
2409         "+r"(dst_width)               // %2
2410       : "r"((intptr_t)(src_stride)),  // %3
2411         "m"(kShuffleSplitUV),         // %4
2412         "m"(kShuffleMergeUV)          // %5
2413       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2414 }
2415 #endif  // HAS_SCALEUVROWDOWN2BOX_AVX2
2416 
2417 static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
2418                                       3, 1, 3, 1, 1, 3, 1, 3};
2419 
2420 #ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
2421 void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
2422                                 uint8_t* dst_ptr,
2423                                 int dst_width) {
2424   asm volatile(
2425       "pcmpeqw     %%xmm4,%%xmm4                 \n"
2426       "psrlw       $15,%%xmm4                    \n"
2427       "psllw       $1,%%xmm4                     \n"  // all 2
2428       "movdqa      %3,%%xmm3                     \n"
2429 
2430       LABELALIGN
2431       "1:                                        \n"
2432       "movq        (%0),%%xmm0                   \n"  // 00112233 (1u1v)
2433       "movq        2(%0),%%xmm1                  \n"  // 11223344 (1u1v)
2434       "punpcklbw   %%xmm1,%%xmm0                 \n"  // 0101121223233434 (2u2v)
2435       "movdqa      %%xmm0,%%xmm2                 \n"
2436       "punpckhdq   %%xmm0,%%xmm2                 \n"  // 2323232334343434 (2u2v)
2437       "punpckldq   %%xmm0,%%xmm0                 \n"  // 0101010112121212 (2u2v)
2438       "pmaddubsw   %%xmm3,%%xmm2                 \n"  // 3*near+far (1u1v16, hi)
2439       "pmaddubsw   %%xmm3,%%xmm0                 \n"  // 3*near+far (1u1v16, lo)
2440       "paddw       %%xmm4,%%xmm0                 \n"  // 3*near+far+2 (lo)
2441       "paddw       %%xmm4,%%xmm2                 \n"  // 3*near+far+2 (hi)
2442       "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
2443       "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far (hi)
2444       "packuswb    %%xmm2,%%xmm0                 \n"
2445       "movdqu      %%xmm0,(%1)                   \n"
2446 
2447       "lea         0x8(%0),%0                    \n"
2448       "lea         0x10(%1),%1                   \n"  // 4 uv to 8 uv
2449       "sub         $0x8,%2                       \n"
2450       "jg          1b                            \n"
2451       : "+r"(src_ptr),        // %0
2452         "+r"(dst_ptr),        // %1
2453         "+r"(dst_width)       // %2
2454       : "m"(kUVLinearMadd31)  // %3
2455       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2456 }
2457 #endif
2458 
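// Editor's note: illustrative scalar model of the 2x horizontal (linear) UV
// upsample above, not part of the original source. Each output sample is
// (3 * near + far + 2) >> 2, where "near" is the source UV pair the output
// sits closest to and "far" is its horizontal neighbor; the kUVLinearMadd31
// table feeds exactly these 3:1 weights to pmaddubsw. dst_width is assumed
// to count output UV pairs. The name is hypothetical.
static void ScaleUVRowUp2_Linear_Sketch(const uint8_t* src_uv,
                                        uint8_t* dst_uv,
                                        int dst_width) {
  int src_pairs = dst_width >> 1;
  int x;
  for (x = 0; x < src_pairs; ++x) {
    const uint8_t* s = src_uv + 2 * x;  // pairs x and x + 1
    uint8_t* d = dst_uv + 4 * x;        // two output pairs per input pair
    d[0] = (uint8_t)((3 * s[0] + s[2] + 2) >> 2);  // U, nearer to pair x
    d[1] = (uint8_t)((3 * s[1] + s[3] + 2) >> 2);  // V, nearer to pair x
    d[2] = (uint8_t)((s[0] + 3 * s[2] + 2) >> 2);  // U, nearer to pair x + 1
    d[3] = (uint8_t)((s[1] + 3 * s[3] + 2) >> 2);  // V, nearer to pair x + 1
  }
}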
2459 #ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
2460 void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
2461                                   ptrdiff_t src_stride,
2462                                   uint8_t* dst_ptr,
2463                                   ptrdiff_t dst_stride,
2464                                   int dst_width) {
2465   asm volatile(
2466       "pcmpeqw     %%xmm6,%%xmm6                 \n"
2467       "psrlw       $15,%%xmm6                    \n"
2468       "psllw       $3,%%xmm6                     \n"  // all 8
2469       "movdqa      %5,%%xmm7                     \n"
2470 
2471       LABELALIGN
2472       "1:                                        \n"
2473       "movq        (%0),%%xmm0                   \n"  // 00112233 (1u1v)
2474       "movq        2(%0),%%xmm1                  \n"  // 11223344 (1u1v)
2475       "punpcklbw   %%xmm1,%%xmm0                 \n"  // 0101121223233434 (2u2v)
2476       "movdqa      %%xmm0,%%xmm2                 \n"
2477       "punpckhdq   %%xmm0,%%xmm2                 \n"  // 2323232334343434 (2u2v)
2478       "punpckldq   %%xmm0,%%xmm0                 \n"  // 0101010112121212 (2u2v)
2479       "pmaddubsw   %%xmm7,%%xmm2                 \n"  // 3*near+far (1u1v16, hi)
2480       "pmaddubsw   %%xmm7,%%xmm0                 \n"  // 3*near+far (1u1v16, lo)
2481 
2482       "movq        (%0,%3),%%xmm1                \n"
2483       "movq        2(%0,%3),%%xmm4               \n"
2484       "punpcklbw   %%xmm4,%%xmm1                 \n"
2485       "movdqa      %%xmm1,%%xmm3                 \n"
2486       "punpckhdq   %%xmm1,%%xmm3                 \n"
2487       "punpckldq   %%xmm1,%%xmm1                 \n"
2488       "pmaddubsw   %%xmm7,%%xmm3                 \n"  // 3*near+far (2, hi)
2489       "pmaddubsw   %%xmm7,%%xmm1                 \n"  // 3*near+far (2, lo)
2490 
2491       // xmm0 xmm2
2492       // xmm1 xmm3
2493 
2494       "movdqa      %%xmm0,%%xmm4                 \n"
2495       "movdqa      %%xmm1,%%xmm5                 \n"
2496       "paddw       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
2497       "paddw       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
2498       "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
2499       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
2500       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
2501 
2502       "movdqa      %%xmm1,%%xmm5                 \n"
2503       "paddw       %%xmm1,%%xmm5                 \n"  // 6*near+2*far (2, lo)
2504       "paddw       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
2505       "paddw       %%xmm1,%%xmm5                 \n"  // 9*near+3*far (2, lo)
2506       "paddw       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
2507       "psrlw       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
2508 
2509       "movdqa      %%xmm2,%%xmm0                 \n"
2510       "movdqa      %%xmm3,%%xmm1                 \n"
2511       "paddw       %%xmm2,%%xmm0                 \n"  // 6*near+2*far (1, hi)
2512       "paddw       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (2, hi)
2513       "paddw       %%xmm2,%%xmm0                 \n"  // 9*near+3*far (1, hi)
2514       "paddw       %%xmm1,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
2515       "psrlw       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
2516 
2517       "movdqa      %%xmm3,%%xmm1                 \n"
2518       "paddw       %%xmm3,%%xmm1                 \n"  // 6*near+2*far (2, hi)
2519       "paddw       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
2520       "paddw       %%xmm3,%%xmm1                 \n"  // 9*near+3*far (2, hi)
2521       "paddw       %%xmm2,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, hi)
2522       "psrlw       $4,%%xmm1                     \n"  // ^ div by 16 (2, hi)
2523 
2524       "packuswb    %%xmm0,%%xmm4                 \n"
2525       "movdqu      %%xmm4,(%1)                   \n"  // store above
2526       "packuswb    %%xmm1,%%xmm5                 \n"
2527       "movdqu      %%xmm5,(%1,%4)                \n"  // store below
2528 
2529       "lea         0x8(%0),%0                    \n"
2530       "lea         0x10(%1),%1                   \n"  // 4 uv to 8 uv
2531       "sub         $0x8,%2                       \n"
2532       "jg          1b                            \n"
2533       : "+r"(src_ptr),                // %0
2534         "+r"(dst_ptr),                // %1
2535         "+r"(dst_width)               // %2
2536       : "r"((intptr_t)(src_stride)),  // %3
2537         "r"((intptr_t)(dst_stride)),  // %4
2538         "m"(kUVLinearMadd31)          // %5
2539       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2540         "xmm7");
2541 }
2542 #endif
2543 
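// Editor's note: illustrative model of the 2x2 (bilinear) UV upsample above,
// not part of the original source. Horizontally the weights are 3:1 as in the
// linear kernel; combining the two source rows the same way yields the
// 9:3:3:1 weighting, with +8 rounding and a divide by 16 implemented by the
// paddw/psrlw $4 sequences. The helper name is hypothetical.
static uint8_t ScaleUVUp2BilinearSample_Sketch(uint8_t nearest,
                                               uint8_t h_neighbor,
                                               uint8_t v_neighbor,
                                               uint8_t diagonal) {
  return (uint8_t)((9 * nearest + 3 * h_neighbor + 3 * v_neighbor + diagonal +
                    8) >> 4);
}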
2544 #ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
2545 
2546 void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
2547                                uint8_t* dst_ptr,
2548                                int dst_width) {
2549   asm volatile(
2550       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
2551       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
2552       "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
2553       "vbroadcastf128 %3,%%ymm3                  \n"
2554 
2555       LABELALIGN
2556       "1:                                        \n"
2557       "vmovdqu     (%0),%%xmm0                   \n"
2558       "vmovdqu     2(%0),%%xmm1                  \n"
2559       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
2560       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
2561       "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
2562       "vpunpckhdq  %%ymm0,%%ymm0,%%ymm2          \n"
2563       "vpunpckldq  %%ymm0,%%ymm0,%%ymm0          \n"
2564       "vpmaddubsw  %%ymm3,%%ymm2,%%ymm1          \n"  // 3*near+far (hi)
2565       "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"  // 3*near+far (lo)
2566       "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"  // 3*near+far+2 (lo)
2567       "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // 3*near+far+2 (hi)
2568       "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
2569       "vpsrlw      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
2570       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
2571       "vmovdqu     %%ymm0,(%1)                   \n"
2572 
2573       "lea         0x10(%0),%0                   \n"
2574       "lea         0x20(%1),%1                   \n"  // 8 uv to 16 uv
2575       "sub         $0x10,%2                      \n"
2576       "jg          1b                            \n"
2577       "vzeroupper                                \n"
2578       : "+r"(src_ptr),        // %0
2579         "+r"(dst_ptr),        // %1
2580         "+r"(dst_width)       // %2
2581       : "m"(kUVLinearMadd31)  // %3
2582       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2583 }
2584 #endif
2585 
2586 #ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
2587 void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
2588                                  ptrdiff_t src_stride,
2589                                  uint8_t* dst_ptr,
2590                                  ptrdiff_t dst_stride,
2591                                  int dst_width) {
2592   asm volatile(
2593       "vpcmpeqw    %%ymm6,%%ymm6,%%ymm6          \n"
2594       "vpsrlw      $15,%%ymm6,%%ymm6             \n"
2595       "vpsllw      $3,%%ymm6,%%ymm6              \n"  // all 8
2596       "vbroadcastf128 %5,%%ymm7                  \n"
2597 
2598       LABELALIGN
2599       "1:                                        \n"
2600       "vmovdqu     (%0),%%xmm0                   \n"
2601       "vmovdqu     2(%0),%%xmm1                  \n"
2602       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
2603       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
2604       "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
2605       "vpunpckhdq  %%ymm0,%%ymm0,%%ymm2          \n"
2606       "vpunpckldq  %%ymm0,%%ymm0,%%ymm0          \n"
2607       "vpmaddubsw  %%ymm7,%%ymm2,%%ymm1          \n"  // 3*near+far (1, hi)
2608       "vpmaddubsw  %%ymm7,%%ymm0,%%ymm0          \n"  // 3*near+far (1, lo)
2609 
2610       "vmovdqu     (%0,%3),%%xmm2                \n"  // 0123456789ABCDEF
2611       "vmovdqu     2(%0,%3),%%xmm3               \n"  // 123456789ABCDEF0
2612       "vpermq      $0b11011000,%%ymm2,%%ymm2     \n"
2613       "vpermq      $0b11011000,%%ymm3,%%ymm3     \n"
2614       "vpunpcklbw  %%ymm3,%%ymm2,%%ymm2          \n"
2615       "vpunpckhdq  %%ymm2,%%ymm2,%%ymm4          \n"
2616       "vpunpckldq  %%ymm2,%%ymm2,%%ymm2          \n"
2617       "vpmaddubsw  %%ymm7,%%ymm4,%%ymm3          \n"  // 3*near+far (2, hi)
2618       "vpmaddubsw  %%ymm7,%%ymm2,%%ymm2          \n"  // 3*near+far (2, lo)
2619 
2620       // ymm0 ymm1
2621       // ymm2 ymm3
2622 
2623       "vpaddw      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
2624       "vpaddw      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
2625       "vpaddw      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
2626       "vpaddw      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
2627       "vpsrlw      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
2628 
2629       "vpaddw      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
2630       "vpaddw      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
2631       "vpaddw      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
2632       "vpaddw      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
2633       "vpsrlw      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
2634 
2635       "vpaddw      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
2636       "vpaddw      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
2637       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
2638       "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
2639       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
2640 
2641       "vpaddw      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
2642       "vpaddw      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
2643       "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
2644       "vpaddw      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
2645       "vpsrlw      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
2646 
2647       "vpackuswb   %%ymm0,%%ymm4,%%ymm4          \n"
2648       "vmovdqu     %%ymm4,(%1)                   \n"  // store above
2649       "vpackuswb   %%ymm2,%%ymm5,%%ymm5          \n"
2650       "vmovdqu     %%ymm5,(%1,%4)                \n"  // store below
2651 
2652       "lea         0x10(%0),%0                   \n"
2653       "lea         0x20(%1),%1                   \n"  // 8 uv to 16 uv
2654       "sub         $0x10,%2                      \n"
2655       "jg          1b                            \n"
2656       "vzeroupper                                \n"
2657       : "+r"(src_ptr),                // %0
2658         "+r"(dst_ptr),                // %1
2659         "+r"(dst_width)               // %2
2660       : "r"((intptr_t)(src_stride)),  // %3
2661         "r"((intptr_t)(dst_stride)),  // %4
2662         "m"(kUVLinearMadd31)          // %5
2663       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2664         "xmm7");
2665 }
2666 #endif
2667 
2668 #ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41
2669 void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
2670                                    uint16_t* dst_ptr,
2671                                    int dst_width) {
2672   asm volatile(
2673       "pxor        %%xmm5,%%xmm5                 \n"
2674       "pcmpeqd     %%xmm4,%%xmm4                 \n"
2675       "psrld       $31,%%xmm4                    \n"
2676       "pslld       $1,%%xmm4                     \n"  // all 2
2677 
2678       LABELALIGN
2679       "1:                                        \n"
2680       "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
2681       "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)
2682 
2683       "punpcklwd   %%xmm5,%%xmm0                 \n"  // 0011 (32b, 1u1v)
2684       "punpcklwd   %%xmm5,%%xmm1                 \n"  // 1122 (32b, 1u1v)
2685 
2686       "movdqa      %%xmm0,%%xmm2                 \n"
2687       "movdqa      %%xmm1,%%xmm3                 \n"
2688 
2689       "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (lo, far)
2690       "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (hi, far)
2691 
2692       "paddd       %%xmm4,%%xmm2                 \n"  // far+2 (lo)
2693       "paddd       %%xmm4,%%xmm3                 \n"  // far+2 (hi)
2694       "paddd       %%xmm0,%%xmm2                 \n"  // near+far+2 (lo)
2695       "paddd       %%xmm1,%%xmm3                 \n"  // near+far+2 (hi)
2696       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
2697       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
2698       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far+2 (lo)
2699       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)
2700 
2701       "psrld       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
2702       "psrld       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)
2703       "packusdw    %%xmm1,%%xmm0                 \n"
2704       "movdqu      %%xmm0,(%1)                   \n"
2705 
2706       "lea         0x8(%0),%0                    \n"
2707       "lea         0x10(%1),%1                   \n"  // 2 uv to 4 uv
2708       "sub         $0x4,%2                       \n"
2709       "jg          1b                            \n"
2710       : "+r"(src_ptr),   // %0
2711         "+r"(dst_ptr),   // %1
2712         "+r"(dst_width)  // %2
2713       :
2714       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2715 }
2716 #endif
2717 
2718 #ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41
2719 void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
2720                                      ptrdiff_t src_stride,
2721                                      uint16_t* dst_ptr,
2722                                      ptrdiff_t dst_stride,
2723                                      int dst_width) {
2724   asm volatile(
2725       "pxor        %%xmm7,%%xmm7                 \n"
2726       "pcmpeqd     %%xmm6,%%xmm6                 \n"
2727       "psrld       $31,%%xmm6                    \n"
2728       "pslld       $3,%%xmm6                     \n"  // all 8
2729 
2730       LABELALIGN
2731       "1:                                        \n"
2732       "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
2733       "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)
2734       "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0011 (near) (32b, 1u1v)
2735       "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1122 (near) (32b, 1u1v)
2736       "movdqa      %%xmm0,%%xmm2                 \n"
2737       "movdqa      %%xmm1,%%xmm3                 \n"
2738       "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (far) (1, lo)
2739       "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (far) (1, hi)
2740       "paddd       %%xmm0,%%xmm2                 \n"  // near+far (1, lo)
2741       "paddd       %%xmm1,%%xmm3                 \n"  // near+far (1, hi)
2742       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (1, lo)
2743       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (1, hi)
2744       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
2745       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
2746 
2747       "movq        (%0,%3,2),%%xmm2              \n"
2748       "movq        4(%0,%3,2),%%xmm3             \n"
2749       "punpcklwd   %%xmm7,%%xmm2                 \n"
2750       "punpcklwd   %%xmm7,%%xmm3                 \n"
2751       "movdqa      %%xmm2,%%xmm4                 \n"
2752       "movdqa      %%xmm3,%%xmm5                 \n"
2753       "pshufd      $0b01001110,%%xmm4,%%xmm4     \n"  // 1100 (far) (2, lo)
2754       "pshufd      $0b01001110,%%xmm5,%%xmm5     \n"  // 2211 (far) (2, hi)
2755       "paddd       %%xmm2,%%xmm4                 \n"  // near+far (2, lo)
2756       "paddd       %%xmm3,%%xmm5                 \n"  // near+far (2, hi)
2757       "paddd       %%xmm2,%%xmm2                 \n"  // 2*near (2, lo)
2758       "paddd       %%xmm3,%%xmm3                 \n"  // 2*near (2, hi)
2759       "paddd       %%xmm4,%%xmm2                 \n"  // 3*near+far (2, lo)
2760       "paddd       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)
2761 
2762       "movdqa      %%xmm0,%%xmm4                 \n"
2763       "movdqa      %%xmm2,%%xmm5                 \n"
2764       "paddd       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
2765       "paddd       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
2766       "paddd       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
2767       "paddd       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
2768       "psrld       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
2769 
2770       "movdqa      %%xmm2,%%xmm5                 \n"
2771       "paddd       %%xmm2,%%xmm5                 \n"  // 6*near+2*far (2, lo)
2772       "paddd       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
2773       "paddd       %%xmm2,%%xmm5                 \n"  // 9*near+3*far (2, lo)
2774       "paddd       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
2775       "psrld       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
2776 
2777       "movdqa      %%xmm1,%%xmm0                 \n"
2778       "movdqa      %%xmm3,%%xmm2                 \n"
2779       "paddd       %%xmm1,%%xmm0                 \n"  // 6*near+2*far (1, hi)
2780       "paddd       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (2, hi)
2781       "paddd       %%xmm1,%%xmm0                 \n"  // 9*near+3*far (1, hi)
2782       "paddd       %%xmm2,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
2783       "psrld       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
2784 
2785       "movdqa      %%xmm3,%%xmm2                 \n"
2786       "paddd       %%xmm3,%%xmm2                 \n"  // 6*near+2*far (2, hi)
2787       "paddd       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
2788       "paddd       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
2789       "paddd       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (2, hi)
2790       "psrld       $4,%%xmm2                     \n"  // ^ div by 16 (2, hi)
2791 
2792       "packusdw    %%xmm0,%%xmm4                 \n"
2793       "movdqu      %%xmm4,(%1)                   \n"  // store above
2794       "packusdw    %%xmm2,%%xmm5                 \n"
2795       "movdqu      %%xmm5,(%1,%4,2)              \n"  // store below
2796 
2797       "lea         0x8(%0),%0                    \n"
2798       "lea         0x10(%1),%1                   \n"  // 2 uv to 4 uv
2799       "sub         $0x4,%2                       \n"
2800       "jg          1b                            \n"
2801       : "+r"(src_ptr),                // %0
2802         "+r"(dst_ptr),                // %1
2803         "+r"(dst_width)               // %2
2804       : "r"((intptr_t)(src_stride)),  // %3
2805         "r"((intptr_t)(dst_stride))   // %4
2806       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2807         "xmm7");
2808 }
2809 #endif
2810 
2811 #ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2
2812 void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
2813                                   uint16_t* dst_ptr,
2814                                   int dst_width) {
2815   asm volatile(
2816       "vpcmpeqd    %%ymm4,%%ymm4,%%ymm4          \n"
2817       "vpsrld      $31,%%ymm4,%%ymm4             \n"
2818       "vpslld      $1,%%ymm4,%%ymm4              \n"  // all 2
2819 
2820       LABELALIGN
2821       "1:                                        \n"
2822       "vmovdqu     (%0),%%xmm0                   \n"  // 00112233 (16b, 1u1v)
2823       "vmovdqu     4(%0),%%xmm1                  \n"  // 11223344 (16b, 1u1v)
2824 
2825       "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
2826       "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
2827 
2828       "vpshufd     $0b01001110,%%ymm0,%%ymm2     \n"  // 11003322 (lo, far)
2829       "vpshufd     $0b01001110,%%ymm1,%%ymm3     \n"  // 22114433 (hi, far)
2830 
2831       "vpaddd      %%ymm4,%%ymm2,%%ymm2          \n"  // far+2 (lo)
2832       "vpaddd      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2 (hi)
2833       "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far+2 (lo)
2834       "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far+2 (hi)
2835       "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
2836       "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
2837       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far+2 (lo)
2838       "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far+2 (hi)
2839 
2840       "vpsrld      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
2841       "vpsrld      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
2842       "vpackusdw   %%ymm1,%%ymm0,%%ymm0          \n"
2843       "vmovdqu     %%ymm0,(%1)                   \n"
2844 
2845       "lea         0x10(%0),%0                   \n"
2846       "lea         0x20(%1),%1                   \n"  // 4 uv to 8 uv
2847       "sub         $0x8,%2                       \n"
2848       "jg          1b                            \n"
2849       "vzeroupper                                \n"
2850       : "+r"(src_ptr),   // %0
2851         "+r"(dst_ptr),   // %1
2852         "+r"(dst_width)  // %2
2853       :
2854       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2855 }
2856 #endif
2857 
2858 #ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2
2859 void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
2860                                     ptrdiff_t src_stride,
2861                                     uint16_t* dst_ptr,
2862                                     ptrdiff_t dst_stride,
2863                                     int dst_width) {
2864   asm volatile(
2865       "vpcmpeqd    %%ymm6,%%ymm6,%%ymm6          \n"
2866       "vpsrld      $31,%%ymm6,%%ymm6             \n"
2867       "vpslld      $3,%%ymm6,%%ymm6              \n"  // all 8
2868 
2869       LABELALIGN
2870       "1:                                        \n"
2871 
2872       "vmovdqu     (%0),%%xmm0                   \n"  // 00112233 (16b, 1u1v)
2873       "vmovdqu     4(%0),%%xmm1                  \n"  // 11223344 (16b, 1u1v)
2874       "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
2875       "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
2876       "vpshufd     $0b01001110,%%ymm0,%%ymm2     \n"  // 11003322 (lo, far)
2877       "vpshufd     $0b01001110,%%ymm1,%%ymm3     \n"  // 22114433 (hi, far)
2878       "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far (lo)
2879       "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far (hi)
2880       "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
2881       "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
2882       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far (lo)
2883       "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far (hi)
2884 
2885       "vmovdqu     (%0,%3,2),%%xmm2              \n"  // 00112233 (16b, 1u1v)
2886       "vmovdqu     4(%0,%3,2),%%xmm3             \n"  // 11223344 (16b, 1u1v)
2887       "vpmovzxwd   %%xmm2,%%ymm2                 \n"  // 01234567 (32b, 1u1v)
2888       "vpmovzxwd   %%xmm3,%%ymm3                 \n"  // 12345678 (32b, 1u1v)
2889       "vpshufd     $0b01001110,%%ymm2,%%ymm4     \n"  // 11003322 (lo, far)
2890       "vpshufd     $0b01001110,%%ymm3,%%ymm5     \n"  // 22114433 (hi, far)
2891       "vpaddd      %%ymm2,%%ymm4,%%ymm4          \n"  // near+far (lo)
2892       "vpaddd      %%ymm3,%%ymm5,%%ymm5          \n"  // near+far (hi)
2893       "vpaddd      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near (lo)
2894       "vpaddd      %%ymm3,%%ymm3,%%ymm3          \n"  // 2*near (hi)
2895       "vpaddd      %%ymm2,%%ymm4,%%ymm2          \n"  // 3*near+far (lo)
2896       "vpaddd      %%ymm3,%%ymm5,%%ymm3          \n"  // 3*near+far (hi)
2897 
2898       "vpaddd      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
2899       "vpaddd      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
2900       "vpaddd      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
2901       "vpaddd      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
2902       "vpsrld      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
2903 
2904       "vpaddd      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
2905       "vpaddd      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
2906       "vpaddd      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
2907       "vpaddd      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
2908       "vpsrld      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
2909 
2910       "vpaddd      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
2911       "vpaddd      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
2912       "vpaddd      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
2913       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
2914       "vpsrld      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
2915 
2916       "vpaddd      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
2917       "vpaddd      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
2918       "vpaddd      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
2919       "vpaddd      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
2920       "vpsrld      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
2921 
2922       "vpackusdw   %%ymm0,%%ymm4,%%ymm4          \n"
2923       "vmovdqu     %%ymm4,(%1)                   \n"  // store above
2924       "vpackusdw   %%ymm2,%%ymm5,%%ymm5          \n"
2925       "vmovdqu     %%ymm5,(%1,%4,2)              \n"  // store below
2926 
2927       "lea         0x10(%0),%0                   \n"
2928       "lea         0x20(%1),%1                   \n"  // 4 uv to 8 uv
2929       "sub         $0x8,%2                       \n"
2930       "jg          1b                            \n"
2931       "vzeroupper                                \n"
2932       : "+r"(src_ptr),                // %0
2933         "+r"(dst_ptr),                // %1
2934         "+r"(dst_width)               // %2
2935       : "r"((intptr_t)(src_stride)),  // %3
2936         "r"((intptr_t)(dst_stride))   // %4
2937       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2938 }
2939 #endif
2940 
2941 #endif  // defined(__x86_64__) || defined(__i386__)
2942 
2943 #ifdef __cplusplus
2944 }  // extern "C"
2945 }  // namespace libyuv
2946 #endif
2947