/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
                       128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
                        10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant added before the >> 2 in the 3/4 box filters.
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
                         128, 128, 128, 128, 128, 128, 128, 128};

static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
                         6,   8,   11,  14,  128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
                        128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
                         6,   7,   12,  13,  128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                            65536 / 9, 65536 / 6, 0,         0};
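
// The 65536/n entries are 16-bit fixed-point reciprocals for use with
// pmulhuw, which computes (sum * (65536/n)) >> 16, i.e. a divide of the box
// sum by the pixel count without a division instruction. A worked example
// (illustrative, not code from the library): a 3x3 box of pixels all equal
// to 200 sums to 1800, and (1800 * 7281) >> 16 = 199, where 7281 is
// 65536/9 truncated; the truncation can round results down by one compared
// with an exact divide.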

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
                         11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
                         12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
                         13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                           65536 / 3, 65536 / 2, 0,         0};

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
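
// Scalar equivalent of the loop above (an illustrative sketch; the portable
// fallbacks live in scale_common.cc). psrlw $0x8 keeps the high byte of each
// 16-bit pair, which is the odd-indexed source byte:
//
//   for (int i = 0; i < dst_width; ++i) {
//     dst_ptr[i] = src_ptr[i * 2 + 1];
//   }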

void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "pxor       %%xmm5,%%xmm5                  \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pavgw      %%xmm5,%%xmm0                  \n"
    "pavgw      %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}
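
// Scalar equivalent (an illustrative sketch): xmm4 is built as 0x01 in every
// byte, so pmaddubsw sums each horizontal byte pair, and pavgw against zero
// adds the rounding bit before the implicit >> 1:
//
//   for (int i = 0; i < dst_width; ++i) {
//     dst_ptr[i] = (src_ptr[i * 2] + src_ptr[i * 2 + 1] + 1) >> 1;
//   }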

void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr,
                            int dst_width) {
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "pxor       %%xmm5,%%xmm5                  \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    "psrlw      $0x1,%%xmm0                    \n"
    "psrlw      $0x1,%%xmm1                    \n"
    "pavgw      %%xmm5,%%xmm0                  \n"
    "pavgw      %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
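
// Scalar equivalent (an illustrative sketch). With s and t pointing at the
// two source rows, the psrlw $0x1 followed by pavgw against zero computes
// ((sum >> 1) + 1) >> 1, which equals a correctly rounded (sum + 2) >> 2 for
// every sum in range:
//
//   for (int i = 0; i < dst_width; ++i) {
//     int sum = s[i * 2] + s[i * 2 + 1] + t[i * 2] + t[i * 2 + 1];
//     dst_ptr[i] = (sum + 2) >> 2;
//   }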

#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
                              ptrdiff_t src_stride,
                              uint8* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"
    "pslld     $0x10,%%xmm5                    \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "movdqa     %%xmm4,%%xmm5                  \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "psllw      $0x3,%%xmm5                    \n"
    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    "phaddw     %%xmm1,%%xmm0                  \n"
    "paddw      %%xmm5,%%xmm0                  \n"
    "psrlw      $0x4,%%xmm0                    \n"
    "packuswb   %%xmm0,%%xmm0                  \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width),   // %2
    "=&r"(stridex3)    // %3
  : "r"((intptr_t)(src_stride))    // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
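
// Scalar equivalent (an illustrative sketch): each output pixel is a 4x4 box
// sum over source rows r0..r3, biased by 8 (xmm5 holds 0x0008 per lane) so
// the final >> 4 rounds to nearest:
//
//   for (int i = 0; i < dst_width; ++i) {
//     int sum = 8;  // rounding bias
//     for (int y = 0; y < 4; ++y) {
//       for (int x = 0; x < 4; ++x) {
//         sum += r[y][i * 4 + x];
//       }
//     }
//     dst_ptr[i] = sum >> 4;
//   }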

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpsrld     $0x18,%%ymm5,%%ymm5            \n"
    "vpslld     $0x10,%%ymm5,%%ymm5            \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpsllw     $0x3,%%ymm4,%%ymm5             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)          //  vmovdqu  (%0,%3,2),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)          //  vmovdqu  0x20(%0,%3,2),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)          //  vmovdqu  (%0,%4,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)          //  vmovdqu  0x20(%0,%4,1),%%ymm3
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "r"((intptr_t)(src_stride * 3))   // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa    %0,%%xmm3                       \n"
      "movdqa    %1,%%xmm4                       \n"
      "movdqa    %2,%%xmm5                       \n"
      :
      : "m"(kShuf0),  // %0
        "m"(kShuf1),  // %1
        "m"(kShuf2)   // %2
      );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "palignr   $0x8,%%xmm0,%%xmm1              \n"
    "pshufb    %%xmm3,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
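
// The kShuf0/kShuf1/kShuf2 tables implement point-sampled 3/4 scaling: the
// byte at offset 2 of every 4 source bytes is dropped. A scalar sketch:
//
//   for (int i = 0; i < dst_width; i += 3) {
//     dst_ptr[i + 0] = src_ptr[0];
//     dst_ptr[i + 1] = src_ptr[1];
//     dst_ptr[i + 2] = src_ptr[3];
//     src_ptr += 4;
//   }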

void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa    %0,%%xmm2                       \n"  // kShuf01
      "movdqa    %1,%%xmm3                       \n"  // kShuf11
      "movdqa    %2,%%xmm4                       \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
      );
  asm volatile(
      "movdqa    %0,%%xmm5                       \n"  // kMadd01
      "movdqa    %1,%%xmm0                       \n"  // kMadd11
      "movdqa    %2,%%xmm1                       \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
      );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
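
// Horizontally, the kShuf* tables pair neighboring bytes and the kMadd*
// tables weight the pairs 3:1, 2:2 and 1:3, so after the rows are averaged
// with pavgb, each group of 4 source pixels a,b,c,d produces 3 outputs
// (a sketch of the per-group arithmetic):
//
//   out0 = (3 * a + 1 * b + 2) >> 2;
//   out1 = (2 * b + 2 * c + 2) >> 2;
//   out2 = (1 * c + 3 * d + 2) >> 2;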

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa    %0,%%xmm2                       \n"  // kShuf01
      "movdqa    %1,%%xmm3                       \n"  // kShuf11
      "movdqa    %2,%%xmm4                       \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
      );
  asm volatile(
      "movdqa    %0,%%xmm5                       \n"  // kMadd01
      "movdqa    %1,%%xmm0                       \n"  // kMadd11
      "movdqa    %2,%%xmm1                       \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
      );

  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
    : "+r"(src_ptr),   // %0
      "+r"(dst_ptr),   // %1
      "+r"(dst_width)  // %2
    : "r"((intptr_t)(src_stride)),  // %3
      "m"(kMadd21)     // %4
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ScaleRowDown38_SSSE3(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "lea       " MEMLEA(0xc,1) ",%1            \n"
    "sub       $0xc,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa    %0,%%xmm2                       \n"
      "movdqa    %1,%%xmm3                       \n"
      "movdqa    %2,%%xmm4                       \n"
      "movdqa    %3,%%xmm5                       \n"
      :
      : "m"(kShufAb0),  // %0
        "m"(kShufAb1),  // %1
        "m"(kShufAb2),  // %2
        "m"(kScaleAb2)  // %3
      );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pshufb    %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "paddusw   %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "paddusw   %%xmm0,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm1                    \n"
    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "sub       $0x6,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width)    // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa    %0,%%xmm2                       \n"
      "movdqa    %1,%%xmm3                       \n"
      "movdqa    %2,%%xmm4                       \n"
      "pxor      %%xmm5,%%xmm5                   \n"
      :
      : "m"(kShufAc),    // %0
        "m"(kShufAc3),   // %1
        "m"(kScaleAc33)  // %2
      );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "pshufb    %%xmm3,%%xmm7                   \n"
    "paddusw   %%xmm7,%%xmm6                   \n"
    "pmulhuw   %%xmm4,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movd      %%xmm6," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm6                    \n"
    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "sub       $0x6,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpckhbw %%xmm5,%%xmm3                   \n"
    "paddusw   %%xmm2,%%xmm0                   \n"
    "paddusw   %%xmm3,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(src_width)    // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
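
// Scalar equivalent (an illustrative sketch): widen each source byte and
// accumulate into the 16-bit destination row, saturating at 65535 as
// paddusw does:
//
//   for (int i = 0; i < src_width; ++i) {
//     int sum = dst_ptr[i] + src_ptr[i];
//     dst_ptr[i] = sum > 65535 ? 65535 : sum;
//   }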

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm3        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 32
    "vpermq     $0xd8,%%ymm3,%%ymm3            \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
    "vpaddusw   " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
    "vpaddusw   " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(src_width)    // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                         0x4040, 0x4040, 0x4040, 0x4040};
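
// Worked arithmetic for the two constants (a sketch of the filter below):
// pixels are biased to signed with kFsub80, weighted by two 7-bit fractions
// w0 and w1 that sum to 128, then un-biased and rounded with kFadd40, since
// 0x4040 = 128 * 128 + 64:
//
//   w0 * (a - 128) + w1 * (b - 128) + 0x4040 = w0 * a + w1 * b + 64
//
// and the subsequent >> 7 yields the rounded blend.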

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr,
                           const uint8* src_ptr,
                           int dst_width,
                           int x,
                           int dx) {
  intptr_t x0, x1, temp_pixel;
  asm volatile (
    "movd      %6,%%xmm2                       \n"
    "movd      %7,%%xmm3                       \n"
    "movl      $0x04040000,%k2                 \n"
    "movd      %k2,%%xmm5                      \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"  // 0x007f007f
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $15,%%xmm7                      \n"  // 0x00010001

    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "subl      $0x2,%5                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm1                     \n"
    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
    "movd      %k2,%%xmm4                      \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpcklwd %%xmm4,%%xmm0                   \n"
    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
    "pxor      %%xmm6,%%xmm1                   \n"  // 128 - f = (f ^ 127 ) + 1
    "paddusb   %%xmm7,%%xmm1                   \n"
    "pmaddubsw %%xmm0,%%xmm1                   \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "paddw     %9,%%xmm1                       \n"  // make pixels unsigned.
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movd      %%xmm1,%k2                      \n"
    "mov       %w2," MEMACCESS(0) "            \n"
    "lea       " MEMLEA(0x2,0) ",%0            \n"
    "subl      $0x2,%5                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "addl      $0x1,%5                         \n"
    "jl        99f                             \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm2                     \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
    "pxor      %%xmm6,%%xmm2                   \n"
    "paddusb   %%xmm7,%%xmm2                   \n"
    "pmaddubsw %%xmm0,%%xmm2                   \n"
    "paddw     %9,%%xmm2                       \n"  // make pixels unsigned.
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    "movd      %%xmm2,%k2                      \n"
    "mov       %b2," MEMACCESS(0) "            \n"
  "99:                                         \n"
  : "+r"(dst_ptr),      // %0
    "+r"(src_ptr),      // %1
    "=&a"(temp_pixel),  // %2
    "=&r"(x0),          // %3
    "=&r"(x1),          // %4
#if defined(__x86_64__)
    "+rm"(dst_width)    // %5
#else
    "+m"(dst_width)    // %5
#endif
  : "rm"(x),            // %6
    "rm"(dx),           // %7
#if defined(__x86_64__)
    "x"(kFsub80),       // %8
    "x"(kFadd40)        // %9
#else
    "m"(kFsub80),       // %8
    "m"(kFadd40)        // %9
#endif
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
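
// Scalar equivalent (an illustrative sketch; compare the portable version in
// scale_common.cc). x and dx are 16.16 fixed point, and the top 7 bits of
// the fraction select the blend weight:
//
//   for (int i = 0; i < dst_width; ++i) {
//     int xi = x >> 16;
//     int f = (x >> 9) & 0x7f;
//     dst_ptr[i] = (src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7;
//     x += dx;
//   }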

// Reads 16 pixels, duplicates them and writes 32 pixels.
void ScaleColsUp2_SSE2(uint8* dst_ptr,
                       const uint8* src_ptr,
                       int dst_width,
                       int x,
                       int dx) {
  (void)x;
  (void)dx;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"

  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width)    // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb,
                            int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2"
  );
}

void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb,
                               int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu   (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu   0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
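
// Per channel this is three rounding averages rather than an exact box sum
// (a sketch, with avg(x, y) = (x + y + 1) >> 1 as computed by pavgb):
//
//   dst = avg(avg(top_left, bottom_left), avg(top_right, bottom_right));
//
// which can differ slightly from an exact (sum + 2) >> 2 due to the nested
// rounding.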

// Reads 4 pixels at a time.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb,
                               int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  (void)src_stride;
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    LABELALIGN
  "1:                                          \n"
    "movd      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "punpckldq %%xmm3,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+r"(dst_width),      // %3
    "=&r"(src_stepx_x12)  // %4
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

// Blends four 2x2 boxes to 4x1.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb,
                                  int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"

    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "movq      " MEMACCESS(5) ",%%xmm2         \n"
    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),        // %0
    "+r"(src_stepx_x4),    // %1
    "+r"(dst_argb),        // %2
    "+rm"(dst_width),      // %3
    "=&r"(src_stepx_x12),  // %4
    "+r"(row1)             // %5
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

void ScaleARGBCols_SSE2(uint8* dst_argb,
                        const uint8* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  intptr_t x0, x1;
  asm volatile (
    "movd      %5,%%xmm2                       \n"
    "movd      %6,%%xmm3                       \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "pshufd    $0x11,%%xmm3,%%xmm0             \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pshufd    $0x5,%%xmm3,%%xmm0              \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "cmp       $0x0,%4                         \n"
    "jl        99f                             \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    LABELALIGN
  "40:                                         \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "pextrw    $0x7,%%xmm2,%k1                 \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "punpckldq %%xmm4,%%xmm1                   \n"
    "punpcklqdq %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%4                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "test      $0x2,%4                         \n"
    "je        29f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x8,2) ",%2            \n"
  "29:                                         \n"
    "test      $0x1,%4                         \n"
    "je        99f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    "movd      %%xmm0," MEMACCESS(2) "         \n"
  "99:                                         \n"
  : "=&a"(x0),         // %0
    "=&d"(x1),         // %1
    "+r"(dst_argb),    // %2
    "+r"(src_argb),    // %3
    "+r"(dst_width)    // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
1220 
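// A scalar sketch (illustrative only) of the nearest-neighbor column scale
// above: x steps through the source in 16.16 fixed point and its integer
// part selects which source pixel to copy. The library's real C fallback,
// ScaleARGBCols_C, lives in scale_common.cc.
static void ScaleARGBColsSketch_C(uint8* dst_argb,
                                  const uint8* src_argb,
                                  int dst_width,
                                  int x,
                                  int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // integer part of the 16.16 coordinate
    x += dx;
  }
}
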
// Reads 4 pixels, duplicates them, and writes 8 pixels.
// Uses unaligned loads and stores (movdqu), so no alignment is required.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
                           const uint8* src_argb,
                           int dst_width,
                           int x,
                           int dx) {
  (void)x;
  (void)dx;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpckldq %%xmm0,%%xmm0                   \n"
    "punpckhdq %%xmm1,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"

  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+r"(dst_width)    // %2
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}

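// A scalar sketch (illustrative only) of the 2x column upsample above: each
// source pixel is simply written twice.
static void ScaleARGBColsUp2Sketch_C(uint8* dst_argb,
                                     const uint8* src_argb,
                                     int dst_width) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; j += 2) {
    dst[j] = dst[j + 1] = src[j >> 1];  // duplicate each source pixel
  }
}
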
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};
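// For example, the bytes of a pixel pair {b0,g0,r0,a0, b1,g1,r1,a1} become
// {b0,b1, g0,g1, r0,r1, a0,a1}, so pmaddubsw can weight each channel's two
// taps against a (127 - f, f) fraction pair.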

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear column filtering. Blends 2 adjacent source pixels per output
// pixel using a 7-bit fraction. SSSE3 version.
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
                               const uint8* src_argb,
                               int dst_width,
                               int x,
                               int dx) {
  intptr_t x0, x1;
  asm volatile(
      "movdqa    %0,%%xmm4                       \n"
      "movdqa    %1,%%xmm5                       \n"
      :
      : "m"(kShuffleColARGB),   // %0
        "m"(kShuffleFractions)  // %1
      );

  asm volatile (
    "movd      %5,%%xmm2                       \n"
    "movd      %6,%%xmm3                       \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "sub       $0x2,%2                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    "psrlw     $0x9,%%xmm1                     \n"
    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
    "pshufb    %%xmm5,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm1                   \n"
    "pmaddubsw %%xmm1,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x2,%2                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "add       $0x1,%2                         \n"
    "jl        99f                             \n"
    "psrlw     $0x9,%%xmm2                     \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(0) "         \n"

    LABELALIGN
  "99:                                         \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "=&r"(x0),         // %3
    "=&r"(x1)          // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

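// A scalar sketch (illustrative only) of the per-channel blend above: the
// fraction f is the top 7 bits of the fractional part of x (the psrlw $0x9),
// and each pair of source pixels is weighted as (a * (127 - f) + b * f) >> 7,
// matching pmaddubsw applied to the shuffled byte pairs.
static void ScaleARGBFilterColsSketch_C(uint8* dst_argb,
                                        const uint8* src_argb,
                                        int dst_width,
                                        int x,
                                        int dx) {
  int j, i;
  for (j = 0; j < dst_width; ++j) {
    const uint8* a = src_argb + (x >> 16) * 4;  // left pixel
    const uint8* b = a + 4;                     // right pixel
    int f = (x >> 9) & 0x7f;                    // 7-bit fraction
    for (i = 0; i < 4; ++i) {                   // B, G, R, A channels.
      dst_argb[i] = (uint8)((a[i] * (0x7f ^ f) + b[i] * f) >> 7);
    }
    dst_argb += 4;
    x += dx;
  }
}
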
// Divide num by div and return as a 16.16 fixed-point result.
int FixedDiv_X86(int num, int div) {
  asm volatile(
      "cdq                                       \n"
      "shld      $0x10,%%eax,%%edx               \n"
      "shl       $0x10,%%eax                     \n"
      "idiv      %1                              \n"
      "mov       %0, %%eax                       \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}

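// A portable sketch of the same computation, assuming the int64 typedef from
// libyuv/basic_types.h: widen num to 64 bits, shift left by 16, then divide.
static int FixedDivSketch_C(int num, int div) {
  return (int)(((int64)(num) << 16) / div);
}
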
// Divide num - 1 by div - 1 and return as a 16.16 fixed-point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile(
      "cdq                                       \n"
      "shld      $0x10,%%eax,%%edx               \n"
      "shl       $0x10,%%eax                     \n"
      "sub       $0x10001,%%eax                  \n"
      "sbb       $0x0,%%edx                      \n"
      "sub       $0x1,%1                         \n"
      "idiv      %1                              \n"
      "mov       %0, %%eax                       \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}

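// A portable sketch of the same computation, mirroring the sub/sbb/idiv
// sequence above: the 64-bit numerator ((num << 16) - 0x10001) divided by
// (div - 1). Assumes the int64 typedef from libyuv/basic_types.h.
static int FixedDiv1Sketch_C(int num, int div) {
  return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
}
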
#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif