/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                              13, 65, 33, 0, 13, 65, 33, 0};

// JPEG full range.
static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                               15, 75, 38, 0, 15, 75, 38, 0};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
                              -18, -94, 112, 0, -18, -94, 112, 0};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                              0, 33, 65, 13, 0, 33, 65, 13};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                              33, 65, 13, 0, 33, 65, 13, 0};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                              0, 13, 65, 33, 0, 13, 65, 33};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
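
// Illustrative scalar sketches (hypothetical helpers, not part of libyuv):
// how the row functions below use the coefficients above. ARGB is laid out
// B,G,R,A in memory, so kARGBToY pairs {13, 65, 33, 0} with {B, G, R, A}.
// The BT.601 variant shifts right 7 and then adds a +16 bias (kAddY16); the
// JPEG variant instead adds 64 (0.5 in 7 bit fixed point, kAddYJ64) before
// the shift for rounding, and uses no bias.
static inline uint8_t ScalarARGBToY_Sketch(const uint8_t* argb) {
  // pmaddubsw + phaddw compute this dot product 16 pixels at a time.
  int y = 13 * argb[0] + 65 * argb[1] + 33 * argb[2];  // B, G, R
  return (uint8_t)((y >> 7) + 16);  // truncate, then kAddY16
}

static inline uint8_t ScalarARGBToYJ_Sketch(const uint8_t* argb) {
  int y = 15 * argb[0] + 75 * argb[1] + 38 * argb[2];  // JPEG coefficients
  return (uint8_t)((y + 64) >> 7);  // kAddYJ64 rounds; no +16 bias
}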

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
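
// pshufb semantics as a scalar sketch (hypothetical helper, not libyuv
// code): out[i] = in[mask[i]], and an index with the high bit set (128)
// writes zero instead. kShuffleMaskRGB24ToARGB therefore spreads 12 RGB24
// bytes into four 4-byte ARGB slots; the alpha positions (source indices
// 12-15 here) are later forced to 0xff by the "por" in the row functions.
static inline void PshufbSketch(const uint8_t in[16],
                                const uint8_t mask[16],
                                uint8_t out[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    out[i] = (uint8_t)((mask[i] & 0x80) ? 0 : in[mask[i] & 15]);
  }
}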

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24.  First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_J400TOARGBROW_SSE2
// Expand 8 J400 (grey) pixels per iteration to ARGB: each Y byte is
// duplicated into B, G and R, and alpha is forced to 0xff (xmm5 = 0xff000000).
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb   %%xmm5,%%xmm5                   \n"
      "pslld     $0x18,%%xmm5                    \n"

      LABELALIGN
      "1:                                        \n"
      "movq      (%0),%%xmm0                     \n"
      "lea       0x8(%0),%0                      \n"
      "punpcklbw %%xmm0,%%xmm0                   \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "punpcklwd %%xmm0,%%xmm0                   \n"
      "punpckhwd %%xmm1,%%xmm1                   \n"
      "por       %%xmm5,%%xmm0                   \n"
      "por       %%xmm5,%%xmm1                   \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "movdqu    %%xmm1,0x10(%1)                 \n"
      "lea       0x20(%1),%1                     \n"
      "sub       $0x8,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_J400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "pcmpeqb   %%xmm5,%%xmm5                   \n"  // 0xff000000
      "pslld     $0x18,%%xmm5                    \n"
      "movdqa    %3,%%xmm4                       \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "movdqu    0x20(%0),%%xmm3                 \n"
      "lea       0x30(%0),%0                     \n"
      "movdqa    %%xmm3,%%xmm2                   \n"
      "palignr   $0x8,%%xmm1,%%xmm2              \n"
      "pshufb    %%xmm4,%%xmm2                   \n"
      "por       %%xmm5,%%xmm2                   \n"
      "palignr   $0xc,%%xmm0,%%xmm1              \n"
      "pshufb    %%xmm4,%%xmm0                   \n"
      "movdqu    %%xmm2,0x20(%1)                 \n"
      "por       %%xmm5,%%xmm0                   \n"
      "pshufb    %%xmm4,%%xmm1                   \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "por       %%xmm5,%%xmm1                   \n"
      "palignr   $0x4,%%xmm3,%%xmm3              \n"
      "pshufb    %%xmm4,%%xmm3                   \n"
      "movdqu    %%xmm1,0x10(%1)                 \n"
      "por       %%xmm5,%%xmm3                   \n"
      "movdqu    %%xmm3,0x30(%1)                 \n"
      "lea       0x40(%1),%1                     \n"
      "sub       $0x10,%2                        \n"
      "jg        1b                              \n"
      : "+r"(src_rgb24),              // %0
        "+r"(dst_argb),               // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskRGB24ToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb   %%xmm5,%%xmm5                   \n"  // 0xff000000
      "pslld     $0x18,%%xmm5                    \n"
      "movdqa    %3,%%xmm4                       \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "movdqu    0x20(%0),%%xmm3                 \n"
      "lea       0x30(%0),%0                     \n"
      "movdqa    %%xmm3,%%xmm2                   \n"
      "palignr   $0x8,%%xmm1,%%xmm2              \n"
      "pshufb    %%xmm4,%%xmm2                   \n"
      "por       %%xmm5,%%xmm2                   \n"
      "palignr   $0xc,%%xmm0,%%xmm1              \n"
      "pshufb    %%xmm4,%%xmm0                   \n"
      "movdqu    %%xmm2,0x20(%1)                 \n"
      "por       %%xmm5,%%xmm0                   \n"
      "pshufb    %%xmm4,%%xmm1                   \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "por       %%xmm5,%%xmm1                   \n"
      "palignr   $0x4,%%xmm3,%%xmm3              \n"
      "pshufb    %%xmm4,%%xmm3                   \n"
      "movdqu    %%xmm1,0x10(%1)                 \n"
      "por       %%xmm5,%%xmm3                   \n"
      "movdqu    %%xmm3,0x30(%1)                 \n"
      "lea       0x40(%1),%1                     \n"
      "sub       $0x10,%2                        \n"
      "jg        1b                              \n"
      : "+r"(src_raw),              // %0
        "+r"(dst_argb),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskRAWToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "movdqa     %3,%%xmm3                      \n"
      "movdqa     %4,%%xmm4                      \n"
      "movdqa     %5,%%xmm5                      \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x4(%0),%%xmm1                  \n"
      "movdqu    0x8(%0),%%xmm2                  \n"
      "lea       0x18(%0),%0                     \n"
      "pshufb    %%xmm3,%%xmm0                   \n"
      "pshufb    %%xmm4,%%xmm1                   \n"
      "pshufb    %%xmm5,%%xmm2                   \n"
      "movq      %%xmm0,(%1)                     \n"
      "movq      %%xmm1,0x8(%1)                  \n"
      "movq      %%xmm2,0x10(%1)                 \n"
      "lea       0x18(%1),%1                     \n"
      "sub       $0x8,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src_raw),                  // %0
        "+r"(dst_rgb24),                // %1
        "+r"(width)                     // %2
      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
        "m"(kShuffleMaskRAWToRGB24_1),  // %4
        "m"(kShuffleMaskRAWToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov       $0x1080108,%%eax                \n"  // 0x0108: 5 bits to 8
      "movd      %%eax,%%xmm5                    \n"
      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
      "mov       $0x20802080,%%eax               \n"  // 0x2080: 6 bits to 8
      "movd      %%eax,%%xmm6                    \n"
      "pshufd    $0x0,%%xmm6,%%xmm6              \n"
      "pcmpeqb   %%xmm3,%%xmm3                   \n"
      "psllw     $0xb,%%xmm3                     \n"
      "pcmpeqb   %%xmm4,%%xmm4                   \n"
      "psllw     $0xa,%%xmm4                     \n"
      "psrlw     $0x5,%%xmm4                     \n"
      "pcmpeqb   %%xmm7,%%xmm7                   \n"
      "psllw     $0x8,%%xmm7                     \n"
      "sub       %0,%1                           \n"  // dst = dst - src * 2, so
      "sub       %0,%1                           \n"  // (%1,%0,2) addresses dst

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "movdqa    %%xmm0,%%xmm2                   \n"
      "pand      %%xmm3,%%xmm1                   \n"
      "psllw     $0xb,%%xmm2                     \n"
      "pmulhuw   %%xmm5,%%xmm1                   \n"
      "pmulhuw   %%xmm5,%%xmm2                   \n"
      "psllw     $0x8,%%xmm1                     \n"
      "por       %%xmm2,%%xmm1                   \n"
      "pand      %%xmm4,%%xmm0                   \n"
      "pmulhuw   %%xmm6,%%xmm0                   \n"
      "por       %%xmm7,%%xmm0                   \n"
      "movdqa    %%xmm1,%%xmm2                   \n"
      "punpcklbw %%xmm0,%%xmm1                   \n"
      "punpckhbw %%xmm0,%%xmm2                   \n"
      "movdqu    %%xmm1,0x00(%1,%0,2)            \n"
      "movdqu    %%xmm2,0x10(%1,%0,2)            \n"
      "lea       0x10(%0),%0                     \n"
      "sub       $0x8,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
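
// Scalar sketch of the expansion above (hypothetical helper, not libyuv
// API). The magic multipliers replicate a field's top bits into its low
// bits: with a 5 bit field v placed in the high bits of a 16 bit lane,
// pmulhuw by 0x0108 yields (v * 264) >> 5 == (v << 3) | (v >> 2); for the
// 6 bit green field, pmulhuw by 0x2080 yields (v << 2) | (v >> 4).
static inline void ScalarRGB565ToARGB_Sketch(uint16_t rgb565,
                                             uint8_t argb[4]) {
  unsigned b = rgb565 & 0x1f;
  unsigned g = (rgb565 >> 5) & 0x3f;
  unsigned r = (rgb565 >> 11) & 0x1f;
  argb[0] = (uint8_t)((b << 3) | (b >> 2));  // B
  argb[1] = (uint8_t)((g << 2) | (g >> 4));  // G
  argb[2] = (uint8_t)((r << 3) | (r >> 2));  // R
  argb[3] = 0xff;                            // A
}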

void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov       $0x1080108,%%eax                \n"
      "movd      %%eax,%%xmm5                    \n"
      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
      "mov       $0x42004200,%%eax               \n"
      "movd      %%eax,%%xmm6                    \n"
      "pshufd    $0x0,%%xmm6,%%xmm6              \n"
      "pcmpeqb   %%xmm3,%%xmm3                   \n"
      "psllw     $0xb,%%xmm3                     \n"
      "movdqa    %%xmm3,%%xmm4                   \n"
      "psrlw     $0x6,%%xmm4                     \n"
      "pcmpeqb   %%xmm7,%%xmm7                   \n"
      "psllw     $0x8,%%xmm7                     \n"
      "sub       %0,%1                           \n"
      "sub       %0,%1                           \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "movdqa    %%xmm0,%%xmm2                   \n"
      "psllw     $0x1,%%xmm1                     \n"
      "psllw     $0xb,%%xmm2                     \n"
      "pand      %%xmm3,%%xmm1                   \n"
      "pmulhuw   %%xmm5,%%xmm2                   \n"
      "pmulhuw   %%xmm5,%%xmm1                   \n"
      "psllw     $0x8,%%xmm1                     \n"
      "por       %%xmm2,%%xmm1                   \n"
      "movdqa    %%xmm0,%%xmm2                   \n"
      "pand      %%xmm4,%%xmm0                   \n"
      "psraw     $0x8,%%xmm2                     \n"
      "pmulhuw   %%xmm6,%%xmm0                   \n"
      "pand      %%xmm7,%%xmm2                   \n"
      "por       %%xmm2,%%xmm0                   \n"
      "movdqa    %%xmm1,%%xmm2                   \n"
      "punpcklbw %%xmm0,%%xmm1                   \n"
      "punpckhbw %%xmm0,%%xmm2                   \n"
      "movdqu    %%xmm1,0x00(%1,%0,2)            \n"
      "movdqu    %%xmm2,0x10(%1,%0,2)            \n"
      "lea       0x10(%0),%0                     \n"
      "sub       $0x8,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov       $0xf0f0f0f,%%eax                \n"
      "movd      %%eax,%%xmm4                    \n"
      "pshufd    $0x0,%%xmm4,%%xmm4              \n"
      "movdqa    %%xmm4,%%xmm5                   \n"
      "pslld     $0x4,%%xmm5                     \n"
      "sub       %0,%1                           \n"
      "sub       %0,%1                           \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqa    %%xmm0,%%xmm2                   \n"
      "pand      %%xmm4,%%xmm0                   \n"
      "pand      %%xmm5,%%xmm2                   \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "movdqa    %%xmm2,%%xmm3                   \n"
      "psllw     $0x4,%%xmm1                     \n"
      "psrlw     $0x4,%%xmm3                     \n"
      "por       %%xmm1,%%xmm0                   \n"
      "por       %%xmm3,%%xmm2                   \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "punpcklbw %%xmm2,%%xmm0                   \n"
      "punpckhbw %%xmm2,%%xmm1                   \n"
      "movdqu    %%xmm0,0x00(%1,%0,2)            \n"
      "movdqu    %%xmm1,0x10(%1,%0,2)            \n"
      "lea       0x10(%0),%0                     \n"
      "sub       $0x8,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}

void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa    %3,%%xmm6                       \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "movdqu    0x20(%0),%%xmm2                 \n"
      "movdqu    0x30(%0),%%xmm3                 \n"
      "lea       0x40(%0),%0                     \n"
      "pshufb    %%xmm6,%%xmm0                   \n"
      "pshufb    %%xmm6,%%xmm1                   \n"
      "pshufb    %%xmm6,%%xmm2                   \n"
      "pshufb    %%xmm6,%%xmm3                   \n"
      "movdqa    %%xmm1,%%xmm4                   \n"
      "psrldq    $0x4,%%xmm1                     \n"
      "pslldq    $0xc,%%xmm4                     \n"
      "movdqa    %%xmm2,%%xmm5                   \n"
      "por       %%xmm4,%%xmm0                   \n"
      "pslldq    $0x8,%%xmm5                     \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "por       %%xmm5,%%xmm1                   \n"
      "psrldq    $0x8,%%xmm2                     \n"
      "pslldq    $0x4,%%xmm3                     \n"
      "por       %%xmm3,%%xmm2                   \n"
      "movdqu    %%xmm1,0x10(%1)                 \n"
      "movdqu    %%xmm2,0x20(%1)                 \n"
      "lea       0x30(%1),%1                     \n"
      "sub       $0x10,%2                        \n"
      "jg        1b                              \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskARGBToRGB24)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa    %3,%%xmm6                       \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "movdqu    0x20(%0),%%xmm2                 \n"
      "movdqu    0x30(%0),%%xmm3                 \n"
      "lea       0x40(%0),%0                     \n"
      "pshufb    %%xmm6,%%xmm0                   \n"
      "pshufb    %%xmm6,%%xmm1                   \n"
      "pshufb    %%xmm6,%%xmm2                   \n"
      "pshufb    %%xmm6,%%xmm3                   \n"
      "movdqa    %%xmm1,%%xmm4                   \n"
      "psrldq    $0x4,%%xmm1                     \n"
      "pslldq    $0xc,%%xmm4                     \n"
      "movdqa    %%xmm2,%%xmm5                   \n"
      "por       %%xmm4,%%xmm0                   \n"
      "pslldq    $0x8,%%xmm5                     \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "por       %%xmm5,%%xmm1                   \n"
      "psrldq    $0x8,%%xmm2                     \n"
      "pslldq    $0x4,%%xmm3                     \n"
      "por       %%xmm3,%%xmm2                   \n"
      "movdqu    %%xmm1,0x10(%1)                 \n"
      "movdqu    %%xmm2,0x20(%1)                 \n"
      "lea       0x30(%1),%1                     \n"
      "sub       $0x10,%2                        \n"
      "jg        1b                              \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskARGBToRAW)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTORGB24ROW_AVX2
// vpermd permutation: gathers the 12 valid bytes of each 16 byte lane into 24
// contiguous bytes (12 + 12 to 24).
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};

void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6                  \n"
      "vmovdqa    %4,%%ymm7                      \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm0                    \n"
      "vmovdqu    0x20(%0),%%ymm1                \n"
      "vmovdqu    0x40(%0),%%ymm2                \n"
      "vmovdqu    0x60(%0),%%ymm3                \n"
      "lea        0x80(%0),%0                    \n"
      "vpshufb    %%ymm6,%%ymm0,%%ymm0           \n"  // xxx0yyy0
      "vpshufb    %%ymm6,%%ymm1,%%ymm1           \n"
      "vpshufb    %%ymm6,%%ymm2,%%ymm2           \n"
      "vpshufb    %%ymm6,%%ymm3,%%ymm3           \n"
      "vpermd     %%ymm0,%%ymm7,%%ymm0           \n"  // pack to 24 bytes
      "vpermd     %%ymm1,%%ymm7,%%ymm1           \n"
      "vpermd     %%ymm2,%%ymm7,%%ymm2           \n"
      "vpermd     %%ymm3,%%ymm7,%%ymm3           \n"
      "vpermq     $0x3f,%%ymm1,%%ymm4            \n"  // combine 24 + 8
      "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
      "vmovdqu    %%ymm0,(%1)                    \n"
      "vpermq     $0xf9,%%ymm1,%%ymm1            \n"  // combine 16 + 16
      "vpermq     $0x4f,%%ymm2,%%ymm4            \n"
      "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
      "vmovdqu    %%ymm1,0x20(%1)                \n"
      "vpermq     $0xfe,%%ymm2,%%ymm2            \n"  // combine 8 + 24
      "vpermq     $0x93,%%ymm3,%%ymm3            \n"
      "vpor       %%ymm3,%%ymm2,%%ymm2           \n"
      "vmovdqu    %%ymm2,0x40(%1)                \n"
      "lea        0x60(%1),%1                    \n"
      "sub        $0x20,%2                       \n"
      "jg         1b                             \n"
      "vzeroupper                                \n"
      : "+r"(src),                     // %0
        "+r"(dst),                     // %1
        "+r"(width)                    // %2
      : "m"(kShuffleMaskARGBToRGB24),  // %3
        "m"(kPermdRGB24_AVX)           // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
// Shuffle table for converting ARGBToRGB24.
static const ulvec8 kPermARGBToRGB24_0 = {
    0u,  1u,  2u,  4u,  5u,  6u,  8u,  9u,  10u, 12u, 13u,
    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
static const ulvec8 kPermARGBToRGB24_1 = {
    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
static const ulvec8 kPermARGBToRGB24_2 = {
    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};

void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vmovdqa    %3,%%ymm5                      \n"
      "vmovdqa    %4,%%ymm6                      \n"
      "vmovdqa    %5,%%ymm7                      \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm0                    \n"
      "vmovdqu    0x20(%0),%%ymm1                \n"
      "vmovdqu    0x40(%0),%%ymm2                \n"
      "vmovdqu    0x60(%0),%%ymm3                \n"
      "lea        0x80(%0),%0                    \n"
      "vpermt2b   %%ymm1,%%ymm5,%%ymm0           \n"
      "vpermt2b   %%ymm2,%%ymm6,%%ymm1           \n"
      "vpermt2b   %%ymm3,%%ymm7,%%ymm2           \n"
      "vmovdqu    %%ymm0,(%1)                    \n"
      "vmovdqu    %%ymm1,0x20(%1)                \n"
      "vmovdqu    %%ymm2,0x40(%1)                \n"
      "lea        0x60(%1),%1                    \n"
      "sub        $0x20,%2                       \n"
      "jg         1b                             \n"
      "vzeroupper                                \n"
      : "+r"(src),                // %0
        "+r"(dst),                // %1
        "+r"(width)               // %2
      : "m"(kPermARGBToRGB24_0),  // %3
        "m"(kPermARGBToRGB24_1),  // %4
        "m"(kPermARGBToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
}
#endif

#ifdef HAS_ARGBTORAWROW_AVX2
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6                  \n"
      "vmovdqa    %4,%%ymm7                      \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm0                    \n"
      "vmovdqu    0x20(%0),%%ymm1                \n"
      "vmovdqu    0x40(%0),%%ymm2                \n"
      "vmovdqu    0x60(%0),%%ymm3                \n"
      "lea        0x80(%0),%0                    \n"
      "vpshufb    %%ymm6,%%ymm0,%%ymm0           \n"  // xxx0yyy0
      "vpshufb    %%ymm6,%%ymm1,%%ymm1           \n"
      "vpshufb    %%ymm6,%%ymm2,%%ymm2           \n"
      "vpshufb    %%ymm6,%%ymm3,%%ymm3           \n"
      "vpermd     %%ymm0,%%ymm7,%%ymm0           \n"  // pack to 24 bytes
      "vpermd     %%ymm1,%%ymm7,%%ymm1           \n"
      "vpermd     %%ymm2,%%ymm7,%%ymm2           \n"
      "vpermd     %%ymm3,%%ymm7,%%ymm3           \n"
      "vpermq     $0x3f,%%ymm1,%%ymm4            \n"  // combine 24 + 8
      "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
      "vmovdqu    %%ymm0,(%1)                    \n"
      "vpermq     $0xf9,%%ymm1,%%ymm1            \n"  // combine 16 + 16
      "vpermq     $0x4f,%%ymm2,%%ymm4            \n"
      "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
      "vmovdqu    %%ymm1,0x20(%1)                \n"
      "vpermq     $0xfe,%%ymm2,%%ymm2            \n"  // combine 8 + 24
      "vpermq     $0x93,%%ymm3,%%ymm3            \n"
      "vpor       %%ymm3,%%ymm2,%%ymm2           \n"
      "vmovdqu    %%ymm2,0x40(%1)                \n"
      "lea        0x60(%1),%1                    \n"
      "sub        $0x20,%2                       \n"
      "jg         1b                             \n"
      "vzeroupper                                \n"
      : "+r"(src),                   // %0
        "+r"(dst),                   // %1
        "+r"(width)                  // %2
      : "m"(kShuffleMaskARGBToRAW),  // %3
        "m"(kPermdRGB24_AVX)         // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb   %%xmm3,%%xmm3                   \n"
      "psrld     $0x1b,%%xmm3                    \n"
      "pcmpeqb   %%xmm4,%%xmm4                   \n"
      "psrld     $0x1a,%%xmm4                    \n"
      "pslld     $0x5,%%xmm4                     \n"
      "pcmpeqb   %%xmm5,%%xmm5                   \n"
      "pslld     $0xb,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "movdqa    %%xmm0,%%xmm2                   \n"
      "pslld     $0x8,%%xmm0                     \n"
      "psrld     $0x3,%%xmm1                     \n"
      "psrld     $0x5,%%xmm2                     \n"
      "psrad     $0x10,%%xmm0                    \n"
      "pand      %%xmm3,%%xmm1                   \n"
      "pand      %%xmm4,%%xmm2                   \n"
      "pand      %%xmm5,%%xmm0                   \n"
      "por       %%xmm2,%%xmm1                   \n"
      "por       %%xmm1,%%xmm0                   \n"
      "packssdw  %%xmm0,%%xmm0                   \n"
      "lea       0x10(%0),%0                     \n"
      "movq      %%xmm0,(%1)                     \n"
      "lea       0x8(%1),%1                      \n"
      "sub       $0x4,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "movd       %3,%%xmm6                      \n"
      "punpcklbw  %%xmm6,%%xmm6                  \n"
      "movdqa     %%xmm6,%%xmm7                  \n"
      "punpcklwd  %%xmm6,%%xmm6                  \n"
      "punpckhwd  %%xmm7,%%xmm7                  \n"
      "pcmpeqb    %%xmm3,%%xmm3                  \n"
      "psrld      $0x1b,%%xmm3                   \n"
      "pcmpeqb    %%xmm4,%%xmm4                  \n"
      "psrld      $0x1a,%%xmm4                   \n"
      "pslld      $0x5,%%xmm4                    \n"
      "pcmpeqb    %%xmm5,%%xmm5                  \n"
      "pslld      $0xb,%%xmm5                    \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu     (%0),%%xmm0                    \n"
      "paddusb    %%xmm6,%%xmm0                  \n"
      "movdqa     %%xmm0,%%xmm1                  \n"
      "movdqa     %%xmm0,%%xmm2                  \n"
      "pslld      $0x8,%%xmm0                    \n"
      "psrld      $0x3,%%xmm1                    \n"
      "psrld      $0x5,%%xmm2                    \n"
      "psrad      $0x10,%%xmm0                   \n"
      "pand       %%xmm3,%%xmm1                  \n"
      "pand       %%xmm4,%%xmm2                  \n"
      "pand       %%xmm5,%%xmm0                  \n"
      "por        %%xmm2,%%xmm1                  \n"
      "por        %%xmm1,%%xmm0                  \n"
      "packssdw   %%xmm0,%%xmm0                  \n"
      "lea        0x10(%0),%0                    \n"
      "movq       %%xmm0,(%1)                    \n"
      "lea        0x8(%1),%1                     \n"
      "sub        $0x4,%2                        \n"
      "jg         1b                             \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
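
// How the dither4 argument appears to be used by the setup code above (an
// assumption from reading the unpacks, not a documented contract): the four
// bytes of dither4 form a repeating 4-pixel pattern, each byte added with
// unsigned saturation (paddusb) to all channels of its pixel before the
// pixel is truncated to 5/6/5 bits. A hypothetical accessor:
static inline uint8_t DitherByteForPixel_Sketch(uint32_t dither4, int x) {
  return (uint8_t)(dither4 >> ((x & 3) * 8));  // byte x % 4, little-endian
}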

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "vbroadcastss %3,%%xmm6                    \n"
      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
      "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
      "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
      "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
      "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
      "vpslld     $0x5,%%ymm4,%%ymm4             \n"
      "vpslld     $0xb,%%ymm3,%%ymm5             \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm0                    \n"
      "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"
      "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
      "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
      "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
      "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
      "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
      "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
      "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
      "lea        0x20(%0),%0                    \n"
      "vmovdqu    %%xmm0,(%1)                    \n"
      "lea        0x10(%1),%1                    \n"
      "sub        $0x8,%2                        \n"
      "jg         1b                             \n"
      "vzeroupper                                \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb   %%xmm4,%%xmm4                   \n"
      "psrld     $0x1b,%%xmm4                    \n"
      "movdqa    %%xmm4,%%xmm5                   \n"
      "pslld     $0x5,%%xmm5                     \n"
      "movdqa    %%xmm4,%%xmm6                   \n"
      "pslld     $0xa,%%xmm6                     \n"
      "pcmpeqb   %%xmm7,%%xmm7                   \n"
      "pslld     $0xf,%%xmm7                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "movdqa    %%xmm0,%%xmm2                   \n"
      "movdqa    %%xmm0,%%xmm3                   \n"
      "psrad     $0x10,%%xmm0                    \n"
      "psrld     $0x3,%%xmm1                     \n"
      "psrld     $0x6,%%xmm2                     \n"
      "psrld     $0x9,%%xmm3                     \n"
      "pand      %%xmm7,%%xmm0                   \n"
      "pand      %%xmm4,%%xmm1                   \n"
      "pand      %%xmm5,%%xmm2                   \n"
      "pand      %%xmm6,%%xmm3                   \n"
      "por       %%xmm1,%%xmm0                   \n"
      "por       %%xmm3,%%xmm2                   \n"
      "por       %%xmm2,%%xmm0                   \n"
      "packssdw  %%xmm0,%%xmm0                   \n"
      "lea       0x10(%0),%0                     \n"
      "movq      %%xmm0,(%1)                     \n"
      "lea       0x8(%1),%1                      \n"
      "sub       $0x4,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}

void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb   %%xmm4,%%xmm4                   \n"
      "psllw     $0xc,%%xmm4                     \n"
      "movdqa    %%xmm4,%%xmm3                   \n"
      "psrlw     $0x8,%%xmm3                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "pand      %%xmm3,%%xmm0                   \n"
      "pand      %%xmm4,%%xmm1                   \n"
      "psrlq     $0x4,%%xmm0                     \n"
      "psrlq     $0x8,%%xmm1                     \n"
      "por       %%xmm1,%%xmm0                   \n"
      "packuswb  %%xmm0,%%xmm0                   \n"
      "lea       0x10(%0),%0                     \n"
      "movq      %%xmm0,(%1)                     \n"
      "lea       0x8(%1),%1                      \n"
      "sub       $0x4,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_RGB24TOARGBROW_SSSE3

/*

ARGBToAR30Row:

Red Blue
With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4)
produces a 10 bit value in the low 10 bits of each 16 bit value. This is
what's wanted for the blue channel. The red needs to be shifted 4 left, so
multiply by (1024+4)*16 for red.

Alpha Green
Alpha and green are already in the high bits, so vpand can zero out the other
bits, keeping just the 2 upper bits of alpha and the 8 bits of green. The same
multiplier, (1024+4), could be used for green, putting the 10 bit green in the
lsb. Alpha just needs a multiplier that shifts it into position: it wants a
gap of 10 bits above the green. Green is 10 bits, so there are 6 bits left in
the low short; 4 more are needed, so a multiplier of 4 gets the 2 alpha bits
into the upper 16 bits, and a further shift of 4 is a multiply by 16, so
(4*16) = 64. Then shift the result left 10 to position the A and G channels.
*/

// Shuffle table that places the B and R channels in the high byte of
// alternating 16 bit lanes, ready for the AR30 multiplies described above.
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u,  128u, 4u,  128u, 6u,
                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};

// The same shuffle with the R and B source offsets swapped, for ABGR input.
static const uvec8 kShuffleBR30 = {128u, 2u,  128u, 0u, 128u, 6u,  128u, 4u,
                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};

static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
static const uint32_t kMaskRB10 = 0x3ff003ff;
static const uint32_t kMaskAG10 = 0xc000ff00;
static const uint32_t kMulAG10 = 64 * 65536 + 1028;
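
// Worked example of the math above (illustrative only): for B = 0xff, the
// shuffle places 0xff00 in a 16 bit lane, and pmulhuw computes
// (0xff00 * 1028) >> 16 = 0x3ff, i.e. the 8 bit value widened to 10 bits by
// replicating its top two bits: (v * 1028) >> 8 == (v << 2) | (v >> 6).
// A hypothetical scalar equivalent of the whole row (not part of libyuv):
static inline uint32_t ScalarARGBToAR30_Sketch(const uint8_t* argb) {
  uint32_t b = ((uint32_t)argb[0] << 2) | (argb[0] >> 6);  // 10 bit B
  uint32_t g = ((uint32_t)argb[1] << 2) | (argb[1] >> 6);  // 10 bit G
  uint32_t r = ((uint32_t)argb[2] << 2) | (argb[2] >> 6);  // 10 bit R
  uint32_t a = (uint32_t)(argb[3] >> 6);                   // 2 bit A
  return b | (g << 10) | (r << 20) | (a << 30);
}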

void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa     %3,%%xmm2                     \n"  // shuffler for RB
      "movd       %4,%%xmm3                     \n"  // multiplier for RB
      "movd       %5,%%xmm4                     \n"  // mask for R10 B10
      "movd       %6,%%xmm5                     \n"  // mask for AG
      "movd       %7,%%xmm6                     \n"  // multiplier for AG
      "pshufd     $0x0,%%xmm3,%%xmm3            \n"
      "pshufd     $0x0,%%xmm4,%%xmm4            \n"
      "pshufd     $0x0,%%xmm5,%%xmm5            \n"
      "pshufd     $0x0,%%xmm6,%%xmm6            \n"
      "sub        %0,%1                         \n"

      "1:                                       \n"
      "movdqu     (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
      "movdqa     %%xmm0,%%xmm1                 \n"
      "pshufb     %%xmm2,%%xmm1                 \n"  // R0B0
      "pand       %%xmm5,%%xmm0                 \n"  // A0G0
      "pmulhuw    %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
      "pmulhuw    %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
      "pand       %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
      "pslld      $10,%%xmm0                    \n"  // A2 x10 G10 x10
      "por        %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
      "movdqu     %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
      "add        $0x10,%0                      \n"
      "sub        $0x4,%2                       \n"
      "jg         1b                            \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa     %3,%%xmm2                     \n"  // shuffler for RB
      "movd       %4,%%xmm3                     \n"  // multiplier for RB
      "movd       %5,%%xmm4                     \n"  // mask for R10 B10
      "movd       %6,%%xmm5                     \n"  // mask for AG
      "movd       %7,%%xmm6                     \n"  // multiplier for AG
      "pshufd     $0x0,%%xmm3,%%xmm3            \n"
      "pshufd     $0x0,%%xmm4,%%xmm4            \n"
      "pshufd     $0x0,%%xmm5,%%xmm5            \n"
      "pshufd     $0x0,%%xmm6,%%xmm6            \n"
      "sub        %0,%1                         \n"

      "1:                                       \n"
      "movdqu     (%0),%%xmm0                   \n"  // fetch 4 ABGR pixels
      "movdqa     %%xmm0,%%xmm1                 \n"
      "pshufb     %%xmm2,%%xmm1                 \n"  // R0B0
      "pand       %%xmm5,%%xmm0                 \n"  // A0G0
      "pmulhuw    %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
      "pmulhuw    %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
      "pand       %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
      "pslld      $10,%%xmm0                    \n"  // A2 x10 G10 x10
      "por        %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
      "movdqu     %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
      "add        $0x10,%0                      \n"
      "sub        $0x4,%2                       \n"
      "jg         1b                            \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3  reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTOAR30ROW_AVX2
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
      "vbroadcastss  %4,%%ymm3                   \n"  // multiplier for RB
      "vbroadcastss  %5,%%ymm4                   \n"  // mask for R10 B10
      "vbroadcastss  %6,%%ymm5                   \n"  // mask for AG
      "vbroadcastss  %7,%%ymm6                   \n"  // multiplier for AG
      "sub        %0,%1                          \n"

      "1:                                        \n"
      "vmovdqu    (%0),%%ymm0                    \n"  // fetch 8 ARGB pixels
      "vpshufb    %%ymm2,%%ymm0,%%ymm1           \n"  // R0B0
      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"  // A0G0
      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"  // X2 R16 X4  B10
      "vpmulhuw   %%ymm6,%%ymm0,%%ymm0           \n"  // X10 A2 X10 G10
      "vpand      %%ymm4,%%ymm1,%%ymm1           \n"  // X2 R10 X10 B10
      "vpslld     $10,%%ymm0,%%ymm0              \n"  // A2 x10 G10 x10
      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"  // A2 R10 G10 B10
      "vmovdqu    %%ymm0,(%1,%0)                 \n"  // store 8 AR30 pixels
      "add        $0x20,%0                       \n"
      "sub        $0x8,%2                        \n"
      "jg         1b                             \n"
      "vzeroupper                                \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#ifdef HAS_ABGRTOAR30ROW_AVX2
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
      "vbroadcastss  %4,%%ymm3                   \n"  // multiplier for RB
      "vbroadcastss  %5,%%ymm4                   \n"  // mask for R10 B10
      "vbroadcastss  %6,%%ymm5                   \n"  // mask for AG
      "vbroadcastss  %7,%%ymm6                   \n"  // multiplier for AG
      "sub        %0,%1                          \n"

      "1:                                        \n"
      "vmovdqu    (%0),%%ymm0                    \n"  // fetch 8 ABGR pixels
      "vpshufb    %%ymm2,%%ymm0,%%ymm1           \n"  // R0B0
      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"  // A0G0
      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"  // X2 R16 X4  B10
      "vpmulhuw   %%ymm6,%%ymm0,%%ymm0           \n"  // X10 A2 X10 G10
      "vpand      %%ymm4,%%ymm1,%%ymm1           \n"  // X2 R10 X10 B10
      "vpslld     $10,%%ymm0,%%ymm0              \n"  // A2 x10 G10 x10
      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"  // A2 R10 G10 B10
      "vmovdqu    %%ymm0,(%1,%0)                 \n"  // store 8 AR30 pixels
      "add        $0x20,%0                       \n"
      "sub        $0x8,%2                        \n"
      "jg         1b                             \n"
      "vzeroupper                                \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3  reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa    %3,%%xmm4                       \n"
      "movdqa    %4,%%xmm5                       \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "movdqu    0x20(%0),%%xmm2                 \n"
      "movdqu    0x30(%0),%%xmm3                 \n"
      "pmaddubsw %%xmm4,%%xmm0                   \n"
      "pmaddubsw %%xmm4,%%xmm1                   \n"
      "pmaddubsw %%xmm4,%%xmm2                   \n"
      "pmaddubsw %%xmm4,%%xmm3                   \n"
      "lea       0x40(%0),%0                     \n"
      "phaddw    %%xmm1,%%xmm0                   \n"
      "phaddw    %%xmm3,%%xmm2                   \n"
      "psrlw     $0x7,%%xmm0                     \n"
      "psrlw     $0x7,%%xmm2                     \n"
      "packuswb  %%xmm2,%%xmm0                   \n"
      "paddb     %%xmm5,%%xmm0                   \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "lea       0x10(%1),%1                     \n"
      "sub       $0x10,%2                        \n"
      "jg        1b                              \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToY),   // %3
        "m"(kAddY16)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow, but with JPEG coefficients, no +16 bias, and rounding
// (+64) before the 7 bit shift.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa    %3,%%xmm4                       \n"
      "movdqa    %4,%%xmm5                       \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "movdqu    0x20(%0),%%xmm2                 \n"
      "movdqu    0x30(%0),%%xmm3                 \n"
      "pmaddubsw %%xmm4,%%xmm0                   \n"
      "pmaddubsw %%xmm4,%%xmm1                   \n"
      "pmaddubsw %%xmm4,%%xmm2                   \n"
      "pmaddubsw %%xmm4,%%xmm3                   \n"
      "lea       0x40(%0),%0                     \n"
      "phaddw    %%xmm1,%%xmm0                   \n"
      "phaddw    %%xmm3,%%xmm2                   \n"
      "paddw     %%xmm5,%%xmm0                   \n"
      "paddw     %%xmm5,%%xmm2                   \n"
      "psrlw     $0x7,%%xmm0                     \n"
      "psrlw     $0x7,%%xmm2                     \n"
      "packuswb  %%xmm2,%%xmm0                   \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "lea       0x10(%1),%1                     \n"
      "sub       $0x10,%2                        \n"
      "jg        1b                              \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kAddYJ64)    // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBTOYJROW_SSSE3
1112 
1113 #ifdef HAS_ARGBTOYROW_AVX2
1114 // vpermd for vphaddw + vpackuswb vpermd.
1115 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
1116 
1117 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1118 void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1119   asm volatile(
1120       "vbroadcastf128 %3,%%ymm4                  \n"
1121       "vbroadcastf128 %4,%%ymm5                  \n"
1122       "vmovdqu    %5,%%ymm6                      \n"
1123 
1124       LABELALIGN
1125       "1:                                        \n"
1126       "vmovdqu    (%0),%%ymm0                    \n"
1127       "vmovdqu    0x20(%0),%%ymm1                \n"
1128       "vmovdqu    0x40(%0),%%ymm2                \n"
1129       "vmovdqu    0x60(%0),%%ymm3                \n"
1130       "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
1131       "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
1132       "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
1133       "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
1134       "lea       0x80(%0),%0                     \n"
1135       "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
1136       "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
1137       "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
1138       "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
1139       "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
1140       "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
1141       "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
1142       "vmovdqu    %%ymm0,(%1)                    \n"
1143       "lea       0x20(%1),%1                     \n"
1144       "sub       $0x20,%2                        \n"
1145       "jg        1b                              \n"
1146       "vzeroupper                                \n"
1147       : "+r"(src_argb),         // %0
1148         "+r"(dst_y),            // %1
1149         "+r"(width)             // %2
1150       : "m"(kARGBToY),          // %3
1151         "m"(kAddY16),           // %4
1152         "m"(kPermdARGBToY_AVX)  // %5
1153       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1154 }
1155 #endif  // HAS_ARGBTOYROW_AVX2
1156 
1157 #ifdef HAS_ARGBTOYJROW_AVX2
1158 // Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
1159 void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1160   asm volatile(
1161       "vbroadcastf128 %3,%%ymm4                  \n"
1162       "vbroadcastf128 %4,%%ymm5                  \n"
1163       "vmovdqu    %5,%%ymm6                      \n"
1164 
1165       LABELALIGN
1166       "1:                                        \n"
1167       "vmovdqu    (%0),%%ymm0                    \n"
1168       "vmovdqu    0x20(%0),%%ymm1                \n"
1169       "vmovdqu    0x40(%0),%%ymm2                \n"
1170       "vmovdqu    0x60(%0),%%ymm3                \n"
1171       "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
1172       "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
1173       "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
1174       "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
1175       "lea       0x80(%0),%0                     \n"
1176       "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
1177       "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
1178       "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
1179       "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
1180       "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
1181       "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
1182       "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
1183       "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
1184       "vmovdqu    %%ymm0,(%1)                    \n"
1185       "lea       0x20(%1),%1                     \n"
1186       "sub       $0x20,%2                        \n"
1187       "jg        1b                              \n"
1188       "vzeroupper                                \n"
1189       : "+r"(src_argb),         // %0
1190         "+r"(dst_y),            // %1
1191         "+r"(width)             // %2
1192       : "m"(kARGBToYJ),         // %3
1193         "m"(kAddYJ64),          // %4
1194         "m"(kPermdARGBToY_AVX)  // %5
1195       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1196 }
1197 #endif  // HAS_ARGBTOYJROW_AVX2
1198 
1199 #ifdef HAS_ARGBTOUVROW_SSSE3
1200 void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
1201                        int src_stride_argb,
1202                        uint8_t* dst_u,
1203                        uint8_t* dst_v,
1204                        int width) {
1205   asm volatile(
1206       "movdqa    %5,%%xmm3                       \n"
1207       "movdqa    %6,%%xmm4                       \n"
1208       "movdqa    %7,%%xmm5                       \n"
1209       "sub       %1,%2                           \n"
1210 
1211       LABELALIGN
1212       "1:                                        \n"
1213       "movdqu    (%0),%%xmm0                     \n"
1214       "movdqu    0x00(%0,%4,1),%%xmm7            \n"
1215       "pavgb     %%xmm7,%%xmm0                   \n"
1216       "movdqu    0x10(%0),%%xmm1                 \n"
1217       "movdqu    0x10(%0,%4,1),%%xmm7            \n"
1218       "pavgb     %%xmm7,%%xmm1                   \n"
1219       "movdqu    0x20(%0),%%xmm2                 \n"
1220       "movdqu    0x20(%0,%4,1),%%xmm7            \n"
1221       "pavgb     %%xmm7,%%xmm2                   \n"
1222       "movdqu    0x30(%0),%%xmm6                 \n"
1223       "movdqu    0x30(%0,%4,1),%%xmm7            \n"
1224       "pavgb     %%xmm7,%%xmm6                   \n"
1225 
1226       "lea       0x40(%0),%0                     \n"
1227       "movdqa    %%xmm0,%%xmm7                   \n"
1228       "shufps    $0x88,%%xmm1,%%xmm0             \n"
1229       "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1230       "pavgb     %%xmm7,%%xmm0                   \n"
1231       "movdqa    %%xmm2,%%xmm7                   \n"
1232       "shufps    $0x88,%%xmm6,%%xmm2             \n"
1233       "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1234       "pavgb     %%xmm7,%%xmm2                   \n"
1235       "movdqa    %%xmm0,%%xmm1                   \n"
1236       "movdqa    %%xmm2,%%xmm6                   \n"
1237       "pmaddubsw %%xmm4,%%xmm0                   \n"
1238       "pmaddubsw %%xmm4,%%xmm2                   \n"
1239       "pmaddubsw %%xmm3,%%xmm1                   \n"
1240       "pmaddubsw %%xmm3,%%xmm6                   \n"
1241       "phaddw    %%xmm2,%%xmm0                   \n"
1242       "phaddw    %%xmm6,%%xmm1                   \n"
1243       "psraw     $0x8,%%xmm0                     \n"
1244       "psraw     $0x8,%%xmm1                     \n"
1245       "packsswb  %%xmm1,%%xmm0                   \n"
1246       "paddb     %%xmm5,%%xmm0                   \n"
1247       "movlps    %%xmm0,(%1)                     \n"
1248       "movhps    %%xmm0,0x00(%1,%2,1)            \n"
1249       "lea       0x8(%1),%1                      \n"
1250       "sub       $0x10,%3                        \n"
1251       "jg        1b                              \n"
1252       : "+r"(src_argb0),                   // %0
1253         "+r"(dst_u),                       // %1
1254         "+r"(dst_v),                       // %2
1255         "+rm"(width)                       // %3
1256       : "r"((intptr_t)(src_stride_argb)),  // %4
1257         "m"(kARGBToV),                     // %5
1258         "m"(kARGBToU),                     // %6
1259         "m"(kAddUV128)                     // %7
1260       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1261 }
1262 #endif  // HAS_ARGBTOUVROW_SSSE3
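// Illustrative scalar sketch of the chroma math above (hypothetical
// helpers, not used by the SIMD code): pavgb averages the two rows, then
// shufps/pavgb average each 2x2 block down to one B,G,R sample, and per
// sample (>> is an arithmetic shift here, mirroring psraw):
static __inline int RGBToU_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return ((112 * b - 74 * g - 38 * r) >> 8) + 128;  // psraw $8, then +128.
}
static __inline int RGBToV_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return ((112 * r - 94 * g - 18 * b) >> 8) + 128;
}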
1263 
1264 #ifdef HAS_ARGBTOUVROW_AVX2
1265 // vpshufb selector, applied to 16-bit pairs, to restore sample order after vphaddw + vpacksswb.
1266 static const lvec8 kShufARGBToUV_AVX = {
1267     0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
1268     0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
1269 void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
1270                       int src_stride_argb,
1271                       uint8_t* dst_u,
1272                       uint8_t* dst_v,
1273                       int width) {
1274   asm volatile(
1275       "vbroadcastf128 %5,%%ymm5                  \n"
1276       "vbroadcastf128 %6,%%ymm6                  \n"
1277       "vbroadcastf128 %7,%%ymm7                  \n"
1278       "sub        %1,%2                          \n"
1279 
1280       LABELALIGN
1281       "1:                                        \n"
1282       "vmovdqu    (%0),%%ymm0                    \n"
1283       "vmovdqu    0x20(%0),%%ymm1                \n"
1284       "vmovdqu    0x40(%0),%%ymm2                \n"
1285       "vmovdqu    0x60(%0),%%ymm3                \n"
1286       "vpavgb    0x00(%0,%4,1),%%ymm0,%%ymm0     \n"
1287       "vpavgb    0x20(%0,%4,1),%%ymm1,%%ymm1     \n"
1288       "vpavgb    0x40(%0,%4,1),%%ymm2,%%ymm2     \n"
1289       "vpavgb    0x60(%0,%4,1),%%ymm3,%%ymm3     \n"
1290       "lea        0x80(%0),%0                    \n"
1291       "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
1292       "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
1293       "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
1294       "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
1295       "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
1296       "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
1297 
1298       "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
1299       "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
1300       "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
1301       "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
1302       "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
1303       "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
1304       "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
1305       "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
1306       "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
1307       "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
1308       "vpshufb    %8,%%ymm0,%%ymm0               \n"
1309       "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"
1310 
1311       "vextractf128 $0x0,%%ymm0,(%1)             \n"
1312       "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
1313       "lea        0x10(%1),%1                    \n"
1314       "sub        $0x20,%3                       \n"
1315       "jg         1b                             \n"
1316       "vzeroupper                                \n"
1317       : "+r"(src_argb0),                   // %0
1318         "+r"(dst_u),                       // %1
1319         "+r"(dst_v),                       // %2
1320         "+rm"(width)                       // %3
1321       : "r"((intptr_t)(src_stride_argb)),  // %4
1322         "m"(kAddUV128),                    // %5
1323         "m"(kARGBToV),                     // %6
1324         "m"(kARGBToU),                     // %7
1325         "m"(kShufARGBToUV_AVX)             // %8
1326       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1327         "xmm7");
1328 }
1329 #endif  // HAS_ARGBTOUVROW_AVX2
1330 
1331 #ifdef HAS_ARGBTOUVJROW_AVX2
1332 void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
1333                        int src_stride_argb,
1334                        uint8_t* dst_u,
1335                        uint8_t* dst_v,
1336                        int width) {
1337   asm volatile(
1338       "vbroadcastf128 %5,%%ymm5                  \n"
1339       "vbroadcastf128 %6,%%ymm6                  \n"
1340       "vbroadcastf128 %7,%%ymm7                  \n"
1341       "sub        %1,%2                          \n"
1342 
1343       LABELALIGN
1344       "1:                                        \n"
1345       "vmovdqu    (%0),%%ymm0                    \n"
1346       "vmovdqu    0x20(%0),%%ymm1                \n"
1347       "vmovdqu    0x40(%0),%%ymm2                \n"
1348       "vmovdqu    0x60(%0),%%ymm3                \n"
1349       "vpavgb    0x00(%0,%4,1),%%ymm0,%%ymm0     \n"
1350       "vpavgb    0x20(%0,%4,1),%%ymm1,%%ymm1     \n"
1351       "vpavgb    0x40(%0,%4,1),%%ymm2,%%ymm2     \n"
1352       "vpavgb    0x60(%0,%4,1),%%ymm3,%%ymm3     \n"
1353       "lea       0x80(%0),%0                     \n"
1354       "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
1355       "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
1356       "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
1357       "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
1358       "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
1359       "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
1360 
1361       "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
1362       "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
1363       "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
1364       "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
1365       "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
1366       "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
1367       "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
1368       "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
1369       "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
1370       "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
1371       "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
1372       "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
1373       "vpshufb    %8,%%ymm0,%%ymm0               \n"
1374 
1375       "vextractf128 $0x0,%%ymm0,(%1)             \n"
1376       "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
1377       "lea       0x10(%1),%1                     \n"
1378       "sub       $0x20,%3                        \n"
1379       "jg        1b                              \n"
1380       "vzeroupper                                \n"
1381       : "+r"(src_argb0),                   // %0
1382         "+r"(dst_u),                       // %1
1383         "+r"(dst_v),                       // %2
1384         "+rm"(width)                       // %3
1385       : "r"((intptr_t)(src_stride_argb)),  // %4
1386         "m"(kAddUVJ128),                   // %5
1387         "m"(kARGBToVJ),                    // %6
1388         "m"(kARGBToUJ),                    // %7
1389         "m"(kShufARGBToUV_AVX)             // %8
1390       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1391         "xmm7");
1392 }
1393 #endif  // HAS_ARGBTOUVJROW_AVX2
1394 
1395 #ifdef HAS_ARGBTOUVJROW_SSSE3
1396 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
1397                         int src_stride_argb,
1398                         uint8_t* dst_u,
1399                         uint8_t* dst_v,
1400                         int width) {
1401   asm volatile(
1402       "movdqa    %5,%%xmm3                       \n"
1403       "movdqa    %6,%%xmm4                       \n"
1404       "movdqa    %7,%%xmm5                       \n"
1405       "sub       %1,%2                           \n"
1406 
1407       LABELALIGN
1408       "1:                                        \n"
1409       "movdqu    (%0),%%xmm0                     \n"
1410       "movdqu    0x00(%0,%4,1),%%xmm7            \n"
1411       "pavgb     %%xmm7,%%xmm0                   \n"
1412       "movdqu    0x10(%0),%%xmm1                 \n"
1413       "movdqu    0x10(%0,%4,1),%%xmm7            \n"
1414       "pavgb     %%xmm7,%%xmm1                   \n"
1415       "movdqu    0x20(%0),%%xmm2                 \n"
1416       "movdqu    0x20(%0,%4,1),%%xmm7            \n"
1417       "pavgb     %%xmm7,%%xmm2                   \n"
1418       "movdqu    0x30(%0),%%xmm6                 \n"
1419       "movdqu    0x30(%0,%4,1),%%xmm7            \n"
1420       "pavgb     %%xmm7,%%xmm6                   \n"
1421 
1422       "lea       0x40(%0),%0                     \n"
1423       "movdqa    %%xmm0,%%xmm7                   \n"
1424       "shufps    $0x88,%%xmm1,%%xmm0             \n"
1425       "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1426       "pavgb     %%xmm7,%%xmm0                   \n"
1427       "movdqa    %%xmm2,%%xmm7                   \n"
1428       "shufps    $0x88,%%xmm6,%%xmm2             \n"
1429       "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1430       "pavgb     %%xmm7,%%xmm2                   \n"
1431       "movdqa    %%xmm0,%%xmm1                   \n"
1432       "movdqa    %%xmm2,%%xmm6                   \n"
1433       "pmaddubsw %%xmm4,%%xmm0                   \n"
1434       "pmaddubsw %%xmm4,%%xmm2                   \n"
1435       "pmaddubsw %%xmm3,%%xmm1                   \n"
1436       "pmaddubsw %%xmm3,%%xmm6                   \n"
1437       "phaddw    %%xmm2,%%xmm0                   \n"
1438       "phaddw    %%xmm6,%%xmm1                   \n"
1439       "paddw     %%xmm5,%%xmm0                   \n"
1440       "paddw     %%xmm5,%%xmm1                   \n"
1441       "psraw     $0x8,%%xmm0                     \n"
1442       "psraw     $0x8,%%xmm1                     \n"
1443       "packsswb  %%xmm1,%%xmm0                   \n"
1444       "movlps    %%xmm0,(%1)                     \n"
1445       "movhps    %%xmm0,0x00(%1,%2,1)            \n"
1446       "lea       0x8(%1),%1                      \n"
1447       "sub       $0x10,%3                        \n"
1448       "jg        1b                              \n"
1449       : "+r"(src_argb0),                   // %0
1450         "+r"(dst_u),                       // %1
1451         "+r"(dst_v),                       // %2
1452         "+rm"(width)                       // %3
1453       : "r"((intptr_t)(src_stride_argb)),  // %4
1454         "m"(kARGBToVJ),                    // %5
1455         "m"(kARGBToUJ),                    // %6
1456         "m"(kAddUVJ128)                    // %7
1457       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1458 }
1459 #endif  // HAS_ARGBTOUVJROW_SSSE3
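// Note the rounding difference in the J (full-range) variants above:
// kAddUVJ128 is added as a 16-bit word bias before psraw $8, folding the
// +128 offset and rounding into one step, whereas the BT.601 variants
// shift first and add the byte bias 128 with paddb afterwards.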
1460 
1461 #ifdef HAS_ARGBTOUV444ROW_SSSE3
1462 void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
1463                           uint8_t* dst_u,
1464                           uint8_t* dst_v,
1465                           int width) {
1466   asm volatile(
1467       "movdqa    %4,%%xmm3                       \n"
1468       "movdqa    %5,%%xmm4                       \n"
1469       "movdqa    %6,%%xmm5                       \n"
1470       "sub       %1,%2                           \n"
1471 
1472       LABELALIGN
1473       "1:                                        \n"
1474       "movdqu    (%0),%%xmm0                     \n"
1475       "movdqu    0x10(%0),%%xmm1                 \n"
1476       "movdqu    0x20(%0),%%xmm2                 \n"
1477       "movdqu    0x30(%0),%%xmm6                 \n"
1478       "pmaddubsw %%xmm4,%%xmm0                   \n"
1479       "pmaddubsw %%xmm4,%%xmm1                   \n"
1480       "pmaddubsw %%xmm4,%%xmm2                   \n"
1481       "pmaddubsw %%xmm4,%%xmm6                   \n"
1482       "phaddw    %%xmm1,%%xmm0                   \n"
1483       "phaddw    %%xmm6,%%xmm2                   \n"
1484       "psraw     $0x8,%%xmm0                     \n"
1485       "psraw     $0x8,%%xmm2                     \n"
1486       "packsswb  %%xmm2,%%xmm0                   \n"
1487       "paddb     %%xmm5,%%xmm0                   \n"
1488       "movdqu    %%xmm0,(%1)                     \n"
1489       "movdqu    (%0),%%xmm0                     \n"
1490       "movdqu    0x10(%0),%%xmm1                 \n"
1491       "movdqu    0x20(%0),%%xmm2                 \n"
1492       "movdqu    0x30(%0),%%xmm6                 \n"
1493       "pmaddubsw %%xmm3,%%xmm0                   \n"
1494       "pmaddubsw %%xmm3,%%xmm1                   \n"
1495       "pmaddubsw %%xmm3,%%xmm2                   \n"
1496       "pmaddubsw %%xmm3,%%xmm6                   \n"
1497       "phaddw    %%xmm1,%%xmm0                   \n"
1498       "phaddw    %%xmm6,%%xmm2                   \n"
1499       "psraw     $0x8,%%xmm0                     \n"
1500       "psraw     $0x8,%%xmm2                     \n"
1501       "packsswb  %%xmm2,%%xmm0                   \n"
1502       "paddb     %%xmm5,%%xmm0                   \n"
1503       "lea       0x40(%0),%0                     \n"
1504       "movdqu    %%xmm0,0x00(%1,%2,1)            \n"
1505       "lea       0x10(%1),%1                     \n"
1506       "sub       $0x10,%3                        \n"
1507       "jg        1b                              \n"
1508       : "+r"(src_argb),  // %0
1509         "+r"(dst_u),     // %1
1510         "+r"(dst_v),     // %2
1511         "+rm"(width)     // %3
1512       : "m"(kARGBToV),   // %4
1513         "m"(kARGBToU),   // %5
1514         "m"(kAddUV128)   // %6
1515       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
1516 }
1517 #endif  // HAS_ARGBTOUV444ROW_SSSE3
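// The 444 variant above performs no 2x2 averaging: it reads the same 16
// pixels twice, applying kARGBToU into dst_u on the first pass and
// kARGBToV into dst_v on the second, producing 16 U and 16 V per iteration.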
1518 
1519 void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
1520   asm volatile(
1521       "movdqa    %4,%%xmm5                       \n"
1522       "movdqa    %3,%%xmm4                       \n"
1523 
1524       LABELALIGN
1525       "1:                                        \n"
1526       "movdqu    (%0),%%xmm0                     \n"
1527       "movdqu    0x10(%0),%%xmm1                 \n"
1528       "movdqu    0x20(%0),%%xmm2                 \n"
1529       "movdqu    0x30(%0),%%xmm3                 \n"
1530       "pmaddubsw %%xmm4,%%xmm0                   \n"
1531       "pmaddubsw %%xmm4,%%xmm1                   \n"
1532       "pmaddubsw %%xmm4,%%xmm2                   \n"
1533       "pmaddubsw %%xmm4,%%xmm3                   \n"
1534       "lea       0x40(%0),%0                     \n"
1535       "phaddw    %%xmm1,%%xmm0                   \n"
1536       "phaddw    %%xmm3,%%xmm2                   \n"
1537       "psrlw     $0x7,%%xmm0                     \n"
1538       "psrlw     $0x7,%%xmm2                     \n"
1539       "packuswb  %%xmm2,%%xmm0                   \n"
1540       "paddb     %%xmm5,%%xmm0                   \n"
1541       "movdqu    %%xmm0,(%1)                     \n"
1542       "lea       0x10(%1),%1                     \n"
1543       "sub       $0x10,%2                        \n"
1544       "jg        1b                              \n"
1545       : "+r"(src_bgra),  // %0
1546         "+r"(dst_y),     // %1
1547         "+r"(width)      // %2
1548       : "m"(kBGRAToY),   // %3
1549         "m"(kAddY16)     // %4
1550       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1551 }
1552 
1553 void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
1554                        int src_stride_bgra,
1555                        uint8_t* dst_u,
1556                        uint8_t* dst_v,
1557                        int width) {
1558   asm volatile(
1559       "movdqa    %5,%%xmm3                       \n"
1560       "movdqa    %6,%%xmm4                       \n"
1561       "movdqa    %7,%%xmm5                       \n"
1562       "sub       %1,%2                           \n"
1563 
1564       LABELALIGN
1565       "1:                                        \n"
1566       "movdqu    (%0),%%xmm0                     \n"
1567       "movdqu    0x00(%0,%4,1),%%xmm7            \n"
1568       "pavgb     %%xmm7,%%xmm0                   \n"
1569       "movdqu    0x10(%0),%%xmm1                 \n"
1570       "movdqu    0x10(%0,%4,1),%%xmm7            \n"
1571       "pavgb     %%xmm7,%%xmm1                   \n"
1572       "movdqu    0x20(%0),%%xmm2                 \n"
1573       "movdqu    0x20(%0,%4,1),%%xmm7            \n"
1574       "pavgb     %%xmm7,%%xmm2                   \n"
1575       "movdqu    0x30(%0),%%xmm6                 \n"
1576       "movdqu    0x30(%0,%4,1),%%xmm7            \n"
1577       "pavgb     %%xmm7,%%xmm6                   \n"
1578 
1579       "lea       0x40(%0),%0                     \n"
1580       "movdqa    %%xmm0,%%xmm7                   \n"
1581       "shufps    $0x88,%%xmm1,%%xmm0             \n"
1582       "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1583       "pavgb     %%xmm7,%%xmm0                   \n"
1584       "movdqa    %%xmm2,%%xmm7                   \n"
1585       "shufps    $0x88,%%xmm6,%%xmm2             \n"
1586       "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1587       "pavgb     %%xmm7,%%xmm2                   \n"
1588       "movdqa    %%xmm0,%%xmm1                   \n"
1589       "movdqa    %%xmm2,%%xmm6                   \n"
1590       "pmaddubsw %%xmm4,%%xmm0                   \n"
1591       "pmaddubsw %%xmm4,%%xmm2                   \n"
1592       "pmaddubsw %%xmm3,%%xmm1                   \n"
1593       "pmaddubsw %%xmm3,%%xmm6                   \n"
1594       "phaddw    %%xmm2,%%xmm0                   \n"
1595       "phaddw    %%xmm6,%%xmm1                   \n"
1596       "psraw     $0x8,%%xmm0                     \n"
1597       "psraw     $0x8,%%xmm1                     \n"
1598       "packsswb  %%xmm1,%%xmm0                   \n"
1599       "paddb     %%xmm5,%%xmm0                   \n"
1600       "movlps    %%xmm0,(%1)                     \n"
1601       "movhps    %%xmm0,0x00(%1,%2,1)            \n"
1602       "lea       0x8(%1),%1                      \n"
1603       "sub       $0x10,%3                        \n"
1604       "jg        1b                              \n"
1605       : "+r"(src_bgra0),                   // %0
1606         "+r"(dst_u),                       // %1
1607         "+r"(dst_v),                       // %2
1608         "+rm"(width)                       // %3
1609       : "r"((intptr_t)(src_stride_bgra)),  // %4
1610         "m"(kBGRAToV),                     // %5
1611         "m"(kBGRAToU),                     // %6
1612         "m"(kAddUV128)                     // %7
1613       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1614 }
1615 
1616 void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
1617   asm volatile(
1618       "movdqa    %4,%%xmm5                       \n"
1619       "movdqa    %3,%%xmm4                       \n"
1620 
1621       LABELALIGN
1622       "1:                                        \n"
1623       "movdqu    (%0),%%xmm0                     \n"
1624       "movdqu    0x10(%0),%%xmm1                 \n"
1625       "movdqu    0x20(%0),%%xmm2                 \n"
1626       "movdqu    0x30(%0),%%xmm3                 \n"
1627       "pmaddubsw %%xmm4,%%xmm0                   \n"
1628       "pmaddubsw %%xmm4,%%xmm1                   \n"
1629       "pmaddubsw %%xmm4,%%xmm2                   \n"
1630       "pmaddubsw %%xmm4,%%xmm3                   \n"
1631       "lea       0x40(%0),%0                     \n"
1632       "phaddw    %%xmm1,%%xmm0                   \n"
1633       "phaddw    %%xmm3,%%xmm2                   \n"
1634       "psrlw     $0x7,%%xmm0                     \n"
1635       "psrlw     $0x7,%%xmm2                     \n"
1636       "packuswb  %%xmm2,%%xmm0                   \n"
1637       "paddb     %%xmm5,%%xmm0                   \n"
1638       "movdqu    %%xmm0,(%1)                     \n"
1639       "lea       0x10(%1),%1                     \n"
1640       "sub       $0x10,%2                        \n"
1641       "jg        1b                              \n"
1642       : "+r"(src_abgr),  // %0
1643         "+r"(dst_y),     // %1
1644         "+r"(width)      // %2
1645       : "m"(kABGRToY),   // %3
1646         "m"(kAddY16)     // %4
1647       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1648 }
1649 
1650 void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
1651   asm volatile(
1652       "movdqa    %4,%%xmm5                       \n"
1653       "movdqa    %3,%%xmm4                       \n"
1654 
1655       LABELALIGN
1656       "1:                                        \n"
1657       "movdqu    (%0),%%xmm0                     \n"
1658       "movdqu    0x10(%0),%%xmm1                 \n"
1659       "movdqu    0x20(%0),%%xmm2                 \n"
1660       "movdqu    0x30(%0),%%xmm3                 \n"
1661       "pmaddubsw %%xmm4,%%xmm0                   \n"
1662       "pmaddubsw %%xmm4,%%xmm1                   \n"
1663       "pmaddubsw %%xmm4,%%xmm2                   \n"
1664       "pmaddubsw %%xmm4,%%xmm3                   \n"
1665       "lea       0x40(%0),%0                     \n"
1666       "phaddw    %%xmm1,%%xmm0                   \n"
1667       "phaddw    %%xmm3,%%xmm2                   \n"
1668       "psrlw     $0x7,%%xmm0                     \n"
1669       "psrlw     $0x7,%%xmm2                     \n"
1670       "packuswb  %%xmm2,%%xmm0                   \n"
1671       "paddb     %%xmm5,%%xmm0                   \n"
1672       "movdqu    %%xmm0,(%1)                     \n"
1673       "lea       0x10(%1),%1                     \n"
1674       "sub       $0x10,%2                        \n"
1675       "jg        1b                              \n"
1676       : "+r"(src_rgba),  // %0
1677         "+r"(dst_y),     // %1
1678         "+r"(width)      // %2
1679       : "m"(kRGBAToY),   // %3
1680         "m"(kAddY16)     // %4
1681       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1682 }
1683 
1684 void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
1685                        int src_stride_abgr,
1686                        uint8_t* dst_u,
1687                        uint8_t* dst_v,
1688                        int width) {
1689   asm volatile(
1690       "movdqa    %5,%%xmm3                       \n"
1691       "movdqa    %6,%%xmm4                       \n"
1692       "movdqa    %7,%%xmm5                       \n"
1693       "sub       %1,%2                           \n"
1694 
1695       LABELALIGN
1696       "1:                                        \n"
1697       "movdqu    (%0),%%xmm0                     \n"
1698       "movdqu    0x00(%0,%4,1),%%xmm7            \n"
1699       "pavgb     %%xmm7,%%xmm0                   \n"
1700       "movdqu    0x10(%0),%%xmm1                 \n"
1701       "movdqu    0x10(%0,%4,1),%%xmm7            \n"
1702       "pavgb     %%xmm7,%%xmm1                   \n"
1703       "movdqu    0x20(%0),%%xmm2                 \n"
1704       "movdqu    0x20(%0,%4,1),%%xmm7            \n"
1705       "pavgb     %%xmm7,%%xmm2                   \n"
1706       "movdqu    0x30(%0),%%xmm6                 \n"
1707       "movdqu    0x30(%0,%4,1),%%xmm7            \n"
1708       "pavgb     %%xmm7,%%xmm6                   \n"
1709 
1710       "lea       0x40(%0),%0                     \n"
1711       "movdqa    %%xmm0,%%xmm7                   \n"
1712       "shufps    $0x88,%%xmm1,%%xmm0             \n"
1713       "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1714       "pavgb     %%xmm7,%%xmm0                   \n"
1715       "movdqa    %%xmm2,%%xmm7                   \n"
1716       "shufps    $0x88,%%xmm6,%%xmm2             \n"
1717       "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1718       "pavgb     %%xmm7,%%xmm2                   \n"
1719       "movdqa    %%xmm0,%%xmm1                   \n"
1720       "movdqa    %%xmm2,%%xmm6                   \n"
1721       "pmaddubsw %%xmm4,%%xmm0                   \n"
1722       "pmaddubsw %%xmm4,%%xmm2                   \n"
1723       "pmaddubsw %%xmm3,%%xmm1                   \n"
1724       "pmaddubsw %%xmm3,%%xmm6                   \n"
1725       "phaddw    %%xmm2,%%xmm0                   \n"
1726       "phaddw    %%xmm6,%%xmm1                   \n"
1727       "psraw     $0x8,%%xmm0                     \n"
1728       "psraw     $0x8,%%xmm1                     \n"
1729       "packsswb  %%xmm1,%%xmm0                   \n"
1730       "paddb     %%xmm5,%%xmm0                   \n"
1731       "movlps    %%xmm0,(%1)                     \n"
1732       "movhps    %%xmm0,0x00(%1,%2,1)            \n"
1733       "lea       0x8(%1),%1                      \n"
1734       "sub       $0x10,%3                        \n"
1735       "jg        1b                              \n"
1736       : "+r"(src_abgr0),                   // %0
1737         "+r"(dst_u),                       // %1
1738         "+r"(dst_v),                       // %2
1739         "+rm"(width)                       // %3
1740       : "r"((intptr_t)(src_stride_abgr)),  // %4
1741         "m"(kABGRToV),                     // %5
1742         "m"(kABGRToU),                     // %6
1743         "m"(kAddUV128)                     // %7
1744       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1745 }
1746 
1747 void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
1748                        int src_stride_rgba,
1749                        uint8_t* dst_u,
1750                        uint8_t* dst_v,
1751                        int width) {
1752   asm volatile(
1753       "movdqa    %5,%%xmm3                       \n"
1754       "movdqa    %6,%%xmm4                       \n"
1755       "movdqa    %7,%%xmm5                       \n"
1756       "sub       %1,%2                           \n"
1757 
1758       LABELALIGN
1759       "1:                                        \n"
1760       "movdqu    (%0),%%xmm0                     \n"
1761       "movdqu    0x00(%0,%4,1),%%xmm7            \n"
1762       "pavgb     %%xmm7,%%xmm0                   \n"
1763       "movdqu    0x10(%0),%%xmm1                 \n"
1764       "movdqu    0x10(%0,%4,1),%%xmm7            \n"
1765       "pavgb     %%xmm7,%%xmm1                   \n"
1766       "movdqu    0x20(%0),%%xmm2                 \n"
1767       "movdqu    0x20(%0,%4,1),%%xmm7            \n"
1768       "pavgb     %%xmm7,%%xmm2                   \n"
1769       "movdqu    0x30(%0),%%xmm6                 \n"
1770       "movdqu    0x30(%0,%4,1),%%xmm7            \n"
1771       "pavgb     %%xmm7,%%xmm6                   \n"
1772 
1773       "lea       0x40(%0),%0                     \n"
1774       "movdqa    %%xmm0,%%xmm7                   \n"
1775       "shufps    $0x88,%%xmm1,%%xmm0             \n"
1776       "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1777       "pavgb     %%xmm7,%%xmm0                   \n"
1778       "movdqa    %%xmm2,%%xmm7                   \n"
1779       "shufps    $0x88,%%xmm6,%%xmm2             \n"
1780       "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1781       "pavgb     %%xmm7,%%xmm2                   \n"
1782       "movdqa    %%xmm0,%%xmm1                   \n"
1783       "movdqa    %%xmm2,%%xmm6                   \n"
1784       "pmaddubsw %%xmm4,%%xmm0                   \n"
1785       "pmaddubsw %%xmm4,%%xmm2                   \n"
1786       "pmaddubsw %%xmm3,%%xmm1                   \n"
1787       "pmaddubsw %%xmm3,%%xmm6                   \n"
1788       "phaddw    %%xmm2,%%xmm0                   \n"
1789       "phaddw    %%xmm6,%%xmm1                   \n"
1790       "psraw     $0x8,%%xmm0                     \n"
1791       "psraw     $0x8,%%xmm1                     \n"
1792       "packsswb  %%xmm1,%%xmm0                   \n"
1793       "paddb     %%xmm5,%%xmm0                   \n"
1794       "movlps    %%xmm0,(%1)                     \n"
1795       "movhps    %%xmm0,0x00(%1,%2,1)            \n"
1796       "lea       0x8(%1),%1                      \n"
1797       "sub       $0x10,%3                        \n"
1798       "jg        1b                              \n"
1799       : "+r"(src_rgba0),                   // %0
1800         "+r"(dst_u),                       // %1
1801         "+r"(dst_v),                       // %2
1802         "+rm"(width)                       // %3
1803       : "r"((intptr_t)(src_stride_rgba)),  // %4
1804         "m"(kRGBAToV),                     // %5
1805         "m"(kRGBAToU),                     // %6
1806         "m"(kAddUV128)                     // %7
1807       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1808 }
1809 
1810 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
1811 
1812 // Read 8 UV from 444
1813 #define READYUV444                                                \
1814   "movq       (%[u_buf]),%%xmm0                               \n" \
1815   "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
1816   "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
1817   "punpcklbw  %%xmm1,%%xmm0                                   \n" \
1818   "movq       (%[y_buf]),%%xmm4                               \n" \
1819   "punpcklbw  %%xmm4,%%xmm4                                   \n" \
1820   "lea        0x8(%[y_buf]),%[y_buf]                          \n"
1821 
1822 // Read 4 UV from 422, upsample to 8 UV
1823 #define READYUV422                                                \
1824   "movd       (%[u_buf]),%%xmm0                               \n" \
1825   "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
1826   "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
1827   "punpcklbw  %%xmm1,%%xmm0                                   \n" \
1828   "punpcklwd  %%xmm0,%%xmm0                                   \n" \
1829   "movq       (%[y_buf]),%%xmm4                               \n" \
1830   "punpcklbw  %%xmm4,%%xmm4                                   \n" \
1831   "lea        0x8(%[y_buf]),%[y_buf]                          \n"
1832 
1833 // Read 4 UV from 10-bit 422, upsample to 8 UV
1834 // TODO(fbarchard): Consider shufb to replace pack/unpack
1835 // TODO(fbarchard): Consider pmulhuw to replace psraw
1836 // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
1837 #define READYUV210                                                \
1838   "movq       (%[u_buf]),%%xmm0                               \n" \
1839   "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
1840   "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
1841   "punpcklwd  %%xmm1,%%xmm0                                   \n" \
1842   "psraw      $0x2,%%xmm0                                     \n" \
1843   "packuswb   %%xmm0,%%xmm0                                   \n" \
1844   "punpcklwd  %%xmm0,%%xmm0                                   \n" \
1845   "movdqu     (%[y_buf]),%%xmm4                               \n" \
1846   "psllw      $0x6,%%xmm4                                     \n" \
1847   "lea        0x10(%[y_buf]),%[y_buf]                         \n"
1848 
1849 // Read 4 UV from 422 and upsample to 8 UV; also read 8 alpha bytes.
1850 #define READYUVA422                                               \
1851   "movd       (%[u_buf]),%%xmm0                               \n" \
1852   "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
1853   "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
1854   "punpcklbw  %%xmm1,%%xmm0                                   \n" \
1855   "punpcklwd  %%xmm0,%%xmm0                                   \n" \
1856   "movq       (%[y_buf]),%%xmm4                               \n" \
1857   "punpcklbw  %%xmm4,%%xmm4                                   \n" \
1858   "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
1859   "movq       (%[a_buf]),%%xmm5                               \n" \
1860   "lea        0x8(%[a_buf]),%[a_buf]                          \n"
1861 
1862 // Read 4 UV from NV12, upsample to 8 UV
1863 #define READNV12                                                  \
1864   "movq       (%[uv_buf]),%%xmm0                              \n" \
1865   "lea        0x8(%[uv_buf]),%[uv_buf]                        \n" \
1866   "punpcklwd  %%xmm0,%%xmm0                                   \n" \
1867   "movq       (%[y_buf]),%%xmm4                               \n" \
1868   "punpcklbw  %%xmm4,%%xmm4                                   \n" \
1869   "lea        0x8(%[y_buf]),%[y_buf]                          \n"
1870 
1871 // Read 4 VU from NV21, upsample to 8 UV
1872 #define READNV21                                                  \
1873   "movq       (%[vu_buf]),%%xmm0                              \n" \
1874   "lea        0x8(%[vu_buf]),%[vu_buf]                        \n" \
1875   "pshufb     %[kShuffleNV21], %%xmm0                         \n" \
1876   "movq       (%[y_buf]),%%xmm4                               \n" \
1877   "punpcklbw  %%xmm4,%%xmm4                                   \n" \
1878   "lea        0x8(%[y_buf]),%[y_buf]                          \n"
1879 
1880 // Read 8 pixels of YUY2 (8 Y, 4 UV); upsample the 4 UV to 8 UV.
1881 #define READYUY2                                                  \
1882   "movdqu     (%[yuy2_buf]),%%xmm4                            \n" \
1883   "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n" \
1884   "movdqu     (%[yuy2_buf]),%%xmm0                            \n" \
1885   "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n" \
1886   "lea        0x10(%[yuy2_buf]),%[yuy2_buf]                   \n"
1887 
1888 // Read 8 pixels of UYVY (8 Y, 4 UV); upsample the 4 UV to 8 UV.
1889 #define READUYVY                                                  \
1890   "movdqu     (%[uyvy_buf]),%%xmm4                            \n" \
1891   "pshufb     %[kShuffleUYVYY], %%xmm4                        \n" \
1892   "movdqu     (%[uyvy_buf]),%%xmm0                            \n" \
1893   "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n" \
1894   "lea        0x10(%[uyvy_buf]),%[uyvy_buf]                   \n"
1895 
1896 #if defined(__x86_64__)
1897 #define YUVTORGB_SETUP(yuvconstants)                              \
1898   "movdqa     (%[yuvconstants]),%%xmm8                        \n" \
1899   "movdqa     32(%[yuvconstants]),%%xmm9                      \n" \
1900   "movdqa     64(%[yuvconstants]),%%xmm10                     \n" \
1901   "movdqa     96(%[yuvconstants]),%%xmm11                     \n" \
1902   "movdqa     128(%[yuvconstants]),%%xmm12                    \n" \
1903   "movdqa     160(%[yuvconstants]),%%xmm13                    \n" \
1904   "movdqa     192(%[yuvconstants]),%%xmm14                    \n"
1905 // Convert 8 pixels: 8 UV and 8 Y
1906 #define YUVTORGB16(yuvconstants)                                  \
1907   "movdqa     %%xmm0,%%xmm1                                   \n" \
1908   "movdqa     %%xmm0,%%xmm2                                   \n" \
1909   "movdqa     %%xmm0,%%xmm3                                   \n" \
1910   "movdqa     %%xmm11,%%xmm0                                  \n" \
1911   "pmaddubsw  %%xmm8,%%xmm1                                   \n" \
1912   "psubw      %%xmm1,%%xmm0                                   \n" \
1913   "movdqa     %%xmm12,%%xmm1                                  \n" \
1914   "pmaddubsw  %%xmm9,%%xmm2                                   \n" \
1915   "psubw      %%xmm2,%%xmm1                                   \n" \
1916   "movdqa     %%xmm13,%%xmm2                                  \n" \
1917   "pmaddubsw  %%xmm10,%%xmm3                                  \n" \
1918   "psubw      %%xmm3,%%xmm2                                   \n" \
1919   "pmulhuw    %%xmm14,%%xmm4                                  \n" \
1920   "paddsw     %%xmm4,%%xmm0                                   \n" \
1921   "paddsw     %%xmm4,%%xmm1                                   \n" \
1922   "paddsw     %%xmm4,%%xmm2                                   \n"
1923 #define YUVTORGB_REGS \
1924   "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
1925 
1926 #else
1927 #define YUVTORGB_SETUP(yuvconstants)
1928 // Convert 8 pixels: 8 UV and 8 Y
1929 #define YUVTORGB16(yuvconstants)                                  \
1930   "movdqa     %%xmm0,%%xmm1                                   \n" \
1931   "movdqa     %%xmm0,%%xmm2                                   \n" \
1932   "movdqa     %%xmm0,%%xmm3                                   \n" \
1933   "movdqa     96(%[yuvconstants]),%%xmm0                      \n" \
1934   "pmaddubsw  (%[yuvconstants]),%%xmm1                        \n" \
1935   "psubw      %%xmm1,%%xmm0                                   \n" \
1936   "movdqa     128(%[yuvconstants]),%%xmm1                     \n" \
1937   "pmaddubsw  32(%[yuvconstants]),%%xmm2                      \n" \
1938   "psubw      %%xmm2,%%xmm1                                   \n" \
1939   "movdqa     160(%[yuvconstants]),%%xmm2                     \n" \
1940   "pmaddubsw  64(%[yuvconstants]),%%xmm3                      \n" \
1941   "psubw      %%xmm3,%%xmm2                                   \n" \
1942   "pmulhuw    192(%[yuvconstants]),%%xmm4                     \n" \
1943   "paddsw     %%xmm4,%%xmm0                                   \n" \
1944   "paddsw     %%xmm4,%%xmm1                                   \n" \
1945   "paddsw     %%xmm4,%%xmm2                                   \n"
1946 #define YUVTORGB_REGS
1947 #endif
1948 
1949 #define YUVTORGB(yuvconstants)                                    \
1950   YUVTORGB16(yuvconstants)                                        \
1951   "psraw      $0x6,%%xmm0                                     \n" \
1952   "psraw      $0x6,%%xmm1                                     \n" \
1953   "psraw      $0x6,%%xmm2                                     \n" \
1954   "packuswb   %%xmm0,%%xmm0                                   \n" \
1955   "packuswb   %%xmm1,%%xmm1                                   \n" \
1956   "packuswb   %%xmm2,%%xmm2                                   \n"
1957 
1958 // Store 8 ARGB values.
1959 #define STOREARGB                                                  \
1960   "punpcklbw  %%xmm1,%%xmm0                                    \n" \
1961   "punpcklbw  %%xmm5,%%xmm2                                    \n" \
1962   "movdqa     %%xmm0,%%xmm1                                    \n" \
1963   "punpcklwd  %%xmm2,%%xmm0                                    \n" \
1964   "punpckhwd  %%xmm2,%%xmm1                                    \n" \
1965   "movdqu     %%xmm0,(%[dst_argb])                             \n" \
1966   "movdqu     %%xmm1,0x10(%[dst_argb])                         \n" \
1967   "lea        0x20(%[dst_argb]), %[dst_argb]                   \n"
1968 
1969 // Store 8 RGBA values.
1970 #define STORERGBA                                                  \
1971   "pcmpeqb   %%xmm5,%%xmm5                                     \n" \
1972   "punpcklbw %%xmm2,%%xmm1                                     \n" \
1973   "punpcklbw %%xmm0,%%xmm5                                     \n" \
1974   "movdqa    %%xmm5,%%xmm0                                     \n" \
1975   "punpcklwd %%xmm1,%%xmm5                                     \n" \
1976   "punpckhwd %%xmm1,%%xmm0                                     \n" \
1977   "movdqu    %%xmm5,(%[dst_rgba])                              \n" \
1978   "movdqu    %%xmm0,0x10(%[dst_rgba])                          \n" \
1979   "lea       0x20(%[dst_rgba]),%[dst_rgba]                     \n"
1980 
1981 // Store 8 AR30 values.
1982 #define STOREAR30                                                  \
1983   "psraw      $0x4,%%xmm0                                      \n" \
1984   "psraw      $0x4,%%xmm1                                      \n" \
1985   "psraw      $0x4,%%xmm2                                      \n" \
1986   "pminsw     %%xmm7,%%xmm0                                    \n" \
1987   "pminsw     %%xmm7,%%xmm1                                    \n" \
1988   "pminsw     %%xmm7,%%xmm2                                    \n" \
1989   "pmaxsw     %%xmm6,%%xmm0                                    \n" \
1990   "pmaxsw     %%xmm6,%%xmm1                                    \n" \
1991   "pmaxsw     %%xmm6,%%xmm2                                    \n" \
1992   "psllw      $0x4,%%xmm2                                      \n" \
1993   "movdqa     %%xmm0,%%xmm3                                    \n" \
1994   "punpcklwd  %%xmm2,%%xmm0                                    \n" \
1995   "punpckhwd  %%xmm2,%%xmm3                                    \n" \
1996   "movdqa     %%xmm1,%%xmm2                                    \n" \
1997   "punpcklwd  %%xmm5,%%xmm1                                    \n" \
1998   "punpckhwd  %%xmm5,%%xmm2                                    \n" \
1999   "pslld      $0xa,%%xmm1                                      \n" \
2000   "pslld      $0xa,%%xmm2                                      \n" \
2001   "por        %%xmm1,%%xmm0                                    \n" \
2002   "por        %%xmm2,%%xmm3                                    \n" \
2003   "movdqu     %%xmm0,(%[dst_ar30])                             \n" \
2004   "movdqu     %%xmm3,0x10(%[dst_ar30])                         \n" \
2005   "lea        0x20(%[dst_ar30]), %[dst_ar30]                   \n"
2006 
2007 void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
2008                                 const uint8_t* u_buf,
2009                                 const uint8_t* v_buf,
2010                                 uint8_t* dst_argb,
2011                                 const struct YuvConstants* yuvconstants,
2012                                 int width) {
2013   asm volatile (
2014     YUVTORGB_SETUP(yuvconstants)
2015     "sub       %[u_buf],%[v_buf]               \n"
2016     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2017 
2018     LABELALIGN
2019     "1:                                        \n"
2020     READYUV444
2021     YUVTORGB(yuvconstants)
2022     STOREARGB
2023     "sub       $0x8,%[width]                   \n"
2024     "jg        1b                              \n"
2025   : [y_buf]"+r"(y_buf),    // %[y_buf]
2026     [u_buf]"+r"(u_buf),    // %[u_buf]
2027     [v_buf]"+r"(v_buf),    // %[v_buf]
2028     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2029     [width]"+rm"(width)    // %[width]
2030   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2031   : "memory", "cc", YUVTORGB_REGS
2032     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2033   );
2034 }
2035 
2036 void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
2037                                  const uint8_t* u_buf,
2038                                  const uint8_t* v_buf,
2039                                  uint8_t* dst_rgb24,
2040                                  const struct YuvConstants* yuvconstants,
2041                                  int width) {
2042   asm volatile (
2043     YUVTORGB_SETUP(yuvconstants)
2044     "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
2045     "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
2046     "sub       %[u_buf],%[v_buf]               \n"
2047 
2048     LABELALIGN
2049     "1:                                        \n"
2050     READYUV422
2051     YUVTORGB(yuvconstants)
2052     "punpcklbw %%xmm1,%%xmm0                   \n"
2053     "punpcklbw %%xmm2,%%xmm2                   \n"
2054     "movdqa    %%xmm0,%%xmm1                   \n"
2055     "punpcklwd %%xmm2,%%xmm0                   \n"
2056     "punpckhwd %%xmm2,%%xmm1                   \n"
2057     "pshufb    %%xmm5,%%xmm0                   \n"
2058     "pshufb    %%xmm6,%%xmm1                   \n"
2059     "palignr   $0xc,%%xmm0,%%xmm1              \n"
2060     "movq      %%xmm0,(%[dst_rgb24])           \n"
2061     "movdqu    %%xmm1,0x8(%[dst_rgb24])        \n"
2062     "lea       0x18(%[dst_rgb24]),%[dst_rgb24] \n"
2063     "subl      $0x8,%[width]                   \n"
2064     "jg        1b                              \n"
2065   : [y_buf]"+r"(y_buf),    // %[y_buf]
2066     [u_buf]"+r"(u_buf),    // %[u_buf]
2067     [v_buf]"+r"(v_buf),    // %[v_buf]
2068     [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
2069 #if defined(__i386__)
2070     [width]"+m"(width)     // %[width]
2071 #else
2072     [width]"+rm"(width)    // %[width]
2073 #endif
2074   : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
2075     [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
2076     [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
2077   : "memory", "cc", YUVTORGB_REGS
2078     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2079   );
2080 }
2081 
2082 void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
2083                                 const uint8_t* u_buf,
2084                                 const uint8_t* v_buf,
2085                                 uint8_t* dst_argb,
2086                                 const struct YuvConstants* yuvconstants,
2087                                 int width) {
2088   asm volatile (
2089     YUVTORGB_SETUP(yuvconstants)
2090     "sub       %[u_buf],%[v_buf]               \n"
2091     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2092 
2093     LABELALIGN
2094     "1:                                        \n"
2095     READYUV422
2096     YUVTORGB(yuvconstants)
2097     STOREARGB
2098     "sub       $0x8,%[width]                   \n"
2099     "jg        1b                              \n"
2100   : [y_buf]"+r"(y_buf),    // %[y_buf]
2101     [u_buf]"+r"(u_buf),    // %[u_buf]
2102     [v_buf]"+r"(v_buf),    // %[v_buf]
2103     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2104     [width]"+rm"(width)    // %[width]
2105   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2106   : "memory", "cc", YUVTORGB_REGS
2107     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2108   );
2109 }
2110 
2111 void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
2112                                 const uint8_t* u_buf,
2113                                 const uint8_t* v_buf,
2114                                 uint8_t* dst_ar30,
2115                                 const struct YuvConstants* yuvconstants,
2116                                 int width) {
2117   asm volatile (
2118     YUVTORGB_SETUP(yuvconstants)
2119     "sub       %[u_buf],%[v_buf]               \n"
2120     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // AR30 constants
2121     "psrlw     $14,%%xmm5                      \n"
2122     "psllw     $4,%%xmm5                       \n"  // 2 alpha bits
2123     "pxor      %%xmm6,%%xmm6                   \n"
2124     "pcmpeqb   %%xmm7,%%xmm7                   \n"  // 0 for min
2125     "psrlw     $6,%%xmm7                       \n"  // 1023 for max
2126 
2127     LABELALIGN
2128     "1:                                        \n"
2129     READYUV422
2130     YUVTORGB16(yuvconstants)
2131     STOREAR30
2132     "sub       $0x8,%[width]                   \n"
2133     "jg        1b                              \n"
2134   : [y_buf]"+r"(y_buf),    // %[y_buf]
2135     [u_buf]"+r"(u_buf),    // %[u_buf]
2136     [v_buf]"+r"(v_buf),    // %[v_buf]
2137     [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
2138     [width]"+rm"(width)    // %[width]
2139   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2140   : "memory", "cc", YUVTORGB_REGS
2141     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2142   );
2143 }
2144 
2145 // 10 bit YUV to ARGB
2146 void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
2147                                 const uint16_t* u_buf,
2148                                 const uint16_t* v_buf,
2149                                 uint8_t* dst_argb,
2150                                 const struct YuvConstants* yuvconstants,
2151                                 int width) {
2152   asm volatile (
2153     YUVTORGB_SETUP(yuvconstants)
2154     "sub       %[u_buf],%[v_buf]               \n"
2155     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2156 
2157     LABELALIGN
2158     "1:                                        \n"
2159     READYUV210
2160     YUVTORGB(yuvconstants)
2161     STOREARGB
2162     "sub       $0x8,%[width]                   \n"
2163     "jg        1b                              \n"
2164   : [y_buf]"+r"(y_buf),    // %[y_buf]
2165     [u_buf]"+r"(u_buf),    // %[u_buf]
2166     [v_buf]"+r"(v_buf),    // %[v_buf]
2167     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2168     [width]"+rm"(width)    // %[width]
2169   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2170   : "memory", "cc", YUVTORGB_REGS
2171     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2172   );
2173 }
2174 
2175 // 10 bit YUV to AR30
2176 void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
2177                                 const uint16_t* u_buf,
2178                                 const uint16_t* v_buf,
2179                                 uint8_t* dst_ar30,
2180                                 const struct YuvConstants* yuvconstants,
2181                                 int width) {
2182   asm volatile (
2183     YUVTORGB_SETUP(yuvconstants)
2184     "sub       %[u_buf],%[v_buf]               \n"
2185     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2186     "psrlw     $14,%%xmm5                      \n"
2187     "psllw     $4,%%xmm5                       \n"  // 2 alpha bits
2188     "pxor      %%xmm6,%%xmm6                   \n"  // 0 for min
2189     "pcmpeqb   %%xmm7,%%xmm7                   \n"  // 1023 for max
2190     "psrlw     $6,%%xmm7                       \n"
2191 
2192     LABELALIGN
2193     "1:                                        \n"
2194     READYUV210
2195     YUVTORGB16(yuvconstants)
2196     STOREAR30
2197     "sub       $0x8,%[width]                   \n"
2198     "jg        1b                              \n"
2199   : [y_buf]"+r"(y_buf),    // %[y_buf]
2200     [u_buf]"+r"(u_buf),    // %[u_buf]
2201     [v_buf]"+r"(v_buf),    // %[v_buf]
2202     [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
2203     [width]"+rm"(width)    // %[width]
2204   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2205   : "memory", "cc", YUVTORGB_REGS
2206     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2207   );
2208 }
2209 
2210 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
2211 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
2212                                      const uint8_t* u_buf,
2213                                      const uint8_t* v_buf,
2214                                      const uint8_t* a_buf,
2215                                      uint8_t* dst_argb,
2216                                      const struct YuvConstants* yuvconstants,
2217                                      int width) {
2218   // clang-format off
2219   asm volatile (
2220     YUVTORGB_SETUP(yuvconstants)
2221     "sub       %[u_buf],%[v_buf]               \n"
2222 
2223     LABELALIGN
2224     "1:                                        \n"
2225     READYUVA422
2226     YUVTORGB(yuvconstants)
2227     STOREARGB
2228     "subl      $0x8,%[width]                   \n"
2229     "jg        1b                              \n"
2230   : [y_buf]"+r"(y_buf),    // %[y_buf]
2231     [u_buf]"+r"(u_buf),    // %[u_buf]
2232     [v_buf]"+r"(v_buf),    // %[v_buf]
2233     [a_buf]"+r"(a_buf),    // %[a_buf]
2234     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2235 #if defined(__i386__)
2236     [width]"+m"(width)     // %[width]
2237 #else
2238     [width]"+rm"(width)    // %[width]
2239 #endif
2240   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2241   : "memory", "cc", YUVTORGB_REGS
2242     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2243   );
2244   // clang-format on
2245 }
2246 #endif  // HAS_I422ALPHATOARGBROW_SSSE3
2247 
2248 void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
2249                                 const uint8_t* uv_buf,
2250                                 uint8_t* dst_argb,
2251                                 const struct YuvConstants* yuvconstants,
2252                                 int width) {
2253   // clang-format off
2254   asm volatile (
2255     YUVTORGB_SETUP(yuvconstants)
2256     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2257 
2258     LABELALIGN
2259     "1:                                        \n"
2260     READNV12
2261     YUVTORGB(yuvconstants)
2262     STOREARGB
2263     "sub       $0x8,%[width]                   \n"
2264     "jg        1b                              \n"
2265   : [y_buf]"+r"(y_buf),    // %[y_buf]
2266     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
2267     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2268     [width]"+rm"(width)    // %[width]
2269   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2270     : "memory", "cc", YUVTORGB_REGS
2271       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2272   );
2273   // clang-format on
2274 }
2275 
2276 void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
2277                                 const uint8_t* vu_buf,
2278                                 uint8_t* dst_argb,
2279                                 const struct YuvConstants* yuvconstants,
2280                                 int width) {
2281   // clang-format off
2282   asm volatile (
2283     YUVTORGB_SETUP(yuvconstants)
2284     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2285 
2286     LABELALIGN
2287     "1:                                        \n"
2288     READNV21
2289     YUVTORGB(yuvconstants)
2290     STOREARGB
2291     "sub       $0x8,%[width]                   \n"
2292     "jg        1b                              \n"
2293   : [y_buf]"+r"(y_buf),    // %[y_buf]
2294     [vu_buf]"+r"(vu_buf),    // %[vu_buf]
2295     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2296     [width]"+rm"(width)    // %[width]
2297   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2298     [kShuffleNV21]"m"(kShuffleNV21)
2299     : "memory", "cc", YUVTORGB_REGS
2300       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2301   );
2302   // clang-format on
2303 }
2304 
2305 void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
2306                                 uint8_t* dst_argb,
2307                                 const struct YuvConstants* yuvconstants,
2308                                 int width) {
2309   // clang-format off
2310   asm volatile (
2311     YUVTORGB_SETUP(yuvconstants)
2312     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2313 
2314     LABELALIGN
2315     "1:                                        \n"
2316     READYUY2
2317     YUVTORGB(yuvconstants)
2318     STOREARGB
2319     "sub       $0x8,%[width]                   \n"
2320     "jg        1b                              \n"
2321   : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
2322     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2323     [width]"+rm"(width)    // %[width]
2324   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2325     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2326     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2327     : "memory", "cc", YUVTORGB_REGS
2328       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2329   );
2330   // clang-format on
2331 }
2332 
2333 void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
2334                                 uint8_t* dst_argb,
2335                                 const struct YuvConstants* yuvconstants,
2336                                 int width) {
2337   // clang-format off
2338   asm volatile (
2339     YUVTORGB_SETUP(yuvconstants)
2340     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2341 
2342     LABELALIGN
2343     "1:                                        \n"
2344     READUYVY
2345     YUVTORGB(yuvconstants)
2346     STOREARGB
2347     "sub       $0x8,%[width]                   \n"
2348     "jg        1b                              \n"
2349   : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
2350     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2351     [width]"+rm"(width)    // %[width]
2352   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2353     [kShuffleUYVYY]"m"(kShuffleUYVYY),
2354     [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
2355     : "memory", "cc", YUVTORGB_REGS
2356       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2357   );
2358   // clang-format on
2359 }
2360 
2361 void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
2362                                 const uint8_t* u_buf,
2363                                 const uint8_t* v_buf,
2364                                 uint8_t* dst_rgba,
2365                                 const struct YuvConstants* yuvconstants,
2366                                 int width) {
2367   asm volatile (
2368     YUVTORGB_SETUP(yuvconstants)
2369     "sub       %[u_buf],%[v_buf]               \n"
2370     "pcmpeqb   %%xmm5,%%xmm5                   \n"
2371 
2372     LABELALIGN
2373     "1:                                        \n"
2374     READYUV422
2375     YUVTORGB(yuvconstants)
2376     STORERGBA
2377     "sub       $0x8,%[width]                   \n"
2378     "jg        1b                              \n"
2379   : [y_buf]"+r"(y_buf),    // %[y_buf]
2380     [u_buf]"+r"(u_buf),    // %[u_buf]
2381     [v_buf]"+r"(v_buf),    // %[v_buf]
2382     [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
2383     [width]"+rm"(width)    // %[width]
2384   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2385   : "memory", "cc", YUVTORGB_REGS
2386     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2387   );
2388 }
2389 
2390 #endif  // HAS_I422TOARGBROW_SSSE3
2391 
2392 // Read 16 UV from 444
2393 #define READYUV444_AVX2                                               \
2394   "vmovdqu    (%[u_buf]),%%xmm0                                   \n" \
2395   "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
2396   "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
2397   "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
2398   "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
2399   "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
2400   "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
2401   "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
2402   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
2403   "lea        0x10(%[y_buf]),%[y_buf]                             \n"
2404 
2405 // Read 8 UV from 422, upsample to 16 UV.
2406 #define READYUV422_AVX2                                               \
2407   "vmovq      (%[u_buf]),%%xmm0                                   \n" \
2408   "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
2409   "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
2410   "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
2411   "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
2412   "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
2413   "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
2414   "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
2415   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
2416   "lea        0x10(%[y_buf]),%[y_buf]                             \n"
2417 
2418 // Read 8 UV from 210 10 bit, upsample to 16 UV
2419 // TODO(fbarchard): Consider vshufb to replace pack/unpack
2420 // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
2421 #define READYUV210_AVX2                                            \
2422   "vmovdqu    (%[u_buf]),%%xmm0                                \n" \
2423   "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
2424   "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
2425   "vpermq     $0xd8,%%ymm0,%%ymm0                              \n" \
2426   "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
2427   "vpunpcklwd %%ymm1,%%ymm0,%%ymm0                             \n" \
2428   "vpsraw     $0x2,%%ymm0,%%ymm0                               \n" \
2429   "vpackuswb  %%ymm0,%%ymm0,%%ymm0                             \n" \
2430   "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                             \n" \
2431   "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
2432   "vpsllw     $0x6,%%ymm4,%%ymm4                               \n" \
2433   "lea        0x20(%[y_buf]),%[y_buf]                          \n"
2434 
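// For reference, a scalar sketch of the 10 bit read above (illustrative
// helpers, not libyuv API): U/V drop their 2 extra bits so the 8 bit
// YUVTORGB math can be reused, while Y is left-shifted into a 16 bit
// msb-justified value for the vpmulhuw-based luma scale.
static inline uint8_t Scalar210ToUV8(uint16_t uv10) {
  return (uint8_t)(uv10 >> 2);  // vpsraw $0x2 + vpackuswb: 10 bit -> 8 bit
}
static inline uint16_t Scalar210ToY16(uint16_t y10) {
  return (uint16_t)(y10 << 6);  // vpsllw $0x6: 10 bit -> 16 bit
}
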
2435 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
2436 #define READYUVA422_AVX2                                              \
2437   "vmovq      (%[u_buf]),%%xmm0                                   \n" \
2438   "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
2439   "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
2440   "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
2441   "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
2442   "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
2443   "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
2444   "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
2445   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
2446   "lea        0x10(%[y_buf]),%[y_buf]                             \n" \
2447   "vmovdqu    (%[a_buf]),%%xmm5                                   \n" \
2448   "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n" \
2449   "lea        0x10(%[a_buf]),%[a_buf]                             \n"
2450 
2451 // Read 8 UV from NV12, upsample to 16 UV.
2452 #define READNV12_AVX2                                                 \
2453   "vmovdqu    (%[uv_buf]),%%xmm0                                  \n" \
2454   "lea        0x10(%[uv_buf]),%[uv_buf]                           \n" \
2455   "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
2456   "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
2457   "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
2458   "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
2459   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
2460   "lea        0x10(%[y_buf]),%[y_buf]                             \n"
2461 
2462 // Read 8 VU from NV21, upsample to 16 UV.
2463 #define READNV21_AVX2                                                 \
2464   "vmovdqu    (%[vu_buf]),%%xmm0                                  \n" \
2465   "lea        0x10(%[vu_buf]),%[vu_buf]                           \n" \
2466   "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
2467   "vpshufb    %[kShuffleNV21], %%ymm0, %%ymm0                     \n" \
2468   "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
2469   "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
2470   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
2471   "lea        0x10(%[y_buf]),%[y_buf]                             \n"
2472 
2473 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
2474 #define READYUY2_AVX2                                                 \
2475   "vmovdqu    (%[yuy2_buf]),%%ymm4                                \n" \
2476   "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n" \
2477   "vmovdqu    (%[yuy2_buf]),%%ymm0                                \n" \
2478   "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n" \
2479   "lea        0x20(%[yuy2_buf]),%[yuy2_buf]                       \n"
2480 
2481 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2482 #define READUYVY_AVX2                                                 \
2483   "vmovdqu    (%[uyvy_buf]),%%ymm4                                \n" \
2484   "vpshufb    %[kShuffleUYVYY], %%ymm4, %%ymm4                    \n" \
2485   "vmovdqu    (%[uyvy_buf]),%%ymm0                                \n" \
2486   "vpshufb    %[kShuffleUYVYUV], %%ymm0, %%ymm0                   \n" \
2487   "lea        0x20(%[uyvy_buf]),%[uyvy_buf]                       \n"
2488 
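// The shuffle pairs above split a packed 4:2:2 load into a Y vector and an
// interleaved UV vector. A scalar sketch of the byte layout they assume
// (illustrative, not libyuv API); UYVY is the same with U,Y0,V,Y1 order:
static inline void ScalarUnpackYUY2(const uint8_t yuy2[4],
                                    uint8_t y[2],
                                    uint8_t* u,
                                    uint8_t* v) {
  y[0] = yuy2[0];  // Y0
  *u = yuy2[1];    // U, shared by both pixels
  y[1] = yuy2[2];  // Y1
  *v = yuy2[3];    // V, shared by both pixels
}
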
2489 #if defined(__x86_64__)
2490 #define YUVTORGB_SETUP_AVX2(yuvconstants)                            \
2491   "vmovdqa     (%[yuvconstants]),%%ymm8                          \n" \
2492   "vmovdqa     32(%[yuvconstants]),%%ymm9                        \n" \
2493   "vmovdqa     64(%[yuvconstants]),%%ymm10                       \n" \
2494   "vmovdqa     96(%[yuvconstants]),%%ymm11                       \n" \
2495   "vmovdqa     128(%[yuvconstants]),%%ymm12                      \n" \
2496   "vmovdqa     160(%[yuvconstants]),%%ymm13                      \n" \
2497   "vmovdqa     192(%[yuvconstants]),%%ymm14                      \n"
2498 
2499 #define YUVTORGB16_AVX2(yuvconstants)                                 \
2500   "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n" \
2501   "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n" \
2502   "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n" \
2503   "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n" \
2504   "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n" \
2505   "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n" \
2506   "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n" \
2507   "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
2508   "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
2509   "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
2510 
2511 #define YUVTORGB_REGS_AVX2 \
2512   "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
2513 
2514 #else  // Convert 16 pixels: 16 UV and 16 Y.
2515 
2516 #define YUVTORGB_SETUP_AVX2(yuvconstants)
2517 #define YUVTORGB16_AVX2(yuvconstants)                                 \
2518   "vpmaddubsw  64(%[yuvconstants]),%%ymm0,%%ymm2                  \n" \
2519   "vpmaddubsw  32(%[yuvconstants]),%%ymm0,%%ymm1                  \n" \
2520   "vpmaddubsw  (%[yuvconstants]),%%ymm0,%%ymm0                    \n" \
2521   "vmovdqu     160(%[yuvconstants]),%%ymm3                        \n" \
2522   "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n" \
2523   "vmovdqu     128(%[yuvconstants]),%%ymm3                        \n" \
2524   "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n" \
2525   "vmovdqu     96(%[yuvconstants]),%%ymm3                         \n" \
2526   "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n" \
2527   "vpmulhuw    192(%[yuvconstants]),%%ymm4,%%ymm4                 \n" \
2528   "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
2529   "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
2530   "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
2531 #define YUVTORGB_REGS_AVX2
2532 #endif
2533 
2534 #define YUVTORGB_AVX2(yuvconstants)                                   \
2535   YUVTORGB16_AVX2(yuvconstants)                                       \
2536   "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
2537   "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
2538   "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
2539   "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
2540   "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
2541   "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
2542 
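// Scalar shape of the YUVTORGB math above (a hedged sketch; it mirrors the
// constant order this file loads - UV coefficients, per-channel biases, then
// the Y gain - but ignores the intermediate int16 saturation of vpmaddubsw):
static inline uint8_t ScalarYuvToRgbChannel(uint8_t u, uint8_t v, uint8_t y,
                                            int8_t ucoeff, int8_t vcoeff,
                                            int16_t bias, uint16_t ygain) {
  int32_t uv = u * ucoeff + v * vcoeff;                    // vpmaddubsw
  int32_t yterm = ((uint32_t)(y * 0x101u) * ygain) >> 16;  // vpmulhuw
  int32_t c = (bias - uv + yterm) >> 6;  // vpsubw, vpaddsw, vpsraw $0x6
  return c < 0 ? 0 : c > 255 ? 255 : (uint8_t)c;           // vpackuswb
}
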
2543 // Store 16 ARGB values.
2544 #define STOREARGB_AVX2                                                \
2545   "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
2546   "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
2547   "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n" \
2548   "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
2549   "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n" \
2550   "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n" \
2551   "vmovdqu    %%ymm1,(%[dst_argb])                                \n" \
2552   "vmovdqu    %%ymm0,0x20(%[dst_argb])                            \n" \
2553   "lea       0x40(%[dst_argb]), %[dst_argb]                       \n"
2554 
2555 // Store 16 AR30 values.
2556 #define STOREAR30_AVX2                                                \
2557   "vpsraw     $0x4,%%ymm0,%%ymm0                                  \n" \
2558   "vpsraw     $0x4,%%ymm1,%%ymm1                                  \n" \
2559   "vpsraw     $0x4,%%ymm2,%%ymm2                                  \n" \
2560   "vpminsw    %%ymm7,%%ymm0,%%ymm0                                \n" \
2561   "vpminsw    %%ymm7,%%ymm1,%%ymm1                                \n" \
2562   "vpminsw    %%ymm7,%%ymm2,%%ymm2                                \n" \
2563   "vpmaxsw    %%ymm6,%%ymm0,%%ymm0                                \n" \
2564   "vpmaxsw    %%ymm6,%%ymm1,%%ymm1                                \n" \
2565   "vpmaxsw    %%ymm6,%%ymm2,%%ymm2                                \n" \
2566   "vpsllw     $0x4,%%ymm2,%%ymm2                                  \n" \
2567   "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
2568   "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
2569   "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
2570   "vpunpckhwd %%ymm2,%%ymm0,%%ymm3                                \n" \
2571   "vpunpcklwd %%ymm2,%%ymm0,%%ymm0                                \n" \
2572   "vpunpckhwd %%ymm5,%%ymm1,%%ymm2                                \n" \
2573   "vpunpcklwd %%ymm5,%%ymm1,%%ymm1                                \n" \
2574   "vpslld     $0xa,%%ymm1,%%ymm1                                  \n" \
2575   "vpslld     $0xa,%%ymm2,%%ymm2                                  \n" \
2576   "vpor       %%ymm1,%%ymm0,%%ymm0                                \n" \
2577   "vpor       %%ymm2,%%ymm3,%%ymm3                                \n" \
2578   "vmovdqu    %%ymm0,(%[dst_ar30])                                \n" \
2579   "vmovdqu    %%ymm3,0x20(%[dst_ar30])                            \n" \
2580   "lea        0x40(%[dst_ar30]), %[dst_ar30]                      \n"
2581 
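// Scalar form of the AR30 packing above (illustrative, not libyuv API):
// AR30 is little-endian 2:10:10:10 ARGB, so blue sits in the low ten bits
// and the two opaque alpha bits (built in ymm5) land at the top.
static inline uint32_t ScalarPackAR30(uint16_t b10, uint16_t g10,
                                      uint16_t r10) {
  return 0xc0000000u |            // 2 bit alpha, fully opaque
         ((uint32_t)r10 << 20) |  // 10 bit red
         ((uint32_t)g10 << 10) |  // 10 bit green
         b10;                     // 10 bit blue
}
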
2582 #ifdef HAS_I444TOARGBROW_AVX2
2583 // 16 pixels
2584 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2585 void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
2586                                const uint8_t* u_buf,
2587                                const uint8_t* v_buf,
2588                                uint8_t* dst_argb,
2589                                const struct YuvConstants* yuvconstants,
2590                                int width) {
2591   asm volatile (
2592     YUVTORGB_SETUP_AVX2(yuvconstants)
2593     "sub       %[u_buf],%[v_buf]               \n"
2594     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
2595 
2596     LABELALIGN
2597     "1:                                        \n"
2598     READYUV444_AVX2
2599     YUVTORGB_AVX2(yuvconstants)
2600     STOREARGB_AVX2
2601     "sub       $0x10,%[width]                  \n"
2602     "jg        1b                              \n"
2603     "vzeroupper                                \n"
2604   : [y_buf]"+r"(y_buf),    // %[y_buf]
2605     [u_buf]"+r"(u_buf),    // %[u_buf]
2606     [v_buf]"+r"(v_buf),    // %[v_buf]
2607     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2608     [width]"+rm"(width)    // %[width]
2609   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2610   : "memory", "cc", YUVTORGB_REGS_AVX2
2611     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2612   );
2613 }
2614 #endif  // HAS_I444TOARGBROW_AVX2
2615 
2616 #if defined(HAS_I422TOARGBROW_AVX2)
2617 // 16 pixels
2618 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2619 void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
2620                                const uint8_t* u_buf,
2621                                const uint8_t* v_buf,
2622                                uint8_t* dst_argb,
2623                                const struct YuvConstants* yuvconstants,
2624                                int width) {
2625   asm volatile (
2626     YUVTORGB_SETUP_AVX2(yuvconstants)
2627     "sub       %[u_buf],%[v_buf]               \n"
2628     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
2629 
2630     LABELALIGN
2631     "1:                                        \n"
2632     READYUV422_AVX2
2633     YUVTORGB_AVX2(yuvconstants)
2634     STOREARGB_AVX2
2635     "sub       $0x10,%[width]                  \n"
2636     "jg        1b                              \n"
2637 
2638     "vzeroupper                                \n"
2639   : [y_buf]"+r"(y_buf),    // %[y_buf]
2640     [u_buf]"+r"(u_buf),    // %[u_buf]
2641     [v_buf]"+r"(v_buf),    // %[v_buf]
2642     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2643     [width]"+rm"(width)    // %[width]
2644   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2645   : "memory", "cc", YUVTORGB_REGS_AVX2
2646     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2647   );
2648 }
2649 #endif  // HAS_I422TOARGBROW_AVX2
2650 
2651 #if defined(HAS_I422TOAR30ROW_AVX2)
2652 // 16 pixels
2653 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
2654 void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
2655                                const uint8_t* u_buf,
2656                                const uint8_t* v_buf,
2657                                uint8_t* dst_ar30,
2658                                const struct YuvConstants* yuvconstants,
2659                                int width) {
2660   asm volatile (
2661     YUVTORGB_SETUP_AVX2(yuvconstants)
2662     "sub       %[u_buf],%[v_buf]               \n"
2663     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // AR30 constants
2664     "vpsrlw    $14,%%ymm5,%%ymm5               \n"
2665     "vpsllw    $4,%%ymm5,%%ymm5                \n"  // 2 alpha bits
2666     "vpxor     %%ymm6,%%ymm6,%%ymm6            \n"  // 0 for min
2667     "vpcmpeqb  %%ymm7,%%ymm7,%%ymm7            \n"  // 1023 for max
2668     "vpsrlw    $6,%%ymm7,%%ymm7                \n"
2669 
2670     LABELALIGN
2671     "1:                                        \n"
2672     READYUV422_AVX2
2673     YUVTORGB16_AVX2(yuvconstants)
2674     STOREAR30_AVX2
2675     "sub       $0x10,%[width]                  \n"
2676     "jg        1b                              \n"
2677 
2678     "vzeroupper                                \n"
2679   : [y_buf]"+r"(y_buf),    // %[y_buf]
2680     [u_buf]"+r"(u_buf),    // %[u_buf]
2681     [v_buf]"+r"(v_buf),    // %[v_buf]
2682     [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
2683     [width]"+rm"(width)    // %[width]
2684   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2685   : "memory", "cc", YUVTORGB_REGS_AVX2
2686     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2687   );
2688 }
2689 #endif  // HAS_I422TOAR30ROW_AVX2
2690 
2691 #if defined(HAS_I210TOARGBROW_AVX2)
2692 // 16 pixels
2693 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2694 void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
2695                                const uint16_t* u_buf,
2696                                const uint16_t* v_buf,
2697                                uint8_t* dst_argb,
2698                                const struct YuvConstants* yuvconstants,
2699                                int width) {
2700   asm volatile (
2701     YUVTORGB_SETUP_AVX2(yuvconstants)
2702     "sub       %[u_buf],%[v_buf]               \n"
2703     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
2704 
2705     LABELALIGN
2706     "1:                                        \n"
2707     READYUV210_AVX2
2708     YUVTORGB_AVX2(yuvconstants)
2709     STOREARGB_AVX2
2710     "sub       $0x10,%[width]                  \n"
2711     "jg        1b                              \n"
2712 
2713     "vzeroupper                                \n"
2714   : [y_buf]"+r"(y_buf),    // %[y_buf]
2715     [u_buf]"+r"(u_buf),    // %[u_buf]
2716     [v_buf]"+r"(v_buf),    // %[v_buf]
2717     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2718     [width]"+rm"(width)    // %[width]
2719   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2720   : "memory", "cc", YUVTORGB_REGS_AVX2
2721     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2722   );
2723 }
2724 #endif  // HAS_I210TOARGBROW_AVX2
2725 
2726 #if defined(HAS_I210TOAR30ROW_AVX2)
2727 // 16 pixels
2728 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
2729 void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
2730                                const uint16_t* u_buf,
2731                                const uint16_t* v_buf,
2732                                uint8_t* dst_ar30,
2733                                const struct YuvConstants* yuvconstants,
2734                                int width) {
2735   asm volatile (
2736     YUVTORGB_SETUP_AVX2(yuvconstants)
2737     "sub       %[u_buf],%[v_buf]               \n"
2738     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // AR30 constants
2739     "vpsrlw    $14,%%ymm5,%%ymm5               \n"
2740     "vpsllw    $4,%%ymm5,%%ymm5                \n"  // 2 alpha bits
2741     "vpxor     %%ymm6,%%ymm6,%%ymm6            \n"  // 0 for min
2742     "vpcmpeqb  %%ymm7,%%ymm7,%%ymm7            \n"  // 1023 for max
2743     "vpsrlw    $6,%%ymm7,%%ymm7                \n"
2744 
2745     LABELALIGN
2746     "1:                                        \n"
2747     READYUV210_AVX2
2748     YUVTORGB16_AVX2(yuvconstants)
2749     STOREAR30_AVX2
2750     "sub       $0x10,%[width]                  \n"
2751     "jg        1b                              \n"
2752 
2753     "vzeroupper                                \n"
2754   : [y_buf]"+r"(y_buf),    // %[y_buf]
2755     [u_buf]"+r"(u_buf),    // %[u_buf]
2756     [v_buf]"+r"(v_buf),    // %[v_buf]
2757     [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
2758     [width]"+rm"(width)    // %[width]
2759   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2760   : "memory", "cc", YUVTORGB_REGS_AVX2
2761     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2762   );
2763 }
2764 #endif  // HAS_I210TOAR30ROW_AVX2
2765 
2766 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
2767 // 16 pixels
2768 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2769 void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
2770                                     const uint8_t* u_buf,
2771                                     const uint8_t* v_buf,
2772                                     const uint8_t* a_buf,
2773                                     uint8_t* dst_argb,
2774                                     const struct YuvConstants* yuvconstants,
2775                                     int width) {
2776   // clang-format off
2777   asm volatile (
2778     YUVTORGB_SETUP_AVX2(yuvconstants)
2779     "sub       %[u_buf],%[v_buf]               \n"
2780 
2781     LABELALIGN
2782     "1:                                        \n"
2783     READYUVA422_AVX2
2784     YUVTORGB_AVX2(yuvconstants)
2785     STOREARGB_AVX2
2786     "subl      $0x10,%[width]                  \n"
2787     "jg        1b                              \n"
2788     "vzeroupper                                \n"
2789   : [y_buf]"+r"(y_buf),    // %[y_buf]
2790     [u_buf]"+r"(u_buf),    // %[u_buf]
2791     [v_buf]"+r"(v_buf),    // %[v_buf]
2792     [a_buf]"+r"(a_buf),    // %[a_buf]
2793     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2794 #if defined(__i386__)
2795     [width]"+m"(width)     // %[width]
2796 #else
2797     [width]"+rm"(width)    // %[width]
2798 #endif
2799   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2800   : "memory", "cc", YUVTORGB_REGS_AVX2
2801     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2802   );
2803   // clang-format on
2804 }
2805 #endif  // HAS_I422ALPHATOARGBROW_AVX2
2806 
2807 #if defined(HAS_I422TORGBAROW_AVX2)
2808 // 16 pixels
2809 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2810 void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
2811                                const uint8_t* u_buf,
2812                                const uint8_t* v_buf,
2813                                uint8_t* dst_argb,
2814                                const struct YuvConstants* yuvconstants,
2815                                int width) {
2816   asm volatile (
2817     YUVTORGB_SETUP_AVX2(yuvconstants)
2818     "sub       %[u_buf],%[v_buf]               \n"
2819     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2820 
2821     LABELALIGN
2822     "1:                                        \n"
2823     READYUV422_AVX2
2824     YUVTORGB_AVX2(yuvconstants)
2825 
2826     // Step 3: Weave into RGBA
2827     "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
2828     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2829     "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
2830     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2831     "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
2832     "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
2833     "vmovdqu    %%ymm0,(%[dst_argb])           \n"
2834     "vmovdqu    %%ymm1,0x20(%[dst_argb])       \n"
2835     "lea        0x40(%[dst_argb]),%[dst_argb]  \n"
2836     "sub        $0x10,%[width]                 \n"
2837     "jg         1b                             \n"
2838     "vzeroupper                                \n"
2839   : [y_buf]"+r"(y_buf),    // %[y_buf]
2840     [u_buf]"+r"(u_buf),    // %[u_buf]
2841     [v_buf]"+r"(v_buf),    // %[v_buf]
2842     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2843     [width]"+rm"(width)    // %[width]
2844   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2845   : "memory", "cc", YUVTORGB_REGS_AVX2
2846     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2847   );
2848 }
2849 #endif  // HAS_I422TORGBAROW_AVX2
2850 
2851 #if defined(HAS_NV12TOARGBROW_AVX2)
2852 // 16 pixels.
2853 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2854 void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
2855                                const uint8_t* uv_buf,
2856                                uint8_t* dst_argb,
2857                                const struct YuvConstants* yuvconstants,
2858                                int width) {
2859   // clang-format off
2860   asm volatile (
2861     YUVTORGB_SETUP_AVX2(yuvconstants)
2862     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2863 
2864     LABELALIGN
2865     "1:                                        \n"
2866     READNV12_AVX2
2867     YUVTORGB_AVX2(yuvconstants)
2868     STOREARGB_AVX2
2869     "sub       $0x10,%[width]                  \n"
2870     "jg        1b                              \n"
2871     "vzeroupper                                \n"
2872   : [y_buf]"+r"(y_buf),    // %[y_buf]
2873     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
2874     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2875     [width]"+rm"(width)    // %[width]
2876   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2877     : "memory", "cc", YUVTORGB_REGS_AVX2
2878     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2879   );
2880   // clang-format on
2881 }
2882 #endif  // HAS_NV12TOARGBROW_AVX2
2883 
2884 #if defined(HAS_NV21TOARGBROW_AVX2)
2885 // 16 pixels.
2886 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2887 void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
2888                                const uint8_t* vu_buf,
2889                                uint8_t* dst_argb,
2890                                const struct YuvConstants* yuvconstants,
2891                                int width) {
2892   // clang-format off
2893   asm volatile (
2894     YUVTORGB_SETUP_AVX2(yuvconstants)
2895     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2896 
2897     LABELALIGN
2898     "1:                                        \n"
2899     READNV21_AVX2
2900     YUVTORGB_AVX2(yuvconstants)
2901     STOREARGB_AVX2
2902     "sub       $0x10,%[width]                  \n"
2903     "jg        1b                              \n"
2904     "vzeroupper                                \n"
2905   : [y_buf]"+r"(y_buf),    // %[y_buf]
2906     [vu_buf]"+r"(vu_buf),    // %[vu_buf]
2907     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2908     [width]"+rm"(width)    // %[width]
2909   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2910     [kShuffleNV21]"m"(kShuffleNV21)
2911     : "memory", "cc", YUVTORGB_REGS_AVX2
2912       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2913   );
2914   // clang-format on
2915 }
2916 #endif  // HAS_NV21TOARGBROW_AVX2
2917 
2918 #if defined(HAS_YUY2TOARGBROW_AVX2)
2919 // 16 pixels.
2920 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2921 void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
2922                                uint8_t* dst_argb,
2923                                const struct YuvConstants* yuvconstants,
2924                                int width) {
2925   // clang-format off
2926   asm volatile (
2927     YUVTORGB_SETUP_AVX2(yuvconstants)
2928     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2929 
2930     LABELALIGN
2931     "1:                                        \n"
2932     READYUY2_AVX2
2933     YUVTORGB_AVX2(yuvconstants)
2934     STOREARGB_AVX2
2935     "sub       $0x10,%[width]                  \n"
2936     "jg        1b                              \n"
2937     "vzeroupper                                \n"
2938   : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
2939     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2940     [width]"+rm"(width)    // %[width]
2941   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2942     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2943     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2944     : "memory", "cc", YUVTORGB_REGS_AVX2
2945       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2946   );
2947   // clang-format on
2948 }
2949 #endif  // HAS_YUY2TOARGBROW_AVX2
2950 
2951 #if defined(HAS_UYVYTOARGBROW_AVX2)
2952 // 16 pixels.
2953 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2954 void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
2955                                uint8_t* dst_argb,
2956                                const struct YuvConstants* yuvconstants,
2957                                int width) {
2958   // clang-format off
2959   asm volatile (
2960     YUVTORGB_SETUP_AVX2(yuvconstants)
2961     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2962 
2963     LABELALIGN
2964     "1:                                        \n"
2965     READUYVY_AVX2
2966     YUVTORGB_AVX2(yuvconstants)
2967     STOREARGB_AVX2
2968     "sub       $0x10,%[width]                  \n"
2969     "jg        1b                              \n"
2970     "vzeroupper                                \n"
2971   : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
2972     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2973     [width]"+rm"(width)    // %[width]
2974   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2975     [kShuffleUYVYY]"m"(kShuffleUYVYY),
2976     [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
2977     : "memory", "cc", YUVTORGB_REGS_AVX2
2978       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2979   );
2980   // clang-format on
2981 }
2982 #endif  // HAS_UYVYTOARGBROW_AVX2
2983 
2984 #ifdef HAS_I400TOARGBROW_SSE2
2985 void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
2986   asm volatile(
2987       "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
2988       "movd      %%eax,%%xmm2                    \n"
2989       "pshufd    $0x0,%%xmm2,%%xmm2              \n"
2990       "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 *
2991                                                       // 16
2992       "movd      %%eax,%%xmm3                    \n"
2993       "pshufd    $0x0,%%xmm3,%%xmm3              \n"
2994       "pcmpeqb   %%xmm4,%%xmm4                   \n"
2995       "pslld     $0x18,%%xmm4                    \n"
2996 
2997       LABELALIGN
2998       "1:                                        \n"
2999       // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
3000       "movq      (%0),%%xmm0                     \n"
3001       "lea       0x8(%0),%0                      \n"
3002       "punpcklbw %%xmm0,%%xmm0                   \n"
3003       "pmulhuw   %%xmm2,%%xmm0                   \n"
3004       "psubusw   %%xmm3,%%xmm0                   \n"
3005       "psrlw     $6, %%xmm0                      \n"
3006       "packuswb  %%xmm0,%%xmm0                   \n"
3007 
3008       // Step 2: Weave into ARGB
3009       "punpcklbw %%xmm0,%%xmm0                   \n"
3010       "movdqa    %%xmm0,%%xmm1                   \n"
3011       "punpcklwd %%xmm0,%%xmm0                   \n"
3012       "punpckhwd %%xmm1,%%xmm1                   \n"
3013       "por       %%xmm4,%%xmm0                   \n"
3014       "por       %%xmm4,%%xmm1                   \n"
3015       "movdqu    %%xmm0,(%1)                     \n"
3016       "movdqu    %%xmm1,0x10(%1)                 \n"
3017       "lea       0x20(%1),%1                     \n"
3018 
3019       "sub       $0x8,%2                         \n"
3020       "jg        1b                              \n"
3021       : "+r"(y_buf),     // %0
3022         "+r"(dst_argb),  // %1
3023         "+rm"(width)     // %2
3024       :
3025       : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
3026 }
3027 #endif  // HAS_I400TOARGBROW_SSE2
3028 
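// Scalar sketch of the fixed point math used by the I400 paths above and
// below (my illustration, not libyuv API): replicating the Y byte into both
// halves of a 16 bit lane and taking the pmulhuw high word computes
// (y * 0x101 * 18997) >> 16; subtracting 1160 with unsigned saturation and
// shifting right by 6 yields (y - 16) * 1.164, clamped at both ends.
static inline uint8_t ScalarI400ToGray(uint8_t y) {
  uint16_t y16 = (uint16_t)(y * 0x0101u);       // punpcklbw y,y
  uint32_t g = ((uint32_t)y16 * 18997u) >> 16;  // pmulhuw
  g = g > 1160u ? g - 1160u : 0u;               // psubusw $0x0488
  g >>= 6;                                      // psrlw $6
  return g > 255u ? 255u : (uint8_t)g;          // packuswb saturation
}
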
3029 #ifdef HAS_I400TOARGBROW_AVX2
3030 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
3031 // note: vpunpcklbw mutates and vpackuswb unmutates.
3032 void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
3033   asm volatile(
3034       "mov        $0x4a354a35,%%eax              \n"  // 4a35 = 18997 =
3035                                                       // 1.164
3036       "vmovd      %%eax,%%xmm2                   \n"
3037       "vbroadcastss %%xmm2,%%ymm2                \n"
3038       "mov        $0x4880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
3039       "vmovd      %%eax,%%xmm3                   \n"
3040       "vbroadcastss %%xmm3,%%ymm3                \n"
3041       "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
3042       "vpslld     $0x18,%%ymm4,%%ymm4            \n"
3043 
3044       LABELALIGN
3045       "1:                                        \n"
3046       // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
3047       "vmovdqu    (%0),%%xmm0                    \n"
3048       "lea        0x10(%0),%0                    \n"
3049       "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
3050       "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
3051       "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3052       "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
3053       "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
3054       "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
3055       "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
3056       "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
3057       "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
3058       "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
3059       "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
3060       "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
3061       "vmovdqu    %%ymm0,(%1)                    \n"
3062       "vmovdqu    %%ymm1,0x20(%1)                \n"
3063       "lea       0x40(%1),%1                     \n"
3064       "sub        $0x10,%2                       \n"
3065       "jg        1b                              \n"
3066       "vzeroupper                                \n"
3067       : "+r"(y_buf),     // %0
3068         "+r"(dst_argb),  // %1
3069         "+rm"(width)     // %2
3070       :
3071       : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
3072 }
3073 #endif  // HAS_I400TOARGBROW_AVX2
3074 
3075 #ifdef HAS_MIRRORROW_SSSE3
3076 // Shuffle table for reversing the bytes.
3077 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
3078                                      7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
3079 
3080 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
3081   intptr_t temp_width = (intptr_t)(width);
3082   asm volatile(
3083 
3084       "movdqa    %3,%%xmm5                       \n"
3085 
3086       LABELALIGN
3087       "1:                                        \n"
3088       "movdqu    -0x10(%0,%2,1),%%xmm0           \n"
3089       "pshufb    %%xmm5,%%xmm0                   \n"
3090       "movdqu    %%xmm0,(%1)                     \n"
3091       "lea       0x10(%1),%1                     \n"
3092       "sub       $0x10,%2                        \n"
3093       "jg        1b                              \n"
3094       : "+r"(src),           // %0
3095         "+r"(dst),           // %1
3096         "+r"(temp_width)     // %2
3097       : "m"(kShuffleMirror)  // %3
3098       : "memory", "cc", "xmm0", "xmm5");
3099 }
3100 #endif  // HAS_MIRRORROW_SSSE3
3101 
3102 #ifdef HAS_MIRRORROW_AVX2
3103 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
3104   intptr_t temp_width = (intptr_t)(width);
3105   asm volatile(
3106 
3107       "vbroadcastf128 %3,%%ymm5                  \n"
3108 
3109       LABELALIGN
3110       "1:                                        \n"
3111       "vmovdqu    -0x20(%0,%2,1),%%ymm0          \n"
3112       "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
3113       "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
3114       "vmovdqu    %%ymm0,(%1)                    \n"
3115       "lea       0x20(%1),%1                     \n"
3116       "sub       $0x20,%2                        \n"
3117       "jg        1b                              \n"
3118       "vzeroupper                                \n"
3119       : "+r"(src),           // %0
3120         "+r"(dst),           // %1
3121         "+r"(temp_width)     // %2
3122       : "m"(kShuffleMirror)  // %3
3123       : "memory", "cc", "xmm0", "xmm5");
3124 }
3125 #endif  // HAS_MIRRORROW_AVX2
3126 
3127 #ifdef HAS_MIRRORUVROW_SSSE3
3128 // Shuffle table for reversing the bytes of UV channels.
3129 static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
3130                                        15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
3131 void MirrorUVRow_SSSE3(const uint8_t* src,
3132                        uint8_t* dst_u,
3133                        uint8_t* dst_v,
3134                        int width) {
3135   intptr_t temp_width = (intptr_t)(width);
3136   asm volatile(
3137       "movdqa    %4,%%xmm1                       \n"
3138       "lea       -0x10(%0,%3,2),%0               \n"
3139       "sub       %1,%2                           \n"
3140 
3141       LABELALIGN
3142       "1:                                        \n"
3143       "movdqu    (%0),%%xmm0                     \n"
3144       "lea       -0x10(%0),%0                    \n"
3145       "pshufb    %%xmm1,%%xmm0                   \n"
3146       "movlpd    %%xmm0,(%1)                     \n"
3147       "movhpd    %%xmm0,0x00(%1,%2,1)            \n"
3148       "lea       0x8(%1),%1                      \n"
3149       "sub       $8,%3                           \n"
3150       "jg        1b                              \n"
3151       : "+r"(src),             // %0
3152         "+r"(dst_u),           // %1
3153         "+r"(dst_v),           // %2
3154         "+r"(temp_width)       // %3
3155       : "m"(kShuffleMirrorUV)  // %4
3156       : "memory", "cc", "xmm0", "xmm1");
3157 }
3158 #endif  // HAS_MIRRORUVROW_SSSE3
3159 
3160 #ifdef HAS_ARGBMIRRORROW_SSE2
3161 
3162 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
3163   intptr_t temp_width = (intptr_t)(width);
3164   asm volatile(
3165 
3166       "lea       -0x10(%0,%2,4),%0               \n"
3167 
3168       LABELALIGN
3169       "1:                                        \n"
3170       "movdqu    (%0),%%xmm0                     \n"
3171       "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
3172       "lea       -0x10(%0),%0                    \n"
3173       "movdqu    %%xmm0,(%1)                     \n"
3174       "lea       0x10(%1),%1                     \n"
3175       "sub       $0x4,%2                         \n"
3176       "jg        1b                              \n"
3177       : "+r"(src),        // %0
3178         "+r"(dst),        // %1
3179         "+r"(temp_width)  // %2
3180       :
3181       : "memory", "cc", "xmm0");
3182 }
3183 #endif  // HAS_ARGBMIRRORROW_SSE2
3184 
3185 #ifdef HAS_ARGBMIRRORROW_AVX2
3186 // Shuffle table for reversing the bytes.
3187 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3188 void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
3189   intptr_t temp_width = (intptr_t)(width);
3190   asm volatile(
3191 
3192       "vmovdqu    %3,%%ymm5                      \n"
3193 
3194       LABELALIGN
3195       "1:                                        \n"
3196       "vpermd    -0x20(%0,%2,4),%%ymm5,%%ymm0    \n"
3197       "vmovdqu    %%ymm0,(%1)                    \n"
3198       "lea        0x20(%1),%1                    \n"
3199       "sub        $0x8,%2                        \n"
3200       "jg         1b                             \n"
3201       "vzeroupper                                \n"
3202       : "+r"(src),                    // %0
3203         "+r"(dst),                    // %1
3204         "+r"(temp_width)              // %2
3205       : "m"(kARGBShuffleMirror_AVX2)  // %3
3206       : "memory", "cc", "xmm0", "xmm5");
3207 }
3208 #endif  // HAS_ARGBMIRRORROW_AVX2
3209 
3210 #ifdef HAS_SPLITUVROW_AVX2
3211 void SplitUVRow_AVX2(const uint8_t* src_uv,
3212                      uint8_t* dst_u,
3213                      uint8_t* dst_v,
3214                      int width) {
3215   asm volatile(
3216       "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3217       "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
3218       "sub        %1,%2                          \n"
3219 
3220       LABELALIGN
3221       "1:                                        \n"
3222       "vmovdqu    (%0),%%ymm0                    \n"
3223       "vmovdqu    0x20(%0),%%ymm1                \n"
3224       "lea        0x40(%0),%0                    \n"
3225       "vpsrlw     $0x8,%%ymm0,%%ymm2             \n"
3226       "vpsrlw     $0x8,%%ymm1,%%ymm3             \n"
3227       "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
3228       "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
3229       "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
3230       "vpackuswb  %%ymm3,%%ymm2,%%ymm2           \n"
3231       "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
3232       "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
3233       "vmovdqu    %%ymm0,(%1)                    \n"
3234       "vmovdqu    %%ymm2,0x00(%1,%2,1)           \n"
3235       "lea        0x20(%1),%1                    \n"
3236       "sub        $0x20,%3                       \n"
3237       "jg         1b                             \n"
3238       "vzeroupper                                \n"
3239       : "+r"(src_uv),  // %0
3240         "+r"(dst_u),   // %1
3241         "+r"(dst_v),   // %2
3242         "+r"(width)    // %3
3243       :
3244       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
3245 }
3246 #endif  // HAS_SPLITUVROW_AVX2
3247 
3248 #ifdef HAS_SPLITUVROW_SSE2
SplitUVRow_SSE2(const uint8_t * src_uv,uint8_t * dst_u,uint8_t * dst_v,int width)3249 void SplitUVRow_SSE2(const uint8_t* src_uv,
3250                      uint8_t* dst_u,
3251                      uint8_t* dst_v,
3252                      int width) {
3253   asm volatile(
3254       "pcmpeqb    %%xmm5,%%xmm5                  \n"
3255       "psrlw      $0x8,%%xmm5                    \n"
3256       "sub        %1,%2                          \n"
3257 
3258       LABELALIGN
3259       "1:                                        \n"
3260       "movdqu     (%0),%%xmm0                    \n"
3261       "movdqu     0x10(%0),%%xmm1                \n"
3262       "lea        0x20(%0),%0                    \n"
3263       "movdqa     %%xmm0,%%xmm2                  \n"
3264       "movdqa     %%xmm1,%%xmm3                  \n"
3265       "pand       %%xmm5,%%xmm0                  \n"
3266       "pand       %%xmm5,%%xmm1                  \n"
3267       "packuswb   %%xmm1,%%xmm0                  \n"
3268       "psrlw      $0x8,%%xmm2                    \n"
3269       "psrlw      $0x8,%%xmm3                    \n"
3270       "packuswb   %%xmm3,%%xmm2                  \n"
3271       "movdqu     %%xmm0,(%1)                    \n"
3272       "movdqu     %%xmm2,0x00(%1,%2,1)           \n"
3273       "lea        0x10(%1),%1                    \n"
3274       "sub        $0x10,%3                       \n"
3275       "jg         1b                             \n"
3276       : "+r"(src_uv),  // %0
3277         "+r"(dst_u),   // %1
3278         "+r"(dst_v),   // %2
3279         "+r"(width)    // %3
3280       :
3281       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
3282 }
3283 #endif  // HAS_SPLITUVROW_SSE2
3284 
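// Scalar equivalent of the mask/shift/pack sequence in the SplitUV rows
// above (illustrative, not libyuv API): even bytes are U, odd bytes are V.
static inline void ScalarSplitUV(const uint8_t* src_uv,
                                 uint8_t* dst_u,
                                 uint8_t* dst_v,
                                 int width) {
  for (int i = 0; i < width; ++i) {
    dst_u[i] = src_uv[2 * i];      // pand + packuswb lanes
    dst_v[i] = src_uv[2 * i + 1];  // psrlw $0x8 + packuswb lanes
  }
}
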
3285 #ifdef HAS_MERGEUVROW_AVX2
3286 void MergeUVRow_AVX2(const uint8_t* src_u,
3287                      const uint8_t* src_v,
3288                      uint8_t* dst_uv,
3289                      int width) {
3290   asm volatile(
3291 
3292       "sub       %0,%1                           \n"
3293 
3294       LABELALIGN
3295       "1:                                        \n"
3296       "vmovdqu   (%0),%%ymm0                     \n"
3297       "vmovdqu    0x00(%0,%1,1),%%ymm1           \n"
3298       "lea       0x20(%0),%0                     \n"
3299       "vpunpcklbw %%ymm1,%%ymm0,%%ymm2           \n"
3300       "vpunpckhbw %%ymm1,%%ymm0,%%ymm0           \n"
3301       "vextractf128 $0x0,%%ymm2,(%2)             \n"
3302       "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
3303       "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
3304       "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
3305       "lea       0x40(%2),%2                     \n"
3306       "sub       $0x20,%3                        \n"
3307       "jg        1b                              \n"
3308       "vzeroupper                                \n"
3309       : "+r"(src_u),   // %0
3310         "+r"(src_v),   // %1
3311         "+r"(dst_uv),  // %2
3312         "+r"(width)    // %3
3313       :
3314       : "memory", "cc", "xmm0", "xmm1", "xmm2");
3315 }
3316 #endif  // HAS_MERGEUVROW_AVX2
3317 
3318 #ifdef HAS_MERGEUVROW_SSE2
3319 void MergeUVRow_SSE2(const uint8_t* src_u,
3320                      const uint8_t* src_v,
3321                      uint8_t* dst_uv,
3322                      int width) {
3323   asm volatile(
3324 
3325       "sub       %0,%1                           \n"
3326 
3327       LABELALIGN
3328       "1:                                        \n"
3329       "movdqu    (%0),%%xmm0                     \n"
3330       "movdqu    0x00(%0,%1,1),%%xmm1            \n"
3331       "lea       0x10(%0),%0                     \n"
3332       "movdqa    %%xmm0,%%xmm2                   \n"
3333       "punpcklbw %%xmm1,%%xmm0                   \n"
3334       "punpckhbw %%xmm1,%%xmm2                   \n"
3335       "movdqu    %%xmm0,(%2)                     \n"
3336       "movdqu    %%xmm2,0x10(%2)                 \n"
3337       "lea       0x20(%2),%2                     \n"
3338       "sub       $0x10,%3                        \n"
3339       "jg        1b                              \n"
3340       : "+r"(src_u),   // %0
3341         "+r"(src_v),   // %1
3342         "+r"(dst_uv),  // %2
3343         "+r"(width)    // %3
3344       :
3345       : "memory", "cc", "xmm0", "xmm1", "xmm2");
3346 }
3347 #endif  // HAS_MERGEUVROW_SSE2
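
// Scalar reference sketch (illustrative; cf. MergeUVRow_C in
// row_common.cc): the inverse of SplitUVRow. The punpcklbw/punpckhbw above
// interleave 16 U and 16 V bytes into 32 bytes of packed UV per register
// pair.
static void MergeUVRow_C_Sketch(const uint8_t* src_u,
                                const uint8_t* src_v,
                                uint8_t* dst_uv,
                                int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}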
3348 
3349 // Use scale to convert lsb formats to msb, depending on how many bits there are:
3350 // 128 = 9 bits
3351 // 64 = 10 bits
3352 // 16 = 12 bits
3353 // 1 = 16 bits
3354 #ifdef HAS_MERGEUVROW_16_AVX2
3355 void MergeUVRow_16_AVX2(const uint16_t* src_u,
3356                         const uint16_t* src_v,
3357                         uint16_t* dst_uv,
3358                         int scale,
3359                         int width) {
3360   // clang-format off
3361   asm volatile (
3362     "vmovd      %4,%%xmm3                      \n"
3363     "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
3364     "vbroadcastss %%xmm3,%%ymm3                \n"
3365     "sub       %0,%1                           \n"
3366 
3367     // 16 pixels per loop.
3368     LABELALIGN
3369     "1:                                        \n"
3370     "vmovdqu   (%0),%%ymm0                     \n"
3371     "vmovdqu   (%0,%1,1),%%ymm1                \n"
3372     "add        $0x20,%0                       \n"
3373 
3374     "vpmullw   %%ymm3,%%ymm0,%%ymm0            \n"
3375     "vpmullw   %%ymm3,%%ymm1,%%ymm1            \n"
3376     "vpunpcklwd %%ymm1,%%ymm0,%%ymm2           \n"  // mutates
3377     "vpunpckhwd %%ymm1,%%ymm0,%%ymm0           \n"
3378     "vextractf128 $0x0,%%ymm2,(%2)             \n"
3379     "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
3380     "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
3381     "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
3382     "add       $0x40,%2                        \n"
3383     "sub       $0x10,%3                        \n"
3384     "jg        1b                              \n"
3385     "vzeroupper                                \n"
3386   : "+r"(src_u),   // %0
3387     "+r"(src_v),   // %1
3388     "+r"(dst_uv),  // %2
3389     "+r"(width)    // %3
3390   : "r"(scale)     // %4
3391   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
3392   // clang-format on
3393 }
3394 #endif  // HAS_MERGEUVROW_16_AVX2
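
// Scalar sketch of the scale math above (illustrative): vpmullw keeps the
// low 16 bits of each product, so scale = 1 << (16 - bits) shifts
// lsb-justified samples up to msb-justified ones, e.g. 10-bit 0x3ff * 64 =
// 0xffc0.
static void MergeUVRow_16_C_Sketch(const uint16_t* src_u,
                                   const uint16_t* src_v,
                                   uint16_t* dst_uv,
                                   int scale,
                                   int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = (uint16_t)(src_u[x] * scale);
    dst_uv[2 * x + 1] = (uint16_t)(src_v[x] * scale);
  }
}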
3395 
3396 // Use scale to convert lsb formats to msb, depending on how many bits there are:
3397 // 128 = 9 bits
3398 // 64 = 10 bits
3399 // 16 = 12 bits
3400 // 1 = 16 bits
3401 #ifdef HAS_MULTIPLYROW_16_AVX2
3402 void MultiplyRow_16_AVX2(const uint16_t* src_y,
3403                          uint16_t* dst_y,
3404                          int scale,
3405                          int width) {
3406   // clang-format off
3407   asm volatile (
3408     "vmovd      %3,%%xmm3                      \n"
3409     "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
3410     "vbroadcastss %%xmm3,%%ymm3                \n"
3411     "sub       %0,%1                           \n"
3412 
3413     // 32 pixels per loop.
3414     LABELALIGN
3415     "1:                                        \n"
3416     "vmovdqu   (%0),%%ymm0                     \n"
3417     "vmovdqu   0x20(%0),%%ymm1                 \n"
3418     "vpmullw   %%ymm3,%%ymm0,%%ymm0            \n"
3419     "vpmullw   %%ymm3,%%ymm1,%%ymm1            \n"
3420     "vmovdqu   %%ymm0,(%0,%1)                  \n"
3421     "vmovdqu   %%ymm1,0x20(%0,%1)              \n"
3422     "add        $0x40,%0                       \n"
3423     "sub       $0x20,%2                        \n"
3424     "jg        1b                              \n"
3425     "vzeroupper                                \n"
3426   : "+r"(src_y),   // %0
3427     "+r"(dst_y),   // %1
3428     "+r"(width)    // %2
3429   : "r"(scale)     // %3
3430   : "memory", "cc", "xmm0", "xmm1", "xmm3");
3431   // clang-format on
3432 }
3433 #endif  // HAS_MULTIPLYROW_16_AVX2
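
// Scalar sketch (illustrative): MultiplyRow_16 applies the same lsb-to-msb
// scaling as MergeUVRow_16, but to a single plane without interleaving.
static void MultiplyRow_16_C_Sketch(const uint16_t* src_y,
                                    uint16_t* dst_y,
                                    int scale,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = (uint16_t)(src_y[x] * scale);  // low 16 bits, like vpmullw.
  }
}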
3434 
3435 // Use scale to convert lsb formats to msb, depending on how many bits there are:
3436 // 32768 = 9 bits
3437 // 16384 = 10 bits
3438 // 4096 = 12 bits
3439 // 256 = 16 bits
3440 void Convert16To8Row_SSSE3(const uint16_t* src_y,
3441                            uint8_t* dst_y,
3442                            int scale,
3443                            int width) {
3444   // clang-format off
3445   asm volatile (
3446     "movd      %3,%%xmm2                      \n"
3447     "punpcklwd %%xmm2,%%xmm2                  \n"
3448     "pshufd    $0x0,%%xmm2,%%xmm2             \n"
3449 
3450     // 16 pixels per loop.
3451     LABELALIGN
3452     "1:                                       \n"
3453     "movdqu    (%0),%%xmm0                    \n"
3454     "movdqu    0x10(%0),%%xmm1                \n"
3455     "add       $0x20,%0                       \n"
3456     "pmulhuw   %%xmm2,%%xmm0                  \n"
3457     "pmulhuw   %%xmm2,%%xmm1                  \n"
3458     "packuswb  %%xmm1,%%xmm0                  \n"
3459     "movdqu    %%xmm0,(%1)                    \n"
3460     "add       $0x10,%1                       \n"
3461     "sub       $0x10,%2                       \n"
3462     "jg        1b                             \n"
3463   : "+r"(src_y),   // %0
3464     "+r"(dst_y),   // %1
3465     "+r"(width)    // %2
3466   : "r"(scale)     // %3
3467   : "memory", "cc", "xmm0", "xmm1", "xmm2");
3468   // clang-format on
3469 }
3470 
3471 #ifdef HAS_CONVERT16TO8ROW_AVX2
3472 void Convert16To8Row_AVX2(const uint16_t* src_y,
3473                           uint8_t* dst_y,
3474                           int scale,
3475                           int width) {
3476   // clang-format off
3477   asm volatile (
3478     "vmovd      %3,%%xmm2                      \n"
3479     "vpunpcklwd %%xmm2,%%xmm2,%%xmm2           \n"
3480     "vbroadcastss %%xmm2,%%ymm2                \n"
3481 
3482     // 32 pixels per loop.
3483     LABELALIGN
3484     "1:                                        \n"
3485     "vmovdqu   (%0),%%ymm0                     \n"
3486     "vmovdqu   0x20(%0),%%ymm1                 \n"
3487     "add       $0x40,%0                        \n"
3488     "vpmulhuw  %%ymm2,%%ymm0,%%ymm0            \n"
3489     "vpmulhuw  %%ymm2,%%ymm1,%%ymm1            \n"
3490     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // mutates
3491     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3492     "vmovdqu   %%ymm0,(%1)                     \n"
3493     "add       $0x20,%1                        \n"
3494     "sub       $0x20,%2                        \n"
3495     "jg        1b                              \n"
3496     "vzeroupper                                \n"
3497   : "+r"(src_y),   // %0
3498     "+r"(dst_y),   // %1
3499     "+r"(width)    // %2
3500   : "r"(scale)     // %3
3501   : "memory", "cc", "xmm0", "xmm1", "xmm2");
3502   // clang-format on
3503 }
3504 #endif  // HAS_CONVERT16TO8ROW_AVX2
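
// Scalar sketch of the pmulhuw trick above (illustrative): pmulhuw yields
// the high 16 bits of the unsigned product, so dst = (src * scale) >> 16,
// and packuswb saturates at 255. E.g. with scale = 16384 a 10-bit sample v
// becomes (v << 14) >> 16 == v >> 2, its top 8 bits.
static void Convert16To8Row_C_Sketch(const uint16_t* src_y,
                                     uint8_t* dst_y,
                                     int scale,
                                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int value = (src_y[x] * scale) >> 16;
    dst_y[x] = (uint8_t)(value < 255 ? value : 255);
  }
}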
3505 
3506 // Use scale to convert to lsb formats, depending on how many bits there are:
3507 // 512 = 9 bits
3508 // 1024 = 10 bits
3509 // 4096 = 12 bits
3510 // TODO(fbarchard): reduce to SSE2
3511 void Convert8To16Row_SSE2(const uint8_t* src_y,
3512                           uint16_t* dst_y,
3513                           int scale,
3514                           int width) {
3515   // clang-format off
3516   asm volatile (
3517     "movd      %3,%%xmm2                      \n"
3518     "punpcklwd %%xmm2,%%xmm2                  \n"
3519     "pshufd    $0x0,%%xmm2,%%xmm2             \n"
3520 
3521     // 16 pixels per loop.
3522     LABELALIGN
3523     "1:                                       \n"
3524     "movdqu    (%0),%%xmm0                    \n"
3525     "movdqa    %%xmm0,%%xmm1                  \n"
3526     "punpcklbw %%xmm0,%%xmm0                  \n"
3527     "punpckhbw %%xmm1,%%xmm1                  \n"
3528     "add       $0x10,%0                       \n"
3529     "pmulhuw   %%xmm2,%%xmm0                  \n"
3530     "pmulhuw   %%xmm2,%%xmm1                  \n"
3531     "movdqu    %%xmm0,(%1)                    \n"
3532     "movdqu    %%xmm1,0x10(%1)                \n"
3533     "add       $0x20,%1                       \n"
3534     "sub       $0x10,%2                       \n"
3535     "jg        1b                             \n"
3536   : "+r"(src_y),   // %0
3537     "+r"(dst_y),   // %1
3538     "+r"(width)    // %2
3539   : "r"(scale)     // %3
3540   : "memory", "cc", "xmm0", "xmm1", "xmm2");
3541   // clang-format on
3542 }
3543 
3544 #ifdef HAS_CONVERT8TO16ROW_AVX2
3545 void Convert8To16Row_AVX2(const uint8_t* src_y,
3546                           uint16_t* dst_y,
3547                           int scale,
3548                           int width) {
3549   // clang-format off
3550   asm volatile (
3551     "vmovd      %3,%%xmm2                      \n"
3552     "vpunpcklwd %%xmm2,%%xmm2,%%xmm2           \n"
3553     "vbroadcastss %%xmm2,%%ymm2                \n"
3554 
3555     // 32 pixels per loop.
3556     LABELALIGN
3557     "1:                                        \n"
3558     "vmovdqu   (%0),%%ymm0                     \n"
3559     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3560     "add       $0x20,%0                        \n"
3561     "vpunpckhbw %%ymm0,%%ymm0,%%ymm1           \n"
3562     "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
3563     "vpmulhuw  %%ymm2,%%ymm0,%%ymm0            \n"
3564     "vpmulhuw  %%ymm2,%%ymm1,%%ymm1            \n"
3565     "vmovdqu   %%ymm0,(%1)                     \n"
3566     "vmovdqu   %%ymm1,0x20(%1)                 \n"
3567     "add       $0x40,%1                        \n"
3568     "sub       $0x20,%2                        \n"
3569     "jg        1b                              \n"
3570     "vzeroupper                                \n"
3571   : "+r"(src_y),   // %0
3572     "+r"(dst_y),   // %1
3573     "+r"(width)    // %2
3574   : "r"(scale)     // %3
3575   : "memory", "cc", "xmm0", "xmm1", "xmm2");
3576   // clang-format on
3577 }
3578 #endif  // HAS_CONVERT8TO16ROW_AVX2
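
// Scalar sketch (illustrative): unpacking a byte with itself forms
// v * 0x0101 == (v << 8) | v, and pmulhuw then keeps the high 16 bits of
// that times scale. E.g. with scale = 1024 (10 bits), v = 255 maps to
// 1023, the full 10-bit range.
static void Convert8To16Row_C_Sketch(const uint8_t* src_y,
                                     uint16_t* dst_y,
                                     int scale,
                                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = (uint16_t)((((src_y[x] << 8) | src_y[x]) * scale) >> 16);
  }
}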
3579 
3580 #ifdef HAS_SPLITRGBROW_SSSE3
3581 
3582 // Shuffle tables for converting packed RGB to planar R, G and B.
3583 static const uvec8 kShuffleMaskRGBToR0 = {0u,   3u,   6u,   9u,   12u,  15u,
3584                                           128u, 128u, 128u, 128u, 128u, 128u,
3585                                           128u, 128u, 128u, 128u};
3586 static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
3587                                           2u,   5u,   8u,   11u,  14u,  128u,
3588                                           128u, 128u, 128u, 128u};
3589 static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
3590                                           128u, 128u, 128u, 128u, 128u, 1u,
3591                                           4u,   7u,   10u,  13u};
3592 
3593 static const uvec8 kShuffleMaskRGBToG0 = {1u,   4u,   7u,   10u,  13u,  128u,
3594                                           128u, 128u, 128u, 128u, 128u, 128u,
3595                                           128u, 128u, 128u, 128u};
3596 static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
3597                                           3u,   6u,   9u,   12u,  15u,  128u,
3598                                           128u, 128u, 128u, 128u};
3599 static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
3600                                           128u, 128u, 128u, 128u, 128u, 2u,
3601                                           5u,   8u,   11u,  14u};
3602 
3603 static const uvec8 kShuffleMaskRGBToB0 = {2u,   5u,   8u,   11u,  14u,  128u,
3604                                           128u, 128u, 128u, 128u, 128u, 128u,
3605                                           128u, 128u, 128u, 128u};
3606 static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
3607                                           4u,   7u,   10u,  13u,  128u, 128u,
3608                                           128u, 128u, 128u, 128u};
3609 static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
3610                                           128u, 128u, 128u, 128u, 0u,   3u,
3611                                           6u,   9u,   12u,  15u};
3612 
3613 void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
3614                        uint8_t* dst_r,
3615                        uint8_t* dst_g,
3616                        uint8_t* dst_b,
3617                        int width) {
3618   asm volatile(
3619 
3620       LABELALIGN
3621       "1:                                        \n"
3622       "movdqu     (%0),%%xmm0                    \n"
3623       "movdqu     0x10(%0),%%xmm1                \n"
3624       "movdqu     0x20(%0),%%xmm2                \n"
3625       "pshufb     %5, %%xmm0                     \n"
3626       "pshufb     %6, %%xmm1                     \n"
3627       "pshufb     %7, %%xmm2                     \n"
3628       "por        %%xmm1,%%xmm0                  \n"
3629       "por        %%xmm2,%%xmm0                  \n"
3630       "movdqu     %%xmm0,(%1)                    \n"
3631       "lea        0x10(%1),%1                    \n"
3632 
3633       "movdqu     (%0),%%xmm0                    \n"
3634       "movdqu     0x10(%0),%%xmm1                \n"
3635       "movdqu     0x20(%0),%%xmm2                \n"
3636       "pshufb     %8, %%xmm0                     \n"
3637       "pshufb     %9, %%xmm1                     \n"
3638       "pshufb     %10, %%xmm2                    \n"
3639       "por        %%xmm1,%%xmm0                  \n"
3640       "por        %%xmm2,%%xmm0                  \n"
3641       "movdqu     %%xmm0,(%2)                    \n"
3642       "lea        0x10(%2),%2                    \n"
3643 
3644       "movdqu     (%0),%%xmm0                    \n"
3645       "movdqu     0x10(%0),%%xmm1                \n"
3646       "movdqu     0x20(%0),%%xmm2                \n"
3647       "pshufb     %11, %%xmm0                    \n"
3648       "pshufb     %12, %%xmm1                    \n"
3649       "pshufb     %13, %%xmm2                    \n"
3650       "por        %%xmm1,%%xmm0                  \n"
3651       "por        %%xmm2,%%xmm0                  \n"
3652       "movdqu     %%xmm0,(%3)                    \n"
3653       "lea        0x10(%3),%3                    \n"
3654       "lea        0x30(%0),%0                    \n"
3655       "sub        $0x10,%4                       \n"
3656       "jg         1b                             \n"
3657       : "+r"(src_rgb),             // %0
3658         "+r"(dst_r),               // %1
3659         "+r"(dst_g),               // %2
3660         "+r"(dst_b),               // %3
3661         "+r"(width)                // %4
3662       : "m"(kShuffleMaskRGBToR0),  // %5
3663         "m"(kShuffleMaskRGBToR1),  // %6
3664         "m"(kShuffleMaskRGBToR2),  // %7
3665         "m"(kShuffleMaskRGBToG0),  // %8
3666         "m"(kShuffleMaskRGBToG1),  // %9
3667         "m"(kShuffleMaskRGBToG2),  // %10
3668         "m"(kShuffleMaskRGBToB0),  // %11
3669         "m"(kShuffleMaskRGBToB1),  // %12
3670         "m"(kShuffleMaskRGBToB2)   // %13
3671       : "memory", "cc", "xmm0", "xmm1", "xmm2");
3672 }
3673 #endif  // HAS_SPLITRGBROW_SSSE3
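
// Scalar reference sketch (illustrative): each plane above is built from
// three pshufb passes that pick every third byte out of 48 source bytes
// (lanes indexed 0x80 shuffle in zeros) and are then OR'd together.
static void SplitRGBRow_C_Sketch(const uint8_t* src_rgb,
                                 uint8_t* dst_r,
                                 uint8_t* dst_g,
                                 uint8_t* dst_b,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_r[x] = src_rgb[3 * x + 0];
    dst_g[x] = src_rgb[3 * x + 1];
    dst_b[x] = src_rgb[3 * x + 2];
  }
}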
3674 
3675 #ifdef HAS_MERGERGBROW_SSSE3
3676 
3677 // Shuffle tables for converting planar R, G and B to packed RGB.
3678 static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
3679                                           2u, 128u, 128u, 3u, 128u, 128u,
3680                                           4u, 128u, 128u, 5u};
3681 static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
3682                                           128u, 2u, 128u, 128u, 3u, 128u,
3683                                           128u, 4u, 128u, 128u};
3684 static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
3685                                           128u, 128u, 2u, 128u, 128u, 3u,
3686                                           128u, 128u, 4u, 128u};
3687 
3688 static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
3689                                           7u, 128u, 128u, 8u, 128u, 128u,
3690                                           9u, 128u, 128u, 10u};
3691 static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
3692                                           128u, 7u, 128u, 128u, 8u, 128u,
3693                                           128u, 9u, 128u, 128u};
3694 static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u,  128u, 128u, 7u,
3695                                           128u, 128u, 8u,  128u, 128u, 9u,
3696                                           128u, 128u, 10u, 128u};
3697 
3698 static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
3699                                           12u, 128u, 128u, 13u, 128u, 128u,
3700                                           14u, 128u, 128u, 15u};
3701 static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
3702                                           128u, 13u, 128u, 128u, 14u, 128u,
3703                                           128u, 15u, 128u, 128u};
3704 static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
3705                                           128u, 128u, 13u, 128u, 128u, 14u,
3706                                           128u, 128u, 15u, 128u};
3707 
3708 void MergeRGBRow_SSSE3(const uint8_t* src_r,
3709                        const uint8_t* src_g,
3710                        const uint8_t* src_b,
3711                        uint8_t* dst_rgb,
3712                        int width) {
3713   asm volatile(
3714 
3715       LABELALIGN
3716       "1:                                        \n"
3717       "movdqu     (%0),%%xmm0                    \n"
3718       "movdqu     (%1),%%xmm1                    \n"
3719       "movdqu     (%2),%%xmm2                    \n"
3720       "pshufb     %5, %%xmm0                     \n"
3721       "pshufb     %6, %%xmm1                     \n"
3722       "pshufb     %7, %%xmm2                     \n"
3723       "por        %%xmm1,%%xmm0                  \n"
3724       "por        %%xmm2,%%xmm0                  \n"
3725       "movdqu     %%xmm0,(%3)                    \n"
3726 
3727       "movdqu     (%0),%%xmm0                    \n"
3728       "movdqu     (%1),%%xmm1                    \n"
3729       "movdqu     (%2),%%xmm2                    \n"
3730       "pshufb     %8, %%xmm0                     \n"
3731       "pshufb     %9, %%xmm1                     \n"
3732       "pshufb     %10, %%xmm2                    \n"
3733       "por        %%xmm1,%%xmm0                  \n"
3734       "por        %%xmm2,%%xmm0                  \n"
3735       "movdqu     %%xmm0,16(%3)                  \n"
3736 
3737       "movdqu     (%0),%%xmm0                    \n"
3738       "movdqu     (%1),%%xmm1                    \n"
3739       "movdqu     (%2),%%xmm2                    \n"
3740       "pshufb     %11, %%xmm0                    \n"
3741       "pshufb     %12, %%xmm1                    \n"
3742       "pshufb     %13, %%xmm2                    \n"
3743       "por        %%xmm1,%%xmm0                  \n"
3744       "por        %%xmm2,%%xmm0                  \n"
3745       "movdqu     %%xmm0,32(%3)                  \n"
3746 
3747       "lea        0x10(%0),%0                    \n"
3748       "lea        0x10(%1),%1                    \n"
3749       "lea        0x10(%2),%2                    \n"
3750       "lea        0x30(%3),%3                    \n"
3751       "sub        $0x10,%4                       \n"
3752       "jg         1b                             \n"
3753       : "+r"(src_r),               // %0
3754         "+r"(src_g),               // %1
3755         "+r"(src_b),               // %2
3756         "+r"(dst_rgb),             // %3
3757         "+r"(width)                // %4
3758       : "m"(kShuffleMaskRToRGB0),  // %5
3759         "m"(kShuffleMaskGToRGB0),  // %6
3760         "m"(kShuffleMaskBToRGB0),  // %7
3761         "m"(kShuffleMaskRToRGB1),  // %8
3762         "m"(kShuffleMaskGToRGB1),  // %9
3763         "m"(kShuffleMaskBToRGB1),  // %10
3764         "m"(kShuffleMaskRToRGB2),  // %11
3765         "m"(kShuffleMaskGToRGB2),  // %12
3766         "m"(kShuffleMaskBToRGB2)   // %13
3767       : "memory", "cc", "xmm0", "xmm1", "xmm2");
3768 }
3769 #endif  // HAS_MERGERGBROW_SSSE3
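
// Scalar reference sketch (illustrative): the inverse of SplitRGBRow_SSSE3;
// three shuffle/por passes above emit 48 packed RGB bytes per 16 pixels.
static void MergeRGBRow_C_Sketch(const uint8_t* src_r,
                                 const uint8_t* src_g,
                                 const uint8_t* src_b,
                                 uint8_t* dst_rgb,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_rgb[3 * x + 0] = src_r[x];
    dst_rgb[3 * x + 1] = src_g[x];
    dst_rgb[3 * x + 2] = src_b[x];
  }
}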
3770 
3771 #ifdef HAS_COPYROW_SSE2
3772 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
3773   asm volatile(
3774       "test       $0xf,%0                        \n"
3775       "jne        2f                             \n"
3776       "test       $0xf,%1                        \n"
3777       "jne        2f                             \n"
3778 
3779       LABELALIGN
3780       "1:                                        \n"
3781       "movdqa    (%0),%%xmm0                     \n"
3782       "movdqa    0x10(%0),%%xmm1                 \n"
3783       "lea       0x20(%0),%0                     \n"
3784       "movdqa    %%xmm0,(%1)                     \n"
3785       "movdqa    %%xmm1,0x10(%1)                 \n"
3786       "lea       0x20(%1),%1                     \n"
3787       "sub       $0x20,%2                        \n"
3788       "jg        1b                              \n"
3789       "jmp       9f                              \n"
3790 
3791       LABELALIGN
3792       "2:                                        \n"
3793       "movdqu    (%0),%%xmm0                     \n"
3794       "movdqu    0x10(%0),%%xmm1                 \n"
3795       "lea       0x20(%0),%0                     \n"
3796       "movdqu    %%xmm0,(%1)                     \n"
3797       "movdqu    %%xmm1,0x10(%1)                 \n"
3798       "lea       0x20(%1),%1                     \n"
3799       "sub       $0x20,%2                        \n"
3800       "jg        2b                              \n"
3801 
3802       LABELALIGN "9:                             \n"
3803       : "+r"(src),   // %0
3804         "+r"(dst),   // %1
3805         "+r"(width)  // %2
3806       :
3807       : "memory", "cc", "xmm0", "xmm1");
3808 }
3809 #endif  // HAS_COPYROW_SSE2
3810 
3811 #ifdef HAS_COPYROW_AVX
3812 void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
3813   asm volatile(
3814 
3815       LABELALIGN
3816       "1:                                        \n"
3817       "vmovdqu   (%0),%%ymm0                     \n"
3818       "vmovdqu   0x20(%0),%%ymm1                 \n"
3819       "lea       0x40(%0),%0                     \n"
3820       "vmovdqu   %%ymm0,(%1)                     \n"
3821       "vmovdqu   %%ymm1,0x20(%1)                 \n"
3822       "lea       0x40(%1),%1                     \n"
3823       "sub       $0x40,%2                        \n"
3824       "jg        1b                              \n"
3825       : "+r"(src),   // %0
3826         "+r"(dst),   // %1
3827         "+r"(width)  // %2
3828       :
3829       : "memory", "cc", "xmm0", "xmm1");
3830 }
3831 #endif  // HAS_COPYROW_AVX
3832 
3833 #ifdef HAS_COPYROW_ERMS
3834 // Multiple of 1.
3835 void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
3836   size_t width_tmp = (size_t)(width);
3837   asm volatile(
3838 
3839       "rep movsb                      \n"
3840       : "+S"(src),       // %0
3841         "+D"(dst),       // %1
3842         "+c"(width_tmp)  // %2
3843       :
3844       : "memory", "cc");
3845 }
3846 #endif  // HAS_COPYROW_ERMS
3847 
3848 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3849 // width in pixels
3850 void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
3851   asm volatile(
3852       "pcmpeqb   %%xmm0,%%xmm0                   \n"
3853       "pslld     $0x18,%%xmm0                    \n"
3854       "pcmpeqb   %%xmm1,%%xmm1                   \n"
3855       "psrld     $0x8,%%xmm1                     \n"
3856 
3857       LABELALIGN
3858       "1:                                        \n"
3859       "movdqu    (%0),%%xmm2                     \n"
3860       "movdqu    0x10(%0),%%xmm3                 \n"
3861       "lea       0x20(%0),%0                     \n"
3862       "movdqu    (%1),%%xmm4                     \n"
3863       "movdqu    0x10(%1),%%xmm5                 \n"
3864       "pand      %%xmm0,%%xmm2                   \n"
3865       "pand      %%xmm0,%%xmm3                   \n"
3866       "pand      %%xmm1,%%xmm4                   \n"
3867       "pand      %%xmm1,%%xmm5                   \n"
3868       "por       %%xmm4,%%xmm2                   \n"
3869       "por       %%xmm5,%%xmm3                   \n"
3870       "movdqu    %%xmm2,(%1)                     \n"
3871       "movdqu    %%xmm3,0x10(%1)                 \n"
3872       "lea       0x20(%1),%1                     \n"
3873       "sub       $0x8,%2                         \n"
3874       "jg        1b                              \n"
3875       : "+r"(src),   // %0
3876         "+r"(dst),   // %1
3877         "+r"(width)  // %2
3878       :
3879       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
3880 }
3881 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
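
// Scalar sketch (illustrative): the two masks built above (0xff000000 and
// 0x00ffffff) select A from src and BGR from dst, i.e. per little-endian
// ARGB pixel:
static void ARGBCopyAlphaRow_C_Sketch(const uint8_t* src,
                                      uint8_t* dst,
                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[4 * x + 3] = src[4 * x + 3];
  }
}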
3882 
3883 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3884 // width in pixels
3885 void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
3886   asm volatile(
3887       "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
3888       "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
3889 
3890       LABELALIGN
3891       "1:                                        \n"
3892       "vmovdqu   (%0),%%ymm1                     \n"
3893       "vmovdqu   0x20(%0),%%ymm2                 \n"
3894       "lea       0x40(%0),%0                     \n"
3895       "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1       \n"
3896       "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2   \n"
3897       "vmovdqu   %%ymm1,(%1)                     \n"
3898       "vmovdqu   %%ymm2,0x20(%1)                 \n"
3899       "lea       0x40(%1),%1                     \n"
3900       "sub       $0x10,%2                        \n"
3901       "jg        1b                              \n"
3902       "vzeroupper                                \n"
3903       : "+r"(src),   // %0
3904         "+r"(dst),   // %1
3905         "+r"(width)  // %2
3906       :
3907       : "memory", "cc", "xmm0", "xmm1", "xmm2");
3908 }
3909 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
3910 
3911 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
3912 // width in pixels
3913 void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
3914                               uint8_t* dst_a,
3915                               int width) {
3916   asm volatile(
3917 
3918       LABELALIGN
3919       "1:                                        \n"
3920       "movdqu    (%0), %%xmm0                    \n"
3921       "movdqu    0x10(%0), %%xmm1                \n"
3922       "lea       0x20(%0), %0                    \n"
3923       "psrld     $0x18, %%xmm0                   \n"
3924       "psrld     $0x18, %%xmm1                   \n"
3925       "packssdw  %%xmm1, %%xmm0                  \n"
3926       "packuswb  %%xmm0, %%xmm0                  \n"
3927       "movq      %%xmm0,(%1)                     \n"
3928       "lea       0x8(%1), %1                     \n"
3929       "sub       $0x8, %2                        \n"
3930       "jg        1b                              \n"
3931       : "+r"(src_argb),  // %0
3932         "+r"(dst_a),     // %1
3933         "+rm"(width)     // %2
3934       :
3935       : "memory", "cc", "xmm0", "xmm1");
3936 }
3937 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
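
// Scalar sketch (illustrative): psrld $0x18 leaves the alpha byte of each
// pixel in the low byte of its dword, and packssdw/packuswb narrow
// dword -> word -> byte:
static void ARGBExtractAlphaRow_C_Sketch(const uint8_t* src_argb,
                                         uint8_t* dst_a,
                                         int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_a[x] = src_argb[4 * x + 3];
  }
}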
3938 
3939 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
3940 static const uvec8 kShuffleAlphaShort_AVX2 = {
3941     3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
3942     11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
3943 
3944 void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
3945                               uint8_t* dst_a,
3946                               int width) {
3947   asm volatile(
3948       "vmovdqa    %3,%%ymm4                      \n"
3949       "vbroadcastf128 %4,%%ymm5                  \n"
3950 
3951       LABELALIGN
3952       "1:                                        \n"
3953       "vmovdqu   (%0), %%ymm0                    \n"
3954       "vmovdqu   0x20(%0), %%ymm1                \n"
3955       "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"  // vpsrld $0x18, %%ymm0
3956       "vpshufb    %%ymm5,%%ymm1,%%ymm1           \n"
3957       "vmovdqu   0x40(%0), %%ymm2                \n"
3958       "vmovdqu   0x60(%0), %%ymm3                \n"
3959       "lea       0x80(%0), %0                    \n"
3960       "vpackssdw  %%ymm1, %%ymm0, %%ymm0         \n"  // mutates
3961       "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
3962       "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
3963       "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // mutates
3964       "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
3965       "vpermd     %%ymm0,%%ymm4,%%ymm0           \n"  // unmutate.
3966       "vmovdqu    %%ymm0,(%1)                    \n"
3967       "lea       0x20(%1),%1                     \n"
3968       "sub        $0x20, %2                      \n"
3969       "jg         1b                             \n"
3970       "vzeroupper                                \n"
3971       : "+r"(src_argb),               // %0
3972         "+r"(dst_a),                  // %1
3973         "+rm"(width)                  // %2
3974       : "m"(kPermdARGBToY_AVX),       // %3
3975         "m"(kShuffleAlphaShort_AVX2)  // %4
3976       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
3977 }
3978 #endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
3979 
3980 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3981 // width in pixels
3982 void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
3983   asm volatile(
3984       "pcmpeqb   %%xmm0,%%xmm0                   \n"
3985       "pslld     $0x18,%%xmm0                    \n"
3986       "pcmpeqb   %%xmm1,%%xmm1                   \n"
3987       "psrld     $0x8,%%xmm1                     \n"
3988 
3989       LABELALIGN
3990       "1:                                        \n"
3991       "movq      (%0),%%xmm2                     \n"
3992       "lea       0x8(%0),%0                      \n"
3993       "punpcklbw %%xmm2,%%xmm2                   \n"
3994       "punpckhwd %%xmm2,%%xmm3                   \n"
3995       "punpcklwd %%xmm2,%%xmm2                   \n"
3996       "movdqu    (%1),%%xmm4                     \n"
3997       "movdqu    0x10(%1),%%xmm5                 \n"
3998       "pand      %%xmm0,%%xmm2                   \n"
3999       "pand      %%xmm0,%%xmm3                   \n"
4000       "pand      %%xmm1,%%xmm4                   \n"
4001       "pand      %%xmm1,%%xmm5                   \n"
4002       "por       %%xmm4,%%xmm2                   \n"
4003       "por       %%xmm5,%%xmm3                   \n"
4004       "movdqu    %%xmm2,(%1)                     \n"
4005       "movdqu    %%xmm3,0x10(%1)                 \n"
4006       "lea       0x20(%1),%1                     \n"
4007       "sub       $0x8,%2                         \n"
4008       "jg        1b                              \n"
4009       : "+r"(src),   // %0
4010         "+r"(dst),   // %1
4011         "+r"(width)  // %2
4012       :
4013       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4014 }
4015 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
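
// Scalar sketch (illustrative): the unpacks above splat each Y byte across
// a dword so the 0xff000000 mask can drop it into the alpha channel:
static void ARGBCopyYToAlphaRow_C_Sketch(const uint8_t* src,
                                         uint8_t* dst,
                                         int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[4 * x + 3] = src[x];
  }
}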
4016 
4017 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
4018 // width in pixels
4019 void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
4020   asm volatile(
4021       "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
4022       "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
4023 
4024       LABELALIGN
4025       "1:                                        \n"
4026       "vpmovzxbd (%0),%%ymm1                     \n"
4027       "vpmovzxbd 0x8(%0),%%ymm2                  \n"
4028       "lea       0x10(%0),%0                     \n"
4029       "vpslld    $0x18,%%ymm1,%%ymm1             \n"
4030       "vpslld    $0x18,%%ymm2,%%ymm2             \n"
4031       "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1       \n"
4032       "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2   \n"
4033       "vmovdqu   %%ymm1,(%1)                     \n"
4034       "vmovdqu   %%ymm2,0x20(%1)                 \n"
4035       "lea       0x40(%1),%1                     \n"
4036       "sub       $0x10,%2                        \n"
4037       "jg        1b                              \n"
4038       "vzeroupper                                \n"
4039       : "+r"(src),   // %0
4040         "+r"(dst),   // %1
4041         "+r"(width)  // %2
4042       :
4043       : "memory", "cc", "xmm0", "xmm1", "xmm2");
4044 }
4045 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
4046 
4047 #ifdef HAS_SETROW_X86
4048 void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
4049   size_t width_tmp = (size_t)(width >> 2);
4050   const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
4051   asm volatile(
4052 
4053       "rep stosl                      \n"
4054       : "+D"(dst),       // %0
4055         "+c"(width_tmp)  // %1
4056       : "a"(v32)         // %2
4057       : "memory", "cc");
4058 }
4059 
4060 void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
4061   size_t width_tmp = (size_t)(width);
4062   asm volatile(
4063 
4064       "rep stosb                      \n"
4065       : "+D"(dst),       // %0
4066         "+c"(width_tmp)  // %1
4067       : "a"(v8)          // %2
4068       : "memory", "cc");
4069 }
4070 
4071 void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
4072   size_t width_tmp = (size_t)(width);
4073   asm volatile(
4074 
4075       "rep stosl                      \n"
4076       : "+D"(dst_argb),  // %0
4077         "+c"(width_tmp)  // %1
4078       : "a"(v32)         // %2
4079       : "memory", "cc");
4080 }
4081 #endif  // HAS_SETROW_X86
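
// Scalar sketch (illustrative): v8 * 0x01010101 replicates one byte into
// all four lanes of a dword (e.g. 0x5a -> 0x5a5a5a5a), which rep stosl
// then stores width/4 times; the byte-wise equivalent is:
static void SetRow_C_Sketch(uint8_t* dst, uint8_t v8, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = v8;
  }
}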
4082 
4083 #ifdef HAS_YUY2TOYROW_SSE2
4084 void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
4085   asm volatile(
4086       "pcmpeqb   %%xmm5,%%xmm5                   \n"
4087       "psrlw     $0x8,%%xmm5                     \n"
4088 
4089       LABELALIGN
4090       "1:                                        \n"
4091       "movdqu    (%0),%%xmm0                     \n"
4092       "movdqu    0x10(%0),%%xmm1                 \n"
4093       "lea       0x20(%0),%0                     \n"
4094       "pand      %%xmm5,%%xmm0                   \n"
4095       "pand      %%xmm5,%%xmm1                   \n"
4096       "packuswb  %%xmm1,%%xmm0                   \n"
4097       "movdqu    %%xmm0,(%1)                     \n"
4098       "lea       0x10(%1),%1                     \n"
4099       "sub       $0x10,%2                        \n"
4100       "jg        1b                              \n"
4101       : "+r"(src_yuy2),  // %0
4102         "+r"(dst_y),     // %1
4103         "+r"(width)      // %2
4104       :
4105       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4106 }
4107 
4108 void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
4109                       int stride_yuy2,
4110                       uint8_t* dst_u,
4111                       uint8_t* dst_v,
4112                       int width) {
4113   asm volatile(
4114       "pcmpeqb   %%xmm5,%%xmm5                   \n"
4115       "psrlw     $0x8,%%xmm5                     \n"
4116       "sub       %1,%2                           \n"
4117 
4118       LABELALIGN
4119       "1:                                        \n"
4120       "movdqu    (%0),%%xmm0                     \n"
4121       "movdqu    0x10(%0),%%xmm1                 \n"
4122       "movdqu    0x00(%0,%4,1),%%xmm2            \n"
4123       "movdqu    0x10(%0,%4,1),%%xmm3            \n"
4124       "lea       0x20(%0),%0                     \n"
4125       "pavgb     %%xmm2,%%xmm0                   \n"
4126       "pavgb     %%xmm3,%%xmm1                   \n"
4127       "psrlw     $0x8,%%xmm0                     \n"
4128       "psrlw     $0x8,%%xmm1                     \n"
4129       "packuswb  %%xmm1,%%xmm0                   \n"
4130       "movdqa    %%xmm0,%%xmm1                   \n"
4131       "pand      %%xmm5,%%xmm0                   \n"
4132       "packuswb  %%xmm0,%%xmm0                   \n"
4133       "psrlw     $0x8,%%xmm1                     \n"
4134       "packuswb  %%xmm1,%%xmm1                   \n"
4135       "movq      %%xmm0,(%1)                     \n"
4136       "movq      %%xmm1,0x00(%1,%2,1)            \n"
4137       "lea       0x8(%1),%1                      \n"
4138       "sub       $0x10,%3                        \n"
4139       "jg        1b                              \n"
4140       : "+r"(src_yuy2),               // %0
4141         "+r"(dst_u),                  // %1
4142         "+r"(dst_v),                  // %2
4143         "+r"(width)                   // %3
4144       : "r"((intptr_t)(stride_yuy2))  // %4
4145       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4146 }
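
// Scalar reference sketch (illustrative; cf. YUY2ToUVRow_C in
// row_common.cc): YUY2 packs two pixels as Y0 U Y1 V. pavgb above averages
// the current and next row with rounding, giving one U and one V per 2x2
// block:
static void YUY2ToUVRow_C_Sketch(const uint8_t* src_yuy2,
                                 int stride_yuy2,
                                 uint8_t* dst_u,
                                 uint8_t* dst_v,
                                 int width) {
  const uint8_t* next = src_yuy2 + stride_yuy2;
  int x;
  for (x = 0; x < width; x += 2) {
    *dst_u++ = (uint8_t)((src_yuy2[1] + next[1] + 1) >> 1);
    *dst_v++ = (uint8_t)((src_yuy2[3] + next[3] + 1) >> 1);
    src_yuy2 += 4;
    next += 4;
  }
}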
4147 
4148 void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
4149                          uint8_t* dst_u,
4150                          uint8_t* dst_v,
4151                          int width) {
4152   asm volatile(
4153       "pcmpeqb   %%xmm5,%%xmm5                   \n"
4154       "psrlw     $0x8,%%xmm5                     \n"
4155       "sub       %1,%2                           \n"
4156 
4157       LABELALIGN
4158       "1:                                        \n"
4159       "movdqu    (%0),%%xmm0                     \n"
4160       "movdqu    0x10(%0),%%xmm1                 \n"
4161       "lea       0x20(%0),%0                     \n"
4162       "psrlw     $0x8,%%xmm0                     \n"
4163       "psrlw     $0x8,%%xmm1                     \n"
4164       "packuswb  %%xmm1,%%xmm0                   \n"
4165       "movdqa    %%xmm0,%%xmm1                   \n"
4166       "pand      %%xmm5,%%xmm0                   \n"
4167       "packuswb  %%xmm0,%%xmm0                   \n"
4168       "psrlw     $0x8,%%xmm1                     \n"
4169       "packuswb  %%xmm1,%%xmm1                   \n"
4170       "movq      %%xmm0,(%1)                     \n"
4171       "movq      %%xmm1,0x00(%1,%2,1)            \n"
4172       "lea       0x8(%1),%1                      \n"
4173       "sub       $0x10,%3                        \n"
4174       "jg        1b                              \n"
4175       : "+r"(src_yuy2),  // %0
4176         "+r"(dst_u),     // %1
4177         "+r"(dst_v),     // %2
4178         "+r"(width)      // %3
4179       :
4180       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4181 }
4182 
4183 void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
4184   asm volatile(
4185 
4186       LABELALIGN
4187       "1:                                        \n"
4188       "movdqu    (%0),%%xmm0                     \n"
4189       "movdqu    0x10(%0),%%xmm1                 \n"
4190       "lea       0x20(%0),%0                     \n"
4191       "psrlw     $0x8,%%xmm0                     \n"
4192       "psrlw     $0x8,%%xmm1                     \n"
4193       "packuswb  %%xmm1,%%xmm0                   \n"
4194       "movdqu    %%xmm0,(%1)                     \n"
4195       "lea       0x10(%1),%1                     \n"
4196       "sub       $0x10,%2                        \n"
4197       "jg        1b                              \n"
4198       : "+r"(src_uyvy),  // %0
4199         "+r"(dst_y),     // %1
4200         "+r"(width)      // %2
4201       :
4202       : "memory", "cc", "xmm0", "xmm1");
4203 }
4204 
4205 void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
4206                       int stride_uyvy,
4207                       uint8_t* dst_u,
4208                       uint8_t* dst_v,
4209                       int width) {
4210   asm volatile(
4211       "pcmpeqb   %%xmm5,%%xmm5                   \n"
4212       "psrlw     $0x8,%%xmm5                     \n"
4213       "sub       %1,%2                           \n"
4214 
4215       LABELALIGN
4216       "1:                                        \n"
4217       "movdqu    (%0),%%xmm0                     \n"
4218       "movdqu    0x10(%0),%%xmm1                 \n"
4219       "movdqu    0x00(%0,%4,1),%%xmm2            \n"
4220       "movdqu    0x10(%0,%4,1),%%xmm3            \n"
4221       "lea       0x20(%0),%0                     \n"
4222       "pavgb     %%xmm2,%%xmm0                   \n"
4223       "pavgb     %%xmm3,%%xmm1                   \n"
4224       "pand      %%xmm5,%%xmm0                   \n"
4225       "pand      %%xmm5,%%xmm1                   \n"
4226       "packuswb  %%xmm1,%%xmm0                   \n"
4227       "movdqa    %%xmm0,%%xmm1                   \n"
4228       "pand      %%xmm5,%%xmm0                   \n"
4229       "packuswb  %%xmm0,%%xmm0                   \n"
4230       "psrlw     $0x8,%%xmm1                     \n"
4231       "packuswb  %%xmm1,%%xmm1                   \n"
4232       "movq      %%xmm0,(%1)                     \n"
4233       "movq      %%xmm1,0x00(%1,%2,1)            \n"
4234       "lea       0x8(%1),%1                      \n"
4235       "sub       $0x10,%3                        \n"
4236       "jg        1b                              \n"
4237       : "+r"(src_uyvy),               // %0
4238         "+r"(dst_u),                  // %1
4239         "+r"(dst_v),                  // %2
4240         "+r"(width)                   // %3
4241       : "r"((intptr_t)(stride_uyvy))  // %4
4242       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4243 }
4244 
4245 void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
4246                          uint8_t* dst_u,
4247                          uint8_t* dst_v,
4248                          int width) {
4249   asm volatile(
4250       "pcmpeqb   %%xmm5,%%xmm5                   \n"
4251       "psrlw     $0x8,%%xmm5                     \n"
4252       "sub       %1,%2                           \n"
4253 
4254       LABELALIGN
4255       "1:                                        \n"
4256       "movdqu    (%0),%%xmm0                     \n"
4257       "movdqu    0x10(%0),%%xmm1                 \n"
4258       "lea       0x20(%0),%0                     \n"
4259       "pand      %%xmm5,%%xmm0                   \n"
4260       "pand      %%xmm5,%%xmm1                   \n"
4261       "packuswb  %%xmm1,%%xmm0                   \n"
4262       "movdqa    %%xmm0,%%xmm1                   \n"
4263       "pand      %%xmm5,%%xmm0                   \n"
4264       "packuswb  %%xmm0,%%xmm0                   \n"
4265       "psrlw     $0x8,%%xmm1                     \n"
4266       "packuswb  %%xmm1,%%xmm1                   \n"
4267       "movq      %%xmm0,(%1)                     \n"
4268       "movq      %%xmm1,0x00(%1,%2,1)            \n"
4269       "lea       0x8(%1),%1                      \n"
4270       "sub       $0x10,%3                        \n"
4271       "jg        1b                              \n"
4272       : "+r"(src_uyvy),  // %0
4273         "+r"(dst_u),     // %1
4274         "+r"(dst_v),     // %2
4275         "+r"(width)      // %3
4276       :
4277       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4278 }
4279 #endif  // HAS_YUY2TOYROW_SSE2
4280 
4281 #ifdef HAS_YUY2TOYROW_AVX2
4282 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
4283   asm volatile(
4284       "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
4285       "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
4286 
4287       LABELALIGN
4288       "1:                                        \n"
4289       "vmovdqu   (%0),%%ymm0                     \n"
4290       "vmovdqu   0x20(%0),%%ymm1                 \n"
4291       "lea       0x40(%0),%0                     \n"
4292       "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
4293       "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
4294       "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
4295       "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
4296       "vmovdqu   %%ymm0,(%1)                     \n"
4297       "lea       0x20(%1),%1                     \n"
4298       "sub       $0x20,%2                        \n"
4299       "jg        1b                              \n"
4300       "vzeroupper                                \n"
4301       : "+r"(src_yuy2),  // %0
4302         "+r"(dst_y),     // %1
4303         "+r"(width)      // %2
4304       :
4305       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4306 }
4307 
4308 void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
4309                       int stride_yuy2,
4310                       uint8_t* dst_u,
4311                       uint8_t* dst_v,
4312                       int width) {
4313   asm volatile(
4314       "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
4315       "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
4316       "sub       %1,%2                           \n"
4317 
4318       LABELALIGN
4319       "1:                                        \n"
4320       "vmovdqu   (%0),%%ymm0                     \n"
4321       "vmovdqu   0x20(%0),%%ymm1                 \n"
4322       "vpavgb    0x00(%0,%4,1),%%ymm0,%%ymm0     \n"
4323       "vpavgb    0x20(%0,%4,1),%%ymm1,%%ymm1     \n"
4324       "lea       0x40(%0),%0                     \n"
4325       "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
4326       "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
4327       "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
4328       "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
4329       "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
4330       "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
4331       "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
4332       "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
4333       "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
4334       "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
4335       "vextractf128 $0x0,%%ymm1,(%1)             \n"
4336       "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
4337       "lea       0x10(%1),%1                     \n"
4338       "sub       $0x20,%3                        \n"
4339       "jg        1b                              \n"
4340       "vzeroupper                                \n"
4341       : "+r"(src_yuy2),               // %0
4342         "+r"(dst_u),                  // %1
4343         "+r"(dst_v),                  // %2
4344         "+r"(width)                   // %3
4345       : "r"((intptr_t)(stride_yuy2))  // %4
4346       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4347 }
4348 
4349 void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
4350                          uint8_t* dst_u,
4351                          uint8_t* dst_v,
4352                          int width) {
4353   asm volatile(
4354       "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
4355       "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
4356       "sub       %1,%2                           \n"
4357 
4358       LABELALIGN
4359       "1:                                        \n"
4360       "vmovdqu   (%0),%%ymm0                     \n"
4361       "vmovdqu   0x20(%0),%%ymm1                 \n"
4362       "lea       0x40(%0),%0                     \n"
4363       "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
4364       "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
4365       "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
4366       "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
4367       "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
4368       "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
4369       "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
4370       "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
4371       "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
4372       "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
4373       "vextractf128 $0x0,%%ymm1,(%1)             \n"
4374       "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
4375       "lea       0x10(%1),%1                     \n"
4376       "sub       $0x20,%3                        \n"
4377       "jg        1b                              \n"
4378       "vzeroupper                                \n"
4379       : "+r"(src_yuy2),  // %0
4380         "+r"(dst_u),     // %1
4381         "+r"(dst_v),     // %2
4382         "+r"(width)      // %3
4383       :
4384       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4385 }
4386 
4387 void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
4388   asm volatile(
4389 
4390       LABELALIGN
4391       "1:                                        \n"
4392       "vmovdqu   (%0),%%ymm0                     \n"
4393       "vmovdqu   0x20(%0),%%ymm1                 \n"
4394       "lea       0x40(%0),%0                     \n"
4395       "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
4396       "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
4397       "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
4398       "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
4399       "vmovdqu   %%ymm0,(%1)                     \n"
4400       "lea       0x20(%1),%1                     \n"
4401       "sub       $0x20,%2                        \n"
4402       "jg        1b                              \n"
4403       "vzeroupper                                \n"
4404       : "+r"(src_uyvy),  // %0
4405         "+r"(dst_y),     // %1
4406         "+r"(width)      // %2
4407       :
4408       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4409 }
4410 void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
4411                       int stride_uyvy,
4412                       uint8_t* dst_u,
4413                       uint8_t* dst_v,
4414                       int width) {
4415   asm volatile(
4416       "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
4417       "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
4418       "sub       %1,%2                           \n"
4419 
4420       LABELALIGN
4421       "1:                                        \n"
4422       "vmovdqu   (%0),%%ymm0                     \n"
4423       "vmovdqu   0x20(%0),%%ymm1                 \n"
4424       "vpavgb    0x00(%0,%4,1),%%ymm0,%%ymm0     \n"
4425       "vpavgb    0x20(%0,%4,1),%%ymm1,%%ymm1     \n"
4426       "lea       0x40(%0),%0                     \n"
4427       "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
4428       "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
4429       "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
4430       "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
4431       "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
4432       "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
4433       "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
4434       "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
4435       "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
4436       "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
4437       "vextractf128 $0x0,%%ymm1,(%1)             \n"
4438       "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
4439       "lea       0x10(%1),%1                     \n"
4440       "sub       $0x20,%3                        \n"
4441       "jg        1b                              \n"
4442       "vzeroupper                                \n"
4443       : "+r"(src_uyvy),               // %0
4444         "+r"(dst_u),                  // %1
4445         "+r"(dst_v),                  // %2
4446         "+r"(width)                   // %3
4447       : "r"((intptr_t)(stride_uyvy))  // %4
4448       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4449 }
4450 
4451 void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
4452                          uint8_t* dst_u,
4453                          uint8_t* dst_v,
4454                          int width) {
4455   asm volatile(
4456       "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
4457       "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
4458       "sub       %1,%2                           \n"
4459 
4460       LABELALIGN
4461       "1:                                        \n"
4462       "vmovdqu   (%0),%%ymm0                     \n"
4463       "vmovdqu   0x20(%0),%%ymm1                 \n"
4464       "lea       0x40(%0),%0                     \n"
4465       "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
4466       "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
4467       "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
4468       "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
4469       "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
4470       "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
4471       "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
4472       "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
4473       "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
4474       "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
4475       "vextractf128 $0x0,%%ymm1,(%1)             \n"
4476       "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
4477       "lea      0x10(%1),%1                      \n"
4478       "sub       $0x20,%3                        \n"
4479       "jg        1b                              \n"
4480       "vzeroupper                                \n"
4481       : "+r"(src_uyvy),  // %0
4482         "+r"(dst_u),     // %1
4483         "+r"(dst_v),     // %2
4484         "+r"(width)      // %3
4485       :
4486       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4487 }
4488 #endif  // HAS_YUY2TOYROW_AVX2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};

// Blend 4 pixels at a time, with a 1 pixel loop for the remainder.
void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
                        const uint8_t* src_argb1,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(
      "pcmpeqb   %%xmm7,%%xmm7                   \n"
      "psrlw     $0xf,%%xmm7                     \n"
      "pcmpeqb   %%xmm6,%%xmm6                   \n"
      "psrlw     $0x8,%%xmm6                     \n"
      "pcmpeqb   %%xmm5,%%xmm5                   \n"
      "psllw     $0x8,%%xmm5                     \n"
      "pcmpeqb   %%xmm4,%%xmm4                   \n"
      "pslld     $0x18,%%xmm4                    \n"
      "sub       $0x4,%3                         \n"
      "jl        49f                             \n"

      // 4 pixel loop.
      LABELALIGN
      "40:                                       \n"
      "movdqu    (%0),%%xmm3                     \n"
      "lea       0x10(%0),%0                     \n"
      "movdqa    %%xmm3,%%xmm0                   \n"
      "pxor      %%xmm4,%%xmm3                   \n"
      "movdqu    (%1),%%xmm2                     \n"
      "pshufb    %4,%%xmm3                       \n"
      "pand      %%xmm6,%%xmm2                   \n"
      "paddw     %%xmm7,%%xmm3                   \n"
      "pmullw    %%xmm3,%%xmm2                   \n"
      "movdqu    (%1),%%xmm1                     \n"
      "lea       0x10(%1),%1                     \n"
      "psrlw     $0x8,%%xmm1                     \n"
      "por       %%xmm4,%%xmm0                   \n"
      "pmullw    %%xmm3,%%xmm1                   \n"
      "psrlw     $0x8,%%xmm2                     \n"
      "paddusb   %%xmm2,%%xmm0                   \n"
      "pand      %%xmm5,%%xmm1                   \n"
      "paddusb   %%xmm1,%%xmm0                   \n"
      "movdqu    %%xmm0,(%2)                     \n"
      "lea       0x10(%2),%2                     \n"
      "sub       $0x4,%3                         \n"
      "jge       40b                             \n"

      "49:                                       \n"
      "add       $0x3,%3                         \n"
      "jl        99f                             \n"

      // 1 pixel loop.
      "91:                                       \n"
      "movd      (%0),%%xmm3                     \n"
      "lea       0x4(%0),%0                      \n"
      "movdqa    %%xmm3,%%xmm0                   \n"
      "pxor      %%xmm4,%%xmm3                   \n"
      "movd      (%1),%%xmm2                     \n"
      "pshufb    %4,%%xmm3                       \n"
      "pand      %%xmm6,%%xmm2                   \n"
      "paddw     %%xmm7,%%xmm3                   \n"
      "pmullw    %%xmm3,%%xmm2                   \n"
      "movd      (%1),%%xmm1                     \n"
      "lea       0x4(%1),%1                      \n"
      "psrlw     $0x8,%%xmm1                     \n"
      "por       %%xmm4,%%xmm0                   \n"
      "pmullw    %%xmm3,%%xmm1                   \n"
      "psrlw     $0x8,%%xmm2                     \n"
      "paddusb   %%xmm2,%%xmm0                   \n"
      "pand      %%xmm5,%%xmm1                   \n"
      "paddusb   %%xmm1,%%xmm0                   \n"
      "movd      %%xmm0,(%2)                     \n"
      "lea       0x4(%2),%2                      \n"
      "sub       $0x1,%3                         \n"
      "jge       91b                             \n"
      "99:                                       \n"
      : "+r"(src_argb0),    // %0
        "+r"(src_argb1),    // %1
        "+r"(dst_argb),     // %2
        "+r"(width)         // %3
      : "m"(kShuffleAlpha)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBBLENDROW_SSSE3
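
// Illustrative scalar sketch (my reading of the kernel above, not upstream
// code): src_argb0 is treated as premultiplied foreground and src_argb1 as
// background, giving dst = min(255, fg + bg * (256 - a) / 256) per color
// channel, with the destination alpha forced to opaque.
static void ARGBBlendRow_ScalarSketch(const uint8_t* src_argb0,
                                      const uint8_t* src_argb1,
                                      uint8_t* dst_argb,
                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ia = 256 - src_argb0[3];  // (255 - a) + 1, as built with xmm7.
    int c;
    for (c = 0; c < 3; ++c) {
      uint32_t v = src_argb0[c] + ((src_argb1[c] * ia) >> 8);
      dst_argb[c] = (uint8_t)(v > 255 ? 255 : v);  // paddusb saturation.
    }
    dst_argb[3] = 255;  // por with xmm4 forces the alpha byte.
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}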

#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
void BlendPlaneRow_SSSE3(const uint8_t* src0,
                         const uint8_t* src1,
                         const uint8_t* alpha,
                         uint8_t* dst,
                         int width) {
  asm volatile(
      "pcmpeqb    %%xmm5,%%xmm5                  \n"
      "psllw      $0x8,%%xmm5                    \n"
      "mov        $0x80808080,%%eax              \n"
      "movd       %%eax,%%xmm6                   \n"
      "pshufd     $0x0,%%xmm6,%%xmm6             \n"
      "mov        $0x807f807f,%%eax              \n"
      "movd       %%eax,%%xmm7                   \n"
      "pshufd     $0x0,%%xmm7,%%xmm7             \n"
      "sub        %2,%0                          \n"
      "sub        %2,%1                          \n"
      "sub        %2,%3                          \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq       (%2),%%xmm0                    \n"
      "punpcklbw  %%xmm0,%%xmm0                  \n"
      "pxor       %%xmm5,%%xmm0                  \n"
      "movq       (%0,%2,1),%%xmm1               \n"
      "movq       (%1,%2,1),%%xmm2               \n"
      "punpcklbw  %%xmm2,%%xmm1                  \n"
      "psubb      %%xmm6,%%xmm1                  \n"
      "pmaddubsw  %%xmm1,%%xmm0                  \n"
      "paddw      %%xmm7,%%xmm0                  \n"
      "psrlw      $0x8,%%xmm0                    \n"
      "packuswb   %%xmm0,%%xmm0                  \n"
      "movq       %%xmm0,(%3,%2,1)               \n"
      "lea        0x8(%2),%2                     \n"
      "sub        $0x8,%4                        \n"
      "jg        1b                              \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
}
#endif  // HAS_BLENDPLANEROW_SSSE3
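
// Illustrative scalar sketch (not upstream code) of the unsigned form of the
// math in the comment above. The kernel itself uses the signed form: one
// pmaddubsw operand must be signed, so both sources are re-centered by
// subtracting 128 (psubb with 0x80808080), and the 0x807f bias (32768 + 127)
// restores the offset and adds the rounding term.
static void BlendPlaneRow_ScalarSketch(const uint8_t* src0,
                                       const uint8_t* src1,
                                       const uint8_t* alpha,
                                       uint8_t* dst,
                                       int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = (uint8_t)(
        (src0[x] * alpha[x] + src1[x] * (255 - alpha[x]) + 255) >> 8);
  }
}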

#ifdef HAS_BLENDPLANEROW_AVX2
// Blend 32 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
void BlendPlaneRow_AVX2(const uint8_t* src0,
                        const uint8_t* src1,
                        const uint8_t* alpha,
                        uint8_t* dst,
                        int width) {
  asm volatile(
      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
      "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
      "mov        $0x80808080,%%eax              \n"
      "vmovd      %%eax,%%xmm6                   \n"
      "vbroadcastss %%xmm6,%%ymm6                \n"
      "mov        $0x807f807f,%%eax              \n"
      "vmovd      %%eax,%%xmm7                   \n"
      "vbroadcastss %%xmm7,%%ymm7                \n"
      "sub        %2,%0                          \n"
      "sub        %2,%1                          \n"
      "sub        %2,%3                          \n"

      // 32 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%2),%%ymm0                    \n"
      "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
      "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
      "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
      "vmovdqu    (%0,%2,1),%%ymm1               \n"
      "vmovdqu    (%1,%2,1),%%ymm2               \n"
      "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
      "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
      "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
      "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
      "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
      "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
      "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
      "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
      "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
      "vmovdqu    %%ymm0,(%3,%2,1)               \n"
      "lea        0x20(%2),%2                    \n"
      "sub        $0x20,%4                       \n"
      "jg        1b                              \n"
      "vzeroupper                                \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_BLENDPLANEROW_AVX2

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
                                     7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
                                     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
// Attenuate 4 pixels at a time.
void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "pcmpeqb   %%xmm3,%%xmm3                   \n"
      "pslld     $0x18,%%xmm3                    \n"
      "movdqa    %3,%%xmm4                       \n"
      "movdqa    %4,%%xmm5                       \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "pshufb    %%xmm4,%%xmm0                   \n"
      "movdqu    (%0),%%xmm1                     \n"
      "punpcklbw %%xmm1,%%xmm1                   \n"
      "pmulhuw   %%xmm1,%%xmm0                   \n"
      "movdqu    (%0),%%xmm1                     \n"
      "pshufb    %%xmm5,%%xmm1                   \n"
      "movdqu    (%0),%%xmm2                     \n"
      "punpckhbw %%xmm2,%%xmm2                   \n"
      "pmulhuw   %%xmm2,%%xmm1                   \n"
      "movdqu    (%0),%%xmm2                     \n"
      "lea       0x10(%0),%0                     \n"
      "pand      %%xmm3,%%xmm2                   \n"
      "psrlw     $0x8,%%xmm0                     \n"
      "psrlw     $0x8,%%xmm1                     \n"
      "packuswb  %%xmm1,%%xmm0                   \n"
      "por       %%xmm2,%%xmm0                   \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "lea       0x10(%1),%1                     \n"
      "sub       $0x4,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src_argb),       // %0
        "+r"(dst_argb),       // %1
        "+r"(width)           // %2
      : "m"(kShuffleAlpha0),  // %3
        "m"(kShuffleAlpha1)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3
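
// Illustrative scalar sketch (not upstream code) of the attenuate math: both
// the color byte and its alpha are widened by byte duplication
// (v -> v | v << 8, i.e. v * 257), and pmulhuw plus the final psrlw keep the
// top 8 bits, so each channel becomes (f*257 * a*257) >> 24, a close
// approximation of f * a / 255. Alpha itself passes through via the mask.
static uint8_t Attenuate_ScalarSketch(uint8_t f, uint8_t a) {
  return (uint8_t)(
      ((uint32_t)(f | (f << 8)) * (uint32_t)(a | (a << 8))) >> 24);
}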

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
                                         128u, 128u, 14u,  15u, 14u, 15u,
                                         14u,  15u,  128u, 128u};
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
      "vpslld     $0x18,%%ymm5,%%ymm5            \n"
      "sub        %0,%1                          \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm6                    \n"
      "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
      "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
      "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
      "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
      "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
      "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
      "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
      "vmovdqu    %%ymm0,0x00(%0,%1,1)           \n"
      "lea       0x20(%0),%0                     \n"
      "sub        $0x8,%2                        \n"
      "jg        1b                              \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),          // %0
        "+r"(dst_argb),          // %1
        "+r"(width)              // %2
      : "m"(kShuffleAlpha_AVX2)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width) {
  uintptr_t alpha;
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movzb     0x03(%0),%3                     \n"
      "punpcklbw %%xmm0,%%xmm0                   \n"
      "movd      0x00(%4,%3,4),%%xmm2            \n"
      "movzb     0x07(%0),%3                     \n"
      "movd      0x00(%4,%3,4),%%xmm3            \n"
      "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
      "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
      "movlhps   %%xmm3,%%xmm2                   \n"
      "pmulhuw   %%xmm2,%%xmm0                   \n"
      "movdqu    (%0),%%xmm1                     \n"
      "movzb     0x0b(%0),%3                     \n"
      "punpckhbw %%xmm1,%%xmm1                   \n"
      "movd      0x00(%4,%3,4),%%xmm2            \n"
      "movzb     0x0f(%0),%3                     \n"
      "movd      0x00(%4,%3,4),%%xmm3            \n"
      "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
      "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
      "movlhps   %%xmm3,%%xmm2                   \n"
      "pmulhuw   %%xmm2,%%xmm1                   \n"
      "lea       0x10(%0),%0                     \n"
      "packuswb  %%xmm1,%%xmm0                   \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "lea       0x10(%1),%1                     \n"
      "sub       $0x4,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src_argb),     // %0
        "+r"(dst_argb),     // %1
        "+r"(width),        // %2
        "=&r"(alpha)        // %3
      : "r"(fixed_invtbl8)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2
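
// Illustrative scalar sketch (not upstream code): unattenuate multiplies each
// color by a fixed-point reciprocal of its alpha looked up in fixed_invtbl8
// (defined elsewhere in libyuv), recovering roughly f * 255 / a with
// saturation. The exact table packing and rounding differ slightly; this is
// the idealized math only.
static uint8_t Unattenuate_ScalarSketch(uint8_t f, uint8_t a) {
  uint32_t v;
  if (a == 0) {
    return f;  // The kernel's a == 0 behavior comes from the table's 0 entry.
  }
  v = ((uint32_t)f * 255 + a / 2) / a;
  return (uint8_t)(v > 255 ? 255 : v);
}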

#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// Unattenuate 8 pixels at a time.
void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width) {
  uintptr_t alpha;
  asm volatile(
      "sub        %0,%1                          \n"
      "vbroadcastf128 %5,%%ymm5                  \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      // replace VPGATHER
      "movzb     0x03(%0),%3                     \n"
      "vmovd     0x00(%4,%3,4),%%xmm0            \n"
      "movzb     0x07(%0),%3                     \n"
      "vmovd     0x00(%4,%3,4),%%xmm1            \n"
      "movzb     0x0b(%0),%3                     \n"
      "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
      "vmovd     0x00(%4,%3,4),%%xmm2            \n"
      "movzb     0x0f(%0),%3                     \n"
      "vmovd     0x00(%4,%3,4),%%xmm3            \n"
      "movzb     0x13(%0),%3                     \n"
      "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
      "vmovd     0x00(%4,%3,4),%%xmm0            \n"
      "movzb     0x17(%0),%3                     \n"
      "vmovd     0x00(%4,%3,4),%%xmm1            \n"
      "movzb     0x1b(%0),%3                     \n"
      "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
      "vmovd     0x00(%4,%3,4),%%xmm2            \n"
      "movzb     0x1f(%0),%3                     \n"
      "vmovd     0x00(%4,%3,4),%%xmm3            \n"
      "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
      "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
      "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
      "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
      // end of VPGATHER

      "vmovdqu    (%0),%%ymm6                    \n"
      "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
      "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
      "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
      "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
      "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
      "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
      "vmovdqu    %%ymm0,0x00(%0,%1,1)           \n"
      "lea       0x20(%0),%0                     \n"
      "sub        $0x8,%2                        \n"
      "jg        1b                              \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),                 // %0
        "+r"(dst_argb),                 // %1
        "+r"(width),                    // %2
        "=&r"(alpha)                    // %3
      : "r"(fixed_invtbl8),             // %4
        "m"(kUnattenShuffleAlpha_AVX2)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBUNATTENUATEROW_AVX2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "movdqa    %3,%%xmm4                       \n"
      "movdqa    %4,%%xmm5                       \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "pmaddubsw %%xmm4,%%xmm0                   \n"
      "pmaddubsw %%xmm4,%%xmm1                   \n"
      "phaddw    %%xmm1,%%xmm0                   \n"
      "paddw     %%xmm5,%%xmm0                   \n"
      "psrlw     $0x7,%%xmm0                     \n"
      "packuswb  %%xmm0,%%xmm0                   \n"
      "movdqu    (%0),%%xmm2                     \n"
      "movdqu    0x10(%0),%%xmm3                 \n"
      "lea       0x20(%0),%0                     \n"
      "psrld     $0x18,%%xmm2                    \n"
      "psrld     $0x18,%%xmm3                    \n"
      "packuswb  %%xmm3,%%xmm2                   \n"
      "packuswb  %%xmm2,%%xmm2                   \n"
      "movdqa    %%xmm0,%%xmm3                   \n"
      "punpcklbw %%xmm0,%%xmm0                   \n"
      "punpcklbw %%xmm2,%%xmm3                   \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "punpcklwd %%xmm3,%%xmm0                   \n"
      "punpckhwd %%xmm3,%%xmm1                   \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "movdqu    %%xmm1,0x10(%1)                 \n"
      "lea       0x20(%1),%1                     \n"
      "sub       $0x8,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kAddYJ64)    // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBGRAYROW_SSSE3
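
// Illustrative scalar sketch (not upstream code) of the gray math above:
// kARGBToYJ holds the JPeg-range luma coefficients in 0.7 fixed point and
// kAddYJ64 is the rounding term, so y = (b*15 + g*75 + r*38 + 64) >> 7.
// The result is written back to B, G and R with alpha preserved.
static void ARGBGrayPixel_ScalarSketch(const uint8_t* src_argb,
                                       uint8_t* dst_argb) {
  uint8_t y = (uint8_t)(
      (src_argb[0] * 15 + src_argb[1] * 75 + src_argb[2] * 38 + 64) >> 7);
  dst_argb[0] = y;            // B
  dst_argb[1] = y;            // G
  dst_argb[2] = y;            // R
  dst_argb[3] = src_argb[3];  // A unchanged.
}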

#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone
static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                                   17, 68, 35, 0, 17, 68, 35, 0};

static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                                   22, 88, 45, 0, 22, 88, 45, 0};

static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                                   24, 98, 50, 0, 24, 98, 50, 0};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  asm volatile(
      "movdqa    %2,%%xmm2                       \n"
      "movdqa    %3,%%xmm3                       \n"
      "movdqa    %4,%%xmm4                       \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x10(%0),%%xmm6                 \n"
      "pmaddubsw %%xmm2,%%xmm0                   \n"
      "pmaddubsw %%xmm2,%%xmm6                   \n"
      "phaddw    %%xmm6,%%xmm0                   \n"
      "psrlw     $0x7,%%xmm0                     \n"
      "packuswb  %%xmm0,%%xmm0                   \n"
      "movdqu    (%0),%%xmm5                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "pmaddubsw %%xmm3,%%xmm5                   \n"
      "pmaddubsw %%xmm3,%%xmm1                   \n"
      "phaddw    %%xmm1,%%xmm5                   \n"
      "psrlw     $0x7,%%xmm5                     \n"
      "packuswb  %%xmm5,%%xmm5                   \n"
      "punpcklbw %%xmm5,%%xmm0                   \n"
      "movdqu    (%0),%%xmm5                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "pmaddubsw %%xmm4,%%xmm5                   \n"
      "pmaddubsw %%xmm4,%%xmm1                   \n"
      "phaddw    %%xmm1,%%xmm5                   \n"
      "psrlw     $0x7,%%xmm5                     \n"
      "packuswb  %%xmm5,%%xmm5                   \n"
      "movdqu    (%0),%%xmm6                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "psrld     $0x18,%%xmm6                    \n"
      "psrld     $0x18,%%xmm1                    \n"
      "packuswb  %%xmm1,%%xmm6                   \n"
      "packuswb  %%xmm6,%%xmm6                   \n"
      "punpcklbw %%xmm6,%%xmm5                   \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "punpcklwd %%xmm5,%%xmm0                   \n"
      "punpckhwd %%xmm5,%%xmm1                   \n"
      "movdqu    %%xmm0,(%0)                     \n"
      "movdqu    %%xmm1,0x10(%0)                 \n"
      "lea       0x20(%0),%0                     \n"
      "sub       $0x8,%1                         \n"
      "jg        1b                              \n"
      : "+r"(dst_argb),      // %0
        "+r"(width)          // %1
      : "m"(kARGBToSepiaB),  // %2
        "m"(kARGBToSepiaG),  // %3
        "m"(kARGBToSepiaR)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBSEPIAROW_SSSE3
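
// Illustrative scalar sketch (not upstream code) of the sepia math, using
// the coefficients from the tables above; memory order is B, G, R, A.
static void ARGBSepiaPixel_ScalarSketch(uint8_t* p) {
  int b = p[0], g = p[1], r = p[2];
  int nb = (b * 17 + g * 68 + r * 35) >> 7;
  int ng = (b * 22 + g * 88 + r * 45) >> 7;
  int nr = (b * 24 + g * 98 + r * 50) >> 7;
  p[0] = (uint8_t)(nb > 255 ? 255 : nb);  // packuswb saturates.
  p[1] = (uint8_t)(ng > 255 ? 255 : ng);
  p[2] = (uint8_t)(nr > 255 ? 255 : nr);
  // p[3] (alpha) is preserved by the psrld/punpck sequence.
}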

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              const int8_t* matrix_argb,
                              int width) {
  asm volatile(
      "movdqu    (%3),%%xmm5                     \n"
      "pshufd    $0x00,%%xmm5,%%xmm2             \n"
      "pshufd    $0x55,%%xmm5,%%xmm3             \n"
      "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
      "pshufd    $0xff,%%xmm5,%%xmm5             \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x10(%0),%%xmm7                 \n"
      "pmaddubsw %%xmm2,%%xmm0                   \n"
      "pmaddubsw %%xmm2,%%xmm7                   \n"
      "movdqu    (%0),%%xmm6                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "pmaddubsw %%xmm3,%%xmm6                   \n"
      "pmaddubsw %%xmm3,%%xmm1                   \n"
      "phaddsw   %%xmm7,%%xmm0                   \n"
      "phaddsw   %%xmm1,%%xmm6                   \n"
      "psraw     $0x6,%%xmm0                     \n"
      "psraw     $0x6,%%xmm6                     \n"
      "packuswb  %%xmm0,%%xmm0                   \n"
      "packuswb  %%xmm6,%%xmm6                   \n"
      "punpcklbw %%xmm6,%%xmm0                   \n"
      "movdqu    (%0),%%xmm1                     \n"
      "movdqu    0x10(%0),%%xmm7                 \n"
      "pmaddubsw %%xmm4,%%xmm1                   \n"
      "pmaddubsw %%xmm4,%%xmm7                   \n"
      "phaddsw   %%xmm7,%%xmm1                   \n"
      "movdqu    (%0),%%xmm6                     \n"
      "movdqu    0x10(%0),%%xmm7                 \n"
      "pmaddubsw %%xmm5,%%xmm6                   \n"
      "pmaddubsw %%xmm5,%%xmm7                   \n"
      "phaddsw   %%xmm7,%%xmm6                   \n"
      "psraw     $0x6,%%xmm1                     \n"
      "psraw     $0x6,%%xmm6                     \n"
      "packuswb  %%xmm1,%%xmm1                   \n"
      "packuswb  %%xmm6,%%xmm6                   \n"
      "punpcklbw %%xmm6,%%xmm1                   \n"
      "movdqa    %%xmm0,%%xmm6                   \n"
      "punpcklwd %%xmm1,%%xmm0                   \n"
      "punpckhwd %%xmm1,%%xmm6                   \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "movdqu    %%xmm6,0x10(%1)                 \n"
      "lea       0x20(%0),%0                     \n"
      "lea       0x20(%1),%1                     \n"
      "sub       $0x8,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      : "r"(matrix_argb)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
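
// Illustrative scalar sketch (not upstream code): each output channel is a
// dot product of the input B,G,R,A bytes with one 4-byte row of matrix_argb
// (signed, 6 fractional bits), arithmetically shifted right by 6 (psraw)
// and saturated to [0, 255] by the pack instructions.
static void ARGBColorMatrixPixel_ScalarSketch(const uint8_t* src,
                                              uint8_t* dst,
                                              const int8_t* matrix_argb) {
  int c;
  for (c = 0; c < 4; ++c) {
    int v = (src[0] * matrix_argb[c * 4 + 0] + src[1] * matrix_argb[c * 4 + 1] +
             src[2] * matrix_argb[c * 4 + 2] +
             src[3] * matrix_argb[c * 4 + 3]) >>
            6;
    dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}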

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile(
      "movd      %2,%%xmm2                       \n"
      "movd      %3,%%xmm3                       \n"
      "movd      %4,%%xmm4                       \n"
      "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
      "pshufd    $0x44,%%xmm2,%%xmm2             \n"
      "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
      "pshufd    $0x44,%%xmm3,%%xmm3             \n"
      "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
      "pshufd    $0x44,%%xmm4,%%xmm4             \n"
      "pxor      %%xmm5,%%xmm5                   \n"
      "pcmpeqb   %%xmm6,%%xmm6                   \n"
      "pslld     $0x18,%%xmm6                    \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "punpcklbw %%xmm5,%%xmm0                   \n"
      "pmulhuw   %%xmm2,%%xmm0                   \n"
      "movdqu    (%0),%%xmm1                     \n"
      "punpckhbw %%xmm5,%%xmm1                   \n"
      "pmulhuw   %%xmm2,%%xmm1                   \n"
      "pmullw    %%xmm3,%%xmm0                   \n"
      "movdqu    (%0),%%xmm7                     \n"
      "pmullw    %%xmm3,%%xmm1                   \n"
      "pand      %%xmm6,%%xmm7                   \n"
      "paddw     %%xmm4,%%xmm0                   \n"
      "paddw     %%xmm4,%%xmm1                   \n"
      "packuswb  %%xmm1,%%xmm0                   \n"
      "por       %%xmm7,%%xmm0                   \n"
      "movdqu    %%xmm0,(%0)                     \n"
      "lea       0x10(%0),%0                     \n"
      "sub       $0x4,%1                         \n"
      "jg        1b                              \n"
      : "+r"(dst_argb),       // %0
        "+r"(width)           // %1
      : "r"(scale),           // %2
        "r"(interval_size),   // %3
        "r"(interval_offset)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
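
// Illustrative scalar sketch (not upstream code): scale is a 16.16
// fixed-point factor (callers typically pass something near
// 65536 / interval_size), so each color channel is posterized to its
// interval while alpha is kept through the 0xff000000 mask.
static uint8_t Quantize_ScalarSketch(uint8_t v,
                                     int scale,
                                     int interval_size,
                                     int interval_offset) {
  return (uint8_t)(((v * scale) >> 16) * interval_size + interval_offset);
}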

#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
void ARGBShadeRow_SSE2(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  asm volatile(
      "movd      %3,%%xmm2                       \n"
      "punpcklbw %%xmm2,%%xmm2                   \n"
      "punpcklqdq %%xmm2,%%xmm2                  \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "lea       0x10(%0),%0                     \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "punpcklbw %%xmm0,%%xmm0                   \n"
      "punpckhbw %%xmm1,%%xmm1                   \n"
      "pmulhuw   %%xmm2,%%xmm0                   \n"
      "pmulhuw   %%xmm2,%%xmm1                   \n"
      "psrlw     $0x8,%%xmm0                     \n"
      "psrlw     $0x8,%%xmm1                     \n"
      "packuswb  %%xmm1,%%xmm0                   \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "lea       0x10(%1),%1                     \n"
      "sub       $0x4,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(value)       // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_ARGBSHADEROW_SSE2
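
// Illustrative scalar sketch (not upstream code): each channel of every
// pixel is scaled by the matching byte of 'value'. Both operands are
// widened by byte duplication, so dst = (v*257 * s*257) >> 24, roughly
// v * s / 255.
static uint8_t Shade_ScalarSketch(uint8_t v, uint8_t s) {
  return (uint8_t)(
      ((uint32_t)(v | (v << 8)) * (uint32_t)(s | (s << 8))) >> 24);
}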

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(

      "pxor      %%xmm5,%%xmm5                   \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "lea       0x10(%0),%0                     \n"
      "movdqu    (%1),%%xmm2                     \n"
      "lea       0x10(%1),%1                     \n"
      "movdqu    %%xmm0,%%xmm1                   \n"
      "movdqu    %%xmm2,%%xmm3                   \n"
      "punpcklbw %%xmm0,%%xmm0                   \n"
      "punpckhbw %%xmm1,%%xmm1                   \n"
      "punpcklbw %%xmm5,%%xmm2                   \n"
      "punpckhbw %%xmm5,%%xmm3                   \n"
      "pmulhuw   %%xmm2,%%xmm0                   \n"
      "pmulhuw   %%xmm3,%%xmm1                   \n"
      "packuswb  %%xmm1,%%xmm0                   \n"
      "movdqu    %%xmm0,(%2)                     \n"
      "lea       0x10(%2),%2                     \n"
      "sub       $0x4,%3                         \n"
      "jg        1b                              \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2
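
// Illustrative scalar sketch (not upstream code): one operand is widened by
// byte duplication (v -> v*257), the other zero-extended, and pmulhuw keeps
// the high 16 bits, so each channel is (v*257 * s) >> 16, roughly
// v * s / 255.
static uint8_t Multiply_ScalarSketch(uint8_t v, uint8_t s) {
  return (uint8_t)(((uint32_t)(v | (v << 8)) * (uint32_t)s) >> 16);
}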

#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(

      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm1                    \n"
      "lea        0x20(%0),%0                    \n"
      "vmovdqu    (%1),%%ymm3                    \n"
      "lea        0x20(%1),%1                    \n"
      "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
      "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
      "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
      "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
      "vmovdqu    %%ymm0,(%2)                    \n"
      "lea       0x20(%2),%2                     \n"
      "sub        $0x8,%3                        \n"
      "jg        1b                              \n"
      "vzeroupper                                \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc"
#if defined(__AVX2__)
        ,
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
      );
}
#endif  // HAS_ARGBMULTIPLYROW_AVX2

#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBAddRow_SSE2(const uint8_t* src_argb0,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "lea       0x10(%0),%0                     \n"
      "movdqu    (%1),%%xmm1                     \n"
      "lea       0x10(%1),%1                     \n"
      "paddusb   %%xmm1,%%xmm0                   \n"
      "movdqu    %%xmm0,(%2)                     \n"
      "lea       0x10(%2),%2                     \n"
      "sub       $0x4,%3                         \n"
      "jg        1b                              \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBADDROW_SSE2

#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_AVX2(const uint8_t* src_argb0,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm0                    \n"
      "lea        0x20(%0),%0                    \n"
      "vpaddusb   (%1),%%ymm0,%%ymm0             \n"
      "lea        0x20(%1),%1                    \n"
      "vmovdqu    %%ymm0,(%2)                    \n"
      "lea        0x20(%2),%2                    \n"
      "sub        $0x8,%3                        \n"
      "jg        1b                              \n"
      "vzeroupper                                \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0");
}
#endif  // HAS_ARGBADDROW_AVX2

#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "lea       0x10(%0),%0                     \n"
      "movdqu    (%1),%%xmm1                     \n"
      "lea       0x10(%1),%1                     \n"
      "psubusb   %%xmm1,%%xmm0                   \n"
      "movdqu    %%xmm0,(%2)                     \n"
      "lea       0x10(%2),%2                     \n"
      "sub       $0x4,%3                         \n"
      "jg        1b                              \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2

#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm0                    \n"
      "lea        0x20(%0),%0                    \n"
      "vpsubusb   (%1),%%ymm0,%%ymm0             \n"
      "lea        0x20(%1),%1                    \n"
      "vmovdqu    %%ymm0,(%2)                    \n"
      "lea        0x20(%2),%2                    \n"
      "sub        $0x8,%3                        \n"
      "jg         1b                             \n"
      "vzeroupper                                \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0");
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
void SobelXRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width) {
  asm volatile(
      "sub       %0,%1                           \n"
      "sub       %0,%2                           \n"
      "sub       %0,%3                           \n"
      "pxor      %%xmm5,%%xmm5                   \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq      (%0),%%xmm0                     \n"
      "movq      0x2(%0),%%xmm1                  \n"
      "punpcklbw %%xmm5,%%xmm0                   \n"
      "punpcklbw %%xmm5,%%xmm1                   \n"
      "psubw     %%xmm1,%%xmm0                   \n"
      "movq      0x00(%0,%1,1),%%xmm1            \n"
      "movq      0x02(%0,%1,1),%%xmm2            \n"
      "punpcklbw %%xmm5,%%xmm1                   \n"
      "punpcklbw %%xmm5,%%xmm2                   \n"
      "psubw     %%xmm2,%%xmm1                   \n"
      "movq      0x00(%0,%2,1),%%xmm2            \n"
      "movq      0x02(%0,%2,1),%%xmm3            \n"
      "punpcklbw %%xmm5,%%xmm2                   \n"
      "punpcklbw %%xmm5,%%xmm3                   \n"
      "psubw     %%xmm3,%%xmm2                   \n"
      "paddw     %%xmm2,%%xmm0                   \n"
      "paddw     %%xmm1,%%xmm0                   \n"
      "paddw     %%xmm1,%%xmm0                   \n"
      "pxor      %%xmm1,%%xmm1                   \n"
      "psubw     %%xmm0,%%xmm1                   \n"
      "pmaxsw    %%xmm1,%%xmm0                   \n"
      "packuswb  %%xmm0,%%xmm0                   \n"
      "movq      %%xmm0,0x00(%0,%3,1)            \n"
      "lea       0x8(%0),%0                      \n"
      "sub       $0x8,%4                         \n"
      "jg        1b                              \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(src_y2),      // %2
        "+r"(dst_sobelx),  // %3
        "+r"(width)        // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SOBELXROW_SSE2
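
// Illustrative scalar sketch (not upstream code) of one SobelX output:
// columns x and x+2 of the three input rows are differenced, the middle row
// weighted by 2, and the absolute value saturated to 8 bits. The
// pxor/psubw/pmaxsw triple in the kernel is the SSE2 absolute-value idiom.
static uint8_t SobelX_ScalarSketch(const uint8_t* src_y0,
                                   const uint8_t* src_y1,
                                   const uint8_t* src_y2,
                                   int x) {
  int sobel = (src_y0[x] - src_y0[x + 2]) + 2 * (src_y1[x] - src_y1[x + 2]) +
              (src_y2[x] - src_y2[x + 2]);
  if (sobel < 0) {
    sobel = -sobel;
  }
  return (uint8_t)(sobel > 255 ? 255 : sobel);
}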

#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
void SobelYRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
                    int width) {
  asm volatile(
      "sub       %0,%1                           \n"
      "sub       %0,%2                           \n"
      "pxor      %%xmm5,%%xmm5                   \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq      (%0),%%xmm0                     \n"
      "movq      0x00(%0,%1,1),%%xmm1            \n"
      "punpcklbw %%xmm5,%%xmm0                   \n"
      "punpcklbw %%xmm5,%%xmm1                   \n"
      "psubw     %%xmm1,%%xmm0                   \n"
      "movq      0x1(%0),%%xmm1                  \n"
      "movq      0x01(%0,%1,1),%%xmm2            \n"
      "punpcklbw %%xmm5,%%xmm1                   \n"
      "punpcklbw %%xmm5,%%xmm2                   \n"
      "psubw     %%xmm2,%%xmm1                   \n"
      "movq      0x2(%0),%%xmm2                  \n"
      "movq      0x02(%0,%1,1),%%xmm3            \n"
      "punpcklbw %%xmm5,%%xmm2                   \n"
      "punpcklbw %%xmm5,%%xmm3                   \n"
      "psubw     %%xmm3,%%xmm2                   \n"
      "paddw     %%xmm2,%%xmm0                   \n"
      "paddw     %%xmm1,%%xmm0                   \n"
      "paddw     %%xmm1,%%xmm0                   \n"
      "pxor      %%xmm1,%%xmm1                   \n"
      "psubw     %%xmm0,%%xmm1                   \n"
      "pmaxsw    %%xmm1,%%xmm0                   \n"
      "packuswb  %%xmm0,%%xmm0                   \n"
      "movq      %%xmm0,0x00(%0,%2,1)            \n"
      "lea       0x8(%0),%0                      \n"
      "sub       $0x8,%3                         \n"
      "jg        1b                              \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(dst_sobely),  // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SOBELYROW_SSE2

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_SSE2(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width) {
  asm volatile(
      "sub       %0,%1                           \n"
      "pcmpeqb   %%xmm5,%%xmm5                   \n"
      "pslld     $0x18,%%xmm5                    \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x00(%0,%1,1),%%xmm1            \n"
      "lea       0x10(%0),%0                     \n"
      "paddusb   %%xmm1,%%xmm0                   \n"
      "movdqa    %%xmm0,%%xmm2                   \n"
      "punpcklbw %%xmm0,%%xmm2                   \n"
      "punpckhbw %%xmm0,%%xmm0                   \n"
      "movdqa    %%xmm2,%%xmm1                   \n"
      "punpcklwd %%xmm2,%%xmm1                   \n"
      "punpckhwd %%xmm2,%%xmm2                   \n"
      "por       %%xmm5,%%xmm1                   \n"
      "por       %%xmm5,%%xmm2                   \n"
      "movdqa    %%xmm0,%%xmm3                   \n"
      "punpcklwd %%xmm0,%%xmm3                   \n"
      "punpckhwd %%xmm0,%%xmm0                   \n"
      "por       %%xmm5,%%xmm3                   \n"
      "por       %%xmm5,%%xmm0                   \n"
      "movdqu    %%xmm1,(%2)                     \n"
      "movdqu    %%xmm2,0x10(%2)                 \n"
      "movdqu    %%xmm3,0x20(%2)                 \n"
      "movdqu    %%xmm0,0x30(%2)                 \n"
      "lea       0x40(%2),%2                     \n"
      "sub       $0x10,%3                        \n"
      "jg        1b                              \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SOBELROW_SSE2
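
// Illustrative scalar sketch (not upstream code) of one output pixel:
// sobel = min(255, sobelx + sobely), replicated into B, G and R with alpha
// forced opaque by the 0xff000000 mask in xmm5.
static void SobelPixel_ScalarSketch(uint8_t sx, uint8_t sy,
                                    uint8_t* dst_argb) {
  int s = sx + sy;
  uint8_t v = (uint8_t)(s > 255 ? 255 : s);  // paddusb saturation.
  dst_argb[0] = v;    // B
  dst_argb[1] = v;    // G
  dst_argb[2] = v;    // R
  dst_argb[3] = 255;  // A
}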

#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
                          int width) {
  asm volatile(
      "sub       %0,%1                           \n"
      "pcmpeqb   %%xmm5,%%xmm5                   \n"
      "pslld     $0x18,%%xmm5                    \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x00(%0,%1,1),%%xmm1            \n"
      "lea       0x10(%0),%0                     \n"
      "paddusb   %%xmm1,%%xmm0                   \n"
      "movdqu    %%xmm0,(%2)                     \n"
      "lea       0x10(%2),%2                     \n"
      "sub       $0x10,%3                        \n"
      "jg        1b                              \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_y),       // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_SOBELTOPLANEROW_SSE2

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_SSE2(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      "sub       %0,%1                           \n"
      "pcmpeqb   %%xmm5,%%xmm5                   \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x00(%0,%1,1),%%xmm1            \n"
      "lea       0x10(%0),%0                     \n"
      "movdqa    %%xmm0,%%xmm2                   \n"
      "paddusb   %%xmm1,%%xmm2                   \n"
      "movdqa    %%xmm0,%%xmm3                   \n"
      "punpcklbw %%xmm5,%%xmm3                   \n"
      "punpckhbw %%xmm5,%%xmm0                   \n"
      "movdqa    %%xmm1,%%xmm4                   \n"
      "punpcklbw %%xmm2,%%xmm4                   \n"
      "punpckhbw %%xmm2,%%xmm1                   \n"
      "movdqa    %%xmm4,%%xmm6                   \n"
      "punpcklwd %%xmm3,%%xmm6                   \n"
      "punpckhwd %%xmm3,%%xmm4                   \n"
      "movdqa    %%xmm1,%%xmm7                   \n"
      "punpcklwd %%xmm0,%%xmm7                   \n"
      "punpckhwd %%xmm0,%%xmm1                   \n"
      "movdqu    %%xmm6,(%2)                     \n"
      "movdqu    %%xmm4,0x10(%2)                 \n"
      "movdqu    %%xmm7,0x20(%2)                 \n"
      "movdqu    %%xmm1,0x30(%2)                 \n"
      "lea       0x40(%2),%2                     \n"
      "sub       $0x10,%3                        \n"
      "jg        1b                              \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_SOBELXYROW_SSE2
5596 
5597 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5598 // Creates a table of cumulative sums where each value is a sum of all values
5599 // above and to the left of the value, inclusive of the value.
void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
                                  int32_t* cumsum,
                                  const int32_t* previous_cumsum,
                                  int width) {
  asm volatile(
      "pxor      %%xmm0,%%xmm0                   \n"
      "pxor      %%xmm1,%%xmm1                   \n"
      "sub       $0x4,%3                         \n"
      "jl        49f                             \n"
      "test      $0xf,%1                         \n"
      "jne       49f                             \n"

      // 4 pixel loop.
      LABELALIGN
      "40:                                       \n"
      "movdqu    (%0),%%xmm2                     \n"
      "lea       0x10(%0),%0                     \n"
      "movdqa    %%xmm2,%%xmm4                   \n"
      "punpcklbw %%xmm1,%%xmm2                   \n"
      "movdqa    %%xmm2,%%xmm3                   \n"
      "punpcklwd %%xmm1,%%xmm2                   \n"
      "punpckhwd %%xmm1,%%xmm3                   \n"
      "punpckhbw %%xmm1,%%xmm4                   \n"
      "movdqa    %%xmm4,%%xmm5                   \n"
      "punpcklwd %%xmm1,%%xmm4                   \n"
      "punpckhwd %%xmm1,%%xmm5                   \n"
      "paddd     %%xmm2,%%xmm0                   \n"
      "movdqu    (%2),%%xmm2                     \n"
      "paddd     %%xmm0,%%xmm2                   \n"
      "paddd     %%xmm3,%%xmm0                   \n"
      "movdqu    0x10(%2),%%xmm3                 \n"
      "paddd     %%xmm0,%%xmm3                   \n"
      "paddd     %%xmm4,%%xmm0                   \n"
      "movdqu    0x20(%2),%%xmm4                 \n"
      "paddd     %%xmm0,%%xmm4                   \n"
      "paddd     %%xmm5,%%xmm0                   \n"
      "movdqu    0x30(%2),%%xmm5                 \n"
      "lea       0x40(%2),%2                     \n"
      "paddd     %%xmm0,%%xmm5                   \n"
      "movdqu    %%xmm2,(%1)                     \n"
      "movdqu    %%xmm3,0x10(%1)                 \n"
      "movdqu    %%xmm4,0x20(%1)                 \n"
      "movdqu    %%xmm5,0x30(%1)                 \n"
      "lea       0x40(%1),%1                     \n"
      "sub       $0x4,%3                         \n"
      "jge       40b                             \n"

      "49:                                       \n"
      "add       $0x3,%3                         \n"
      "jl        19f                             \n"

      // 1 pixel loop.
      LABELALIGN
      "10:                                       \n"
      "movd      (%0),%%xmm2                     \n"
      "lea       0x4(%0),%0                      \n"
      "punpcklbw %%xmm1,%%xmm2                   \n"
      "punpcklwd %%xmm1,%%xmm2                   \n"
      "paddd     %%xmm2,%%xmm0                   \n"
      "movdqu    (%2),%%xmm2                     \n"
      "lea       0x10(%2),%2                     \n"
      "paddd     %%xmm0,%%xmm2                   \n"
      "movdqu    %%xmm2,(%1)                     \n"
      "lea       0x10(%1),%1                     \n"
      "sub       $0x1,%3                         \n"
      "jge       10b                             \n"

      "19:                                       \n"
      : "+r"(row),              // %0
        "+r"(cumsum),           // %1
        "+r"(previous_cumsum),  // %2
        "+r"(width)             // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
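
// A scalar reference for the row above (an illustrative sketch; the name is
// hypothetical, not a libyuv entry point). The running per-channel sum of
// this row is added to the already-integrated row above, which is what makes
// the output a 2D inclusive prefix sum (integral image), 4 channels per
// pixel.
static void ComputeCumulativeSumRow_C_Sketch(const uint8_t* row,
                                             int32_t* cumsum,
                                             const int32_t* previous_cumsum,
                                             int width) {
  int32_t row_sum[4] = {0, 0, 0, 0};
  int x;
  for (x = 0; x < width; ++x) {
    int c;
    for (c = 0; c < 4; ++c) {
      row_sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = row_sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}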
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2

#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
                                    const int32_t* botleft,
                                    int width,
                                    int area,
                                    uint8_t* dst,
                                    int count) {
  asm volatile(
      "movd      %5,%%xmm5                       \n"
      "cvtdq2ps  %%xmm5,%%xmm5                   \n"
      "rcpss     %%xmm5,%%xmm4                   \n"
      "pshufd    $0x0,%%xmm4,%%xmm4              \n"
      "sub       $0x4,%3                         \n"
      "jl        49f                             \n"
      "cmpl      $0x80,%5                        \n"
      "ja        40f                             \n"

      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
      "pcmpeqb   %%xmm6,%%xmm6                   \n"
      "psrld     $0x10,%%xmm6                    \n"
      "cvtdq2ps  %%xmm6,%%xmm6                   \n"
      "addps     %%xmm6,%%xmm5                   \n"
      "mulps     %%xmm4,%%xmm5                   \n"
      "cvtps2dq  %%xmm5,%%xmm5                   \n"
      "packssdw  %%xmm5,%%xmm5                   \n"

      // 4 pixel small loop.
      LABELALIGN
      "4:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "movdqu    0x20(%0),%%xmm2                 \n"
      "movdqu    0x30(%0),%%xmm3                 \n"
      "psubd     0x00(%0,%4,4),%%xmm0            \n"
      "psubd     0x10(%0,%4,4),%%xmm1            \n"
      "psubd     0x20(%0,%4,4),%%xmm2            \n"
      "psubd     0x30(%0,%4,4),%%xmm3            \n"
      "lea       0x40(%0),%0                     \n"
      "psubd     (%1),%%xmm0                     \n"
      "psubd     0x10(%1),%%xmm1                 \n"
      "psubd     0x20(%1),%%xmm2                 \n"
      "psubd     0x30(%1),%%xmm3                 \n"
      "paddd     0x00(%1,%4,4),%%xmm0            \n"
      "paddd     0x10(%1,%4,4),%%xmm1            \n"
      "paddd     0x20(%1,%4,4),%%xmm2            \n"
      "paddd     0x30(%1,%4,4),%%xmm3            \n"
      "lea       0x40(%1),%1                     \n"
      "packssdw  %%xmm1,%%xmm0                   \n"
      "packssdw  %%xmm3,%%xmm2                   \n"
      "pmulhuw   %%xmm5,%%xmm0                   \n"
      "pmulhuw   %%xmm5,%%xmm2                   \n"
      "packuswb  %%xmm2,%%xmm0                   \n"
      "movdqu    %%xmm0,(%2)                     \n"
      "lea       0x10(%2),%2                     \n"
      "sub       $0x4,%3                         \n"
      "jge       4b                              \n"
      "jmp       49f                             \n"

      // 4 pixel loop
      LABELALIGN
      "40:                                       \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "movdqu    0x20(%0),%%xmm2                 \n"
      "movdqu    0x30(%0),%%xmm3                 \n"
      "psubd     0x00(%0,%4,4),%%xmm0            \n"
      "psubd     0x10(%0,%4,4),%%xmm1            \n"
      "psubd     0x20(%0,%4,4),%%xmm2            \n"
      "psubd     0x30(%0,%4,4),%%xmm3            \n"
      "lea       0x40(%0),%0                     \n"
      "psubd     (%1),%%xmm0                     \n"
      "psubd     0x10(%1),%%xmm1                 \n"
      "psubd     0x20(%1),%%xmm2                 \n"
      "psubd     0x30(%1),%%xmm3                 \n"
      "paddd     0x00(%1,%4,4),%%xmm0            \n"
      "paddd     0x10(%1,%4,4),%%xmm1            \n"
      "paddd     0x20(%1,%4,4),%%xmm2            \n"
      "paddd     0x30(%1,%4,4),%%xmm3            \n"
      "lea       0x40(%1),%1                     \n"
      "cvtdq2ps  %%xmm0,%%xmm0                   \n"
      "cvtdq2ps  %%xmm1,%%xmm1                   \n"
      "mulps     %%xmm4,%%xmm0                   \n"
      "mulps     %%xmm4,%%xmm1                   \n"
      "cvtdq2ps  %%xmm2,%%xmm2                   \n"
      "cvtdq2ps  %%xmm3,%%xmm3                   \n"
      "mulps     %%xmm4,%%xmm2                   \n"
      "mulps     %%xmm4,%%xmm3                   \n"
      "cvtps2dq  %%xmm0,%%xmm0                   \n"
      "cvtps2dq  %%xmm1,%%xmm1                   \n"
      "cvtps2dq  %%xmm2,%%xmm2                   \n"
      "cvtps2dq  %%xmm3,%%xmm3                   \n"
      "packssdw  %%xmm1,%%xmm0                   \n"
      "packssdw  %%xmm3,%%xmm2                   \n"
      "packuswb  %%xmm2,%%xmm0                   \n"
      "movdqu    %%xmm0,(%2)                     \n"
      "lea       0x10(%2),%2                     \n"
      "sub       $0x4,%3                         \n"
      "jge       40b                             \n"

      "49:                                       \n"
      "add       $0x3,%3                         \n"
      "jl        19f                             \n"

      // 1 pixel loop
      LABELALIGN
      "10:                                       \n"
      "movdqu    (%0),%%xmm0                     \n"
      "psubd     0x00(%0,%4,4),%%xmm0            \n"
      "lea       0x10(%0),%0                     \n"
      "psubd     (%1),%%xmm0                     \n"
      "paddd     0x00(%1,%4,4),%%xmm0            \n"
      "lea       0x10(%1),%1                     \n"
      "cvtdq2ps  %%xmm0,%%xmm0                   \n"
      "mulps     %%xmm4,%%xmm0                   \n"
      "cvtps2dq  %%xmm0,%%xmm0                   \n"
      "packssdw  %%xmm0,%%xmm0                   \n"
      "packuswb  %%xmm0,%%xmm0                   \n"
      "movd      %%xmm0,(%2)                     \n"
      "lea       0x4(%2),%2                      \n"
      "sub       $0x1,%3                         \n"
      "jge       10b                             \n"
      "19:                                       \n"
      : "+r"(topleft),           // %0
        "+r"(botleft),           // %1
        "+r"(dst),               // %2
        "+rm"(count)             // %3
      : "r"((intptr_t)(width)),  // %4
        "rm"(area)               // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
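
// A scalar reference for the routine above (an illustrative sketch; the name
// is hypothetical). A box sum falls out of four corners of the integral
// image: topleft[0] - topleft[w] - botleft[0] + botleft[w], scaled by
// 1 / area; as in the (%0,%4,4) addressing above, width counts int32 lanes
// (4 per pixel).
static void CumulativeSumToAverageRow_C_Sketch(const int32_t* topleft,
                                               const int32_t* botleft,
                                               int width,
                                               int area,
                                               uint8_t* dst,
                                               int count) {
  float ooa = 1.0f / area;
  int i, c;
  for (i = 0; i < count; ++i) {
    for (c = 0; c < 4; ++c) {
      dst[c] = (uint8_t)((topleft[c] - topleft[width + c] - botleft[c] +
                          botleft[width + c]) *
                         ooa);
    }
    dst += 4;
    topleft += 4;
    botleft += 4;
  }
}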
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2

#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8_t* src_argb,
                        int src_argb_stride,
                        uint8_t* dst_argb,
                        const float* src_dudv,
                        int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile(
      "movq      (%3),%%xmm2                     \n"
      "movq      0x08(%3),%%xmm7                 \n"
      "shl       $0x10,%1                        \n"
      "add       $0x4,%1                         \n"
      "movd      %1,%%xmm5                       \n"
      "sub       $0x4,%4                         \n"
      "jl        49f                             \n"

      "pshufd    $0x44,%%xmm7,%%xmm7             \n"
      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
      "movdqa    %%xmm2,%%xmm0                   \n"
      "addps     %%xmm7,%%xmm0                   \n"
      "movlhps   %%xmm0,%%xmm2                   \n"
      "movdqa    %%xmm7,%%xmm4                   \n"
      "addps     %%xmm4,%%xmm4                   \n"
      "movdqa    %%xmm2,%%xmm3                   \n"
      "addps     %%xmm4,%%xmm3                   \n"
      "addps     %%xmm4,%%xmm4                   \n"

      // 4 pixel loop
      LABELALIGN
      "40:                                       \n"
      "cvttps2dq %%xmm2,%%xmm0                   \n"  // x,y float->int first 2
      "cvttps2dq %%xmm3,%%xmm1                   \n"  // x,y float->int next 2
      "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
      "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x*4 + y*stride
      "movd      %%xmm0,%k1                      \n"
      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
      "movd      %%xmm0,%k5                      \n"
      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
      "movd      0x00(%0,%1,1),%%xmm1            \n"
      "movd      0x00(%0,%5,1),%%xmm6            \n"
      "punpckldq %%xmm6,%%xmm1                   \n"
      "addps     %%xmm4,%%xmm2                   \n"
      "movq      %%xmm1,(%2)                     \n"
      "movd      %%xmm0,%k1                      \n"
      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
      "movd      %%xmm0,%k5                      \n"
      "movd      0x00(%0,%1,1),%%xmm0            \n"
      "movd      0x00(%0,%5,1),%%xmm6            \n"
      "punpckldq %%xmm6,%%xmm0                   \n"
      "addps     %%xmm4,%%xmm3                   \n"
      "movq      %%xmm0,0x08(%2)                 \n"
      "lea       0x10(%2),%2                     \n"
      "sub       $0x4,%4                         \n"
      "jge       40b                             \n"

      "49:                                       \n"
      "add       $0x3,%4                         \n"
      "jl        19f                             \n"

      // 1 pixel loop
      LABELALIGN
      "10:                                       \n"
      "cvttps2dq %%xmm2,%%xmm0                   \n"
      "packssdw  %%xmm0,%%xmm0                   \n"
      "pmaddwd   %%xmm5,%%xmm0                   \n"
      "addps     %%xmm7,%%xmm2                   \n"
      "movd      %%xmm0,%k1                      \n"
      "movd      0x00(%0,%1,1),%%xmm0            \n"
      "movd      %%xmm0,(%2)                     \n"
      "lea       0x04(%2),%2                     \n"
      "sub       $0x1,%4                         \n"
      "jge       10b                             \n"
      "19:                                       \n"
      : "+r"(src_argb),              // %0
        "+r"(src_argb_stride_temp),  // %1
        "+r"(dst_argb),              // %2
        "+r"(src_dudv),              // %3
        "+rm"(width),                // %4
        "=&r"(temp)                  // %5
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
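
// A scalar reference for the routine above (an illustrative sketch; the name
// is hypothetical). src_dudv holds the starting (u, v) followed by the
// per-pixel step (du, dv); each destination pixel is fetched from the
// stepped source coordinate, truncated toward zero as cvttps2dq does.
static void ARGBAffineRow_C_Sketch(const uint8_t* src_argb,
                                   int src_argb_stride,
                                   uint8_t* dst_argb,
                                   const float* src_dudv,
                                   int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)u;
    int y = (int)v;
    *(uint32_t*)(dst_argb) =
        *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4);
    dst_argb += 4;
    u += src_dudv[2];
    v += src_dudv[3];
  }
}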
#endif  // HAS_ARGBAFFINEROW_SSE2

#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSSE3(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile(
      "sub       %1,%0                           \n"
      "cmp       $0x0,%3                         \n"
      "je        100f                            \n"
      "cmp       $0x80,%3                        \n"
      "je        50f                             \n"

      "movd      %3,%%xmm0                       \n"
      "neg       %3                              \n"
      "add       $0x100,%3                       \n"
      "movd      %3,%%xmm5                       \n"
      "punpcklbw %%xmm0,%%xmm5                   \n"
      "punpcklwd %%xmm5,%%xmm5                   \n"
      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
      "mov       $0x80808080,%%eax               \n"
      "movd      %%eax,%%xmm4                    \n"
      "pshufd    $0x0,%%xmm4,%%xmm4              \n"

      // General purpose row blend.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%1),%%xmm0                     \n"
      "movdqu    0x00(%1,%4,1),%%xmm2            \n"
      "movdqa     %%xmm0,%%xmm1                  \n"
      "punpcklbw  %%xmm2,%%xmm0                  \n"
      "punpckhbw  %%xmm2,%%xmm1                  \n"
      "psubb      %%xmm4,%%xmm0                  \n"
      "psubb      %%xmm4,%%xmm1                  \n"
      "movdqa     %%xmm5,%%xmm2                  \n"
      "movdqa     %%xmm5,%%xmm3                  \n"
      "pmaddubsw  %%xmm0,%%xmm2                  \n"
      "pmaddubsw  %%xmm1,%%xmm3                  \n"
      "paddw      %%xmm4,%%xmm2                  \n"
      "paddw      %%xmm4,%%xmm3                  \n"
      "psrlw      $0x8,%%xmm2                    \n"
      "psrlw      $0x8,%%xmm3                    \n"
      "packuswb   %%xmm3,%%xmm2                  \n"
      "movdqu    %%xmm2,0x00(%1,%0,1)            \n"
      "lea       0x10(%1),%1                     \n"
      "sub       $0x10,%2                        \n"
      "jg        1b                              \n"
      "jmp       99f                             \n"

      // Blend 50 / 50.
      LABELALIGN
      "50:                                       \n"
      "movdqu    (%1),%%xmm0                     \n"
      "movdqu    0x00(%1,%4,1),%%xmm1            \n"
      "pavgb     %%xmm1,%%xmm0                   \n"
      "movdqu    %%xmm0,0x00(%1,%0,1)            \n"
      "lea       0x10(%1),%1                     \n"
      "sub       $0x10,%2                        \n"
      "jg        50b                             \n"
      "jmp       99f                             \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100:                                      \n"
      "movdqu    (%1),%%xmm0                     \n"
      "movdqu    %%xmm0,0x00(%1,%0,1)            \n"
      "lea       0x10(%1),%1                     \n"
      "sub       $0x10,%2                        \n"
      "jg        100b                            \n"

      "99:                                       \n"
      : "+r"(dst_ptr),               // %0
        "+r"(src_ptr),               // %1
        "+rm"(dst_width),            // %2
        "+r"(source_y_fraction)      // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
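
// A scalar reference for the general blend above (an illustrative sketch;
// the name is hypothetical). Each output byte mixes the two source rows with
// 8.8 fixed-point weights (256 - fraction) and fraction, rounding at half;
// fraction 0 is the plain copy path and fraction 128 the pavgb path.
static void InterpolateRow_C_Sketch(uint8_t* dst_ptr,
                                    const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    int dst_width,
                                    int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)(
        (src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction + 128) >> 8);
  }
}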
#endif  // HAS_INTERPOLATEROW_SSSE3

#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
void InterpolateRow_AVX2(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  asm volatile(
      "cmp       $0x0,%3                         \n"
      "je        100f                            \n"
      "sub       %1,%0                           \n"
      "cmp       $0x80,%3                        \n"
      "je        50f                             \n"

      "vmovd      %3,%%xmm0                      \n"
      "neg        %3                             \n"
      "add        $0x100,%3                      \n"
      "vmovd      %3,%%xmm5                      \n"
      "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
      "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
      "vbroadcastss %%xmm5,%%ymm5                \n"
      "mov        $0x80808080,%%eax              \n"
      "vmovd      %%eax,%%xmm4                   \n"
      "vbroadcastss %%xmm4,%%ymm4                \n"

      // General purpose row blend.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%1),%%ymm0                    \n"
      "vmovdqu    0x00(%1,%4,1),%%ymm2           \n"
      "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
      "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
      "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"
      "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
      "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"
      "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
      "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
      "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
      "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
      "vmovdqu    %%ymm0,0x00(%1,%0,1)           \n"
      "lea        0x20(%1),%1                    \n"
      "sub        $0x20,%2                       \n"
      "jg         1b                             \n"
      "jmp        99f                            \n"

      // Blend 50 / 50.
      LABELALIGN
      "50:                                       \n"
      "vmovdqu   (%1),%%ymm0                     \n"
      "vpavgb    0x00(%1,%4,1),%%ymm0,%%ymm0     \n"
      "vmovdqu   %%ymm0,0x00(%1,%0,1)            \n"
      "lea       0x20(%1),%1                     \n"
      "sub       $0x20,%2                        \n"
      "jg        50b                             \n"
      "jmp       99f                             \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100:                                      \n"
      "rep movsb                                 \n"
      "jmp       999f                            \n"

      "99:                                       \n"
      "vzeroupper                                \n"
      "999:                                      \n"
      : "+D"(dst_ptr),               // %0
        "+S"(src_ptr),               // %1
        "+cm"(dst_width),            // %2
        "+r"(source_y_fraction)      // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
}
#endif  // HAS_INTERPOLATEROW_AVX2

#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
  asm volatile(

      "movdqu    (%3),%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu    (%0),%%xmm0                     \n"
      "movdqu    0x10(%0),%%xmm1                 \n"
      "lea       0x20(%0),%0                     \n"
      "pshufb    %%xmm5,%%xmm0                   \n"
      "pshufb    %%xmm5,%%xmm1                   \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "movdqu    %%xmm1,0x10(%1)                 \n"
      "lea       0x20(%1),%1                     \n"
      "sub       $0x8,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
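
// A scalar reference for the routine above (an illustrative sketch; the name
// is hypothetical). The first four shuffler bytes say which source channel
// lands in each output channel, the per-pixel view of the pshufb control.
static void ARGBShuffleRow_C_Sketch(const uint8_t* src_argb,
                                    uint8_t* dst_argb,
                                    const uint8_t* shuffler,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}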
#endif  // HAS_ARGBSHUFFLEROW_SSSE3

#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile(

      "vbroadcastf128 (%3),%%ymm5                \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu   (%0),%%ymm0                     \n"
      "vmovdqu   0x20(%0),%%ymm1                 \n"
      "lea       0x40(%0),%0                     \n"
      "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
      "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
      "vmovdqu   %%ymm0,(%1)                     \n"
      "vmovdqu   %%ymm1,0x20(%1)                 \n"
      "lea       0x40(%1),%1                     \n"
      "sub       $0x10,%2                        \n"
      "jg        1b                              \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

#ifdef HAS_I422TOYUY2ROW_SSE2
void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub       %1,%2                             \n"

      LABELALIGN
      "1:                                          \n"
      "movq      (%1),%%xmm2                       \n"
      "movq      0x00(%1,%2,1),%%xmm1              \n"
      "add       $0x8,%1                           \n"
      "punpcklbw %%xmm1,%%xmm2                     \n"
      "movdqu    (%0),%%xmm0                       \n"
      "add       $0x10,%0                          \n"
      "movdqa    %%xmm0,%%xmm1                     \n"
      "punpcklbw %%xmm2,%%xmm0                     \n"
      "punpckhbw %%xmm2,%%xmm1                     \n"
      "movdqu    %%xmm0,(%3)                       \n"
      "movdqu    %%xmm1,0x10(%3)                   \n"
      "lea       0x20(%3),%3                       \n"
      "sub       $0x10,%4                          \n"
      "jg         1b                               \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
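
// A scalar reference for the routine above (an illustrative sketch; the name
// is hypothetical, and it handles even widths only). YUY2 packs each pair of
// luma samples around one shared U and one shared V sample: Y0 U Y1 V.
static void I422ToYUY2Row_C_Sketch(const uint8_t* src_y,
                                   const uint8_t* src_u,
                                   const uint8_t* src_v,
                                   uint8_t* dst_yuy2,
                                   int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_yuy2[0] = src_y[0];
    dst_yuy2[1] = src_u[0];
    dst_yuy2[2] = src_y[1];
    dst_yuy2[3] = src_v[0];
    dst_yuy2 += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}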
#endif  // HAS_I422TOYUY2ROW_SSE2

#ifdef HAS_I422TOUYVYROW_SSE2
void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub        %1,%2                            \n"

      LABELALIGN
      "1:                                          \n"
      "movq      (%1),%%xmm2                       \n"
      "movq      0x00(%1,%2,1),%%xmm1              \n"
      "add       $0x8,%1                           \n"
      "punpcklbw %%xmm1,%%xmm2                     \n"
      "movdqu    (%0),%%xmm0                       \n"
      "movdqa    %%xmm2,%%xmm1                     \n"
      "add       $0x10,%0                          \n"
      "punpcklbw %%xmm0,%%xmm1                     \n"
      "punpckhbw %%xmm0,%%xmm2                     \n"
      "movdqu    %%xmm1,(%3)                       \n"
      "movdqu    %%xmm2,0x10(%3)                   \n"
      "lea       0x20(%3),%3                       \n"
      "sub       $0x10,%4                          \n"
      "jg         1b                               \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_SSE2

#ifdef HAS_I422TOYUY2ROW_AVX2
void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub       %1,%2                             \n"

      LABELALIGN
      "1:                                          \n"
      "vpmovzxbw  (%1),%%ymm1                      \n"
      "vpmovzxbw  0x00(%1,%2,1),%%ymm2             \n"
      "add        $0x10,%1                         \n"
      "vpsllw     $0x8,%%ymm2,%%ymm2               \n"
      "vpor       %%ymm1,%%ymm2,%%ymm2             \n"
      "vmovdqu    (%0),%%ymm0                      \n"
      "add        $0x20,%0                         \n"
      "vpunpcklbw %%ymm2,%%ymm0,%%ymm1             \n"
      "vpunpckhbw %%ymm2,%%ymm0,%%ymm2             \n"
      "vextractf128 $0x0,%%ymm1,(%3)               \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3)           \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3)           \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3)           \n"
      "lea        0x40(%3),%3                      \n"
      "sub        $0x20,%4                         \n"
      "jg         1b                               \n"
      "vzeroupper                                  \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOYUY2ROW_AVX2

#ifdef HAS_I422TOUYVYROW_AVX2
void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub        %1,%2                            \n"

      LABELALIGN
      "1:                                          \n"
      "vpmovzxbw  (%1),%%ymm1                      \n"
      "vpmovzxbw  0x00(%1,%2,1),%%ymm2             \n"
      "add        $0x10,%1                         \n"
      "vpsllw     $0x8,%%ymm2,%%ymm2               \n"
      "vpor       %%ymm1,%%ymm2,%%ymm2             \n"
      "vmovdqu    (%0),%%ymm0                      \n"
      "add        $0x20,%0                         \n"
      "vpunpcklbw %%ymm0,%%ymm2,%%ymm1             \n"
      "vpunpckhbw %%ymm0,%%ymm2,%%ymm2             \n"
      "vextractf128 $0x0,%%ymm1,(%3)               \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3)           \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3)           \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3)           \n"
      "lea        0x40(%3),%3                      \n"
      "sub        $0x20,%4                         \n"
      "jg         1b                               \n"
      "vzeroupper                                  \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_AVX2

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(

      "pxor      %%xmm3,%%xmm3                   \n"

      // 2 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq      (%0),%%xmm0                     \n"
      "lea       0x8(%0),%0                      \n"
      "punpcklbw %%xmm3,%%xmm0                   \n"
      "movdqa    %%xmm0,%%xmm4                   \n"
      "punpcklwd %%xmm3,%%xmm0                   \n"
      "punpckhwd %%xmm3,%%xmm4                   \n"
      "cvtdq2ps  %%xmm0,%%xmm0                   \n"
      "cvtdq2ps  %%xmm4,%%xmm4                   \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "movdqa    %%xmm4,%%xmm5                   \n"
      "mulps     0x10(%3),%%xmm0                 \n"
      "mulps     0x10(%3),%%xmm4                 \n"
      "addps     (%3),%%xmm0                     \n"
      "addps     (%3),%%xmm4                     \n"
      "movdqa    %%xmm1,%%xmm2                   \n"
      "movdqa    %%xmm5,%%xmm6                   \n"
      "mulps     %%xmm1,%%xmm2                   \n"
      "mulps     %%xmm5,%%xmm6                   \n"
      "mulps     %%xmm2,%%xmm1                   \n"
      "mulps     %%xmm6,%%xmm5                   \n"
      "mulps     0x20(%3),%%xmm2                 \n"
      "mulps     0x20(%3),%%xmm6                 \n"
      "mulps     0x30(%3),%%xmm1                 \n"
      "mulps     0x30(%3),%%xmm5                 \n"
      "addps     %%xmm2,%%xmm0                   \n"
      "addps     %%xmm6,%%xmm4                   \n"
      "addps     %%xmm1,%%xmm0                   \n"
      "addps     %%xmm5,%%xmm4                   \n"
      "cvttps2dq %%xmm0,%%xmm0                   \n"
      "cvttps2dq %%xmm4,%%xmm4                   \n"
      "packuswb  %%xmm4,%%xmm0                   \n"
      "packuswb  %%xmm0,%%xmm0                   \n"
      "movq      %%xmm0,(%1)                     \n"
      "lea       0x8(%1),%1                      \n"
      "sub       $0x2,%2                         \n"
      "jg        1b                              \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
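
// A scalar reference for the routine above (an illustrative sketch; the name
// is hypothetical). poly stores four coefficients per channel: C0 at
// poly[0..3], C1 at poly[4..7], C2 at poly[8..11] and C3 at poly[12..15];
// each channel maps through v' = C0 + C1*v + C2*v^2 + C3*v^3, clamped to a
// byte.
static void ARGBPolynomialRow_C_Sketch(const uint8_t* src_argb,
                                       uint8_t* dst_argb,
                                       const float* poly,
                                       int width) {
  int i, c;
  for (i = 0; i < width; ++i) {
    for (c = 0; c < 4; ++c) {
      float v = (float)src_argb[c];
      float p = poly[c] + poly[c + 4] * v + poly[c + 8] * v * v +
                poly[c + 12] * v * v * v;
      int iv = (int)p;
      dst_argb[c] = (uint8_t)(iv < 0 ? 0 : (iv > 255 ? 255 : iv));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}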
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(
      "vbroadcastf128 (%3),%%ymm4                \n"
      "vbroadcastf128 0x10(%3),%%ymm5            \n"
      "vbroadcastf128 0x20(%3),%%ymm6            \n"
      "vbroadcastf128 0x30(%3),%%ymm7            \n"

      // 2 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vpmovzxbd   (%0),%%ymm0                   \n"  // 2 ARGB pixels
      "lea         0x8(%0),%0                    \n"
      "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
      "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
      "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X *
                                                      // X
      "vcvttps2dq  %%ymm0,%%ymm0                 \n"
      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
      "vmovq       %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x2,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_HALFFLOATROW_SSE2
static float kScaleBias = 1.9259299444e-34f;
void HalfFloatRow_SSE2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "movd        %3,%%xmm4                     \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "pxor        %%xmm5,%%xmm5                 \n"
      "sub         %0,%1                         \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm2                   \n"  // 8 shorts
      "add         $0x10,%0                      \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "punpcklwd   %%xmm5,%%xmm2                 \n"  // 8 ints in xmm2/xmm3
      "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 8 floats
      "punpckhwd   %%xmm5,%%xmm3                 \n"
      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
      "mulps       %%xmm4,%%xmm2                 \n"
      "mulps       %%xmm4,%%xmm3                 \n"
      "psrld       $0xd,%%xmm2                   \n"
      "psrld       $0xd,%%xmm3                   \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm2,-0x10(%0,%1,1)         \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(scale)   // %3
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
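
// Why this works without a float16 instruction: kScaleBias is 2^-112, so the
// extra multiply rebiases the single precision exponent (bias 127) toward the
// half precision bias (15), and the 13-bit right shift then drops the excess
// mantissa bits, leaving sign, exponent and mantissa in half float positions.
// A scalar sketch of the same trick (illustrative only; the name is
// hypothetical):
static uint16_t HalfFloat_C_Sketch(uint16_t v, float scale) {
  union {
    float f;
    uint32_t u;
  } bits;
  bits.f = (float)v * scale * 1.9259299444e-34f;  // value * scale * 2^-112.
  return (uint16_t)(bits.u >> 13);  // Matches psrld $0xd + packssdw above.
}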
#endif  // HAS_HALFFLOATROW_SSE2

#ifdef HAS_HALFFLOATROW_AVX2
void HalfFloatRow_AVX2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "vbroadcastss  %3, %%ymm4                  \n"
      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
      "sub        %0,%1                          \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm2                    \n"  // 16 shorts
      "add        $0x20,%0                       \n"
      "vpunpckhwd %%ymm5,%%ymm2,%%ymm3           \n"  // mutates
      "vpunpcklwd %%ymm5,%%ymm2,%%ymm2           \n"
      "vcvtdq2ps  %%ymm3,%%ymm3                  \n"
      "vcvtdq2ps  %%ymm2,%%ymm2                  \n"
      "vmulps     %%ymm3,%%ymm4,%%ymm3           \n"
      "vmulps     %%ymm2,%%ymm4,%%ymm2           \n"
      "vpsrld     $0xd,%%ymm3,%%ymm3             \n"
      "vpsrld     $0xd,%%ymm2,%%ymm2             \n"
      "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // unmutates
      "vmovdqu    %%ymm2,-0x20(%0,%1,1)          \n"
      "sub        $0x10,%2                       \n"
      "jg         1b                             \n"

      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_AVX2

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloatRow_F16C(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "vbroadcastss  %3, %%ymm4                  \n"
      "sub        %0,%1                          \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
      "vpmovzxwd   0x10(%0),%%ymm3               \n"
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
      "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
      "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
      "add         $0x20,%0                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4");
}

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
  asm volatile(
      "sub        %0,%1                          \n"
      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
      "vpmovzxwd   0x10(%0),%%ymm3               \n"
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
      "add         $0x20,%0                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm2", "xmm3");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8_t* dst_argb,
                           const uint8_t* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movzb     (%0),%1                         \n"
      "lea       0x4(%0),%0                      \n"
      "movzb     0x00(%3,%1,4),%1                \n"
      "mov       %b1,-0x4(%0)                    \n"
      "movzb     -0x3(%0),%1                     \n"
      "movzb     0x01(%3,%1,4),%1                \n"
      "mov       %b1,-0x3(%0)                    \n"
      "movzb     -0x2(%0),%1                     \n"
      "movzb     0x02(%3,%1,4),%1                \n"
      "mov       %b1,-0x2(%0)                    \n"
      "movzb     -0x1(%0),%1                     \n"
      "movzb     0x03(%3,%1,4),%1                \n"
      "mov       %b1,-0x1(%0)                    \n"
      "dec       %2                              \n"
      "jg        1b                              \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
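
// A scalar reference for the routine above (an illustrative sketch; the name
// is hypothetical). table_argb holds 256 four-byte entries; each channel
// indexes its own column, and the remap happens in place, as in the assembly.
static void ARGBColorTableRow_C_Sketch(uint8_t* dst_argb,
                                       const uint8_t* table_argb,
                                       int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];
    dst_argb += 4;
  }
}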
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8_t* dst_argb,
                          const uint8_t* table_argb,
                          int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movzb     (%0),%1                         \n"
      "lea       0x4(%0),%0                      \n"
      "movzb     0x00(%3,%1,4),%1                \n"
      "mov       %b1,-0x4(%0)                    \n"
      "movzb     -0x3(%0),%1                     \n"
      "movzb     0x01(%3,%1,4),%1                \n"
      "mov       %b1,-0x3(%0)                    \n"
      "movzb     -0x2(%0),%1                     \n"
      "movzb     0x02(%3,%1,4),%1                \n"
      "mov       %b1,-0x2(%0)                    \n"
      "dec       %2                              \n"
      "jg        1b                              \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int width,
                                 const uint8_t* luma,
                                 uint32_t lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile(
      "movd      %6,%%xmm3                       \n"
      "pshufd    $0x0,%%xmm3,%%xmm3              \n"
      "pcmpeqb   %%xmm4,%%xmm4                   \n"
      "psllw     $0x8,%%xmm4                     \n"
      "pxor      %%xmm5,%%xmm5                   \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu    (%2),%%xmm0                     \n"
      "pmaddubsw %%xmm3,%%xmm0                   \n"
      "phaddw    %%xmm0,%%xmm0                   \n"
      "pand      %%xmm4,%%xmm0                   \n"
      "punpcklwd %%xmm5,%%xmm0                   \n"
      "movd      %%xmm0,%k1                      \n"  // 32 bit offset
      "add       %5,%1                           \n"
      "pshufd    $0x39,%%xmm0,%%xmm0             \n"

      "movzb     (%2),%0                         \n"
      "movzb     0x00(%1,%0,1),%0                \n"
      "mov       %b0,(%3)                        \n"
      "movzb     0x1(%2),%0                      \n"
      "movzb     0x00(%1,%0,1),%0                \n"
      "mov       %b0,0x1(%3)                     \n"
      "movzb     0x2(%2),%0                      \n"
      "movzb     0x00(%1,%0,1),%0                \n"
      "mov       %b0,0x2(%3)                     \n"
      "movzb     0x3(%2),%0                      \n"
      "mov       %b0,0x3(%3)                     \n"

      "movd      %%xmm0,%k1                      \n"  // 32 bit offset
      "add       %5,%1                           \n"
      "pshufd    $0x39,%%xmm0,%%xmm0             \n"

      "movzb     0x4(%2),%0                      \n"
      "movzb     0x00(%1,%0,1),%0                \n"
      "mov       %b0,0x4(%3)                     \n"
      "movzb     0x5(%2),%0                      \n"
      "movzb     0x00(%1,%0,1),%0                \n"
      "mov       %b0,0x5(%3)                     \n"
      "movzb     0x6(%2),%0                      \n"
      "movzb     0x00(%1,%0,1),%0                \n"
      "mov       %b0,0x6(%3)                     \n"
      "movzb     0x7(%2),%0                      \n"
      "mov       %b0,0x7(%3)                     \n"

      "movd      %%xmm0,%k1                      \n"  // 32 bit offset
      "add       %5,%1                           \n"
      "pshufd    $0x39,%%xmm0,%%xmm0             \n"

      "movzb     0x8(%2),%0                      \n"
      "movzb     0x00(%1,%0,1),%0                \n"
      "mov       %b0,0x8(%3)                     \n"
      "movzb     0x9(%2),%0                      \n"
      "movzb     0x00(%1,%0,1),%0                \n"
      "mov       %b0,0x9(%3)                     \n"
      "movzb     0xa(%2),%0                      \n"
      "movzb     0x00(%1,%0,1),%0                \n"
      "mov       %b0,0xa(%3)                     \n"
      "movzb     0xb(%2),%0                      \n"
      "mov       %b0,0xb(%3)                     \n"

      "movd      %%xmm0,%k1                      \n"  // 32 bit offset
      "add       %5,%1                           \n"

      "movzb     0xc(%2),%0                      \n"
      "movzb     0x00(%1,%0,1),%0                \n"
      "mov       %b0,0xc(%3)                     \n"
      "movzb     0xd(%2),%0                      \n"
      "movzb     0x00(%1,%0,1),%0                \n"
      "mov       %b0,0xd(%3)                     \n"
      "movzb     0xe(%2),%0                      \n"
      "movzb     0x00(%1,%0,1),%0                \n"
      "mov       %b0,0xe(%3)                     \n"
      "movzb     0xf(%2),%0                      \n"
      "mov       %b0,0xf(%3)                     \n"
      "lea       0x10(%2),%2                     \n"
      "lea       0x10(%3),%3                     \n"
      "sub       $0x4,%4                         \n"
      "jg        1b                              \n"
      : "=&d"(pixel_temp),  // %0
        "=&a"(table_temp),  // %1
        "+r"(src_argb),     // %2
        "+r"(dst_argb),     // %3
        "+rm"(width)        // %4
      : "r"(luma),          // %5
        "rm"(lumacoeff)     // %6
      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
}
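
// A scalar reference for the routine above (an illustrative sketch; the name
// is hypothetical). The low three bytes of lumacoeff weight B, G and R; the
// weighted sum, masked to a multiple of 256 (the pand with 0xff00 words),
// selects one of the 256-entry tables that luma points at, and B, G and R
// are remapped through it while alpha passes through.
static void ARGBLumaColorTableRow_C_Sketch(const uint8_t* src_argb,
                                           uint8_t* dst_argb,
                                           int width,
                                           const uint8_t* luma,
                                           uint32_t lumacoeff) {
  const uint32_t bc = lumacoeff & 0xff;
  const uint32_t gc = (lumacoeff >> 8) & 0xff;
  const uint32_t rc = (lumacoeff >> 16) & 0xff;
  int i;
  for (i = 0; i < width; ++i) {
    const uint8_t* table =
        luma + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) &
                0xff00u);
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];  // Alpha is copied unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}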
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif