/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB.
static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
                               25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};

// JPEG full range.
static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
                                29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};

static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
                                0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
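
// A rough scalar sketch of the BT.601 limited-range luma these coefficients
// implement (an illustrative helper, not part of the row API; the SIMD paths
// arrange rounding and the +16 offset via kAddY16/kSub128 instead). The
// coefficients match kARGBToY, in the B,G,R memory order of ARGB pixels:
static inline uint8_t ScalarARGBToY_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  // 0x1080 folds in rounding and the +16 offset: 16.5 * 256.
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
}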

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
                              -18, -94, 112, 0, -18, -94, 112, 0};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA.
static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
                               0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR.
static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
                               66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
                               0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
                               0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};

static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                               0x8080u, 0x8080u, 0x8080u, 0x8080u};

#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
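
// Scalar sketch of the chroma math the U/V vectors implement (illustrative
// only; kAddUV128/kSub128 supply the same +128 bias in the SIMD paths).
// Coefficients match kARGBToU and kARGBToV; 0x8080 is 128.5 * 256:
static inline uint8_t ScalarARGBToU_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}
static inline uint8_t ScalarARGBToV_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((-18 * b - 94 * g + 112 * r + 0x8080) >> 8);
}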

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
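
// For reference, a scalar sketch of pshufb, which drives all of the shuffle
// tables in this section (illustrative helper, not part of the row API): each
// output byte selects a source byte by the low 4 bits of its mask entry, and
// a mask entry with the high bit set (the 128u values below) produces zero.
// In the table above the alpha slots (indices 12u-15u) are don't-cares; the
// row functions force alpha to 0xff by OR-ing in 0xff000000 afterwards.
static inline void Pshufb_Sketch(uint8_t dst[16],
                                 const uint8_t src[16],
                                 const uint8_t mask[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0u : src[mask[i] & 15];
  }
}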

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGBA.
static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u,  4u,  3u,
                                            14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};

// Shuffle table for converting RAW to RGB24.  First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24 for I422ToRGB24.  First 8 + next 4.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuffle: 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuffle: 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuffle: 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuffle: 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuffle: 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0x18,%%xmm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "lea         0x8(%0),%0                    \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklwd   %%xmm0,%%xmm0                 \n"
      "punpckhwd   %%xmm1,%%xmm1                 \n"
      "por         %%xmm5,%%xmm0                 \n"
      "por         %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_J400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff000000
      "pslld       $0x18,%%xmm5                  \n"
      "movdqa      %3,%%xmm4                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm3               \n"
      "lea         0x30(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "palignr     $0x8,%%xmm1,%%xmm2            \n"
      "pshufb      %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "por         %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "palignr     $0x4,%%xmm3,%%xmm3            \n"
      "pshufb      %%xmm4,%%xmm3                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm3,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_rgb24),              // %0
        "+r"(dst_argb),               // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskRGB24ToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff000000
      "pslld       $0x18,%%xmm5                  \n"
      "movdqa      %3,%%xmm4                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm3               \n"
      "lea         0x30(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "palignr     $0x8,%%xmm1,%%xmm2            \n"
      "pshufb      %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "por         %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "palignr     $0x4,%%xmm3,%%xmm3            \n"
      "pshufb      %%xmm4,%%xmm3                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm3,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_raw),              // %0
        "+r"(dst_argb),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskRAWToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// Same code as RAWToARGBRow with a different shuffler and alpha in the low
// bits.
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0x000000ff
      "psrld       $0x18,%%xmm5                  \n"
      "movdqa      %3,%%xmm4                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm3               \n"
      "lea         0x30(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "palignr     $0x8,%%xmm1,%%xmm2            \n"
      "pshufb      %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "por         %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "palignr     $0x4,%%xmm3,%%xmm3            \n"
      "pshufb      %%xmm4,%%xmm3                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm3,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_raw),              // %0
        "+r"(dst_rgba),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskRAWToRGBA)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "movdqa      %3,%%xmm3                     \n"
      "movdqa      %4,%%xmm4                     \n"
      "movdqa      %5,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x4(%0),%%xmm1                \n"
      "movdqu      0x8(%0),%%xmm2                \n"
      "lea         0x18(%0),%0                   \n"
      "pshufb      %%xmm3,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm2                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x8(%1)                \n"
      "movq        %%xmm2,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_raw),                  // %0
        "+r"(dst_rgb24),                // %1
        "+r"(width)                     // %2
      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
        "m"(kShuffleMaskRAWToRGB24_1),  // %4
        "m"(kShuffleMaskRAWToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0x1080108,%%eax              \n"
      "movd        %%eax,%%xmm5                  \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "mov         $0x20802080,%%eax             \n"
      "movd        %%eax,%%xmm6                  \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "psllw       $0xb,%%xmm3                   \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psllw       $0xa,%%xmm4                   \n"
      "psrlw       $0x5,%%xmm4                   \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psllw       $0x8,%%xmm7                   \n"
      "sub         %0,%1                         \n"  // rebase dst: %1 = dst - 2*src,
      "sub         %0,%1                         \n"  // so (%1,%0,2) addresses dst

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pand        %%xmm3,%%xmm1                 \n"
      "psllw       $0xb,%%xmm2                   \n"
      "pmulhuw     %%xmm5,%%xmm1                 \n"
      "pmulhuw     %%xmm5,%%xmm2                 \n"
      "psllw       $0x8,%%xmm1                   \n"
      "por         %%xmm2,%%xmm1                 \n"
      "pand        %%xmm4,%%xmm0                 \n"
      "pmulhuw     %%xmm6,%%xmm0                 \n"
      "por         %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm1,%%xmm2                 \n"
      "punpcklbw   %%xmm0,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm2                 \n"
      "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
      "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0x1080108,%%eax              \n"
      "movd        %%eax,%%xmm5                  \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "mov         $0x42004200,%%eax             \n"
      "movd        %%eax,%%xmm6                  \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "psllw       $0xb,%%xmm3                   \n"
      "movdqa      %%xmm3,%%xmm4                 \n"
      "psrlw       $0x6,%%xmm4                   \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psllw       $0x8,%%xmm7                   \n"
      "sub         %0,%1                         \n"
      "sub         %0,%1                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "psllw       $0x1,%%xmm1                   \n"
      "psllw       $0xb,%%xmm2                   \n"
      "pand        %%xmm3,%%xmm1                 \n"
      "pmulhuw     %%xmm5,%%xmm2                 \n"
      "pmulhuw     %%xmm5,%%xmm1                 \n"
      "psllw       $0x8,%%xmm1                   \n"
      "por         %%xmm2,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pand        %%xmm4,%%xmm0                 \n"
      "psraw       $0x8,%%xmm2                   \n"
      "pmulhuw     %%xmm6,%%xmm0                 \n"
      "pand        %%xmm7,%%xmm2                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqa      %%xmm1,%%xmm2                 \n"
      "punpcklbw   %%xmm0,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm2                 \n"
      "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
      "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0xf0f0f0f,%%eax              \n"
      "movd        %%eax,%%xmm4                  \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "pslld       $0x4,%%xmm5                   \n"
      "sub         %0,%1                         \n"
      "sub         %0,%1                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pand        %%xmm4,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "psllw       $0x4,%%xmm1                   \n"
      "psrlw       $0x4,%%xmm3                   \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm3,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "movdqu      %%xmm0,0x00(%1,%0,2)          \n"
      "movdqu      %%xmm1,0x10(%1,%0,2)          \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa      %3,%%xmm6                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "lea         0x40(%0),%0                   \n"
      "pshufb      %%xmm6,%%xmm0                 \n"
      "pshufb      %%xmm6,%%xmm1                 \n"
      "pshufb      %%xmm6,%%xmm2                 \n"
      "pshufb      %%xmm6,%%xmm3                 \n"
      "movdqa      %%xmm1,%%xmm4                 \n"
      "psrldq      $0x4,%%xmm1                   \n"
      "pslldq      $0xc,%%xmm4                   \n"
      "movdqa      %%xmm2,%%xmm5                 \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pslldq      $0x8,%%xmm5                   \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "psrldq      $0x8,%%xmm2                   \n"
      "pslldq      $0x4,%%xmm3                   \n"
      "por         %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "lea         0x30(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskARGBToRGB24)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa      %3,%%xmm6                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "lea         0x40(%0),%0                   \n"
      "pshufb      %%xmm6,%%xmm0                 \n"
      "pshufb      %%xmm6,%%xmm1                 \n"
      "pshufb      %%xmm6,%%xmm2                 \n"
      "pshufb      %%xmm6,%%xmm3                 \n"
      "movdqa      %%xmm1,%%xmm4                 \n"
      "psrldq      $0x4,%%xmm1                   \n"
      "pslldq      $0xc,%%xmm4                   \n"
      "movdqa      %%xmm2,%%xmm5                 \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pslldq      $0x8,%%xmm5                   \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "psrldq      $0x8,%%xmm2                   \n"
      "pslldq      $0x4,%%xmm3                   \n"
      "por         %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "lea         0x30(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskARGBToRAW)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTORGB24ROW_AVX2
// vpermd index vector to pack two lanes of 12 + 12 bytes into 24 bytes.
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
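
// Scalar sketch of vpermd for reference (illustrative helper, not part of
// the row API): each output dword selects a source dword by index, so after
// vpshufb leaves 12 packed RGB24 bytes in each 16-byte half, the index
// vector above compacts them into the low 24 bytes of the ymm register.
static inline void Vpermd_Sketch(uint32_t dst[8],
                                 const uint32_t src[8],
                                 const int32_t idx[8]) {
  for (int i = 0; i < 8; ++i) {
    dst[i] = src[idx[i] & 7];
  }
}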

void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6                  \n"
      "vmovdqa     %4,%%ymm7                     \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "lea         0x80(%0),%0                   \n"
      "vpshufb     %%ymm6,%%ymm0,%%ymm0          \n"  // xxx0yyy0
      "vpshufb     %%ymm6,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm6,%%ymm3,%%ymm3          \n"
      "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // pack to 24 bytes
      "vpermd      %%ymm1,%%ymm7,%%ymm1          \n"
      "vpermd      %%ymm2,%%ymm7,%%ymm2          \n"
      "vpermd      %%ymm3,%%ymm7,%%ymm3          \n"
      "vpermq      $0x3f,%%ymm1,%%ymm4           \n"  // combine 24 + 8
      "vpor        %%ymm4,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vpermq      $0xf9,%%ymm1,%%ymm1           \n"  // combine 16 + 16
      "vpermq      $0x4f,%%ymm2,%%ymm4           \n"
      "vpor        %%ymm4,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "vpermq      $0xfe,%%ymm2,%%ymm2           \n"  // combine 8 + 24
      "vpermq      $0x93,%%ymm3,%%ymm3           \n"
      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
      "vmovdqu     %%ymm2,0x40(%1)               \n"
      "lea         0x60(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),                     // %0
        "+r"(dst),                     // %1
        "+r"(width)                    // %2
      : "m"(kShuffleMaskARGBToRGB24),  // %3
        "m"(kPermdRGB24_AVX)           // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
// Shuffle tables for converting ARGB to RGB24.
static const ulvec8 kPermARGBToRGB24_0 = {
    0u,  1u,  2u,  4u,  5u,  6u,  8u,  9u,  10u, 12u, 13u,
    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
static const ulvec8 kPermARGBToRGB24_1 = {
    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
static const ulvec8 kPermARGBToRGB24_2 = {
    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};

void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vmovdqa     %3,%%ymm5                     \n"
      "vmovdqa     %4,%%ymm6                     \n"
      "vmovdqa     %5,%%ymm7                     \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "lea         0x80(%0),%0                   \n"
      "vpermt2b    %%ymm1,%%ymm5,%%ymm0          \n"
      "vpermt2b    %%ymm2,%%ymm6,%%ymm1          \n"
      "vpermt2b    %%ymm3,%%ymm7,%%ymm2          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "vmovdqu     %%ymm2,0x40(%1)               \n"
      "lea         0x60(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),                // %0
        "+r"(dst),                // %1
        "+r"(width)               // %2
      : "m"(kPermARGBToRGB24_0),  // %3
        "m"(kPermARGBToRGB24_1),  // %4
        "m"(kPermARGBToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
}
#endif

#ifdef HAS_ARGBTORAWROW_AVX2
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6                  \n"
      "vmovdqa     %4,%%ymm7                     \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "lea         0x80(%0),%0                   \n"
      "vpshufb     %%ymm6,%%ymm0,%%ymm0          \n"  // xxx0yyy0
      "vpshufb     %%ymm6,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm6,%%ymm3,%%ymm3          \n"
      "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // pack to 24 bytes
      "vpermd      %%ymm1,%%ymm7,%%ymm1          \n"
      "vpermd      %%ymm2,%%ymm7,%%ymm2          \n"
      "vpermd      %%ymm3,%%ymm7,%%ymm3          \n"
      "vpermq      $0x3f,%%ymm1,%%ymm4           \n"  // combine 24 + 8
      "vpor        %%ymm4,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vpermq      $0xf9,%%ymm1,%%ymm1           \n"  // combine 16 + 16
      "vpermq      $0x4f,%%ymm2,%%ymm4           \n"
      "vpor        %%ymm4,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "vpermq      $0xfe,%%ymm2,%%ymm2           \n"  // combine 8 + 24
      "vpermq      $0x93,%%ymm3,%%ymm3           \n"
      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
      "vmovdqu     %%ymm2,0x40(%1)               \n"
      "lea         0x60(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),                   // %0
        "+r"(dst),                   // %1
        "+r"(width)                  // %2
      : "m"(kShuffleMaskARGBToRAW),  // %3
        "m"(kPermdRGB24_AVX)         // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "psrld       $0x1b,%%xmm3                  \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrld       $0x1a,%%xmm4                  \n"
      "pslld       $0x5,%%xmm4                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0xb,%%xmm5                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pslld       $0x8,%%xmm0                   \n"
      "psrld       $0x3,%%xmm1                   \n"
      "psrld       $0x5,%%xmm2                   \n"
      "psrad       $0x10,%%xmm0                  \n"
      "pand        %%xmm3,%%xmm1                 \n"
      "pand        %%xmm4,%%xmm2                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "por         %%xmm2,%%xmm1                 \n"
      "por         %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "movd        %3,%%xmm6                     \n"
      "punpcklbw   %%xmm6,%%xmm6                 \n"
      "movdqa      %%xmm6,%%xmm7                 \n"
      "punpcklwd   %%xmm6,%%xmm6                 \n"
      "punpckhwd   %%xmm7,%%xmm7                 \n"
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "psrld       $0x1b,%%xmm3                  \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrld       $0x1a,%%xmm4                  \n"
      "pslld       $0x5,%%xmm4                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0xb,%%xmm5                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "paddusb     %%xmm6,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pslld       $0x8,%%xmm0                   \n"
      "psrld       $0x3,%%xmm1                   \n"
      "psrld       $0x5,%%xmm2                   \n"
      "psrad       $0x10,%%xmm0                  \n"
      "pand        %%xmm3,%%xmm1                 \n"
      "pand        %%xmm4,%%xmm2                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "por         %%xmm2,%%xmm1                 \n"
      "por         %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "vbroadcastss %3,%%xmm6                    \n"
      "vpunpcklbw  %%xmm6,%%xmm6,%%xmm6          \n"
      "vpermq      $0xd8,%%ymm6,%%ymm6           \n"
      "vpunpcklwd  %%ymm6,%%ymm6,%%ymm6          \n"
      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"
      "vpsrld      $0x1b,%%ymm3,%%ymm3           \n"
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrld      $0x1a,%%ymm4,%%ymm4           \n"
      "vpslld      $0x5,%%ymm4,%%ymm4            \n"
      "vpslld      $0xb,%%ymm3,%%ymm5            \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vpaddusb    %%ymm6,%%ymm0,%%ymm0          \n"
      "vpsrld      $0x5,%%ymm0,%%ymm2            \n"
      "vpsrld      $0x3,%%ymm0,%%ymm1            \n"
      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"
      "vpand       %%ymm4,%%ymm2,%%ymm2          \n"
      "vpand       %%ymm3,%%ymm1,%%ymm1          \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpor        %%ymm2,%%ymm1,%%ymm1          \n"
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"
      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "lea         0x20(%0),%0                   \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrld       $0x1b,%%xmm4                  \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "pslld       $0x5,%%xmm5                   \n"
      "movdqa      %%xmm4,%%xmm6                 \n"
      "pslld       $0xa,%%xmm6                   \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "pslld       $0xf,%%xmm7                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm3                 \n"
      "psrad       $0x10,%%xmm0                  \n"
      "psrld       $0x3,%%xmm1                   \n"
      "psrld       $0x6,%%xmm2                   \n"
      "psrld       $0x9,%%xmm3                   \n"
      "pand        %%xmm7,%%xmm0                 \n"
      "pand        %%xmm4,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm2                 \n"
      "pand        %%xmm6,%%xmm3                 \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm3,%%xmm2                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}

void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psllw       $0xc,%%xmm4                   \n"
      "movdqa      %%xmm4,%%xmm3                 \n"
      "psrlw       $0x8,%%xmm3                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm3,%%xmm0                 \n"
      "pand        %%xmm4,%%xmm1                 \n"
      "psrlq       $0x4,%%xmm0                   \n"
      "psrlq       $0x8,%%xmm1                   \n"
      "por         %%xmm1,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_RGB24TOARGBROW_SSSE3

/*

ARGBToAR30Row:

Red Blue
With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
wanted for the blue channel. The red needs to be shifted 4 left, so multiply
by (1024+4)*16 for red.

Alpha Green
Alpha and Green are already in the high bits, so vpand can zero out the other
bits, keeping just the 2 upper bits of alpha and the 8 bits of green. The same
multiplier, (1024+4), could be used for green, putting the 10 bit green in the
lsb. Alpha needs a simple multiplier to shift it into position. It wants a gap
of 10 bits above the green. Green is 10 bits, so there are 6 bits in the low
short. 4 more are needed, so a multiplier of 4 gets the 2 bits into the upper
16 bits, and then a shift of 4 is a multiply by 16, so (4*16) = 64. Then shift
the result left 10 to position the A and G channels.
*/

// Shuffle tables to place the Blue and Red (or, for ABGR input, Red and Blue)
// channels in the upper byte of each 16-bit lane for the AR30 conversions.
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u,  128u, 4u,  128u, 6u,
                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};

static const uvec8 kShuffleBR30 = {128u, 2u,  128u, 0u, 128u, 6u,  128u, 4u,
                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};

static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
static const uint32_t kMaskRB10 = 0x3ff003ff;
static const uint32_t kMaskAG10 = 0xc000ff00;
static const uint32_t kMulAG10 = 64 * 65536 + 1028;
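
// Scalar sketch of the packing described above (an illustrative helper, not
// part of the row API): widening 8 bits to 10 by replicating the top 2 bits
// is exactly the (1024 + 4) multiply the comment derives.
static inline uint32_t ScalarARGBToAR30_Sketch(uint8_t a,
                                               uint8_t r,
                                               uint8_t g,
                                               uint8_t b) {
  uint32_t b10 = ((uint32_t)b << 2) | (b >> 6);  // 8 -> 10 bits
  uint32_t g10 = ((uint32_t)g << 2) | (g >> 6);
  uint32_t r10 = ((uint32_t)r << 2) | (r >> 6);
  uint32_t a2 = (uint32_t)a >> 6;                // keep the 2 msbs of alpha
  return (a2 << 30) | (r10 << 20) | (g10 << 10) | b10;
}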

void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm2                     \n"  // shuffler for RB
      "movd        %4,%%xmm3                     \n"  // multiplier for RB
      "movd        %5,%%xmm4                     \n"  // mask for R10 B10
      "movd        %6,%%xmm5                     \n"  // mask for AG
      "movd        %7,%%xmm6                     \n"  // multiplier for AG
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "sub         %0,%1                         \n"

      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"  // R0B0
      "pand        %%xmm5,%%xmm0                 \n"  // A0G0
      "pmulhuw     %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
      "pmulhuw     %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
      "pand        %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
      "pslld       $10,%%xmm0                    \n"  // A2 x10 G10 x10
      "por         %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
      "movdqu      %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
      "add         $0x10,%0                      \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm2                     \n"  // shuffler for RB
      "movd        %4,%%xmm3                     \n"  // multiplier for RB
      "movd        %5,%%xmm4                     \n"  // mask for R10 B10
      "movd        %6,%%xmm5                     \n"  // mask for AG
      "movd        %7,%%xmm6                     \n"  // multiplier for AG
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "sub         %0,%1                         \n"

      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ABGR pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"  // R0B0
      "pand        %%xmm5,%%xmm0                 \n"  // A0G0
      "pmulhuw     %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
      "pmulhuw     %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
      "pand        %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
      "pslld       $10,%%xmm0                    \n"  // A2 x10 G10 x10
      "por         %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
      "movdqu      %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
      "add         $0x10,%0                      \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3  reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTOAR30ROW_AVX2
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3                    \n"  // multiplier for RB
      "vbroadcastss %5,%%ymm4                    \n"  // mask for R10 B10
      "vbroadcastss %6,%%ymm5                    \n"  // mask for AG
      "vbroadcastss %7,%%ymm6                    \n"  // multiplier for AG
      "sub         %0,%1                         \n"

      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 8 ARGB pixels
      "vpshufb     %%ymm2,%%ymm0,%%ymm1          \n"  // R0B0
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"  // A0G0
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"  // X2 R16 X4  B10
      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"  // X10 A2 X10 G10
      "vpand       %%ymm4,%%ymm1,%%ymm1          \n"  // X2 R10 X10 B10
      "vpslld      $10,%%ymm0,%%ymm0             \n"  // A2 x10 G10 x10
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // A2 R10 G10 B10
      "vmovdqu     %%ymm0,(%1,%0)                \n"  // store 8 AR30 pixels
      "add         $0x20,%0                      \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#ifdef HAS_ABGRTOAR30ROW_AVX2
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3                    \n"  // multiplier for RB
      "vbroadcastss %5,%%ymm4                    \n"  // mask for R10 B10
      "vbroadcastss %6,%%ymm5                    \n"  // mask for AG
      "vbroadcastss %7,%%ymm6                    \n"  // multiplier for AG
      "sub         %0,%1                         \n"

      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 8 ABGR pixels
      "vpshufb     %%ymm2,%%ymm0,%%ymm1          \n"  // R0B0
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"  // A0G0
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"  // X2 R16 X4  B10
      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"  // X10 A2 X10 G10
      "vpand       %%ymm4,%%ymm1,%%ymm1          \n"  // X2 R10 X10 B10
      "vpslld      $10,%%ymm0,%%ymm0             \n"  // A2 x10 G10 x10
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // A2 R10 G10 B10
      "vmovdqu     %%ymm0,(%1,%0)                \n"  // store 8 AR30 pixels
      "add         $0x20,%0                      \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3  reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

static const uvec8 kShuffleARGBToABGR = {2,  1, 0, 3,  6,  5,  4,  7,
                                         10, 9, 8, 11, 14, 13, 12, 15};

static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3,
                                           6, 6, 5, 5, 4, 4, 7, 7};
static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9,  9,  8,  8,  11, 11,
                                           14, 14, 13, 13, 12, 12, 15, 15};
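
// The 16-bit paths below widen each 8-bit channel by pairing a byte with
// itself (punpcklbw of a register with itself, or the duplicated indices in
// the AB64 tables above); in scalar terms that is v * 0x0101. A rough sketch
// for reference (illustrative helper, not part of the row API):
static inline uint16_t Widen8To16_Sketch(uint8_t v) {
  return (uint16_t)(v * 0x0101);  // 0xAB -> 0xABAB
}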

void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
                         uint16_t* dst_ar64,
                         int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}

void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
                         uint16_t* dst_ab64,
                         int width) {
  asm volatile(

      "movdqa      %3,%%xmm2                     \n"
      "movdqa      %4,%%xmm3                     \n" LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm0                 \n"
      "pshufb      %%xmm3,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),             // %0
        "+r"(dst_ab64),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleARGBToAB64Lo),  // %3
        "m"(kShuffleARGBToAB64Hi)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}

void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "psrlw       $8,%%xmm0                     \n"
      "psrlw       $8,%%xmm1                     \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}

void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(

      "movdqa      %3,%%xmm2                     \n" LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "psrlw       $8,%%xmm0                     \n"
      "psrlw       $8,%%xmm1                     \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "pshufb      %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ab64),          // %0
        "+r"(dst_argb),          // %1
        "+r"(width)              // %2
      : "m"(kShuffleARGBToABGR)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
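
// Scalar sketch of the reverse direction (illustrative, not part of this
// file; hypothetical name): psrlw $8 keeps the high byte of each 16-bit
// channel and packuswb repacks to bytes; the AB64 variant additionally
// swaps R and B via kShuffleARGBToABGR.
static inline void AB64ToARGBRow_Sketch(const uint16_t* src_ab64,
                                        uint8_t* dst_argb,
                                        int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = (uint8_t)(src_ab64[2] >> 8);  // B (AB64 is R,G,B,A order)
    dst_argb[1] = (uint8_t)(src_ab64[1] >> 8);  // G
    dst_argb[2] = (uint8_t)(src_ab64[0] >> 8);  // R
    dst_argb[3] = (uint8_t)(src_ab64[3] >> 8);  // A
    src_ab64 += 4;
    dst_argb += 4;
  }
}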

#ifdef HAS_ARGBTOAR64ROW_AVX2
void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
                        uint16_t* dst_ar64,
                        int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"
      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif

#ifdef HAS_ARGBTOAB64ROW_AVX2
void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
                        uint16_t* dst_ab64,
                        int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm2                  \n"
      "vbroadcastf128 %4,%%ymm3                  \n" LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %%ymm3,%%ymm0,%%ymm1          \n"
      "vpshufb     %%ymm2,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),             // %0
        "+r"(dst_ab64),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleARGBToAB64Lo),  // %3
        "m"(kShuffleARGBToAB64Hi)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif

#ifdef HAS_AR64TOARGBROW_AVX2
void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpsrlw      $8,%%ymm0,%%ymm0              \n"
      "vpsrlw      $8,%%ymm1,%%ymm1              \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x40(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif

#ifdef HAS_AB64TOARGBROW_AVX2
void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm2                  \n" LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpsrlw      $8,%%ymm0,%%ymm0              \n"
      "vpsrlw      $8,%%ymm1,%%ymm1              \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %%ymm2,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x40(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ab64),          // %0
        "+r"(dst_argb),          // %1
        "+r"(width)              // %2
      : "m"(kShuffleARGBToABGR)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif

// clang-format off

// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
// round parameter is register containing value to add before shift.
#define RGBTOY(round)                            \
  "1:                                        \n" \
  "movdqu    (%0),%%xmm0                     \n" \
  "movdqu    0x10(%0),%%xmm1                 \n" \
  "movdqu    0x20(%0),%%xmm2                 \n" \
  "movdqu    0x30(%0),%%xmm3                 \n" \
  "psubb     %%xmm5,%%xmm0                   \n" \
  "psubb     %%xmm5,%%xmm1                   \n" \
  "psubb     %%xmm5,%%xmm2                   \n" \
  "psubb     %%xmm5,%%xmm3                   \n" \
  "movdqu    %%xmm4,%%xmm6                   \n" \
  "pmaddubsw %%xmm0,%%xmm6                   \n" \
  "movdqu    %%xmm4,%%xmm0                   \n" \
  "pmaddubsw %%xmm1,%%xmm0                   \n" \
  "movdqu    %%xmm4,%%xmm1                   \n" \
  "pmaddubsw %%xmm2,%%xmm1                   \n" \
  "movdqu    %%xmm4,%%xmm2                   \n" \
  "pmaddubsw %%xmm3,%%xmm2                   \n" \
  "lea       0x40(%0),%0                     \n" \
  "phaddw    %%xmm0,%%xmm6                   \n" \
  "phaddw    %%xmm2,%%xmm1                   \n" \
  "prefetcht0 1280(%0)                       \n" \
  "paddw     %%" #round ",%%xmm6             \n" \
  "paddw     %%" #round ",%%xmm1             \n" \
  "psrlw     $0x8,%%xmm6                     \n" \
  "psrlw     $0x8,%%xmm1                     \n" \
  "packuswb  %%xmm1,%%xmm6                   \n" \
  "movdqu    %%xmm6,(%1)                     \n" \
  "lea       0x10(%1),%1                     \n" \
  "sub       $0x10,%2                        \n" \
  "jg        1b                              \n"

#define RGBTOY_AVX2(round)                                                  \
  "1:                                        \n"                            \
  "vmovdqu    (%0),%%ymm0                    \n"                            \
  "vmovdqu    0x20(%0),%%ymm1                \n"                            \
  "vmovdqu    0x40(%0),%%ymm2                \n"                            \
  "vmovdqu    0x60(%0),%%ymm3                \n"                            \
  "vpsubb     %%ymm5, %%ymm0, %%ymm0         \n"                            \
  "vpsubb     %%ymm5, %%ymm1, %%ymm1         \n"                            \
  "vpsubb     %%ymm5, %%ymm2, %%ymm2         \n"                            \
  "vpsubb     %%ymm5, %%ymm3, %%ymm3         \n"                            \
  "vpmaddubsw %%ymm0,%%ymm4,%%ymm0           \n"                            \
  "vpmaddubsw %%ymm1,%%ymm4,%%ymm1           \n"                            \
  "vpmaddubsw %%ymm2,%%ymm4,%%ymm2           \n"                            \
  "vpmaddubsw %%ymm3,%%ymm4,%%ymm3           \n"                            \
  "lea       0x80(%0),%0                     \n"                            \
  "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n" /* mutates. */             \
  "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"                            \
  "prefetcht0 1280(%0)                       \n"                            \
  "vpaddw     %%" #round ",%%ymm0,%%ymm0     \n" /* Add .5 for rounding. */ \
  "vpaddw     %%" #round ",%%ymm2,%%ymm2     \n"                            \
  "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"                            \
  "vpsrlw     $0x8,%%ymm2,%%ymm2             \n"                            \
  "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n" /* mutates. */             \
  "vpermd     %%ymm0,%%ymm6,%%ymm0           \n" /* unmutate. */            \
  "vmovdqu    %%ymm0,(%1)                    \n"                            \
  "lea       0x20(%1),%1                     \n"                            \
  "sub       $0x20,%2                        \n"                            \
  "jg        1b                              \n"                            \
  "vzeroupper                                \n"

// clang-format on
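
// As a worked example of the fixed-point math above (an illustrative
// sketch; RGBToY_Sketch is a hypothetical name, not part of libyuv):
// kSub128 recenters each byte, pmaddubsw applies the coefficients, and the
// "round" register adds back the bias plus 0.5, so for the BT.601 path
// (kARGBToY with kAddY16) the per-pixel computation reduces to:
static inline int RGBToY_Sketch(int r, int g, int b) {
  // 0x1080 = 16 * 256 + 128: the +16 luma offset plus 0.5 in 8.8 fixed point.
  return (25 * b + 129 * g + 66 * r + 0x1080) >> 8;
}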

#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBTOYJROW_SSSE3

#ifdef HAS_RGBATOYJROW_SSSE3
// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_RGBATOYJROW_SSSE3

#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
#endif

#ifdef HAS_ARGBTOYROW_AVX2

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vbroadcastf128 %4,%%ymm5                  \n"
      "vbroadcastf128 %5,%%ymm7                  \n"
      "vmovdqu     %6,%%ymm6                     \n"

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vbroadcastf128 %4,%%ymm5                  \n"
      "vbroadcastf128 %5,%%ymm7                  \n"
      "vmovdqu     %6,%%ymm6                     \n"

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_abgr),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kABGRToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vbroadcastf128 %4,%%ymm5                  \n"
      "vmovdqu     %5,%%ymm6                     \n"

      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kPermdARGBToY_AVX)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYJROW_AVX2

#ifdef HAS_RGBATOYJROW_AVX2
// Convert 32 RGBA pixels (128 bytes) to 32 Y values.
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vbroadcastf128 %4,%%ymm5                  \n"
      "vmovdqu     %5,%%ymm6                     \n"

      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_rgba),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kRGBAToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kPermdARGBToY_AVX)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_RGBATOYJROW_AVX2
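
// In the JPEG (full-range) variants the coefficients sum to 256 and the
// kSub128 word value doubles as the +0.5 rounding term, so the scalar
// equivalent is simply (illustrative sketch; hypothetical name):
static inline int RGBToYJ_Sketch(int r, int g, int b) {
  return (29 * b + 150 * g + 77 * r + 0x80) >> 8;  // no +16 offset
}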

#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kARGBToV),                     // %5
        "m"(kARGBToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVROW_SSSE3
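
// A scalar sketch of one 2x2 block of the UV loop above (illustrative;
// hypothetical name, not part of libyuv): the SIMD code averages the two
// rows with pavgb, averages horizontal pairs with shufps + pavgb, then
// applies the signed kARGBToU / kARGBToV weights and rebiases to unsigned.
static inline void ARGBToUV_Sketch(const uint8_t* top,
                                   const uint8_t* bot,
                                   uint8_t* u,
                                   uint8_t* v) {
  // Average the 2x2 block channel by channel (ARGB bytes are B,G,R,A);
  // pavgb rounds, hence the +1 before each halving.
  int b = (((top[0] + bot[0] + 1) >> 1) + ((top[4] + bot[4] + 1) >> 1) + 1) >> 1;
  int g = (((top[1] + bot[1] + 1) >> 1) + ((top[5] + bot[5] + 1) >> 1) + 1) >> 1;
  int r = (((top[2] + bot[2] + 1) >> 1) + ((top[6] + bot[6] + 1) >> 1) + 1) >> 1;
  // >> 8 is an arithmetic shift (matching psraw) on common compilers.
  *u = (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8_t)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}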

#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
void ARGBToUVRow_AVX2(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"
      "vbroadcastf128 %6,%%ymm6                  \n"
      "vbroadcastf128 %7,%%ymm7                  \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
      "lea         0x80(%0),%0                   \n"
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"

      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %8,%%ymm0,%%ymm0              \n"
      "vpaddb      %%ymm5,%%ymm0,%%ymm0          \n"

      "vextractf128 $0x0,%%ymm0,(%1)             \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kAddUV128),                    // %5
        "m"(kARGBToV),                     // %6
        "m"(kARGBToU),                     // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVROW_AVX2

#ifdef HAS_ABGRTOUVROW_AVX2
void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"
      "vbroadcastf128 %6,%%ymm6                  \n"
      "vbroadcastf128 %7,%%ymm7                  \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
      "lea         0x80(%0),%0                   \n"
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"

      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %8,%%ymm0,%%ymm0              \n"
      "vpaddb      %%ymm5,%%ymm0,%%ymm0          \n"

      "vextractf128 $0x0,%%ymm0,(%1)             \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_abgr),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kAddUV128),                    // %5
        "m"(kABGRToV),                     // %6
        "m"(kABGRToU),                     // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOUVROW_AVX2

#ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"
      "vbroadcastf128 %6,%%ymm6                  \n"
      "vbroadcastf128 %7,%%ymm7                  \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
      "lea         0x80(%0),%0                   \n"
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"

      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %8,%%ymm0,%%ymm0              \n"

      "vextractf128 $0x0,%%ymm0,(%1)             \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kSub128),                      // %5
        "m"(kARGBToVJ),                    // %6
        "m"(kARGBToUJ),                    // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVJROW_AVX2

#ifdef HAS_ARGBTOUVJROW_SSSE3
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
                        int src_stride_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "paddw       %%xmm5,%%xmm0                 \n"
      "paddw       %%xmm5,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kARGBToVJ),                    // %5
        "m"(kARGBToUJ),                    // %6
        "m"(kSub128)                       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVJROW_SSSE3

#ifdef HAS_ARGBTOUV444ROW_SSSE3
void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  asm volatile(
      "movdqa      %4,%%xmm3                     \n"
      "movdqa      %5,%%xmm4                     \n"
      "movdqa      %6,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm6                 \n"
      "phaddw      %%xmm1,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm2                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm2                   \n"
      "packsswb    %%xmm2,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "pmaddubsw   %%xmm3,%%xmm0                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm1,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm2                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm2                   \n"
      "packsswb    %%xmm2,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "lea         0x40(%0),%0                   \n"
      "movdqu      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+rm"(width)     // %3
      : "m"(kARGBToV),   // %4
        "m"(kARGBToU),   // %5
        "m"(kAddUV128)   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif  // HAS_ARGBTOUV444ROW_SSSE3

void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_bgra),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kBGRAToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
                       int src_stride_bgra,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_bgra),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_bgra)),  // %4
        "m"(kBGRAToV),                     // %5
        "m"(kBGRAToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kABGRToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
                       int src_stride_abgr,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_abgr),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kABGRToV),                     // %5
        "m"(kABGRToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
                       int src_stride_rgba,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_rgba),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_rgba)),  // %4
        "m"(kRGBAToV),                     // %5
        "m"(kRGBAToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

// Read 8 UV from 444
#define READYUV444                                                \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422                                                \
  "movd       (%[u_buf]),%%xmm3                               \n" \
  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"

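// A scalar sketch of the 4:2:2 chroma upsampling performed by READYUV422
// (illustrative; hypothetical name, not part of libyuv): punpcklbw
// interleaves U and V, and punpcklwd duplicates each UV pair, so one
// chroma sample covers two Y samples (nearest-neighbor upsampling).
static inline void UpsampleUV422_Sketch(const uint8_t* src_u,
                                        const uint8_t* src_v,
                                        uint8_t* dst_uv,
                                        int pairs) {
  int x;
  for (x = 0; x < pairs; ++x) {
    dst_uv[x * 4 + 0] = src_u[x];  // U for even pixel
    dst_uv[x * 4 + 1] = src_v[x];  // V for even pixel
    dst_uv[x * 4 + 2] = src_u[x];  // same U for odd pixel
    dst_uv[x * 4 + 3] = src_v[x];  // same V for odd pixel
  }
}
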
// Read 4 UV from 422 10 bit, upsample to 8 UV
// TODO(fbarchard): Consider shufb to replace pack/unpack
// TODO(fbarchard): Consider pmulhuw to replace psraw
// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
#define READYUV210                                                \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "psllw      $6,%%xmm4                                       \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"

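// A scalar sketch of the 10-bit scaling in READYUV210 (illustrative;
// hypothetical name, not part of libyuv): UV is reduced to 8 bits, while Y
// is shifted up so its 10 significant bits sit at the top of a 16-bit lane
// for the fixed-point matrix multiply that follows.
static inline void Scale10Bit_Sketch(uint16_t y10,
                                     uint16_t uv10,
                                     uint16_t* y16,
                                     uint8_t* uv8) {
  *y16 = (uint16_t)(y10 << 6);  // psllw $6: 10-bit -> 16-bit range
  *uv8 = (uint8_t)(uv10 >> 2);  // psraw $2 + packuswb: 10-bit -> 8-bit
}
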
#define READYUVA210                                               \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "psllw      $6,%%xmm4                                       \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n" \
  "movdqu     (%[a_buf]),%%xmm5                               \n" \
  "psraw      $2,%%xmm5                                       \n" \
  "packuswb   %%xmm5,%%xmm5                                   \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                         \n"

// Read 8 UV from 444 10 bit.
#define READYUV410                                                \
  "movdqu     (%[u_buf]),%%xmm3                               \n" \
  "movdqu     0x00(%[u_buf],%[v_buf],1),%%xmm2                \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                         \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "psraw      $2,%%xmm2                                       \n" \
  "movdqa     %%xmm3,%%xmm1                                   \n" \
  "punpcklwd  %%xmm2,%%xmm3                                   \n" \
  "punpckhwd  %%xmm2,%%xmm1                                   \n" \
  "packuswb   %%xmm1,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "psllw      $6,%%xmm4                                       \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"

// Read 8 UV from 444 10 bit.  With 8 Alpha.
#define READYUVA410                                               \
  "movdqu     (%[u_buf]),%%xmm3                               \n" \
  "movdqu     0x00(%[u_buf],%[v_buf],1),%%xmm2                \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                         \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "psraw      $2,%%xmm2                                       \n" \
  "movdqa     %%xmm3,%%xmm1                                   \n" \
  "punpcklwd  %%xmm2,%%xmm3                                   \n" \
  "punpckhwd  %%xmm2,%%xmm1                                   \n" \
  "packuswb   %%xmm1,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "psllw      $0x6,%%xmm4                                     \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n" \
  "movdqu     (%[a_buf]),%%xmm5                               \n" \
  "psraw      $2,%%xmm5                                       \n" \
  "packuswb   %%xmm5,%%xmm5                                   \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                         \n"

// Read 4 UV from 422 12 bit, upsample to 8 UV
#define READYUV212                                                \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
  "psraw      $0x4,%%xmm3                                     \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "psllw      $0x4,%%xmm4                                     \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"

// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
#define READYUVA422                                               \
  "movd       (%[u_buf]),%%xmm3                               \n" \
  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
  "movq       (%[a_buf]),%%xmm5                               \n" \
  "lea        0x8(%[a_buf]),%[a_buf]                          \n"

// Read 8 UV from 444.  With 8 Alpha.
#define READYUVA444                                               \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
  "movq       (%[a_buf]),%%xmm5                               \n" \
  "lea        0x8(%[a_buf]),%[a_buf]                          \n"

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12                                                  \
  "movq       (%[uv_buf]),%%xmm3                              \n" \
  "lea        0x8(%[uv_buf]),%[uv_buf]                        \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"

// Read 4 VU from NV21, upsample to 8 UV
#define READNV21                                                  \
  "movq       (%[vu_buf]),%%xmm3                              \n" \
  "lea        0x8(%[vu_buf]),%[vu_buf]                        \n" \
  "pshufb     %[kShuffleNV21], %%xmm3                         \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
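// kShuffleNV21 (defined with the other shuffle constants in this file)
// reorders the VU byte pairs into the same duplicated UV layout that
// READNV12 produces with punpcklwd, so both feed the same YUVTORGB path.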

// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
#define READYUY2                                                  \
  "movdqu     (%[yuy2_buf]),%%xmm4                            \n" \
  "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n" \
  "movdqu     (%[yuy2_buf]),%%xmm3                            \n" \
  "pshufb     %[kShuffleYUY2UV], %%xmm3                       \n" \
  "lea        0x10(%[yuy2_buf]),%[yuy2_buf]                   \n"

// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
#define READUYVY                                                  \
  "movdqu     (%[uyvy_buf]),%%xmm4                            \n" \
  "pshufb     %[kShuffleUYVYY], %%xmm4                        \n" \
  "movdqu     (%[uyvy_buf]),%%xmm3                            \n" \
  "pshufb     %[kShuffleUYVYUV], %%xmm3                       \n" \
  "lea        0x10(%[uyvy_buf]),%[uyvy_buf]                   \n"

// Read 4 UV from P210, upsample to 8 UV
#define READP210                                                  \
  "movdqu     (%[uv_buf]),%%xmm3                              \n" \
  "lea        0x10(%[uv_buf]),%[uv_buf]                       \n" \
  "psrlw      $0x8,%%xmm3                                     \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"
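// P210/P410 samples are 10 bit values stored in the upper bits of each
// 16 bit word, so psrlw $8 keeps the 8 most significant bits of U and V,
// and Y can be fed to pmulhuw as-is with no shift.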

// Read 8 UV from P410
#define READP410                                                  \
  "movdqu     (%[uv_buf]),%%xmm3                              \n" \
  "movdqu     0x10(%[uv_buf]),%%xmm1                          \n" \
  "lea        0x20(%[uv_buf]),%[uv_buf]                       \n" \
  "psrlw      $0x8,%%xmm3                                     \n" \
  "psrlw      $0x8,%%xmm1                                     \n" \
  "packuswb   %%xmm1,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"

#if defined(__x86_64__)
#define YUVTORGB_SETUP(yuvconstants)                              \
  "pcmpeqb    %%xmm13,%%xmm13                                 \n" \
  "movdqa     (%[yuvconstants]),%%xmm8                        \n" \
  "pxor       %%xmm12,%%xmm12                                 \n" \
  "movdqa     32(%[yuvconstants]),%%xmm9                      \n" \
  "psllw      $7,%%xmm13                                      \n" \
  "movdqa     64(%[yuvconstants]),%%xmm10                     \n" \
  "pshufb     %%xmm12,%%xmm13                                 \n" \
  "movdqa     96(%[yuvconstants]),%%xmm11                     \n" \
  "movdqa     128(%[yuvconstants]),%%xmm12                    \n"
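// On x86_64 the setup caches the constants at offsets 0/32/64/96/128 of
// YuvConstants in xmm8..xmm12 and materializes the 0x80 UV bias in xmm13
// (pcmpeqb/psllw $7 leave 0x80 in the low byte of each word; the pshufb by
// zero then broadcasts that byte).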

// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants)                                  \
  "psubb      %%xmm13,%%xmm3                                  \n" \
  "pmulhuw    %%xmm11,%%xmm4                                  \n" \
  "movdqa     %%xmm8,%%xmm0                                   \n" \
  "movdqa     %%xmm9,%%xmm1                                   \n" \
  "movdqa     %%xmm10,%%xmm2                                  \n" \
  "paddw      %%xmm12,%%xmm4                                  \n" \
  "pmaddubsw  %%xmm3,%%xmm0                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm1                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm2                                   \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n" \
  "psubsw     %%xmm1,%%xmm4                                   \n" \
  "movdqa     %%xmm4,%%xmm1                                   \n"
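// Rough scalar equivalent (a reader's sketch; the coefficient names are
// illustrative, not the actual YuvConstants field names, and the exact
// unsigned*signed semantics of pmaddubsw are glossed over):
//   u2 = u - 0x80; v2 = v - 0x80;            // psubb of the 0x80 bias
//   y16 = (y_scaled * ygain >> 16) + ybias;  // pmulhuw + paddw; y_scaled is
//                                            // the 16 bit Y from READ*
//   b16 = y16 + (ub * u2 + vb * v2);         // pmaddubsw + paddsw -> xmm0
//   g16 = y16 - (ug * u2 + vg * v2);         // pmaddubsw + psubsw -> xmm1
//   r16 = y16 + (ur * u2 + vr * v2);         // pmaddubsw + paddsw -> xmm2
// YUVTORGB below shifts each channel right by 6 and packs to unsigned 8 bit;
// STOREAR30 instead keeps the 16 bit values and clamps them to 10 bits.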

#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",

#else
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants)                                  \
  "pcmpeqb    %%xmm0,%%xmm0                                   \n" \
  "pxor       %%xmm1,%%xmm1                                   \n" \
  "psllw      $7,%%xmm0                                       \n" \
  "pshufb     %%xmm1,%%xmm0                                   \n" \
  "psubb      %%xmm0,%%xmm3                                   \n" \
  "pmulhuw    96(%[yuvconstants]),%%xmm4                      \n" \
  "movdqa     (%[yuvconstants]),%%xmm0                        \n" \
  "movdqa     32(%[yuvconstants]),%%xmm1                      \n" \
  "movdqa     64(%[yuvconstants]),%%xmm2                      \n" \
  "pmaddubsw  %%xmm3,%%xmm0                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm1                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm2                                   \n" \
  "movdqa     128(%[yuvconstants]),%%xmm3                     \n" \
  "paddw      %%xmm3,%%xmm4                                   \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n" \
  "psubsw     %%xmm1,%%xmm4                                   \n" \
  "movdqa     %%xmm4,%%xmm1                                   \n"

#define YUVTORGB_REGS
#endif

#define YUVTORGB(yuvconstants)                                    \
  YUVTORGB16(yuvconstants)                                        \
  "psraw      $0x6,%%xmm0                                     \n" \
  "psraw      $0x6,%%xmm1                                     \n" \
  "psraw      $0x6,%%xmm2                                     \n" \
  "packuswb   %%xmm0,%%xmm0                                   \n" \
  "packuswb   %%xmm1,%%xmm1                                   \n" \
  "packuswb   %%xmm2,%%xmm2                                   \n"

// Store 8 ARGB values.
#define STOREARGB                                                  \
  "punpcklbw  %%xmm1,%%xmm0                                    \n" \
  "punpcklbw  %%xmm5,%%xmm2                                    \n" \
  "movdqa     %%xmm0,%%xmm1                                    \n" \
  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
  "punpckhwd  %%xmm2,%%xmm1                                    \n" \
  "movdqu     %%xmm0,(%[dst_argb])                             \n" \
  "movdqu     %%xmm1,0x10(%[dst_argb])                         \n" \
  "lea        0x20(%[dst_argb]), %[dst_argb]                   \n"
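// On entry xmm0/xmm1/xmm2 hold packed B/G/R and xmm5 holds alpha; the
// punpcklbw pair interleaves B,G and R,A, then punpcklwd/punpckhwd yield
// bytes in B,G,R,A memory order (little-endian ARGB).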

// Store 8 RGBA values.
#define STORERGBA                                                  \
  "pcmpeqb   %%xmm5,%%xmm5                                     \n" \
  "punpcklbw %%xmm2,%%xmm1                                     \n" \
  "punpcklbw %%xmm0,%%xmm5                                     \n" \
  "movdqa    %%xmm5,%%xmm0                                     \n" \
  "punpcklwd %%xmm1,%%xmm5                                     \n" \
  "punpckhwd %%xmm1,%%xmm0                                     \n" \
  "movdqu    %%xmm5,(%[dst_rgba])                              \n" \
  "movdqu    %%xmm0,0x10(%[dst_rgba])                          \n" \
  "lea       0x20(%[dst_rgba]),%[dst_rgba]                     \n"

// Store 8 AR30 values.
#define STOREAR30                                                  \
  "psraw      $0x4,%%xmm0                                      \n" \
  "psraw      $0x4,%%xmm1                                      \n" \
  "psraw      $0x4,%%xmm2                                      \n" \
  "pminsw     %%xmm7,%%xmm0                                    \n" \
  "pminsw     %%xmm7,%%xmm1                                    \n" \
  "pminsw     %%xmm7,%%xmm2                                    \n" \
  "pmaxsw     %%xmm6,%%xmm0                                    \n" \
  "pmaxsw     %%xmm6,%%xmm1                                    \n" \
  "pmaxsw     %%xmm6,%%xmm2                                    \n" \
  "psllw      $0x4,%%xmm2                                      \n" \
  "movdqa     %%xmm0,%%xmm3                                    \n" \
  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
  "punpckhwd  %%xmm2,%%xmm3                                    \n" \
  "movdqa     %%xmm1,%%xmm2                                    \n" \
  "punpcklwd  %%xmm5,%%xmm1                                    \n" \
  "punpckhwd  %%xmm5,%%xmm2                                    \n" \
  "pslld      $0xa,%%xmm1                                      \n" \
  "pslld      $0xa,%%xmm2                                      \n" \
  "por        %%xmm1,%%xmm0                                    \n" \
  "por        %%xmm2,%%xmm3                                    \n" \
  "movdqu     %%xmm0,(%[dst_ar30])                             \n" \
  "movdqu     %%xmm3,0x10(%[dst_ar30])                         \n" \
  "lea        0x20(%[dst_ar30]), %[dst_ar30]                   \n"
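// Result is AR30 (2.10.10.10): each 32 bit word packs B in bits 0..9, G in
// bits 10..19, R in bits 20..29 and a 2 bit alpha of 3 in bits 30..31; the
// min/max clamps above keep each channel in [0, 1023].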

void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
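
// Usage sketch (illustrative; the stride variables and width handling are
// assumptions, not part of this file): row functions convert one scanline
// per call, so a planar converter loops over rows. kYuvI601Constants is one
// of the conversion matrices declared in row.h; width is assumed to be a
// multiple of 8 here, since the loop above works in blocks of 8 pixels
// (libyuv's *_Any wrappers normally handle the remainder pixels).
//
//   for (int i = 0; i < height; ++i) {
//     I444ToARGBRow_SSSE3(src_y, src_u, src_v, dst_argb,
//                         &kYuvI601Constants, width);
//     src_y += stride_y;
//     src_u += stride_u;
//     src_v += stride_v;
//     dst_argb += stride_argb;
//   }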

#ifdef HAS_I444ALPHATOARGBROW_SSSE3
void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     const uint8_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
  YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

  LABELALIGN
      "1:                                        \n"
  READYUVA444
  YUVTORGB(yuvconstants)
  STOREARGB
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I444ALPHATOARGBROW_SSSE3

void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
                                 uint8_t* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "movdqa      %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
      "movdqa      %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
      "punpcklbw   %%xmm1,%%xmm0                 \n"
      "punpcklbw   %%xmm2,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklwd   %%xmm2,%%xmm0                 \n"
      "punpckhwd   %%xmm2,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm6,%%xmm1                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"
      "movq        %%xmm0,(%[dst_rgb24])         \n"
      "movdqu      %%xmm1,0x8(%[dst_rgb24])      \n"
      "lea         0x18(%[dst_rgb24]),%[dst_rgb24] \n"
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // AR30 constants
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// 10 bit YUV to ARGB
void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV210
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// 12 bit YUV to ARGB
void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV212
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// 10 bit YUV to AR30
void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV210
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// 12 bit YUV to AR30
void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV212
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// 10 bit YUV to ARGB
void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV410
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#ifdef HAS_I210ALPHATOARGBROW_SSSE3
// 10 bit YUVA to ARGB
void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                     const uint16_t* u_buf,
                                     const uint16_t* v_buf,
                                     const uint16_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUVA210
    YUVTORGB(yuvconstants)
    STOREARGB
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
    : [y_buf] "+r"(y_buf),  // %[y_buf]
      [u_buf] "+r"(u_buf),  // %[u_buf]
      [v_buf] "+r"(v_buf),  // %[v_buf]
      [a_buf] "+r"(a_buf),  // %[a_buf]
      [dst_argb] "+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
      [width] "+m"(width)  // %[width]
#else
      [width] "+rm"(width)  // %[width]
#endif
    : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
      "xmm5");
  // clang-format on
}
#endif

#ifdef HAS_I410ALPHATOARGBROW_SSSE3
// 10 bit YUVA to ARGB
void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                     const uint16_t* u_buf,
                                     const uint16_t* v_buf,
                                     const uint16_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile(
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUVA410
    YUVTORGB(yuvconstants)
    STOREARGB
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
    : [y_buf] "+r"(y_buf),  // %[y_buf]
      [u_buf] "+r"(u_buf),  // %[u_buf]
      [v_buf] "+r"(v_buf),  // %[v_buf]
      [a_buf] "+r"(a_buf),  // %[a_buf]
      [dst_argb] "+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
      [width] "+m"(width)  // %[width]
#else
      [width] "+rm"(width)  // %[width]
#endif
    : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
      "xmm5");
  // clang-format on
}
#endif

// 10 bit YUV to AR30
void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV410
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)   // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

#ifdef HAS_I422ALPHATOARGBROW_SSSE3
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     const uint8_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_SSSE3

void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READNV12
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* vu_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READNV21
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUY2
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READUYVY
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READP210
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_argb]"+r"(dst_argb),        // %[dst_argb]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READP410
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_argb]"+r"(dst_argb),        // %[dst_argb]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READP210
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READP410
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGBA
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#endif  // HAS_I422TOARGBROW_SSSE3

// Read 16 UV from 444
#define READYUV444_AVX2                                               \
  "vmovdqu    (%[u_buf]),%%xmm3                                   \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
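// The AVX2 readers mirror the SSSE3 ones but handle 16 pixels per loop;
// vpermq $0xd8 reorders the 128 bit lanes so that the in-lane unpack
// instructions produce the same element order as the SSE versions.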

// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2                                               \
  "vmovq      (%[u_buf]),%%xmm3                                   \n" \
  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"

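// Read 16 UV from 422, upsample to 32 UV (the permute tables assumed to be
// in zmm16/zmm17 are loaded by the corresponding AVX512BW setup code).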
#define READYUV422_AVX512BW                                           \
  "vmovdqu    (%[u_buf]),%%xmm3                                   \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "vpermq     %%zmm3,%%zmm16,%%zmm3                               \n" \
  "vpermq     %%zmm1,%%zmm16,%%zmm1                               \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
  "vpunpcklbw %%zmm1,%%zmm3,%%zmm3                                \n" \
  "vpermq     $0xd8,%%zmm3,%%zmm3                                 \n" \
  "vpunpcklwd %%zmm3,%%zmm3,%%zmm3                                \n" \
  "vmovdqu8   (%[y_buf]),%%ymm4                                   \n" \
  "vpermq     %%zmm4,%%zmm17,%%zmm4                               \n" \
  "vpermq     $0xd8,%%zmm4,%%zmm4                                 \n" \
  "vpunpcklbw %%zmm4,%%zmm4,%%zmm4                                \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                             \n"

// Read 8 UV from 210, upsample to 16 UV
// TODO(fbarchard): Consider vshufb to replace pack/unpack
// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
#define READYUV210_AVX2                                            \
  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $6,%%ymm4,%%ymm4                                 \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n"

// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
#define READYUVA210_AVX2                                           \
  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $6,%%ymm4,%%ymm4                                 \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n" \
  "vmovdqu    (%[a_buf]),%%ymm5                                \n" \
  "vpsraw     $2,%%ymm5,%%ymm5                                 \n" \
  "vpackuswb  %%ymm5,%%ymm5,%%ymm5                             \n" \
  "lea        0x20(%[a_buf]),%[a_buf]                          \n"

// Read 16 UV from 410
#define READYUV410_AVX2                                            \
  "vmovdqu    (%[u_buf]),%%ymm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%ymm2                 \n" \
  "lea        0x20(%[u_buf]),%[u_buf]                          \n" \
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
  "vpsraw     $2,%%ymm2,%%ymm2                                 \n" \
  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1                             \n" \
  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3                             \n" \
  "vpackuswb  %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $6,%%ymm4,%%ymm4                                 \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n"

// Read 8 UV from 212 12 bit, upsample to 16 UV
#define READYUV212_AVX2                                            \
  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vpsraw     $0x4,%%ymm3,%%ymm3                               \n" \
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $0x4,%%ymm4,%%ymm4                               \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n"

// Read 16 UV from 410. With 16 Alpha.
#define READYUVA410_AVX2                                           \
  "vmovdqu    (%[u_buf]),%%ymm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%ymm2                 \n" \
  "lea        0x20(%[u_buf]),%[u_buf]                          \n" \
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
  "vpsraw     $2,%%ymm2,%%ymm2                                 \n" \
  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1                             \n" \
  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3                             \n" \
  "vpackuswb  %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $6,%%ymm4,%%ymm4                                 \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n" \
  "vmovdqu    (%[a_buf]),%%ymm5                                \n" \
  "vpsraw     $2,%%ymm5,%%ymm5                                 \n" \
  "vpackuswb  %%ymm5,%%ymm5,%%ymm5                             \n" \
  "lea        0x20(%[a_buf]),%[a_buf]                          \n"

// Read 16 UV from 444.  With 16 Alpha.
#define READYUVA444_AVX2                                              \
  "vmovdqu    (%[u_buf]),%%xmm3                                   \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n" \
  "vmovdqu    (%[a_buf]),%%xmm5                                   \n" \
  "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                             \n"

// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
#define READYUVA422_AVX2                                              \
  "vmovq      (%[u_buf]),%%xmm3                                   \n" \
  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n" \
  "vmovdqu    (%[a_buf]),%%xmm5                                   \n" \
  "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                             \n"

// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2                                                 \
  "vmovdqu    (%[uv_buf]),%%xmm3                                  \n" \
  "lea        0x10(%[uv_buf]),%[uv_buf]                           \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
3323 
3324 // Read 8 VU from NV21, upsample to 16 UV.
3325 #define READNV21_AVX2                                                 \
3326   "vmovdqu    (%[vu_buf]),%%xmm3                                  \n" \
3327   "lea        0x10(%[vu_buf]),%[vu_buf]                           \n" \
3328   "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
3329   "vpshufb     %[kShuffleNV21], %%ymm3, %%ymm3                    \n" \
3330   "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
3331   "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
3332   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
3333   "lea        0x10(%[y_buf]),%[y_buf]                             \n"
3334 
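// Editor's note (illustration only): NV21 stores V before U.  The vpshufb
// in READNV21_AVX2 (table kShuffleNV21, defined earlier in this file)
// restores UV order and handles the 2x horizontal upsample.  Scalar form of
// the swap alone, for reference:
static inline void ScalarReadVU(const uint8_t* vu, uint8_t* u, uint8_t* v) {
  *v = vu[0];  // V comes first in NV21
  *u = vu[1];
}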
3335 // Read 8 UV from P210, upsample to 16 UV
3336 #define READP210_AVX2                                                 \
3337   "vmovdqu    (%[uv_buf]),%%ymm3                                  \n" \
3338   "lea        0x20(%[uv_buf]),%[uv_buf]                           \n" \
3339   "vpsrlw     $0x8,%%ymm3,%%ymm3                                  \n" \
3340   "vpackuswb  %%ymm3,%%ymm3,%%ymm3                                \n" \
3341   "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
3342   "vmovdqu    (%[y_buf]),%%ymm4                                   \n" \
3343   "lea        0x20(%[y_buf]),%[y_buf]                             \n"
3344 
3345 // Read 16 UV from P410
3346 #define READP410_AVX2                                                 \
3347   "vmovdqu    (%[uv_buf]),%%ymm3                                  \n" \
3348   "vmovdqu    0x20(%[uv_buf]),%%ymm1                              \n" \
3349   "lea        0x40(%[uv_buf]),%[uv_buf]                           \n" \
3350   "vpsrlw     $0x8,%%ymm3,%%ymm3                                  \n" \
3351   "vpsrlw     $0x8,%%ymm1,%%ymm1                                  \n" \
3352   "vpackuswb  %%ymm1,%%ymm3,%%ymm3                                \n" \
3353   "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
3354   "vmovdqu    (%[y_buf]),%%ymm4                                   \n" \
3355   "lea        0x20(%[y_buf]),%[y_buf]                             \n"
3356 
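// Editor's note: illustration only.  P210/P410 carry 10 bit samples
// MSB-aligned in 16 bits, so the vpsrlw $8 in READP210/READP410 keeps the
// top 8 bits directly, while Y stays at full 16 bits for vpmulhuw:
static inline uint8_t ScalarNarrowP10(uint16_t uv_msb_aligned) {
  return (uint8_t)(uv_msb_aligned >> 8);
}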
3357 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
3358 #define READYUY2_AVX2                                                 \
3359   "vmovdqu    (%[yuy2_buf]),%%ymm4                                \n" \
3360   "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n" \
3361   "vmovdqu    (%[yuy2_buf]),%%ymm3                                \n" \
3362   "vpshufb    %[kShuffleYUY2UV], %%ymm3, %%ymm3                   \n" \
3363   "lea        0x20(%[yuy2_buf]),%[yuy2_buf]                       \n"
3364 
3365 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
3366 #define READUYVY_AVX2                                                 \
3367   "vmovdqu    (%[uyvy_buf]),%%ymm4                                \n" \
3368   "vpshufb    %[kShuffleUYVYY], %%ymm4, %%ymm4                    \n" \
3369   "vmovdqu    (%[uyvy_buf]),%%ymm3                                \n" \
3370   "vpshufb    %[kShuffleUYVYUV], %%ymm3, %%ymm3                   \n" \
3371   "lea        0x20(%[uyvy_buf]),%[uyvy_buf]                       \n"
3372 
3373 // TODO(fbarchard): Remove broadcastb
3374 #if defined(__x86_64__)
3375 #define YUVTORGB_SETUP_AVX2(yuvconstants)                             \
3376   "vpcmpeqb    %%xmm13,%%xmm13,%%xmm13                            \n" \
3377   "vmovdqa     (%[yuvconstants]),%%ymm8                           \n" \
3378   "vpsllw      $7,%%xmm13,%%xmm13                                 \n" \
3379   "vmovdqa     32(%[yuvconstants]),%%ymm9                         \n" \
3380   "vpbroadcastb %%xmm13,%%ymm13                                   \n" \
3381   "vmovdqa     64(%[yuvconstants]),%%ymm10                        \n" \
3382   "vmovdqa     96(%[yuvconstants]),%%ymm11                        \n" \
3383   "vmovdqa     128(%[yuvconstants]),%%ymm12                       \n"
3384 
3385 #define YUVTORGB_SETUP_AVX512BW(yuvconstants)                         \
3386   "vpcmpeqb   %%xmm13,%%xmm13,%%xmm13                             \n" \
3387   "movdqa     (%[yuvconstants]),%%xmm8                            \n" \
3388   "vpbroadcastq %%xmm8, %%zmm8                                    \n" \
3389   "vpsllw     $7,%%xmm13,%%xmm13                                  \n" \
3390   "vpbroadcastb %%xmm13,%%zmm13                                   \n" \
3391   "movq     32(%[yuvconstants]),%%xmm9                            \n" \
3392   "vpbroadcastq %%xmm9,%%zmm9                                     \n" \
3393   "movq     64(%[yuvconstants]),%%xmm10                           \n" \
3394   "vpbroadcastq %%xmm10,%%zmm10                                   \n" \
3395   "movq     96(%[yuvconstants]),%%xmm11                           \n" \
3396   "vpbroadcastq %%xmm11,%%zmm11                                   \n" \
3397   "movq     128(%[yuvconstants]),%%xmm12                          \n" \
3398   "vpbroadcastq %%xmm12,%%zmm12                                   \n" \
3399   "vmovdqu8 (%[quadsplitperm]),%%zmm16                            \n" \
3400   "vmovdqu8 (%[dquadsplitperm]),%%zmm17                           \n" \
3401   "vmovdqu8 (%[unperm]),%%zmm18                                   \n"
3402 
3403 #define YUVTORGB16_AVX2(yuvconstants)                                 \
3404   "vpsubb      %%ymm13,%%ymm3,%%ymm3                              \n" \
3405   "vpmulhuw    %%ymm11,%%ymm4,%%ymm4                              \n" \
3406   "vpmaddubsw  %%ymm3,%%ymm8,%%ymm0                               \n" \
3407   "vpmaddubsw  %%ymm3,%%ymm9,%%ymm1                               \n" \
3408   "vpmaddubsw  %%ymm3,%%ymm10,%%ymm2                              \n" \
3409   "vpaddw      %%ymm4,%%ymm12,%%ymm4                              \n" \
3410   "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
3411   "vpsubsw     %%ymm1,%%ymm4,%%ymm1                               \n" \
3412   "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
3413 
3414 #define YUVTORGB16_AVX512BW(yuvconstants)                             \
3415   "vpsubb      %%zmm13,%%zmm3,%%zmm3                              \n" \
3416   "vpmulhuw    %%zmm11,%%zmm4,%%zmm4                              \n" \
3417   "vpmaddubsw  %%zmm3,%%zmm8,%%zmm0                               \n" \
3418   "vpmaddubsw  %%zmm3,%%zmm9,%%zmm1                               \n" \
3419   "vpmaddubsw  %%zmm3,%%zmm10,%%zmm2                              \n" \
3420   "vpaddw      %%zmm4,%%zmm12,%%zmm4                              \n" \
3421   "vpaddsw     %%zmm4,%%zmm0,%%zmm0                               \n" \
3422   "vpsubsw     %%zmm1,%%zmm4,%%zmm1                               \n" \
3423   "vpaddsw     %%zmm4,%%zmm2,%%zmm2                               \n"
3424 
3425 #define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
3426 #define YUVTORGB_REGS_AVX512BW \
3427   "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm16", "xmm17", "xmm18",
3428 
3429 #else  // 32 bit: no xmm8-xmm15, so constants are reloaded inside the loop.
3430 
3431 #define YUVTORGB_SETUP_AVX2(yuvconstants)
3432 #define YUVTORGB16_AVX2(yuvconstants)                                 \
3433   "vpcmpeqb    %%xmm0,%%xmm0,%%xmm0                               \n" \
3434   "vpsllw      $7,%%xmm0,%%xmm0                                   \n" \
3435   "vpbroadcastb %%xmm0,%%ymm0                                     \n" \
3436   "vpsubb      %%ymm0,%%ymm3,%%ymm3                               \n" \
3437   "vpmulhuw    96(%[yuvconstants]),%%ymm4,%%ymm4                  \n" \
3438   "vmovdqa     (%[yuvconstants]),%%ymm0                           \n" \
3439   "vmovdqa     32(%[yuvconstants]),%%ymm1                         \n" \
3440   "vmovdqa     64(%[yuvconstants]),%%ymm2                         \n" \
3441   "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0                               \n" \
3442   "vpmaddubsw  %%ymm3,%%ymm1,%%ymm1                               \n" \
3443   "vpmaddubsw  %%ymm3,%%ymm2,%%ymm2                               \n" \
3444   "vmovdqa     128(%[yuvconstants]),%%ymm3                        \n" \
3445   "vpaddw      %%ymm4,%%ymm3,%%ymm4                               \n" \
3446   "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
3447   "vpsubsw     %%ymm1,%%ymm4,%%ymm1                               \n" \
3448   "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
3449 
3450 #define YUVTORGB_REGS_AVX2
3451 #endif
3452 
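// Editor's note: a scalar sketch (illustration only) of what YUVTORGB16
// computes per pixel, in fixed point with 6 fractional bits.  The
// coefficient names are assumptions mapping the YuvConstants rows loaded at
// offsets 0/32/64/96/128 above; saturating adds are not modeled.
static inline void ScalarYuvToRgb16(uint16_t y16, uint8_t u, uint8_t v,
                                    int ub, int ug, int vg, int vr,
                                    int yg, int ygb,
                                    int16_t* b, int16_t* g, int16_t* r) {
  int y = (int)(((uint32_t)y16 * (uint32_t)yg) >> 16) + ygb;  // vpmulhuw+vpaddw
  *b = (int16_t)(y + (u - 128) * ub);                         // vpaddsw
  *g = (int16_t)(y - ((u - 128) * ug + (v - 128) * vg));      // vpsubsw
  *r = (int16_t)(y + (v - 128) * vr);                         // vpaddsw
}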
3453 #define YUVTORGB_AVX2(yuvconstants)                                   \
3454   YUVTORGB16_AVX2(yuvconstants)                                       \
3455   "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
3456   "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
3457   "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
3458   "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
3459   "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
3460   "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
3461 
3462 #define YUVTORGB_AVX512BW(yuvconstants)                               \
3463   YUVTORGB16_AVX512BW(yuvconstants)                                   \
3464   "vpsraw     $0x6,%%zmm0,%%zmm0                                  \n" \
3465   "vpsraw     $0x6,%%zmm1,%%zmm1                                  \n" \
3466   "vpsraw     $0x6,%%zmm2,%%zmm2                                  \n" \
3467   "vpackuswb  %%zmm0,%%zmm0,%%zmm0                                \n" \
3468   "vpackuswb  %%zmm1,%%zmm1,%%zmm1                                \n" \
3469   "vpackuswb  %%zmm2,%%zmm2,%%zmm2                                \n"
3470 
3471 // Store 16 ARGB values.
3472 #define STOREARGB_AVX2                                                \
3473   "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
3474   "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
3475   "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n" \
3476   "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
3477   "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n" \
3478   "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n" \
3479   "vmovdqu    %%ymm1,(%[dst_argb])                                \n" \
3480   "vmovdqu    %%ymm0,0x20(%[dst_argb])                            \n" \
3481   "lea        0x40(%[dst_argb]), %[dst_argb]                      \n"
3482 
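// Editor's note: illustration only.  STOREARGB_AVX2 interleaves the packed
// B, G, R rows with alpha into the memory byte order B,G,R,A, i.e. a
// little-endian ARGB word:
static inline uint32_t ScalarPackARGB(uint8_t b, uint8_t g, uint8_t r,
                                      uint8_t a) {
  return (uint32_t)b | ((uint32_t)g << 8) | ((uint32_t)r << 16) |
         ((uint32_t)a << 24);
}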
3483 // Store 32 ARGB values.
3484 #define STOREARGB_AVX512BW                                            \
3485   "vpunpcklbw %%zmm1,%%zmm0,%%zmm0                                \n" \
3486   "vpermq     %%zmm0,%%zmm18,%%zmm0                               \n" \
3487   "vpunpcklbw %%zmm5,%%zmm2,%%zmm2                                \n" \
3488   "vpermq     %%zmm2,%%zmm18,%%zmm2                               \n" \
3489   "vpunpcklwd %%zmm2,%%zmm0,%%zmm1                                \n" \
3490   "vpunpckhwd %%zmm2,%%zmm0,%%zmm0                                \n" \
3491   "vmovdqu8   %%zmm1,(%[dst_argb])                                \n" \
3492   "vmovdqu8   %%zmm0,0x40(%[dst_argb])                            \n" \
3493   "lea        0x80(%[dst_argb]), %[dst_argb]                      \n"
3494 
3495 // Store 16 AR30 values.
3496 #define STOREAR30_AVX2                                                \
3497   "vpsraw     $0x4,%%ymm0,%%ymm0                                  \n" \
3498   "vpsraw     $0x4,%%ymm1,%%ymm1                                  \n" \
3499   "vpsraw     $0x4,%%ymm2,%%ymm2                                  \n" \
3500   "vpminsw    %%ymm7,%%ymm0,%%ymm0                                \n" \
3501   "vpminsw    %%ymm7,%%ymm1,%%ymm1                                \n" \
3502   "vpminsw    %%ymm7,%%ymm2,%%ymm2                                \n" \
3503   "vpmaxsw    %%ymm6,%%ymm0,%%ymm0                                \n" \
3504   "vpmaxsw    %%ymm6,%%ymm1,%%ymm1                                \n" \
3505   "vpmaxsw    %%ymm6,%%ymm2,%%ymm2                                \n" \
3506   "vpsllw     $0x4,%%ymm2,%%ymm2                                  \n" \
3507   "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
3508   "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
3509   "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
3510   "vpunpckhwd %%ymm2,%%ymm0,%%ymm3                                \n" \
3511   "vpunpcklwd %%ymm2,%%ymm0,%%ymm0                                \n" \
3512   "vpunpckhwd %%ymm5,%%ymm1,%%ymm2                                \n" \
3513   "vpunpcklwd %%ymm5,%%ymm1,%%ymm1                                \n" \
3514   "vpslld     $0xa,%%ymm1,%%ymm1                                  \n" \
3515   "vpslld     $0xa,%%ymm2,%%ymm2                                  \n" \
3516   "vpor       %%ymm1,%%ymm0,%%ymm0                                \n" \
3517   "vpor       %%ymm2,%%ymm3,%%ymm3                                \n" \
3518   "vmovdqu    %%ymm0,(%[dst_ar30])                                \n" \
3519   "vmovdqu    %%ymm3,0x20(%[dst_ar30])                            \n" \
3520   "lea        0x40(%[dst_ar30]), %[dst_ar30]                      \n"
3521 
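// Editor's note: a scalar sketch (illustration only) of STOREAR30_AVX2:
// rescale the 6 fractional bits to 10 bit channels, clamp, then pack
// 2 bit alpha (opaque) + 10 bit R/G/B into one dword, B in the low bits.
static inline uint32_t ScalarPackAR30(int16_t b16, int16_t g16, int16_t r16) {
  int b = b16 >> 4, g = g16 >> 4, r = r16 >> 4;   // vpsraw $4
  if (b < 0) b = 0; else if (b > 1023) b = 1023;  // vpmaxsw / vpminsw
  if (g < 0) g = 0; else if (g > 1023) g = 1023;
  if (r < 0) r = 0; else if (r > 1023) r = 1023;
  return 0xc0000000u | ((uint32_t)r << 20) | ((uint32_t)g << 10) | (uint32_t)b;
}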
3522 #ifdef HAS_I444TOARGBROW_AVX2
3523 // 16 pixels
3524 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
3525 void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
3526                                const uint8_t* u_buf,
3527                                const uint8_t* v_buf,
3528                                uint8_t* dst_argb,
3529                                const struct YuvConstants* yuvconstants,
3530                                int width) {
3531   asm volatile (
3532     YUVTORGB_SETUP_AVX2(yuvconstants)
3533       "sub         %[u_buf],%[v_buf]             \n"
3534       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
3535 
3536     LABELALIGN
3537       "1:                                        \n"
3538     READYUV444_AVX2
3539     YUVTORGB_AVX2(yuvconstants)
3540     STOREARGB_AVX2
3541       "sub         $0x10,%[width]                \n"
3542       "jg          1b                            \n"
3543       "vzeroupper                                \n"
3544   : [y_buf]"+r"(y_buf),    // %[y_buf]
3545     [u_buf]"+r"(u_buf),    // %[u_buf]
3546     [v_buf]"+r"(v_buf),    // %[v_buf]
3547     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
3548     [width]"+rm"(width)    // %[width]
3549   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3550   : "memory", "cc", YUVTORGB_REGS_AVX2
3551     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3552   );
3553 }
3554 #endif  // HAS_I444TOARGBROW_AVX2
3555 
3556 #if defined(HAS_I422TOARGBROW_AVX2)
3557 // 16 pixels
3558 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
3559 void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
3560                                const uint8_t* u_buf,
3561                                const uint8_t* v_buf,
3562                                uint8_t* dst_argb,
3563                                const struct YuvConstants* yuvconstants,
3564                                int width) {
3565   asm volatile (
3566     YUVTORGB_SETUP_AVX2(yuvconstants)
3567       "sub         %[u_buf],%[v_buf]             \n"
3568       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
3569 
3570     LABELALIGN
3571       "1:                                        \n"
3572     READYUV422_AVX2
3573     YUVTORGB_AVX2(yuvconstants)
3574     STOREARGB_AVX2
3575       "sub         $0x10,%[width]                \n"
3576       "jg          1b                            \n"
3577 
3578       "vzeroupper                                \n"
3579   : [y_buf]"+r"(y_buf),    // %[y_buf]
3580     [u_buf]"+r"(u_buf),    // %[u_buf]
3581     [v_buf]"+r"(v_buf),    // %[v_buf]
3582     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
3583     [width]"+rm"(width)    // %[width]
3584   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3585   : "memory", "cc", YUVTORGB_REGS_AVX2
3586     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3587   );
3588 }
3589 #endif  // HAS_I422TOARGBROW_AVX2
3590 
3591 #if defined(HAS_I422TOARGBROW_AVX512BW)
3592 static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2};
3593 static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4};
3594 static const uint64_t kUnpermuteAVX512[8] = {0, 4, 1, 5, 2, 6, 3, 7};
3595 
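// Editor's note: the three tables above are qword (64 bit lane) indices for
// vpermq in the AVX512 read/store macros; semantics, for reference:
//   dst.qword[i] = src.qword[table[i]]
// so kUnpermuteAVX512 re-interleaves the two halves that the read step split.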
3596 // 32 pixels
3597 // 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 ARGB (128
3598 // bytes).
3599 void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
3600                                    const uint8_t* u_buf,
3601                                    const uint8_t* v_buf,
3602                                    uint8_t* dst_argb,
3603                                    const struct YuvConstants* yuvconstants,
3604                                    int width) {
3605   asm volatile (
3606     YUVTORGB_SETUP_AVX512BW(yuvconstants)
3607       "sub         %[u_buf],%[v_buf]             \n"
3608       "vpcmpeqb    %%xmm5,%%xmm5,%%xmm5          \n"
3609       "vpbroadcastq %%xmm5,%%zmm5                \n"
3610 
3611     LABELALIGN
3612       "1:                                        \n"
3613     READYUV422_AVX512BW
3614     YUVTORGB_AVX512BW(yuvconstants)
3615     STOREARGB_AVX512BW
3616       "sub         $0x20,%[width]                \n"
3617       "jg          1b                            \n"
3618 
3619       "vzeroupper                                \n"
3620   : [y_buf]"+r"(y_buf),                         // %[y_buf]
3621     [u_buf]"+r"(u_buf),                         // %[u_buf]
3622     [v_buf]"+r"(v_buf),                         // %[v_buf]
3623     [dst_argb]"+r"(dst_argb),                   // %[dst_argb]
3624     [width]"+rm"(width)                         // %[width]
3625   : [yuvconstants]"r"(yuvconstants),            // %[yuvconstants]
3626     [quadsplitperm]"r"(kSplitQuadWords),        // %[quadsplitperm]
3627     [dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm]
3628     [unperm]"r"(kUnpermuteAVX512)               // %[unperm]
3629   : "memory", "cc", YUVTORGB_REGS_AVX512BW
3630     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3631   );
3632 }
3633 #endif  // HAS_I422TOARGBROW_AVX512BW
3634 
3635 #if defined(HAS_I422TOAR30ROW_AVX2)
3636 // 16 pixels
3637 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
3638 void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
3639                                const uint8_t* u_buf,
3640                                const uint8_t* v_buf,
3641                                uint8_t* dst_ar30,
3642                                const struct YuvConstants* yuvconstants,
3643                                int width) {
3644   asm volatile (
3645     YUVTORGB_SETUP_AVX2(yuvconstants)
3646       "sub         %[u_buf],%[v_buf]             \n"
3647       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
3648       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
3649       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
3650       "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
3651       "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
3652       "vpsrlw      $6,%%ymm7,%%ymm7              \n"
3653 
3654     LABELALIGN
3655       "1:                                        \n"
3656     READYUV422_AVX2
3657     YUVTORGB16_AVX2(yuvconstants)
3658     STOREAR30_AVX2
3659       "sub         $0x10,%[width]                \n"
3660       "jg          1b                            \n"
3661 
3662       "vzeroupper                                \n"
3663   : [y_buf]"+r"(y_buf),    // %[y_buf]
3664     [u_buf]"+r"(u_buf),    // %[u_buf]
3665     [v_buf]"+r"(v_buf),    // %[v_buf]
3666     [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
3667     [width]"+rm"(width)    // %[width]
3668   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3669   : "memory", "cc", YUVTORGB_REGS_AVX2
3670     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3671   );
3672 }
3673 #endif  // HAS_I422TOAR30ROW_AVX2
3674 
3675 #if defined(HAS_I210TOARGBROW_AVX2)
3676 // 16 pixels
3677 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
3678 void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
3679                                const uint16_t* u_buf,
3680                                const uint16_t* v_buf,
3681                                uint8_t* dst_argb,
3682                                const struct YuvConstants* yuvconstants,
3683                                int width) {
3684   asm volatile (
3685     YUVTORGB_SETUP_AVX2(yuvconstants)
3686       "sub         %[u_buf],%[v_buf]             \n"
3687       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
3688 
3689     LABELALIGN
3690       "1:                                        \n"
3691     READYUV210_AVX2
3692     YUVTORGB_AVX2(yuvconstants)
3693     STOREARGB_AVX2
3694       "sub         $0x10,%[width]                \n"
3695       "jg          1b                            \n"
3696 
3697       "vzeroupper                                \n"
3698   : [y_buf]"+r"(y_buf),    // %[y_buf]
3699     [u_buf]"+r"(u_buf),    // %[u_buf]
3700     [v_buf]"+r"(v_buf),    // %[v_buf]
3701     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
3702     [width]"+rm"(width)    // %[width]
3703   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3704   : "memory", "cc", YUVTORGB_REGS_AVX2
3705     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3706   );
3707 }
3708 #endif  // HAS_I210TOARGBROW_AVX2
3709 
3710 #if defined(HAS_I212TOARGBROW_AVX2)
3711 // 16 pixels
3712 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
3713 void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf,
3714                                const uint16_t* u_buf,
3715                                const uint16_t* v_buf,
3716                                uint8_t* dst_argb,
3717                                const struct YuvConstants* yuvconstants,
3718                                int width) {
3719   asm volatile (
3720     YUVTORGB_SETUP_AVX2(yuvconstants)
3721       "sub         %[u_buf],%[v_buf]             \n"
3722       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
3723 
3724     LABELALIGN
3725       "1:                                        \n"
3726     READYUV212_AVX2
3727     YUVTORGB_AVX2(yuvconstants)
3728     STOREARGB_AVX2
3729       "sub         $0x10,%[width]                \n"
3730       "jg          1b                            \n"
3731 
3732       "vzeroupper                                \n"
3733   : [y_buf]"+r"(y_buf),    // %[y_buf]
3734     [u_buf]"+r"(u_buf),    // %[u_buf]
3735     [v_buf]"+r"(v_buf),    // %[v_buf]
3736     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
3737     [width]"+rm"(width)    // %[width]
3738   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3739   : "memory", "cc", YUVTORGB_REGS_AVX2
3740     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3741   );
3742 }
3743 #endif  // HAS_I212TOARGBROW_AVX2
3744 
3745 #if defined(HAS_I210TOAR30ROW_AVX2)
3746 // 16 pixels
3747 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
3748 void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
3749                                const uint16_t* u_buf,
3750                                const uint16_t* v_buf,
3751                                uint8_t* dst_ar30,
3752                                const struct YuvConstants* yuvconstants,
3753                                int width) {
3754   asm volatile (
3755     YUVTORGB_SETUP_AVX2(yuvconstants)
3756       "sub         %[u_buf],%[v_buf]             \n"
3757       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
3758       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
3759       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
3760       "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
3761       "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
3762       "vpsrlw      $6,%%ymm7,%%ymm7              \n"
3763 
3764     LABELALIGN
3765       "1:                                        \n"
3766     READYUV210_AVX2
3767     YUVTORGB16_AVX2(yuvconstants)
3768     STOREAR30_AVX2
3769       "sub         $0x10,%[width]                \n"
3770       "jg          1b                            \n"
3771 
3772       "vzeroupper                                \n"
3773   : [y_buf]"+r"(y_buf),    // %[y_buf]
3774     [u_buf]"+r"(u_buf),    // %[u_buf]
3775     [v_buf]"+r"(v_buf),    // %[v_buf]
3776     [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
3777     [width]"+rm"(width)    // %[width]
3778   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3779   : "memory", "cc", YUVTORGB_REGS_AVX2
3780     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3781   );
3782 }
3783 #endif  // HAS_I210TOAR30ROW_AVX2
3784 
3785 #if defined(HAS_I212TOAR30ROW_AVX2)
3786 // 16 pixels
3787 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
3788 void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
3789                                const uint16_t* u_buf,
3790                                const uint16_t* v_buf,
3791                                uint8_t* dst_ar30,
3792                                const struct YuvConstants* yuvconstants,
3793                                int width) {
3794   asm volatile (
3795     YUVTORGB_SETUP_AVX2(yuvconstants)
3796       "sub         %[u_buf],%[v_buf]             \n"
3797       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
3798       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
3799       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
3800       "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
3801       "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
3802       "vpsrlw      $6,%%ymm7,%%ymm7              \n"
3803 
3804     LABELALIGN
3805       "1:                                        \n"
3806     READYUV212_AVX2
3807     YUVTORGB16_AVX2(yuvconstants)
3808     STOREAR30_AVX2
3809       "sub         $0x10,%[width]                \n"
3810       "jg          1b                            \n"
3811 
3812       "vzeroupper                                \n"
3813   : [y_buf]"+r"(y_buf),    // %[y_buf]
3814     [u_buf]"+r"(u_buf),    // %[u_buf]
3815     [v_buf]"+r"(v_buf),    // %[v_buf]
3816     [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
3817     [width]"+rm"(width)    // %[width]
3818   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3819   : "memory", "cc", YUVTORGB_REGS_AVX2
3820     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3821   );
3822 }
3823 #endif  // HAS_I212TOAR30ROW_AVX2
3824 
3825 #if defined(HAS_I410TOARGBROW_AVX2)
3826 // 16 pixels
3827 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
3828 void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
3829                                const uint16_t* u_buf,
3830                                const uint16_t* v_buf,
3831                                uint8_t* dst_argb,
3832                                const struct YuvConstants* yuvconstants,
3833                                int width) {
3834   asm volatile (
3835     YUVTORGB_SETUP_AVX2(yuvconstants)
3836       "sub         %[u_buf],%[v_buf]             \n"
3837       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
3838 
3839     LABELALIGN
3840       "1:                                        \n"
3841     READYUV410_AVX2
3842     YUVTORGB_AVX2(yuvconstants)
3843     STOREARGB_AVX2
3844       "sub         $0x10,%[width]                \n"
3845       "jg          1b                            \n"
3846       "vzeroupper                                \n"
3847 
3848   : [y_buf]"+r"(y_buf),    // %[y_buf]
3849     [u_buf]"+r"(u_buf),    // %[u_buf]
3850     [v_buf]"+r"(v_buf),    // %[v_buf]
3851     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
3852     [width]"+rm"(width)    // %[width]
3853   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3854   : "memory", "cc", YUVTORGB_REGS_AVX2
3855       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3856   );
3857 }
3858 #endif  // HAS_I410TOARGBROW_AVX2
3859 
3860 #if defined(HAS_I210ALPHATOARGBROW_AVX2)
3861 // 16 pixels
3862 // 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
3863 void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
3864                                     const uint16_t* u_buf,
3865                                     const uint16_t* v_buf,
3866                                     const uint16_t* a_buf,
3867                                     uint8_t* dst_argb,
3868                                     const struct YuvConstants* yuvconstants,
3869                                     int width) {
3870   asm volatile (
3871     YUVTORGB_SETUP_AVX2(yuvconstants)
3872       "sub         %[u_buf],%[v_buf]             \n"
3873 
3874     LABELALIGN
3875       "1:                                        \n" READYUVA210_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
3876       "subl        $0x10,%[width]                \n"
3877       "jg          1b                            \n"
3878       "vzeroupper                                \n"
3879 
3880       : [y_buf] "+r"(y_buf),        // %[y_buf]
3881         [u_buf] "+r"(u_buf),        // %[u_buf]
3882         [v_buf] "+r"(v_buf),        // %[v_buf]
3883         [a_buf] "+r"(a_buf),        // %[a_buf]
3884         [dst_argb] "+r"(dst_argb),  // %[dst_argb]
3885 #if defined(__i386__)
3886         [width] "+m"(width)  // %[width]
3887 #else
3888         [width] "+rm"(width)  // %[width]
3889 #endif
3890       : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
3891       : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
3892         "xmm4", "xmm5");
3893 }
3894 #endif  // HAS_I210ALPHATOARGBROW_AVX2
3895 
3896 #if defined(HAS_I410ALPHATOARGBROW_AVX2)
3897 // 16 pixels
3898 // 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
I410AlphaToARGBRow_AVX2(const uint16_t * y_buf,const uint16_t * u_buf,const uint16_t * v_buf,const uint16_t * a_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)3899 void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
3900                                     const uint16_t* u_buf,
3901                                     const uint16_t* v_buf,
3902                                     const uint16_t* a_buf,
3903                                     uint8_t* dst_argb,
3904                                     const struct YuvConstants* yuvconstants,
3905                                     int width) {
3906   asm volatile (
3907     YUVTORGB_SETUP_AVX2(yuvconstants)
3908       "sub         %[u_buf],%[v_buf]             \n"
3909 
3910     LABELALIGN
3911       "1:                                        \n" READYUVA410_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
3912       "subl        $0x10,%[width]                \n"
3913       "jg          1b                            \n"
3914       "vzeroupper                                \n"
3915 
3916       : [y_buf] "+r"(y_buf),        // %[y_buf]
3917         [u_buf] "+r"(u_buf),        // %[u_buf]
3918         [v_buf] "+r"(v_buf),        // %[v_buf]
3919         [a_buf] "+r"(a_buf),        // %[a_buf]
3920         [dst_argb] "+r"(dst_argb),  // %[dst_argb]
3921 #if defined(__i386__)
3922         [width] "+m"(width)  // %[width]
3923 #else
3924         [width] "+rm"(width)  // %[width]
3925 #endif
3926       : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
3927       : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
3928         "xmm4", "xmm5");
3929 }
3930 #endif  // HAS_I410ALPHATOARGBROW_AVX2
3931 
3932 #if defined(HAS_I410TOAR30ROW_AVX2)
3933 // 16 pixels
3934 // 16 UV values with 16 Y producing 16 AR30 (64 bytes).
I410ToAR30Row_AVX2(const uint16_t * y_buf,const uint16_t * u_buf,const uint16_t * v_buf,uint8_t * dst_ar30,const struct YuvConstants * yuvconstants,int width)3935 void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
3936                                const uint16_t* u_buf,
3937                                const uint16_t* v_buf,
3938                                uint8_t* dst_ar30,
3939                                const struct YuvConstants* yuvconstants,
3940                                int width) {
3941   asm volatile (
3942     YUVTORGB_SETUP_AVX2(yuvconstants)
3943       "sub         %[u_buf],%[v_buf]             \n"
3944       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
3945       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
3946       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
3947       "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
3948       "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
3949       "vpsrlw      $6,%%ymm7,%%ymm7              \n"
3950 
3951     LABELALIGN
3952       "1:                                        \n"
3953     READYUV410_AVX2
3954     YUVTORGB16_AVX2(yuvconstants)
3955     STOREAR30_AVX2
3956       "sub         $0x10,%[width]                \n"
3957       "jg          1b                            \n"
3958 
3959       "vzeroupper                                \n"
3960   : [y_buf]"+r"(y_buf),    // %[y_buf]
3961     [u_buf]"+r"(u_buf),    // %[u_buf]
3962     [v_buf]"+r"(v_buf),    // %[v_buf]
3963     [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
3964     [width]"+rm"(width)    // %[width]
3965   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
3966   : "memory", "cc", YUVTORGB_REGS_AVX2
3967       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3968   );
3969 }
3970 #endif  // HAS_I410TOAR30ROW_AVX2
3971 
3972 #if defined(HAS_I444ALPHATOARGBROW_AVX2)
3973 // 16 pixels
3974 // 16 UV values with 16 Y and 16 A producing 16 ARGB.
I444AlphaToARGBRow_AVX2(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,const uint8_t * a_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)3975 void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
3976                                     const uint8_t* u_buf,
3977                                     const uint8_t* v_buf,
3978                                     const uint8_t* a_buf,
3979                                     uint8_t* dst_argb,
3980                                     const struct YuvConstants* yuvconstants,
3981                                     int width) {
3982   // clang-format off
3983   asm volatile (
3984   YUVTORGB_SETUP_AVX2(yuvconstants)
3985       "sub         %[u_buf],%[v_buf]             \n"
3986 
3987   LABELALIGN
3988       "1:                                        \n"
3989   READYUVA444_AVX2
3990   YUVTORGB_AVX2(yuvconstants)
3991   STOREARGB_AVX2
3992       "subl        $0x10,%[width]                \n"
3993       "jg          1b                            \n"
3994       "vzeroupper                                \n"
3995   : [y_buf]"+r"(y_buf),    // %[y_buf]
3996     [u_buf]"+r"(u_buf),    // %[u_buf]
3997     [v_buf]"+r"(v_buf),    // %[v_buf]
3998     [a_buf]"+r"(a_buf),    // %[a_buf]
3999     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4000 #if defined(__i386__)
4001     [width]"+m"(width)     // %[width]
4002 #else
4003     [width]"+rm"(width)    // %[width]
4004 #endif
4005   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4006   : "memory", "cc", YUVTORGB_REGS_AVX2
4007       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4008   );
4009   // clang-format on
4010 }
4011 #endif  // HAS_I444ALPHATOARGBROW_AVX2
4012 
4013 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
4014 // 16 pixels
4015 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
I422AlphaToARGBRow_AVX2(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,const uint8_t * a_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)4016 void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
4017                                     const uint8_t* u_buf,
4018                                     const uint8_t* v_buf,
4019                                     const uint8_t* a_buf,
4020                                     uint8_t* dst_argb,
4021                                     const struct YuvConstants* yuvconstants,
4022                                     int width) {
4023   // clang-format off
4024   asm volatile (
4025     YUVTORGB_SETUP_AVX2(yuvconstants)
4026       "sub         %[u_buf],%[v_buf]             \n"
4027 
4028     LABELALIGN
4029       "1:                                        \n"
4030     READYUVA422_AVX2
4031     YUVTORGB_AVX2(yuvconstants)
4032     STOREARGB_AVX2
4033       "subl        $0x10,%[width]                \n"
4034       "jg          1b                            \n"
4035       "vzeroupper                                \n"
4036   : [y_buf]"+r"(y_buf),    // %[y_buf]
4037     [u_buf]"+r"(u_buf),    // %[u_buf]
4038     [v_buf]"+r"(v_buf),    // %[v_buf]
4039     [a_buf]"+r"(a_buf),    // %[a_buf]
4040     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4041 #if defined(__i386__)
4042     [width]"+m"(width)     // %[width]
4043 #else
4044     [width]"+rm"(width)    // %[width]
4045 #endif
4046   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4047   : "memory", "cc", YUVTORGB_REGS_AVX2
4048     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4049   );
4050   // clang-format on
4051 }
4052 #endif  // HAS_I422ALPHATOARGBROW_AVX2
4053 
4054 #if defined(HAS_I422TORGBAROW_AVX2)
4055 // 16 pixels
4056 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
I422ToRGBARow_AVX2(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)4057 void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
4058                                const uint8_t* u_buf,
4059                                const uint8_t* v_buf,
4060                                uint8_t* dst_argb,
4061                                const struct YuvConstants* yuvconstants,
4062                                int width) {
4063   asm volatile (
4064     YUVTORGB_SETUP_AVX2(yuvconstants)
4065       "sub         %[u_buf],%[v_buf]             \n"
4066       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4067 
4068     LABELALIGN
4069       "1:                                        \n"
4070     READYUV422_AVX2
4071     YUVTORGB_AVX2(yuvconstants)
4072 
4073     // Step 3: Weave into RGBA
4074     "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
4075     "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
4076     "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
4077     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
4078     "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
4079     "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
4080     "vmovdqu    %%ymm0,(%[dst_argb])           \n"
4081     "vmovdqu    %%ymm1,0x20(%[dst_argb])       \n"
4082     "lea        0x40(%[dst_argb]),%[dst_argb]  \n"
4083     "sub        $0x10,%[width]                 \n"
4084     "jg         1b                             \n"
4085     "vzeroupper                                \n"
4086   : [y_buf]"+r"(y_buf),    // %[y_buf]
4087     [u_buf]"+r"(u_buf),    // %[u_buf]
4088     [v_buf]"+r"(v_buf),    // %[v_buf]
4089     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4090     [width]"+rm"(width)    // %[width]
4091   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4092   : "memory", "cc", YUVTORGB_REGS_AVX2
4093     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4094   );
4095 }
4096 #endif  // HAS_I422TORGBAROW_AVX2
4097 
4098 #if defined(HAS_NV12TOARGBROW_AVX2)
4099 // 16 pixels.
4100 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
NV12ToARGBRow_AVX2(const uint8_t * y_buf,const uint8_t * uv_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)4101 void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
4102                                const uint8_t* uv_buf,
4103                                uint8_t* dst_argb,
4104                                const struct YuvConstants* yuvconstants,
4105                                int width) {
4106   // clang-format off
4107   asm volatile (
4108     YUVTORGB_SETUP_AVX2(yuvconstants)
4109       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4110 
4111     LABELALIGN
4112       "1:                                        \n"
4113     READNV12_AVX2
4114     YUVTORGB_AVX2(yuvconstants)
4115     STOREARGB_AVX2
4116       "sub         $0x10,%[width]                \n"
4117       "jg          1b                            \n"
4118       "vzeroupper                                \n"
4119   : [y_buf]"+r"(y_buf),    // %[y_buf]
4120     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
4121     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4122     [width]"+rm"(width)    // %[width]
4123   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4124     : "memory", "cc", YUVTORGB_REGS_AVX2
4125     "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4126   );
4127   // clang-format on
4128 }
4129 #endif  // HAS_NV12TOARGBROW_AVX2
4130 
4131 #if defined(HAS_NV21TOARGBROW_AVX2)
4132 // 16 pixels.
4133 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
NV21ToARGBRow_AVX2(const uint8_t * y_buf,const uint8_t * vu_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)4134 void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
4135                                const uint8_t* vu_buf,
4136                                uint8_t* dst_argb,
4137                                const struct YuvConstants* yuvconstants,
4138                                int width) {
4139   // clang-format off
4140   asm volatile (
4141     YUVTORGB_SETUP_AVX2(yuvconstants)
4142       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4143 
4144     LABELALIGN
4145       "1:                                        \n"
4146     READNV21_AVX2
4147     YUVTORGB_AVX2(yuvconstants)
4148     STOREARGB_AVX2
4149       "sub         $0x10,%[width]                \n"
4150       "jg          1b                            \n"
4151       "vzeroupper                                \n"
4152   : [y_buf]"+r"(y_buf),    // %[y_buf]
4153     [vu_buf]"+r"(vu_buf),    // %[vu_buf]
4154     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4155     [width]"+rm"(width)    // %[width]
4156   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
4157     [kShuffleNV21]"m"(kShuffleNV21)
4158     : "memory", "cc", YUVTORGB_REGS_AVX2
4159       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4160   );
4161   // clang-format on
4162 }
4163 #endif  // HAS_NV21TOARGBROW_AVX2
4164 
4165 #if defined(HAS_YUY2TOARGBROW_AVX2)
4166 // 16 pixels.
4167 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
YUY2ToARGBRow_AVX2(const uint8_t * yuy2_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)4168 void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
4169                                uint8_t* dst_argb,
4170                                const struct YuvConstants* yuvconstants,
4171                                int width) {
4172   // clang-format off
4173   asm volatile (
4174     YUVTORGB_SETUP_AVX2(yuvconstants)
4175       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4176 
4177     LABELALIGN
4178       "1:                                        \n"
4179     READYUY2_AVX2
4180     YUVTORGB_AVX2(yuvconstants)
4181     STOREARGB_AVX2
4182       "sub         $0x10,%[width]                \n"
4183       "jg          1b                            \n"
4184       "vzeroupper                                \n"
4185   : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
4186     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4187     [width]"+rm"(width)    // %[width]
4188   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
4189     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
4190     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
4191     : "memory", "cc", YUVTORGB_REGS_AVX2
4192       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4193   );
4194   // clang-format on
4195 }
4196 #endif  // HAS_YUY2TOARGBROW_AVX2
4197 
4198 #if defined(HAS_UYVYTOARGBROW_AVX2)
4199 // 16 pixels.
4200 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
UYVYToARGBRow_AVX2(const uint8_t * uyvy_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)4201 void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
4202                                uint8_t* dst_argb,
4203                                const struct YuvConstants* yuvconstants,
4204                                int width) {
4205   // clang-format off
4206   asm volatile (
4207     YUVTORGB_SETUP_AVX2(yuvconstants)
4208       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4209 
4210     LABELALIGN
4211       "1:                                        \n"
4212     READUYVY_AVX2
4213     YUVTORGB_AVX2(yuvconstants)
4214     STOREARGB_AVX2
4215       "sub         $0x10,%[width]                \n"
4216       "jg          1b                            \n"
4217       "vzeroupper                                \n"
4218   : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
4219     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4220     [width]"+rm"(width)    // %[width]
4221   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
4222     [kShuffleUYVYY]"m"(kShuffleUYVYY),
4223     [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
4224     : "memory", "cc", YUVTORGB_REGS_AVX2
4225       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4226   );
4227   // clang-format on
4228 }
4229 #endif  // HAS_UYVYTOARGBROW_AVX2
4230 
4231 #if defined(HAS_P210TOARGBROW_AVX2)
4232 // 16 pixels.
4233 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
P210ToARGBRow_AVX2(const uint16_t * y_buf,const uint16_t * uv_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)4234 void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
4235                                const uint16_t* uv_buf,
4236                                uint8_t* dst_argb,
4237                                const struct YuvConstants* yuvconstants,
4238                                int width) {
4239   // clang-format off
4240   asm volatile (
4241     YUVTORGB_SETUP_AVX2(yuvconstants)
4242       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4243 
4244     LABELALIGN
4245       "1:                                        \n"
4246     READP210_AVX2
4247     YUVTORGB_AVX2(yuvconstants)
4248     STOREARGB_AVX2
4249       "sub         $0x10,%[width]                \n"
4250       "jg          1b                            \n"
4251       "vzeroupper                                \n"
4252   : [y_buf]"+r"(y_buf),    // %[y_buf]
4253     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
4254     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4255     [width]"+rm"(width)    // %[width]
4256   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4257   : "memory", "cc", YUVTORGB_REGS_AVX2
4258       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4259   );
4260   // clang-format on
4261 }
4262 #endif  // HAS_P210TOARGBROW_AVX2
4263 
4264 #if defined(HAS_P410TOARGBROW_AVX2)
4265 // 16 pixels.
4266 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
P410ToARGBRow_AVX2(const uint16_t * y_buf,const uint16_t * uv_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)4267 void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
4268                                const uint16_t* uv_buf,
4269                                uint8_t* dst_argb,
4270                                const struct YuvConstants* yuvconstants,
4271                                int width) {
4272   // clang-format off
4273   asm volatile (
4274     YUVTORGB_SETUP_AVX2(yuvconstants)
4275       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4276 
4277     LABELALIGN
4278       "1:                                        \n"
4279     READP410_AVX2
4280     YUVTORGB_AVX2(yuvconstants)
4281     STOREARGB_AVX2
4282       "sub         $0x10,%[width]                \n"
4283       "jg          1b                            \n"
4284       "vzeroupper                                \n"
4285   : [y_buf]"+r"(y_buf),    // %[y_buf]
4286     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
4287     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4288     [width]"+rm"(width)    // %[width]
4289   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4290   : "memory", "cc", YUVTORGB_REGS_AVX2
4291       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4292   );
4293   // clang-format on
4294 }
4295 #endif  // HAS_P410TOARGBROW_AVX2
4296 
4297 #if defined(HAS_P210TOAR30ROW_AVX2)
4298 // 16 pixels
4299 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
P210ToAR30Row_AVX2(const uint16_t * y_buf,const uint16_t * uv_buf,uint8_t * dst_ar30,const struct YuvConstants * yuvconstants,int width)4300 void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
4301                                const uint16_t* uv_buf,
4302                                uint8_t* dst_ar30,
4303                                const struct YuvConstants* yuvconstants,
4304                                int width) {
4305   asm volatile (
4306     YUVTORGB_SETUP_AVX2(yuvconstants)
4307       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
4308       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
4309       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
4310       "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
4311       "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
4312       "vpsrlw      $6,%%ymm7,%%ymm7              \n"
4313 
4314     LABELALIGN
4315       "1:                                        \n"
4316     READP210_AVX2
4317     YUVTORGB16_AVX2(yuvconstants)
4318     STOREAR30_AVX2
4319       "sub         $0x10,%[width]                \n"
4320       "jg          1b                            \n"
4321 
4322       "vzeroupper                                \n"
4323   : [y_buf]"+r"(y_buf),    // %[y_buf]
4324     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
4325     [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
4326     [width]"+rm"(width)    // %[width]
4327   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4328   : "memory", "cc", YUVTORGB_REGS_AVX2
4329       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4330   );
4331 }
4332 #endif  // HAS_P210TOAR30ROW_AVX2
4333 
4334 #if defined(HAS_P410TOAR30ROW_AVX2)
4335 // 16 pixels
4336 // 16 UV values with 16 Y producing 16 AR30 (64 bytes).
P410ToAR30Row_AVX2(const uint16_t * y_buf,const uint16_t * uv_buf,uint8_t * dst_ar30,const struct YuvConstants * yuvconstants,int width)4337 void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
4338                                const uint16_t* uv_buf,
4339                                uint8_t* dst_ar30,
4340                                const struct YuvConstants* yuvconstants,
4341                                int width) {
4342   asm volatile (
4343     YUVTORGB_SETUP_AVX2(yuvconstants)
4344       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
4345       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
4346       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
4347       "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
4348       "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
4349       "vpsrlw      $6,%%ymm7,%%ymm7              \n"
4350 
4351     LABELALIGN
4352       "1:                                        \n"
4353     READP410_AVX2
4354     YUVTORGB16_AVX2(yuvconstants)
4355     STOREAR30_AVX2
4356       "sub         $0x10,%[width]                \n"
4357       "jg          1b                            \n"
4358 
4359       "vzeroupper                                \n"
4360   : [y_buf]"+r"(y_buf),    // %[y_buf]
4361     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
4362     [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
4363     [width]"+rm"(width)    // %[width]
4364   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4365   : "memory", "cc", YUVTORGB_REGS_AVX2
4366       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4367   );
4368 }
4369 #endif  // HAS_P410TOAR30ROW_AVX2
4370 
4371 #ifdef HAS_I400TOARGBROW_SSE2
4372 void I400ToARGBRow_SSE2(const uint8_t* y_buf,
4373                         uint8_t* dst_argb,
4374                         const struct YuvConstants* yuvconstants,
4375                         int width) {
4376   asm volatile(
4377       "movdqa      96(%3),%%xmm2                 \n"  // yg = 18997 = 1.164
4378       "movdqa      128(%3),%%xmm3                \n"  // ygb = -1160 = 1.164 * 16
4379       "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 0xff000000
4380       "pslld       $0x18,%%xmm4                  \n"
4381 
4382       LABELALIGN
4383       "1:                                        \n"
4384       // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
4385       "movq      (%0),%%xmm0                     \n"
4386       "lea       0x8(%0),%0                      \n"
4387       "punpcklbw %%xmm0,%%xmm0                   \n"
4388       "pmulhuw   %%xmm2,%%xmm0                   \n"
4389       "paddsw    %%xmm3,%%xmm0                   \n"
4390       "psraw     $6, %%xmm0                      \n"
4391       "packuswb  %%xmm0,%%xmm0                   \n"
4392 
4393       // Step 2: Weave into ARGB
4394       "punpcklbw %%xmm0,%%xmm0                   \n"
4395       "movdqa    %%xmm0,%%xmm1                   \n"
4396       "punpcklwd %%xmm0,%%xmm0                   \n"
4397       "punpckhwd %%xmm1,%%xmm1                   \n"
4398       "por       %%xmm4,%%xmm0                   \n"
4399       "por       %%xmm4,%%xmm1                   \n"
4400       "movdqu    %%xmm0,(%1)                     \n"
4401       "movdqu    %%xmm1,0x10(%1)                 \n"
4402       "lea       0x20(%1),%1                     \n"
4403 
4404       "sub       $0x8,%2                         \n"
4405       "jg        1b                              \n"
4406       : "+r"(y_buf),       // %0
4407         "+r"(dst_argb),    // %1
4408         "+rm"(width)       // %2
4409       : "r"(yuvconstants)  // %3
4410       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
4411 }
4412 #endif  // HAS_I400TOARGBROW_SSE2
4413 
4414 #ifdef HAS_I400TOARGBROW_AVX2
4415 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
4416 // note: vpunpcklbw mutates and vpackuswb unmutates.
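// For reference: AVX2 vpunpcklbw interleaves within each 128-bit lane, so it
// pairs bytes 0..7 with bytes 8..15 of the same lane rather than across the
// full register. The vpermq $0xd8 below places the two source qwords into
// separate lanes first, so the lane-local unpacks still produce 32 contiguous
// result bytes.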
4417 void I400ToARGBRow_AVX2(const uint8_t* y_buf,
4418                         uint8_t* dst_argb,
4419                         const struct YuvConstants* yuvconstants,
4420                         int width) {
4421   asm volatile(
4422       "vmovdqa     96(%3),%%ymm2                 \n"  // yg = 18997 = 1.164
4423       "vmovdqa     128(%3),%%ymm3                \n"  // ygb = -1160 = 1.164*16
4424       "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 0xff000000
4425       "vpslld      $0x18,%%ymm4,%%ymm4           \n"
4426 
4427       LABELALIGN
4428       "1:                                        \n"
4429       // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
4430       "vmovdqu    (%0),%%xmm0                    \n"
4431       "lea        0x10(%0),%0                    \n"
4432       "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
4433       "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
4434       "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
4435       "vpaddsw    %%ymm3,%%ymm0,%%ymm0           \n"
4436       "vpsraw     $0x6,%%ymm0,%%ymm0             \n"
4437       "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
4438       "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
4439       "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
4440       "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
4441       "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
4442       "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
4443       "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
4444       "vmovdqu    %%ymm0,(%1)                    \n"
4445       "vmovdqu    %%ymm1,0x20(%1)                \n"
4446       "lea        0x40(%1),%1                    \n"
4447       "sub        $0x10,%2                       \n"
4448       "jg        1b                              \n"
4449       "vzeroupper                                \n"
4450       : "+r"(y_buf),       // %0
4451         "+r"(dst_argb),    // %1
4452         "+rm"(width)       // %2
4453       : "r"(yuvconstants)  // %3
4454       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
4455 }
4456 #endif  // HAS_I400TOARGBROW_AVX2
4457 
4458 #ifdef HAS_MIRRORROW_SSSE3
4459 // Shuffle table for reversing the bytes.
4460 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
4461                                      7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
4462 
4463 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
4464   intptr_t temp_width = (intptr_t)(width);
4465   asm volatile(
4466 
4467       "movdqa      %3,%%xmm5                     \n"
4468 
4469       LABELALIGN
4470       "1:                                        \n"
4471       "movdqu      -0x10(%0,%2,1),%%xmm0         \n"
4472       "pshufb      %%xmm5,%%xmm0                 \n"
4473       "movdqu      %%xmm0,(%1)                   \n"
4474       "lea         0x10(%1),%1                   \n"
4475       "sub         $0x10,%2                      \n"
4476       "jg          1b                            \n"
4477       : "+r"(src),           // %0
4478         "+r"(dst),           // %1
4479         "+r"(temp_width)     // %2
4480       : "m"(kShuffleMirror)  // %3
4481       : "memory", "cc", "xmm0", "xmm5");
4482 }
4483 #endif  // HAS_MIRRORROW_SSSE3
4484 
4485 #ifdef HAS_MIRRORROW_AVX2
4486 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
4487   intptr_t temp_width = (intptr_t)(width);
4488   asm volatile(
4489 
4490       "vbroadcastf128 %3,%%ymm5                  \n"
4491 
4492       LABELALIGN
4493       "1:                                        \n"
4494       "vmovdqu     -0x20(%0,%2,1),%%ymm0         \n"
4495       "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
4496       "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
4497       "vmovdqu     %%ymm0,(%1)                   \n"
4498       "lea         0x20(%1),%1                   \n"
4499       "sub         $0x20,%2                      \n"
4500       "jg          1b                            \n"
4501       "vzeroupper                                \n"
4502       : "+r"(src),           // %0
4503         "+r"(dst),           // %1
4504         "+r"(temp_width)     // %2
4505       : "m"(kShuffleMirror)  // %3
4506       : "memory", "cc", "xmm0", "xmm5");
4507 }
4508 #endif  // HAS_MIRRORROW_AVX2
4509 
4510 #ifdef HAS_MIRRORUVROW_SSSE3
4511 // Shuffle table for reversing the UV.
4512 static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
4513                                        6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};
4514 
4515 void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
4516   intptr_t temp_width = (intptr_t)(width);
4517   asm volatile(
4518 
4519       "movdqa      %3,%%xmm5                     \n"
4520 
4521       LABELALIGN
4522       "1:                                        \n"
4523       "movdqu      -0x10(%0,%2,2),%%xmm0         \n"
4524       "pshufb      %%xmm5,%%xmm0                 \n"
4525       "movdqu      %%xmm0,(%1)                   \n"
4526       "lea         0x10(%1),%1                   \n"
4527       "sub         $0x8,%2                       \n"
4528       "jg          1b                            \n"
4529       : "+r"(src_uv),          // %0
4530         "+r"(dst_uv),          // %1
4531         "+r"(temp_width)       // %2
4532       : "m"(kShuffleMirrorUV)  // %3
4533       : "memory", "cc", "xmm0", "xmm5");
4534 }
4535 #endif  // HAS_MIRRORUVROW_SSSE3
4536 
4537 #ifdef HAS_MIRRORUVROW_AVX2
4538 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
4539   intptr_t temp_width = (intptr_t)(width);
4540   asm volatile(
4541 
4542       "vbroadcastf128 %3,%%ymm5                  \n"
4543 
4544       LABELALIGN
4545       "1:                                        \n"
4546       "vmovdqu     -0x20(%0,%2,2),%%ymm0         \n"
4547       "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
4548       "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
4549       "vmovdqu     %%ymm0,(%1)                   \n"
4550       "lea         0x20(%1),%1                   \n"
4551       "sub         $0x10,%2                      \n"
4552       "jg          1b                            \n"
4553       "vzeroupper                                \n"
4554       : "+r"(src_uv),          // %0
4555         "+r"(dst_uv),          // %1
4556         "+r"(temp_width)       // %2
4557       : "m"(kShuffleMirrorUV)  // %3
4558       : "memory", "cc", "xmm0", "xmm5");
4559 }
4560 #endif  // HAS_MIRRORUVROW_AVX2
4561 
4562 #ifdef HAS_MIRRORSPLITUVROW_SSSE3
4563 // Shuffle table for reversing the bytes of UV channels.
4564 static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
4565                                             15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
4566 void MirrorSplitUVRow_SSSE3(const uint8_t* src,
4567                             uint8_t* dst_u,
4568                             uint8_t* dst_v,
4569                             int width) {
4570   intptr_t temp_width = (intptr_t)(width);
4571   asm volatile(
4572       "movdqa      %4,%%xmm1                     \n"
4573       "lea         -0x10(%0,%3,2),%0             \n"
4574       "sub         %1,%2                         \n"
4575 
4576       LABELALIGN
4577       "1:                                        \n"
4578       "movdqu      (%0),%%xmm0                   \n"
4579       "lea         -0x10(%0),%0                  \n"
4580       "pshufb      %%xmm1,%%xmm0                 \n"
4581       "movlpd      %%xmm0,(%1)                   \n"
4582       "movhpd      %%xmm0,0x00(%1,%2,1)          \n"
4583       "lea         0x8(%1),%1                    \n"
4584       "sub         $8,%3                         \n"
4585       "jg          1b                            \n"
4586       : "+r"(src),                  // %0
4587         "+r"(dst_u),                // %1
4588         "+r"(dst_v),                // %2
4589         "+r"(temp_width)            // %3
4590       : "m"(kShuffleMirrorSplitUV)  // %4
4591       : "memory", "cc", "xmm0", "xmm1");
4592 }
4593 #endif  // HAS_MIRRORSPLITUVROW_SSSE3
4594 
4595 #ifdef HAS_RGB24MIRRORROW_SSSE3
4596 
4597 // Shuffle first 5 pixels to last 5 mirrored. First byte zero.
4598 static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
4599                                          7u,   8u,  3u,  4u,  5u, 0u,  1u,  2u};
4600 
4601 // Shuffle last 5 pixels to first 5 mirrored. Last byte zero.
4602 static const uvec8 kShuffleMirrorRGB1 = {
4603     13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
4604 
4605 // Shuffle 5 pixels at a time (15 bytes)
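// Each iteration loads three 16-byte groups holding 5 whole pixels each, plus
// one overlapping load for the 16th pixel, reverses each group with the
// tables above, and stitches them back together with overlapping stores,
// mirroring 16 pixels (48 bytes) per pass.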
4606 void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
4607                           uint8_t* dst_rgb24,
4608                           int width) {
4609   intptr_t temp_width = (intptr_t)(width);
4610   src_rgb24 += width * 3 - 48;
4611   asm volatile(
4612       "movdqa      %3,%%xmm4                     \n"
4613       "movdqa      %4,%%xmm5                     \n"
4614 
4615       LABELALIGN
4616       "1:                                        \n"
4617       "movdqu      (%0),%%xmm0                   \n"  // first 5
4618       "movdqu      15(%0),%%xmm1                 \n"  // next 5
4619       "movdqu      30(%0),%%xmm2                 \n"  // next 5
4620       "movdqu      32(%0),%%xmm3                 \n"  // last 1 special
4621       "pshufb      %%xmm4,%%xmm0                 \n"
4622       "pshufb      %%xmm4,%%xmm1                 \n"
4623       "pshufb      %%xmm4,%%xmm2                 \n"
4624       "pshufb      %%xmm5,%%xmm3                 \n"
4625       "lea         -0x30(%0),%0                  \n"
4626       "movdqu      %%xmm0,32(%1)                 \n"  // last 5
4627       "movdqu      %%xmm1,17(%1)                 \n"  // next 5
4628       "movdqu      %%xmm2,2(%1)                  \n"  // next 5
4629       "movlpd      %%xmm3,0(%1)                  \n"  // first 1
4630       "lea         0x30(%1),%1                   \n"
4631       "sub         $0x10,%2                      \n"
4632       "jg          1b                            \n"
4633       : "+r"(src_rgb24),          // %0
4634         "+r"(dst_rgb24),          // %1
4635         "+r"(temp_width)          // %2
4636       : "m"(kShuffleMirrorRGB0),  // %3
4637         "m"(kShuffleMirrorRGB1)   // %4
4638       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4639 }
4640 #endif  // HAS_RGB24MIRRORROW_SSSE3
4641 
4642 #ifdef HAS_ARGBMIRRORROW_SSE2
4643 
4644 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
4645   intptr_t temp_width = (intptr_t)(width);
4646   asm volatile(
4647 
4648       "lea         -0x10(%0,%2,4),%0             \n"
4649 
4650       LABELALIGN
4651       "1:                                        \n"
4652       "movdqu      (%0),%%xmm0                   \n"
4653       "pshufd      $0x1b,%%xmm0,%%xmm0           \n"
4654       "lea         -0x10(%0),%0                  \n"
4655       "movdqu      %%xmm0,(%1)                   \n"
4656       "lea         0x10(%1),%1                   \n"
4657       "sub         $0x4,%2                       \n"
4658       "jg          1b                            \n"
4659       : "+r"(src),        // %0
4660         "+r"(dst),        // %1
4661         "+r"(temp_width)  // %2
4662       :
4663       : "memory", "cc", "xmm0");
4664 }
4665 #endif  // HAS_ARGBMIRRORROW_SSE2
4666 
4667 #ifdef HAS_ARGBMIRRORROW_AVX2
4668 // Shuffle table for reversing the bytes.
4669 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
4670 void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
4671   intptr_t temp_width = (intptr_t)(width);
4672   asm volatile(
4673 
4674       "vmovdqu     %3,%%ymm5                     \n"
4675 
4676       LABELALIGN
4677       "1:                                        \n"
4678       "vpermd      -0x20(%0,%2,4),%%ymm5,%%ymm0  \n"
4679       "vmovdqu     %%ymm0,(%1)                   \n"
4680       "lea         0x20(%1),%1                   \n"
4681       "sub         $0x8,%2                       \n"
4682       "jg          1b                            \n"
4683       "vzeroupper                                \n"
4684       : "+r"(src),                    // %0
4685         "+r"(dst),                    // %1
4686         "+r"(temp_width)              // %2
4687       : "m"(kARGBShuffleMirror_AVX2)  // %3
4688       : "memory", "cc", "xmm0", "xmm5");
4689 }
4690 #endif  // HAS_ARGBMIRRORROW_AVX2
4691 
4692 #ifdef HAS_SPLITUVROW_AVX2
4693 void SplitUVRow_AVX2(const uint8_t* src_uv,
4694                      uint8_t* dst_u,
4695                      uint8_t* dst_v,
4696                      int width) {
4697   asm volatile(
4698       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4699       "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
4700       "sub         %1,%2                         \n"
4701 
4702       LABELALIGN
4703       "1:                                        \n"
4704       "vmovdqu     (%0),%%ymm0                   \n"
4705       "vmovdqu     0x20(%0),%%ymm1               \n"
4706       "lea         0x40(%0),%0                   \n"
4707       "vpsrlw      $0x8,%%ymm0,%%ymm2            \n"
4708       "vpsrlw      $0x8,%%ymm1,%%ymm3            \n"
4709       "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
4710       "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
4711       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
4712       "vpackuswb   %%ymm3,%%ymm2,%%ymm2          \n"
4713       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4714       "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
4715       "vmovdqu     %%ymm0,(%1)                   \n"
4716       "vmovdqu     %%ymm2,0x00(%1,%2,1)          \n"
4717       "lea         0x20(%1),%1                   \n"
4718       "sub         $0x20,%3                      \n"
4719       "jg          1b                            \n"
4720       "vzeroupper                                \n"
4721       : "+r"(src_uv),  // %0
4722         "+r"(dst_u),   // %1
4723         "+r"(dst_v),   // %2
4724         "+r"(width)    // %3
4725       :
4726       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4727 }
4728 #endif  // HAS_SPLITUVROW_AVX2
4729 
4730 #ifdef HAS_SPLITUVROW_SSE2
4731 void SplitUVRow_SSE2(const uint8_t* src_uv,
4732                      uint8_t* dst_u,
4733                      uint8_t* dst_v,
4734                      int width) {
4735   asm volatile(
4736       "pcmpeqb     %%xmm5,%%xmm5                 \n"
4737       "psrlw       $0x8,%%xmm5                   \n"
4738       "sub         %1,%2                         \n"
4739 
4740       LABELALIGN
4741       "1:                                        \n"
4742       "movdqu      (%0),%%xmm0                   \n"
4743       "movdqu      0x10(%0),%%xmm1               \n"
4744       "lea         0x20(%0),%0                   \n"
4745       "movdqa      %%xmm0,%%xmm2                 \n"
4746       "movdqa      %%xmm1,%%xmm3                 \n"
4747       "pand        %%xmm5,%%xmm0                 \n"
4748       "pand        %%xmm5,%%xmm1                 \n"
4749       "packuswb    %%xmm1,%%xmm0                 \n"
4750       "psrlw       $0x8,%%xmm2                   \n"
4751       "psrlw       $0x8,%%xmm3                   \n"
4752       "packuswb    %%xmm3,%%xmm2                 \n"
4753       "movdqu      %%xmm0,(%1)                   \n"
4754       "movdqu      %%xmm2,0x00(%1,%2,1)          \n"
4755       "lea         0x10(%1),%1                   \n"
4756       "sub         $0x10,%3                      \n"
4757       "jg          1b                            \n"
4758       : "+r"(src_uv),  // %0
4759         "+r"(dst_u),   // %1
4760         "+r"(dst_v),   // %2
4761         "+r"(width)    // %3
4762       :
4763       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4764 }
4765 #endif  // HAS_SPLITUVROW_SSE2
4766 
4767 #ifdef HAS_DETILEROW_SSE2
4768 void DetileRow_SSE2(const uint8_t* src,
4769                     ptrdiff_t src_tile_stride,
4770                     uint8_t* dst,
4771                     int width) {
4772   asm volatile(
4773       "1:                                        \n"
4774       "movdqu      (%0),%%xmm0                   \n"
4775       "sub         $0x10,%2                      \n"
4776       "lea         (%0,%3),%0                    \n"
4777       "movdqu      %%xmm0,(%1)                   \n"
4778       "lea         0x10(%1),%1                   \n"
4779       "jg          1b                            \n"
4780       : "+r"(src),            // %0
4781         "+r"(dst),            // %1
4782         "+r"(width)           // %2
4783       : "r"(src_tile_stride)  // %3
4784       : "cc", "memory", "xmm0");
4785 }
4786 #endif  // HAS_DETILEROW_SSE2
4787 
4788 #ifdef HAS_DETILESPLITUVROW_SSSE3
4789 // TODO(greenjustin): Look into generating these constants instead of loading
4790 // them since this can cause branch mispredicts for fPIC code on 32-bit
4791 // machines.
4792 static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14,
4793                                      1, 3, 5, 7, 9, 11, 13, 15};
4794 
4795 // TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very
4796 // slow on older SSE2 processors.
4797 void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
4798                             ptrdiff_t src_tile_stride,
4799                             uint8_t* dst_u,
4800                             uint8_t* dst_v,
4801                             int width) {
4802   asm volatile(
4803       "movdqu      %4,%%xmm1                     \n"
4804       "1:                                        \n"
4805       "movdqu      (%0),%%xmm0                   \n"
4806       "lea         (%0,%5),%0                    \n"
4807       "pshufb      %%xmm1,%%xmm0                 \n"
4808       "movq        %%xmm0,(%1)                   \n"
4809       "lea         0x8(%1),%1                    \n"
4810       "movhps      %%xmm0,(%2)                   \n"
4811       "lea         0x8(%2),%2                    \n"
4812       "sub         $0x10,%3                      \n"
4813       "jg          1b                            \n"
4814       : "+r"(src_uv),         // %0
4815         "+r"(dst_u),          // %1
4816         "+r"(dst_v),          // %2
4817         "+r"(width)           // %3
4818       : "m"(kDeinterlaceUV),  // %4
4819         "r"(src_tile_stride)  // %5
4820       : "cc", "memory", "xmm0", "xmm1");
4821 }
4822 #endif  // HAS_DETILESPLITUVROW_SSSE3
4823 
4824 #ifdef HAS_MERGEUVROW_AVX2
4825 void MergeUVRow_AVX2(const uint8_t* src_u,
4826                      const uint8_t* src_v,
4827                      uint8_t* dst_uv,
4828                      int width) {
4829   asm volatile(
4830 
4831       "sub         %0,%1                         \n"
4832 
4833       LABELALIGN
4834       "1:                                        \n"
4835       "vmovdqu     (%0),%%ymm0                   \n"
4836       "vmovdqu     0x00(%0,%1,1),%%ymm1          \n"
4837       "lea         0x20(%0),%0                   \n"
4838       "vpunpcklbw  %%ymm1,%%ymm0,%%ymm2          \n"
4839       "vpunpckhbw  %%ymm1,%%ymm0,%%ymm0          \n"
4840       "vextractf128 $0x0,%%ymm2,(%2)             \n"
4841       "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
4842       "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
4843       "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
4844       "lea         0x40(%2),%2                   \n"
4845       "sub         $0x20,%3                      \n"
4846       "jg          1b                            \n"
4847       "vzeroupper                                \n"
4848       : "+r"(src_u),   // %0
4849         "+r"(src_v),   // %1
4850         "+r"(dst_uv),  // %2
4851         "+r"(width)    // %3
4852       :
4853       : "memory", "cc", "xmm0", "xmm1", "xmm2");
4854 }
4855 #endif  // HAS_MERGEUVROW_AVX2
4856 
4857 #ifdef HAS_MERGEUVROW_SSE2
4858 void MergeUVRow_SSE2(const uint8_t* src_u,
4859                      const uint8_t* src_v,
4860                      uint8_t* dst_uv,
4861                      int width) {
4862   asm volatile(
4863 
4864       "sub         %0,%1                         \n"
4865 
4866       LABELALIGN
4867       "1:                                        \n"
4868       "movdqu      (%0),%%xmm0                   \n"
4869       "movdqu      0x00(%0,%1,1),%%xmm1          \n"
4870       "lea         0x10(%0),%0                   \n"
4871       "movdqa      %%xmm0,%%xmm2                 \n"
4872       "punpcklbw   %%xmm1,%%xmm0                 \n"
4873       "punpckhbw   %%xmm1,%%xmm2                 \n"
4874       "movdqu      %%xmm0,(%2)                   \n"
4875       "movdqu      %%xmm2,0x10(%2)               \n"
4876       "lea         0x20(%2),%2                   \n"
4877       "sub         $0x10,%3                      \n"
4878       "jg          1b                            \n"
4879       : "+r"(src_u),   // %0
4880         "+r"(src_v),   // %1
4881         "+r"(dst_uv),  // %2
4882         "+r"(width)    // %3
4883       :
4884       : "memory", "cc", "xmm0", "xmm1", "xmm2");
4885 }
4886 #endif  // HAS_MERGEUVROW_SSE2
4887 
4888 #ifdef HAS_MERGEUVROW_16_AVX2
4889 void MergeUVRow_16_AVX2(const uint16_t* src_u,
4890                         const uint16_t* src_v,
4891                         uint16_t* dst_uv,
4892                         int depth,
4893                         int width) {
4894   depth = 16 - depth;
4895   // clang-format off
4896   asm volatile (
4897       "vmovd       %4,%%xmm3                     \n"
4898       "sub         %0,%1                         \n"
4899 
4900     // 16 pixels per loop.
4901     LABELALIGN
4902       "1:                                        \n"
4903       "vmovdqu     (%0),%%ymm0                   \n"
4904       "vmovdqu     (%0,%1,1),%%ymm1              \n"
4905       "add         $0x20,%0                      \n"
4906 
4907       "vpsllw      %%xmm3,%%ymm0,%%ymm0          \n"
4908       "vpsllw      %%xmm3,%%ymm1,%%ymm1          \n"
4909       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm2          \n"  // mutates
4910       "vpunpckhwd  %%ymm1,%%ymm0,%%ymm0          \n"
4911       "vextractf128 $0x0,%%ymm2,(%2)             \n"
4912       "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
4913       "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
4914       "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
4915       "add         $0x40,%2                      \n"
4916       "sub         $0x10,%3                      \n"
4917       "jg          1b                            \n"
4918       "vzeroupper                                \n"
4919   : "+r"(src_u),   // %0
4920     "+r"(src_v),   // %1
4921     "+r"(dst_uv),  // %2
4922     "+r"(width)    // %3
4923   : "r"(depth)     // %4
4924   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
4925   // clang-format on
4926 }
4927 #endif  // HAS_MERGEUVROW_16_AVX2
4928 
4929 #ifdef HAS_SPLITUVROW_16_AVX2
4930 const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8,  9,  12, 13,
4931                                  2, 3, 6, 7, 10, 11, 14, 15};
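// Within each 16-byte lane, kSplitUVShuffle16 gathers the bytes of the even
// words (U) into the low qword and the odd words (V) into the high qword; the
// vpermq $0xd8 below then groups the two U qwords ahead of the two V qwords.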
4932 void SplitUVRow_16_AVX2(const uint16_t* src_uv,
4933                         uint16_t* dst_u,
4934                         uint16_t* dst_v,
4935                         int depth,
4936                         int width) {
4937   depth = 16 - depth;
4938   // clang-format off
4939   asm volatile (
4940       "vmovd       %4,%%xmm3                     \n"
4941       "vbroadcastf128 %5,%%ymm4                  \n"
4942       "sub         %1,%2                         \n"
4943 
4944     // 16 pixels per loop.
4945     LABELALIGN
4946       "1:                                        \n"
4947       "vmovdqu     (%0),%%ymm0                   \n"
4948       "vmovdqu     0x20(%0),%%ymm1               \n"
4949       "add         $0x40,%0                      \n"
4950 
4951       "vpsrlw      %%xmm3,%%ymm0,%%ymm0          \n"
4952       "vpsrlw      %%xmm3,%%ymm1,%%ymm1          \n"
4953       "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
4954       "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
4955       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4956       "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
4957       "vextractf128 $0x0,%%ymm0,(%1)             \n"
4958       "vextractf128 $0x0,%%ymm1,0x10(%1)         \n"
4959       "vextractf128 $0x1,%%ymm0,(%1,%2)          \n"
4960       "vextractf128 $0x1,%%ymm1,0x10(%1,%2)      \n"
4961       "add         $0x20,%1                      \n"
4962       "sub         $0x10,%3                      \n"
4963       "jg          1b                            \n"
4964       "vzeroupper                                \n"
4965   : "+r"(src_uv),   // %0
4966     "+r"(dst_u),    // %1
4967     "+r"(dst_v),    // %2
4968     "+r"(width)     // %3
4969   : "r"(depth),     // %4
4970     "m"(kSplitUVShuffle16) // %5
4971   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
4972   // clang-format on
4973 }
4974 #endif  // HAS_SPLITUVROW_16_AVX2
4975 
4976 // Use scale to convert lsb formats to msb, depending how many bits there are:
4977 // 128 = 9 bits
4978 // 64 = 10 bits
4979 // 16 = 12 bits
4980 // 1 = 16 bits
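// For example, 10-bit input holds 0..1023 in the low bits; scale = 64 gives
// 1023 * 64 = 65472 (0xFFC0), msb-aligned in 16 bits. vpmullw keeps the low
// 16 bits of the product, which is exact whenever value * scale fits in
// 16 bits.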
4981 #ifdef HAS_MULTIPLYROW_16_AVX2
4982 void MultiplyRow_16_AVX2(const uint16_t* src_y,
4983                          uint16_t* dst_y,
4984                          int scale,
4985                          int width) {
4986   // clang-format off
4987   asm volatile (
4988       "vmovd       %3,%%xmm3                     \n"
4989       "vpunpcklwd  %%xmm3,%%xmm3,%%xmm3          \n"
4990       "vbroadcastss %%xmm3,%%ymm3                \n"
4991       "sub         %0,%1                         \n"
4992 
4993     // 32 pixels per loop.
4994     LABELALIGN
4995       "1:                                        \n"
4996       "vmovdqu     (%0),%%ymm0                   \n"
4997       "vmovdqu     0x20(%0),%%ymm1               \n"
4998       "vpmullw     %%ymm3,%%ymm0,%%ymm0          \n"
4999       "vpmullw     %%ymm3,%%ymm1,%%ymm1          \n"
5000       "vmovdqu     %%ymm0,(%0,%1)                \n"
5001       "vmovdqu     %%ymm1,0x20(%0,%1)            \n"
5002       "add         $0x40,%0                      \n"
5003       "sub         $0x20,%2                      \n"
5004       "jg          1b                            \n"
5005       "vzeroupper                                \n"
5006   : "+r"(src_y),   // %0
5007     "+r"(dst_y),   // %1
5008     "+r"(width)    // %2
5009   : "r"(scale)     // %3
5010   : "memory", "cc", "xmm0", "xmm1", "xmm3");
5011   // clang-format on
5012 }
5013 #endif  // HAS_MULTIPLYROW_16_AVX2
5014 
5015 // Use scale to convert msb formats to lsb, depending how many bits there are:
5016 // 512 = 9 bits
5017 // 1024 = 10 bits
5018 // 4096 = 12 bits
5019 // 65536 = 16 bits
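// For example, msb-aligned 10-bit data uses scale = 1024: vpmulhuw computes
// (v * 1024) >> 16 = v >> 6, so 65472 maps back to 1023.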
5020 #ifdef HAS_DIVIDEROW_16_AVX2
5021 void DivideRow_16_AVX2(const uint16_t* src_y,
5022                        uint16_t* dst_y,
5023                        int scale,
5024                        int width) {
5025   // clang-format off
5026   asm volatile (
5027       "vmovd       %3,%%xmm3                     \n"
5028       "vpunpcklwd  %%xmm3,%%xmm3,%%xmm3          \n"
5029       "vbroadcastss %%xmm3,%%ymm3                \n"
5030       "sub         %0,%1                         \n"
5031 
5032     // 32 pixels per loop.
5033     LABELALIGN
5034       "1:                                        \n"
5035       "vmovdqu     (%0),%%ymm0                   \n"
5036       "vmovdqu     0x20(%0),%%ymm1               \n"
5037       "vpmulhuw    %%ymm3,%%ymm0,%%ymm0          \n"
5038       "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
5039       "vmovdqu     %%ymm0,(%0,%1)                \n"
5040       "vmovdqu     %%ymm1,0x20(%0,%1)            \n"
5041       "add         $0x40,%0                      \n"
5042       "sub         $0x20,%2                      \n"
5043       "jg          1b                            \n"
5044       "vzeroupper                                \n"
5045   : "+r"(src_y),   // %0
5046     "+r"(dst_y),   // %1
5047     "+r"(width),   // %2
5048     "+r"(scale)    // %3
5049   :
5050   : "memory", "cc", "xmm0", "xmm1", "xmm3");
5051   // clang-format on
5052 }
5053 #endif  // HAS_DIVIDEROW_16_AVX2
5054 
5055 // Use scale to convert lsb formats to msb, depending how many bits there are:
5056 // 32768 = 9 bits
5057 // 16384 = 10 bits
5058 // 4096 = 12 bits
5059 // 256 = 16 bits
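// For example, 10-bit input uses scale = 16384: pmulhuw computes
// (v * 16384) >> 16 = v >> 2, so 1023 becomes 255, and packuswb clamps
// anything wider.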
5060 void Convert16To8Row_SSSE3(const uint16_t* src_y,
5061                            uint8_t* dst_y,
5062                            int scale,
5063                            int width) {
5064   // clang-format off
5065   asm volatile (
5066       "movd        %3,%%xmm2                     \n"
5067       "punpcklwd   %%xmm2,%%xmm2                 \n"
5068       "pshufd      $0x0,%%xmm2,%%xmm2            \n"
5069 
5070     // 16 pixels per loop.
5071     LABELALIGN
5072       "1:                                        \n"
5073       "movdqu      (%0),%%xmm0                   \n"
5074       "movdqu      0x10(%0),%%xmm1               \n"
5075       "add         $0x20,%0                      \n"
5076       "pmulhuw     %%xmm2,%%xmm0                 \n"
5077       "pmulhuw     %%xmm2,%%xmm1                 \n"
5078       "packuswb    %%xmm1,%%xmm0                 \n"
5079       "movdqu      %%xmm0,(%1)                   \n"
5080       "add         $0x10,%1                      \n"
5081       "sub         $0x10,%2                      \n"
5082       "jg          1b                            \n"
5083   : "+r"(src_y),   // %0
5084     "+r"(dst_y),   // %1
5085     "+r"(width)    // %2
5086   : "r"(scale)     // %3
5087   : "memory", "cc", "xmm0", "xmm1", "xmm2");
5088   // clang-format on
5089 }
5090 
5091 #ifdef HAS_CONVERT16TO8ROW_AVX2
5092 void Convert16To8Row_AVX2(const uint16_t* src_y,
5093                           uint8_t* dst_y,
5094                           int scale,
5095                           int width) {
5096   // clang-format off
5097   asm volatile (
5098       "vmovd       %3,%%xmm2                     \n"
5099       "vpunpcklwd  %%xmm2,%%xmm2,%%xmm2          \n"
5100       "vbroadcastss %%xmm2,%%ymm2                \n"
5101 
5102     // 32 pixels per loop.
5103     LABELALIGN
5104       "1:                                        \n"
5105       "vmovdqu     (%0),%%ymm0                   \n"
5106       "vmovdqu     0x20(%0),%%ymm1               \n"
5107       "add         $0x40,%0                      \n"
5108       "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
5109       "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
5110       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"  // mutates
5111       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
5112       "vmovdqu     %%ymm0,(%1)                   \n"
5113       "add         $0x20,%1                      \n"
5114       "sub         $0x20,%2                      \n"
5115       "jg          1b                            \n"
5116       "vzeroupper                                \n"
5117   : "+r"(src_y),   // %0
5118     "+r"(dst_y),   // %1
5119     "+r"(width)    // %2
5120   : "r"(scale)     // %3
5121   : "memory", "cc", "xmm0", "xmm1", "xmm2");
5122   // clang-format on
5123 }
5124 #endif  // HAS_CONVERT16TO8ROW_AVX2
5125 
5126 // Use scale to convert to lsb formats depending how many bits there are:
5127 // 512 = 9 bits
5128 // 1024 = 10 bits
5129 // 4096 = 12 bits
5130 // TODO(fbarchard): reduce to SSE2
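// For example, punpcklbw below pairs each byte v with itself, forming the
// 16-bit value v * 257 (so 0xFF becomes 0xFFFF); with scale = 1024 for
// 10 bits, pmulhuw computes (v * 257 * 1024) >> 16, mapping 255 to 1023.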
5131 void Convert8To16Row_SSE2(const uint8_t* src_y,
5132                           uint16_t* dst_y,
5133                           int scale,
5134                           int width) {
5135   // clang-format off
5136   asm volatile (
5137       "movd        %3,%%xmm2                     \n"
5138       "punpcklwd   %%xmm2,%%xmm2                 \n"
5139       "pshufd      $0x0,%%xmm2,%%xmm2            \n"
5140 
5141     // 16 pixels per loop.
5142     LABELALIGN
5143       "1:                                        \n"
5144       "movdqu      (%0),%%xmm0                   \n"
5145       "movdqa      %%xmm0,%%xmm1                 \n"
5146       "punpcklbw   %%xmm0,%%xmm0                 \n"
5147       "punpckhbw   %%xmm1,%%xmm1                 \n"
5148       "add         $0x10,%0                      \n"
5149       "pmulhuw     %%xmm2,%%xmm0                 \n"
5150       "pmulhuw     %%xmm2,%%xmm1                 \n"
5151       "movdqu      %%xmm0,(%1)                   \n"
5152       "movdqu      %%xmm1,0x10(%1)               \n"
5153       "add         $0x20,%1                      \n"
5154       "sub         $0x10,%2                      \n"
5155       "jg          1b                            \n"
5156   : "+r"(src_y),   // %0
5157     "+r"(dst_y),   // %1
5158     "+r"(width)    // %2
5159   : "r"(scale)     // %3
5160   : "memory", "cc", "xmm0", "xmm1", "xmm2");
5161   // clang-format on
5162 }
5163 
5164 #ifdef HAS_CONVERT8TO16ROW_AVX2
5165 void Convert8To16Row_AVX2(const uint8_t* src_y,
5166                           uint16_t* dst_y,
5167                           int scale,
5168                           int width) {
5169   // clang-format off
5170   asm volatile (
5171       "vmovd       %3,%%xmm2                     \n"
5172       "vpunpcklwd  %%xmm2,%%xmm2,%%xmm2          \n"
5173       "vbroadcastss %%xmm2,%%ymm2                \n"
5174 
5175     // 32 pixels per loop.
5176     LABELALIGN
5177       "1:                                        \n"
5178       "vmovdqu     (%0),%%ymm0                   \n"
5179       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
5180       "add         $0x20,%0                      \n"
5181       "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"
5182       "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
5183       "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
5184       "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
5185       "vmovdqu     %%ymm0,(%1)                   \n"
5186       "vmovdqu     %%ymm1,0x20(%1)               \n"
5187       "add         $0x40,%1                      \n"
5188       "sub         $0x20,%2                      \n"
5189       "jg          1b                            \n"
5190       "vzeroupper                                \n"
5191   : "+r"(src_y),   // %0
5192     "+r"(dst_y),   // %1
5193     "+r"(width)    // %2
5194   : "r"(scale)     // %3
5195   : "memory", "cc", "xmm0", "xmm1", "xmm2");
5196   // clang-format on
5197 }
5198 #endif  // HAS_CONVERT8TO16ROW_AVX2
5199 
5200 #ifdef HAS_SPLITRGBROW_SSSE3
5201 // Shuffle table for converting RGB to Planar.
5202 static const uvec8 kSplitRGBShuffle[9] = {
5203     {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
5204      128u, 128u},
5205     {128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u,
5206      128u, 128u},
5207     {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u,
5208      7u, 10u, 13u},
5209     {1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
5210      128u, 128u},
5211     {128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u,
5212      128u, 128u},
5213     {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u,
5214      8u, 11u, 14u},
5215     {2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
5216      128u, 128u},
5217     {128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u,
5218      128u, 128u},
5219     {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u,
5220      12u, 15u}};
5221 
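// Each planar output register is assembled from three overlapping pshufb
// results: row 0 of the table pulls bytes 0,3,6,9,12,15 (the first six R
// samples) from the first 16-byte load, the next two rows pull the remaining
// R samples from the second and third loads, and por merges them (an index of
// 128 selects a zero byte).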
5222 void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
5223                        uint8_t* dst_r,
5224                        uint8_t* dst_g,
5225                        uint8_t* dst_b,
5226                        int width) {
5227   asm volatile(
5228 
5229       LABELALIGN
5230       "1:                                        \n"
5231       "movdqu      (%0),%%xmm0                   \n"
5232       "movdqu      0x10(%0),%%xmm1               \n"
5233       "movdqu      0x20(%0),%%xmm2               \n"
5234       "pshufb      0(%5),%%xmm0                  \n"
5235       "pshufb      16(%5),%%xmm1                 \n"
5236       "pshufb      32(%5),%%xmm2                 \n"
5237       "por         %%xmm1,%%xmm0                 \n"
5238       "por         %%xmm2,%%xmm0                 \n"
5239       "movdqu      %%xmm0,(%1)                   \n"
5240       "lea         0x10(%1),%1                   \n"
5241 
5242       "movdqu      (%0),%%xmm0                   \n"
5243       "movdqu      0x10(%0),%%xmm1               \n"
5244       "movdqu      0x20(%0),%%xmm2               \n"
5245       "pshufb      48(%5),%%xmm0                 \n"
5246       "pshufb      64(%5),%%xmm1                 \n"
5247       "pshufb      80(%5),%%xmm2                 \n"
5248       "por         %%xmm1,%%xmm0                 \n"
5249       "por         %%xmm2,%%xmm0                 \n"
5250       "movdqu      %%xmm0,(%2)                   \n"
5251       "lea         0x10(%2),%2                   \n"
5252 
5253       "movdqu      (%0),%%xmm0                   \n"
5254       "movdqu      0x10(%0),%%xmm1               \n"
5255       "movdqu      0x20(%0),%%xmm2               \n"
5256       "pshufb      96(%5),%%xmm0                 \n"
5257       "pshufb      112(%5),%%xmm1                \n"
5258       "pshufb      128(%5),%%xmm2                \n"
5259       "por         %%xmm1,%%xmm0                 \n"
5260       "por         %%xmm2,%%xmm0                 \n"
5261       "movdqu      %%xmm0,(%3)                   \n"
5262       "lea         0x10(%3),%3                   \n"
5263       "lea         0x30(%0),%0                   \n"
5264       "sub         $0x10,%4                      \n"
5265       "jg          1b                            \n"
5266       : "+r"(src_rgb),             // %0
5267         "+r"(dst_r),               // %1
5268         "+r"(dst_g),               // %2
5269         "+r"(dst_b),               // %3
5270         "+r"(width)                // %4
5271       : "r"(&kSplitRGBShuffle[0])  // %5
5272       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5273 }
5274 #endif  // HAS_SPLITRGBROW_SSSE3
5275 
5276 #ifdef HAS_MERGERGBROW_SSSE3
5277 // Shuffle table for converting Planar to RGB.
5278 static const uvec8 kMergeRGBShuffle[9] = {
5279     {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u,
5280      128u, 5u},
5281     {128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u,
5282      128u, 128u},
5283     {128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u,
5284      4u, 128u},
5285     {128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u,
5286      10u, 128u},
5287     {5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u,
5288      128u, 10u},
5289     {128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u,
5290      128u, 128u},
5291     {128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u,
5292      15u, 128u, 128u},
5293     {128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u,
5294      128u, 15u, 128u},
5295     {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u,
5296      128u, 128u, 15u}};
5297 
5298 void MergeRGBRow_SSSE3(const uint8_t* src_r,
5299                        const uint8_t* src_g,
5300                        const uint8_t* src_b,
5301                        uint8_t* dst_rgb,
5302                        int width) {
5303   asm volatile(
5304 
5305       LABELALIGN
5306       "1:                                        \n"
5307       "movdqu      (%0),%%xmm0                   \n"
5308       "movdqu      (%1),%%xmm1                   \n"
5309       "movdqu      (%2),%%xmm2                   \n"
5310       "pshufb      (%5),%%xmm0                   \n"
5311       "pshufb      16(%5),%%xmm1                 \n"
5312       "pshufb      32(%5),%%xmm2                 \n"
5313       "por         %%xmm1,%%xmm0                 \n"
5314       "por         %%xmm2,%%xmm0                 \n"
5315       "movdqu      %%xmm0,(%3)                   \n"
5316 
5317       "movdqu      (%0),%%xmm0                   \n"
5318       "movdqu      (%1),%%xmm1                   \n"
5319       "movdqu      (%2),%%xmm2                   \n"
5320       "pshufb      48(%5),%%xmm0                 \n"
5321       "pshufb      64(%5),%%xmm1                 \n"
5322       "pshufb      80(%5),%%xmm2                 \n"
5323       "por         %%xmm1,%%xmm0                 \n"
5324       "por         %%xmm2,%%xmm0                 \n"
5325       "movdqu      %%xmm0,16(%3)                 \n"
5326 
5327       "movdqu      (%0),%%xmm0                   \n"
5328       "movdqu      (%1),%%xmm1                   \n"
5329       "movdqu      (%2),%%xmm2                   \n"
5330       "pshufb      96(%5),%%xmm0                 \n"
5331       "pshufb      112(%5),%%xmm1                \n"
5332       "pshufb      128(%5),%%xmm2                \n"
5333       "por         %%xmm1,%%xmm0                 \n"
5334       "por         %%xmm2,%%xmm0                 \n"
5335       "movdqu      %%xmm0,32(%3)                 \n"
5336 
5337       "lea         0x10(%0),%0                   \n"
5338       "lea         0x10(%1),%1                   \n"
5339       "lea         0x10(%2),%2                   \n"
5340       "lea         0x30(%3),%3                   \n"
5341       "sub         $0x10,%4                      \n"
5342       "jg          1b                            \n"
5343       : "+r"(src_r),               // %0
5344         "+r"(src_g),               // %1
5345         "+r"(src_b),               // %2
5346         "+r"(dst_rgb),             // %3
5347         "+r"(width)                // %4
5348       : "r"(&kMergeRGBShuffle[0])  // %5
5349       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5350 }
5351 #endif  // HAS_MERGERGBROW_SSSE3
5352 
5353 #ifdef HAS_MERGEARGBROW_SSE2
5354 void MergeARGBRow_SSE2(const uint8_t* src_r,
5355                        const uint8_t* src_g,
5356                        const uint8_t* src_b,
5357                        const uint8_t* src_a,
5358                        uint8_t* dst_argb,
5359                        int width) {
5360   asm volatile(
5361 
5362       "sub         %0,%1                         \n"
5363       "sub         %0,%2                         \n"
5364       "sub         %0,%3                         \n"
5365 
5366       LABELALIGN
5367       "1:                                        \n"
5368 
5369       "movq        (%0,%2),%%xmm0                \n"  // B
5370       "movq        (%0),%%xmm1                   \n"  // R
5371       "movq        (%0,%1),%%xmm2                \n"  // G
5372       "punpcklbw   %%xmm1,%%xmm0                 \n"  // BR
5373       "movq        (%0,%3),%%xmm1                \n"  // A
5374       "punpcklbw   %%xmm1,%%xmm2                 \n"  // GA
5375       "movdqa      %%xmm0,%%xmm1                 \n"  // BR
5376       "punpckhbw   %%xmm2,%%xmm1                 \n"  // BGRA (hi)
5377       "punpcklbw   %%xmm2,%%xmm0                 \n"  // BGRA (lo)
5378       "movdqu      %%xmm0,(%4)                   \n"
5379       "movdqu      %%xmm1,16(%4)                 \n"
5380 
5381       "lea         8(%0),%0                      \n"
5382       "lea         32(%4),%4                     \n"
5383       "sub         $0x8,%5                       \n"
5384       "jg          1b                            \n"
5385       : "+r"(src_r),     // %0
5386         "+r"(src_g),     // %1
5387         "+r"(src_b),     // %2
5388         "+r"(src_a),     // %3
5389         "+r"(dst_argb),  // %4
5390         "+r"(width)      // %5
5391       :
5392       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5393 }
5394 #endif
5395 
5396 #ifdef HAS_MERGEXRGBROW_SSE2
5397 void MergeXRGBRow_SSE2(const uint8_t* src_r,
5398                        const uint8_t* src_g,
5399                        const uint8_t* src_b,
5400                        uint8_t* dst_argb,
5401                        int width) {
5402   asm volatile(
5403 
5404       LABELALIGN
5405       "1:                                        \n"
5406 
5407       "movq        (%2),%%xmm0                   \n"  // B
5408       "movq        (%0),%%xmm1                   \n"  // R
5409       "movq        (%1),%%xmm2                   \n"  // G
5410       "punpcklbw   %%xmm1,%%xmm0                 \n"  // BR
5411       "pcmpeqd     %%xmm1,%%xmm1                 \n"  // A(255)
5412       "punpcklbw   %%xmm1,%%xmm2                 \n"  // GA
5413       "movdqa      %%xmm0,%%xmm1                 \n"  // BR
5414       "punpckhbw   %%xmm2,%%xmm1                 \n"  // BGRA (hi)
5415       "punpcklbw   %%xmm2,%%xmm0                 \n"  // BGRA (lo)
5416       "movdqu      %%xmm0,(%3)                   \n"
5417       "movdqu      %%xmm1,16(%3)                 \n"
5418 
5419       "lea         8(%0),%0                      \n"
5420       "lea         8(%1),%1                      \n"
5421       "lea         8(%2),%2                      \n"
5422       "lea         32(%3),%3                     \n"
5423       "sub         $0x8,%4                       \n"
5424       "jg          1b                            \n"
5425       : "+r"(src_r),     // %0
5426         "+r"(src_g),     // %1
5427         "+r"(src_b),     // %2
5428         "+r"(dst_argb),  // %3
5429         "+r"(width)      // %4
5430       :
5431       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5432 }
5433 #endif  // HAS_MERGEARGBROW_SSE2
5434 
5435 #ifdef HAS_MERGEARGBROW_AVX2
5436 void MergeARGBRow_AVX2(const uint8_t* src_r,
5437                        const uint8_t* src_g,
5438                        const uint8_t* src_b,
5439                        const uint8_t* src_a,
5440                        uint8_t* dst_argb,
5441                        int width) {
5442   asm volatile(
5443 
5444       "sub         %0,%1                         \n"
5445       "sub         %0,%2                         \n"
5446       "sub         %0,%3                         \n"
5447 
5448       LABELALIGN
5449       "1:                                        \n"
5450 
5451       "vmovdqu     (%0,%2),%%xmm0                \n"  // B
5452       "vmovdqu     (%0,%1),%%xmm1                \n"  // G
5453       "vinserti128 $1,(%0),%%ymm0,%%ymm0         \n"  // R
5454       "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1      \n"  // A
5455       "vpunpckhbw  %%ymm1,%%ymm0,%%ymm2          \n"
5456       "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
5457       "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
5458       "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
5459       "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"
5460       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"
5461       "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
5462       "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
5463       "vmovdqu     %%ymm0,(%4)                   \n"  // First 8
5464       "vmovdqu     %%ymm1,32(%4)                 \n"  // Next 8
5465 
5466       "lea         16(%0),%0                     \n"
5467       "lea         64(%4),%4                     \n"
5468       "sub         $0x10,%5                      \n"
5469       "jg          1b                            \n"
5470       "vzeroupper                                \n"
5471       : "+r"(src_r),     // %0
5472         "+r"(src_g),     // %1
5473         "+r"(src_b),     // %2
5474         "+r"(src_a),     // %3
5475         "+r"(dst_argb),  // %4
5476         "+r"(width)      // %5
5477       :
5478       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5479 }
5480 #endif
5481 
5482 #ifdef HAS_MERGEXRGBROW_AVX2
5483 void MergeXRGBRow_AVX2(const uint8_t* src_r,
5484                        const uint8_t* src_g,
5485                        const uint8_t* src_b,
5486                        uint8_t* dst_argb,
5487                        int width) {
5488   asm volatile(
5489 
5490       LABELALIGN
5491       "1:                                        \n"
5492 
5493       "vmovdqu     (%2),%%xmm0                   \n"  // B
5494       "vpcmpeqd    %%ymm1,%%ymm1,%%ymm1          \n"  // A(255)
5495       "vinserti128 $0,(%1),%%ymm1,%%ymm1         \n"  // G
5496       "vinserti128 $1,(%0),%%ymm0,%%ymm0         \n"  // R
5497       "vpunpckhbw  %%ymm1,%%ymm0,%%ymm2          \n"
5498       "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
5499       "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
5500       "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
5501       "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"
5502       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"
5503       "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
5504       "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
5505       "vmovdqu     %%ymm0,(%3)                   \n"  // First 8
5506       "vmovdqu     %%ymm1,32(%3)                 \n"  // Next 8
5507 
5508       "lea         16(%0),%0                     \n"
5509       "lea         16(%1),%1                     \n"
5510       "lea         16(%2),%2                     \n"
5511       "lea         64(%3),%3                     \n"
5512       "sub         $0x10,%4                      \n"
5513       "jg          1b                            \n"
5514       "vzeroupper                                \n"
5515       : "+r"(src_r),     // %0
5516         "+r"(src_g),     // %1
5517         "+r"(src_b),     // %2
5518         "+r"(dst_argb),  // %3
5519         "+rm"(width)     // %4
5520       :
5521       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5522 }
5523 #endif  // HAS_MERGEARGBROW_AVX2
5524 
5525 #ifdef HAS_SPLITARGBROW_SSE2
5526 void SplitARGBRow_SSE2(const uint8_t* src_argb,
5527                        uint8_t* dst_r,
5528                        uint8_t* dst_g,
5529                        uint8_t* dst_b,
5530                        uint8_t* dst_a,
5531                        int width) {
5532   asm volatile(
5533 
5534       "sub         %1,%2                         \n"
5535       "sub         %1,%3                         \n"
5536       "sub         %1,%4                         \n"
5537 
5538       LABELALIGN
5539       "1:                                        \n"
5540 
5541       "movdqu      (%0),%%xmm0                   \n"  // 00-0F
5542       "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
5543       "movdqa      %%xmm0,%%xmm2                 \n"
5544       "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 00-07 10-17
5545       "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 08-0F 18-1F
5546       "movdqa      %%xmm0,%%xmm1                 \n"
5547       "punpcklbw   %%xmm2,%%xmm0                 \n"  // 08192A3B4C5D6E7F (lo)
5548       "punpckhbw   %%xmm2,%%xmm1                 \n"  // 08192A3B4C5D6E7F (hi)
5549       "movdqa      %%xmm0,%%xmm2                 \n"
5550       "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 08192A3B08192A3B
5551       "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 4C5D6E7F4C5D6E7F
5552       "movdqa      %%xmm0,%%xmm1                 \n"
5553       "punpcklbw   %%xmm2,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
5554       "punpckhbw   %%xmm2,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
5555       "movdqa      %%xmm0,%%xmm2                 \n"
5556       "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
5557       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
5558       "movlps      %%xmm0,(%1,%3)                \n"  // B
5559       "movhps      %%xmm0,(%1,%2)                \n"  // G
5560       "movlps      %%xmm2,(%1)                   \n"  // R
5561       "movhps      %%xmm2,(%1,%4)                \n"  // A
5562 
5563       "lea         32(%0),%0                     \n"
5564       "lea         8(%1),%1                      \n"
5565       "sub         $0x8,%5                       \n"
5566       "jg          1b                            \n"
5567       : "+r"(src_argb),  // %0
5568         "+r"(dst_r),     // %1
5569         "+r"(dst_g),     // %2
5570         "+r"(dst_b),     // %3
5571         "+r"(dst_a),     // %4
5572         "+rm"(width)     // %5
5573       :
5574       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5575 }
5576 #endif
5577 
5578 #ifdef HAS_SPLITXRGBROW_SSE2
5579 void SplitXRGBRow_SSE2(const uint8_t* src_argb,
5580                        uint8_t* dst_r,
5581                        uint8_t* dst_g,
5582                        uint8_t* dst_b,
5583                        int width) {
5584   asm volatile(
5585 
5586       LABELALIGN
5587       "1:                                        \n"
5588 
5589       "movdqu      (%0),%%xmm0                   \n"  // 00-0F
5590       "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
5591       "movdqa      %%xmm0,%%xmm2                 \n"
5592       "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 00-07 10-17
5593       "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 08-0F 18-1F
5594       "movdqa      %%xmm0,%%xmm1                 \n"
5595       "punpcklbw   %%xmm2,%%xmm0                 \n"  // 08192A3B4C5D6E7F (lo)
5596       "punpckhbw   %%xmm2,%%xmm1                 \n"  // 08192A3B4C5D6E7F (hi)
5597       "movdqa      %%xmm0,%%xmm2                 \n"
5598       "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 08192A3B08192A3B
5599       "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 4C5D6E7F4C5D6E7F
5600       "movdqa      %%xmm0,%%xmm1                 \n"
5601       "punpcklbw   %%xmm2,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
5602       "punpckhbw   %%xmm2,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
5603       "movdqa      %%xmm0,%%xmm2                 \n"
5604       "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
5605       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
5606       "movlps      %%xmm0,(%3)                   \n"  // B
5607       "movhps      %%xmm0,(%2)                   \n"  // G
5608       "movlps      %%xmm2,(%1)                   \n"  // R
5609 
5610       "lea         32(%0),%0                     \n"
5611       "lea         8(%1),%1                      \n"
5612       "lea         8(%2),%2                      \n"
5613       "lea         8(%3),%3                      \n"
5614       "sub         $0x8,%4                       \n"
5615       "jg          1b                            \n"
5616       : "+r"(src_argb),  // %0
5617         "+r"(dst_r),     // %1
5618         "+r"(dst_g),     // %2
5619         "+r"(dst_b),     // %3
5620         "+rm"(width)     // %4
5621       :
5622       : "memory", "cc", "xmm0", "xmm1", "xmm2");
5623 }
5624 #endif
5625 
5626 static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8,  12, 1, 5, 9,  13,
5627                                             2, 6, 10, 14, 3, 7, 11, 15};
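// pshufb with kShuffleMaskARGBSplit transposes 4 ARGB pixels (16 bytes) into
// planar order: bytes 0,4,8,12 (B), 1,5,9,13 (G), 2,6,10,14 (R) and
// 3,7,11,15 (A), so each channel of the group lands in one 32-bit lane.
// A scalar sketch of the split the rows below vectorize:
//
//   for (int x = 0; x < width; ++x) {
//     dst_b[x] = src_argb[4 * x + 0];
//     dst_g[x] = src_argb[4 * x + 1];
//     dst_r[x] = src_argb[4 * x + 2];
//     dst_a[x] = src_argb[4 * x + 3];  // SplitXRGB variants drop alpha.
//   }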
5628 #ifdef HAS_SPLITARGBROW_SSSE3
5629 void SplitARGBRow_SSSE3(const uint8_t* src_argb,
5630                         uint8_t* dst_r,
5631                         uint8_t* dst_g,
5632                         uint8_t* dst_b,
5633                         uint8_t* dst_a,
5634                         int width) {
5635   asm volatile(
5636 
5637       "movdqa      %6,%%xmm3                     \n"
5638       "sub         %1,%2                         \n"
5639       "sub         %1,%3                         \n"
5640       "sub         %1,%4                         \n"
5641 
5642       LABELALIGN
5643       "1:                                        \n"
5644 
5645       "movdqu      (%0),%%xmm0                   \n"  // 00-0F
5646       "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
5647       "pshufb      %%xmm3,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
5648       "pshufb      %%xmm3,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
5649       "movdqa      %%xmm0,%%xmm2                 \n"
5650       "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
5651       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
5652       "movlps      %%xmm0,(%1,%3)                \n"  // B
5653       "movhps      %%xmm0,(%1,%2)                \n"  // G
5654       "movlps      %%xmm2,(%1)                   \n"  // R
5655       "movhps      %%xmm2,(%1,%4)                \n"  // A
5656 
5657       "lea         32(%0),%0                     \n"
5658       "lea         8(%1),%1                      \n"
5659       "subl        $0x8,%5                       \n"
5660       "jg          1b                            \n"
5661       : "+r"(src_argb),  // %0
5662         "+r"(dst_r),     // %1
5663         "+r"(dst_g),     // %2
5664         "+r"(dst_b),     // %3
5665         "+r"(dst_a),     // %4
5666 #if defined(__i386__)
5667         "+m"(width)  // %5
5668 #else
5669         "+rm"(width)          // %5
5670 #endif
5671       : "m"(kShuffleMaskARGBSplit)  // %6
5672       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
5673 }
5674 #endif
5675 
5676 #ifdef HAS_SPLITXRGBROW_SSSE3
5677 void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
5678                         uint8_t* dst_r,
5679                         uint8_t* dst_g,
5680                         uint8_t* dst_b,
5681                         int width) {
5682   asm volatile(
5683 
5684       "movdqa      %5,%%xmm3                     \n"
5685 
5686       LABELALIGN
5687       "1:                                        \n"
5688 
5689       "movdqu      (%0),%%xmm0                   \n"  // 00-0F
5690       "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
5691       "pshufb      %%xmm3,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
5692       "pshufb      %%xmm3,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
5693       "movdqa      %%xmm0,%%xmm2                 \n"
5694       "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
5695       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
5696       "movlps      %%xmm0,(%3)                   \n"  // B
5697       "movhps      %%xmm0,(%2)                   \n"  // G
5698       "movlps      %%xmm2,(%1)                   \n"  // R
5699 
5700       "lea         32(%0),%0                     \n"
5701       "lea         8(%1),%1                      \n"
5702       "lea         8(%2),%2                      \n"
5703       "lea         8(%3),%3                      \n"
5704       "sub         $0x8,%4                       \n"
5705       "jg          1b                            \n"
5706       : "+r"(src_argb),             // %0
5707         "+r"(dst_r),                // %1
5708         "+r"(dst_g),                // %2
5709         "+r"(dst_b),                // %3
5710         "+r"(width)                 // %4
5711       : "m"(kShuffleMaskARGBSplit)  // %5
5712       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
5713 }
5714 #endif
5715 
5716 #ifdef HAS_SPLITARGBROW_AVX2
5717 static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7};
5718 void SplitARGBRow_AVX2(const uint8_t* src_argb,
5719                        uint8_t* dst_r,
5720                        uint8_t* dst_g,
5721                        uint8_t* dst_b,
5722                        uint8_t* dst_a,
5723                        int width) {
5724   asm volatile(
5725 
5726       "sub         %1,%2                         \n"
5727       "sub         %1,%3                         \n"
5728       "sub         %1,%4                         \n"
5729       "vmovdqa     %7,%%ymm3                     \n"
5730       "vbroadcastf128 %6,%%ymm4                  \n"
5731 
5732       LABELALIGN
5733       "1:                                        \n"
5734 
5735       "vmovdqu     (%0),%%xmm0                   \n"  // 00-0F
5736       "vmovdqu     16(%0),%%xmm1                 \n"  // 10-1F
5737       "vinserti128 $1,32(%0),%%ymm0,%%ymm0       \n"  // 00-0F 20-2F
5738       "vinserti128 $1,48(%0),%%ymm1,%%ymm1       \n"  // 10-1F 30-3F
5739       "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
5740       "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
5741       "vpermd      %%ymm0,%%ymm3,%%ymm0          \n"
5742       "vpermd      %%ymm1,%%ymm3,%%ymm1          \n"
5743       "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"  // GA
5744       "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"  // BR
5745       "vmovdqu     %%xmm0,(%1,%3)                \n"  // B
5746       "vextracti128 $1,%%ymm0,(%1)               \n"  // R
5747       "vmovdqu     %%xmm2,(%1,%2)                \n"  // G
5748       "vextracti128 $1,%%ymm2,(%1,%4)            \n"  // A
5749       "lea         64(%0),%0                     \n"
5750       "lea         16(%1),%1                     \n"
5751       "subl        $0x10,%5                      \n"
5752       "jg          1b                            \n"
5753       "vzeroupper                                \n"
5754       : "+r"(src_argb),  // %0
5755         "+r"(dst_r),     // %1
5756         "+r"(dst_g),     // %2
5757         "+r"(dst_b),     // %3
5758         "+r"(dst_a),     // %4
5759 #if defined(__i386__)
5760         "+m"(width)  // %5
5761 #else
5762         "+rm"(width)          // %5
5763 #endif
5764       : "m"(kShuffleMaskARGBSplit),   // %6
5765         "m"(kShuffleMaskARGBPermute)  // %7
5766       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
5767 }
5768 #endif
5769 
5770 #ifdef HAS_SPLITXRGBROW_AVX2
5771 void SplitXRGBRow_AVX2(const uint8_t* src_argb,
5772                        uint8_t* dst_r,
5773                        uint8_t* dst_g,
5774                        uint8_t* dst_b,
5775                        int width) {
5776   asm volatile(
5777 
5778       "vmovdqa     %6,%%ymm3                     \n"
5779       "vbroadcastf128 %5,%%ymm4                  \n"
5780 
5781       LABELALIGN
5782       "1:                                        \n"
5783 
5784       "vmovdqu     (%0),%%xmm0                   \n"  // 00-0F
5785       "vmovdqu     16(%0),%%xmm1                 \n"  // 10-1F
5786       "vinserti128 $1,32(%0),%%ymm0,%%ymm0       \n"  // 00-0F 20-2F
5787       "vinserti128 $1,48(%0),%%ymm1,%%ymm1       \n"  // 10-1F 30-3F
5788       "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
5789       "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
5790       "vpermd      %%ymm0,%%ymm3,%%ymm0          \n"
5791       "vpermd      %%ymm1,%%ymm3,%%ymm1          \n"
5792       "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"  // GA
5793       "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"  // BR
5794       "vmovdqu     %%xmm0,(%3)                   \n"  // B
5795       "vextracti128 $1,%%ymm0,(%1)               \n"  // R
5796       "vmovdqu     %%xmm2,(%2)                   \n"  // G
5797 
5798       "lea         64(%0),%0                     \n"
5799       "lea         16(%1),%1                     \n"
5800       "lea         16(%2),%2                     \n"
5801       "lea         16(%3),%3                     \n"
5802       "sub         $0x10,%4                      \n"
5803       "jg          1b                            \n"
5804       "vzeroupper                                \n"
5805       : "+r"(src_argb),               // %0
5806         "+r"(dst_r),                  // %1
5807         "+r"(dst_g),                  // %2
5808         "+r"(dst_b),                  // %3
5809         "+r"(width)                   // %4
5810       : "m"(kShuffleMaskARGBSplit),   // %5
5811         "m"(kShuffleMaskARGBPermute)  // %6
5812       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
5813 }
5814 #endif
5815 
5816 #ifdef HAS_MERGEXR30ROW_AVX2
5817 void MergeXR30Row_AVX2(const uint16_t* src_r,
5818                        const uint16_t* src_g,
5819                        const uint16_t* src_b,
5820                        uint8_t* dst_ar30,
5821                        int depth,
5822                        int width) {
5823   int shift = depth - 10;
5824   asm volatile(
5825 
5826       "sub         %0,%1                         \n"
5827       "sub         %0,%2                         \n"
5828       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
5829       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
5830       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
5831       "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"
5832       "vpsrlw      $6,%%ymm6,%%ymm6              \n"
5833       "vmovd       %5,%%xmm4                     \n"
5834 
5835       LABELALIGN
5836       "1:                                        \n"
5837       "vmovdqu     (%0),%%ymm0                   \n"
5838       "vmovdqu     (%0,%1),%%ymm1                \n"
5839       "vmovdqu     (%0,%2),%%ymm2                \n"
5840       "vpsrlw      %%xmm4,%%ymm0,%%ymm0          \n"
5841       "vpsrlw      %%xmm4,%%ymm1,%%ymm1          \n"
5842       "vpsrlw      %%xmm4,%%ymm2,%%ymm2          \n"
5843       "vpminuw     %%ymm0,%%ymm6,%%ymm0          \n"
5844       "vpminuw     %%ymm1,%%ymm6,%%ymm1          \n"
5845       "vpminuw     %%ymm2,%%ymm6,%%ymm2          \n"
5846       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
5847       "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
5848       "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
5849       "vpsllw      $0x4,%%ymm0,%%ymm0            \n"  // Shift R to target bit
5850       "vpunpckhwd  %%ymm0,%%ymm2,%%ymm3          \n"  // RB
5851       "vpunpcklwd  %%ymm0,%%ymm2,%%ymm0          \n"
5852       "vpunpckhwd  %%ymm5,%%ymm1,%%ymm2          \n"  // AG
5853       "vpunpcklwd  %%ymm5,%%ymm1,%%ymm1          \n"
5854       "vpslld      $0xa,%%ymm1,%%ymm1            \n"  // Shift AG to target bit
5855       "vpslld      $0xa,%%ymm2,%%ymm2            \n"
5856       "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // Combine
5857       "vpor        %%ymm2,%%ymm3,%%ymm3          \n"
5858       "vmovdqu     %%ymm0,(%3)                   \n"
5859       "vmovdqu     %%ymm3,0x20(%3)               \n"
5860       "lea         0x20(%0),%0                   \n"
5861       "lea         0x40(%3),%3                   \n"
5862       "sub         $0x10,%4                      \n"
5863       "jg          1b                            \n"
5864       "vzeroupper                                \n"
5865       : "+r"(src_r),     // %0
5866         "+r"(src_g),     // %1
5867         "+r"(src_b),     // %2
5868         "+r"(dst_ar30),  // %3
5869         "+r"(width)      // %4
5870 #if defined(__i386__)
5871       : "m"(shift)  // %5
5872 #else
5873       : "rm"(shift)           // %5
5874 #endif
5875       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
5876 }
5877 #endif
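// AR30 packs a pixel into 32 bits: B in bits 0..9, G in 10..19, R in 20..29
// and a 2-bit alpha in 30..31. A scalar sketch of the packing above, assuming
// depth >= 10 so shift = depth - 10 is non-negative:
//
//   uint32_t b = src_b[x] >> shift;
//   uint32_t g = src_g[x] >> shift;
//   uint32_t r = src_r[x] >> shift;
//   b = b > 1023u ? 1023u : b;  // vpminuw clamp to 10 bits
//   g = g > 1023u ? 1023u : g;
//   r = r > 1023u ? 1023u : r;
//   ((uint32_t*)dst_ar30)[x] = b | (g << 10) | (r << 20) | (3u << 30);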
5878 
5879 #ifdef HAS_MERGEAR64ROW_AVX2
5880 static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7};
5881 void MergeAR64Row_AVX2(const uint16_t* src_r,
5882                        const uint16_t* src_g,
5883                        const uint16_t* src_b,
5884                        const uint16_t* src_a,
5885                        uint16_t* dst_ar64,
5886                        int depth,
5887                        int width) {
5888   int shift = 16 - depth;
5889   int mask = (1 << depth) - 1;
5890   mask = (mask << 16) + mask;
5891   asm volatile(
5892 
5893       "sub         %0,%1                         \n"
5894       "sub         %0,%2                         \n"
5895       "sub         %0,%3                         \n"
5896       "vmovdqa     %8,%%ymm5                     \n"
5897       "vmovd       %6,%%xmm6                     \n"
5898       "vbroadcastss %7,%%ymm7                    \n"
5899 
5900       LABELALIGN
5901       "1:                                        \n"
5902       "vmovdqu     (%0),%%ymm0                   \n"  // R
5903       "vmovdqu     (%0,%1),%%ymm1                \n"  // G
5904       "vmovdqu     (%0,%2),%%ymm2                \n"  // B
5905       "vmovdqu     (%0,%3),%%ymm3                \n"  // A
5906       "vpminuw     %%ymm0,%%ymm7,%%ymm0          \n"
5907       "vpminuw     %%ymm1,%%ymm7,%%ymm1          \n"
5908       "vpminuw     %%ymm2,%%ymm7,%%ymm2          \n"
5909       "vpminuw     %%ymm3,%%ymm7,%%ymm3          \n"
5910       "vpsllw      %%xmm6,%%ymm0,%%ymm0          \n"
5911       "vpsllw      %%xmm6,%%ymm1,%%ymm1          \n"
5912       "vpsllw      %%xmm6,%%ymm2,%%ymm2          \n"
5913       "vpsllw      %%xmm6,%%ymm3,%%ymm3          \n"
5914       "vpermd      %%ymm0,%%ymm5,%%ymm0          \n"
5915       "vpermd      %%ymm1,%%ymm5,%%ymm1          \n"
5916       "vpermd      %%ymm2,%%ymm5,%%ymm2          \n"
5917       "vpermd      %%ymm3,%%ymm5,%%ymm3          \n"
5918       "vpunpcklwd  %%ymm1,%%ymm2,%%ymm4          \n"  // BG(low)
5919       "vpunpckhwd  %%ymm1,%%ymm2,%%ymm1          \n"  // BG(hi)
5920       "vpunpcklwd  %%ymm3,%%ymm0,%%ymm2          \n"  // RA(low)
5921       "vpunpckhwd  %%ymm3,%%ymm0,%%ymm0          \n"  // RA(hi)
5922       "vpunpckldq  %%ymm2,%%ymm4,%%ymm3          \n"  // BGRA(1)
5923       "vpunpckhdq  %%ymm2,%%ymm4,%%ymm4          \n"  // BGRA(3)
5924       "vpunpckldq  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA(2)
5925       "vpunpckhdq  %%ymm0,%%ymm1,%%ymm1          \n"  // BGRA(4)
5926       "vmovdqu     %%ymm3,(%4)                   \n"
5927       "vmovdqu     %%ymm2,0x20(%4)               \n"
5928       "vmovdqu     %%ymm4,0x40(%4)               \n"
5929       "vmovdqu     %%ymm1,0x60(%4)               \n"
5930       "lea         0x20(%0),%0                   \n"
5931       "lea         0x80(%4),%4                   \n"
5932       "subl        $0x10,%5                      \n"
5933       "jg          1b                            \n"
5934       "vzeroupper                                \n"
5935       : "+r"(src_r),     // %0
5936         "+r"(src_g),     // %1
5937         "+r"(src_b),     // %2
5938         "+r"(src_a),     // %3
5939         "+r"(dst_ar64),  // %4
5940 #if defined(__i386__)
5941         "+m"(width)  // %5
5942 #else
5943         "+rm"(width)          // %5
5944 #endif
5945       : "m"(shift),            // %6
5946         "m"(mask),             // %7
5947         "m"(MergeAR64Permute)  // %8
5948       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
5949         "xmm7");
5950 }
5951 #endif
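// AR64 stores 16 bits per channel in B,G,R,A order. MergeAR64Row_AVX2 widens
// a depth-bit sample to 16 bits by clamping to (1 << depth) - 1 (vpminuw
// against the broadcast mask) and shifting left by 16 - depth; e.g. for
// depth 10, v16 = min(v, 1023) << 6.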
5952 
5953 #ifdef HAS_MERGEXR64ROW_AVX2
5954 void MergeXR64Row_AVX2(const uint16_t* src_r,
5955                        const uint16_t* src_g,
5956                        const uint16_t* src_b,
5957                        uint16_t* dst_ar64,
5958                        int depth,
5959                        int width) {
5960   int shift = 16 - depth;
5961   int mask = (1 << depth) - 1;
5962   mask = (mask << 16) + mask;
5963   asm volatile(
5964 
5965       "sub         %0,%1                         \n"
5966       "sub         %0,%2                         \n"
5967       "vmovdqa     %7,%%ymm5                     \n"
5968       "vmovd       %5,%%xmm6                     \n"
5969       "vbroadcastss %6,%%ymm7                    \n"
5970 
5971       LABELALIGN
5972       "1:                                        \n"
5973       "vmovdqu     (%0),%%ymm0                   \n"  // R
5974       "vmovdqu     (%0,%1),%%ymm1                \n"  // G
5975       "vmovdqu     (%0,%2),%%ymm2                \n"  // B
5976       "vpminuw     %%ymm0,%%ymm7,%%ymm0          \n"
5977       "vpminuw     %%ymm1,%%ymm7,%%ymm1          \n"
5978       "vpminuw     %%ymm2,%%ymm7,%%ymm2          \n"
5979       "vpsllw      %%xmm6,%%ymm0,%%ymm0          \n"
5980       "vpsllw      %%xmm6,%%ymm1,%%ymm1          \n"
5981       "vpsllw      %%xmm6,%%ymm2,%%ymm2          \n"
5982       "vpermd      %%ymm0,%%ymm5,%%ymm0          \n"
5983       "vpermd      %%ymm1,%%ymm5,%%ymm1          \n"
5984       "vpermd      %%ymm2,%%ymm5,%%ymm2          \n"
5985       "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"  // A (0xffff)
5986       "vpunpcklwd  %%ymm1,%%ymm2,%%ymm4          \n"  // BG(low)
5987       "vpunpckhwd  %%ymm1,%%ymm2,%%ymm1          \n"  // BG(hi)
5988       "vpunpcklwd  %%ymm3,%%ymm0,%%ymm2          \n"  // RA(low)
5989       "vpunpckhwd  %%ymm3,%%ymm0,%%ymm0          \n"  // RA(hi)
5990       "vpunpckldq  %%ymm2,%%ymm4,%%ymm3          \n"  // BGRA(1)
5991       "vpunpckhdq  %%ymm2,%%ymm4,%%ymm4          \n"  // BGRA(3)
5992       "vpunpckldq  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA(2)
5993       "vpunpckhdq  %%ymm0,%%ymm1,%%ymm1          \n"  // BGRA(4)
5994       "vmovdqu     %%ymm3,(%3)                   \n"
5995       "vmovdqu     %%ymm2,0x20(%3)               \n"
5996       "vmovdqu     %%ymm4,0x40(%3)               \n"
5997       "vmovdqu     %%ymm1,0x60(%3)               \n"
5998       "lea         0x20(%0),%0                   \n"
5999       "lea         0x80(%3),%3                   \n"
6000       "subl        $0x10,%4                      \n"
6001       "jg          1b                            \n"
6002       "vzeroupper                                \n"
6003       : "+r"(src_r),           // %0
6004         "+r"(src_g),           // %1
6005         "+r"(src_b),           // %2
6006         "+r"(dst_ar64),        // %3
6007         "+r"(width)            // %4
6008       : "m"(shift),            // %5
6009         "m"(mask),             // %6
6010         "m"(MergeAR64Permute)  // %7
6011       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
6012         "xmm7");
6013 }
6014 #endif
6015 
6016 #ifdef HAS_MERGEARGB16TO8ROW_AVX2
6017 static const uvec8 MergeARGB16To8Shuffle = {0, 8,  1, 9,  2, 10, 3, 11,
6018                                             4, 12, 5, 13, 6, 14, 7, 15};
6019 void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
6020                             const uint16_t* src_g,
6021                             const uint16_t* src_b,
6022                             const uint16_t* src_a,
6023                             uint8_t* dst_argb,
6024                             int depth,
6025                             int width) {
6026   int shift = depth - 8;
6027   asm volatile(
6028 
6029       "sub         %0,%1                         \n"
6030       "sub         %0,%2                         \n"
6031       "sub         %0,%3                         \n"
6032       "vbroadcastf128 %7,%%ymm5                  \n"
6033       "vmovd       %6,%%xmm6                     \n"
6034 
6035       LABELALIGN
6036       "1:                                        \n"
6037       "vmovdqu     (%0),%%ymm0                   \n"  // R
6038       "vmovdqu     (%0,%1),%%ymm1                \n"  // G
6039       "vmovdqu     (%0,%2),%%ymm2                \n"  // B
6040       "vmovdqu     (%0,%3),%%ymm3                \n"  // A
6041       "vpsrlw      %%xmm6,%%ymm0,%%ymm0          \n"
6042       "vpsrlw      %%xmm6,%%ymm1,%%ymm1          \n"
6043       "vpsrlw      %%xmm6,%%ymm2,%%ymm2          \n"
6044       "vpsrlw      %%xmm6,%%ymm3,%%ymm3          \n"
6045       "vpackuswb   %%ymm1,%%ymm2,%%ymm1          \n"  // BG (planar)
6046       "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"  // RA (planar)
6047       "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"  // BG (interleave)
6048       "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // RA (interleave)
6049       "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
6050       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
6051       "vpunpcklwd  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA (low)
6052       "vpunpckhwd  %%ymm0,%%ymm1,%%ymm0          \n"  // BGRA (hi)
6053       "vmovdqu     %%ymm2,(%4)                   \n"
6054       "vmovdqu     %%ymm0,0x20(%4)               \n"
6055       "lea         0x20(%0),%0                   \n"
6056       "lea         0x40(%4),%4                   \n"
6057       "subl        $0x10,%5                      \n"
6058       "jg          1b                            \n"
6059       "vzeroupper                                \n"
6060       : "+r"(src_r),     // %0
6061         "+r"(src_g),     // %1
6062         "+r"(src_b),     // %2
6063         "+r"(src_a),     // %3
6064         "+r"(dst_argb),  // %4
6065 #if defined(__i386__)
6066         "+m"(width)  // %5
6067 #else
6068         "+rm"(width)          // %5
6069 #endif
6070       : "m"(shift),                 // %6
6071         "m"(MergeARGB16To8Shuffle)  // %7
6072       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
6073 }
6074 #endif
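// Scalar sketch of the 16-to-8 bit merge above: shift each depth-bit sample
// right by depth - 8 and interleave as B,G,R,A (vpackuswb additionally
// saturates values above 255):
//
//   dst_argb[4 * x + 0] = (uint8_t)(src_b[x] >> (depth - 8));
//   dst_argb[4 * x + 1] = (uint8_t)(src_g[x] >> (depth - 8));
//   dst_argb[4 * x + 2] = (uint8_t)(src_r[x] >> (depth - 8));
//   dst_argb[4 * x + 3] = (uint8_t)(src_a[x] >> (depth - 8));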
6075 
6076 #ifdef HAS_MERGEXRGB16TO8ROW_AVX2
6077 void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
6078                             const uint16_t* src_g,
6079                             const uint16_t* src_b,
6080                             uint8_t* dst_argb,
6081                             int depth,
6082                             int width) {
6083   int shift = depth - 8;
6084   asm volatile(
6085 
6086       "sub         %0,%1                         \n"
6087       "sub         %0,%2                         \n"
6088       "vbroadcastf128 %6,%%ymm5                  \n"
6089       "vmovd       %5,%%xmm6                     \n"
6090       "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"
6091       "vpsrlw      $8,%%ymm3,%%ymm3              \n"  // A (0xff)
6092 
6093       LABELALIGN
6094       "1:                                        \n"
6095       "vmovdqu     (%0),%%ymm0                   \n"  // R
6096       "vmovdqu     (%0,%1),%%ymm1                \n"  // G
6097       "vmovdqu     (%0,%2),%%ymm2                \n"  // B
6098       "vpsrlw      %%xmm6,%%ymm0,%%ymm0          \n"
6099       "vpsrlw      %%xmm6,%%ymm1,%%ymm1          \n"
6100       "vpsrlw      %%xmm6,%%ymm2,%%ymm2          \n"
6101       "vpackuswb   %%ymm1,%%ymm2,%%ymm1          \n"  // BG (planar)
6102       "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"  // RA (planar)
6103       "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"  // BG (interleave)
6104       "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // RA (interleave)
6105       "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
6106       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
6107       "vpunpcklwd  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA (low)
6108       "vpunpckhwd  %%ymm0,%%ymm1,%%ymm0          \n"  // BGRA (hi)
6109       "vmovdqu     %%ymm2,(%3)                   \n"
6110       "vmovdqu     %%ymm0,0x20(%3)               \n"
6111       "lea         0x20(%0),%0                   \n"
6112       "lea         0x40(%3),%3                   \n"
6113       "subl        $0x10,%4                      \n"
6114       "jg          1b                            \n"
6115       "vzeroupper                                \n"
6116       : "+r"(src_r),                // %0
6117         "+r"(src_g),                // %1
6118         "+r"(src_b),                // %2
6119         "+r"(dst_argb),             // %3
6120         "+r"(width)                 // %4
6121       : "m"(shift),                 // %5
6122         "m"(MergeARGB16To8Shuffle)  // %6
6123       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
6124 }
6125 #endif
6126 
6127 #ifdef HAS_COPYROW_SSE2
6128 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
6129   asm volatile(
6130       "test        $0xf,%0                       \n"  // src 16-byte aligned?
6131       "jne         2f                            \n"  // use unaligned path
6132       "test        $0xf,%1                       \n"  // dst 16-byte aligned?
6133       "jne         2f                            \n"
6134 
6135       LABELALIGN
6136       "1:                                        \n"
6137       "movdqa      (%0),%%xmm0                   \n"
6138       "movdqa      0x10(%0),%%xmm1               \n"
6139       "lea         0x20(%0),%0                   \n"
6140       "movdqa      %%xmm0,(%1)                   \n"
6141       "movdqa      %%xmm1,0x10(%1)               \n"
6142       "lea         0x20(%1),%1                   \n"
6143       "sub         $0x20,%2                      \n"
6144       "jg          1b                            \n"
6145       "jmp         9f                            \n"
6146 
6147       LABELALIGN
6148       "2:                                        \n"
6149       "movdqu      (%0),%%xmm0                   \n"
6150       "movdqu      0x10(%0),%%xmm1               \n"
6151       "lea         0x20(%0),%0                   \n"
6152       "movdqu      %%xmm0,(%1)                   \n"
6153       "movdqu      %%xmm1,0x10(%1)               \n"
6154       "lea         0x20(%1),%1                   \n"
6155       "sub         $0x20,%2                      \n"
6156       "jg          2b                            \n"
6157 
6158       LABELALIGN "9:                                        \n"
6159       : "+r"(src),   // %0
6160         "+r"(dst),   // %1
6161         "+r"(width)  // %2
6162       :
6163       : "memory", "cc", "xmm0", "xmm1");
6164 }
6165 #endif  // HAS_COPYROW_SSE2
6166 
6167 #ifdef HAS_COPYROW_AVX
6168 void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
6169   asm volatile(
6170 
6171       LABELALIGN
6172       "1:                                        \n"
6173       "vmovdqu     (%0),%%ymm0                   \n"
6174       "vmovdqu     0x20(%0),%%ymm1               \n"
6175       "lea         0x40(%0),%0                   \n"
6176       "vmovdqu     %%ymm0,(%1)                   \n"
6177       "vmovdqu     %%ymm1,0x20(%1)               \n"
6178       "lea         0x40(%1),%1                   \n"
6179       "sub         $0x40,%2                      \n"
6180       "jg          1b                            \n"
6181       : "+r"(src),   // %0
6182         "+r"(dst),   // %1
6183         "+r"(width)  // %2
6184       :
6185       : "memory", "cc", "xmm0", "xmm1");
6186 }
6187 #endif  // HAS_COPYROW_AVX
6188 
6189 #ifdef HAS_COPYROW_ERMS
6190 // Multiple of 1: width has no alignment requirement.
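// On ERMS (Enhanced REP MOVSB) CPUs, "rep movsb" runs as a fast block copy,
// so the single string instruction below can match the SIMD copy loops above
// for large widths.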
6191 void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
6192   size_t width_tmp = (size_t)(width);
6193   asm volatile(
6194 
6195       "rep         movsb                         \n"
6196       : "+S"(src),       // %0
6197         "+D"(dst),       // %1
6198         "+c"(width_tmp)  // %2
6199       :
6200       : "memory", "cc");
6201 }
6202 #endif  // HAS_COPYROW_ERMS
6203 
6204 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
6205 // width in pixels
6206 void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
6207   asm volatile(
6208       "pcmpeqb     %%xmm0,%%xmm0                 \n"
6209       "pslld       $0x18,%%xmm0                  \n"  // 0xff000000 alpha mask
6210       "pcmpeqb     %%xmm1,%%xmm1                 \n"
6211       "psrld       $0x8,%%xmm1                   \n"  // 0x00ffffff color mask
6212 
6213       LABELALIGN
6214       "1:                                        \n"
6215       "movdqu      (%0),%%xmm2                   \n"
6216       "movdqu      0x10(%0),%%xmm3               \n"
6217       "lea         0x20(%0),%0                   \n"
6218       "movdqu      (%1),%%xmm4                   \n"
6219       "movdqu      0x10(%1),%%xmm5               \n"
6220       "pand        %%xmm0,%%xmm2                 \n"
6221       "pand        %%xmm0,%%xmm3                 \n"
6222       "pand        %%xmm1,%%xmm4                 \n"
6223       "pand        %%xmm1,%%xmm5                 \n"
6224       "por         %%xmm4,%%xmm2                 \n"
6225       "por         %%xmm5,%%xmm3                 \n"
6226       "movdqu      %%xmm2,(%1)                   \n"
6227       "movdqu      %%xmm3,0x10(%1)               \n"
6228       "lea         0x20(%1),%1                   \n"
6229       "sub         $0x8,%2                       \n"
6230       "jg          1b                            \n"
6231       : "+r"(src),   // %0
6232         "+r"(dst),   // %1
6233         "+r"(width)  // %2
6234       :
6235       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
6236 }
6237 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
6238 
6239 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
6240 // width in pixels
6241 void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
6242   asm volatile(
6243       "vpcmpeqb    %%ymm0,%%ymm0,%%ymm0          \n"
6244       "vpsrld      $0x8,%%ymm0,%%ymm0            \n"
6245 
6246       LABELALIGN
6247       "1:                                        \n"
6248       "vmovdqu     (%0),%%ymm1                   \n"
6249       "vmovdqu     0x20(%0),%%ymm2               \n"
6250       "lea         0x40(%0),%0                   \n"
6251       "vpblendvb   %%ymm0,(%1),%%ymm1,%%ymm1     \n"
6252       "vpblendvb   %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
6253       "vmovdqu     %%ymm1,(%1)                   \n"
6254       "vmovdqu     %%ymm2,0x20(%1)               \n"
6255       "lea         0x40(%1),%1                   \n"
6256       "sub         $0x10,%2                      \n"
6257       "jg          1b                            \n"
6258       "vzeroupper                                \n"
6259       : "+r"(src),   // %0
6260         "+r"(dst),   // %1
6261         "+r"(width)  // %2
6262       :
6263       : "memory", "cc", "xmm0", "xmm1", "xmm2");
6264 }
6265 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
6266 
6267 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
6268 // width in pixels
6269 void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
6270                               uint8_t* dst_a,
6271                               int width) {
6272   asm volatile(
6273 
6274       LABELALIGN
6275       "1:                                        \n"
6276       "movdqu      (%0), %%xmm0                  \n"
6277       "movdqu      0x10(%0), %%xmm1              \n"
6278       "lea         0x20(%0), %0                  \n"
6279       "psrld       $0x18, %%xmm0                 \n"
6280       "psrld       $0x18, %%xmm1                 \n"
6281       "packssdw    %%xmm1, %%xmm0                \n"
6282       "packuswb    %%xmm0, %%xmm0                \n"
6283       "movq        %%xmm0,(%1)                   \n"
6284       "lea         0x8(%1), %1                   \n"
6285       "sub         $0x8, %2                      \n"
6286       "jg          1b                            \n"
6287       : "+r"(src_argb),  // %0
6288         "+r"(dst_a),     // %1
6289         "+rm"(width)     // %2
6290       :
6291       : "memory", "cc", "xmm0", "xmm1");
6292 }
6293 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
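// Scalar equivalent of alpha extraction: dst_a[x] = src_argb[4 * x + 3].
// The SSE2 version above shifts each ARGB dword right by 24 and narrows
// twice; the AVX2 version below does the same with pshufb and uses vpermd
// to undo the lane interleaving of the 256-bit packs.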
6294 
6295 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
6296 static const uvec8 kShuffleAlphaShort_AVX2 = {
6297     3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
6298     11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
6299 
6300 void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
6301                               uint8_t* dst_a,
6302                               int width) {
6303   asm volatile(
6304       "vmovdqa     %3,%%ymm4                     \n"
6305       "vbroadcastf128 %4,%%ymm5                  \n"
6306 
6307       LABELALIGN
6308       "1:                                        \n"
6309       "vmovdqu     (%0), %%ymm0                  \n"
6310       "vmovdqu     0x20(%0), %%ymm1              \n"
6311       "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // vpsrld $0x18, %%ymm0
6312       "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
6313       "vmovdqu     0x40(%0), %%ymm2              \n"
6314       "vmovdqu     0x60(%0), %%ymm3              \n"
6315       "lea         0x80(%0), %0                  \n"
6316       "vpackssdw   %%ymm1, %%ymm0, %%ymm0        \n"  // mutates
6317       "vpshufb     %%ymm5,%%ymm2,%%ymm2          \n"
6318       "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
6319       "vpackssdw   %%ymm3, %%ymm2, %%ymm2        \n"  // mutates
6320       "vpackuswb   %%ymm2,%%ymm0,%%ymm0          \n"  // mutates.
6321       "vpermd      %%ymm0,%%ymm4,%%ymm0          \n"  // unmutate.
6322       "vmovdqu     %%ymm0,(%1)                   \n"
6323       "lea         0x20(%1),%1                   \n"
6324       "sub         $0x20, %2                     \n"
6325       "jg          1b                            \n"
6326       "vzeroupper                                \n"
6327       : "+r"(src_argb),               // %0
6328         "+r"(dst_a),                  // %1
6329         "+rm"(width)                  // %2
6330       : "m"(kPermdARGBToY_AVX),       // %3
6331         "m"(kShuffleAlphaShort_AVX2)  // %4
6332       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
6333 }
6334 #endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
6335 
6336 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
6337 // width in pixels
6338 void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
6339   asm volatile(
6340       "pcmpeqb     %%xmm0,%%xmm0                 \n"
6341       "pslld       $0x18,%%xmm0                  \n"  // 0xff000000 alpha mask
6342       "pcmpeqb     %%xmm1,%%xmm1                 \n"
6343       "psrld       $0x8,%%xmm1                   \n"  // 0x00ffffff color mask
6344 
6345       LABELALIGN
6346       "1:                                        \n"
6347       "movq        (%0),%%xmm2                   \n"
6348       "lea         0x8(%0),%0                    \n"
6349       "punpcklbw   %%xmm2,%%xmm2                 \n"
6350       "punpckhwd   %%xmm2,%%xmm3                 \n"
6351       "punpcklwd   %%xmm2,%%xmm2                 \n"
6352       "movdqu      (%1),%%xmm4                   \n"
6353       "movdqu      0x10(%1),%%xmm5               \n"
6354       "pand        %%xmm0,%%xmm2                 \n"
6355       "pand        %%xmm0,%%xmm3                 \n"
6356       "pand        %%xmm1,%%xmm4                 \n"
6357       "pand        %%xmm1,%%xmm5                 \n"
6358       "por         %%xmm4,%%xmm2                 \n"
6359       "por         %%xmm5,%%xmm3                 \n"
6360       "movdqu      %%xmm2,(%1)                   \n"
6361       "movdqu      %%xmm3,0x10(%1)               \n"
6362       "lea         0x20(%1),%1                   \n"
6363       "sub         $0x8,%2                       \n"
6364       "jg          1b                            \n"
6365       : "+r"(src),   // %0
6366         "+r"(dst),   // %1
6367         "+r"(width)  // %2
6368       :
6369       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
6370 }
6371 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
6372 
6373 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
6374 // width in pixels
6375 void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
6376   asm volatile(
6377       "vpcmpeqb    %%ymm0,%%ymm0,%%ymm0          \n"
6378       "vpsrld      $0x8,%%ymm0,%%ymm0            \n"
6379 
6380       LABELALIGN
6381       "1:                                        \n"
6382       "vpmovzxbd   (%0),%%ymm1                   \n"
6383       "vpmovzxbd   0x8(%0),%%ymm2                \n"
6384       "lea         0x10(%0),%0                   \n"
6385       "vpslld      $0x18,%%ymm1,%%ymm1           \n"
6386       "vpslld      $0x18,%%ymm2,%%ymm2           \n"
6387       "vpblendvb   %%ymm0,(%1),%%ymm1,%%ymm1     \n"
6388       "vpblendvb   %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
6389       "vmovdqu     %%ymm1,(%1)                   \n"
6390       "vmovdqu     %%ymm2,0x20(%1)               \n"
6391       "lea         0x40(%1),%1                   \n"
6392       "sub         $0x10,%2                      \n"
6393       "jg          1b                            \n"
6394       "vzeroupper                                \n"
6395       : "+r"(src),   // %0
6396         "+r"(dst),   // %1
6397         "+r"(width)  // %2
6398       :
6399       : "memory", "cc", "xmm0", "xmm1", "xmm2");
6400 }
6401 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
6402 
6403 #ifdef HAS_SETROW_X86
6404 void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
6405   size_t width_tmp = (size_t)(width >> 2);
6406   const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte: 0x5a -> 0x5a5a5a5au.
6407   asm volatile(
6408 
6409       "rep         stosl                         \n"
6410       : "+D"(dst),       // %0
6411         "+c"(width_tmp)  // %1
6412       : "a"(v32)         // %2
6413       : "memory", "cc");
6414 }
6415 
6416 void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
6417   size_t width_tmp = (size_t)(width);
6418   asm volatile(
6419 
6420       "rep         stosb                         \n"
6421       : "+D"(dst),       // %0
6422         "+c"(width_tmp)  // %1
6423       : "a"(v8)          // %2
6424       : "memory", "cc");
6425 }
6426 
6427 void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
6428   size_t width_tmp = (size_t)(width);
6429   asm volatile(
6430 
6431       "rep         stosl                         \n"
6432       : "+D"(dst_argb),  // %0
6433         "+c"(width_tmp)  // %1
6434       : "a"(v32)         // %2
6435       : "memory", "cc");
6436 }
6437 #endif  // HAS_SETROW_X86
6438 
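// YUY2 stores two pixels in four bytes as Y0,U,Y1,V; UYVY stores U,Y0,V,Y1.
// The rows below extract Y by masking the low byte of each word (YUY2) or
// shifting each word right by 8 (UYVY), and take U/V from the opposite byte
// lane. The ToUV variants also average two source rows with pavgb to produce
// 4:2:0 chroma, while the ToUV422 variants use a single row.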
6439 #ifdef HAS_YUY2TOYROW_SSE2
6440 void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
6441   asm volatile(
6442       "pcmpeqb     %%xmm5,%%xmm5                 \n"
6443       "psrlw       $0x8,%%xmm5                   \n"
6444 
6445       LABELALIGN
6446       "1:                                        \n"
6447       "movdqu      (%0),%%xmm0                   \n"
6448       "movdqu      0x10(%0),%%xmm1               \n"
6449       "lea         0x20(%0),%0                   \n"
6450       "pand        %%xmm5,%%xmm0                 \n"
6451       "pand        %%xmm5,%%xmm1                 \n"
6452       "packuswb    %%xmm1,%%xmm0                 \n"
6453       "movdqu      %%xmm0,(%1)                   \n"
6454       "lea         0x10(%1),%1                   \n"
6455       "sub         $0x10,%2                      \n"
6456       "jg          1b                            \n"
6457       : "+r"(src_yuy2),  // %0
6458         "+r"(dst_y),     // %1
6459         "+r"(width)      // %2
6460       :
6461       : "memory", "cc", "xmm0", "xmm1", "xmm5");
6462 }
6463 
6464 void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
6465                       int stride_yuy2,
6466                       uint8_t* dst_u,
6467                       uint8_t* dst_v,
6468                       int width) {
6469   asm volatile(
6470       "pcmpeqb     %%xmm5,%%xmm5                 \n"
6471       "psrlw       $0x8,%%xmm5                   \n"
6472       "sub         %1,%2                         \n"
6473 
6474       LABELALIGN
6475       "1:                                        \n"
6476       "movdqu      (%0),%%xmm0                   \n"
6477       "movdqu      0x10(%0),%%xmm1               \n"
6478       "movdqu      0x00(%0,%4,1),%%xmm2          \n"
6479       "movdqu      0x10(%0,%4,1),%%xmm3          \n"
6480       "lea         0x20(%0),%0                   \n"
6481       "pavgb       %%xmm2,%%xmm0                 \n"
6482       "pavgb       %%xmm3,%%xmm1                 \n"
6483       "psrlw       $0x8,%%xmm0                   \n"
6484       "psrlw       $0x8,%%xmm1                   \n"
6485       "packuswb    %%xmm1,%%xmm0                 \n"
6486       "movdqa      %%xmm0,%%xmm1                 \n"
6487       "pand        %%xmm5,%%xmm0                 \n"
6488       "packuswb    %%xmm0,%%xmm0                 \n"
6489       "psrlw       $0x8,%%xmm1                   \n"
6490       "packuswb    %%xmm1,%%xmm1                 \n"
6491       "movq        %%xmm0,(%1)                   \n"
6492       "movq        %%xmm1,0x00(%1,%2,1)          \n"
6493       "lea         0x8(%1),%1                    \n"
6494       "sub         $0x10,%3                      \n"
6495       "jg          1b                            \n"
6496       : "+r"(src_yuy2),               // %0
6497         "+r"(dst_u),                  // %1
6498         "+r"(dst_v),                  // %2
6499         "+r"(width)                   // %3
6500       : "r"((intptr_t)(stride_yuy2))  // %4
6501       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
6502 }
6503 
6504 void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
6505                          uint8_t* dst_u,
6506                          uint8_t* dst_v,
6507                          int width) {
6508   asm volatile(
6509       "pcmpeqb     %%xmm5,%%xmm5                 \n"
6510       "psrlw       $0x8,%%xmm5                   \n"
6511       "sub         %1,%2                         \n"
6512 
6513       LABELALIGN
6514       "1:                                        \n"
6515       "movdqu      (%0),%%xmm0                   \n"
6516       "movdqu      0x10(%0),%%xmm1               \n"
6517       "lea         0x20(%0),%0                   \n"
6518       "psrlw       $0x8,%%xmm0                   \n"
6519       "psrlw       $0x8,%%xmm1                   \n"
6520       "packuswb    %%xmm1,%%xmm0                 \n"
6521       "movdqa      %%xmm0,%%xmm1                 \n"
6522       "pand        %%xmm5,%%xmm0                 \n"
6523       "packuswb    %%xmm0,%%xmm0                 \n"
6524       "psrlw       $0x8,%%xmm1                   \n"
6525       "packuswb    %%xmm1,%%xmm1                 \n"
6526       "movq        %%xmm0,(%1)                   \n"
6527       "movq        %%xmm1,0x00(%1,%2,1)          \n"
6528       "lea         0x8(%1),%1                    \n"
6529       "sub         $0x10,%3                      \n"
6530       "jg          1b                            \n"
6531       : "+r"(src_yuy2),  // %0
6532         "+r"(dst_u),     // %1
6533         "+r"(dst_v),     // %2
6534         "+r"(width)      // %3
6535       :
6536       : "memory", "cc", "xmm0", "xmm1", "xmm5");
6537 }
6538 
6539 void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
6540   asm volatile(
6541 
6542       LABELALIGN
6543       "1:                                        \n"
6544       "movdqu      (%0),%%xmm0                   \n"
6545       "movdqu      0x10(%0),%%xmm1               \n"
6546       "lea         0x20(%0),%0                   \n"
6547       "psrlw       $0x8,%%xmm0                   \n"
6548       "psrlw       $0x8,%%xmm1                   \n"
6549       "packuswb    %%xmm1,%%xmm0                 \n"
6550       "movdqu      %%xmm0,(%1)                   \n"
6551       "lea         0x10(%1),%1                   \n"
6552       "sub         $0x10,%2                      \n"
6553       "jg          1b                            \n"
6554       : "+r"(src_uyvy),  // %0
6555         "+r"(dst_y),     // %1
6556         "+r"(width)      // %2
6557       :
6558       : "memory", "cc", "xmm0", "xmm1");
6559 }
6560 
6561 void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
6562                       int stride_uyvy,
6563                       uint8_t* dst_u,
6564                       uint8_t* dst_v,
6565                       int width) {
6566   asm volatile(
6567       "pcmpeqb     %%xmm5,%%xmm5                 \n"
6568       "psrlw       $0x8,%%xmm5                   \n"
6569       "sub         %1,%2                         \n"
6570 
6571       LABELALIGN
6572       "1:                                        \n"
6573       "movdqu      (%0),%%xmm0                   \n"
6574       "movdqu      0x10(%0),%%xmm1               \n"
6575       "movdqu      0x00(%0,%4,1),%%xmm2          \n"
6576       "movdqu      0x10(%0,%4,1),%%xmm3          \n"
6577       "lea         0x20(%0),%0                   \n"
6578       "pavgb       %%xmm2,%%xmm0                 \n"
6579       "pavgb       %%xmm3,%%xmm1                 \n"
6580       "pand        %%xmm5,%%xmm0                 \n"
6581       "pand        %%xmm5,%%xmm1                 \n"
6582       "packuswb    %%xmm1,%%xmm0                 \n"
6583       "movdqa      %%xmm0,%%xmm1                 \n"
6584       "pand        %%xmm5,%%xmm0                 \n"
6585       "packuswb    %%xmm0,%%xmm0                 \n"
6586       "psrlw       $0x8,%%xmm1                   \n"
6587       "packuswb    %%xmm1,%%xmm1                 \n"
6588       "movq        %%xmm0,(%1)                   \n"
6589       "movq        %%xmm1,0x00(%1,%2,1)          \n"
6590       "lea         0x8(%1),%1                    \n"
6591       "sub         $0x10,%3                      \n"
6592       "jg          1b                            \n"
6593       : "+r"(src_uyvy),               // %0
6594         "+r"(dst_u),                  // %1
6595         "+r"(dst_v),                  // %2
6596         "+r"(width)                   // %3
6597       : "r"((intptr_t)(stride_uyvy))  // %4
6598       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
6599 }
6600 
6601 void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
6602                          uint8_t* dst_u,
6603                          uint8_t* dst_v,
6604                          int width) {
6605   asm volatile(
6606       "pcmpeqb     %%xmm5,%%xmm5                 \n"
6607       "psrlw       $0x8,%%xmm5                   \n"
6608       "sub         %1,%2                         \n"
6609 
6610       LABELALIGN
6611       "1:                                        \n"
6612       "movdqu      (%0),%%xmm0                   \n"
6613       "movdqu      0x10(%0),%%xmm1               \n"
6614       "lea         0x20(%0),%0                   \n"
6615       "pand        %%xmm5,%%xmm0                 \n"
6616       "pand        %%xmm5,%%xmm1                 \n"
6617       "packuswb    %%xmm1,%%xmm0                 \n"
6618       "movdqa      %%xmm0,%%xmm1                 \n"
6619       "pand        %%xmm5,%%xmm0                 \n"
6620       "packuswb    %%xmm0,%%xmm0                 \n"
6621       "psrlw       $0x8,%%xmm1                   \n"
6622       "packuswb    %%xmm1,%%xmm1                 \n"
6623       "movq        %%xmm0,(%1)                   \n"
6624       "movq        %%xmm1,0x00(%1,%2,1)          \n"
6625       "lea         0x8(%1),%1                    \n"
6626       "sub         $0x10,%3                      \n"
6627       "jg          1b                            \n"
6628       : "+r"(src_uyvy),  // %0
6629         "+r"(dst_u),     // %1
6630         "+r"(dst_v),     // %2
6631         "+r"(width)      // %3
6632       :
6633       : "memory", "cc", "xmm0", "xmm1", "xmm5");
6634 }
6635 #endif  // HAS_YUY2TOYROW_SSE2
6636 
6637 #ifdef HAS_YUY2TOYROW_AVX2
6638 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
6639   asm volatile(
6640       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
6641       "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
6642 
6643       LABELALIGN
6644       "1:                                        \n"
6645       "vmovdqu     (%0),%%ymm0                   \n"
6646       "vmovdqu     0x20(%0),%%ymm1               \n"
6647       "lea         0x40(%0),%0                   \n"
6648       "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
6649       "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
6650       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
6651       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
6652       "vmovdqu     %%ymm0,(%1)                   \n"
6653       "lea         0x20(%1),%1                   \n"
6654       "sub         $0x20,%2                      \n"
6655       "jg          1b                            \n"
6656       "vzeroupper                                \n"
6657       : "+r"(src_yuy2),  // %0
6658         "+r"(dst_y),     // %1
6659         "+r"(width)      // %2
6660       :
6661       : "memory", "cc", "xmm0", "xmm1", "xmm5");
6662 }
6663 
6664 void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
6665                       int stride_yuy2,
6666                       uint8_t* dst_u,
6667                       uint8_t* dst_v,
6668                       int width) {
6669   asm volatile(
6670       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
6671       "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
6672       "sub         %1,%2                         \n"
6673 
6674       LABELALIGN
6675       "1:                                        \n"
6676       "vmovdqu     (%0),%%ymm0                   \n"
6677       "vmovdqu     0x20(%0),%%ymm1               \n"
6678       "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
6679       "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
6680       "lea         0x40(%0),%0                   \n"
6681       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
6682       "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
6683       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
6684       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
6685       "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
6686       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
6687       "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
6688       "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
6689       "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
6690       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
6691       "vextractf128 $0x0,%%ymm1,(%1)             \n"
6692       "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
6693       "lea         0x10(%1),%1                   \n"
6694       "sub         $0x20,%3                      \n"
6695       "jg          1b                            \n"
6696       "vzeroupper                                \n"
6697       : "+r"(src_yuy2),               // %0
6698         "+r"(dst_u),                  // %1
6699         "+r"(dst_v),                  // %2
6700         "+r"(width)                   // %3
6701       : "r"((intptr_t)(stride_yuy2))  // %4
6702       : "memory", "cc", "xmm0", "xmm1", "xmm5");
6703 }
6704 
6705 void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
6706                          uint8_t* dst_u,
6707                          uint8_t* dst_v,
6708                          int width) {
6709   asm volatile(
6710       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
6711       "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
6712       "sub         %1,%2                         \n"
6713 
6714       LABELALIGN
6715       "1:                                        \n"
6716       "vmovdqu     (%0),%%ymm0                   \n"
6717       "vmovdqu     0x20(%0),%%ymm1               \n"
6718       "lea         0x40(%0),%0                   \n"
6719       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
6720       "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
6721       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
6722       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
6723       "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
6724       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
6725       "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
6726       "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
6727       "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
6728       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
6729       "vextractf128 $0x0,%%ymm1,(%1)             \n"
6730       "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
6731       "lea         0x10(%1),%1                   \n"
6732       "sub         $0x20,%3                      \n"
6733       "jg          1b                            \n"
6734       "vzeroupper                                \n"
6735       : "+r"(src_yuy2),  // %0
6736         "+r"(dst_u),     // %1
6737         "+r"(dst_v),     // %2
6738         "+r"(width)      // %3
6739       :
6740       : "memory", "cc", "xmm0", "xmm1", "xmm5");
6741 }
6742 
UYVYToYRow_AVX2(const uint8_t * src_uyvy,uint8_t * dst_y,int width)6743 void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
6744   asm volatile(
6745 
6746       LABELALIGN
6747       "1:                                        \n"
6748       "vmovdqu     (%0),%%ymm0                   \n"
6749       "vmovdqu     0x20(%0),%%ymm1               \n"
6750       "lea         0x40(%0),%0                   \n"
6751       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
6752       "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
6753       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
6754       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
6755       "vmovdqu     %%ymm0,(%1)                   \n"
6756       "lea         0x20(%1),%1                   \n"
6757       "sub         $0x20,%2                      \n"
6758       "jg          1b                            \n"
6759       "vzeroupper                                \n"
6760       : "+r"(src_uyvy),  // %0
6761         "+r"(dst_y),     // %1
6762         "+r"(width)      // %2
6763       :
6764       : "memory", "cc", "xmm0", "xmm1", "xmm5");
6765 }
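
// UYVY packs pixels as U0 Y0 V0 Y1, so UYVYToYRow above takes the odd
// (high) bytes with vpsrlw $8 where YUY2ToYRow used a mask; the scalar
// equivalent is simply dst_y[x] = src_uyvy[2 * x + 1].
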
void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "lea         0x40(%0),%0                   \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uyvy),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_YUY2TOYROW_AVX2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};

// Blend 4 pixels at a time, with a single-pixel tail loop.
void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
                        const uint8_t* src_argb1,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $0xf,%%xmm7                   \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "psrlw       $0x8,%%xmm6                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psllw       $0x8,%%xmm5                   \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "pslld       $0x18,%%xmm4                  \n"
      "sub         $0x4,%3                       \n"
      "jl          49f                           \n"

      // 4 pixel loop.
      LABELALIGN
      "40:                                       \n"
      "movdqu      (%0),%%xmm3                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm0                 \n"
      "pxor        %%xmm4,%%xmm3                 \n"
      "movdqu      (%1),%%xmm2                   \n"
      "pshufb      %4,%%xmm3                     \n"
      "pand        %%xmm6,%%xmm2                 \n"
      "paddw       %%xmm7,%%xmm3                 \n"
      "pmullw      %%xmm3,%%xmm2                 \n"
      "movdqu      (%1),%%xmm1                   \n"
      "lea         0x10(%1),%1                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pmullw      %%xmm3,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm2                   \n"
      "paddusb     %%xmm2,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%3                       \n"
      "jl          99f                           \n"

      // 1 pixel loop.
      "91:                                       \n"
      "movd        (%0),%%xmm3                   \n"
      "lea         0x4(%0),%0                    \n"
      "movdqa      %%xmm3,%%xmm0                 \n"
      "pxor        %%xmm4,%%xmm3                 \n"
      "movd        (%1),%%xmm2                   \n"
      "pshufb      %4,%%xmm3                     \n"
      "pand        %%xmm6,%%xmm2                 \n"
      "paddw       %%xmm7,%%xmm3                 \n"
      "pmullw      %%xmm3,%%xmm2                 \n"
      "movd        (%1),%%xmm1                   \n"
      "lea         0x4(%1),%1                    \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pmullw      %%xmm3,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm2                   \n"
      "paddusb     %%xmm2,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movd        %%xmm0,(%2)                   \n"
      "lea         0x4(%2),%2                    \n"
      "sub         $0x1,%3                       \n"
      "jge         91b                           \n"
      "99:                                       \n"
      : "+r"(src_argb),     // %0
        "+r"(src_argb1),    // %1
        "+r"(dst_argb),     // %2
        "+r"(width)         // %3
      : "m"(kShuffleAlpha)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
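
// Scalar sketch of the blend math above, illustration only. Flipping the
// alpha byte with pxor and adding 1 yields (256 - a), so an opaque
// foreground passes through unchanged; paddusb makes the final add
// saturating. Hypothetical guard keeps the sketch out of real builds.
#if defined(LIBYUV_ROW_SCALAR_SKETCHES)
static void ARGBBlendRow_ScalarSketch(const uint8_t* src_argb,
                                      const uint8_t* src_argb1,
                                      uint8_t* dst_argb,
                                      int width) {
  for (int i = 0; i < width * 4; i += 4) {
    uint32_t a = src_argb[i + 3];
    for (int c = 0; c < 3; ++c) {
      uint32_t v = src_argb[i + c] + (((256 - a) * src_argb1[i + c]) >> 8);
      dst_argb[i + c] = (uint8_t)(v > 255 ? 255 : v);  // saturate, as paddusb.
    }
    dst_argb[i + 3] = 255;  // result alpha is forced opaque (por with xmm4).
  }
}
#endif  // LIBYUV_ROW_SCALAR_SKETCHES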
#endif  // HAS_ARGBBLENDROW_SSSE3

#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
void BlendPlaneRow_SSSE3(const uint8_t* src0,
                         const uint8_t* src1,
                         const uint8_t* alpha,
                         uint8_t* dst,
                         int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psllw       $0x8,%%xmm5                   \n"
      "mov         $0x80808080,%%eax             \n"
      "movd        %%eax,%%xmm6                  \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "mov         $0x807f807f,%%eax             \n"
      "movd        %%eax,%%xmm7                  \n"
      "pshufd      $0x0,%%xmm7,%%xmm7            \n"
      "sub         %2,%0                         \n"
      "sub         %2,%1                         \n"
      "sub         %2,%3                         \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq        (%2),%%xmm0                   \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "pxor        %%xmm5,%%xmm0                 \n"
      "movq        (%0,%2,1),%%xmm1              \n"
      "movq        (%1,%2,1),%%xmm2              \n"
      "punpcklbw   %%xmm2,%%xmm1                 \n"
      "psubb       %%xmm6,%%xmm1                 \n"
      "pmaddubsw   %%xmm1,%%xmm0                 \n"
      "paddw       %%xmm7,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%3,%2,1)              \n"
      "lea         0x8(%2),%2                    \n"
      "sub         $0x8,%4                       \n"
      "jg          1b                            \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
}
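
// Scalar sketch of the unsigned formulation quoted above, illustration only:
// dst = (src0 * a + src1 * (255 - a) + 255) >> 8. The SIMD path recenters
// bytes around zero (the 0x80808080 constant) so pmaddubsw can form both
// products at once. Hypothetical guard as before.
#if defined(LIBYUV_ROW_SCALAR_SKETCHES)
static void BlendPlaneRow_ScalarSketch(const uint8_t* src0,
                                       const uint8_t* src1,
                                       const uint8_t* alpha,
                                       uint8_t* dst,
                                       int width) {
  for (int x = 0; x < width; ++x) {
    uint32_t a = alpha[x];
    dst[x] = (uint8_t)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}
#endif  // LIBYUV_ROW_SCALAR_SKETCHES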
#endif  // HAS_BLENDPLANEROW_SSSE3

#ifdef HAS_BLENDPLANEROW_AVX2
// Blend 32 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
void BlendPlaneRow_AVX2(const uint8_t* src0,
                        const uint8_t* src1,
                        const uint8_t* alpha,
                        uint8_t* dst,
                        int width) {
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsllw      $0x8,%%ymm5,%%ymm5            \n"
      "mov         $0x80808080,%%eax             \n"
      "vmovd       %%eax,%%xmm6                  \n"
      "vbroadcastss %%xmm6,%%ymm6                \n"
      "mov         $0x807f807f,%%eax             \n"
      "vmovd       %%eax,%%xmm7                  \n"
      "vbroadcastss %%xmm7,%%ymm7                \n"
      "sub         %2,%0                         \n"
      "sub         %2,%1                         \n"
      "sub         %2,%3                         \n"

      // 32 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%2),%%ymm0                   \n"
      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm3          \n"
      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
      "vpxor       %%ymm5,%%ymm3,%%ymm3          \n"
      "vpxor       %%ymm5,%%ymm0,%%ymm0          \n"
      "vmovdqu     (%0,%2,1),%%ymm1              \n"
      "vmovdqu     (%1,%2,1),%%ymm2              \n"
      "vpunpckhbw  %%ymm2,%%ymm1,%%ymm4          \n"
      "vpunpcklbw  %%ymm2,%%ymm1,%%ymm1          \n"
      "vpsubb      %%ymm6,%%ymm4,%%ymm4          \n"
      "vpsubb      %%ymm6,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpmaddubsw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm7,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm7,%%ymm0,%%ymm0          \n"
      "vpsrlw      $0x8,%%ymm3,%%ymm3            \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%3,%2,1)              \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x20,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_BLENDPLANEROW_AVX2

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
                                     7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
                                     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
// Attenuate 4 pixels at a time.
void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "pslld       $0x18,%%xmm3                  \n"
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "movdqu      (%0),%%xmm1                   \n"
      "punpcklbw   %%xmm1,%%xmm1                 \n"
      "pmulhuw     %%xmm1,%%xmm0                 \n"
      "movdqu      (%0),%%xmm1                   \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "movdqu      (%0),%%xmm2                   \n"
      "punpckhbw   %%xmm2,%%xmm2                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "movdqu      (%0),%%xmm2                   \n"
      "lea         0x10(%0),%0                   \n"
      "pand        %%xmm3,%%xmm2                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),       // %0
        "+r"(dst_argb),       // %1
        "+r"(width)           // %2
      : "m"(kShuffleAlpha0),  // %3
        "m"(kShuffleAlpha1)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
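
// Conceptually, attenuation scales each color channel by its alpha:
// dst = v * a / 255, with alpha preserved. A scalar sketch, illustration
// only; the SIMD path above uses a pmulhuw fixed-point approximation of
// this divide, so results can differ by a small rounding amount.
#if defined(LIBYUV_ROW_SCALAR_SKETCHES)
static void ARGBAttenuateRow_ScalarSketch(const uint8_t* src_argb,
                                          uint8_t* dst_argb,
                                          int width) {
  for (int i = 0; i < width * 4; i += 4) {
    uint32_t a = src_argb[i + 3];
    dst_argb[i + 0] = (uint8_t)(src_argb[i + 0] * a / 255);
    dst_argb[i + 1] = (uint8_t)(src_argb[i + 1] * a / 255);
    dst_argb[i + 2] = (uint8_t)(src_argb[i + 2] * a / 255);
    dst_argb[i + 3] = (uint8_t)a;  // alpha channel passes through.
  }
}
#endif  // LIBYUV_ROW_SCALAR_SKETCHES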
#endif  // HAS_ARGBATTENUATEROW_SSSE3

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
                                         128u, 128u, 14u,  15u, 14u, 15u,
                                         14u,  15u,  128u, 128u};
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpslld      $0x18,%%ymm5,%%ymm5           \n"
      "sub         %0,%1                         \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm6                   \n"
      "vpunpcklbw  %%ymm6,%%ymm6,%%ymm0          \n"
      "vpunpckhbw  %%ymm6,%%ymm6,%%ymm1          \n"
      "vpshufb     %%ymm4,%%ymm0,%%ymm2          \n"
      "vpshufb     %%ymm4,%%ymm1,%%ymm3          \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
      "vpand       %%ymm5,%%ymm6,%%ymm6          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpor        %%ymm6,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),          // %0
        "+r"(dst_argb),          // %1
        "+r"(width)              // %2
      : "m"(kShuffleAlpha_AVX2)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width) {
  uintptr_t alpha;
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movzb       0x03(%0),%3                   \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "movd        0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x07(%0),%3                   \n"
      "movd        0x00(%4,%3,4),%%xmm3          \n"
      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
      "movlhps     %%xmm3,%%xmm2                 \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "movdqu      (%0),%%xmm1                   \n"
      "movzb       0x0b(%0),%3                   \n"
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "movd        0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x0f(%0),%3                   \n"
      "movd        0x00(%4,%3,4),%%xmm3          \n"
      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
      "movlhps     %%xmm3,%%xmm2                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "lea         0x10(%0),%0                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),     // %0
        "+r"(dst_argb),     // %1
        "+r"(width),        // %2
        "=&r"(alpha)        // %3
      : "r"(fixed_invtbl8)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
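
// Unattenuate is the inverse: dst = min(255, v * 255 / a). fixed_invtbl8
// supplies a fixed-point reciprocal per alpha value so the SIMD path can
// multiply (pmulhuw) instead of divide. Scalar sketch, illustration only:
#if defined(LIBYUV_ROW_SCALAR_SKETCHES)
static void ARGBUnattenuateRow_ScalarSketch(const uint8_t* src_argb,
                                            uint8_t* dst_argb,
                                            int width) {
  for (int i = 0; i < width * 4; i += 4) {
    uint32_t a = src_argb[i + 3];
    for (int c = 0; c < 3; ++c) {
      uint32_t v = a ? src_argb[i + c] * 255 / a : src_argb[i + c];
      dst_argb[i + c] = (uint8_t)(v > 255 ? 255 : v);  // clamp like packuswb.
    }
    dst_argb[i + 3] = (uint8_t)a;  // alpha channel passes through.
  }
}
#endif  // LIBYUV_ROW_SCALAR_SKETCHES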
#endif  // HAS_ARGBUNATTENUATEROW_SSE2

#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// Unattenuate 8 pixels at a time.
void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width) {
  uintptr_t alpha;
  asm volatile(
      "sub         %0,%1                         \n"
      "vbroadcastf128 %5,%%ymm5                  \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      // replace VPGATHER
      "movzb       0x03(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm0          \n"
      "movzb       0x07(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm1          \n"
      "movzb       0x0b(%0),%3                   \n"
      "vpunpckldq  %%xmm1,%%xmm0,%%xmm6          \n"
      "vmovd       0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x0f(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm3          \n"
      "movzb       0x13(%0),%3                   \n"
      "vpunpckldq  %%xmm3,%%xmm2,%%xmm7          \n"
      "vmovd       0x00(%4,%3,4),%%xmm0          \n"
      "movzb       0x17(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm1          \n"
      "movzb       0x1b(%0),%3                   \n"
      "vpunpckldq  %%xmm1,%%xmm0,%%xmm0          \n"
      "vmovd       0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x1f(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm3          \n"
      "vpunpckldq  %%xmm3,%%xmm2,%%xmm2          \n"
      "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
      "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
      "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
      // end of VPGATHER

      "vmovdqu     (%0),%%ymm6                   \n"
      "vpunpcklbw  %%ymm6,%%ymm6,%%ymm0          \n"
      "vpunpckhbw  %%ymm6,%%ymm6,%%ymm1          \n"
      "vpunpcklwd  %%ymm3,%%ymm3,%%ymm2          \n"
      "vpunpckhwd  %%ymm3,%%ymm3,%%ymm3          \n"
      "vpshufb     %%ymm5,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),                 // %0
        "+r"(dst_argb),                 // %1
        "+r"(width),                    // %2
        "=&r"(alpha)                    // %3
      : "r"(fixed_invtbl8),             // %4
        "m"(kUnattenShuffleAlpha_AVX2)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBUNATTENUATEROW_AVX2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "psubb       %%xmm5,%%xmm0                 \n"
      "psubb       %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
      "movdqu      %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm1,%%xmm0                 \n"
      "phaddw      %%xmm0,%%xmm6                 \n"
      "paddw       %%xmm5,%%xmm6                 \n"
      "psrlw       $0x8,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movdqu      (%0),%%xmm2                   \n"
      "movdqu      0x10(%0),%%xmm3               \n"
      "lea         0x20(%0),%0                   \n"
      "psrld       $0x18,%%xmm2                  \n"
      "psrld       $0x18,%%xmm3                  \n"
      "packuswb    %%xmm3,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm2                 \n"
      "movdqa      %%xmm6,%%xmm3                 \n"
      "punpcklbw   %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm2,%%xmm3                 \n"
      "movdqa      %%xmm6,%%xmm1                 \n"
      "punpcklwd   %%xmm3,%%xmm6                 \n"
      "punpckhwd   %%xmm3,%%xmm1                 \n"
      "movdqu      %%xmm6,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
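
// Scalar sketch of the gray conversion, illustration only. The weights come
// from kARGBToYJ: y = (29*b + 150*g + 77*r + 128) >> 8. The psubb/paddw
// kSub128 pair above is a bias trick that lets pmaddubsw treat the pixel
// bytes as signed; the two offsets cancel in the result.
#if defined(LIBYUV_ROW_SCALAR_SKETCHES)
static void ARGBGrayRow_ScalarSketch(const uint8_t* src_argb,
                                     uint8_t* dst_argb,
                                     int width) {
  for (int i = 0; i < width * 4; i += 4) {
    uint32_t b = src_argb[i + 0];
    uint32_t g = src_argb[i + 1];
    uint32_t r = src_argb[i + 2];
    uint8_t y = (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);
    dst_argb[i + 0] = dst_argb[i + 1] = dst_argb[i + 2] = y;
    dst_argb[i + 3] = src_argb[i + 3];  // alpha is preserved.
  }
}
#endif  // LIBYUV_ROW_SCALAR_SKETCHES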
#endif  // HAS_ARGBGRAYROW_SSSE3

#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                                   17, 68, 35, 0, 17, 68, 35, 0};

static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                                   22, 88, 45, 0, 22, 88, 45, 0};

static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                                   24, 98, 50, 0, 24, 98, 50, 0};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  asm volatile(
      "movdqa      %2,%%xmm2                     \n"
      "movdqa      %3,%%xmm3                     \n"
      "movdqa      %4,%%xmm4                     \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "pmaddubsw   %%xmm2,%%xmm0                 \n"
      "pmaddubsw   %%xmm2,%%xmm6                 \n"
      "phaddw      %%xmm6,%%xmm0                 \n"
      "psrlw       $0x7,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movdqu      (%0),%%xmm5                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "pmaddubsw   %%xmm3,%%xmm5                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "phaddw      %%xmm1,%%xmm5                 \n"
      "psrlw       $0x7,%%xmm5                   \n"
      "packuswb    %%xmm5,%%xmm5                 \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "movdqu      (%0),%%xmm5                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "pmaddubsw   %%xmm4,%%xmm5                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "phaddw      %%xmm1,%%xmm5                 \n"
      "psrlw       $0x7,%%xmm5                   \n"
      "packuswb    %%xmm5,%%xmm5                 \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "psrld       $0x18,%%xmm6                  \n"
      "psrld       $0x18,%%xmm1                  \n"
      "packuswb    %%xmm1,%%xmm6                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm6,%%xmm5                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklwd   %%xmm5,%%xmm0                 \n"
      "punpckhwd   %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%0)                   \n"
      "movdqu      %%xmm1,0x10(%0)               \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%1                       \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),      // %0
        "+r"(width)          // %1
      : "m"(kARGBToSepiaB),  // %2
        "m"(kARGBToSepiaG),  // %3
        "m"(kARGBToSepiaR)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
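
// Scalar sketch of the sepia matrix quoted above, illustration only; the
// >> 7 matches the psrlw $0x7 in the SIMD path, and packuswb saturates:
#if defined(LIBYUV_ROW_SCALAR_SKETCHES)
static void ARGBSepiaRow_ScalarSketch(uint8_t* dst_argb, int width) {
  for (int i = 0; i < width * 4; i += 4) {
    int b = dst_argb[i + 0];
    int g = dst_argb[i + 1];
    int r = dst_argb[i + 2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[i + 0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[i + 1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[i + 2] = (uint8_t)(sr > 255 ? 255 : sr);
    // Alpha (byte 3) is left untouched, matching the in-place SIMD version.
  }
}
#endif  // LIBYUV_ROW_SCALAR_SKETCHES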
#endif  // HAS_ARGBSEPIAROW_SSSE3

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              const int8_t* matrix_argb,
                              int width) {
  asm volatile(
      "movdqu      (%3),%%xmm5                   \n"
      "pshufd      $0x00,%%xmm5,%%xmm2           \n"
      "pshufd      $0x55,%%xmm5,%%xmm3           \n"
      "pshufd      $0xaa,%%xmm5,%%xmm4           \n"
      "pshufd      $0xff,%%xmm5,%%xmm5           \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm7               \n"
      "pmaddubsw   %%xmm2,%%xmm0                 \n"
      "pmaddubsw   %%xmm2,%%xmm7                 \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "phaddsw     %%xmm7,%%xmm0                 \n"
      "phaddsw     %%xmm1,%%xmm6                 \n"
      "psraw       $0x6,%%xmm0                   \n"
      "psraw       $0x6,%%xmm6                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm6,%%xmm0                 \n"
      "movdqu      (%0),%%xmm1                   \n"
      "movdqu      0x10(%0),%%xmm7               \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm7                 \n"
      "phaddsw     %%xmm7,%%xmm1                 \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x10(%0),%%xmm7               \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
      "pmaddubsw   %%xmm5,%%xmm7                 \n"
      "phaddsw     %%xmm7,%%xmm6                 \n"
      "psraw       $0x6,%%xmm1                   \n"
      "psraw       $0x6,%%xmm6                   \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm6,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm6                 \n"
      "punpcklwd   %%xmm1,%%xmm0                 \n"
      "punpckhwd   %%xmm1,%%xmm6                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm6,0x10(%1)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      : "r"(matrix_argb)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile(
      "movd        %2,%%xmm2                     \n"
      "movd        %3,%%xmm3                     \n"
      "movd        %4,%%xmm4                     \n"
      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
      "pshufd      $0x44,%%xmm2,%%xmm2           \n"
      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
      "pshufd      $0x44,%%xmm3,%%xmm3           \n"
      "pshuflw     $0x40,%%xmm4,%%xmm4           \n"
      "pshufd      $0x44,%%xmm4,%%xmm4           \n"
      "pxor        %%xmm5,%%xmm5                 \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "pslld       $0x18,%%xmm6                  \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "movdqu      (%0),%%xmm1                   \n"
      "punpckhbw   %%xmm5,%%xmm1                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "pmullw      %%xmm3,%%xmm0                 \n"
      "movdqu      (%0),%%xmm7                   \n"
      "pmullw      %%xmm3,%%xmm1                 \n"
      "pand        %%xmm6,%%xmm7                 \n"
      "paddw       %%xmm4,%%xmm0                 \n"
      "paddw       %%xmm4,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "por         %%xmm7,%%xmm0                 \n"
      "movdqu      %%xmm0,(%0)                   \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x4,%1                       \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),       // %0
        "+r"(width)           // %1
      : "r"(scale),           // %2
        "r"(interval_size),   // %3
        "r"(interval_offset)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
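
// Scalar sketch of the quantizer, illustration only: each channel is scaled
// into a bucket index with a 16.16 fixed-point multiply (the pmulhuw above),
// then mapped back to a representative value; alpha is preserved via the
// 0xff000000 mask. packuswb saturates in the SIMD path; this sketch assumes
// parameters chosen so results stay in range.
#if defined(LIBYUV_ROW_SCALAR_SKETCHES)
static void ARGBQuantizeRow_ScalarSketch(uint8_t* dst_argb,
                                         int scale,
                                         int interval_size,
                                         int interval_offset,
                                         int width) {
  for (int i = 0; i < width * 4; i += 4) {
    for (int c = 0; c < 3; ++c) {  // B, G, R; alpha (byte 3) untouched.
      int v = dst_argb[i + c];
      dst_argb[i + c] =
          (uint8_t)(((v * scale) >> 16) * interval_size + interval_offset);
    }
  }
}
#endif  // LIBYUV_ROW_SCALAR_SKETCHES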
#endif  // HAS_ARGBQUANTIZEROW_SSE2

#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
void ARGBShadeRow_SSE2(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  asm volatile(
      "movd        %3,%%xmm2                     \n"
      "punpcklbw   %%xmm2,%%xmm2                 \n"
      "punpcklqdq  %%xmm2,%%xmm2                 \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(value)       // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_ARGBSHADEROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(

      "pxor        %%xmm5,%%xmm5                 \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      (%1),%%xmm2                   \n"
      "lea         0x10(%1),%1                   \n"
      "movdqu      %%xmm0,%%xmm1                 \n"
      "movdqu      %%xmm2,%%xmm3                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "punpckhbw   %%xmm5,%%xmm3                 \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "pmulhuw     %%xmm3,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(

      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm1                   \n"
      "lea         0x20(%0),%0                   \n"
      "vmovdqu     (%1),%%ymm3                   \n"
      "lea         0x20(%1),%1                   \n"
      "vpunpcklbw  %%ymm1,%%ymm1,%%ymm0          \n"
      "vpunpckhbw  %%ymm1,%%ymm1,%%ymm1          \n"
      "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
      "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_ARGBMULTIPLYROW_AVX2

#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBAddRow_SSE2(const uint8_t* src_argb,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "lea         0x10(%1),%1                   \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBADDROW_SSE2

#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_AVX2(const uint8_t* src_argb,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "lea         0x20(%0),%0                   \n"
      "vpaddusb    (%1),%%ymm0,%%ymm0            \n"
      "lea         0x20(%1),%1                   \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0");
}
#endif  // HAS_ARGBADDROW_AVX2

#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "lea         0x10(%1),%1                   \n"
      "psubusb     %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2

#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "lea         0x20(%0),%0                   \n"
      "vpsubusb    (%1),%%ymm0,%%ymm0            \n"
      "lea         0x20(%1),%1                   \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0");
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
void SobelXRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "sub         %0,%3                         \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "movq        0x2(%0),%%xmm1                \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "psubw       %%xmm1,%%xmm0                 \n"
      "movq        0x00(%0,%1,1),%%xmm1          \n"
      "movq        0x02(%0,%1,1),%%xmm2          \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "psubw       %%xmm2,%%xmm1                 \n"
      "movq        0x00(%0,%2,1),%%xmm2          \n"
      "movq        0x02(%0,%2,1),%%xmm3          \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "punpcklbw   %%xmm5,%%xmm3                 \n"
      "psubw       %%xmm3,%%xmm2                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm1,%%xmm0                 \n"
      "paddw       %%xmm1,%%xmm0                 \n"
      "pxor        %%xmm1,%%xmm1                 \n"
      "psubw       %%xmm0,%%xmm1                 \n"
      "pmaxsw      %%xmm1,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,0x00(%0,%3,1)          \n"
      "lea         0x8(%0),%0                    \n"
      "sub         $0x8,%4                       \n"
      "jg          1b                            \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(src_y2),      // %2
        "+r"(dst_sobelx),  // %3
        "+r"(width)        // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
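
// Scalar sketch of the SobelX kernel above, illustration only: column
// differences two pixels apart, weighted 1-2-1 down the three rows, then
// absolute value (the pmaxsw against the negation) and saturation. Like the
// SIMD version, it reads two pixels past width.
#if defined(LIBYUV_ROW_SCALAR_SKETCHES)
static void SobelXRow_ScalarSketch(const uint8_t* src_y0,
                                   const uint8_t* src_y1,
                                   const uint8_t* src_y2,
                                   uint8_t* dst_sobelx,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    int a = src_y0[x] - src_y0[x + 2];
    int b = src_y1[x] - src_y1[x + 2];
    int c = src_y2[x] - src_y2[x + 2];
    int sobel = a + b * 2 + c;
    if (sobel < 0) sobel = -sobel;
    dst_sobelx[x] = (uint8_t)(sobel > 255 ? 255 : sobel);
  }
}
#endif  // LIBYUV_ROW_SCALAR_SKETCHES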
#endif  // HAS_SOBELXROW_SSE2

#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
void SobelYRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
                    int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "movq        0x00(%0,%1,1),%%xmm1          \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "psubw       %%xmm1,%%xmm0                 \n"
      "movq        0x1(%0),%%xmm1                \n"
      "movq        0x01(%0,%1,1),%%xmm2          \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "psubw       %%xmm2,%%xmm1                 \n"
      "movq        0x2(%0),%%xmm2                \n"
      "movq        0x02(%0,%1,1),%%xmm3          \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "punpcklbw   %%xmm5,%%xmm3                 \n"
      "psubw       %%xmm3,%%xmm2                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm1,%%xmm0                 \n"
      "paddw       %%xmm1,%%xmm0                 \n"
      "pxor        %%xmm1,%%xmm1                 \n"
      "psubw       %%xmm0,%%xmm1                 \n"
      "pmaxsw      %%xmm1,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,0x00(%0,%2,1)          \n"
      "lea         0x8(%0),%0                    \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(dst_sobely),  // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SOBELYROW_SSE2

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_SSE2(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0x18,%%xmm5                  \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpcklbw   %%xmm0,%%xmm2                 \n"
      "punpckhbw   %%xmm0,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "punpcklwd   %%xmm2,%%xmm1                 \n"
      "punpckhwd   %%xmm2,%%xmm2                 \n"
      "por         %%xmm5,%%xmm1                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm3                 \n"
      "punpcklwd   %%xmm0,%%xmm3                 \n"
      "punpckhwd   %%xmm0,%%xmm0                 \n"
      "por         %%xmm5,%%xmm3                 \n"
      "por         %%xmm5,%%xmm0                 \n"
      "movdqu      %%xmm1,(%2)                   \n"
      "movdqu      %%xmm2,0x10(%2)               \n"
      "movdqu      %%xmm3,0x20(%2)               \n"
      "movdqu      %%xmm0,0x30(%2)               \n"
      "lea         0x40(%2),%2                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
7868 #endif  // HAS_SOBELROW_SSE2
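
// Illustrative scalar sketch (editorial, not part of libyuv; the _Ref name
// is hypothetical). paddusb saturates the X+Y sum to 255; the unpack ladder
// above then broadcasts it to B, G and R while xmm5 (0xFF000000 lanes)
// forces A = 255.
static inline void SobelRow_Ref(const uint8_t* src_sobelx,
                                const uint8_t* src_sobely,
                                uint8_t* dst_argb,
                                int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    if (s > 255) s = 255;              // paddusb saturation
    dst_argb[i * 4 + 0] = (uint8_t)s;  // B
    dst_argb[i * 4 + 1] = (uint8_t)s;  // G
    dst_argb[i * 4 + 2] = (uint8_t)s;  // R
    dst_argb[i * 4 + 3] = 255u;        // A
  }
}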

#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
                          int width) {
  asm volatile(
      "sub         %0,%1                         \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_y),       // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_SOBELTOPLANEROW_SSE2
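
// Illustrative scalar sketch (editorial, not part of libyuv; hypothetical
// name): the plane variant is just the saturating add, one byte per pixel.
static inline void SobelToPlaneRow_Ref(const uint8_t* src_sobelx,
                                       const uint8_t* src_sobely,
                                       uint8_t* dst_y,
                                       int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    dst_y[i] = (uint8_t)(s > 255 ? 255 : s);  // paddusb saturation
  }
}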

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_SSE2(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "paddusb     %%xmm1,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm3                 \n"
      "punpcklbw   %%xmm5,%%xmm3                 \n"
      "punpckhbw   %%xmm5,%%xmm0                 \n"
      "movdqa      %%xmm1,%%xmm4                 \n"
      "punpcklbw   %%xmm2,%%xmm4                 \n"
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "movdqa      %%xmm4,%%xmm6                 \n"
      "punpcklwd   %%xmm3,%%xmm6                 \n"
      "punpckhwd   %%xmm3,%%xmm4                 \n"
      "movdqa      %%xmm1,%%xmm7                 \n"
      "punpcklwd   %%xmm0,%%xmm7                 \n"
      "punpckhwd   %%xmm0,%%xmm1                 \n"
      "movdqu      %%xmm6,(%2)                   \n"
      "movdqu      %%xmm4,0x10(%2)               \n"
      "movdqu      %%xmm7,0x20(%2)               \n"
      "movdqu      %%xmm1,0x30(%2)               \n"
      "lea         0x40(%2),%2                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_SOBELXYROW_SSE2
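
// Illustrative scalar sketch (editorial, not part of libyuv; the _Ref name
// is hypothetical) of the channel mix performed by the interleave above.
static inline void SobelXYRow_Ref(const uint8_t* src_sobelx,
                                  const uint8_t* src_sobely,
                                  uint8_t* dst_argb,
                                  int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int sx = src_sobelx[i];
    int sy = src_sobely[i];
    int s = sx + sy;
    if (s > 255) s = 255;               // paddusb saturation
    dst_argb[i * 4 + 0] = (uint8_t)sy;  // B = Sobel Y
    dst_argb[i * 4 + 1] = (uint8_t)s;   // G = Sobel
    dst_argb[i * 4 + 2] = (uint8_t)sx;  // R = Sobel X
    dst_argb[i * 4 + 3] = 255u;         // A
  }
}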

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
                                  int32_t* cumsum,
                                  const int32_t* previous_cumsum,
                                  int width) {
  asm volatile(
      "pxor        %%xmm0,%%xmm0                 \n"
      "pxor        %%xmm1,%%xmm1                 \n"
      "sub         $0x4,%3                       \n"
      "jl          49f                           \n"
      "test        $0xf,%1                       \n"
      "jne         49f                           \n"

      // 4 pixel loop.
      LABELALIGN
      "40:                                       \n"
      "movdqu      (%0),%%xmm2                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm2,%%xmm4                 \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "punpcklwd   %%xmm1,%%xmm2                 \n"
      "punpckhwd   %%xmm1,%%xmm3                 \n"
      "punpckhbw   %%xmm1,%%xmm4                 \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "punpcklwd   %%xmm1,%%xmm4                 \n"
      "punpckhwd   %%xmm1,%%xmm5                 \n"
      "paddd       %%xmm2,%%xmm0                 \n"
      "movdqu      (%2),%%xmm2                   \n"
      "paddd       %%xmm0,%%xmm2                 \n"
      "paddd       %%xmm3,%%xmm0                 \n"
      "movdqu      0x10(%2),%%xmm3               \n"
      "paddd       %%xmm0,%%xmm3                 \n"
      "paddd       %%xmm4,%%xmm0                 \n"
      "movdqu      0x20(%2),%%xmm4               \n"
      "paddd       %%xmm0,%%xmm4                 \n"
      "paddd       %%xmm5,%%xmm0                 \n"
      "movdqu      0x30(%2),%%xmm5               \n"
      "lea         0x40(%2),%2                   \n"
      "paddd       %%xmm0,%%xmm5                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "movdqu      %%xmm3,0x10(%1)               \n"
      "movdqu      %%xmm4,0x20(%1)               \n"
      "movdqu      %%xmm5,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x4,%3                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%3                       \n"
      "jl          19f                           \n"

      // 1 pixel loop.
      LABELALIGN
      "10:                                       \n"
      "movd        (%0),%%xmm2                   \n"
      "lea         0x4(%0),%0                    \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"
      "punpcklwd   %%xmm1,%%xmm2                 \n"
      "paddd       %%xmm2,%%xmm0                 \n"
      "movdqu      (%2),%%xmm2                   \n"
      "lea         0x10(%2),%2                   \n"
      "paddd       %%xmm0,%%xmm2                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x1,%3                       \n"
      "jge         10b                           \n"

      "19:                                       \n"
      : "+r"(row),              // %0
        "+r"(cumsum),           // %1
        "+r"(previous_cumsum),  // %2
        "+r"(width)             // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
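
// Illustrative scalar sketch (editorial, not part of libyuv; hypothetical
// name): four int32 channels per ARGB pixel. A running sum across this row
// is added to the previous row's cumulative sums, building an integral
// image one row at a time.
static inline void ComputeCumulativeSumRow_Ref(const uint8_t* row,
                                               int32_t* cumsum,
                                               const int32_t* previous_cumsum,
                                               int width) {
  int32_t row_sum[4] = {0, 0, 0, 0};
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      row_sum[c] += row[x * 4 + c];  // sum of this row, left to here
      cumsum[x * 4 + c] = row_sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}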

#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
                                    const int32_t* botleft,
                                    int width,
                                    int area,
                                    uint8_t* dst,
                                    int count) {
  asm volatile(
      "movd        %5,%%xmm5                     \n"
      "cvtdq2ps    %%xmm5,%%xmm5                 \n"
      "rcpss       %%xmm5,%%xmm4                 \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "sub         $0x4,%3                       \n"
      "jl          49f                           \n"
      "cmpl        $0x80,%5                      \n"
      "ja          40f                           \n"

      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "psrld       $0x10,%%xmm6                  \n"
      "cvtdq2ps    %%xmm6,%%xmm6                 \n"
      "addps       %%xmm6,%%xmm5                 \n"
      "mulps       %%xmm4,%%xmm5                 \n"
      "cvtps2dq    %%xmm5,%%xmm5                 \n"
      "packssdw    %%xmm5,%%xmm5                 \n"

      // 4 pixel small loop.
      LABELALIGN
      "4:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"
      "psubd       0x10(%0,%4,4),%%xmm1          \n"
      "psubd       0x20(%0,%4,4),%%xmm2          \n"
      "psubd       0x30(%0,%4,4),%%xmm3          \n"
      "lea         0x40(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"
      "psubd       0x10(%1),%%xmm1               \n"
      "psubd       0x20(%1),%%xmm2               \n"
      "psubd       0x30(%1),%%xmm3               \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"
      "paddd       0x10(%1,%4,4),%%xmm1          \n"
      "paddd       0x20(%1,%4,4),%%xmm2          \n"
      "paddd       0x30(%1,%4,4),%%xmm3          \n"
      "lea         0x40(%1),%1                   \n"
      "packssdw    %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "pmulhuw     %%xmm5,%%xmm0                 \n"
      "pmulhuw     %%xmm5,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         4b                            \n"
      "jmp         49f                           \n"

      // 4 pixel loop
      LABELALIGN
      "40:                                       \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"
      "psubd       0x10(%0,%4,4),%%xmm1          \n"
      "psubd       0x20(%0,%4,4),%%xmm2          \n"
      "psubd       0x30(%0,%4,4),%%xmm3          \n"
      "lea         0x40(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"
      "psubd       0x10(%1),%%xmm1               \n"
      "psubd       0x20(%1),%%xmm2               \n"
      "psubd       0x30(%1),%%xmm3               \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"
      "paddd       0x10(%1,%4,4),%%xmm1          \n"
      "paddd       0x20(%1,%4,4),%%xmm2          \n"
      "paddd       0x30(%1,%4,4),%%xmm3          \n"
      "lea         0x40(%1),%1                   \n"
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
      "cvtdq2ps    %%xmm1,%%xmm1                 \n"
      "mulps       %%xmm4,%%xmm0                 \n"
      "mulps       %%xmm4,%%xmm1                 \n"
      "cvtdq2ps    %%xmm2,%%xmm2                 \n"
      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
      "mulps       %%xmm4,%%xmm2                 \n"
      "mulps       %%xmm4,%%xmm3                 \n"
      "cvtps2dq    %%xmm0,%%xmm0                 \n"
      "cvtps2dq    %%xmm1,%%xmm1                 \n"
      "cvtps2dq    %%xmm2,%%xmm2                 \n"
      "cvtps2dq    %%xmm3,%%xmm3                 \n"
      "packssdw    %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%3                       \n"
      "jl          19f                           \n"

      // 1 pixel loop
      LABELALIGN
      "10:                                       \n"
      "movdqu      (%0),%%xmm0                   \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"
      "lea         0x10(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"
      "lea         0x10(%1),%1                   \n"
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
      "mulps       %%xmm4,%%xmm0                 \n"
      "cvtps2dq    %%xmm0,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movd        %%xmm0,(%2)                   \n"
      "lea         0x4(%2),%2                    \n"
      "sub         $0x1,%3                       \n"
      "jge         10b                           \n"
      "19:                                       \n"
      : "+r"(topleft),           // %0
        "+r"(botleft),           // %1
        "+r"(dst),               // %2
        "+rm"(count)             // %3
      : "r"((intptr_t)(width)),  // %4
        "rm"(area)               // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
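
// Illustrative scalar sketch (editorial, not part of libyuv; hypothetical
// name): the loops above evaluate a box average from the integral image.
// Following the asm, `width` here is an int32 element offset to the right
// edge of the box (4 elements per ARGB pixel). The small-area path (area
// <= 128) instead folds the reciprocal into a 16-bit fixed-point multiplier
// used with pmulhuw; the sketch truncates where cvtps2dq rounds.
static inline void CumulativeSumToAverageRow_Ref(const int32_t* topleft,
                                                 const int32_t* botleft,
                                                 int width,
                                                 int area,
                                                 uint8_t* dst,
                                                 int count) {
  const float ooa = 1.0f / (float)area;  // reciprocal of box area, as rcpss
  int i, c;
  for (i = 0; i < count; ++i) {
    for (c = 0; c < 4; ++c) {
      // Classic integral-image box sum: TL - TR - BL + BR.
      int32_t sum =
          topleft[c] - topleft[width + c] - botleft[c] + botleft[width + c];
      dst[c] = (uint8_t)((float)sum * ooa);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}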

#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8_t* src_argb,
                        int src_argb_stride,
                        uint8_t* dst_argb,
                        const float* src_dudv,
                        int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile(
      "movq        (%3),%%xmm2                   \n"
      "movq        0x08(%3),%%xmm7               \n"
      "shl         $0x10,%1                      \n"
      "add         $0x4,%1                       \n"
      "movd        %1,%%xmm5                     \n"
      "sub         $0x4,%4                       \n"
      "jl          49f                           \n"

      "pshufd      $0x44,%%xmm7,%%xmm7           \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "movdqa      %%xmm2,%%xmm0                 \n"
      "addps       %%xmm7,%%xmm0                 \n"
      "movlhps     %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm7,%%xmm4                 \n"
      "addps       %%xmm4,%%xmm4                 \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "addps       %%xmm4,%%xmm3                 \n"
      "addps       %%xmm4,%%xmm4                 \n"

      // 4 pixel loop
      LABELALIGN
      "40:                                       \n"
      "cvttps2dq   %%xmm2,%%xmm0                 \n"  // x,y float->int first 2
      "cvttps2dq   %%xmm3,%%xmm1                 \n"  // x,y float->int next 2
      "packssdw    %%xmm1,%%xmm0                 \n"  // x, y as 8 shorts
      "pmaddwd     %%xmm5,%%xmm0                 \n"  // off = x*4 + y*stride
      "movd        %%xmm0,%k1                    \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
      "movd        %%xmm0,%k5                    \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
      "movd        0x00(%0,%1,1),%%xmm1          \n"
      "movd        0x00(%0,%5,1),%%xmm6          \n"
      "punpckldq   %%xmm6,%%xmm1                 \n"
      "addps       %%xmm4,%%xmm2                 \n"
      "movq        %%xmm1,(%2)                   \n"
      "movd        %%xmm0,%k1                    \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
      "movd        %%xmm0,%k5                    \n"
      "movd        0x00(%0,%1,1),%%xmm0          \n"
      "movd        0x00(%0,%5,1),%%xmm6          \n"
      "punpckldq   %%xmm6,%%xmm0                 \n"
      "addps       %%xmm4,%%xmm3                 \n"
      "movq        %%xmm0,0x08(%2)               \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%4                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%4                       \n"
      "jl          19f                           \n"

      // 1 pixel loop
      LABELALIGN
      "10:                                       \n"
      "cvttps2dq   %%xmm2,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "pmaddwd     %%xmm5,%%xmm0                 \n"
      "addps       %%xmm7,%%xmm2                 \n"
      "movd        %%xmm0,%k1                    \n"
      "movd        0x00(%0,%1,1),%%xmm0          \n"
      "movd        %%xmm0,(%2)                   \n"
      "lea         0x04(%2),%2                   \n"
      "sub         $0x1,%4                       \n"
      "jge         10b                           \n"
      "19:                                       \n"
      : "+r"(src_argb),              // %0
        "+r"(src_argb_stride_temp),  // %1
        "+r"(dst_argb),              // %2
        "+r"(src_dudv),              // %3
        "+rm"(width),                // %4
        "=&r"(temp)                  // %5
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBAFFINEROW_SSE2
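
// Illustrative scalar sketch (editorial, not part of libyuv; hypothetical
// name) of the affine copy above: (u,v) starts at src_dudv[0..1] and steps
// by src_dudv[2..3]; cvttps2dq truncates toward zero, and x*4 + y*stride
// is the pmaddwd offset computed in the loop.
static inline void ARGBAffineRow_Ref(const uint8_t* src_argb,
                                     int src_argb_stride,
                                     uint8_t* dst_argb,
                                     const float* src_dudv,
                                     int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)u;  // truncate, like cvttps2dq
    int y = (int)v;
    const uint8_t* s = src_argb + x * 4 + y * (intptr_t)src_argb_stride;
    uint8_t* d = dst_argb + i * 4;
    d[0] = s[0];  // copy one 4-byte ARGB pixel
    d[1] = s[1];
    d[2] = s[2];
    d[3] = s[3];
    u += src_dudv[2];
    v += src_dudv[3];
  }
}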

#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSSE3(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int width,
                          int source_y_fraction) {
  asm volatile(
      "sub         %1,%0                         \n"
      "cmp         $0x0,%3                       \n"
      "je          100f                          \n"
      "cmp         $0x80,%3                      \n"
      "je          50f                           \n"

      "movd        %3,%%xmm0                     \n"
      "neg         %3                            \n"
      "add         $0x100,%3                     \n"
      "movd        %3,%%xmm5                     \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"
      "punpcklwd   %%xmm5,%%xmm5                 \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "mov         $0x80808080,%%eax             \n"
      "movd        %%eax,%%xmm4                  \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"

      // General purpose row blend.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%1),%%xmm0                   \n"
      "movdqu      0x00(%1,%4,1),%%xmm2          \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "psubb       %%xmm4,%%xmm0                 \n"
      "psubb       %%xmm4,%%xmm1                 \n"
      "movdqa      %%xmm5,%%xmm2                 \n"
      "movdqa      %%xmm5,%%xmm3                 \n"
      "pmaddubsw   %%xmm0,%%xmm2                 \n"
      "pmaddubsw   %%xmm1,%%xmm3                 \n"
      "paddw       %%xmm4,%%xmm2                 \n"
      "paddw       %%xmm4,%%xmm3                 \n"
      "psrlw       $0x8,%%xmm2                   \n"
      "psrlw       $0x8,%%xmm3                   \n"
      "packuswb    %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm2,0x00(%1,%0,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "jmp         99f                           \n"

      // Blend 50 / 50.
      LABELALIGN
      "50:                                       \n"
      "movdqu      (%1),%%xmm0                   \n"
      "movdqu      0x00(%1,%4,1),%%xmm1          \n"
      "pavgb       %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          50b                           \n"
      "jmp         99f                           \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100:                                      \n"
      "movdqu      (%1),%%xmm0                   \n"
      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          100b                          \n"

      "99:                                       \n"
      : "+r"(dst_ptr),               // %0
        "+r"(src_ptr),               // %1
        "+rm"(width),                // %2
        "+r"(source_y_fraction)      // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_INTERPOLATEROW_SSSE3
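
// Illustrative scalar sketch (editorial, not part of libyuv; hypothetical
// name): per byte the blend above computes
//   (src * (256 - f) + src_next_row * f + 128) >> 8.
// The 0x80 bias dance (psubb/paddw) exists because pmaddubsw needs one
// signed operand; the bias cancels exactly. f == 0 copies the row and
// f == 128 uses pavgb as fast paths.
static inline void InterpolateRow_Ref(uint8_t* dst_ptr,
                                      const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      int width,
                                      int source_y_fraction) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[x] * (256 - source_y_fraction) +
                            src_ptr1[x] * source_y_fraction + 128) >>
                           8);
  }
}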

#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
void InterpolateRow_AVX2(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         int width,
                         int source_y_fraction) {
  asm volatile(
      "sub         %1,%0                         \n"
      "cmp         $0x0,%3                       \n"
      "je          100f                          \n"
      "cmp         $0x80,%3                      \n"
      "je          50f                           \n"

      "vmovd       %3,%%xmm0                     \n"
      "neg         %3                            \n"
      "add         $0x100,%3                     \n"
      "vmovd       %3,%%xmm5                     \n"
      "vpunpcklbw  %%xmm0,%%xmm5,%%xmm5          \n"
      "vpunpcklwd  %%xmm5,%%xmm5,%%xmm5          \n"
      "vbroadcastss %%xmm5,%%ymm5                \n"
      "mov         $0x80808080,%%eax             \n"
      "vmovd       %%eax,%%xmm4                  \n"
      "vbroadcastss %%xmm4,%%ymm4                \n"

      // General purpose row blend.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%1),%%ymm0                   \n"
      "vmovdqu     0x00(%1,%4,1),%%ymm2          \n"
      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm1          \n"
      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm0          \n"
      "vpsubb      %%ymm4,%%ymm1,%%ymm1          \n"
      "vpsubb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm1,%%ymm5,%%ymm1          \n"
      "vpmaddubsw  %%ymm0,%%ymm5,%%ymm0          \n"
      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"
      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "jmp         99f                           \n"

      // Blend 50 / 50.
      LABELALIGN
      "50:                                       \n"
      "vmovdqu     (%1),%%ymm0                   \n"
      "vpavgb      0x00(%1,%4,1),%%ymm0,%%ymm0   \n"
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          50b                           \n"
      "jmp         99f                           \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100:                                      \n"
      "vmovdqu     (%1),%%ymm0                   \n"
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          100b                          \n"

      "99:                                       \n"
      "vzeroupper                                \n"
      : "+r"(dst_ptr),               // %0
        "+r"(src_ptr),               // %1
        "+r"(width),                 // %2
        "+r"(source_y_fraction)      // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
}
#endif  // HAS_INTERPOLATEROW_AVX2

#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
  asm volatile(

      "movdqu      (%3),%%xmm5                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pshufb      %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_ARGBSHUFFLEROW_SSSE3
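
// Illustrative scalar sketch (editorial, not part of libyuv; hypothetical
// name): the 16-byte shuffler applies the same 4-byte pattern to each
// pixel in the register, so per pixel only the low 2 bits of the first
// four shuffler bytes matter.
static inline void ARGBShuffleRow_Ref(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      const uint8_t* shuffler,
                                      int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint8_t b = src_argb[shuffler[0] & 3];  // per-pixel channel index
    uint8_t g = src_argb[shuffler[1] & 3];
    uint8_t r = src_argb[shuffler[2] & 3];
    uint8_t a = src_argb[shuffler[3] & 3];
    dst_argb[0] = b;
    dst_argb[1] = g;
    dst_argb[2] = r;
    dst_argb[3] = a;
    src_argb += 4;
    dst_argb += 4;
  }
}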

#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile(

      "vbroadcastf128 (%3),%%ymm5                \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

#ifdef HAS_I422TOYUY2ROW_SSE2
void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%1),%%xmm2                   \n"
      "movq        0x00(%1,%2,1),%%xmm1          \n"
      "add         $0x8,%1                       \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"
      "movdqu      (%0),%%xmm0                   \n"
      "add         $0x10,%0                      \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "movdqu      %%xmm0,(%3)                   \n"
      "movdqu      %%xmm1,0x10(%3)               \n"
      "lea         0x20(%3),%3                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOYUY2ROW_SSE2
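
// Illustrative scalar sketch (editorial, not part of libyuv; hypothetical
// name): YUY2 packs 4:2:2 as Y0 U Y1 V, one U/V pair per two Y samples.
// The sketch assumes an even width; odd tails are handled elsewhere.
static inline void I422ToYUY2Row_Ref(const uint8_t* src_y,
                                     const uint8_t* src_u,
                                     const uint8_t* src_v,
                                     uint8_t* dst_yuy2,
                                     int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_yuy2[0] = src_y[0];
    dst_yuy2[1] = src_u[0];
    dst_yuy2[2] = src_y[1];
    dst_yuy2[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_yuy2 += 4;
  }
}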

#ifdef HAS_I422TOUYVYROW_SSE2
void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%1),%%xmm2                   \n"
      "movq        0x00(%1,%2,1),%%xmm1          \n"
      "add         $0x8,%1                       \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "add         $0x10,%0                      \n"
      "punpcklbw   %%xmm0,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm2                 \n"
      "movdqu      %%xmm1,(%3)                   \n"
      "movdqu      %%xmm2,0x10(%3)               \n"
      "lea         0x20(%3),%3                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_SSE2

#ifdef HAS_I422TOYUY2ROW_AVX2
void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vpmovzxbw   (%1),%%ymm1                   \n"
      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
      "add         $0x10,%1                      \n"
      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "add         $0x20,%0                      \n"
      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm1          \n"
      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm2          \n"
      "vextractf128 $0x0,%%ymm1,(%3)             \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
      "lea         0x40(%3),%3                   \n"
      "sub         $0x20,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOYUY2ROW_AVX2

#ifdef HAS_I422TOUYVYROW_AVX2
void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vpmovzxbw   (%1),%%ymm1                   \n"
      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
      "add         $0x10,%1                      \n"
      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "add         $0x20,%0                      \n"
      "vpunpcklbw  %%ymm0,%%ymm2,%%ymm1          \n"
      "vpunpckhbw  %%ymm0,%%ymm2,%%ymm2          \n"
      "vextractf128 $0x0,%%ymm1,(%3)             \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
      "lea         0x40(%3),%3                   \n"
      "sub         $0x20,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_AVX2

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(

      "pxor        %%xmm3,%%xmm3                 \n"

      // 2 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "lea         0x8(%0),%0                    \n"
      "punpcklbw   %%xmm3,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm4                 \n"
      "punpcklwd   %%xmm3,%%xmm0                 \n"
      "punpckhwd   %%xmm3,%%xmm4                 \n"
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
      "cvtdq2ps    %%xmm4,%%xmm4                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "mulps       0x10(%3),%%xmm0               \n"
      "mulps       0x10(%3),%%xmm4               \n"
      "addps       (%3),%%xmm0                   \n"
      "addps       (%3),%%xmm4                   \n"
      "movdqa      %%xmm1,%%xmm2                 \n"
      "movdqa      %%xmm5,%%xmm6                 \n"
      "mulps       %%xmm1,%%xmm2                 \n"
      "mulps       %%xmm5,%%xmm6                 \n"
      "mulps       %%xmm2,%%xmm1                 \n"
      "mulps       %%xmm6,%%xmm5                 \n"
      "mulps       0x20(%3),%%xmm2               \n"
      "mulps       0x20(%3),%%xmm6               \n"
      "mulps       0x30(%3),%%xmm1               \n"
      "mulps       0x30(%3),%%xmm5               \n"
      "addps       %%xmm2,%%xmm0                 \n"
      "addps       %%xmm6,%%xmm4                 \n"
      "addps       %%xmm1,%%xmm0                 \n"
      "addps       %%xmm5,%%xmm4                 \n"
      "cvttps2dq   %%xmm0,%%xmm0                 \n"
      "cvttps2dq   %%xmm4,%%xmm4                 \n"
      "packuswb    %%xmm4,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x2,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
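
// Illustrative scalar sketch (editorial, not part of libyuv; hypothetical
// name): per channel the code above evaluates C0 + C1*x + C2*x^2 + C3*x^3,
// where poly holds four 4-float coefficient rows (poly[0..3] = C0 for
// B,G,R,A, poly[4..7] = C1, and so on), then saturates to 0..255.
static inline void ARGBPolynomialRow_Ref(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         const float* poly,
                                         int width) {
  int i, c;
  for (i = 0; i < width; ++i) {
    for (c = 0; c < 4; ++c) {
      float x = (float)src_argb[i * 4 + c];
      float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                poly[c + 12] * x * x * x;
      if (v < 0.f) v = 0.f;      // pack lower clamp
      if (v > 255.f) v = 255.f;  // pack upper clamp
      dst_argb[i * 4 + c] = (uint8_t)v;
    }
  }
}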

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(
      "vbroadcastf128 (%3),%%ymm4                \n"
      "vbroadcastf128 0x10(%3),%%ymm5            \n"
      "vbroadcastf128 0x20(%3),%%ymm6            \n"
      "vbroadcastf128 0x30(%3),%%ymm7            \n"

      // 2 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vpmovzxbd   (%0),%%ymm0                   \n"  // 2 ARGB pixels
      "lea         0x8(%0),%0                    \n"
      "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
      "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
      "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X^3
      "vcvttps2dq  %%ymm0,%%ymm0                 \n"
      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
      "vmovq       %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x2,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_HALFFLOATROW_SSE2
static float kScaleBias = 1.9259299444e-34f;  // 2^(-112)
void HalfFloatRow_SSE2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "movd        %3,%%xmm4                     \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "pxor        %%xmm5,%%xmm5                 \n"
      "sub         %0,%1                         \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm2                   \n"  // 8 shorts
      "add         $0x10,%0                      \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "punpcklwd   %%xmm5,%%xmm2                 \n"  // 8 ints in xmm2/3
      "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 8 floats
      "punpckhwd   %%xmm5,%%xmm3                 \n"
      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
      "mulps       %%xmm4,%%xmm2                 \n"
      "mulps       %%xmm4,%%xmm3                 \n"
      "psrld       $0xd,%%xmm2                   \n"
      "psrld       $0xd,%%xmm3                   \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm2,-0x10(%0,%1,1)         \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(scale)   // %3
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_SSE2
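
// Editorial note (not from the original source): kScaleBias is 2^(-112).
// Multiplying by it rebiases the float32 exponent (bias 127) toward the
// float16 exponent (bias 15, a difference of 112), so that after the
// psrld $0xd above (>> 13, dropping 23 - 10 mantissa bits) the low 16 bits
// are the IEEE half-float encoding of value * scale. A scalar sketch of
// the trick (hypothetical name; truncates instead of rounding and assumes
// the scaled value stays in normal half range):
static inline uint16_t FloatToHalf_Ref(float value) {
  union {
    float f;
    uint32_t u;
  } bits;
  bits.f = value * 1.9259299444e-34f;  // * 2^(-112) exponent rebias
  return (uint16_t)(bits.u >> 13);     // drop 13 mantissa bits
}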

#ifdef HAS_HALFFLOATROW_AVX2
void HalfFloatRow_AVX2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "vbroadcastss %3, %%ymm4                   \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
      "sub         %0,%1                         \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm2                   \n"  // 16 shorts
      "add         $0x20,%0                      \n"
      "vpunpckhwd  %%ymm5,%%ymm2,%%ymm3          \n"  // mutates
      "vpunpcklwd  %%ymm5,%%ymm2,%%ymm2          \n"
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
      "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
      "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
      "vpsrld      $0xd,%%ymm3,%%ymm3            \n"
      "vpsrld      $0xd,%%ymm2,%%ymm2            \n"
      "vpackssdw   %%ymm3, %%ymm2, %%ymm2        \n"  // unmutates
      "vmovdqu     %%ymm2,-0x20(%0,%1,1)         \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_AVX2

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloatRow_F16C(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "vbroadcastss %3, %%ymm4                   \n"
      "sub         %0,%1                         \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
      "vpmovzxwd   0x10(%0),%%ymm3               \n"
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
      "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
      "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
      "add         $0x20,%0                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
      "vpmovzxwd   0x10(%0),%%ymm3               \n"
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
      "add         $0x20,%0                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm2", "xmm3");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8_t* dst_argb,
                           const uint8_t* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movzb       (%0),%1                       \n"
      "lea         0x4(%0),%0                    \n"
      "movzb       0x00(%3,%1,4),%1              \n"
      "mov         %b1,-0x4(%0)                  \n"
      "movzb       -0x3(%0),%1                   \n"
      "movzb       0x01(%3,%1,4),%1              \n"
      "mov         %b1,-0x3(%0)                  \n"
      "movzb       -0x2(%0),%1                   \n"
      "movzb       0x02(%3,%1,4),%1              \n"
      "mov         %b1,-0x2(%0)                  \n"
      "movzb       -0x1(%0),%1                   \n"
      "movzb       0x03(%3,%1,4),%1              \n"
      "mov         %b1,-0x1(%0)                  \n"
      "dec         %2                            \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86
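
// Illustrative scalar sketch (editorial, not part of libyuv; hypothetical
// name): the table is 256 entries of 4 bytes, indexed per channel, so each
// channel effectively has its own 256-entry column.
static inline void ARGBColorTableRow_Ref(uint8_t* dst_argb,
                                         const uint8_t* table_argb,
                                         int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}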

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8_t* dst_argb,
                          const uint8_t* table_argb,
                          int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movzb       (%0),%1                       \n"
      "lea         0x4(%0),%0                    \n"
      "movzb       0x00(%3,%1,4),%1              \n"
      "mov         %b1,-0x4(%0)                  \n"
      "movzb       -0x3(%0),%1                   \n"
      "movzb       0x01(%3,%1,4),%1              \n"
      "mov         %b1,-0x3(%0)                  \n"
      "movzb       -0x2(%0),%1                   \n"
      "movzb       0x02(%3,%1,4),%1              \n"
      "mov         %b1,-0x2(%0)                  \n"
      "dec         %2                            \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int width,
                                 const uint8_t* luma,
                                 uint32_t lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile(
      "movd        %6,%%xmm3                     \n"
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psllw       $0x8,%%xmm4                   \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%2),%%xmm0                   \n"
      "pmaddubsw   %%xmm3,%%xmm0                 \n"
      "phaddw      %%xmm0,%%xmm0                 \n"
      "pand        %%xmm4,%%xmm0                 \n"
      "punpcklwd   %%xmm5,%%xmm0                 \n"
      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"

      "movzb       (%2),%0                       \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,(%3)                      \n"
      "movzb       0x1(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x1(%3)                   \n"
      "movzb       0x2(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x2(%3)                   \n"
      "movzb       0x3(%2),%0                    \n"
      "mov         %b0,0x3(%3)                   \n"

      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"

      "movzb       0x4(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x4(%3)                   \n"
      "movzb       0x5(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x5(%3)                   \n"
      "movzb       0x6(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x6(%3)                   \n"
      "movzb       0x7(%2),%0                    \n"
      "mov         %b0,0x7(%3)                   \n"

      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"

      "movzb       0x8(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x8(%3)                   \n"
      "movzb       0x9(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x9(%3)                   \n"
      "movzb       0xa(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xa(%3)                   \n"
      "movzb       0xb(%2),%0                    \n"
      "mov         %b0,0xb(%3)                   \n"

      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"

      "movzb       0xc(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xc(%3)                   \n"
      "movzb       0xd(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xd(%3)                   \n"
      "movzb       0xe(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xe(%3)                   \n"
      "movzb       0xf(%2),%0                    \n"
      "mov         %b0,0xf(%3)                   \n"
      "lea         0x10(%2),%2                   \n"
      "lea         0x10(%3),%3                   \n"
      "sub         $0x4,%4                       \n"
      "jg          1b                            \n"
      : "=&d"(pixel_temp),  // %0
        "=&a"(table_temp),  // %1
        "+r"(src_argb),     // %2
        "+r"(dst_argb),     // %3
        "+rm"(width)        // %4
      : "r"(luma),          // %5
        "rm"(lumacoeff)     // %6
      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
9028 
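// The lookup above can be read as this scalar sketch (an illustrative
// assumption, not the upstream C fallback): pmaddubsw and phaddw form a
// weighted sum of the B, G and R bytes, and the pand with 0xff00 words
// keeps only the high byte of that sum as the offset of a 256-byte row
// in |luma|. B, G and R are remapped through that row; alpha is copied.
static void ARGBLumaColorTableRow_Sketch(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width,
                                         const uint8_t* luma,
                                         uint32_t lumacoeff) {
  const uint32_t bc = lumacoeff & 0xffu;          // coefficient for B
  const uint32_t gc = (lumacoeff >> 8) & 0xffu;   // coefficient for G
  const uint32_t rc = (lumacoeff >> 16) & 0xffu;  // coefficient for R
  int i;
  for (i = 0; i < width; ++i) {
    const uint32_t luma_sum =
        src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc;
    const uint8_t* table = luma + (luma_sum & 0xff00u);  // 256-byte row
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha passes through
    src_argb += 4;
    dst_argb += 4;
  }
}
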
static const uvec8 kYUV24Shuffle[3] = {
    {8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12},
    {9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15},
    {2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7}};

// Convert biplanar NV21 to packed YUV24.
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory.
void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
                          const uint8_t* src_vu,
                          uint8_t* dst_yuv24,
                          int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      "movdqa      (%4),%%xmm4                   \n"  // 3 shuffler constants
      "movdqa      16(%4),%%xmm5                 \n"
      "movdqa      32(%4),%%xmm6                 \n"
      "1:                                        \n"
      "movdqu      (%0),%%xmm2                   \n"  // load 16 Y values
      "movdqu      (%0,%1),%%xmm3                \n"  // load 8 VU values
      "lea         16(%0),%0                     \n"
      "movdqa      %%xmm2,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "shufps      $0x44,%%xmm3,%%xmm0           \n"  // Y 0..7,  UV 0..3
      "shufps      $0x99,%%xmm3,%%xmm1           \n"  // Y 4..11, UV 2..5
      "shufps      $0xee,%%xmm3,%%xmm2           \n"  // Y 8..15, UV 4..7
      "pshufb      %%xmm4,%%xmm0                 \n"  // weave into YUV24
      "pshufb      %%xmm5,%%xmm1                 \n"
      "pshufb      %%xmm6,%%xmm2                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "movdqu      %%xmm1,16(%2)                 \n"
      "movdqu      %%xmm2,32(%2)                 \n"
      "lea         48(%2),%2                     \n"
      "sub         $16,%3                        \n"  // 16 pixels per loop
      "jg          1b                            \n"
      : "+r"(src_y),            // %0
        "+r"(src_vu),           // %1
        "+r"(dst_yuv24),        // %2
        "+r"(width)             // %3
      : "r"(&kYUV24Shuffle[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
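
// A scalar sketch of the repack above (an illustrative assumption, not
// the upstream C fallback): each output pixel is 3 bytes in VUY order,
// and one VU pair is shared by two horizontally adjacent Y samples.
static void NV21ToYUV24Row_Sketch(const uint8_t* src_y,
                                  const uint8_t* src_vu,
                                  uint8_t* dst_yuv24,
                                  int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_yuv24[0] = src_vu[i & ~1];        // V
    dst_yuv24[1] = src_vu[(i & ~1) + 1];  // U
    dst_yuv24[2] = src_y[i];              // Y
    dst_yuv24 += 3;
  }
}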

// Convert biplanar NV21 to packed YUV24.
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory.
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
                         int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      "vbroadcastf128 (%4),%%ymm4                \n"  // 3 shuffler constants
      "vbroadcastf128 16(%4),%%ymm5              \n"
      "vbroadcastf128 32(%4),%%ymm6              \n"

      "1:                                        \n"
      "vmovdqu     (%0),%%ymm2                   \n"  // load 32 Y values
      "vmovdqu     (%0,%1),%%ymm3                \n"  // load 16 VU values
      "lea         32(%0),%0                     \n"
      "vshufps     $0x44,%%ymm3,%%ymm2,%%ymm0    \n"  // Y 0..7,  UV 0..3
      "vshufps     $0x99,%%ymm3,%%ymm2,%%ymm1    \n"  // Y 4..11, UV 2..5
      "vshufps     $0xee,%%ymm3,%%ymm2,%%ymm2    \n"  // Y 8..15, UV 4..7
      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"  // weave into YUV24
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
      // Reorder the six 16 byte lanes into sequential output order.
      "vperm2i128  $0x20,%%ymm1,%%ymm0,%%ymm3    \n"  // output bytes 0..31
      "vperm2i128  $0x30,%%ymm0,%%ymm2,%%ymm0    \n"  // output bytes 32..63
      "vperm2i128  $0x31,%%ymm2,%%ymm1,%%ymm1    \n"  // output bytes 64..95
      "vmovdqu     %%ymm3,(%2)                   \n"
      "vmovdqu     %%ymm0,32(%2)                 \n"
      "vmovdqu     %%ymm1,64(%2)                 \n"
      "lea         96(%2),%2                     \n"
      "sub         $32,%3                        \n"  // 32 pixels per loop
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_y),            // %0
        "+r"(src_vu),           // %1
        "+r"(dst_yuv24),        // %2
        "+r"(width)             // %3
      : "r"(&kYUV24Shuffle[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_NV21ToYUV24ROW_AVX512
// The following AVX-512 VBMI VEX256 code tests okay with the Intel SDE
// emulator.
static const lvec8 kYUV24Perm[3] = {
    {32, 33, 0,  32, 33, 1,  34, 35, 2,  34, 35, 3,  36, 37, 4,  36,
     37, 5,  38, 39, 6,  38, 39, 7,  40, 41, 8,  40, 41, 9,  42, 43},
    {10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
     48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52},
    {53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
     26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31}};

void NV21ToYUV24Row_AVX512(const uint8_t* src_y,
                           const uint8_t* src_vu,
                           uint8_t* dst_yuv24,
                           int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      "vmovdqa     (%4),%%ymm4                   \n"  // 3 shuffler constants
      "vmovdqa     32(%4),%%ymm5                 \n"
      "vmovdqa     64(%4),%%ymm6                 \n" LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm2                   \n"  // load 32 Y values
      "vmovdqu     (%0,%1),%%ymm3                \n"  // load 16 VU values
      "lea         32(%0),%0                     \n"
      "vmovdqa     %%ymm2,%%ymm0                 \n"
      "vmovdqa     %%ymm2,%%ymm1                 \n"
      "vpermt2b    %%ymm3,%%ymm4,%%ymm0          \n"
      "vpermt2b    %%ymm3,%%ymm5,%%ymm1          \n"
      "vpermt2b    %%ymm3,%%ymm6,%%ymm2          \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "vmovdqu     %%ymm1,32(%2)                 \n"
      "vmovdqu     %%ymm2,64(%2)                 \n"
      "lea         96(%2),%2                     \n"
      "sub         $32,%3                        \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_y),         // %0
        "+r"(src_vu),        // %1
        "+r"(dst_yuv24),     // %2
        "+r"(width)          // %3
      : "r"(&kYUV24Perm[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
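
// vpermt2b above is a two-source byte permute: index bytes 0..31 select
// from the Y register, 32..63 from the VU register, so one instruction
// does the whole interleave. A scalar model of a single 32-byte permute
// (an illustrative assumption, not part of the library):
static void Permt2bModel(const uint8_t y[32],
                         const uint8_t vu[32],
                         const uint8_t perm[32],
                         uint8_t out[32]) {
  int i;
  for (i = 0; i < 32; ++i) {
    out[i] = (perm[i] < 32) ? y[perm[i]] : vu[perm[i] - 32];
  }
}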

#endif  // HAS_NV21ToYUV24ROW_AVX512

#ifdef HAS_SWAPUVROW_SSSE3

// Shuffle table for swapping each UV byte pair to VU.
static const uvec8 kShuffleUVToVU = {1u, 0u, 3u,  2u,  5u,  4u,  7u,  6u,
                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};

// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      "movdqu      %3,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pshufb      %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_uv),        // %0
        "+r"(dst_vu),        // %1
        "+r"(width)          // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_SWAPUVROW_SSSE3
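
// Scalar sketch of the swap (an illustrative assumption, not the
// upstream C fallback): each interleaved UV byte pair becomes VU.
static void SwapUVRow_Sketch(const uint8_t* src_uv,
                             uint8_t* dst_vu,
                             int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_vu[0] = src_uv[1];  // V first
    dst_vu[1] = src_uv[0];  // then U
    src_uv += 2;
    dst_vu += 2;
  }
}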

#ifdef HAS_SWAPUVROW_AVX2
void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uv),        // %0
        "+r"(dst_vu),        // %1
        "+r"(width)          // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_SWAPUVROW_AVX2

void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
                          int src_stride_u,
                          const uint8_t* src_v,
                          int src_stride_v,
                          uint8_t* dst_uv,
                          int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // load 16 U values
      "movdqu      (%1),%%xmm1                   \n"  // load 16 V values
      "movdqu      0(%0,%4,1),%%xmm2             \n"  // 16 from next row
      "movdqu      0(%1,%5,1),%%xmm3             \n"
      "lea         0x10(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"  // half size
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "lea         0x10(%1),%1                   \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      // Shift then average with zero rounds: result is (sum + 2) >> 2.
      "psrlw       $0x1,%%xmm0                   \n"
      "psrlw       $0x1,%%xmm1                   \n"
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "punpcklbw   %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"  // store 8 UV pixels
      "lea         0x10(%2),%2                   \n"
      "sub         $0x10,%3                      \n"  // 16 src pixels per loop
      "jg          1b                            \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void HalfMergeUVRow_AVX2(const uint8_t* src_u,
                         int src_stride_u,
                         const uint8_t* src_v,
                         int src_stride_v,
                         uint8_t* dst_uv,
                         int width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // load 32 U values
      "vmovdqu     (%1),%%ymm1                   \n"  // load 32 V values
      "vmovdqu     0(%0,%4,1),%%ymm2             \n"  // 32 from next row
      "vmovdqu     0(%1,%5,1),%%ymm3             \n"
      "lea         0x20(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // half size
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "lea         0x20(%1),%1                   \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%2)                   \n"  // store 16 UV pixels
      "lea         0x20(%2),%2                   \n"
      "sub         $0x20,%3                      \n"  // 32 src pixels per loop
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

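// Scalar sketch of the 2x2 down-sample above (an illustrative assumption,
// not the upstream C fallback): average each 2x2 block of the U and V
// planes with rounding and write one interleaved UV pair.
static void HalfMergeUVRow_Sketch(const uint8_t* src_u,
                                  int src_stride_u,
                                  const uint8_t* src_v,
                                  int src_stride_v,
                                  uint8_t* dst_uv,
                                  int width) {
  int i;
  for (i = 0; i < width; i += 2) {
    dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
                 src_u[src_stride_u + 1] + 2) >> 2;  // rounded U average
    dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
                 src_v[src_stride_v + 1] + 2) >> 2;  // rounded V average
    src_u += 2;
    src_v += 2;
    dst_uv += 2;
  }
}
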
void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
  asm volatile(
      "pxor        %%xmm1,%%xmm1                 \n"

      LABELALIGN
      "1:                                        \n"
      "movd        (%0),%%xmm0                   \n"  // load float
      "maxss       %%xmm1,%%xmm0                 \n"  // clamp to zero
      "add         $0x4,%0                       \n"
      "movd        %%xmm0,(%1)                   \n"  // store float
      "add         $0x4,%1                       \n"
      "sub         $0x1,%2                       \n"  // 1 float per loop
      "jg          1b                            \n"
      : "+r"(src_x),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
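
// Scalar sketch of the clamp (an illustrative assumption): maxss against
// a zero register maps negative values, and NaN inputs, to 0.0f.
static void ClampFloatToZero_Sketch(const float* src_x,
                                    float* dst_y,
                                    int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const float v = src_x[i];
    dst_y[i] = (v > 0.f) ? v : 0.f;  // NaN compares false and maps to 0.f
  }
}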

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif