/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include "libyuv/row.h"
12 #ifdef __cplusplus
13 namespace libyuv {
14 extern "C" {
15 #endif
16 
17 // This module is for GCC x86 and x64.
18 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
19 
20 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
21 
22 // Constants for ARGB
23 static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
24                                25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
25 
26 // JPeg full range.
27 static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
28                                 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
29 
30 static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u,
31                                 77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u};
32 
33 static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
34                                 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
35 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
36 
37 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
38 
39 static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
40                               112, -74, -38, 0, 112, -74, -38, 0};
41 
42 static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
43                                127, -84, -43, 0, 127, -84, -43, 0};
44 
45 static const vec8 kABGRToUJ = {-43, -84, 127, 0, -43, -84, 127, 0,
46                                -43, -84, 127, 0, -43, -84, 127, 0};
47 
48 static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
49                               -18, -94, 112, 0, -18, -94, 112, 0};
50 
51 static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
52                                -20, -107, 127, 0, -20, -107, 127, 0};
53 
54 static const vec8 kABGRToVJ = {127, -107, -20, 0, 127, -107, -20, 0,
55                                127, -107, -20, 0, 127, -107, -20, 0};
56 
57 // Constants for BGRA
58 static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
59                                0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
60 
61 static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
62                               0, -38, -74, 112, 0, -38, -74, 112};
63 
64 static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
65                               0, 112, -94, -18, 0, 112, -94, -18};
66 
67 // Constants for ABGR
68 static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
69                                66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
70 
71 static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
72                               -38, -74, 112, 0, -38, -74, 112, 0};
73 
74 static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
75                               112, -94, -18, 0, 112, -94, -18, 0};
76 
77 // Constants for RGBA.
78 static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
79                                0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
80 
81 static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
82                               0, 112, -74, -38, 0, 112, -74, -38};
83 
84 static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
85                               0, -18, -94, 112, 0, -18, -94, 112};
86 
87 static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
88                                0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
89 
90 static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
91                                 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
92 
93 static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
94                                0x8080u, 0x8080u, 0x8080u, 0x8080u};
95 
96 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
97 
98 #ifdef HAS_RGB24TOARGBROW_SSSE3
99 
100 // Shuffle table for converting RGB24 to ARGB.
101 static const uvec8 kShuffleMaskRGB24ToARGB = {
102     0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
103 
104 // Shuffle table for converting RAW to ARGB.
105 static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
106                                             8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
107 
108 // Shuffle table for converting RAW to RGBA.
109 static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u,  4u,  3u,
110                                             14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};
111 
112 // Shuffle table for converting RAW to RGB24.  First 8.
113 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
114     2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
115     128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
116 
117 // Shuffle table for converting RAW to RGB24.  Middle 8.
118 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
119     2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
120     128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
121 
122 // Shuffle table for converting RAW to RGB24.  Last 8.
123 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
124     8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
125     128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
126 
127 // Shuffle table for converting ARGB to RGB24.
128 static const uvec8 kShuffleMaskARGBToRGB24 = {
129     0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
130 
131 // Shuffle table for converting ARGB to RAW.
132 static const uvec8 kShuffleMaskARGBToRAW = {
133     2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
134 
135 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
136 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
137     0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
138 
139 // YUY2 shuf 16 Y to 32 Y.
140 static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
141                                     10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
142                                     6,  6,  8,  8,  10, 10, 12, 12, 14, 14};
143 
144 // YUY2 shuf 8 UV to 16 UV.
145 static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
146                                      11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
147                                      5,  7,  9,  11, 9,  11, 13, 15, 13, 15};
148 
149 // UYVY shuf 16 Y to 32 Y.
150 static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
151                                     11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
152                                     7,  7,  9,  9,  11, 11, 13, 13, 15, 15};
153 
154 // UYVY shuf 8 UV to 16 UV.
155 static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
156                                      10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
157                                      4,  6,  8,  10, 8,  10, 12, 14, 12, 14};
158 
159 // NV21 shuf 8 VU to 16 UV.
160 static const lvec8 kShuffleNV21 = {
161     1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
162     1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
163 };
164 #endif  // HAS_RGB24TOARGBROW_SSSE3
165 
166 #ifdef HAS_J400TOARGBROW_SSE2
J400ToARGBRow_SSE2(const uint8_t * src_y,uint8_t * dst_argb,int width)167 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
168   asm volatile(
169       "pcmpeqb     %%xmm5,%%xmm5                 \n"
170       "pslld       $0x18,%%xmm5                  \n"
171 
172       LABELALIGN
173       "1:                                        \n"
174       "movq        (%0),%%xmm0                   \n"
175       "lea         0x8(%0),%0                    \n"
176       "punpcklbw   %%xmm0,%%xmm0                 \n"
177       "movdqa      %%xmm0,%%xmm1                 \n"
178       "punpcklwd   %%xmm0,%%xmm0                 \n"
179       "punpckhwd   %%xmm1,%%xmm1                 \n"
180       "por         %%xmm5,%%xmm0                 \n"
181       "por         %%xmm5,%%xmm1                 \n"
182       "movdqu      %%xmm0,(%1)                   \n"
183       "movdqu      %%xmm1,0x10(%1)               \n"
184       "lea         0x20(%1),%1                   \n"
185       "sub         $0x8,%2                       \n"
186       "jg          1b                            \n"
187       : "+r"(src_y),     // %0
188         "+r"(dst_argb),  // %1
189         "+r"(width)      // %2
190         ::"memory",
191         "cc", "xmm0", "xmm1", "xmm5");
192 }
193 #endif  // HAS_J400TOARGBROW_SSE2
194 
195 #ifdef HAS_RGB24TOARGBROW_SSSE3
RGB24ToARGBRow_SSSE3(const uint8_t * src_rgb24,uint8_t * dst_argb,int width)196 void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
197                           uint8_t* dst_argb,
198                           int width) {
199   asm volatile(
200       "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff000000
201       "pslld       $0x18,%%xmm5                  \n"
202       "movdqa      %3,%%xmm4                     \n"
203 
204       LABELALIGN
205       "1:                                        \n"
206       "movdqu      (%0),%%xmm0                   \n"
207       "movdqu      0x10(%0),%%xmm1               \n"
208       "movdqu      0x20(%0),%%xmm3               \n"
209       "lea         0x30(%0),%0                   \n"
210       "movdqa      %%xmm3,%%xmm2                 \n"
211       "palignr     $0x8,%%xmm1,%%xmm2            \n"
212       "pshufb      %%xmm4,%%xmm2                 \n"
213       "por         %%xmm5,%%xmm2                 \n"
214       "palignr     $0xc,%%xmm0,%%xmm1            \n"
215       "pshufb      %%xmm4,%%xmm0                 \n"
216       "movdqu      %%xmm2,0x20(%1)               \n"
217       "por         %%xmm5,%%xmm0                 \n"
218       "pshufb      %%xmm4,%%xmm1                 \n"
219       "movdqu      %%xmm0,(%1)                   \n"
220       "por         %%xmm5,%%xmm1                 \n"
221       "palignr     $0x4,%%xmm3,%%xmm3            \n"
222       "pshufb      %%xmm4,%%xmm3                 \n"
223       "movdqu      %%xmm1,0x10(%1)               \n"
224       "por         %%xmm5,%%xmm3                 \n"
225       "movdqu      %%xmm3,0x30(%1)               \n"
226       "lea         0x40(%1),%1                   \n"
227       "sub         $0x10,%2                      \n"
228       "jg          1b                            \n"
229       : "+r"(src_rgb24),              // %0
230         "+r"(dst_argb),               // %1
231         "+r"(width)                   // %2
232       : "m"(kShuffleMaskRGB24ToARGB)  // %3
233       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
234 }
235 
RAWToARGBRow_SSSE3(const uint8_t * src_raw,uint8_t * dst_argb,int width)236 void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
237   asm volatile(
238       "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff000000
239       "pslld       $0x18,%%xmm5                  \n"
240       "movdqa      %3,%%xmm4                     \n"
241 
242       LABELALIGN
243       "1:                                        \n"
244       "movdqu      (%0),%%xmm0                   \n"
245       "movdqu      0x10(%0),%%xmm1               \n"
246       "movdqu      0x20(%0),%%xmm3               \n"
247       "lea         0x30(%0),%0                   \n"
248       "movdqa      %%xmm3,%%xmm2                 \n"
249       "palignr     $0x8,%%xmm1,%%xmm2            \n"
250       "pshufb      %%xmm4,%%xmm2                 \n"
251       "por         %%xmm5,%%xmm2                 \n"
252       "palignr     $0xc,%%xmm0,%%xmm1            \n"
253       "pshufb      %%xmm4,%%xmm0                 \n"
254       "movdqu      %%xmm2,0x20(%1)               \n"
255       "por         %%xmm5,%%xmm0                 \n"
256       "pshufb      %%xmm4,%%xmm1                 \n"
257       "movdqu      %%xmm0,(%1)                   \n"
258       "por         %%xmm5,%%xmm1                 \n"
259       "palignr     $0x4,%%xmm3,%%xmm3            \n"
260       "pshufb      %%xmm4,%%xmm3                 \n"
261       "movdqu      %%xmm1,0x10(%1)               \n"
262       "por         %%xmm5,%%xmm3                 \n"
263       "movdqu      %%xmm3,0x30(%1)               \n"
264       "lea         0x40(%1),%1                   \n"
265       "sub         $0x10,%2                      \n"
266       "jg          1b                            \n"
267       : "+r"(src_raw),              // %0
268         "+r"(dst_argb),             // %1
269         "+r"(width)                 // %2
270       : "m"(kShuffleMaskRAWToARGB)  // %3
271       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
272 }
273 
274 // Same code as RAWToARGB with different shuffler and A in low bits
RAWToRGBARow_SSSE3(const uint8_t * src_raw,uint8_t * dst_rgba,int width)275 void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
276   asm volatile(
277       "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0x000000ff
278       "psrld       $0x18,%%xmm5                  \n"
279       "movdqa      %3,%%xmm4                     \n"
280 
281       LABELALIGN
282       "1:                                        \n"
283       "movdqu      (%0),%%xmm0                   \n"
284       "movdqu      0x10(%0),%%xmm1               \n"
285       "movdqu      0x20(%0),%%xmm3               \n"
286       "lea         0x30(%0),%0                   \n"
287       "movdqa      %%xmm3,%%xmm2                 \n"
288       "palignr     $0x8,%%xmm1,%%xmm2            \n"
289       "pshufb      %%xmm4,%%xmm2                 \n"
290       "por         %%xmm5,%%xmm2                 \n"
291       "palignr     $0xc,%%xmm0,%%xmm1            \n"
292       "pshufb      %%xmm4,%%xmm0                 \n"
293       "movdqu      %%xmm2,0x20(%1)               \n"
294       "por         %%xmm5,%%xmm0                 \n"
295       "pshufb      %%xmm4,%%xmm1                 \n"
296       "movdqu      %%xmm0,(%1)                   \n"
297       "por         %%xmm5,%%xmm1                 \n"
298       "palignr     $0x4,%%xmm3,%%xmm3            \n"
299       "pshufb      %%xmm4,%%xmm3                 \n"
300       "movdqu      %%xmm1,0x10(%1)               \n"
301       "por         %%xmm5,%%xmm3                 \n"
302       "movdqu      %%xmm3,0x30(%1)               \n"
303       "lea         0x40(%1),%1                   \n"
304       "sub         $0x10,%2                      \n"
305       "jg          1b                            \n"
306       : "+r"(src_raw),              // %0
307         "+r"(dst_rgba),             // %1
308         "+r"(width)                 // %2
309       : "m"(kShuffleMaskRAWToRGBA)  // %3
310       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
311 }
312 
RAWToRGB24Row_SSSE3(const uint8_t * src_raw,uint8_t * dst_rgb24,int width)313 void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
314                          uint8_t* dst_rgb24,
315                          int width) {
316   asm volatile(
317       "movdqa      %3,%%xmm3                     \n"
318       "movdqa      %4,%%xmm4                     \n"
319       "movdqa      %5,%%xmm5                     \n"
320 
321       LABELALIGN
322       "1:                                        \n"
323       "movdqu      (%0),%%xmm0                   \n"
324       "movdqu      0x4(%0),%%xmm1                \n"
325       "movdqu      0x8(%0),%%xmm2                \n"
326       "lea         0x18(%0),%0                   \n"
327       "pshufb      %%xmm3,%%xmm0                 \n"
328       "pshufb      %%xmm4,%%xmm1                 \n"
329       "pshufb      %%xmm5,%%xmm2                 \n"
330       "movq        %%xmm0,(%1)                   \n"
331       "movq        %%xmm1,0x8(%1)                \n"
332       "movq        %%xmm2,0x10(%1)               \n"
333       "lea         0x18(%1),%1                   \n"
334       "sub         $0x8,%2                       \n"
335       "jg          1b                            \n"
336       : "+r"(src_raw),                  // %0
337         "+r"(dst_rgb24),                // %1
338         "+r"(width)                     // %2
339       : "m"(kShuffleMaskRAWToRGB24_0),  // %3
340         "m"(kShuffleMaskRAWToRGB24_1),  // %4
341         "m"(kShuffleMaskRAWToRGB24_2)   // %5
342       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
343 }
344 
RGB565ToARGBRow_SSE2(const uint8_t * src,uint8_t * dst,int width)345 void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
346   asm volatile(
347       "mov         $0x1080108,%%eax              \n"
348       "movd        %%eax,%%xmm5                  \n"
349       "pshufd      $0x0,%%xmm5,%%xmm5            \n"
350       "mov         $0x20802080,%%eax             \n"
351       "movd        %%eax,%%xmm6                  \n"
352       "pshufd      $0x0,%%xmm6,%%xmm6            \n"
353       "pcmpeqb     %%xmm3,%%xmm3                 \n"
354       "psllw       $0xb,%%xmm3                   \n"
355       "pcmpeqb     %%xmm4,%%xmm4                 \n"
356       "psllw       $0xa,%%xmm4                   \n"
357       "psrlw       $0x5,%%xmm4                   \n"
358       "pcmpeqb     %%xmm7,%%xmm7                 \n"
359       "psllw       $0x8,%%xmm7                   \n"
360       "sub         %0,%1                         \n"
361       "sub         %0,%1                         \n"
362 
363       LABELALIGN
364       "1:                                        \n"
365       "movdqu      (%0),%%xmm0                   \n"
366       "movdqa      %%xmm0,%%xmm1                 \n"
367       "movdqa      %%xmm0,%%xmm2                 \n"
368       "pand        %%xmm3,%%xmm1                 \n"
369       "psllw       $0xb,%%xmm2                   \n"
370       "pmulhuw     %%xmm5,%%xmm1                 \n"
371       "pmulhuw     %%xmm5,%%xmm2                 \n"
372       "psllw       $0x8,%%xmm1                   \n"
373       "por         %%xmm2,%%xmm1                 \n"
374       "pand        %%xmm4,%%xmm0                 \n"
375       "pmulhuw     %%xmm6,%%xmm0                 \n"
376       "por         %%xmm7,%%xmm0                 \n"
377       "movdqa      %%xmm1,%%xmm2                 \n"
378       "punpcklbw   %%xmm0,%%xmm1                 \n"
379       "punpckhbw   %%xmm0,%%xmm2                 \n"
380       "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
381       "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
382       "lea         0x10(%0),%0                   \n"
383       "sub         $0x8,%2                       \n"
384       "jg          1b                            \n"
385       : "+r"(src),   // %0
386         "+r"(dst),   // %1
387         "+r"(width)  // %2
388       :
389       : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
390         "xmm6", "xmm7");
391 }
392 
ARGB1555ToARGBRow_SSE2(const uint8_t * src,uint8_t * dst,int width)393 void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
394   asm volatile(
395       "mov         $0x1080108,%%eax              \n"
396       "movd        %%eax,%%xmm5                  \n"
397       "pshufd      $0x0,%%xmm5,%%xmm5            \n"
398       "mov         $0x42004200,%%eax             \n"
399       "movd        %%eax,%%xmm6                  \n"
400       "pshufd      $0x0,%%xmm6,%%xmm6            \n"
401       "pcmpeqb     %%xmm3,%%xmm3                 \n"
402       "psllw       $0xb,%%xmm3                   \n"
403       "movdqa      %%xmm3,%%xmm4                 \n"
404       "psrlw       $0x6,%%xmm4                   \n"
405       "pcmpeqb     %%xmm7,%%xmm7                 \n"
406       "psllw       $0x8,%%xmm7                   \n"
407       "sub         %0,%1                         \n"
408       "sub         %0,%1                         \n"
409 
410       LABELALIGN
411       "1:                                        \n"
412       "movdqu      (%0),%%xmm0                   \n"
413       "movdqa      %%xmm0,%%xmm1                 \n"
414       "movdqa      %%xmm0,%%xmm2                 \n"
415       "psllw       $0x1,%%xmm1                   \n"
416       "psllw       $0xb,%%xmm2                   \n"
417       "pand        %%xmm3,%%xmm1                 \n"
418       "pmulhuw     %%xmm5,%%xmm2                 \n"
419       "pmulhuw     %%xmm5,%%xmm1                 \n"
420       "psllw       $0x8,%%xmm1                   \n"
421       "por         %%xmm2,%%xmm1                 \n"
422       "movdqa      %%xmm0,%%xmm2                 \n"
423       "pand        %%xmm4,%%xmm0                 \n"
424       "psraw       $0x8,%%xmm2                   \n"
425       "pmulhuw     %%xmm6,%%xmm0                 \n"
426       "pand        %%xmm7,%%xmm2                 \n"
427       "por         %%xmm2,%%xmm0                 \n"
428       "movdqa      %%xmm1,%%xmm2                 \n"
429       "punpcklbw   %%xmm0,%%xmm1                 \n"
430       "punpckhbw   %%xmm0,%%xmm2                 \n"
431       "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
432       "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
433       "lea         0x10(%0),%0                   \n"
434       "sub         $0x8,%2                       \n"
435       "jg          1b                            \n"
436       : "+r"(src),   // %0
437         "+r"(dst),   // %1
438         "+r"(width)  // %2
439       :
440       : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
441         "xmm6", "xmm7");
442 }
443 
ARGB4444ToARGBRow_SSE2(const uint8_t * src,uint8_t * dst,int width)444 void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
445   asm volatile(
446       "mov         $0xf0f0f0f,%%eax              \n"
447       "movd        %%eax,%%xmm4                  \n"
448       "pshufd      $0x0,%%xmm4,%%xmm4            \n"
449       "movdqa      %%xmm4,%%xmm5                 \n"
450       "pslld       $0x4,%%xmm5                   \n"
451       "sub         %0,%1                         \n"
452       "sub         %0,%1                         \n"
453 
454       LABELALIGN
455       "1:                                        \n"
456       "movdqu      (%0),%%xmm0                   \n"
457       "movdqa      %%xmm0,%%xmm2                 \n"
458       "pand        %%xmm4,%%xmm0                 \n"
459       "pand        %%xmm5,%%xmm2                 \n"
460       "movdqa      %%xmm0,%%xmm1                 \n"
461       "movdqa      %%xmm2,%%xmm3                 \n"
462       "psllw       $0x4,%%xmm1                   \n"
463       "psrlw       $0x4,%%xmm3                   \n"
464       "por         %%xmm1,%%xmm0                 \n"
465       "por         %%xmm3,%%xmm2                 \n"
466       "movdqa      %%xmm0,%%xmm1                 \n"
467       "punpcklbw   %%xmm2,%%xmm0                 \n"
468       "punpckhbw   %%xmm2,%%xmm1                 \n"
469       "movdqu      %%xmm0,0x00(%1,%0,2)          \n"
470       "movdqu      %%xmm1,0x10(%1,%0,2)          \n"
471       "lea         0x10(%0),%0                   \n"
472       "sub         $0x8,%2                       \n"
473       "jg          1b                            \n"
474       : "+r"(src),   // %0
475         "+r"(dst),   // %1
476         "+r"(width)  // %2
477       :
478       : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
479 }
480 
ARGBToRGB24Row_SSSE3(const uint8_t * src,uint8_t * dst,int width)481 void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
482   asm volatile(
483 
484       "movdqa      %3,%%xmm6                     \n"
485 
486       LABELALIGN
487       "1:                                        \n"
488       "movdqu      (%0),%%xmm0                   \n"
489       "movdqu      0x10(%0),%%xmm1               \n"
490       "movdqu      0x20(%0),%%xmm2               \n"
491       "movdqu      0x30(%0),%%xmm3               \n"
492       "lea         0x40(%0),%0                   \n"
493       "pshufb      %%xmm6,%%xmm0                 \n"
494       "pshufb      %%xmm6,%%xmm1                 \n"
495       "pshufb      %%xmm6,%%xmm2                 \n"
496       "pshufb      %%xmm6,%%xmm3                 \n"
497       "movdqa      %%xmm1,%%xmm4                 \n"
498       "psrldq      $0x4,%%xmm1                   \n"
499       "pslldq      $0xc,%%xmm4                   \n"
500       "movdqa      %%xmm2,%%xmm5                 \n"
501       "por         %%xmm4,%%xmm0                 \n"
502       "pslldq      $0x8,%%xmm5                   \n"
503       "movdqu      %%xmm0,(%1)                   \n"
504       "por         %%xmm5,%%xmm1                 \n"
505       "psrldq      $0x8,%%xmm2                   \n"
506       "pslldq      $0x4,%%xmm3                   \n"
507       "por         %%xmm3,%%xmm2                 \n"
508       "movdqu      %%xmm1,0x10(%1)               \n"
509       "movdqu      %%xmm2,0x20(%1)               \n"
510       "lea         0x30(%1),%1                   \n"
511       "sub         $0x10,%2                      \n"
512       "jg          1b                            \n"
513       : "+r"(src),                    // %0
514         "+r"(dst),                    // %1
515         "+r"(width)                   // %2
516       : "m"(kShuffleMaskARGBToRGB24)  // %3
517       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
518 }
519 
ARGBToRAWRow_SSSE3(const uint8_t * src,uint8_t * dst,int width)520 void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
521   asm volatile(
522 
523       "movdqa      %3,%%xmm6                     \n"
524 
525       LABELALIGN
526       "1:                                        \n"
527       "movdqu      (%0),%%xmm0                   \n"
528       "movdqu      0x10(%0),%%xmm1               \n"
529       "movdqu      0x20(%0),%%xmm2               \n"
530       "movdqu      0x30(%0),%%xmm3               \n"
531       "lea         0x40(%0),%0                   \n"
532       "pshufb      %%xmm6,%%xmm0                 \n"
533       "pshufb      %%xmm6,%%xmm1                 \n"
534       "pshufb      %%xmm6,%%xmm2                 \n"
535       "pshufb      %%xmm6,%%xmm3                 \n"
536       "movdqa      %%xmm1,%%xmm4                 \n"
537       "psrldq      $0x4,%%xmm1                   \n"
538       "pslldq      $0xc,%%xmm4                   \n"
539       "movdqa      %%xmm2,%%xmm5                 \n"
540       "por         %%xmm4,%%xmm0                 \n"
541       "pslldq      $0x8,%%xmm5                   \n"
542       "movdqu      %%xmm0,(%1)                   \n"
543       "por         %%xmm5,%%xmm1                 \n"
544       "psrldq      $0x8,%%xmm2                   \n"
545       "pslldq      $0x4,%%xmm3                   \n"
546       "por         %%xmm3,%%xmm2                 \n"
547       "movdqu      %%xmm1,0x10(%1)               \n"
548       "movdqu      %%xmm2,0x20(%1)               \n"
549       "lea         0x30(%1),%1                   \n"
550       "sub         $0x10,%2                      \n"
551       "jg          1b                            \n"
552       : "+r"(src),                  // %0
553         "+r"(dst),                  // %1
554         "+r"(width)                 // %2
555       : "m"(kShuffleMaskARGBToRAW)  // %3
556       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
557 }
558 
559 #ifdef HAS_ARGBTORGB24ROW_AVX2
560 // vpermd for 12+12 to 24
561 static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
562 
ARGBToRGB24Row_AVX2(const uint8_t * src,uint8_t * dst,int width)563 void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
564   asm volatile(
565       "vbroadcastf128 %3,%%ymm6                  \n"
566       "vmovdqa     %4,%%ymm7                     \n"
567 
568       LABELALIGN
569       "1:                                        \n"
570       "vmovdqu     (%0),%%ymm0                   \n"
571       "vmovdqu     0x20(%0),%%ymm1               \n"
572       "vmovdqu     0x40(%0),%%ymm2               \n"
573       "vmovdqu     0x60(%0),%%ymm3               \n"
574       "lea         0x80(%0),%0                   \n"
575       "vpshufb     %%ymm6,%%ymm0,%%ymm0          \n"  // xxx0yyy0
576       "vpshufb     %%ymm6,%%ymm1,%%ymm1          \n"
577       "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
578       "vpshufb     %%ymm6,%%ymm3,%%ymm3          \n"
579       "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // pack to 24 bytes
580       "vpermd      %%ymm1,%%ymm7,%%ymm1          \n"
581       "vpermd      %%ymm2,%%ymm7,%%ymm2          \n"
582       "vpermd      %%ymm3,%%ymm7,%%ymm3          \n"
583       "vpermq      $0x3f,%%ymm1,%%ymm4           \n"  // combine 24 + 8
584       "vpor        %%ymm4,%%ymm0,%%ymm0          \n"
585       "vmovdqu     %%ymm0,(%1)                   \n"
586       "vpermq      $0xf9,%%ymm1,%%ymm1           \n"  // combine 16 + 16
587       "vpermq      $0x4f,%%ymm2,%%ymm4           \n"
588       "vpor        %%ymm4,%%ymm1,%%ymm1          \n"
589       "vmovdqu     %%ymm1,0x20(%1)               \n"
590       "vpermq      $0xfe,%%ymm2,%%ymm2           \n"  // combine 8 + 24
591       "vpermq      $0x93,%%ymm3,%%ymm3           \n"
592       "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
593       "vmovdqu     %%ymm2,0x40(%1)               \n"
594       "lea         0x60(%1),%1                   \n"
595       "sub         $0x20,%2                      \n"
596       "jg          1b                            \n"
597       "vzeroupper                                \n"
598       : "+r"(src),                     // %0
599         "+r"(dst),                     // %1
600         "+r"(width)                    // %2
601       : "m"(kShuffleMaskARGBToRGB24),  // %3
602         "m"(kPermdRGB24_AVX)           // %4
603       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
604         "xmm7");
605 }
606 #endif
607 
608 #ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
609 // Shuffle table for converting ARGBToRGB24
610 static const ulvec8 kPermARGBToRGB24_0 = {
611     0u,  1u,  2u,  4u,  5u,  6u,  8u,  9u,  10u, 12u, 13u,
612     14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
613     29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
614 static const ulvec8 kPermARGBToRGB24_1 = {
615     10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
616     25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
617     40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
618 static const ulvec8 kPermARGBToRGB24_2 = {
619     21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
620     36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
621     50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
622 
// Convert 32 ARGB pixels (128 bytes) per loop iteration to 96 bytes of
// RGB24, using AVX512VBMI vpermt2b to gather the non-alpha bytes from two
// source registers in a single instruction.
// NOTE(review): stores are in full 32-byte chunks, so width is presumably a
// multiple of 32 (remainders handled by an _Any wrapper) — confirm at callers.
void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vmovdqa     %3,%%ymm5                     \n"  // load the 3 permute tables
      "vmovdqa     %4,%%ymm6                     \n"
      "vmovdqa     %5,%%ymm7                     \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 32 ARGB pixels
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "lea         0x80(%0),%0                   \n"
      "vpermt2b    %%ymm1,%%ymm5,%%ymm0          \n"  // ymm0 = RGB24 of pixels 0-10
      "vpermt2b    %%ymm2,%%ymm6,%%ymm1          \n"
      "vpermt2b    %%ymm3,%%ymm7,%%ymm2          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"  // store 96 bytes of RGB24
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "vmovdqu     %%ymm2,0x40(%1)               \n"
      "lea         0x60(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),                // %0
        "+r"(dst),                // %1
        "+r"(width)               // %2
      : "m"(kPermARGBToRGB24_0),  // %3
        "m"(kPermARGBToRGB24_1),  // %4
        "m"(kPermARGBToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
}
654 #endif
655 
656 #ifdef HAS_ARGBTORAWROW_AVX2
// Convert 32 ARGB pixels (128 bytes) per loop iteration to 96 bytes of RAW
// (24-bit, R/B order per kShuffleMaskARGBToRAW): drop alpha with vpshufb,
// compact each lane with vpermd, then stitch the 24-byte lane results into
// three contiguous 32-byte stores with vpermq + vpor.
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6                  \n"  // alpha-dropping shuffle
      "vmovdqa     %4,%%ymm7                     \n"  // dword-compaction permute

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 32 ARGB pixels
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "lea         0x80(%0),%0                   \n"
      "vpshufb     %%ymm6,%%ymm0,%%ymm0          \n"  // xxx0yyy0
      "vpshufb     %%ymm6,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm6,%%ymm3,%%ymm3          \n"
      "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // pack to 24 bytes
      "vpermd      %%ymm1,%%ymm7,%%ymm1          \n"
      "vpermd      %%ymm2,%%ymm7,%%ymm2          \n"
      "vpermd      %%ymm3,%%ymm7,%%ymm3          \n"
      "vpermq      $0x3f,%%ymm1,%%ymm4           \n"  // combine 24 + 8
      "vpor        %%ymm4,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vpermq      $0xf9,%%ymm1,%%ymm1           \n"  // combine 16 + 16
      "vpermq      $0x4f,%%ymm2,%%ymm4           \n"
      "vpor        %%ymm4,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "vpermq      $0xfe,%%ymm2,%%ymm2           \n"  // combine 8 + 24
      "vpermq      $0x93,%%ymm3,%%ymm3           \n"
      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
      "vmovdqu     %%ymm2,0x40(%1)               \n"
      "lea         0x60(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),                   // %0
        "+r"(dst),                   // %1
        "+r"(width)                  // %2
      : "m"(kShuffleMaskARGBToRAW),  // %3
        "m"(kPermdRGB24_AVX)         // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
700 #endif
701 
// Convert 4 ARGB pixels (16 bytes) per loop iteration to 8 bytes of RGB565
// (5:6:5), truncating the low bits of each channel.
void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm3,%%xmm3                 \n"  // xmm3 = 0x0000001f/dword: B mask
      "psrld       $0x1b,%%xmm3                  \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // xmm4 = 0x000007e0/dword: G mask
      "psrld       $0x1a,%%xmm4                  \n"
      "pslld       $0x5,%%xmm4                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // xmm5 = 0xfffff800/dword: R mask
      "pslld       $0xb,%%xmm5                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pslld       $0x8,%%xmm0                   \n"  // position R for arithmetic shift
      "psrld       $0x3,%%xmm1                   \n"  // B >> 3 into bits 0..4
      "psrld       $0x5,%%xmm2                   \n"  // G >> 5 into bits 5..10
      "psrad       $0x10,%%xmm0                  \n"  // R into bits 11..15, sign-filled
      "pand        %%xmm3,%%xmm1                 \n"
      "pand        %%xmm4,%%xmm2                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "por         %%xmm2,%%xmm1                 \n"
      "por         %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"  // 4 dwords -> 4 RGB565 words
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
738 
// Convert 4 ARGB pixels per loop iteration to RGB565, adding a per-pixel
// dither byte (saturating) before truncation. dither4 packs 4 dither bytes,
// one per pixel of the group of 4.
void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
                                uint32_t dither4,
                                int width) {
  asm volatile(
      "movd        %3,%%xmm6                     \n"  // replicate each dither byte
      "punpcklbw   %%xmm6,%%xmm6                 \n"  // across its pixel's 4 channels
      "movdqa      %%xmm6,%%xmm7                 \n"
      "punpcklwd   %%xmm6,%%xmm6                 \n"
      "punpckhwd   %%xmm7,%%xmm7                 \n"  // NOTE(review): xmm7 is set up
      "pcmpeqb     %%xmm3,%%xmm3                 \n"  // but never used in the loop
      "psrld       $0x1b,%%xmm3                  \n"  // xmm3 = 0x1f/dword: B mask
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // xmm4 = 0x7e0/dword: G mask
      "psrld       $0x1a,%%xmm4                  \n"
      "pslld       $0x5,%%xmm4                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // xmm5 = 0xfffff800: R mask
      "pslld       $0xb,%%xmm5                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "paddusb     %%xmm6,%%xmm0                 \n"  // add dither, saturating
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pslld       $0x8,%%xmm0                   \n"
      "psrld       $0x3,%%xmm1                   \n"  // B >> 3
      "psrld       $0x5,%%xmm2                   \n"  // G >> 5
      "psrad       $0x10,%%xmm0                  \n"  // R into bits 11..15
      "pand        %%xmm3,%%xmm1                 \n"
      "pand        %%xmm4,%%xmm2                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "por         %%xmm2,%%xmm1                 \n"
      "por         %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"  // 4 dwords -> 4 RGB565 words
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
785 
786 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// Convert 8 ARGB pixels per loop iteration to RGB565 with dithering (AVX2).
// The 4 dither bytes are broadcast so the same pattern repeats every 4 pixels.
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                uint8_t* dst,
                                uint32_t dither4,
                                int width) {
  asm volatile(
      "vbroadcastss %3,%%xmm6                    \n"  // replicate dither bytes
      "vpunpcklbw  %%xmm6,%%xmm6,%%xmm6          \n"  // per channel of each pixel
      "vpermq      $0xd8,%%ymm6,%%ymm6           \n"
      "vpunpcklwd  %%ymm6,%%ymm6,%%ymm6          \n"
      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"  // ymm3 = 0x1f/dword: B mask
      "vpsrld      $0x1b,%%ymm3,%%ymm3           \n"
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // ymm4 = 0x7e0/dword: G mask
      "vpsrld      $0x1a,%%ymm4,%%ymm4           \n"
      "vpslld      $0x5,%%ymm4,%%ymm4            \n"
      "vpslld      $0xb,%%ymm3,%%ymm5            \n"  // ymm5 = 0xf800/dword: R mask

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vpaddusb    %%ymm6,%%ymm0,%%ymm0          \n"  // add dither, saturating
      "vpsrld      $0x5,%%ymm0,%%ymm2            \n"  // G >> 5
      "vpsrld      $0x3,%%ymm0,%%ymm1            \n"  // B >> 3
      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"  // R >> 8 (then masked to 11..15)
      "vpand       %%ymm4,%%ymm2,%%ymm2          \n"
      "vpand       %%ymm3,%%ymm1,%%ymm1          \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpor        %%ymm2,%%ymm1,%%ymm1          \n"
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"
      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"  // dwords -> RGB565 words
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // fix lane interleave
      "lea         0x20(%0),%0                   \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
830 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
831 
// Convert 4 ARGB pixels (16 bytes) per loop iteration to 8 bytes of
// ARGB1555 (1:5:5:5): alpha becomes the top bit, each color keeps 5 bits.
void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // xmm4 = 0x1f/dword: B mask
      "psrld       $0x1b,%%xmm4                  \n"
      "movdqa      %%xmm4,%%xmm5                 \n"  // xmm5 = 0x3e0: G mask
      "pslld       $0x5,%%xmm5                   \n"
      "movdqa      %%xmm4,%%xmm6                 \n"  // xmm6 = 0x7c00: R mask
      "pslld       $0xa,%%xmm6                   \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"  // xmm7 = 0xffff8000: A mask
      "pslld       $0xf,%%xmm7                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm3                 \n"
      "psrad       $0x10,%%xmm0                  \n"  // A sign-extended to bit 15
      "psrld       $0x3,%%xmm1                   \n"  // B >> 3 into bits 0..4
      "psrld       $0x6,%%xmm2                   \n"  // G >> 6 into bits 5..9
      "psrld       $0x9,%%xmm3                   \n"  // R >> 9 into bits 10..14
      "pand        %%xmm7,%%xmm0                 \n"
      "pand        %%xmm4,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm2                 \n"
      "pand        %%xmm6,%%xmm3                 \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm3,%%xmm2                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"  // 4 dwords -> 4 words
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}
872 
// Convert 4 ARGB pixels (16 bytes) per loop iteration to 8 bytes of
// ARGB4444: keep the high nibble of each 8-bit channel.
void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // xmm4 = 0xf000 per word
      "psllw       $0xc,%%xmm4                   \n"
      "movdqa      %%xmm4,%%xmm3                 \n"  // xmm3 = 0x00f0 per word
      "psrlw       $0x8,%%xmm3                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm3,%%xmm0                 \n"  // low-byte high nibbles
      "pand        %%xmm4,%%xmm1                 \n"  // high-byte high nibbles
      "psrlq       $0x4,%%xmm0                   \n"  // align nibbles side by side
      "psrlq       $0x8,%%xmm1                   \n"
      "por         %%xmm1,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"  // 4 pixels -> 8 bytes
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
901 #endif  // HAS_RGB24TOARGBROW_SSSE3
902 
903 /*
904 
905 ARGBToAR30Row:
906 
907 Red Blue
With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
wanted for the blue channel. The red needs to be shifted 4 left, so multiply by
911 (1024+4)*16 for red.
912 
913 Alpha Green
914 Alpha and Green are already in the high bits so vpand can zero out the other
915 bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier
916 could be used for Green - (1024+4) putting the 10 bit green in the lsb.  Alpha
917 would be a simple multiplier to shift it into position.  It wants a gap of 10
918 above the green.  Green is 10 bits, so there are 6 bits in the low short.  4
919 more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits,
920 and then a shift of 4 is a multiply of 16, so (4*16) = 64.  Then shift the
921 result left 10 to position the A and G channels.
922 */
923 
// Shuffle that moves the B and R bytes of each ARGB pixel into the high byte
// of successive 16-bit lanes (low byte zeroed, 128 = zero the lane), ready
// for the pmulhuw scaling described in the ARGBToAR30Row comment above.
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u,  128u, 4u,  128u, 6u,
                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};

// Same, with the byte pair swapped, for ABGR input.
static const uvec8 kShuffleBR30 = {128u, 2u,  128u, 0u, 128u, 6u,  128u, 4u,
                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};

// 1028 = 1024 + 4: pmulhuw scales an 8-bit channel in a lane's high byte to
// 10 bits; the * 16 factor additionally shifts red 4 bits left.
static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
static const uint32_t kMaskRB10 = 0x3ff003ff;
static const uint32_t kMaskAG10 = 0xc000ff00;
static const uint32_t kMulAG10 = 64 * 65536 + 1028;
935 
// Convert 4 ARGB pixels per loop iteration to AR30 (2:10:10:10), scaling
// each 8-bit channel to 10 bits via pmulhuw (see the comment block above).
// dst is biased by -src so a single incremented register addresses both.
void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm2                     \n"  // shuffler for RB
      "movd        %4,%%xmm3                     \n"  // multiplier for RB
      "movd        %5,%%xmm4                     \n"  // mask for R10 B10
      "movd        %6,%%xmm5                     \n"  // mask for AG
      "movd        %7,%%xmm6                     \n"  // multiplier for AG
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"  // broadcast dword constants
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "sub         %0,%1                         \n"  // dst -= src (offset trick)

      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"  // R0B0
      "pand        %%xmm5,%%xmm0                 \n"  // A0G0
      "pmulhuw     %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
      "pmulhuw     %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
      "pand        %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
      "pslld       $10,%%xmm0                    \n"  // A2 x10 G10 x10
      "por         %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
      "movdqu      %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
      "add         $0x10,%0                      \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
974 
// Convert 4 ABGR pixels per loop iteration to AR30. Identical to
// ARGBToAR30Row_SSSE3 except the reversed kShuffleBR30 table swaps R and B.
void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm2                     \n"  // shuffler for RB
      "movd        %4,%%xmm3                     \n"  // multiplier for RB
      "movd        %5,%%xmm4                     \n"  // mask for R10 B10
      "movd        %6,%%xmm5                     \n"  // mask for AG
      "movd        %7,%%xmm6                     \n"  // multiplier for AG
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"  // broadcast dword constants
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "sub         %0,%1                         \n"  // dst -= src (offset trick)

      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ABGR pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"  // R0B0
      "pand        %%xmm5,%%xmm0                 \n"  // A0G0
      "pmulhuw     %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
      "pmulhuw     %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
      "pand        %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
      "pslld       $10,%%xmm0                    \n"  // A2 x10 G10 x10
      "por         %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
      "movdqu      %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
      "add         $0x10,%0                      \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3  reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
1013 
1014 #ifdef HAS_ARGBTOAR30ROW_AVX2
// Convert 8 ARGB pixels per loop iteration to AR30 (2:10:10:10), AVX2
// version of ARGBToAR30Row_SSSE3 using the same constants and offset trick.
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3                    \n"  // multiplier for RB
      "vbroadcastss %5,%%ymm4                    \n"  // mask for R10 B10
      "vbroadcastss %6,%%ymm5                    \n"  // mask for AG
      "vbroadcastss %7,%%ymm6                    \n"  // multiplier for AG
      "sub         %0,%1                         \n"  // dst -= src (offset trick)

      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 8 ARGB pixels
      "vpshufb     %%ymm2,%%ymm0,%%ymm1          \n"  // R0B0
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"  // A0G0
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"  // X2 R16 X4  B10
      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"  // X10 A2 X10 G10
      "vpand       %%ymm4,%%ymm1,%%ymm1          \n"  // X2 R10 X10 B10
      "vpslld      $10,%%ymm0,%%ymm0             \n"  // A2 x10 G10 x10
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // A2 R10 G10 B10
      "vmovdqu     %%ymm0,(%1,%0)                \n"  // store 8 AR30 pixels
      "add         $0x20,%0                      \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
1049 #endif
1050 
1051 #ifdef HAS_ABGRTOAR30ROW_AVX2
// Convert 8 ABGR pixels per loop iteration to AR30. Identical to
// ARGBToAR30Row_AVX2 except the reversed kShuffleBR30 table swaps R and B.
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3                    \n"  // multiplier for RB
      "vbroadcastss %5,%%ymm4                    \n"  // mask for R10 B10
      "vbroadcastss %6,%%ymm5                    \n"  // mask for AG
      "vbroadcastss %7,%%ymm6                    \n"  // multiplier for AG
      "sub         %0,%1                         \n"  // dst -= src (offset trick)

      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 8 ABGR pixels
      "vpshufb     %%ymm2,%%ymm0,%%ymm1          \n"  // R0B0
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"  // A0G0
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"  // X2 R16 X4  B10
      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"  // X10 A2 X10 G10
      "vpand       %%ymm4,%%ymm1,%%ymm1          \n"  // X2 R10 X10 B10
      "vpslld      $10,%%ymm0,%%ymm0             \n"  // A2 x10 G10 x10
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // A2 R10 G10 B10
      "vmovdqu     %%ymm0,(%1,%0)                \n"  // store 8 AR30 pixels
      "add         $0x20,%0                      \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3  reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
1086 #endif
1087 
// Shuffle that swaps bytes 0 and 2 of every 32-bit pixel (R <-> B),
// converting ARGB to ABGR (and vice versa).
static const uvec8 kShuffleARGBToABGR = {2,  1, 0, 3,  6,  5,  4,  7,
                                         10, 9, 8, 11, 14, 13, 12, 15};

// Shuffles used to widen 8-bit ARGB to 16-bit AB64: each source byte is
// duplicated into both bytes of a 16-bit lane while R and B swap places.
// Lo covers the low 8 source bytes (2 pixels), Hi the high 8.
static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3,
                                           6, 6, 5, 5, 4, 4, 7, 7};
static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9,  9,  8,  8,  11, 11,
                                           14, 14, 13, 13, 12, 12, 15, 15};
1095 
// Convert 4 ARGB pixels per loop iteration to AR64 (16 bits per channel):
// each 8-bit channel byte is duplicated into both halves of the 16-bit
// result (x * 0x0101), preserving channel order.
void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
                         uint16_t* dst_ar64,
                         int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"  // duplicate low 8 bytes
      "punpckhbw   %%xmm1,%%xmm1                 \n"  // duplicate high 8 bytes
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
1119 
ARGBToAB64Row_SSSE3(const uint8_t * src_argb,uint16_t * dst_ab64,int width)1120 void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
1121                          uint16_t* dst_ab64,
1122                          int width) {
1123   asm volatile(
1124 
1125       "movdqa      %3,%%xmm2                     \n"
1126       "movdqa      %4,%%xmm3                     \n" LABELALIGN
1127       "1:                                        \n"
1128       "movdqu      (%0),%%xmm0                   \n"
1129       "movdqa      %%xmm0,%%xmm1                 \n"
1130       "pshufb      %%xmm2,%%xmm0                 \n"
1131       "pshufb      %%xmm3,%%xmm1                 \n"
1132       "movdqu      %%xmm0,(%1)                   \n"
1133       "movdqu      %%xmm1,0x10(%1)               \n"
1134       "lea         0x10(%0),%0                   \n"
1135       "lea         0x20(%1),%1                   \n"
1136       "sub         $0x4,%2                       \n"
1137       "jg          1b                            \n"
1138       : "+r"(src_argb),             // %0
1139         "+r"(dst_ab64),             // %1
1140         "+r"(width)                 // %2
1141       : "m"(kShuffleARGBToAB64Lo),  // %3
1142         "m"(kShuffleARGBToAB64Hi)   // %4
1143       : "memory", "cc", "xmm0", "xmm1", "xmm2");
1144 }
1145 
// Convert 4 AR64 pixels (16 bits per channel) per loop iteration to 8-bit
// ARGB by keeping the high byte of each channel.
void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "psrlw       $8,%%xmm0                     \n"  // keep high byte per channel
      "psrlw       $8,%%xmm1                     \n"
      "packuswb    %%xmm1,%%xmm0                 \n"  // 8 words -> 16 bytes
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
1169 
// Convert 4 AB64 pixels (16 bits per channel) per loop iteration to 8-bit
// ARGB: keep the high byte of each channel, then swap R and B back via
// kShuffleARGBToABGR.
void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(

      "movdqa      %3,%%xmm2                     \n" LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "psrlw       $8,%%xmm0                     \n"  // keep high byte per channel
      "psrlw       $8,%%xmm1                     \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "pshufb      %%xmm2,%%xmm0                 \n"  // swap R and B
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ab64),          // %0
        "+r"(dst_argb),          // %1
        "+r"(width)              // %2
      : "m"(kShuffleARGBToABGR)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
1194 
1195 #ifdef HAS_ARGBTOAR64ROW_AVX2
// Convert 8 ARGB pixels per loop iteration to AR64: duplicate each 8-bit
// channel byte into a 16-bit lane (x * 0x0101). vpermq pre-swizzles lanes
// so the in-lane unpacks produce in-order output.
void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
                        uint16_t* dst_ar64,
                        int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // fix lane interleave
      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"  // duplicate high bytes
      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"  // duplicate low bytes
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
1220 #endif
1221 
1222 #ifdef HAS_ARGBTOAB64ROW_AVX2
ARGBToAB64Row_AVX2(const uint8_t * src_argb,uint16_t * dst_ab64,int width)1223 void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
1224                         uint16_t* dst_ab64,
1225                         int width) {
1226   asm volatile(
1227 
1228       "vbroadcastf128 %3,%%ymm2                  \n"
1229       "vbroadcastf128 %4,%%ymm3                  \n" LABELALIGN
1230       "1:                                        \n"
1231       "vmovdqu     (%0),%%ymm0                   \n"
1232       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
1233       "vpshufb     %%ymm3,%%ymm0,%%ymm1          \n"
1234       "vpshufb     %%ymm2,%%ymm0,%%ymm0          \n"
1235       "vmovdqu     %%ymm0,(%1)                   \n"
1236       "vmovdqu     %%ymm1,0x20(%1)               \n"
1237       "lea         0x20(%0),%0                   \n"
1238       "lea         0x40(%1),%1                   \n"
1239       "sub         $0x8,%2                       \n"
1240       "jg          1b                            \n"
1241       "vzeroupper                                \n"
1242       : "+r"(src_argb),             // %0
1243         "+r"(dst_ab64),             // %1
1244         "+r"(width)                 // %2
1245       : "m"(kShuffleARGBToAB64Lo),  // %3
1246         "m"(kShuffleARGBToAB64Hi)   // %3
1247       : "memory", "cc", "xmm0", "xmm1", "xmm2");
1248 }
1249 #endif
1250 
1251 #ifdef HAS_AR64TOARGBROW_AVX2
// Convert 8 AR64 pixels (16 bits per channel) per loop iteration to 8-bit
// ARGB by keeping the high byte of each channel.
void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpsrlw      $8,%%ymm0,%%ymm0              \n"  // keep high byte per channel
      "vpsrlw      $8,%%ymm1,%%ymm1              \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // fix lane interleave
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x40(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
1277 #endif
1278 
#ifdef HAS_AB64TOARGBROW_AVX2
// Convert 8 AB64 pixels (64 bytes) to 8 ARGB pixels (32 bytes) per loop:
// keep the high byte of each 16 bit channel, then swap R and B with
// kShuffleARGBToABGR.
void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm2                  \n" LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpsrlw      $8,%%ymm0,%%ymm0              \n"  // keep high byte of each channel
      "vpsrlw      $8,%%ymm1,%%ymm1              \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // restore order after in-lane pack
      "vpshufb     %%ymm2,%%ymm0,%%ymm0          \n"  // swap R and B channels
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x40(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ab64),          // %0
        "+r"(dst_argb),          // %1
        "+r"(width)              // %2
      : "m"(kShuffleARGBToABGR)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif
1307 
1308 // clang-format off
1309 
1310 // TODO(mraptis): Consider passing R, G, B multipliers as parameter.
1311 // round parameter is register containing value to add before shift.
// Converts 16 pixels (64 bytes) to 16 Y values per loop iteration.
// Operands: %0 = src pixels, %1 = dst_y, %2 = width in pixels.
// Expects xmm4 preloaded with the byte coefficients and xmm5 with the
// kSub128 bias; `round` names the register whose value is added to the
// products before the >>8.  Clobbers xmm0-xmm3 and xmm6.
#define RGBTOY(round)                            \
  "1:                                        \n" \
  "movdqu    (%0),%%xmm0                     \n" \
  "movdqu    0x10(%0),%%xmm1                 \n" \
  "movdqu    0x20(%0),%%xmm2                 \n" \
  "movdqu    0x30(%0),%%xmm3                 \n" \
  "psubb     %%xmm5,%%xmm0                   \n" \
  "psubb     %%xmm5,%%xmm1                   \n" \
  "psubb     %%xmm5,%%xmm2                   \n" \
  "psubb     %%xmm5,%%xmm3                   \n" \
  "movdqu    %%xmm4,%%xmm6                   \n" \
  "pmaddubsw %%xmm0,%%xmm6                   \n" \
  "movdqu    %%xmm4,%%xmm0                   \n" \
  "pmaddubsw %%xmm1,%%xmm0                   \n" \
  "movdqu    %%xmm4,%%xmm1                   \n" \
  "pmaddubsw %%xmm2,%%xmm1                   \n" \
  "movdqu    %%xmm4,%%xmm2                   \n" \
  "pmaddubsw %%xmm3,%%xmm2                   \n" \
  "lea       0x40(%0),%0                     \n" \
  "phaddw    %%xmm0,%%xmm6                   \n" \
  "phaddw    %%xmm2,%%xmm1                   \n" \
  "prefetcht0 1280(%0)                       \n" \
  "paddw     %%" #round ",%%xmm6             \n" \
  "paddw     %%" #round ",%%xmm1             \n" \
  "psrlw     $0x8,%%xmm6                     \n" \
  "psrlw     $0x8,%%xmm1                     \n" \
  "packuswb  %%xmm1,%%xmm6                   \n" \
  "movdqu    %%xmm6,(%1)                     \n" \
  "lea       0x10(%1),%1                     \n" \
  "sub       $0x10,%2                        \n" \
  "jg        1b                              \n"
1343 
// AVX2 variant of RGBTOY: converts 32 pixels (128 bytes) per loop iteration.
// Expects ymm4 = coefficients, ymm5 = kSub128 bias, ymm6 = vpermd indices
// (kPermdARGBToY_AVX); `round` is the register added before the >>8.
// Note: ends with vzeroupper, so callers need not emit another one.
#define RGBTOY_AVX2(round)                                       \
  "1:                                        \n"                 \
  "vmovdqu    (%0),%%ymm0                    \n"                 \
  "vmovdqu    0x20(%0),%%ymm1                \n"                 \
  "vmovdqu    0x40(%0),%%ymm2                \n"                 \
  "vmovdqu    0x60(%0),%%ymm3                \n"                 \
  "vpsubb     %%ymm5, %%ymm0, %%ymm0         \n"                 \
  "vpsubb     %%ymm5, %%ymm1, %%ymm1         \n"                 \
  "vpsubb     %%ymm5, %%ymm2, %%ymm2         \n"                 \
  "vpsubb     %%ymm5, %%ymm3, %%ymm3         \n"                 \
  "vpmaddubsw %%ymm0,%%ymm4,%%ymm0           \n"                 \
  "vpmaddubsw %%ymm1,%%ymm4,%%ymm1           \n"                 \
  "vpmaddubsw %%ymm2,%%ymm4,%%ymm2           \n"                 \
  "vpmaddubsw %%ymm3,%%ymm4,%%ymm3           \n"                 \
  "lea       0x80(%0),%0                     \n"                 \
  "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n" /* mutates. */  \
  "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"                 \
  "prefetcht0 1280(%0)                       \n"                 \
  "vpaddw     %%" #round ",%%ymm0,%%ymm0     \n" /* Add .5 for rounding. */             \
  "vpaddw     %%" #round ",%%ymm2,%%ymm2     \n" \
  "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"                 \
  "vpsrlw     $0x8,%%ymm2,%%ymm2             \n"                 \
  "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n" /* mutates. */  \
  "vpermd     %%ymm0,%%ymm6,%%ymm0           \n" /* unmutate. */ \
  "vmovdqu    %%ymm0,(%1)                    \n"                 \
  "lea       0x20(%1),%1                     \n"                 \
  "sub       $0x20,%2                        \n"                 \
  "jg        1b                              \n"                 \
  "vzeroupper                                \n"
1373 
1374 // clang-format on
1375 
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values (limited range: +16).
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // kARGBToY coefficients
      "movdqa      %4,%%xmm5                     \n"  // kSub128 bias
      "movdqa      %5,%%xmm7                     \n"  // kAddY16, added before >>8

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_SSSE3
1395 
#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // kARGBToYJ coefficients
      "movdqa      %4,%%xmm5                     \n"  // kSub128; also used as round value

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBTOYJROW_SSSE3
1413 
#ifdef HAS_ABGRTOYJROW_SSSE3
// Convert 16 ABGR pixels (64 bytes) to 16 YJ values.
// Same as ABGRToYRow but different coefficients, no add 16.
void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // kABGRToYJ coefficients
      "movdqa      %4,%%xmm5                     \n"  // kSub128; also used as round value

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kABGRToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ABGRTOYJROW_SSSE3
1431 
#ifdef HAS_RGBATOYJROW_SSSE3
// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // kRGBAToYJ coefficients
      "movdqa      %4,%%xmm5                     \n"  // kSub128; also used as round value

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_RGBATOYJROW_SSSE3
1449 
#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \
    defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
// vpermd indices used after vphaddw + vpackuswb (which operate within
// 128 bit lanes) to restore the original pixel order.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
#endif
1455 
#ifdef HAS_ARGBTOYROW_AVX2

// Convert 32 ARGB pixels (128 bytes) to 32 Y values (limited range: +16).
// Fix: removed the trailing duplicate vzeroupper; RGBTOY_AVX2 already
// ends with one.
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"  // kARGBToY coefficients
      "vbroadcastf128 %4,%%ymm5                  \n"  // kSub128 bias
      "vbroadcastf128 %5,%%ymm7                  \n"  // kAddY16, added before >>8
      "vmovdqu     %6,%%ymm6                     \n"  // vpermd lane fixup

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_AVX2
1477 
#ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values (limited range: +16).
// Fix: removed the trailing duplicate vzeroupper; RGBTOY_AVX2 already
// ends with one.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"  // kABGRToY coefficients
      "vbroadcastf128 %4,%%ymm5                  \n"  // kSub128 bias
      "vbroadcastf128 %5,%%ymm7                  \n"  // kAddY16, added before >>8
      "vmovdqu     %6,%%ymm6                     \n"  // vpermd lane fixup

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_abgr),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kABGRToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOYROW_AVX2
1498 
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 YJ values (full range, no +16).
// Fix: removed the trailing duplicate vzeroupper; RGBTOY_AVX2 already
// ends with one.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"  // kARGBToYJ coefficients
      "vbroadcastf128 %4,%%ymm5                  \n"  // kSub128; also round value
      "vmovdqu     %5,%%ymm6                     \n"  // vpermd lane fixup

      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kPermdARGBToY_AVX)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYJROW_AVX2
1517 
#ifdef HAS_ABGRTOYJROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 YJ values (full range, no +16).
// Fix: removed the trailing duplicate vzeroupper; RGBTOY_AVX2 already
// ends with one.
void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"  // kABGRToYJ coefficients
      "vbroadcastf128 %4,%%ymm5                  \n"  // kSub128; also round value
      "vmovdqu     %5,%%ymm6                     \n"  // vpermd lane fixup

      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_abgr),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kABGRToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kPermdARGBToY_AVX)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOYJROW_AVX2
1536 
#ifdef HAS_RGBATOYJROW_AVX2
// Convert 32 RGBA pixels (128 bytes) to 32 YJ values (full range, no +16).
// Fix: removed the trailing duplicate vzeroupper; RGBTOY_AVX2 already
// ends with one.
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"  // kRGBAToYJ coefficients
      "vbroadcastf128 %4,%%ymm5                  \n"  // kSub128; also round value
      "vmovdqu     %5,%%ymm6                     \n"  // vpermd lane fixup

      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_rgba),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kRGBAToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kPermdARGBToY_AVX)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_RGBATOYJROW_AVX2
1554 
#ifdef HAS_ARGBTOUVROW_SSSE3
// Convert 16 ARGB pixels from two rows to 8 U and 8 V values, using the
// 2x2 subsampled average of the rows.
// Fix: xmm3, xmm4 and xmm5 are written by the initial movdqa loads but
// were missing from the clobber list, so the compiler could assume they
// were preserved across the asm.
void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"  // kARGBToV
      "movdqa      %6,%%xmm4                     \n"  // kARGBToU
      "movdqa      %7,%%xmm5                     \n"  // kAddUV128
      "sub         %1,%2                         \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1:                                        \n"
      // Average vertically with the next row.
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      // Average horizontally adjacent pixels.
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      // Apply U and V coefficients, bias and store.
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"  // 8 U values
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"  // 8 V values
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kARGBToV),                     // %5
        "m"(kARGBToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVROW_SSSE3
1619 
#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \
    defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2)
// vpshufb table used after vphaddw + vpackuswb to restore the order of
// shorts packed within 128 bit lanes.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
#endif
1627 
#if defined(HAS_ARGBTOUVROW_AVX2)
// Convert 32 ARGB pixels from two rows to 16 U and 16 V values, using the
// 2x2 subsampled average of the rows.
void ARGBToUVRow_AVX2(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"  // kAddUV128
      "vbroadcastf128 %6,%%ymm6                  \n"  // kARGBToV
      "vbroadcastf128 %7,%%ymm7                  \n"  // kARGBToU
      "sub         %1,%2                         \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1:                                        \n"
      // Average vertically with the next row.
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
      "lea         0x80(%0),%0                   \n"
      // Average horizontally adjacent pixels.
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"

      // Apply U and V coefficients, then bias by kAddUV128.
      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %8,%%ymm0,%%ymm0              \n"
      "vpaddb      %%ymm5,%%ymm0,%%ymm0          \n"

      "vextractf128 $0x0,%%ymm0,(%1)             \n"  // 16 U values
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"  // 16 V values
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kAddUV128),                    // %5
        "m"(kARGBToV),                     // %6
        "m"(kARGBToU),                     // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVROW_AVX2
1690 
#ifdef HAS_ABGRTOUVROW_AVX2
// Convert 32 ABGR pixels from two rows to 16 U and 16 V values, using the
// 2x2 subsampled average of the rows.
void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"  // kAddUV128
      "vbroadcastf128 %6,%%ymm6                  \n"  // kABGRToV
      "vbroadcastf128 %7,%%ymm7                  \n"  // kABGRToU
      "sub         %1,%2                         \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1:                                        \n"
      // Average vertically with the next row.
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
      "lea         0x80(%0),%0                   \n"
      // Average horizontally adjacent pixels.
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"

      // Apply U and V coefficients, then bias by kAddUV128.
      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %8,%%ymm0,%%ymm0              \n"
      "vpaddb      %%ymm5,%%ymm0,%%ymm0          \n"

      "vextractf128 $0x0,%%ymm0,(%1)             \n"  // 16 U values
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"  // 16 V values
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_abgr),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kAddUV128),                    // %5
        "m"(kABGRToV),                     // %6
        "m"(kABGRToU),                     // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOUVROW_AVX2
1753 
#ifdef HAS_ARGBTOUVJROW_AVX2
// Convert 32 ARGB pixels from two rows to 16 UJ and 16 VJ values
// (JPeg full range: kSub128 is added as a word rounding bias before the
// arithmetic shift instead of a byte bias after packing).
void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"  // kSub128 rounding bias
      "vbroadcastf128 %6,%%ymm6                  \n"  // kARGBToVJ
      "vbroadcastf128 %7,%%ymm7                  \n"  // kARGBToUJ
      "sub         %1,%2                         \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1:                                        \n"
      // Average vertically with the next row.
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
      "lea         0x80(%0),%0                   \n"
      // Average horizontally adjacent pixels.
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"

      // Apply UJ and VJ coefficients; round before the shift.
      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %8,%%ymm0,%%ymm0              \n"

      "vextractf128 $0x0,%%ymm0,(%1)             \n"  // 16 U values
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"  // 16 V values
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kSub128),                      // %5
        "m"(kARGBToVJ),                    // %6
        "m"(kARGBToUJ),                    // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVJROW_AVX2
1817 
1818 // TODO(fbarchard): Pass kABGRToVJ / kABGRToUJ as matrix
#ifdef HAS_ABGRTOUVJROW_AVX2
// Convert 32 ABGR pixels from two rows to 16 UJ and 16 VJ values
// (JPeg full range: kSub128 is added as a word rounding bias before the
// arithmetic shift instead of a byte bias after packing).
void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
                       int src_stride_abgr,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"  // kSub128 rounding bias
      "vbroadcastf128 %6,%%ymm6                  \n"  // kABGRToVJ
      "vbroadcastf128 %7,%%ymm7                  \n"  // kABGRToUJ
      "sub         %1,%2                         \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1:                                        \n"
      // Average vertically with the next row.
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
      "lea         0x80(%0),%0                   \n"
      // Average horizontally adjacent pixels.
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"

      // Apply UJ and VJ coefficients; round before the shift.
      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %8,%%ymm0,%%ymm0              \n"

      "vextractf128 $0x0,%%ymm0,(%1)             \n"  // 16 U values
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"  // 16 V values
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_abgr),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kSub128),                      // %5
        "m"(kABGRToVJ),                    // %6
        "m"(kABGRToUJ),                    // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOUVJROW_AVX2
1882 
1883 #ifdef HAS_ARGBTOUVJROW_SSSE3
ARGBToUVJRow_SSSE3(const uint8_t * src_argb,int src_stride_argb,uint8_t * dst_u,uint8_t * dst_v,int width)1884 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
1885                         int src_stride_argb,
1886                         uint8_t* dst_u,
1887                         uint8_t* dst_v,
1888                         int width) {
1889   asm volatile(
1890       "movdqa      %5,%%xmm3                     \n"
1891       "movdqa      %6,%%xmm4                     \n"
1892       "movdqa      %7,%%xmm5                     \n"
1893       "sub         %1,%2                         \n"
1894 
1895       LABELALIGN
1896       "1:                                        \n"
1897       "movdqu      (%0),%%xmm0                   \n"
1898       "movdqu      0x00(%0,%4,1),%%xmm7          \n"
1899       "pavgb       %%xmm7,%%xmm0                 \n"
1900       "movdqu      0x10(%0),%%xmm1               \n"
1901       "movdqu      0x10(%0,%4,1),%%xmm7          \n"
1902       "pavgb       %%xmm7,%%xmm1                 \n"
1903       "movdqu      0x20(%0),%%xmm2               \n"
1904       "movdqu      0x20(%0,%4,1),%%xmm7          \n"
1905       "pavgb       %%xmm7,%%xmm2                 \n"
1906       "movdqu      0x30(%0),%%xmm6               \n"
1907       "movdqu      0x30(%0,%4,1),%%xmm7          \n"
1908       "pavgb       %%xmm7,%%xmm6                 \n"
1909 
1910       "lea         0x40(%0),%0                   \n"
1911       "movdqa      %%xmm0,%%xmm7                 \n"
1912       "shufps      $0x88,%%xmm1,%%xmm0           \n"
1913       "shufps      $0xdd,%%xmm1,%%xmm7           \n"
1914       "pavgb       %%xmm7,%%xmm0                 \n"
1915       "movdqa      %%xmm2,%%xmm7                 \n"
1916       "shufps      $0x88,%%xmm6,%%xmm2           \n"
1917       "shufps      $0xdd,%%xmm6,%%xmm7           \n"
1918       "pavgb       %%xmm7,%%xmm2                 \n"
1919       "movdqa      %%xmm0,%%xmm1                 \n"
1920       "movdqa      %%xmm2,%%xmm6                 \n"
1921       "pmaddubsw   %%xmm4,%%xmm0                 \n"
1922       "pmaddubsw   %%xmm4,%%xmm2                 \n"
1923       "pmaddubsw   %%xmm3,%%xmm1                 \n"
1924       "pmaddubsw   %%xmm3,%%xmm6                 \n"
1925       "phaddw      %%xmm2,%%xmm0                 \n"
1926       "phaddw      %%xmm6,%%xmm1                 \n"
1927       "paddw       %%xmm5,%%xmm0                 \n"
1928       "paddw       %%xmm5,%%xmm1                 \n"
1929       "psraw       $0x8,%%xmm0                   \n"
1930       "psraw       $0x8,%%xmm1                   \n"
1931       "packsswb    %%xmm1,%%xmm0                 \n"
1932       "movlps      %%xmm0,(%1)                   \n"
1933       "movhps      %%xmm0,0x00(%1,%2,1)          \n"
1934       "lea         0x8(%1),%1                    \n"
1935       "sub         $0x10,%3                      \n"
1936       "jg          1b                            \n"
1937       : "+r"(src_argb),                    // %0
1938         "+r"(dst_u),                       // %1
1939         "+r"(dst_v),                       // %2
1940         "+rm"(width)                       // %3
1941       : "r"((intptr_t)(src_stride_argb)),  // %4
1942         "m"(kARGBToVJ),                    // %5
1943         "m"(kARGBToUJ),                    // %6
1944         "m"(kSub128)                       // %7
1945       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1946 }
1947 #endif  // HAS_ARGBTOUVJROW_SSSE3
1948 
1949 #ifdef HAS_ABGRTOUVJROW_SSSE3
ABGRToUVJRow_SSSE3(const uint8_t * src_abgr,int src_stride_abgr,uint8_t * dst_u,uint8_t * dst_v,int width)1950 void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr,
1951                         int src_stride_abgr,
1952                         uint8_t* dst_u,
1953                         uint8_t* dst_v,
1954                         int width) {
1955   asm volatile(
1956       "movdqa      %5,%%xmm3                     \n"
1957       "movdqa      %6,%%xmm4                     \n"
1958       "movdqa      %7,%%xmm5                     \n"
1959       "sub         %1,%2                         \n"
1960 
1961       LABELALIGN
1962       "1:                                        \n"
1963       "movdqu      (%0),%%xmm0                   \n"
1964       "movdqu      0x00(%0,%4,1),%%xmm7          \n"
1965       "pavgb       %%xmm7,%%xmm0                 \n"
1966       "movdqu      0x10(%0),%%xmm1               \n"
1967       "movdqu      0x10(%0,%4,1),%%xmm7          \n"
1968       "pavgb       %%xmm7,%%xmm1                 \n"
1969       "movdqu      0x20(%0),%%xmm2               \n"
1970       "movdqu      0x20(%0,%4,1),%%xmm7          \n"
1971       "pavgb       %%xmm7,%%xmm2                 \n"
1972       "movdqu      0x30(%0),%%xmm6               \n"
1973       "movdqu      0x30(%0,%4,1),%%xmm7          \n"
1974       "pavgb       %%xmm7,%%xmm6                 \n"
1975 
1976       "lea         0x40(%0),%0                   \n"
1977       "movdqa      %%xmm0,%%xmm7                 \n"
1978       "shufps      $0x88,%%xmm1,%%xmm0           \n"
1979       "shufps      $0xdd,%%xmm1,%%xmm7           \n"
1980       "pavgb       %%xmm7,%%xmm0                 \n"
1981       "movdqa      %%xmm2,%%xmm7                 \n"
1982       "shufps      $0x88,%%xmm6,%%xmm2           \n"
1983       "shufps      $0xdd,%%xmm6,%%xmm7           \n"
1984       "pavgb       %%xmm7,%%xmm2                 \n"
1985       "movdqa      %%xmm0,%%xmm1                 \n"
1986       "movdqa      %%xmm2,%%xmm6                 \n"
1987       "pmaddubsw   %%xmm4,%%xmm0                 \n"
1988       "pmaddubsw   %%xmm4,%%xmm2                 \n"
1989       "pmaddubsw   %%xmm3,%%xmm1                 \n"
1990       "pmaddubsw   %%xmm3,%%xmm6                 \n"
1991       "phaddw      %%xmm2,%%xmm0                 \n"
1992       "phaddw      %%xmm6,%%xmm1                 \n"
1993       "paddw       %%xmm5,%%xmm0                 \n"
1994       "paddw       %%xmm5,%%xmm1                 \n"
1995       "psraw       $0x8,%%xmm0                   \n"
1996       "psraw       $0x8,%%xmm1                   \n"
1997       "packsswb    %%xmm1,%%xmm0                 \n"
1998       "movlps      %%xmm0,(%1)                   \n"
1999       "movhps      %%xmm0,0x00(%1,%2,1)          \n"
2000       "lea         0x8(%1),%1                    \n"
2001       "sub         $0x10,%3                      \n"
2002       "jg          1b                            \n"
2003       : "+r"(src_abgr),                    // %0
2004         "+r"(dst_u),                       // %1
2005         "+r"(dst_v),                       // %2
2006         "+rm"(width)                       // %3
2007       : "r"((intptr_t)(src_stride_abgr)),  // %4
2008         "m"(kABGRToVJ),                    // %5
2009         "m"(kABGRToUJ),                    // %6
2010         "m"(kSub128)                       // %7
2011       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
2012 }
2013 #endif  // HAS_ABGRTOUVJROW_SSSE3
2014 
2015 #ifdef HAS_ARGBTOUV444ROW_SSSE3
ARGBToUV444Row_SSSE3(const uint8_t * src_argb,uint8_t * dst_u,uint8_t * dst_v,int width)2016 void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
2017                           uint8_t* dst_u,
2018                           uint8_t* dst_v,
2019                           int width) {
2020   asm volatile(
2021       "movdqa      %4,%%xmm3                     \n"
2022       "movdqa      %5,%%xmm4                     \n"
2023       "movdqa      %6,%%xmm5                     \n"
2024       "sub         %1,%2                         \n"
2025 
2026       LABELALIGN
2027       "1:                                        \n"
2028       "movdqu      (%0),%%xmm0                   \n"
2029       "movdqu      0x10(%0),%%xmm1               \n"
2030       "movdqu      0x20(%0),%%xmm2               \n"
2031       "movdqu      0x30(%0),%%xmm6               \n"
2032       "pmaddubsw   %%xmm4,%%xmm0                 \n"
2033       "pmaddubsw   %%xmm4,%%xmm1                 \n"
2034       "pmaddubsw   %%xmm4,%%xmm2                 \n"
2035       "pmaddubsw   %%xmm4,%%xmm6                 \n"
2036       "phaddw      %%xmm1,%%xmm0                 \n"
2037       "phaddw      %%xmm6,%%xmm2                 \n"
2038       "psraw       $0x8,%%xmm0                   \n"
2039       "psraw       $0x8,%%xmm2                   \n"
2040       "packsswb    %%xmm2,%%xmm0                 \n"
2041       "paddb       %%xmm5,%%xmm0                 \n"
2042       "movdqu      %%xmm0,(%1)                   \n"
2043       "movdqu      (%0),%%xmm0                   \n"
2044       "movdqu      0x10(%0),%%xmm1               \n"
2045       "movdqu      0x20(%0),%%xmm2               \n"
2046       "movdqu      0x30(%0),%%xmm6               \n"
2047       "pmaddubsw   %%xmm3,%%xmm0                 \n"
2048       "pmaddubsw   %%xmm3,%%xmm1                 \n"
2049       "pmaddubsw   %%xmm3,%%xmm2                 \n"
2050       "pmaddubsw   %%xmm3,%%xmm6                 \n"
2051       "phaddw      %%xmm1,%%xmm0                 \n"
2052       "phaddw      %%xmm6,%%xmm2                 \n"
2053       "psraw       $0x8,%%xmm0                   \n"
2054       "psraw       $0x8,%%xmm2                   \n"
2055       "packsswb    %%xmm2,%%xmm0                 \n"
2056       "paddb       %%xmm5,%%xmm0                 \n"
2057       "lea         0x40(%0),%0                   \n"
2058       "movdqu      %%xmm0,0x00(%1,%2,1)          \n"
2059       "lea         0x10(%1),%1                   \n"
2060       "sub         $0x10,%3                      \n"
2061       "jg          1b                            \n"
2062       : "+r"(src_argb),  // %0
2063         "+r"(dst_u),     // %1
2064         "+r"(dst_v),     // %2
2065         "+rm"(width)     // %3
2066       : "m"(kARGBToV),   // %4
2067         "m"(kARGBToU),   // %5
2068         "m"(kAddUV128)   // %6
2069       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
2070 }
2071 #endif  // HAS_ARGBTOUV444ROW_SSSE3
2072 
// Convert a row of BGRA pixels to Y (luma) using the shared RGBTOY
// loop macro (defined earlier in this file).  xmm4/xmm5/xmm7 carry the
// BGRA luma coefficients and the bias constants the macro expects.
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // luma coefficients
      "movdqa      %4,%%xmm5                     \n"  // kSub128 bias
      "movdqa      %5,%%xmm7                     \n"  // kAddY16 offset

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_bgra),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kBGRAToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
2089 
BGRAToUVRow_SSSE3(const uint8_t * src_bgra,int src_stride_bgra,uint8_t * dst_u,uint8_t * dst_v,int width)2090 void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
2091                        int src_stride_bgra,
2092                        uint8_t* dst_u,
2093                        uint8_t* dst_v,
2094                        int width) {
2095   asm volatile(
2096       "movdqa      %5,%%xmm3                     \n"
2097       "movdqa      %6,%%xmm4                     \n"
2098       "movdqa      %7,%%xmm5                     \n"
2099       "sub         %1,%2                         \n"
2100 
2101       LABELALIGN
2102       "1:                                        \n"
2103       "movdqu      (%0),%%xmm0                   \n"
2104       "movdqu      0x00(%0,%4,1),%%xmm7          \n"
2105       "pavgb       %%xmm7,%%xmm0                 \n"
2106       "movdqu      0x10(%0),%%xmm1               \n"
2107       "movdqu      0x10(%0,%4,1),%%xmm7          \n"
2108       "pavgb       %%xmm7,%%xmm1                 \n"
2109       "movdqu      0x20(%0),%%xmm2               \n"
2110       "movdqu      0x20(%0,%4,1),%%xmm7          \n"
2111       "pavgb       %%xmm7,%%xmm2                 \n"
2112       "movdqu      0x30(%0),%%xmm6               \n"
2113       "movdqu      0x30(%0,%4,1),%%xmm7          \n"
2114       "pavgb       %%xmm7,%%xmm6                 \n"
2115 
2116       "lea         0x40(%0),%0                   \n"
2117       "movdqa      %%xmm0,%%xmm7                 \n"
2118       "shufps      $0x88,%%xmm1,%%xmm0           \n"
2119       "shufps      $0xdd,%%xmm1,%%xmm7           \n"
2120       "pavgb       %%xmm7,%%xmm0                 \n"
2121       "movdqa      %%xmm2,%%xmm7                 \n"
2122       "shufps      $0x88,%%xmm6,%%xmm2           \n"
2123       "shufps      $0xdd,%%xmm6,%%xmm7           \n"
2124       "pavgb       %%xmm7,%%xmm2                 \n"
2125       "movdqa      %%xmm0,%%xmm1                 \n"
2126       "movdqa      %%xmm2,%%xmm6                 \n"
2127       "pmaddubsw   %%xmm4,%%xmm0                 \n"
2128       "pmaddubsw   %%xmm4,%%xmm2                 \n"
2129       "pmaddubsw   %%xmm3,%%xmm1                 \n"
2130       "pmaddubsw   %%xmm3,%%xmm6                 \n"
2131       "phaddw      %%xmm2,%%xmm0                 \n"
2132       "phaddw      %%xmm6,%%xmm1                 \n"
2133       "psraw       $0x8,%%xmm0                   \n"
2134       "psraw       $0x8,%%xmm1                   \n"
2135       "packsswb    %%xmm1,%%xmm0                 \n"
2136       "paddb       %%xmm5,%%xmm0                 \n"
2137       "movlps      %%xmm0,(%1)                   \n"
2138       "movhps      %%xmm0,0x00(%1,%2,1)          \n"
2139       "lea         0x8(%1),%1                    \n"
2140       "sub         $0x10,%3                      \n"
2141       "jg          1b                            \n"
2142       : "+r"(src_bgra),                    // %0
2143         "+r"(dst_u),                       // %1
2144         "+r"(dst_v),                       // %2
2145         "+rm"(width)                       // %3
2146       : "r"((intptr_t)(src_stride_bgra)),  // %4
2147         "m"(kBGRAToV),                     // %5
2148         "m"(kBGRAToU),                     // %6
2149         "m"(kAddUV128)                     // %7
2150       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
2151 }
2152 
// Convert a row of ABGR pixels to Y (luma) using the shared RGBTOY
// loop macro (defined earlier in this file).  xmm4/xmm5/xmm7 carry the
// ABGR luma coefficients and the bias constants the macro expects.
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // luma coefficients
      "movdqa      %4,%%xmm5                     \n"  // kSub128 bias
      "movdqa      %5,%%xmm7                     \n"  // kAddY16 offset

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kABGRToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
2169 
// Convert a row of RGBA pixels to Y (luma) using the shared RGBTOY
// loop macro (defined earlier in this file).  xmm4/xmm5/xmm7 carry the
// RGBA luma coefficients and the bias constants the macro expects.
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // luma coefficients
      "movdqa      %4,%%xmm5                     \n"  // kSub128 bias
      "movdqa      %5,%%xmm7                     \n"  // kAddY16 offset

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
2186 
ABGRToUVRow_SSSE3(const uint8_t * src_abgr,int src_stride_abgr,uint8_t * dst_u,uint8_t * dst_v,int width)2187 void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
2188                        int src_stride_abgr,
2189                        uint8_t* dst_u,
2190                        uint8_t* dst_v,
2191                        int width) {
2192   asm volatile(
2193       "movdqa      %5,%%xmm3                     \n"
2194       "movdqa      %6,%%xmm4                     \n"
2195       "movdqa      %7,%%xmm5                     \n"
2196       "sub         %1,%2                         \n"
2197 
2198       LABELALIGN
2199       "1:                                        \n"
2200       "movdqu      (%0),%%xmm0                   \n"
2201       "movdqu      0x00(%0,%4,1),%%xmm7          \n"
2202       "pavgb       %%xmm7,%%xmm0                 \n"
2203       "movdqu      0x10(%0),%%xmm1               \n"
2204       "movdqu      0x10(%0,%4,1),%%xmm7          \n"
2205       "pavgb       %%xmm7,%%xmm1                 \n"
2206       "movdqu      0x20(%0),%%xmm2               \n"
2207       "movdqu      0x20(%0,%4,1),%%xmm7          \n"
2208       "pavgb       %%xmm7,%%xmm2                 \n"
2209       "movdqu      0x30(%0),%%xmm6               \n"
2210       "movdqu      0x30(%0,%4,1),%%xmm7          \n"
2211       "pavgb       %%xmm7,%%xmm6                 \n"
2212 
2213       "lea         0x40(%0),%0                   \n"
2214       "movdqa      %%xmm0,%%xmm7                 \n"
2215       "shufps      $0x88,%%xmm1,%%xmm0           \n"
2216       "shufps      $0xdd,%%xmm1,%%xmm7           \n"
2217       "pavgb       %%xmm7,%%xmm0                 \n"
2218       "movdqa      %%xmm2,%%xmm7                 \n"
2219       "shufps      $0x88,%%xmm6,%%xmm2           \n"
2220       "shufps      $0xdd,%%xmm6,%%xmm7           \n"
2221       "pavgb       %%xmm7,%%xmm2                 \n"
2222       "movdqa      %%xmm0,%%xmm1                 \n"
2223       "movdqa      %%xmm2,%%xmm6                 \n"
2224       "pmaddubsw   %%xmm4,%%xmm0                 \n"
2225       "pmaddubsw   %%xmm4,%%xmm2                 \n"
2226       "pmaddubsw   %%xmm3,%%xmm1                 \n"
2227       "pmaddubsw   %%xmm3,%%xmm6                 \n"
2228       "phaddw      %%xmm2,%%xmm0                 \n"
2229       "phaddw      %%xmm6,%%xmm1                 \n"
2230       "psraw       $0x8,%%xmm0                   \n"
2231       "psraw       $0x8,%%xmm1                   \n"
2232       "packsswb    %%xmm1,%%xmm0                 \n"
2233       "paddb       %%xmm5,%%xmm0                 \n"
2234       "movlps      %%xmm0,(%1)                   \n"
2235       "movhps      %%xmm0,0x00(%1,%2,1)          \n"
2236       "lea         0x8(%1),%1                    \n"
2237       "sub         $0x10,%3                      \n"
2238       "jg          1b                            \n"
2239       : "+r"(src_abgr),                    // %0
2240         "+r"(dst_u),                       // %1
2241         "+r"(dst_v),                       // %2
2242         "+rm"(width)                       // %3
2243       : "r"((intptr_t)(src_stride_abgr)),  // %4
2244         "m"(kABGRToV),                     // %5
2245         "m"(kABGRToU),                     // %6
2246         "m"(kAddUV128)                     // %7
2247       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
2248 }
2249 
RGBAToUVRow_SSSE3(const uint8_t * src_rgba,int src_stride_rgba,uint8_t * dst_u,uint8_t * dst_v,int width)2250 void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
2251                        int src_stride_rgba,
2252                        uint8_t* dst_u,
2253                        uint8_t* dst_v,
2254                        int width) {
2255   asm volatile(
2256       "movdqa      %5,%%xmm3                     \n"
2257       "movdqa      %6,%%xmm4                     \n"
2258       "movdqa      %7,%%xmm5                     \n"
2259       "sub         %1,%2                         \n"
2260 
2261       LABELALIGN
2262       "1:                                        \n"
2263       "movdqu      (%0),%%xmm0                   \n"
2264       "movdqu      0x00(%0,%4,1),%%xmm7          \n"
2265       "pavgb       %%xmm7,%%xmm0                 \n"
2266       "movdqu      0x10(%0),%%xmm1               \n"
2267       "movdqu      0x10(%0,%4,1),%%xmm7          \n"
2268       "pavgb       %%xmm7,%%xmm1                 \n"
2269       "movdqu      0x20(%0),%%xmm2               \n"
2270       "movdqu      0x20(%0,%4,1),%%xmm7          \n"
2271       "pavgb       %%xmm7,%%xmm2                 \n"
2272       "movdqu      0x30(%0),%%xmm6               \n"
2273       "movdqu      0x30(%0,%4,1),%%xmm7          \n"
2274       "pavgb       %%xmm7,%%xmm6                 \n"
2275 
2276       "lea         0x40(%0),%0                   \n"
2277       "movdqa      %%xmm0,%%xmm7                 \n"
2278       "shufps      $0x88,%%xmm1,%%xmm0           \n"
2279       "shufps      $0xdd,%%xmm1,%%xmm7           \n"
2280       "pavgb       %%xmm7,%%xmm0                 \n"
2281       "movdqa      %%xmm2,%%xmm7                 \n"
2282       "shufps      $0x88,%%xmm6,%%xmm2           \n"
2283       "shufps      $0xdd,%%xmm6,%%xmm7           \n"
2284       "pavgb       %%xmm7,%%xmm2                 \n"
2285       "movdqa      %%xmm0,%%xmm1                 \n"
2286       "movdqa      %%xmm2,%%xmm6                 \n"
2287       "pmaddubsw   %%xmm4,%%xmm0                 \n"
2288       "pmaddubsw   %%xmm4,%%xmm2                 \n"
2289       "pmaddubsw   %%xmm3,%%xmm1                 \n"
2290       "pmaddubsw   %%xmm3,%%xmm6                 \n"
2291       "phaddw      %%xmm2,%%xmm0                 \n"
2292       "phaddw      %%xmm6,%%xmm1                 \n"
2293       "psraw       $0x8,%%xmm0                   \n"
2294       "psraw       $0x8,%%xmm1                   \n"
2295       "packsswb    %%xmm1,%%xmm0                 \n"
2296       "paddb       %%xmm5,%%xmm0                 \n"
2297       "movlps      %%xmm0,(%1)                   \n"
2298       "movhps      %%xmm0,0x00(%1,%2,1)          \n"
2299       "lea         0x8(%1),%1                    \n"
2300       "sub         $0x10,%3                      \n"
2301       "jg          1b                            \n"
2302       : "+r"(src_rgba),                    // %0
2303         "+r"(dst_u),                       // %1
2304         "+r"(dst_v),                       // %2
2305         "+rm"(width)                       // %3
2306       : "r"((intptr_t)(src_stride_rgba)),  // %4
2307         "m"(kRGBAToV),                     // %5
2308         "m"(kRGBAToU),                     // %6
2309         "m"(kAddUV128)                     // %7
2310       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
2311 }
2312 
2313 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
2314 
// Read 8 UV from 444.  The READ* helpers below load interleaved U/V
// into xmm3 and Y into xmm4 for the YUV-to-RGB conversion loops.
#define READYUV444                                                \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"

// Read 4 UV from 422, upsample to 8 UV by duplicating each UV pair.
#define READYUV422                                                \
  "movd       (%[u_buf]),%%xmm3                               \n" \
  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"

// Read 4 UV from 422 10 bit, upsample to 8 UV.  UV is shifted down to
// 8 bit; Y is expanded from 10 to 16 bit (y << 6 | y >> 4).
#define READYUV210                                                \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "movdqa     %%xmm4,%%xmm2                                   \n" \
  "psllw      $6,%%xmm4                                       \n" \
  "psrlw      $4,%%xmm2                                       \n" \
  "paddw      %%xmm2,%%xmm4                                   \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"
2351 
// Read 4 UV from 422 10 bit, upsample to 8 UV.  With 8 Alpha (loaded
// into xmm5, shifted from 10 to 8 bit).
#define READYUVA210                                               \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "movdqa     %%xmm4,%%xmm2                                   \n" \
  "psllw      $6,%%xmm4                                       \n" \
  "psrlw      $4,%%xmm2                                       \n" \
  "paddw      %%xmm2,%%xmm4                                   \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n" \
  "movdqu     (%[a_buf]),%%xmm5                               \n" \
  "psraw      $2,%%xmm5                                       \n" \
  "packuswb   %%xmm5,%%xmm5                                   \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                         \n"

// Read 8 UV from 444 10 bit
#define READYUV410                                                \
  "movdqu     (%[u_buf]),%%xmm3                               \n" \
  "movdqu     0x00(%[u_buf],%[v_buf],1),%%xmm2                \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                         \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "psraw      $2,%%xmm2                                       \n" \
  "movdqa     %%xmm3,%%xmm1                                   \n" \
  "punpcklwd  %%xmm2,%%xmm3                                   \n" \
  "punpckhwd  %%xmm2,%%xmm1                                   \n" \
  "packuswb   %%xmm1,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "movdqa     %%xmm4,%%xmm2                                   \n" \
  "psllw      $6,%%xmm4                                       \n" \
  "psrlw      $4,%%xmm2                                       \n" \
  "paddw      %%xmm2,%%xmm4                                   \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"

// Read 8 UV from 444 10 bit.  With 8 Alpha.
#define READYUVA410                                               \
  "movdqu     (%[u_buf]),%%xmm3                               \n" \
  "movdqu     0x00(%[u_buf],%[v_buf],1),%%xmm2                \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                         \n" \
  "psraw      $2,%%xmm3                                       \n" \
  "psraw      $2,%%xmm2                                       \n" \
  "movdqa     %%xmm3,%%xmm1                                   \n" \
  "punpcklwd  %%xmm2,%%xmm3                                   \n" \
  "punpckhwd  %%xmm2,%%xmm1                                   \n" \
  "packuswb   %%xmm1,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "movdqa     %%xmm4,%%xmm2                                   \n" \
  "psllw      $6,%%xmm4                                       \n" \
  "psrlw      $4,%%xmm2                                       \n" \
  "paddw      %%xmm2,%%xmm4                                   \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n" \
  "movdqu     (%[a_buf]),%%xmm5                               \n" \
  "psraw      $2,%%xmm5                                       \n" \
  "packuswb   %%xmm5,%%xmm5                                   \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                         \n"
2410 
// Read 4 UV from 422 12 bit, upsample to 8 UV.  UV is shifted from 12
// to 8 bit; Y is expanded from 12 to 16 bit (y << 4 | y >> 8).
#define READYUV212                                                \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklwd  %%xmm1,%%xmm3                                   \n" \
  "psraw      $0x4,%%xmm3                                     \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "movdqa     %%xmm4,%%xmm2                                   \n" \
  "psllw      $4,%%xmm4                                       \n" \
  "psrlw      $8,%%xmm2                                       \n" \
  "paddw      %%xmm2,%%xmm4                                   \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"

// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha (loaded into
// xmm5).
#define READYUVA422                                               \
  "movd       (%[u_buf]),%%xmm3                               \n" \
  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
  "movq       (%[a_buf]),%%xmm5                               \n" \
  "lea        0x8(%[a_buf]),%[a_buf]                          \n"

// Read 8 UV from 444.  With 8 Alpha (loaded into xmm5).
#define READYUVA444                                               \
  "movq       (%[u_buf]),%%xmm3                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm3                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
  "movq       (%[a_buf]),%%xmm5                               \n" \
  "lea        0x8(%[a_buf]),%[a_buf]                          \n"
2451 
2452 // Read 4 UV from NV12, upsample to 8 UV
2453 #define READNV12                                                  \
2454   "movq       (%[uv_buf]),%%xmm3                              \n" \
2455   "lea        0x8(%[uv_buf]),%[uv_buf]                        \n" \
2456   "punpcklwd  %%xmm3,%%xmm3                                   \n" \
2457   "movq       (%[y_buf]),%%xmm4                               \n" \
2458   "punpcklbw  %%xmm4,%%xmm4                                   \n" \
2459   "lea        0x8(%[y_buf]),%[y_buf]                          \n"
2460 
// Read 4 interleaved VU pairs from NV21 and upsample to 8 UV.  The
// pshufb with kShuffleNV21 (mask defined elsewhere in this file)
// presumably both swaps V/U byte order and duplicates each pair for two
// pixels in one step.  Y is read and duplicated as in READNV12.
#define READNV21                                                  \
  "movq       (%[vu_buf]),%%xmm3                              \n" \
  "lea        0x8(%[vu_buf]),%[vu_buf]                        \n" \
  "pshufb     %[kShuffleNV21], %%xmm3                         \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
2469 
// Read 16 bytes of packed YUY2 (8 Y, 4 UV pairs).  The same load is
// shuffled twice: kShuffleYUY2Y extracts/duplicates the 8 Y samples into
// xmm4, kShuffleYUY2UV upsamples the 4 UV pairs to 8 UV in xmm3 (masks
// defined elsewhere in this file).
#define READYUY2                                                  \
  "movdqu     (%[yuy2_buf]),%%xmm4                            \n" \
  "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n" \
  "movdqu     (%[yuy2_buf]),%%xmm3                            \n" \
  "pshufb     %[kShuffleYUY2UV], %%xmm3                       \n" \
  "lea        0x10(%[yuy2_buf]),%[yuy2_buf]                   \n"
2477 
// Read 16 bytes of packed UYVY (8 Y, 4 UV pairs) — same scheme as
// READYUY2 but with the UYVY byte ordering handled by the kShuffleUYVY*
// masks (defined elsewhere in this file).
#define READUYVY                                                  \
  "movdqu     (%[uyvy_buf]),%%xmm4                            \n" \
  "pshufb     %[kShuffleUYVYY], %%xmm4                        \n" \
  "movdqu     (%[uyvy_buf]),%%xmm3                            \n" \
  "pshufb     %[kShuffleUYVYUV], %%xmm3                       \n" \
  "lea        0x10(%[uyvy_buf]),%[uyvy_buf]                   \n"
2485 
// Read 4 UV pairs of P210 chroma (16-bit, MSB-aligned samples): shift
// right 8 to keep the significant high byte, pack to 8-bit, then
// duplicate each UV pair for two pixels (punpcklwd).  Y is loaded as 8
// 16-bit values with no widening — already in the high-bit form that
// pmulhuw in YUVTORGB16 expects.
#define READP210                                                  \
  "movdqu     (%[uv_buf]),%%xmm3                              \n" \
  "lea        0x10(%[uv_buf]),%[uv_buf]                       \n" \
  "psrlw      $0x8,%%xmm3                                     \n" \
  "packuswb   %%xmm3,%%xmm3                                   \n" \
  "punpcklwd  %%xmm3,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"
2495 
// Read 8 UV pairs of P410 chroma (full resolution, 16-bit MSB-aligned):
// two 16-byte loads, shift each right 8 to keep the high byte, and pack
// the 32 samples down to 16 interleaved UV bytes.  No upsampling needed.
// Y is loaded as 8 16-bit values for pmulhuw in YUVTORGB16.
#define READP410                                                  \
  "movdqu     (%[uv_buf]),%%xmm3                              \n" \
  "movdqu     0x10(%[uv_buf]),%%xmm1                          \n" \
  "lea        0x20(%[uv_buf]),%[uv_buf]                       \n" \
  "psrlw      $0x8,%%xmm3                                     \n" \
  "psrlw      $0x8,%%xmm1                                     \n" \
  "packuswb   %%xmm1,%%xmm3                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"
2506 
#if defined(__x86_64__)
// 64-bit: hoist the per-row constants into xmm8-xmm12 once, and build the
// UV bias in xmm13: pcmpeqb/psllw $7 yields 0xFF80 in every word, then
// pshufb with the zeroed xmm12 broadcasts the low byte (0x80) to all 16
// bytes.  The 0x80 bias recenters unsigned UV to signed for pmaddubsw.
// Table layout (offsets 0/32/64/96/128 into YuvConstants) is defined by
// the struct elsewhere; the three 32-byte rows loaded into xmm8-xmm10 are
// the per-channel UV coefficients, xmm11 the Y scale, xmm12 the Y bias.
#define YUVTORGB_SETUP(yuvconstants)                              \
  "pcmpeqb    %%xmm13,%%xmm13                                 \n" \
  "movdqa     (%[yuvconstants]),%%xmm8                        \n" \
  "pxor       %%xmm12,%%xmm12                                 \n" \
  "movdqa     32(%[yuvconstants]),%%xmm9                      \n" \
  "psllw      $7,%%xmm13                                      \n" \
  "movdqa     64(%[yuvconstants]),%%xmm10                     \n" \
  "pshufb     %%xmm12,%%xmm13                                 \n" \
  "movdqa     96(%[yuvconstants]),%%xmm11                     \n" \
  "movdqa     128(%[yuvconstants]),%%xmm12                    \n"

// Convert 8 pixels: 8 UV and 8 Y.
// In: xmm3 = 8 biased UV byte pairs, xmm4 = 8 widened Y words.
// Out: xmm0/xmm1/xmm2 = three channels of 16-bit fixed-point color
// (6 fractional bits; presumably B/G/R given STOREARGB's byte order —
// confirm against the YuvConstants layout).  xmm4 is consumed.
#define YUVTORGB16(yuvconstants)                                  \
  "psubb      %%xmm13,%%xmm3                                  \n" \
  "pmulhuw    %%xmm11,%%xmm4                                  \n" \
  "movdqa     %%xmm8,%%xmm0                                   \n" \
  "movdqa     %%xmm9,%%xmm1                                   \n" \
  "movdqa     %%xmm10,%%xmm2                                  \n" \
  "paddw      %%xmm12,%%xmm4                                  \n" \
  "pmaddubsw  %%xmm3,%%xmm0                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm1                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm2                                   \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n" \
  "psubsw     %%xmm1,%%xmm4                                   \n" \
  "movdqa     %%xmm4,%%xmm1                                   \n"

// Extra clobbers used by the 64-bit variant.  The trailing comma lets
// callers splice this directly into their clobber lists.
#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",

#else
// 32-bit x86 only has xmm0-xmm7, so there is no setup: the 0x80 UV bias
// is rebuilt and the constants reloaded from memory on every iteration.
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y (same contract as the 64-bit variant).
#define YUVTORGB16(yuvconstants)                                  \
  "pcmpeqb    %%xmm0,%%xmm0                                   \n" \
  "pxor       %%xmm1,%%xmm1                                   \n" \
  "psllw      $7,%%xmm0                                       \n" \
  "pshufb     %%xmm1,%%xmm0                                   \n" \
  "psubb      %%xmm0,%%xmm3                                   \n" \
  "pmulhuw    96(%[yuvconstants]),%%xmm4                      \n" \
  "movdqa     (%[yuvconstants]),%%xmm0                        \n" \
  "movdqa     32(%[yuvconstants]),%%xmm1                      \n" \
  "movdqa     64(%[yuvconstants]),%%xmm2                      \n" \
  "pmaddubsw  %%xmm3,%%xmm0                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm1                                   \n" \
  "pmaddubsw  %%xmm3,%%xmm2                                   \n" \
  "movdqa     128(%[yuvconstants]),%%xmm3                     \n" \
  "paddw      %%xmm3,%%xmm4                                   \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n" \
  "psubsw     %%xmm1,%%xmm4                                   \n" \
  "movdqa     %%xmm4,%%xmm1                                   \n"

#define YUVTORGB_REGS
#endif
2562 
// Convert 8 pixels to 8-bit color: run YUVTORGB16, then arithmetic-shift
// each channel right 6 to drop the fixed-point scale and saturate-pack
// to unsigned bytes in the low halves of xmm0/xmm1/xmm2.
#define YUVTORGB(yuvconstants)                                    \
  YUVTORGB16(yuvconstants)                                        \
  "psraw      $0x6,%%xmm0                                     \n" \
  "psraw      $0x6,%%xmm1                                     \n" \
  "psraw      $0x6,%%xmm2                                     \n" \
  "packuswb   %%xmm0,%%xmm0                                   \n" \
  "packuswb   %%xmm1,%%xmm1                                   \n" \
  "packuswb   %%xmm2,%%xmm2                                   \n"
2571 
// Store 8 ARGB values (32 bytes).  Byte-interleaves xmm0 with xmm1 and
// xmm2 with the alpha in xmm5, then word-interleaves the pairs so each
// pixel's bytes land in xmm0,xmm1,xmm2,xmm5 order.  Advances dst_argb by
// 0x20.  xmm5 must hold the alpha bytes on entry (callers set 0xff or
// load an alpha plane).
#define STOREARGB                                                  \
  "punpcklbw  %%xmm1,%%xmm0                                    \n" \
  "punpcklbw  %%xmm5,%%xmm2                                    \n" \
  "movdqa     %%xmm0,%%xmm1                                    \n" \
  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
  "punpckhwd  %%xmm2,%%xmm1                                    \n" \
  "movdqu     %%xmm0,(%[dst_argb])                             \n" \
  "movdqu     %%xmm1,0x10(%[dst_argb])                         \n" \
  "lea        0x20(%[dst_argb]), %[dst_argb]                   \n"
2582 
// Store 8 RGBA values (32 bytes).  Unlike STOREARGB, this builds its own
// 0xff alpha in xmm5 (pcmpeqb), placing alpha first in each pixel:
// bytes go out in xmm5,xmm0,xmm1,xmm2 order.  Advances dst_rgba by 0x20.
#define STORERGBA                                                  \
  "pcmpeqb   %%xmm5,%%xmm5                                     \n" \
  "punpcklbw %%xmm2,%%xmm1                                     \n" \
  "punpcklbw %%xmm0,%%xmm5                                     \n" \
  "movdqa    %%xmm5,%%xmm0                                     \n" \
  "punpcklwd %%xmm1,%%xmm5                                     \n" \
  "punpckhwd %%xmm1,%%xmm0                                     \n" \
  "movdqu    %%xmm5,(%[dst_rgba])                              \n" \
  "movdqu    %%xmm0,0x10(%[dst_rgba])                          \n" \
  "lea       0x20(%[dst_rgba]),%[dst_rgba]                     \n"
2594 
2595 // Store 8 RGB24 values.
2596 #define STORERGB24                                                      \
2597   "punpcklbw   %%xmm1,%%xmm0                                        \n" \
2598   "punpcklbw   %%xmm2,%%xmm2                                        \n" \
2599   "movdqa      %%xmm0,%%xmm1                                        \n" \
2600   "punpcklwd   %%xmm2,%%xmm0                                        \n" \
2601   "punpckhwd   %%xmm2,%%xmm1                                        \n" \
2602   "pshufb      %%xmm5,%%xmm0                                        \n" \
2603   "pshufb      %%xmm6,%%xmm1                                        \n" \
2604   "palignr     $0xc,%%xmm0,%%xmm1                                   \n" \
2605   "movq        %%xmm0,(%[dst_rgb24])                                \n" \
2606   "movdqu      %%xmm1,0x8(%[dst_rgb24])                             \n" \
2607   "lea         0x18(%[dst_rgb24]),%[dst_rgb24]                      \n"
2608 
2609 // Store 8 AR30 values.
2610 #define STOREAR30                                                  \
2611   "psraw      $0x4,%%xmm0                                      \n" \
2612   "psraw      $0x4,%%xmm1                                      \n" \
2613   "psraw      $0x4,%%xmm2                                      \n" \
2614   "pminsw     %%xmm7,%%xmm0                                    \n" \
2615   "pminsw     %%xmm7,%%xmm1                                    \n" \
2616   "pminsw     %%xmm7,%%xmm2                                    \n" \
2617   "pmaxsw     %%xmm6,%%xmm0                                    \n" \
2618   "pmaxsw     %%xmm6,%%xmm1                                    \n" \
2619   "pmaxsw     %%xmm6,%%xmm2                                    \n" \
2620   "psllw      $0x4,%%xmm2                                      \n" \
2621   "movdqa     %%xmm0,%%xmm3                                    \n" \
2622   "punpcklwd  %%xmm2,%%xmm0                                    \n" \
2623   "punpckhwd  %%xmm2,%%xmm3                                    \n" \
2624   "movdqa     %%xmm1,%%xmm2                                    \n" \
2625   "punpcklwd  %%xmm5,%%xmm1                                    \n" \
2626   "punpckhwd  %%xmm5,%%xmm2                                    \n" \
2627   "pslld      $0xa,%%xmm1                                      \n" \
2628   "pslld      $0xa,%%xmm2                                      \n" \
2629   "por        %%xmm1,%%xmm0                                    \n" \
2630   "por        %%xmm2,%%xmm3                                    \n" \
2631   "movdqu     %%xmm0,(%[dst_ar30])                             \n" \
2632   "movdqu     %%xmm3,0x10(%[dst_ar30])                         \n" \
2633   "lea        0x20(%[dst_ar30]), %[dst_ar30]                   \n"
2634 
// Convert one row of I444 (8-bit planar YUV, full-resolution chroma) to
// ARGB.  Processes 8 pixels per iteration; assumes width > 0 and a width
// the 8-pixel loop can cover (callers presumably pad/guarantee this —
// TODO confirm against the dispatch code).
void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"  // v_buf := V-U delta
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff alpha for STOREARGB

    LABELALIGN
      "1:                                        \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2663 
#ifdef HAS_I444ALPHATOARGBROW_SSSE3
// Convert one row of I444 YUV plus a separate alpha plane to ARGB.
// Same scheme as I444ToARGBRow_SSSE3 but READYUVA444 loads 8 alpha bytes
// per iteration instead of synthesizing 0xff.
void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     const uint8_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
  YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"  // v_buf := V-U delta

  LABELALIGN
      "1:                                        \n"
  READYUVA444
  YUVTORGB(yuvconstants)
  STOREARGB
      "subl        $0x8,%[width]                 \n"  // subl: width may be in memory on i386
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width] - memory-only; presumably relieves i386 register pressure
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I444ALPHATOARGBROW_SSSE3
2701 
// Convert one row of I422 (8-bit planar YUV, half-width chroma) to
// packed RGB24.  Preloads the two ARGB->RGB24 shuffle masks into
// xmm5/xmm6 for STORERGB24; 8 pixels (24 output bytes) per iteration.
void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
                                 uint8_t* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "movdqa      %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
      "movdqa      %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
      "sub         %[u_buf],%[v_buf]             \n"  // v_buf := V-U delta

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGB24
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
2737 
// Convert one row of I444 (full-resolution chroma) to packed RGB24.
// Identical to I422ToRGB24Row_SSSE3 except chroma is read per pixel
// (READYUV444 instead of READYUV422).
void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
                                 uint8_t* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "movdqa      %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
      "movdqa      %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
      "sub         %[u_buf],%[v_buf]             \n"  // v_buf := V-U delta

    LABELALIGN
      "1:                                        \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STORERGB24
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
2773 
// Convert one row of I422 (8-bit planar YUV, half-width chroma) to ARGB
// with opaque (0xff) alpha.  8 pixels per iteration.
void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"  // v_buf := V-U delta
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff alpha for STOREARGB

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2802 
// Convert one row of I422 to AR30 (2:10:10:10).  Uses YUVTORGB16 (not
// YUVTORGB) to keep 16-bit precision for the 10-bit channels; STOREAR30
// needs the constants set up here: xmm5 = 2-bit alpha pattern,
// xmm6 = 0 (clamp min), xmm7 = 1023 (clamp max).
void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"  // v_buf := V-U delta
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // AR30 constants
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
2836 
// 10 bit YUV to ARGB
// Convert one row of I210 (10-bit planar YUV in 16-bit samples,
// half-width chroma) to 8-bit ARGB with opaque alpha.  READYUV210
// (defined earlier in this file) handles the 10->8 bit narrowing.
void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"  // v_buf := V-U delta
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff alpha for STOREARGB

    LABELALIGN
      "1:                                        \n"
    READYUV210
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2866 
// 12 bit YUV to ARGB
// Convert one row of I212 (12-bit planar YUV in 16-bit samples,
// half-width chroma) to 8-bit ARGB with opaque alpha.  Identical to
// I210ToARGBRow_SSSE3 except for the READYUV212 loader.
void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"  // v_buf := V-U delta
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff alpha for STOREARGB

    LABELALIGN
      "1:                                        \n"
    READYUV212
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2896 
// 10 bit YUV to AR30
// Convert one row of I210 to AR30, preserving 10-bit precision through
// YUVTORGB16.  Constant setup matches I422ToAR30Row_SSSE3: xmm5 = 2-bit
// alpha pattern, xmm6 = 0 min, xmm7 = 1023 max for STOREAR30's clamp.
void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"  // v_buf := V-U delta
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV210
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
2931 
// 12 bit YUV to AR30
// Convert one row of I212 to AR30.  Identical to I210ToAR30Row_SSSE3
// except for the READYUV212 loader.
void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"  // v_buf := V-U delta
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV212
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
2966 
// 10 bit YUV to ARGB
// Convert one row of I410 (10-bit planar YUV, full-resolution chroma) to
// 8-bit ARGB with opaque alpha; READYUV410 handles the narrowing.
void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"  // v_buf := V-U delta
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff alpha for STOREARGB

    LABELALIGN
      "1:                                        \n"
    READYUV410
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2995 
#ifdef HAS_I210ALPHATOARGBROW_SSSE3
// 10 bit YUVA to ARGB
// Convert one row of I210 plus a 10-bit alpha plane to ARGB.  Same
// pattern as the other *AlphaToARGB rows (READYUVA210 loads and narrows
// the alpha samples); the layout below is clang-format's rendering of
// the usual macro sequence, tokens unchanged.
void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                     const uint16_t* u_buf,
                                     const uint16_t* v_buf,
                                     const uint16_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  asm volatile(
      YUVTORGB_SETUP(
      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"

      LABELALIGN "1:                                        \n" READYUVA210
          YUVTORGB(yuvconstants) STOREARGB
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
      : [y_buf] "+r"(y_buf),  // %[y_buf]
        [u_buf] "+r"(u_buf),  // %[u_buf]
        [v_buf] "+r"(v_buf),  // %[v_buf]
        [a_buf] "+r"(a_buf),
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
        [width] "+m"(width)  // %[width]
#else
        [width] "+rm"(width)  // %[width]
#endif
      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
        "xmm5");
}
#endif
3028 
#ifdef HAS_I410ALPHATOARGBROW_SSSE3
// 10 bit YUVA to ARGB
// Convert one row of I410 (full-resolution 10-bit chroma) plus a 10-bit
// alpha plane to ARGB via READYUVA410.
void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                     const uint16_t* u_buf,
                                     const uint16_t* v_buf,
                                     const uint16_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile(
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"  // v_buf := V-U delta

    LABELALIGN
      "1:                                        \n"
    READYUVA410
    YUVTORGB(yuvconstants)
    STOREARGB
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
    : [y_buf] "+r"(y_buf),  // %[y_buf]
      [u_buf] "+r"(u_buf),  // %[u_buf]
      [v_buf] "+r"(v_buf),  // %[v_buf]
      [a_buf] "+r"(a_buf),
      [dst_argb] "+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
      [width] "+m"(width)  // %[width]
#else
      [width] "+rm"(width)  // %[width]
#endif
    : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
      "xmm5");
  // clang-format on
}
#endif
3066 
// 10 bit YUV to AR30
// Convert one row of I410 (full-resolution 10-bit chroma) to AR30,
// preserving precision via YUVTORGB16.  Constant setup matches the other
// AR30 rows: xmm5 = 2-bit alpha pattern, xmm6 = 0 min, xmm7 = 1023 max.
void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"  // v_buf := V-U delta
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV410
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)   // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3101 
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
// Converts a row of 8-bit I422 Y, U, V planes plus an alpha plane to ARGB,
// 8 pixels per loop iteration.
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     const uint8_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      // Make v_buf an offset from u_buf so one register indexes both planes.
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
      // subl: explicit 32-bit op since %[width] may be a memory operand.
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    // i386: not enough free GP registers for all pointer operands.
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_SSSE3
3139 
// Converts a row of NV12 (8-bit Y plane + interleaved UV plane) to ARGB,
// 8 pixels per loop iteration.
void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // all-ones: 0xff alpha

    LABELALIGN
      "1:                                        \n"
    READNV12
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
3167 
// Converts a row of NV21 (8-bit Y plane + interleaved VU plane) to ARGB,
// 8 pixels per loop iteration.  kShuffleNV21 (used inside READNV21) reorders
// the VU bytes into UV order.
void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* vu_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // all-ones: 0xff alpha

    LABELALIGN
      "1:                                        \n"
    READNV21
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
3196 
// Converts a row of packed YUY2 (Y0 U Y1 V) to ARGB, 8 pixels per loop
// iteration.  The kShuffleYUY2Y / kShuffleYUY2UV constants (used inside
// READYUY2) split the packed samples into separate Y and UV registers.
void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // all-ones: 0xff alpha

    LABELALIGN
      "1:                                        \n"
    READYUY2
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
3224 
// Converts a row of packed UYVY (U Y0 V Y1) to ARGB, 8 pixels per loop
// iteration.  The kShuffleUYVYY / kShuffleUYVYUV constants (used inside
// READUYVY) split the packed samples into separate Y and UV registers.
void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // all-ones: 0xff alpha

    LABELALIGN
      "1:                                        \n"
    READUYVY
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
3252 
// Converts a row of P210 (10-bit samples stored in the high bits of 16-bit
// words; Y plane + interleaved UV plane) to 8-bit ARGB, 8 pixels per loop
// iteration.
void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(
      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"

      LABELALIGN "1:                                        \n" READP210
          YUVTORGB(yuvconstants) STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
      : [y_buf] "+r"(y_buf),              // %[y_buf]
        [uv_buf] "+r"(uv_buf),            // %[uv_buf]
        [dst_argb] "+r"(dst_argb),        // %[dst_argb]
        [width] "+rm"(width)              // %[width]
      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
        "xmm5");
}
3274 
// Converts a row of P410 (10-bit samples stored in the high bits of 16-bit
// words; 4:4:4 Y plane + interleaved UV plane) to 8-bit ARGB, 8 pixels per
// loop iteration.
void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(
      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"

      LABELALIGN "1:                                        \n" READP410
          YUVTORGB(yuvconstants) STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
      : [y_buf] "+r"(y_buf),              // %[y_buf]
        [uv_buf] "+r"(uv_buf),            // %[uv_buf]
        [dst_argb] "+r"(dst_argb),        // %[dst_argb]
        [width] "+rm"(width)              // %[width]
      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
        "xmm5");
}
3296 
// Converts a row of P210 (10-bit samples in high bits of 16-bit words) to
// AR30 (2-bit alpha, 10-bit R/G/B), 8 pixels per loop iteration.
void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READP210
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3327 
// Converts a row of P410 (10-bit samples in high bits of 16-bit words,
// 4:4:4) to AR30 (2-bit alpha, 10-bit R/G/B), 8 pixels per loop iteration.
void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READP410
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3358 
// Converts a row of 8-bit I422 Y, U, V planes to RGBA byte order
// (STORERGBA instead of STOREARGB), 8 pixels per loop iteration.
void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      // Make v_buf an offset from u_buf so one register indexes both planes.
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // all-ones: 0xff alpha

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGBA
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3387 
3388 #endif  // HAS_I422TOARGBROW_SSSE3
3389 
// Read 16 UV from 444.
// Leaves 16 interleaved U/V byte pairs in ymm3 and each Y byte duplicated
// into a 16-bit lane of ymm4; advances u_buf and y_buf by 16.
#define READYUV444_AVX2                                               \
  "vmovdqu    (%[u_buf]),%%xmm3                                   \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
3402 
// Read 8 UV from 422, upsample to 16 UV.
// Each UV byte pair is duplicated (vpunpcklwd) so it covers 2 pixels;
// 16 Y bytes are duplicated into 16-bit lanes of ymm4.
#define READYUV422_AVX2                                               \
  "vmovq      (%[u_buf]),%%xmm3                                   \n" \
  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
3415 
// AVX512BW variant: read 16 UV from 422 and 32 Y.  Uses the permute tables
// in zmm16/zmm17 loaded by YUVTORGB_SETUP_AVX512BW.
#define READYUV422_AVX512BW                                           \
  "vmovdqu    (%[u_buf]),%%xmm3                                   \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "vpermq     %%zmm3,%%zmm16,%%zmm3                               \n" \
  "vpermq     %%zmm1,%%zmm16,%%zmm1                               \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
  "vpunpcklbw %%zmm1,%%zmm3,%%zmm3                                \n" \
  "vpermq     $0xd8,%%zmm3,%%zmm3                                 \n" \
  "vpunpcklwd %%zmm3,%%zmm3,%%zmm3                                \n" \
  "vmovdqu8   (%[y_buf]),%%ymm4                                   \n" \
  "vpermq     %%zmm4,%%zmm17,%%zmm4                               \n" \
  "vpermq     $0xd8,%%zmm4,%%zmm4                                 \n" \
  "vpunpcklbw %%zmm4,%%zmm4,%%zmm4                                \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                             \n"
3430 
// Read 8 UV from 210, upsample to 16 UV.
// UV: 10-bit words shifted right 2 to 8 bits, packed, each pair duplicated.
// Y: 10-bit expanded to full 16-bit range via (y << 6) + (y >> 4).
// TODO(fbarchard): Consider vshufb to replace pack/unpack
// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
#define READYUV210_AVX2                                            \
  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $6,%%ymm4,%%ymm2                                 \n" \
  "vpsrlw     $4,%%ymm4,%%ymm4                                 \n" \
  "vpaddw     %%ymm2,%%ymm4,%%ymm4                             \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n"
3449 
// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
// Same as READYUV210_AVX2, plus 10-bit alpha shifted to 8 bits and packed
// into ymm5.
#define READYUVA210_AVX2                                           \
  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $6,%%ymm4,%%ymm2                                 \n" \
  "vpsrlw     $4,%%ymm4,%%ymm4                                 \n" \
  "vpaddw     %%ymm2,%%ymm4,%%ymm4                             \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n" \
  "vmovdqu    (%[a_buf]),%%ymm5                                \n" \
  "vpsraw     $2,%%ymm5,%%ymm5                                 \n" \
  "vpackuswb  %%ymm5,%%ymm5,%%ymm5                             \n" \
  "lea        0x20(%[a_buf]),%[a_buf]                          \n"
3470 
// Read 16 UV from 410.
// UV: 10-bit words shifted right 2 to 8 bits, interleaved and packed (no
// duplication; 4:4:4 already has one UV per pixel).
// Y: 10-bit expanded to full 16-bit range via (y << 6) + (y >> 4).
#define READYUV410_AVX2                                            \
  "vmovdqu    (%[u_buf]),%%ymm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%ymm2                 \n" \
  "lea        0x20(%[u_buf]),%[u_buf]                          \n" \
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
  "vpsraw     $2,%%ymm2,%%ymm2                                 \n" \
  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1                             \n" \
  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3                             \n" \
  "vpackuswb  %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $6,%%ymm4,%%ymm2                                 \n" \
  "vpsrlw     $4,%%ymm4,%%ymm4                                 \n" \
  "vpaddw     %%ymm2,%%ymm4,%%ymm4                             \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n"
3486 
// Read 8 UV from 212 12 bit, upsample to 16 UV.
// UV: 12-bit words shifted right 4 to 8 bits, packed, each pair duplicated.
// Y: 12-bit expanded to full 16-bit range via (y << 4) + (y >> 8).
#define READYUV212_AVX2                                            \
  "vmovdqu    (%[u_buf]),%%xmm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                              \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vpsraw     $0x4,%%ymm3,%%ymm3                               \n" \
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $4,%%ymm4,%%ymm2                                 \n" \
  "vpsrlw     $8,%%ymm4,%%ymm4                                 \n" \
  "vpaddw     %%ymm2,%%ymm4,%%ymm4                             \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n"
3503 
// Read 16 UV from 410. With 16 Alpha.
// Same as READYUV410_AVX2, plus 10-bit alpha shifted to 8 bits and packed
// into ymm5.
#define READYUVA410_AVX2                                           \
  "vmovdqu    (%[u_buf]),%%ymm3                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%ymm2                 \n" \
  "lea        0x20(%[u_buf]),%[u_buf]                          \n" \
  "vpsraw     $2,%%ymm3,%%ymm3                                 \n" \
  "vpsraw     $2,%%ymm2,%%ymm2                                 \n" \
  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1                             \n" \
  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3                             \n" \
  "vpackuswb  %%ymm1,%%ymm3,%%ymm3                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $6,%%ymm4,%%ymm2                                 \n" \
  "vpsrlw     $4,%%ymm4,%%ymm4                                 \n" \
  "vpaddw     %%ymm2,%%ymm4,%%ymm4                             \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n" \
  "vmovdqu    (%[a_buf]),%%ymm5                                \n" \
  "vpsraw     $2,%%ymm5,%%ymm5                                 \n" \
  "vpackuswb  %%ymm5,%%ymm5,%%ymm5                             \n" \
  "lea        0x20(%[a_buf]),%[a_buf]                          \n"
3523 
// Read 16 UV from 444.  With 16 Alpha.
// Same as READYUV444_AVX2, plus 16 alpha bytes loaded into ymm5.
#define READYUVA444_AVX2                                              \
  "vmovdqu    (%[u_buf]),%%xmm3                                   \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n" \
  "vmovdqu    (%[a_buf]),%%xmm5                                   \n" \
  "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                             \n"
3539 
// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
// Same as READYUV422_AVX2, plus 16 alpha bytes loaded into ymm5.
#define READYUVA422_AVX2                                              \
  "vmovq      (%[u_buf]),%%xmm3                                   \n" \
  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3                                \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n" \
  "vmovdqu    (%[a_buf]),%%xmm5                                   \n" \
  "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                             \n"
3555 
// Read 8 UV from NV12, upsample to 16 UV.
// UV is already interleaved in memory; vpunpcklwd duplicates each pair to
// cover 2 pixels.  16 Y bytes are duplicated into 16-bit lanes of ymm4.
#define READNV12_AVX2                                                 \
  "vmovdqu    (%[uv_buf]),%%xmm3                                  \n" \
  "lea        0x10(%[uv_buf]),%[uv_buf]                           \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
3566 
// Read 8 VU from NV21, upsample to 16 UV.
// kShuffleNV21 both swaps the V/U byte order and duplicates each pair.
#define READNV21_AVX2                                                 \
  "vmovdqu    (%[vu_buf]),%%xmm3                                  \n" \
  "lea        0x10(%[vu_buf]),%[vu_buf]                           \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vpshufb     %[kShuffleNV21], %%ymm3, %%ymm3                    \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
3577 
// Read 8 UV from P210, upsample to 16 UV.
// (The code reads 0x20 bytes of interleaved UV = 8 pairs; the old comment
// said 4 UV -> 8 UV.)  P210 stores 10-bit samples in the high bits of each
// 16-bit word, so vpsrlw $8 keeps the top byte.  Y is used as 16-bit.
#define READP210_AVX2                                                 \
  "vmovdqu    (%[uv_buf]),%%ymm3                                  \n" \
  "lea        0x20(%[uv_buf]),%[uv_buf]                           \n" \
  "vpsrlw     $0x8,%%ymm3,%%ymm3                                  \n" \
  "vpackuswb  %%ymm3,%%ymm3,%%ymm3                                \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                   \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                             \n"
3587 
// Read 16 UV from P410.
// (The code reads 0x40 bytes of interleaved UV = 16 pairs; the old comment
// said 8 UV.)  10-bit samples sit in the high bits of each 16-bit word, so
// vpsrlw $8 keeps the top byte; 4:4:4 needs no pair duplication.
#define READP410_AVX2                                                 \
  "vmovdqu    (%[uv_buf]),%%ymm3                                  \n" \
  "vmovdqu    0x20(%[uv_buf]),%%ymm1                              \n" \
  "lea        0x40(%[uv_buf]),%[uv_buf]                           \n" \
  "vpsrlw     $0x8,%%ymm3,%%ymm3                                  \n" \
  "vpsrlw     $0x8,%%ymm1,%%ymm1                                  \n" \
  "vpackuswb  %%ymm1,%%ymm3,%%ymm3                                \n" \
  "vpermq     $0xd8,%%ymm3,%%ymm3                                 \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                   \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                             \n"
3599 
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
// Loads the same 32 packed bytes twice and uses two shuffle constants to
// split them into Y (ymm4) and duplicated UV pairs (ymm3).
#define READYUY2_AVX2                                                 \
  "vmovdqu    (%[yuy2_buf]),%%ymm4                                \n" \
  "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n" \
  "vmovdqu    (%[yuy2_buf]),%%ymm3                                \n" \
  "vpshufb    %[kShuffleYUY2UV], %%ymm3, %%ymm3                   \n" \
  "lea        0x20(%[yuy2_buf]),%[yuy2_buf]                       \n"
3607 
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
// Loads the same 32 packed bytes twice and uses two shuffle constants to
// split them into Y (ymm4) and duplicated UV pairs (ymm3).
#define READUYVY_AVX2                                                 \
  "vmovdqu    (%[uyvy_buf]),%%ymm4                                \n" \
  "vpshufb    %[kShuffleUYVYY], %%ymm4, %%ymm4                    \n" \
  "vmovdqu    (%[uyvy_buf]),%%ymm3                                \n" \
  "vpshufb    %[kShuffleUYVYUV], %%ymm3, %%ymm3                   \n" \
  "lea        0x20(%[uyvy_buf]),%[uyvy_buf]                       \n"
3615 
// TODO(fbarchard): Remove broadcastb
#if defined(__x86_64__)
// Loads the five 32-byte rows of YuvConstants into ymm8..ymm12 and builds a
// register of 0x80 bytes in ymm13 (pcmpeqb -> 0xffff words, psllw 7 ->
// 0xff80, broadcast of the low byte -> 0x80) used as the UV bias.
#define YUVTORGB_SETUP_AVX2(yuvconstants)                             \
  "vpcmpeqb    %%xmm13,%%xmm13,%%xmm13                            \n" \
  "vmovdqa     (%[yuvconstants]),%%ymm8                           \n" \
  "vpsllw      $7,%%xmm13,%%xmm13                                 \n" \
  "vmovdqa     32(%[yuvconstants]),%%ymm9                         \n" \
  "vpbroadcastb %%xmm13,%%ymm13                                   \n" \
  "vmovdqa     64(%[yuvconstants]),%%ymm10                        \n" \
  "vmovdqa     96(%[yuvconstants]),%%ymm11                        \n" \
  "vmovdqa     128(%[yuvconstants]),%%ymm12                       \n"
3627 
// AVX512BW setup: broadcasts each 8-byte YuvConstants row across a zmm
// (zmm8..zmm12), builds the 0x80 UV bias bytes in zmm13, and loads three
// permute tables from the [quadsplitperm]/[dquadsplitperm]/[unperm]
// operands into zmm16..zmm18 for the READ*/STORE* AVX512 macros.
#define YUVTORGB_SETUP_AVX512BW(yuvconstants)                         \
  "vpcmpeqb   %%xmm13,%%xmm13,%%xmm13                             \n" \
  "movdqa     (%[yuvconstants]),%%xmm8                            \n" \
  "vpbroadcastq %%xmm8, %%zmm8                                    \n" \
  "vpsllw     $7,%%xmm13,%%xmm13                                  \n" \
  "vpbroadcastb %%xmm13,%%zmm13                                   \n" \
  "movq     32(%[yuvconstants]),%%xmm9                            \n" \
  "vpbroadcastq %%xmm9,%%zmm9                                     \n" \
  "movq     64(%[yuvconstants]),%%xmm10                           \n" \
  "vpbroadcastq %%xmm10,%%zmm10                                   \n" \
  "movq     96(%[yuvconstants]),%%xmm11                           \n" \
  "vpbroadcastq %%xmm11,%%zmm11                                   \n" \
  "movq     128(%[yuvconstants]),%%xmm12                          \n" \
  "vpbroadcastq %%xmm12,%%zmm12                                   \n" \
  "vmovdqu8 (%[quadsplitperm]),%%zmm16                            \n" \
  "vmovdqu8 (%[dquadsplitperm]),%%zmm17                           \n" \
  "vmovdqu8 (%[unperm]),%%zmm18                                   \n"
3645 
// Core YUV->RGB math on 16 pixels.  Input: 16 UV bytes in ymm3, 16 Y words
// in ymm4 (from a READ* macro).  Output: 16-bit B/G/R in ymm0/ymm1/ymm2
// (still scaled; callers shift down by 6 or use the extra precision for
// AR30).  UV is re-centered by subtracting the 0x80 bias (ymm13), Y is
// scaled by the Y coefficient (ymm11) and offset (ymm12), then the signed
// per-channel UV contributions (ymm8/ymm9/ymm10 tables via vpmaddubsw) are
// added for B and R and subtracted for G, with saturation.
#define YUVTORGB16_AVX2(yuvconstants)                                 \
  "vpsubb      %%ymm13,%%ymm3,%%ymm3                              \n" \
  "vpmulhuw    %%ymm11,%%ymm4,%%ymm4                              \n" \
  "vpmaddubsw  %%ymm3,%%ymm8,%%ymm0                               \n" \
  "vpmaddubsw  %%ymm3,%%ymm9,%%ymm1                               \n" \
  "vpmaddubsw  %%ymm3,%%ymm10,%%ymm2                              \n" \
  "vpaddw      %%ymm4,%%ymm12,%%ymm4                              \n" \
  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
  "vpsubsw     %%ymm1,%%ymm4,%%ymm1                               \n" \
  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
3656 
// AVX-512BW twin of YUVTORGB16_AVX2: identical math on 32 pixels using the
// zmm copies of the constants (zmm8-zmm13) set up by
// YUVTORGB_SETUP_AVX512BW.  Produces 16-bit B/G/R in zmm0/zmm1/zmm2.
#define YUVTORGB16_AVX512BW(yuvconstants)                             \
  "vpsubb      %%zmm13,%%zmm3,%%zmm3                              \n" \
  "vpmulhuw    %%zmm11,%%zmm4,%%zmm4                              \n" \
  "vpmaddubsw  %%zmm3,%%zmm8,%%zmm0                               \n" \
  "vpmaddubsw  %%zmm3,%%zmm9,%%zmm1                               \n" \
  "vpmaddubsw  %%zmm3,%%zmm10,%%zmm2                              \n" \
  "vpaddw      %%zmm4,%%zmm12,%%zmm4                              \n" \
  "vpaddsw     %%zmm4,%%zmm0,%%zmm0                               \n" \
  "vpsubsw     %%zmm1,%%zmm4,%%zmm1                               \n" \
  "vpaddsw     %%zmm4,%%zmm2,%%zmm2                               \n"
3667 
// Clobber-list fragments for the registers the x86_64 setup keeps live
// across the asm statement.  Each ends with a trailing comma so callers can
// append further clobbers directly after the macro.
#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
#define YUVTORGB_REGS_AVX512BW \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm16", "xmm17", "xmm18",
3671 
#else  // Convert 16 pixels: 16 UV and 16 Y.

// 32-bit x86: only 8 SIMD registers exist, so no registers can be dedicated
// to the constants.  The setup macro is therefore empty and YUVTORGB16_AVX2
// rebuilds the 0x80 bias and reloads every YuvConstants table from memory on
// each loop iteration.  Same math and same ymm0/ymm1/ymm2 (B/G/R) outputs as
// the 64-bit version above.
#define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB16_AVX2(yuvconstants)                                 \
  "vpcmpeqb    %%xmm0,%%xmm0,%%xmm0                               \n" \
  "vpsllw      $7,%%xmm0,%%xmm0                                   \n" \
  "vpbroadcastb %%xmm0,%%ymm0                                     \n" \
  "vpsubb      %%ymm0,%%ymm3,%%ymm3                               \n" \
  "vpmulhuw    96(%[yuvconstants]),%%ymm4,%%ymm4                  \n" \
  "vmovdqa     (%[yuvconstants]),%%ymm0                           \n" \
  "vmovdqa     32(%[yuvconstants]),%%ymm1                         \n" \
  "vmovdqa     64(%[yuvconstants]),%%ymm2                         \n" \
  "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0                               \n" \
  "vpmaddubsw  %%ymm3,%%ymm1,%%ymm1                               \n" \
  "vpmaddubsw  %%ymm3,%%ymm2,%%ymm2                               \n" \
  "vmovdqa     128(%[yuvconstants]),%%ymm3                        \n" \
  "vpaddw      %%ymm4,%%ymm3,%%ymm4                               \n" \
  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
  "vpsubsw     %%ymm1,%%ymm4,%%ymm1                               \n" \
  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"

// The 32-bit path uses no extra registers beyond the ones callers list.
#define YUVTORGB_REGS_AVX2
#endif
3695 
// Full 8-bit conversion: run YUVTORGB16_AVX2, then drop the 6 fraction bits
// (arithmetic shift) and pack each channel to unsigned bytes with
// saturation.  Each register is packed with itself, so the byte results
// appear in both halves of each 128-bit lane for the STORE* macros.
#define YUVTORGB_AVX2(yuvconstants)                                   \
  YUVTORGB16_AVX2(yuvconstants)                                       \
  "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
  "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
  "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
  "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
  "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
  "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
3704 
// AVX-512BW twin of YUVTORGB_AVX2: shift the 16-bit B/G/R down by 6 and
// saturate-pack to bytes, each zmm packed with itself.
#define YUVTORGB_AVX512BW(yuvconstants)                               \
  YUVTORGB16_AVX512BW(yuvconstants)                                   \
  "vpsraw     $0x6,%%zmm0,%%zmm0                                  \n" \
  "vpsraw     $0x6,%%zmm1,%%zmm1                                  \n" \
  "vpsraw     $0x6,%%zmm2,%%zmm2                                  \n" \
  "vpackuswb  %%zmm0,%%zmm0,%%zmm0                                \n" \
  "vpackuswb  %%zmm1,%%zmm1,%%zmm1                                \n" \
  "vpackuswb  %%zmm2,%%zmm2,%%zmm2                                \n"
3713 
// Store 16 ARGB values (64 bytes).  Interleaves B (ymm0), G (ymm1), R (ymm2)
// and alpha (ymm5, pre-loaded by the caller) into B,G,R,A byte order; the
// vpermq $0xd8 swizzles undo the per-128-bit-lane behavior of the AVX2
// unpack instructions.  Advances [dst_argb] by 64 bytes.
#define STOREARGB_AVX2                                                \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
  "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n" \
  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
  "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n" \
  "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n" \
  "vmovdqu    %%ymm1,(%[dst_argb])                                \n" \
  "vmovdqu    %%ymm0,0x20(%[dst_argb])                            \n" \
  "lea        0x40(%[dst_argb]), %[dst_argb]                      \n"
3725 
// Store 32 ARGB values (128 bytes).  Same interleave as STOREARGB_AVX2 but
// the lane fix-up uses the qword permute table in zmm18 (loaded by
// YUVTORGB_SETUP_AVX512BW) instead of an immediate vpermq.  Advances
// [dst_argb] by 128 bytes.
#define STOREARGB_AVX512BW                                            \
  "vpunpcklbw %%zmm1,%%zmm0,%%zmm0                                \n" \
  "vpermq     %%zmm0,%%zmm18,%%zmm0                               \n" \
  "vpunpcklbw %%zmm5,%%zmm2,%%zmm2                                \n" \
  "vpermq     %%zmm2,%%zmm18,%%zmm2                               \n" \
  "vpunpcklwd %%zmm2,%%zmm0,%%zmm1                                \n" \
  "vpunpckhwd %%zmm2,%%zmm0,%%zmm0                                \n" \
  "vmovdqu8   %%zmm1,(%[dst_argb])                                \n" \
  "vmovdqu8   %%zmm0,0x40(%[dst_argb])                            \n" \
  "lea        0x80(%[dst_argb]), %[dst_argb]                      \n"
3737 
// Store 16 AR30 values (64 bytes; 2:10:10:10 with alpha in the top 2 bits).
// Consumes the 16-bit B/G/R from YUVTORGB16 directly (no /64 rounding yet):
// shifts down 4 to 10-bit range, clamps with ymm7 (1023) and ymm6 (0), then
// interleaves words and dword-shifts so each output dword ends up as
// alpha<<30 | R<<20 | G<<10 | B, using the alpha word pattern the caller
// prepared in ymm5.  Advances [dst_ar30] by 64 bytes.
#define STOREAR30_AVX2                                                \
  "vpsraw     $0x4,%%ymm0,%%ymm0                                  \n" \
  "vpsraw     $0x4,%%ymm1,%%ymm1                                  \n" \
  "vpsraw     $0x4,%%ymm2,%%ymm2                                  \n" \
  "vpminsw    %%ymm7,%%ymm0,%%ymm0                                \n" \
  "vpminsw    %%ymm7,%%ymm1,%%ymm1                                \n" \
  "vpminsw    %%ymm7,%%ymm2,%%ymm2                                \n" \
  "vpmaxsw    %%ymm6,%%ymm0,%%ymm0                                \n" \
  "vpmaxsw    %%ymm6,%%ymm1,%%ymm1                                \n" \
  "vpmaxsw    %%ymm6,%%ymm2,%%ymm2                                \n" \
  "vpsllw     $0x4,%%ymm2,%%ymm2                                  \n" \
  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
  "vpunpckhwd %%ymm2,%%ymm0,%%ymm3                                \n" \
  "vpunpcklwd %%ymm2,%%ymm0,%%ymm0                                \n" \
  "vpunpckhwd %%ymm5,%%ymm1,%%ymm2                                \n" \
  "vpunpcklwd %%ymm5,%%ymm1,%%ymm1                                \n" \
  "vpslld     $0xa,%%ymm1,%%ymm1                                  \n" \
  "vpslld     $0xa,%%ymm2,%%ymm2                                  \n" \
  "vpor       %%ymm1,%%ymm0,%%ymm0                                \n" \
  "vpor       %%ymm2,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    %%ymm0,(%[dst_ar30])                                \n" \
  "vmovdqu    %%ymm3,0x20(%[dst_ar30])                            \n" \
  "lea        0x40(%[dst_ar30]), %[dst_ar30]                      \n"
3764 
3765 #ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
// I444 (non-subsampled planar YUV) -> ARGB row converter.  Loops 16 pixels
// per iteration; NOTE(review): widths not a multiple of 16 appear to run a
// final partial-overwrite iteration -- confirm against row.h caller contract.
void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      // v_buf becomes (v - u) so V can be addressed relative to u_buf,
      // freeing an address register inside the loop.
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // ymm5 = 0xff (opaque alpha)

    LABELALIGN
      "1:                                        \n"
    READYUV444_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"  // avoid AVX->SSE transition penalty
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3797 #endif  // HAS_I444TOARGBROW_AVX2
3798 
3799 #if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// I422 (2x1 subsampled planar YUV) -> ARGB row converter; 16 pixels per
// iteration.
void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      // Address V relative to U (v_buf holds v - u after this).
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // ymm5 = 0xff (opaque alpha)

    LABELALIGN
      "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3832 #endif  // HAS_I422TOARGBROW_AVX2
3833 
3834 #if defined(HAS_I422TOARGBROW_AVX512BW)
// Qword-index permute tables for the AVX-512BW path; loaded into
// zmm16/zmm17/zmm18 by YUVTORGB_SETUP_AVX512BW and consumed by the vpermq
// instructions in the read/store macros (kUnpermuteAVX512 re-interleaves the
// lane-split results in STOREARGB_AVX512BW).
static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2};
static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4};
static const uint64_t kUnpermuteAVX512[8] = {0, 4, 1, 5, 2, 6, 3, 7};
3838 
// 32 pixels
// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 ARGB (128
// bytes).
// AVX-512BW I422 -> ARGB row converter; processes 32 pixels per iteration
// (note the 0x20 decrement, twice the AVX2 version's stride).
void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
                                   const uint8_t* u_buf,
                                   const uint8_t* v_buf,
                                   uint8_t* dst_argb,
                                   const struct YuvConstants* yuvconstants,
                                   int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX512BW(yuvconstants)
      // Address V relative to U (v_buf holds v - u after this).
      "sub         %[u_buf],%[v_buf]             \n"
      // zmm5 = all 0xff: opaque alpha for STOREARGB_AVX512BW.
      "vpcmpeqb    %%xmm5,%%xmm5,%%xmm5          \n"
      "vpbroadcastq %%xmm5,%%zmm5                \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422_AVX512BW
    YUVTORGB_AVX512BW(yuvconstants)
    STOREARGB_AVX512BW
      "sub         $0x20,%[width]                \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),                         // %[y_buf]
    [u_buf]"+r"(u_buf),                         // %[u_buf]
    [v_buf]"+r"(v_buf),                         // %[v_buf]
    [dst_argb]"+r"(dst_argb),                   // %[dst_argb]
    [width]"+rm"(width)                         // %[width]
  : [yuvconstants]"r"(yuvconstants),            // %[yuvconstants]
    [quadsplitperm]"r"(kSplitQuadWords),        // %[quadsplitperm]
    [dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm]
    [unperm]"r"(kUnpermuteAVX512)               // %[unperm]
  : "memory", "cc", YUVTORGB_REGS_AVX512BW
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3876 #endif  // HAS_I422TOARGBROW_AVX512BW
3877 
3878 #if defined(HAS_I422TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
// I422 -> AR30 (2:10:10:10).  Uses YUVTORGB16_AVX2 (no 8-bit pack) so the
// extra precision feeds the 10-bit output; STOREAR30_AVX2 does the clamp
// and bit packing.
void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      // Address V relative to U (v_buf holds v - u after this).
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3916 #endif  // HAS_I422TOAR30ROW_AVX2
3917 
3918 #if defined(HAS_I210TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// I210 (10-bit 4:2:2, 16-bit samples) -> 8-bit ARGB row converter.
void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      // Address V relative to U (v_buf holds v - u after this).
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // ymm5 = 0xff (opaque alpha)

    LABELALIGN
      "1:                                        \n"
    READYUV210_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3951 #endif  // HAS_I210TOARGBROW_AVX2
3952 
3953 #if defined(HAS_I212TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// I212 (12-bit 4:2:2, 16-bit samples) -> 8-bit ARGB row converter; identical
// to I210ToARGBRow_AVX2 except for the 12-bit read macro.
void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      // Address V relative to U (v_buf holds v - u after this).
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // ymm5 = 0xff (opaque alpha)

    LABELALIGN
      "1:                                        \n"
    READYUV212_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3986 #endif  // HAS_I212TOARGBROW_AVX2
3987 
3988 #if defined(HAS_I210TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
// I210 (10-bit 4:2:2) -> AR30 (2:10:10:10); keeps 16-bit precision through
// YUVTORGB16_AVX2 so no bits are lost before the 10-bit pack.
void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      // Address V relative to U (v_buf holds v - u after this).
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READYUV210_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
4026 #endif  // HAS_I210TOAR30ROW_AVX2
4027 
4028 #if defined(HAS_I212TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
// I212 (12-bit 4:2:2) -> AR30; identical to I210ToAR30Row_AVX2 except for
// the 12-bit read macro.
void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      // Address V relative to U (v_buf holds v - u after this).
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READYUV212_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
4066 #endif  // HAS_I212TOAR30ROW_AVX2
4067 
4068 #if defined(HAS_I410TOARGBROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
// I410 (10-bit 4:4:4, 16-bit samples) -> 8-bit ARGB row converter.
void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      // Address V relative to U (v_buf holds v - u after this).
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // ymm5 = 0xff (opaque alpha)

    LABELALIGN
      "1:                                        \n"
    READYUV410_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"

  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
4101 #endif  // HAS_I410TOARGBROW_AVX2
4102 
4103 #if defined(HAS_I210ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
// I210 with a separate 16-bit alpha plane -> ARGB.  No opaque-alpha setup
// here: READYUVA210_AVX2 is expected to load ymm5 from a_buf each iteration.
void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
                                    const uint16_t* u_buf,
                                    const uint16_t* v_buf,
                                    const uint16_t* a_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  asm volatile(
      YUVTORGB_SETUP_AVX2(
      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"

      LABELALIGN "1:                                        \n" READYUVA210_AVX2
          YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
      "subl        $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"

      : [y_buf] "+r"(y_buf),        // %[y_buf]
        [u_buf] "+r"(u_buf),        // %[u_buf]
        [v_buf] "+r"(v_buf),        // %[v_buf]
        [a_buf] "+r"(a_buf),        // %[a_buf]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
// On i386 there are not enough registers for all operands; keep width in
// memory ("+m") there, and note the loop uses "subl" to match.
#if defined(__i386__)
        [width] "+m"(width)  // %[width]
#else
        [width] "+rm"(width)  // %[width]
#endif
      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
        "xmm4", "xmm5");
}
#endif  // HAS_I210ALPHATOARGBROW_AVX2
4138 
4139 #if defined(HAS_I410ALPHATOARGBROW_AVX2)
// 16 pixels
// 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
// I410 (4:4:4) with a separate 16-bit alpha plane -> ARGB.  No opaque-alpha
// setup: READYUVA410_AVX2 is expected to load ymm5 from a_buf each iteration.
void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
                                    const uint16_t* u_buf,
                                    const uint16_t* v_buf,
                                    const uint16_t* a_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  asm volatile(
      YUVTORGB_SETUP_AVX2(
      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"

      LABELALIGN "1:                                        \n" READYUVA410_AVX2
          YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
      "subl        $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"

      : [y_buf] "+r"(y_buf),        // %[y_buf]
        [u_buf] "+r"(u_buf),        // %[u_buf]
        [v_buf] "+r"(v_buf),        // %[v_buf]
        [a_buf] "+r"(a_buf),        // %[a_buf]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
// On i386 there are not enough registers for all operands; keep width in
// memory ("+m") there, and note the loop uses "subl" to match.
#if defined(__i386__)
        [width] "+m"(width)  // %[width]
#else
        [width] "+rm"(width)  // %[width]
#endif
      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
        "xmm4", "xmm5");
}
#endif  // HAS_I410ALPHATOARGBROW_AVX2
4174 
4175 #if defined(HAS_I410TOAR30ROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
// I410 (10-bit 4:4:4) -> AR30 (2:10:10:10); keeps 16-bit precision through
// YUVTORGB16_AVX2 so no bits are lost before the 10-bit pack.
void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      // Address V relative to U (v_buf holds v - u after this).
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READYUV410_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
4213 #endif  // HAS_I410TOAR30ROW_AVX2
4214 
4215 #if defined(HAS_I444ALPHATOARGBROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y and 16 A producing 16 ARGB.
// I444 with a separate 8-bit alpha plane -> ARGB.  No opaque-alpha setup:
// READYUVA444_AVX2 is expected to load ymm5 from a_buf each iteration.
void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                    const uint8_t* u_buf,
                                    const uint8_t* v_buf,
                                    const uint8_t* a_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
  YUVTORGB_SETUP_AVX2(yuvconstants)
      // Address V relative to U (v_buf holds v - u after this).
      "sub         %[u_buf],%[v_buf]             \n"

  LABELALIGN
      "1:                                        \n"
  READYUVA444_AVX2
  YUVTORGB_AVX2(yuvconstants)
  STOREARGB_AVX2
      "subl        $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
// On i386 there are not enough registers for all operands; keep width in
// memory ("+m") there, and note the loop uses "subl" to match.
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
4254 #endif  // HAS_I444ALPHATOARGBROW_AVX2
4255 
4256 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
// Converts one row of planar 4:2:2 YUV plus a separate alpha plane to ARGB
// (AVX2), 16 pixels per loop iteration.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                    const uint8_t* u_buf,
                                    const uint8_t* v_buf,
                                    const uint8_t* a_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      // v_buf becomes the v-u delta; presumably READYUVA422_AVX2 addresses V
      // relative to U, saving a pointer register.
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUVA422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "subl        $0x10,%[width]                \n"  // 16 pixels per pass
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    // i386 register pressure: width must live in memory here.
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
4295 #endif  // HAS_I422ALPHATOARGBROW_AVX2
4296 
4297 #if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// Same YUV->RGB math as the ARGB variants, but weaves the channels in RGBA
// byte order inline (alpha first) instead of using STOREARGB_AVX2.
void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // ymm5 = all 0xff bytes

    LABELALIGN
      "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)

    // Step 3: Weave into RGBA
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
    "vmovdqu    %%ymm0,(%[dst_argb])           \n"
    "vmovdqu    %%ymm1,0x20(%[dst_argb])       \n"
    "lea        0x40(%[dst_argb]),%[dst_argb]  \n"  // advance 64 output bytes
    "sub        $0x10,%[width]                 \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
4339 #endif  // HAS_I422TORGBAROW_AVX2
4340 
4341 #if defined(HAS_NV12TOARGBROW_AVX2)
4342 // 16 pixels.
4343 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
NV12ToARGBRow_AVX2(const uint8_t * y_buf,const uint8_t * uv_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)4344 void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
4345                                const uint8_t* uv_buf,
4346                                uint8_t* dst_argb,
4347                                const struct YuvConstants* yuvconstants,
4348                                int width) {
4349   // clang-format off
4350   asm volatile (
4351     YUVTORGB_SETUP_AVX2(yuvconstants)
4352       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4353 
4354     LABELALIGN
4355       "1:                                        \n"
4356     READNV12_AVX2
4357     YUVTORGB_AVX2(yuvconstants)
4358     STOREARGB_AVX2
4359       "sub         $0x10,%[width]                \n"
4360       "jg          1b                            \n"
4361       "vzeroupper                                \n"
4362   : [y_buf]"+r"(y_buf),    // %[y_buf]
4363     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
4364     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4365     [width]"+rm"(width)    // %[width]
4366   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4367     : "memory", "cc", YUVTORGB_REGS_AVX2
4368     "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4369   );
4370   // clang-format on
4371 }
4372 #endif  // HAS_NV12TOARGBROW_AVX2
4373 
4374 #if defined(HAS_NV21TOARGBROW_AVX2)
4375 // 16 pixels.
4376 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
NV21ToARGBRow_AVX2(const uint8_t * y_buf,const uint8_t * vu_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)4377 void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
4378                                const uint8_t* vu_buf,
4379                                uint8_t* dst_argb,
4380                                const struct YuvConstants* yuvconstants,
4381                                int width) {
4382   // clang-format off
4383   asm volatile (
4384     YUVTORGB_SETUP_AVX2(yuvconstants)
4385       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4386 
4387     LABELALIGN
4388       "1:                                        \n"
4389     READNV21_AVX2
4390     YUVTORGB_AVX2(yuvconstants)
4391     STOREARGB_AVX2
4392       "sub         $0x10,%[width]                \n"
4393       "jg          1b                            \n"
4394       "vzeroupper                                \n"
4395   : [y_buf]"+r"(y_buf),    // %[y_buf]
4396     [vu_buf]"+r"(vu_buf),    // %[vu_buf]
4397     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4398     [width]"+rm"(width)    // %[width]
4399   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
4400     [kShuffleNV21]"m"(kShuffleNV21)
4401     : "memory", "cc", YUVTORGB_REGS_AVX2
4402       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4403   );
4404   // clang-format on
4405 }
4406 #endif  // HAS_NV21TOARGBROW_AVX2
4407 
4408 #if defined(HAS_YUY2TOARGBROW_AVX2)
4409 // 16 pixels.
4410 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
YUY2ToARGBRow_AVX2(const uint8_t * yuy2_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)4411 void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
4412                                uint8_t* dst_argb,
4413                                const struct YuvConstants* yuvconstants,
4414                                int width) {
4415   // clang-format off
4416   asm volatile (
4417     YUVTORGB_SETUP_AVX2(yuvconstants)
4418       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4419 
4420     LABELALIGN
4421       "1:                                        \n"
4422     READYUY2_AVX2
4423     YUVTORGB_AVX2(yuvconstants)
4424     STOREARGB_AVX2
4425       "sub         $0x10,%[width]                \n"
4426       "jg          1b                            \n"
4427       "vzeroupper                                \n"
4428   : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
4429     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4430     [width]"+rm"(width)    // %[width]
4431   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
4432     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
4433     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
4434     : "memory", "cc", YUVTORGB_REGS_AVX2
4435       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4436   );
4437   // clang-format on
4438 }
4439 #endif  // HAS_YUY2TOARGBROW_AVX2
4440 
4441 #if defined(HAS_UYVYTOARGBROW_AVX2)
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// Packed UYVY to ARGB; the kShuffleUYVY* tables are presumably used by
// READUYVY_AVX2 to split Y and UV from the packed stream.
void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // ymm5 = all 0xff bytes

    LABELALIGN
      "1:                                        \n"
    READUYVY_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"  // 16 pixels per pass
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
4472 #endif  // HAS_UYVYTOARGBROW_AVX2
4473 
4474 #if defined(HAS_P210TOARGBROW_AVX2)
4475 // 16 pixels.
4476 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
P210ToARGBRow_AVX2(const uint16_t * y_buf,const uint16_t * uv_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)4477 void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
4478                                const uint16_t* uv_buf,
4479                                uint8_t* dst_argb,
4480                                const struct YuvConstants* yuvconstants,
4481                                int width) {
4482   // clang-format off
4483   asm volatile (
4484     YUVTORGB_SETUP_AVX2(yuvconstants)
4485       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4486 
4487     LABELALIGN
4488       "1:                                        \n"
4489     READP210_AVX2
4490     YUVTORGB_AVX2(yuvconstants)
4491     STOREARGB_AVX2
4492       "sub         $0x10,%[width]                \n"
4493       "jg          1b                            \n"
4494       "vzeroupper                                \n"
4495   : [y_buf]"+r"(y_buf),    // %[y_buf]
4496     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
4497     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4498     [width]"+rm"(width)    // %[width]
4499   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4500   : "memory", "cc", YUVTORGB_REGS_AVX2
4501       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4502   );
4503   // clang-format on
4504 }
4505 #endif  // HAS_P210TOARGBROW_AVX2
4506 
4507 #if defined(HAS_P410TOARGBROW_AVX2)
4508 // 16 pixels.
4509 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
P410ToARGBRow_AVX2(const uint16_t * y_buf,const uint16_t * uv_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)4510 void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
4511                                const uint16_t* uv_buf,
4512                                uint8_t* dst_argb,
4513                                const struct YuvConstants* yuvconstants,
4514                                int width) {
4515   // clang-format off
4516   asm volatile (
4517     YUVTORGB_SETUP_AVX2(yuvconstants)
4518       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4519 
4520     LABELALIGN
4521       "1:                                        \n"
4522     READP410_AVX2
4523     YUVTORGB_AVX2(yuvconstants)
4524     STOREARGB_AVX2
4525       "sub         $0x10,%[width]                \n"
4526       "jg          1b                            \n"
4527       "vzeroupper                                \n"
4528   : [y_buf]"+r"(y_buf),    // %[y_buf]
4529     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
4530     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
4531     [width]"+rm"(width)    // %[width]
4532   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
4533   : "memory", "cc", YUVTORGB_REGS_AVX2
4534       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4535   );
4536   // clang-format on
4537 }
4538 #endif  // HAS_P410TOARGBROW_AVX2
4539 
4540 #if defined(HAS_P210TOAR30ROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
// P210 (10-bit biplanar 4:2:2) to AR30 (2:10:10:10), preserving 10-bit depth.
void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* uv_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READP210_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"  // 16 pixels per pass
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
4575 #endif  // HAS_P210TOAR30ROW_AVX2
4576 
4577 #if defined(HAS_P410TOAR30ROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
// P410 (10-bit biplanar 4:4:4) to AR30 (2:10:10:10), preserving 10-bit depth.
void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* uv_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READP410_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"  // 16 pixels per pass
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
4612 #endif  // HAS_P410TOAR30ROW_AVX2
4613 
4614 #ifdef HAS_I400TOARGBROW_SSE2
// Expands 8 luma-only (I400/grey) pixels per loop to opaque ARGB: the scaled
// Y value is replicated into B, G and R, and alpha is forced to 0xff.
void I400ToARGBRow_SSE2(const uint8_t* y_buf,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      // Offsets 96/128 into YuvConstants select the Y gain and bias terms.
      "movdqa      96(%3),%%xmm2                 \n"  // yg = 18997 = 1.164
      "movdqa      128(%3),%%xmm3                \n"  // ygb = 1160 = 1.164 * 16
      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 0xff000000
      "pslld       $0x18,%%xmm4                  \n"

      LABELALIGN
      "1:                                        \n"
      // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
      "movq      (%0),%%xmm0                     \n"
      "lea       0x8(%0),%0                      \n"
      "punpcklbw %%xmm0,%%xmm0                   \n"
      "pmulhuw   %%xmm2,%%xmm0                   \n"
      "paddsw    %%xmm3,%%xmm0                   \n"
      "psraw     $6, %%xmm0                      \n"
      "packuswb  %%xmm0,%%xmm0                   \n"

      // Step 2: Weave into ARGB
      "punpcklbw %%xmm0,%%xmm0                   \n"
      "movdqa    %%xmm0,%%xmm1                   \n"
      "punpcklwd %%xmm0,%%xmm0                   \n"
      "punpckhwd %%xmm1,%%xmm1                   \n"
      "por       %%xmm4,%%xmm0                   \n"  // set alpha = 0xff
      "por       %%xmm4,%%xmm1                   \n"
      "movdqu    %%xmm0,(%1)                     \n"
      "movdqu    %%xmm1,0x10(%1)                 \n"
      "lea       0x20(%1),%1                     \n"

      "sub       $0x8,%2                         \n"  // 8 pixels per pass
      "jg        1b                              \n"
      : "+r"(y_buf),       // %0
        "+r"(dst_argb),    // %1
        "+rm"(width)       // %2
      : "r"(yuvconstants)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
4655 #endif  // HAS_I400TOARGBROW_SSE2
4656 
4657 #ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
// AVX2 variant of I400ToARGBRow_SSE2: replicates scaled Y into B/G/R and
// forces alpha to 0xff, 16 pixels per loop iteration.
void I400ToARGBRow_AVX2(const uint8_t* y_buf,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      // Offsets 96/128 into YuvConstants select the Y gain and bias terms.
      "vmovdqa     96(%3),%%ymm2                 \n"  // yg = 18997 = 1.164
      "vmovdqa     128(%3),%%ymm3                \n"  // ygb = -1160 = 1.164*16
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 0xff000000
      "vpslld      $0x18,%%ymm4,%%ymm4           \n"

      LABELALIGN
      "1:                                        \n"
      // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
      "vmovdqu    (%0),%%xmm0                    \n"
      "lea        0x10(%0),%0                    \n"
      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
      "vpaddsw    %%ymm3,%%ymm0,%%ymm0           \n"
      "vpsraw     $0x6,%%ymm0,%%ymm0             \n"
      "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
      // Weave the grey value into all four ARGB channels, then OR in alpha.
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
      "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
      "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
      "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
      "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
      "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
      "vmovdqu    %%ymm0,(%1)                    \n"
      "vmovdqu    %%ymm1,0x20(%1)                \n"
      "lea        0x40(%1),%1                     \n"
      "sub        $0x10,%2                       \n"
      "jg        1b                              \n"
      "vzeroupper                                \n"
      : "+r"(y_buf),       // %0
        "+r"(dst_argb),    // %1
        "+rm"(width)       // %2
      : "r"(yuvconstants)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
4699 #endif  // HAS_I400TOARGBROW_AVX2
4700 
4701 #ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

// Horizontally mirrors a row of bytes, 16 bytes per loop iteration:
// reads from the end of src (src + width - 16, moving backwards) and writes
// forward into dst.
void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-width copy for addressing
  asm volatile(

      "movdqa      %3,%%xmm5                     \n"  // byte-reverse shuffle

      LABELALIGN
      "1:                                        \n"
      "movdqu      -0x10(%0,%2,1),%%xmm0         \n"  // last 16 unprocessed bytes
      "pshufb      %%xmm5,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),           // %0
        "+r"(dst),           // %1
        "+r"(temp_width)     // %2
      : "m"(kShuffleMirror)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
4726 #endif  // HAS_MIRRORROW_SSSE3
4727 
4728 #ifdef HAS_MIRRORROW_AVX2
// Horizontally mirrors a row of bytes, 32 bytes per loop iteration (AVX2).
// vpshufb reverses bytes within each 128-bit lane; vpermq 0x4e swaps the two
// lanes to complete the full 32-byte reversal.
void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-width copy for addressing
  asm volatile(

      "vbroadcastf128 %3,%%ymm5                  \n"  // shuffle in both lanes

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     -0x20(%0,%2,1),%%ymm0         \n"  // last 32 unprocessed bytes
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),           // %0
        "+r"(dst),           // %1
        "+r"(temp_width)     // %2
      : "m"(kShuffleMirror)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
4751 #endif  // HAS_MIRRORROW_AVX2
4752 
4753 #ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the UV.
// Reverses 16-bit UV pairs while keeping the U,V byte order within each pair.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
                                       6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};

// Horizontally mirrors a row of interleaved UV pairs, 8 pairs (16 bytes) per
// loop iteration; width is in UV pairs (hence the *2 byte scaling).
void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-width copy for addressing
  asm volatile(

      "movdqa      %3,%%xmm5                     \n"  // pair-reverse shuffle

      LABELALIGN
      "1:                                        \n"
      "movdqu      -0x10(%0,%2,2),%%xmm0         \n"  // last 8 unprocessed pairs
      "pshufb      %%xmm5,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_uv),          // %0
        "+r"(dst_uv),          // %1
        "+r"(temp_width)       // %2
      : "m"(kShuffleMirrorUV)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
4778 #endif  // HAS_MIRRORUVROW_SSSE3
4779 
4780 #ifdef HAS_MIRRORUVROW_AVX2
// Horizontally mirrors a row of interleaved UV pairs, 16 pairs (32 bytes) per
// loop iteration (AVX2); vpermq 0x4e swaps the lanes after the in-lane shuffle.
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-width copy for addressing
  asm volatile(

      "vbroadcastf128 %3,%%ymm5                  \n"  // shuffle in both lanes

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     -0x20(%0,%2,2),%%ymm0         \n"  // last 16 unprocessed pairs
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uv),          // %0
        "+r"(dst_uv),          // %1
        "+r"(temp_width)       // %2
      : "m"(kShuffleMirrorUV)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
4803 #endif  // HAS_MIRRORUVROW_AVX2
4804 
4805 #ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
// Gathers reversed U bytes into the low qword and reversed V bytes into the
// high qword in a single pshufb.
static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                            15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
// Mirrors an interleaved UV row while splitting it into separate U and V
// planes, 8 UV pairs per loop iteration (width is in pairs).
void MirrorSplitUVRow_SSSE3(const uint8_t* src,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-width copy for addressing
  asm volatile(
      "movdqa      %4,%%xmm1                     \n"
      // Point src at the last 16 bytes; dst_v is kept as an offset from dst_u
      // so one register serves both output planes.
      "lea         -0x10(%0,%3,2),%0             \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         -0x10(%0),%0                  \n"
      "pshufb      %%xmm1,%%xmm0                 \n"
      "movlpd      %%xmm0,(%1)                   \n"  // 8 U bytes
      "movhpd      %%xmm0,0x00(%1,%2,1)          \n"  // 8 V bytes
      "lea         0x8(%1),%1                    \n"
      "sub         $8,%3                         \n"
      "jg          1b                            \n"
      : "+r"(src),                  // %0
        "+r"(dst_u),                // %1
        "+r"(dst_v),                // %2
        "+r"(temp_width)            // %3
      : "m"(kShuffleMirrorSplitUV)  // %4
      : "memory", "cc", "xmm0", "xmm1");
}
4836 #endif  // HAS_MIRRORSPLITUVROW_SSSE3
4837 
4838 #ifdef HAS_RGB24MIRRORROW_SSSE3
4839 
// Shuffle first 5 pixels to last 5 mirrored.  first byte zero
static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
                                         7u,   8u,  3u,  4u,  5u, 0u,  1u,  2u};

// Shuffle last 5 pixels to first 5 mirrored.  last byte zero
static const uvec8 kShuffleMirrorRGB1 = {
    13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};

// Shuffle 5 pixels at a time (15 bytes)
// Mirrors a row of RGB24 pixels (3 bytes each, channel order preserved),
// 16 pixels (48 bytes) per loop iteration, reading backwards from the end of
// the row; the loads overlap so the boundary pixel is handled by xmm3.
void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_rgb24,
                          int width) {
  intptr_t temp_width = (intptr_t)(width);
  src_rgb24 += width * 3 - 48;  // point at the final 16 pixels of the row
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // first 5
      "movdqu      15(%0),%%xmm1                 \n"  // next 5
      "movdqu      30(%0),%%xmm2                 \n"  // next 5
      "movdqu      32(%0),%%xmm3                 \n"  // last 1 special
      "pshufb      %%xmm4,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "pshufb      %%xmm4,%%xmm2                 \n"
      "pshufb      %%xmm5,%%xmm3                 \n"
      "lea         -0x30(%0),%0                  \n"
      "movdqu      %%xmm0,32(%1)                 \n"  // last 5
      "movdqu      %%xmm1,17(%1)                 \n"  // next 5
      "movdqu      %%xmm2,2(%1)                  \n"  // next 5
      "movlpd      %%xmm3,0(%1)                  \n"  // first 1
      "lea         0x30(%1),%1                   \n"
      "sub         $0x10,%2                      \n"  // 16 pixels per pass
      "jg          1b                            \n"
      : "+r"(src_rgb24),          // %0
        "+r"(dst_rgb24),          // %1
        "+r"(temp_width)          // %2
      : "m"(kShuffleMirrorRGB0),  // %3
        "m"(kShuffleMirrorRGB1)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
4883 #endif  // HAS_RGB24MIRRORROW_SSSE3
4884 
4885 #ifdef HAS_ARGBMIRRORROW_SSE2
4886 
// Horizontally mirrors a row of ARGB pixels (4 bytes each), 4 pixels per loop
// iteration; pshufd 0x1b reverses the four 32-bit pixels within the register.
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-width copy for addressing
  asm volatile(

      "lea         -0x10(%0,%2,4),%0             \n"  // last 4 pixels of src

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "pshufd      $0x1b,%%xmm0,%%xmm0           \n"
      "lea         -0x10(%0),%0                  \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),        // %0
        "+r"(dst),        // %1
        "+r"(temp_width)  // %2
      :
      : "memory", "cc", "xmm0");
}
4908 #endif  // HAS_ARGBMIRRORROW_SSE2
4909 
4910 #ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};

// Mirrors a row of ARGB pixels using a single cross-lane dword permute.
// Processes 8 pixels (32 bytes) per iteration; width is expected to be a
// multiple of 8.
void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "vmovdqu     %3,%%ymm5                     \n"

      LABELALIGN
      "1:                                        \n"
      // Load 8 pixels from the (shrinking) end of the source row and
      // reverse their order in one vpermd.
      "vpermd      -0x20(%0,%2,4),%%ymm5,%%ymm0  \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(temp_width)              // %2
      : "m"(kARGBShuffleMirror_AVX2)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
4933 #endif  // HAS_ARGBMIRRORROW_AVX2
4934 
4935 #ifdef HAS_SPLITUVROW_AVX2
// Deinterleaves a UV plane into separate U and V rows:
// dst_u[i] = src_uv[2i], dst_v[i] = src_uv[2i+1].
// Processes 32 UV pairs (64 source bytes) per iteration; width is expected
// to be a multiple of 32.
void SplitUVRow_AVX2(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      // ymm5 = 0x00FF repeated: mask selecting the low (U) byte of each word.
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      // %2 becomes dst_v - dst_u so one register indexes both outputs.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      // V bytes via word shift right; U bytes via mask; pack back to bytes.
      "vpsrlw      $0x8,%%ymm0,%%ymm2            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm3            \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpackuswb   %%ymm3,%%ymm2,%%ymm2          \n"
      // vpermq undoes the 128-bit-lane interleave that vpackuswb introduces.
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm2,0x00(%1,%2,1)          \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
4971 #endif  // HAS_SPLITUVROW_AVX2
4972 
4973 #ifdef HAS_SPLITUVROW_SSE2
// Deinterleaves a UV plane into separate U and V rows:
// dst_u[i] = src_uv[2i], dst_v[i] = src_uv[2i+1].
// Processes 16 UV pairs (32 source bytes) per iteration; width is expected
// to be a multiple of 16.
void SplitUVRow_SSE2(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      // xmm5 = 0x00FF repeated: mask selecting the low (U) byte of each word.
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      // %2 becomes dst_v - dst_u so one register indexes both outputs.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"
      // U bytes: mask and pack.  V bytes: shift words right and pack.
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm2                   \n"
      "psrlw       $0x8,%%xmm3                   \n"
      "packuswb    %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm2,0x00(%1,%2,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
5008 #endif  // HAS_SPLITUVROW_SSE2
5009 
5010 #ifdef HAS_DETILEROW_SSE2
// Copies one row of a tiled image into a linear destination: reads 16
// bytes, then advances the source by the tile stride.  width is expected
// to be a multiple of 16.
void DetileRow_SSE2(const uint8_t* src,
                    ptrdiff_t src_tile_stride,
                    uint8_t* dst,
                    int width) {
  asm volatile(
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      // sub sets the flags that jg consumes below; the intervening lea and
      // movdqu instructions do not modify EFLAGS.
      "sub         $0x10,%2                      \n"
      "lea         (%0,%3),%0                    \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "jg          1b                            \n"
      : "+r"(src),            // %0
        "+r"(dst),            // %1
        "+r"(width)           // %2
      : "r"(src_tile_stride)  // %3
      : "cc", "memory", "xmm0");
}
5029 #endif  // HAS_DETILEROW_SSE2
5030 
5031 #ifdef HAS_DETILEROW_16_SSE2
// 16-bit variant of DetileRow: copies 16 uint16 pixels (32 bytes) per
// iteration, then advances the source by the tile stride (scaled by 2 for
// 16-bit elements).  width is expected to be a multiple of 16.
void DetileRow_16_SSE2(const uint16_t* src,
                       ptrdiff_t src_tile_stride,
                       uint16_t* dst,
                       int width) {
  asm volatile(
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         (%0,%3,2),%0                  \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),            // %0
        "+r"(dst),            // %1
        "+r"(width)           // %2
      : "r"(src_tile_stride)  // %3
      : "cc", "memory", "xmm0", "xmm1");
}
#endif  // HAS_DETILEROW_16_SSE2
5053 
5054 #ifdef HAS_DETILEROW_16_AVX
// AVX variant of DetileRow_16: copies 16 uint16 pixels (32 bytes) per
// iteration with a single ymm load/store, then advances the source by the
// tile stride.  width is expected to be a multiple of 16.
void DetileRow_16_AVX(const uint16_t* src,
                      ptrdiff_t src_tile_stride,
                      uint16_t* dst,
                      int width) {
  asm volatile(
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "lea         (%0,%3,2),%0                  \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),            // %0
        "+r"(dst),            // %1
        "+r"(width)           // %2
      : "r"(src_tile_stride)  // %3
      : "cc", "memory", "xmm0");
}
#endif  // HAS_DETILEROW_16_AVX
5075 
5076 #ifdef HAS_DETILETOYUY2_SSE2
// Read 16 Y, 8 UV, and write 8 YUYV.
// Interleaves tiled Y and UV rows into packed YUY2 (Y0 U0 Y1 V0 ...).
// Each source pointer advances by its own tile stride; width is expected
// to be a multiple of 16.
void DetileToYUY2_SSE2(const uint8_t* src_y,
                       ptrdiff_t src_y_tile_stride,
                       const uint8_t* src_uv,
                       ptrdiff_t src_uv_tile_stride,
                       uint8_t* dst_yuy2,
                       int width) {
  asm volatile(
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // Load 16 Y
      // sub sets the flags for the jg below; the remaining instructions in
      // the loop (lea/movdqu/punpck) do not modify EFLAGS.
      "sub         $0x10,%3                      \n"
      "lea         (%0,%4),%0                    \n"
      "movdqu      (%1),%%xmm1                   \n"  // Load 8 UV
      "lea         (%1,%5),%1                    \n"
      "movdqu      %%xmm0,%%xmm2                 \n"  // Copy Y for high half
      // Interleave Y with UV bytes: low 8 Y in xmm0, high 8 Y in xmm2.
      "punpcklbw   %%xmm1,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm2                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "movdqu      %%xmm2,0x10(%2)               \n"
      "lea         0x20(%2),%2                   \n"
      "jg          1b                            \n"
      : "+r"(src_y),                            // %0
        "+r"(src_uv),                           // %1
        "+r"(dst_yuy2),                         // %2
        "+r"(width)                             // %3
      : "r"(src_y_tile_stride),                 // %4
        "r"(src_uv_tile_stride)                 // %5
      : "cc", "memory", "xmm0", "xmm1", "xmm2"  // Clobber list
  );
}
5107 #endif
5108 
5109 #ifdef HAS_DETILESPLITUVROW_SSSE3
// TODO(greenjustin): Look into generating these constants instead of loading
// them since this can cause branch mispredicts for fPIC code on 32-bit
// machines.
// pshufb mask: even bytes (U) to the low 8 bytes, odd bytes (V) to the
// high 8 bytes.
static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14,
                                     1, 3, 5, 7, 9, 11, 13, 15};

// TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very
// slow on older SSE2 processors.
// Deinterleaves a tiled UV row into separate U and V planes: reads 16
// bytes (8 UV pairs) per iteration and advances the source by the tile
// stride.  width is expected to be a multiple of 16.
void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
                            ptrdiff_t src_tile_stride,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
  asm volatile(
      "movdqu      %4,%%xmm1                     \n"
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         (%0, %5),%0                   \n"
      // After the shuffle: low qword = 8 U bytes, high qword = 8 V bytes.
      "pshufb      %%xmm1,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "movhps      %%xmm0,(%2)                   \n"
      "lea         0x8(%2),%2                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_uv),         // %0
        "+r"(dst_u),          // %1
        "+r"(dst_v),          // %2
        "+r"(width)           // %3
      : "m"(kDeinterlaceUV),  // %4
        "r"(src_tile_stride)  // %5
      : "cc", "memory", "xmm0", "xmm1");
}
5143 #endif  // HAS_DETILESPLITUVROW_SSSE3
5144 
5145 #ifdef HAS_MERGEUVROW_AVX512BW
// Interleaves separate U and V rows into a UV plane:
// dst_uv[2i] = src_u[i], dst_uv[2i+1] = src_v[i].
// Processes 32 UV pairs (64 output bytes) per iteration; width is expected
// to be a multiple of 32.
void MergeUVRow_AVX512BW(const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_uv,
                         int width) {
      // %1 is rewritten as src_v - src_u so one register indexes both rows.
      asm volatile("sub         %0,%1                         \n"

               LABELALIGN
      "1:                                        \n"
      // Zero-extend 32 U and 32 V bytes to words; shift V into the high
      // byte of each word and OR with U to interleave.
      "vpmovzxbw   (%0),%%zmm0                   \n"
      "vpmovzxbw   0x00(%0,%1,1),%%zmm1          \n"
      "lea         0x20(%0),%0                   \n"
      "vpsllw      $0x8,%%zmm1,%%zmm1            \n"
      "vporq       %%zmm0,%%zmm1,%%zmm2          \n"
      "vmovdqu64   %%zmm2,(%2)                   \n"
      "lea         0x40(%2),%2                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
               : "+r"(src_u),   // %0
                 "+r"(src_v),   // %1
                 "+r"(dst_uv),  // %2
                 "+r"(width)    // %3
               :
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5171 #endif  // HAS_MERGEUVROW_AVX512BW
5172 
5173 #ifdef HAS_MERGEUVROW_AVX2
// Interleaves separate U and V rows into a UV plane:
// dst_uv[2i] = src_u[i], dst_uv[2i+1] = src_v[i].
// Processes 16 UV pairs (32 output bytes) per iteration; width is expected
// to be a multiple of 16.
void MergeUVRow_AVX2(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
      // %1 is rewritten as src_v - src_u so one register indexes both rows.
      asm volatile("sub         %0,%1                         \n"

               LABELALIGN
      "1:                                        \n"
      // Zero-extend 16 U and 16 V bytes to words; shift V into the high
      // byte of each word and OR with U to interleave.
      "vpmovzxbw   (%0),%%ymm0                   \n"
      "vpmovzxbw   0x00(%0,%1,1),%%ymm1          \n"
      "lea         0x10(%0),%0                   \n"
      "vpsllw      $0x8,%%ymm1,%%ymm1            \n"
      "vpor        %%ymm0,%%ymm1,%%ymm2          \n"
      "vmovdqu     %%ymm2,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
               : "+r"(src_u),   // %0
                 "+r"(src_v),   // %1
                 "+r"(dst_uv),  // %2
                 "+r"(width)    // %3
               :
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5199 #endif  // HAS_MERGEUVROW_AVX2
5200 
5201 #ifdef HAS_MERGEUVROW_SSE2
// Interleaves separate U and V rows into a UV plane:
// dst_uv[2i] = src_u[i], dst_uv[2i+1] = src_v[i].
// Processes 16 UV pairs (32 output bytes) per iteration; width is expected
// to be a multiple of 16.
void MergeUVRow_SSE2(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
      // %1 is rewritten as src_v - src_u so one register indexes both rows.
      asm volatile("sub         %0,%1                         \n"

               LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      // Byte-interleave U (xmm0/xmm2) with V (xmm1): low half then high half.
      "punpcklbw   %%xmm1,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm2                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "movdqu      %%xmm2,0x10(%2)               \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
               : "+r"(src_u),   // %0
                 "+r"(src_v),   // %1
                 "+r"(dst_uv),  // %2
                 "+r"(width)    // %3
               :
               : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5228 #endif  // HAS_MERGEUVROW_SSE2
5229 
5230 #ifdef HAS_MERGEUVROW_16_AVX2
// Interleaves 16-bit U and V rows into a UV plane while shifting the
// depth-bit samples up to the most significant bits.  depth is the number
// of significant bits in the source (e.g. 10 or 12).
// Width is expected to be a multiple of 8.
void MergeUVRow_16_AVX2(const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint16_t* dst_uv,
                        int depth,
                        int width) {
  // clang-format off
  asm volatile (
      // xmm3 = 16 - depth (U shift), xmm4 = 32 - depth (V shift).
      "vmovd       %4,%%xmm3                     \n"
      "vmovd       %5,%%xmm4                     \n"


      // %1 is rewritten as src_v - src_u so one register indexes both rows.
      "sub         %0,%1                         \n"
      // 8 pixels per loop.

      LABELALIGN
      "1:                                        \n"
      // Zero-extend 8 U and 8 V words to dwords.
      "vpmovzxwd   (%0),%%ymm0                   \n"
      "vpmovzxwd   0x00(%0,%1,1),%%ymm1          \n"
      "lea         0x10(%0),%0                   \n"
      // U to the top of its 16-bit lane; V to the top of the dword; merge.
      "vpsllw      %%xmm3,%%ymm0,%%ymm0          \n"
      "vpslld      %%xmm4,%%ymm1,%%ymm1          \n"
      "vpor        %%ymm0,%%ymm1,%%ymm2          \n"
      "vmovdqu     %%ymm2,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : "+r"(src_u),      // %0
    "+r"(src_v),      // %1
    "+r"(dst_uv),     // %2
    "+r"(width)       // %3
  : "r"(16 - depth),  // %4
    "r"(32 - depth)   // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
  // clang-format on
}
#endif  // HAS_MERGEUVROW_16_AVX2
5268 
5269 #ifdef HAS_SPLITUVROW_16_AVX2
// pshufb mask: gathers the U words into the low 8 bytes of each 128-bit
// lane and the V words into the high 8 bytes.
const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8,  9,  12, 13,
                                 2, 3, 6, 7, 10, 11, 14, 15};

// Splits an interleaved 16-bit UV row into separate U and V planes,
// shifting msb-aligned samples back down to depth significant bits.
// Width is expected to be a multiple of 16.
void SplitUVRow_16_AVX2(const uint16_t* src_uv,
                        uint16_t* dst_u,
                        uint16_t* dst_v,
                        int depth,
                        int width) {
  depth = 16 - depth;  // convert to the right-shift amount
  // clang-format off
  asm volatile (
      "vmovd       %4,%%xmm3                     \n"
      "vbroadcastf128 %5,%%ymm4                  \n"
      // %2 becomes dst_v - dst_u so one register indexes both outputs.
      "sub         %1,%2                         \n"

    // 16 pixels per loop.
    LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "add         $0x40,%0                      \n"

      // Shift samples down to depth bits, then deinterleave within each
      // 128-bit lane and fix the lane order with vpermq.
      "vpsrlw      %%xmm3,%%ymm0,%%ymm0          \n"
      "vpsrlw      %%xmm3,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      // Low halves hold U, high halves hold V (written at dst_u + (%2)).
      "vextractf128 $0x0,%%ymm0,(%1)             \n"
      "vextractf128 $0x0,%%ymm1,0x10(%1)         \n"
      "vextractf128 $0x1,%%ymm0,(%1,%2)          \n"
      "vextractf128 $0x1,%%ymm1,0x10(%1,%2)      \n"
      "add         $0x20,%1                      \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : "+r"(src_uv),   // %0
    "+r"(dst_u),    // %1
    "+r"(dst_v),    // %2
    "+r"(width)     // %3
  : "r"(depth),     // %4
    "m"(kSplitUVShuffle16) // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
  // clang-format on
}
5314 #endif  // HAS_SPLITUVROW_16_AVX2
5315 
5316 // Use scale to convert lsb formats to msb, depending how many bits there are:
5317 // 128 = 9 bits
5318 // 64 = 10 bits
5319 // 16 = 12 bits
5320 // 1 = 16 bits
5321 #ifdef HAS_MULTIPLYROW_16_AVX2
// Multiplies each 16-bit sample by scale (low 16 bits of the product),
// converting lsb-aligned formats to msb-aligned — see the scale table above.
// Width is expected to be a multiple of 32.
void MultiplyRow_16_AVX2(const uint16_t* src_y,
                         uint16_t* dst_y,
                         int scale,
                         int width) {
  // clang-format off
  asm volatile (
      // Broadcast scale to all 16 word lanes of ymm3.
      "vmovd       %3,%%xmm3                     \n"
      "vpunpcklwd  %%xmm3,%%xmm3,%%xmm3          \n"
      "vbroadcastss %%xmm3,%%ymm3                \n"
      // %1 becomes dst_y - src_y so one register indexes both rows.
      "sub         %0,%1                         \n"

    // 32 pixels per loop.
    LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpmullw     %%ymm3,%%ymm0,%%ymm0          \n"
      "vpmullw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%0,%1)                \n"
      "vmovdqu     %%ymm1,0x20(%0,%1)            \n"
      "add         $0x40,%0                      \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm3");
  // clang-format on
}
5353 #endif  // HAS_MULTIPLYROW_16_AVX2
5354 
5355 // Use scale to convert msb formats to lsb, depending how many bits there are:
5356 // 512 = 9 bits
5357 // 1024 = 10 bits
5358 // 4096 = 12 bits
5359 // 65536 = 16 bits
5360 #ifdef HAS_DIVIDEROW_16_AVX2
// Scales each 16-bit sample down by multiplying with scale and keeping the
// high 16 bits of the product (vpmulhuw), converting msb-aligned formats
// to lsb-aligned — see the scale table above.
// Width is expected to be a multiple of 32.
void DivideRow_16_AVX2(const uint16_t* src_y,
                       uint16_t* dst_y,
                       int scale,
                       int width) {
  // clang-format off
  asm volatile (
      // Broadcast scale to all 16 word lanes of ymm3.
      "vmovd       %3,%%xmm3                     \n"
      "vpunpcklwd  %%xmm3,%%xmm3,%%xmm3          \n"
      "vbroadcastss %%xmm3,%%ymm3                \n"
      // %1 becomes dst_y - src_y so one register indexes both rows.
      "sub         %0,%1                         \n"

    // 32 pixels per loop.
    LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpmulhuw    %%ymm3,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%0,%1)                \n"
      "vmovdqu     %%ymm1,0x20(%0,%1)            \n"
      "add         $0x40,%0                      \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width),    // %2
    "+r"(scale)     // %3
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm3");
  // clang-format on
}
#endif  // HAS_DIVIDEROW_16_AVX2
5394 
5395 // Use scale to convert lsb formats to msb, depending how many bits there are:
5396 // 32768 = 9 bits
5397 // 16384 = 10 bits
5398 // 4096 = 12 bits
5399 // 256 = 16 bits
// Converts 16-bit samples to 8-bit: multiplies by scale, keeps the high 16
// bits (pmulhuw), then packs with unsigned saturation.  See the scale
// table above for per-depth values.  Width is expected to be a multiple
// of 16.
void Convert16To8Row_SSSE3(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
                           int width) {
  // clang-format off
  asm volatile (
      // Broadcast scale to all 8 word lanes of xmm2.
      "movd        %3,%%xmm2                     \n"
      "punpcklwd   %%xmm2,%%xmm2                 \n"
      "pshufd      $0x0,%%xmm2,%%xmm2            \n"

    // 16 pixels per loop.
    LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "add         $0x20,%0                      \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "add         $0x10,%1                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
5430 
5431 #ifdef HAS_CONVERT16TO8ROW_AVX2
// AVX2 variant of Convert16To8Row: scales 16-bit samples down via
// vpmulhuw and packs to 8-bit with saturation.  Width is expected to be a
// multiple of 32.
void Convert16To8Row_AVX2(const uint16_t* src_y,
                          uint8_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
      // Broadcast scale to all 16 word lanes of ymm2.
      "vmovd       %3,%%xmm2                     \n"
      "vpunpcklwd  %%xmm2,%%xmm2,%%xmm2          \n"
      "vbroadcastss %%xmm2,%%ymm2                \n"

    // 32 pixels per loop.
    LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "add         $0x40,%0                      \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"  // mutates
      // vpermq restores byte order after the per-lane pack above.
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "add         $0x20,%1                      \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
5464 #endif  // HAS_CONVERT16TO8ROW_AVX2
5465 
5466 // Use scale to convert to lsb formats depending how many bits there are:
5467 // 512 = 9 bits
5468 // 1024 = 10 bits
5469 // 4096 = 12 bits
// Converts 8-bit samples to 16-bit: duplicates each byte into both halves
// of a word (value * 257), then scales down via pmulhuw by the caller's
// scale — see the scale table above.  Width is expected to be a multiple
// of 16.
void Convert8To16Row_SSE2(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
      // Broadcast scale to all 8 word lanes of xmm2.
      "movd        %3,%%xmm2                     \n"
      "punpcklwd   %%xmm2,%%xmm2                 \n"
      "pshufd      $0x0,%%xmm2,%%xmm2            \n"

    // 16 pixels per loop.
    LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      // Self-unpack doubles each byte into a word: 0xAB -> 0xABAB.
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "add         $0x10,%0                      \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "add         $0x20,%1                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
5502 
5503 #ifdef HAS_CONVERT8TO16ROW_AVX2
// AVX2 variant of Convert8To16Row: widens bytes to words via self-unpack
// (value * 257), then scales down via vpmulhuw.  Width is expected to be
// a multiple of 32.
void Convert8To16Row_AVX2(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
      // Broadcast scale to all 16 word lanes of ymm2.
      "vmovd       %3,%%xmm2                     \n"
      "vpunpcklwd  %%xmm2,%%xmm2,%%xmm2          \n"
      "vbroadcastss %%xmm2,%%ymm2                \n"

    // 32 pixels per loop.
    LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      // Pre-permute so the per-lane unpacks below produce in-order output.
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "add         $0x20,%0                      \n"
      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"
      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "add         $0x40,%1                      \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
5537 #endif  // HAS_CONVERT8TO16ROW_AVX2
5538 
5539 #ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.
// Nine pshufb masks, three per output plane.  Each triple gathers one
// channel's bytes from three consecutive 16-byte chunks of packed 24-bit
// RGB (128 = zero the byte; the three shuffled results are OR'd together).
static const uvec8 kSplitRGBShuffle[9] = {
    {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u,
     7u, 10u, 13u},
    {1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u,
     8u, 11u, 14u},
    {2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u,
     12u, 15u}};

// Splits packed 24-bit RGB into three planar channels.  Processes 16
// pixels (48 source bytes) per iteration, re-reading the same 48 bytes
// once per output plane; width is expected to be a multiple of 16.
void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      // First channel: masks 0-2 of kSplitRGBShuffle.
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "pshufb      0(%5), %%xmm0                 \n"
      "pshufb      16(%5), %%xmm1                \n"
      "pshufb      32(%5), %%xmm2                \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"

      // Second channel: masks 3-5.
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "pshufb      48(%5),%%xmm0                 \n"
      "pshufb      64(%5),%%xmm1                 \n"
      "pshufb      80(%5), %%xmm2                \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"

      // Third channel: masks 6-8.
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "pshufb      96(%5), %%xmm0                \n"
      "pshufb      112(%5), %%xmm1               \n"
      "pshufb      128(%5), %%xmm2               \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%3)                   \n"
      "lea         0x10(%3),%3                   \n"
      "lea         0x30(%0),%0                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_rgb),             // %0
        "+r"(dst_r),               // %1
        "+r"(dst_g),               // %2
        "+r"(dst_b),               // %3
        "+r"(width)                // %4
      : "r"(&kSplitRGBShuffle[0])  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5613 #endif  // HAS_SPLITRGBROW_SSSE3
5614 
5615 #ifdef HAS_MERGERGBROW_SSSE3
5616 // Shuffle table for converting Planar to RGB.
static const uvec8 kMergeRGBShuffle[9] = {
    // Masks 0-2: scatter plane bytes into output bytes 0-15 (128 = zero,
    // so the three shuffled vectors can be OR'd together).
    {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u,
     128u, 5u},
    {128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u,
     128u, 128u},
    {128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u,
     4u, 128u},
    // Masks 3-5: output bytes 16-31.
    {128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u,
     10u, 128u},
    {5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u,
     128u, 10u},
    {128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u,
     128u, 128u},
    // Masks 6-8: output bytes 32-47.
    {128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u,
     15u, 128u, 128u},
    {128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u,
     128u, 15u, 128u},
    {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u,
     128u, 128u, 15u}};
5636 
// Merges three separate planes into a row of interleaved 3-byte RGB pixels.
// Each iteration reads 16 bytes per plane and writes 48 bytes (16 pixels);
// assumes width is a multiple of 16 — TODO confirm caller contract.
// %5 points at kMergeRGBShuffle: nine pshufb masks (three per 16-byte output
// chunk) that scatter plane bytes to their interleaved positions; 128-masked
// lanes are zero so the three shuffled vectors are combined with por.
void MergeRGBRow_SSSE3(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_rgb,
                       int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      // Output bytes 0-15 (masks 0-2).
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "movdqu      (%2),%%xmm2                   \n"
      "pshufb      (%5), %%xmm0                  \n"
      "pshufb      16(%5), %%xmm1                \n"
      "pshufb      32(%5), %%xmm2                \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%3)                   \n"

      // Output bytes 16-31 (masks 3-5).
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "movdqu      (%2),%%xmm2                   \n"
      "pshufb      48(%5), %%xmm0                \n"
      "pshufb      64(%5), %%xmm1                \n"
      "pshufb      80(%5), %%xmm2                \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,16(%3)                 \n"

      // Output bytes 32-47 (masks 6-8).
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "movdqu      (%2),%%xmm2                   \n"
      "pshufb      96(%5), %%xmm0                \n"
      "pshufb      112(%5), %%xmm1               \n"
      "pshufb      128(%5), %%xmm2               \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,32(%3)                 \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x10(%1),%1                   \n"
      "lea         0x10(%2),%2                   \n"
      "lea         0x30(%3),%3                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_r),               // %0
        "+r"(src_g),               // %1
        "+r"(src_b),               // %2
        "+r"(dst_rgb),             // %3
        "+r"(width)                // %4
      : "r"(&kMergeRGBShuffle[0])  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5690 #endif  // HAS_MERGERGBROW_SSSE3
5691 
5692 #ifdef HAS_MERGEARGBROW_SSE2
// Interleaves R, G, B and A planes into packed 32-bit ARGB pixels
// (memory byte order B,G,R,A per the BGRA unpack below), 8 pixels per
// iteration; assumes width is a multiple of 8 — TODO confirm caller contract.
// The three subs convert %1..%3 into offsets relative to %0 so a single
// "lea 8(%0)" advances all four source pointers at once.
void MergeARGBRow_SSE2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(

      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "sub         %0,%3                         \n"

      LABELALIGN
      "1:                                        \n"

      "movq        (%0,%2),%%xmm0                \n"  // B
      "movq        (%0),%%xmm1                   \n"  // R
      "movq        (%0,%1),%%xmm2                \n"  // G
      "punpcklbw   %%xmm1,%%xmm0                 \n"  // BR
      "movq        (%0,%3),%%xmm1                \n"  // A
      "punpcklbw   %%xmm1,%%xmm2                 \n"  // GA
      "movdqa      %%xmm0,%%xmm1                 \n"  // BR
      "punpckhbw   %%xmm2,%%xmm1                 \n"  // BGRA (hi)
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // BGRA (lo)
      "movdqu      %%xmm0,(%4)                   \n"
      "movdqu      %%xmm1,16(%4)                 \n"

      "lea         8(%0),%0                      \n"
      "lea         32(%4),%4                     \n"
      "sub         $0x8,%5                       \n"
      "jg          1b                            \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5733 #endif
5734 
5735 #ifdef HAS_MERGEXRGBROW_SSE2
// Interleaves R, G and B planes into packed 32-bit pixels with alpha forced
// to 255 (pcmpeqd generates all-ones), 8 pixels per iteration; assumes width
// is a multiple of 8 — TODO confirm caller contract. Same unpack structure
// as MergeARGBRow_SSE2 but with separate per-plane pointer increments.
void MergeXRGBRow_SSE2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"

      "movq        (%2),%%xmm0                   \n"  // B
      "movq        (%0),%%xmm1                   \n"  // R
      "movq        (%1),%%xmm2                   \n"  // G
      "punpcklbw   %%xmm1,%%xmm0                 \n"  // BR
      "pcmpeqd     %%xmm1,%%xmm1                 \n"  // A(255)
      "punpcklbw   %%xmm1,%%xmm2                 \n"  // GA
      "movdqa      %%xmm0,%%xmm1                 \n"  // BR
      "punpckhbw   %%xmm2,%%xmm1                 \n"  // BGRA (hi)
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // BGRA (lo)
      "movdqu      %%xmm0,(%3)                   \n"
      "movdqu      %%xmm1,16(%3)                 \n"

      "lea         8(%0),%0                      \n"
      "lea         8(%1),%1                      \n"
      "lea         8(%2),%2                      \n"
      "lea         32(%3),%3                     \n"
      "sub         $0x8,%4                       \n"
      "jg          1b                            \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGEXRGBROW_SSE2
5773 
5774 #ifdef HAS_MERGEARGBROW_AVX2
// Interleaves R, G, B and A planes into packed 32-bit ARGB, 16 pixels per
// iteration; assumes width is a multiple of 16 — TODO confirm caller
// contract. %1..%3 are converted to offsets from %0 so one lea advances all
// sources. ymm0 holds [B|R], ymm1 holds [G|A]; the byte/word unpacks plus
// vperm2i128 lane fixups produce B,G,R,A byte order in memory.
// NOTE(review): the per-load plane comments were swapped R<->G relative to
// the operands ((%0,%1) is src_g, (%0) is src_r) and are corrected below.
void MergeARGBRow_AVX2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(

      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "sub         %0,%3                         \n"

      LABELALIGN
      "1:                                        \n"

      "vmovdqu     (%0,%2),%%xmm0                \n"  // B
      "vmovdqu     (%0,%1),%%xmm1                \n"  // G
      "vinserti128 $1,(%0),%%ymm0,%%ymm0         \n"  // R
      "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1      \n"  // A
      "vpunpckhbw  %%ymm1,%%ymm0,%%ymm2          \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
      "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
      "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"
      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"
      "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
      "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
      "vmovdqu     %%ymm0,(%4)                   \n"  // First 8
      "vmovdqu     %%ymm1,32(%4)                 \n"  // Next 8
      "lea         16(%0),%0                     \n"
      "lea         64(%4),%4                     \n"
      "sub         $0x10,%5                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5819 #endif
5820 
5821 #ifdef HAS_MERGEXRGBROW_AVX2
// Interleaves R, G and B planes into packed 32-bit pixels with alpha forced
// to 255 (vpcmpeqd fills the high lane of ymm1 with all-ones), 16 pixels per
// iteration; assumes width is a multiple of 16 — TODO confirm caller
// contract. ymm0 = [B|R], ymm1 = [G|0xFF]; unpacks + lane fixups yield
// B,G,R,A byte order in memory.
// NOTE(review): the per-load plane comments were swapped R<->G relative to
// the operands ((%1) is src_g, (%0) is src_r) and are corrected below.
void MergeXRGBRow_AVX2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"

      "vmovdqu     (%2),%%xmm0                   \n"  // B
      "vpcmpeqd    %%ymm1,%%ymm1,%%ymm1          \n"  // A(255)
      "vinserti128 $0,(%1),%%ymm1,%%ymm1         \n"  // G
      "vinserti128 $1,(%0),%%ymm0,%%ymm0         \n"  // R
      "vpunpckhbw  %%ymm1,%%ymm0,%%ymm2          \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
      "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
      "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"
      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"
      "vperm2i128  $0x31,%%ymm2,%%ymm0,%%ymm1    \n"
      "vperm2i128  $0x20,%%ymm2,%%ymm0,%%ymm0    \n"
      "vmovdqu     %%ymm0,(%3)                   \n"  // First 8
      "vmovdqu     %%ymm1,32(%3)                 \n"  // Next 8

      "lea         16(%0),%0                     \n"
      "lea         16(%1),%1                     \n"
      "lea         16(%2),%2                     \n"
      "lea         64(%3),%3                     \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGEXRGBROW_AVX2
5863 
5864 #ifdef HAS_SPLITARGBROW_SSE2
// Splits packed 32-bit ARGB into four planes using a pure unpack-based 4x8
// byte transpose (no SSSE3 pshufb), 8 pixels (32 bytes) per iteration;
// assumes width is a multiple of 8 — TODO confirm caller contract.
// dst_g/dst_b/dst_a are carried as offsets from dst_r so a single
// "lea 8(%1)" advances all four destination pointers.
void SplitARGBRow_SSE2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       uint8_t* dst_a,
                       int width) {
  asm volatile(

      "sub         %1,%2                         \n"
      "sub         %1,%3                         \n"
      "sub         %1,%4                         \n"

      LABELALIGN
      "1:                                        \n"

      "movdqu      (%0),%%xmm0                   \n"  // 00-0F
      "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 00-07 10-17
      "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 08-0F 18-1F
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // 08192A3B4C5D6E7F (lo)
      "punpckhbw   %%xmm2,%%xmm1                 \n"  // 08192A3B4C5D6E7F (hi)
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 08192A3B08192A3B
      "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 4C5D6E7F4C5D6E7F
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
      "punpckhbw   %%xmm2,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
      "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
      "movlps      %%xmm0,(%1,%3)                \n"  // B
      "movhps      %%xmm0,(%1,%2)                \n"  // G
      "movlps      %%xmm2,(%1)                   \n"  // R
      "movhps      %%xmm2,(%1,%4)                \n"  // A

      "lea         32(%0),%0                     \n"
      "lea         8(%1),%1                      \n"
      "sub         $0x8,%5                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),     // %1
        "+r"(dst_g),     // %2
        "+r"(dst_b),     // %3
        "+r"(dst_a),     // %4
        "+rm"(width)     // %5
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5915 #endif
5916 
5917 #ifdef HAS_SPLITXRGBROW_SSE2
// Splits packed 32-bit ARGB into R, G and B planes, discarding alpha.
// Same unpack-based transpose as SplitARGBRow_SSE2 but with independent
// destination pointers and no alpha store; 8 pixels (32 bytes) per
// iteration, assumes width is a multiple of 8 — TODO confirm caller
// contract.
void SplitXRGBRow_SSE2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"

      "movdqu      (%0),%%xmm0                   \n"  // 00-0F
      "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 00-07 10-17
      "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 08-0F 18-1F
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // 08192A3B4C5D6E7F (lo)
      "punpckhbw   %%xmm2,%%xmm1                 \n"  // 08192A3B4C5D6E7F (hi)
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpcklqdq  %%xmm1,%%xmm0                 \n"  // 08192A3B08192A3B
      "punpckhqdq  %%xmm1,%%xmm2                 \n"  // 4C5D6E7F4C5D6E7F
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
      "punpckhbw   %%xmm2,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
      "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
      "movlps      %%xmm0,(%3)                   \n"  // B
      "movhps      %%xmm0,(%2)                   \n"  // G
      "movlps      %%xmm2,(%1)                   \n"  // R

      "lea         32(%0),%0                     \n"
      "lea         8(%1),%1                      \n"
      "lea         8(%2),%2                      \n"
      "lea         8(%3),%3                      \n"
      "sub         $0x8,%4                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),     // %1
        "+r"(dst_g),     // %2
        "+r"(dst_b),     // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
5963 #endif
5964 
// pshufb mask performing a 4x4 byte transpose within a 16-byte vector:
// gathers byte 0 of each of 4 pixels, then byte 1, byte 2, byte 3.
static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8,  12, 1, 5, 9,  13,
                                            2, 6, 10, 14, 3, 7, 11, 15};
5967 #ifdef HAS_SPLITARGBROW_SSSE3
// Splits packed 32-bit ARGB into four planes using pshufb
// (kShuffleMaskARGBSplit) instead of the longer SSE2 unpack chain;
// 8 pixels (32 bytes) per iteration, assumes width is a multiple of 8 —
// TODO confirm caller contract. dst_g/dst_b/dst_a are carried as offsets
// from dst_r so one lea advances all four destinations.
void SplitARGBRow_SSSE3(const uint8_t* src_argb,
                        uint8_t* dst_r,
                        uint8_t* dst_g,
                        uint8_t* dst_b,
                        uint8_t* dst_a,
                        int width) {
  asm volatile(

      "movdqa      %6,%%xmm3                     \n"
      "sub         %1,%2                         \n"
      "sub         %1,%3                         \n"
      "sub         %1,%4                         \n"

      LABELALIGN
      "1:                                        \n"

      "movdqu      (%0),%%xmm0                   \n"  // 00-0F
      "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
      "pshufb      %%xmm3,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
      "pshufb      %%xmm3,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
      "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
      "movlps      %%xmm0,(%1,%3)                \n"  // B
      "movhps      %%xmm0,(%1,%2)                \n"  // G
      "movlps      %%xmm2,(%1)                   \n"  // R
      "movhps      %%xmm2,(%1,%4)                \n"  // A

      "lea         32(%0),%0                     \n"
      "lea         8(%1),%1                      \n"
      "subl        $0x8,%5                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),     // %1
        "+r"(dst_g),     // %2
        "+r"(dst_b),     // %3
        "+r"(dst_a),     // %4
// On i386 width is forced to a memory operand (presumably not enough free
// GP registers for six "r" operands — NOTE(review): confirm), hence the
// explicit "subl" above.
#if defined(__i386__)
        "+m"(width)  // %5
#else
        "+rm"(width)          // %5
#endif
      : "m"(kShuffleMaskARGBSplit)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
6013 #endif
6014 
6015 #ifdef HAS_SPLITXRGBROW_SSSE3
// Splits packed 32-bit ARGB into R, G and B planes (alpha discarded) using
// pshufb (kShuffleMaskARGBSplit); 8 pixels (32 bytes) per iteration,
// assumes width is a multiple of 8 — TODO confirm caller contract.
void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
                        uint8_t* dst_r,
                        uint8_t* dst_g,
                        uint8_t* dst_b,
                        int width) {
  asm volatile(

      "movdqa      %5,%%xmm3                     \n"

      LABELALIGN
      "1:                                        \n"

      "movdqu      (%0),%%xmm0                   \n"  // 00-0F
      "movdqu      16(%0),%%xmm1                 \n"  // 10-1F
      "pshufb      %%xmm3,%%xmm0                 \n"  // 048C159D26AE37BF (lo)
      "pshufb      %%xmm3,%%xmm1                 \n"  // 048C159D26AE37BF (hi)
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpckldq   %%xmm1,%%xmm0                 \n"  // 048C048C159D159D (BG)
      "punpckhdq   %%xmm1,%%xmm2                 \n"  // 26AE26AE37BF37BF (RA)
      "movlps      %%xmm0,(%3)                   \n"  // B
      "movhps      %%xmm0,(%2)                   \n"  // G
      "movlps      %%xmm2,(%1)                   \n"  // R

      "lea         32(%0),%0                     \n"
      "lea         8(%1),%1                      \n"
      "lea         8(%2),%2                      \n"
      "lea         8(%3),%3                      \n"
      "sub         $0x8,%4                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),             // %0
        "+r"(dst_r),                // %1
        "+r"(dst_g),                // %2
        "+r"(dst_b),                // %3
        "+r"(width)                 // %4
      : "m"(kShuffleMaskARGBSplit)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
6053 #endif
6054 
6055 #ifdef HAS_SPLITARGBROW_AVX2
6056 static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7};
// Splits packed 32-bit ARGB into four planes, 16 pixels (64 bytes) per
// iteration; assumes width is a multiple of 16 — TODO confirm caller
// contract. Per-lane vpshufb (kShuffleMaskARGBSplit, broadcast to both
// lanes) transposes each 4-pixel group, vpermd (kShuffleMaskARGBPermute)
// fixes cross-lane order, then dword unpacks separate the channels.
// dst_g/dst_b/dst_a are carried as offsets from dst_r.
void SplitARGBRow_AVX2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       uint8_t* dst_a,
                       int width) {
  asm volatile(

      "sub         %1,%2                         \n"
      "sub         %1,%3                         \n"
      "sub         %1,%4                         \n"
      "vmovdqa     %7,%%ymm3                     \n"
      "vbroadcastf128 %6,%%ymm4                  \n"

      LABELALIGN
      "1:                                        \n"

      "vmovdqu     (%0),%%xmm0                   \n"  // 00-0F
      "vmovdqu     16(%0),%%xmm1                 \n"  // 10-1F
      "vinserti128 $1,32(%0),%%ymm0,%%ymm0       \n"  // 00-0F 20-2F
      "vinserti128 $1,48(%0),%%ymm1,%%ymm1       \n"  // 10-1F 30-3F
      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
      "vpermd      %%ymm0,%%ymm3,%%ymm0          \n"
      "vpermd      %%ymm1,%%ymm3,%%ymm1          \n"
      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"  // GA
      "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"  // BR
      "vmovdqu     %%xmm0,(%1,%3)                \n"  // B
      "vextracti128 $1,%%ymm0,(%1)               \n"  // R
      "vmovdqu     %%xmm2,(%1,%2)                \n"  // G
      "vextracti128 $1,%%ymm2,(%1,%4)            \n"  // A
      "lea         64(%0),%0                     \n"
      "lea         16(%1),%1                     \n"
      "subl        $0x10,%5                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),     // %1
        "+r"(dst_g),     // %2
        "+r"(dst_b),     // %3
        "+r"(dst_a),     // %4
// On i386 width is forced to a memory operand (presumably not enough free
// GP registers — NOTE(review): confirm), hence the explicit "subl" above.
#if defined(__i386__)
        "+m"(width)  // %5
#else
        "+rm"(width)          // %5
#endif
      : "m"(kShuffleMaskARGBSplit),   // %6
        "m"(kShuffleMaskARGBPermute)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
6107 #endif
6108 
6109 #ifdef HAS_SPLITXRGBROW_AVX2
// Splits packed 32-bit ARGB into R, G and B planes (alpha discarded),
// 16 pixels (64 bytes) per iteration; assumes width is a multiple of 16 —
// TODO confirm caller contract. Same vpshufb + vpermd + dword-unpack
// pipeline as SplitARGBRow_AVX2 but with independent destination pointers
// and no alpha store.
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(

      "vmovdqa     %6,%%ymm3                     \n"
      "vbroadcastf128 %5,%%ymm4                  \n"

      LABELALIGN
      "1:                                        \n"

      "vmovdqu     (%0),%%xmm0                   \n"  // 00-0F
      "vmovdqu     16(%0),%%xmm1                 \n"  // 10-1F
      "vinserti128 $1,32(%0),%%ymm0,%%ymm0       \n"  // 00-0F 20-2F
      "vinserti128 $1,48(%0),%%ymm1,%%ymm1       \n"  // 10-1F 30-3F
      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
      "vpermd      %%ymm0,%%ymm3,%%ymm0          \n"
      "vpermd      %%ymm1,%%ymm3,%%ymm1          \n"
      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"  // GA
      "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"  // BR
      "vmovdqu     %%xmm0,(%3)                   \n"  // B
      "vextracti128 $1,%%ymm0,(%1)               \n"  // R
      "vmovdqu     %%xmm2,(%2)                   \n"  // G

      "lea         64(%0),%0                     \n"
      "lea         16(%1),%1                     \n"
      "lea         16(%2),%2                     \n"
      "lea         16(%3),%3                     \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),               // %0
        "+r"(dst_r),                  // %1
        "+r"(dst_g),                  // %2
        "+r"(dst_b),                  // %3
        "+r"(width)                   // %4
      : "m"(kShuffleMaskARGBSplit),   // %5
        "m"(kShuffleMaskARGBPermute)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
6153 #endif
6154 
6155 #ifdef HAS_MERGEXR30ROW_AVX2
MergeXR30Row_AVX2(const uint16_t * src_r,const uint16_t * src_g,const uint16_t * src_b,uint8_t * dst_ar30,int depth,int width)6156 void MergeXR30Row_AVX2(const uint16_t* src_r,
6157                        const uint16_t* src_g,
6158                        const uint16_t* src_b,
6159                        uint8_t* dst_ar30,
6160                        int depth,
6161                        int width) {
6162   int shift = depth - 10;
6163   asm volatile(
6164 
6165       "sub         %0,%1                         \n"
6166       "sub         %0,%2                         \n"
6167       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
6168       "vpsrlw      $14,%%ymm5,%%ymm5             \n"
6169       "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
6170       "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"
6171       "vpsrlw      $6,%%ymm6,%%ymm6              \n"
6172       "vmovd       %5,%%xmm4                     \n"
6173 
6174       LABELALIGN
6175       "1:                                        \n"
6176       "vmovdqu     (%0),%%ymm0                   \n"
6177       "vmovdqu     (%0,%1),%%ymm1                \n"
6178       "vmovdqu     (%0,%2),%%ymm2                \n"
6179       "vpsrlw      %%xmm4,%%ymm0,%%ymm0          \n"
6180       "vpsrlw      %%xmm4,%%ymm1,%%ymm1          \n"
6181       "vpsrlw      %%xmm4,%%ymm2,%%ymm2          \n"
6182       "vpminuw     %%ymm0,%%ymm6,%%ymm0          \n"
6183       "vpminuw     %%ymm1,%%ymm6,%%ymm1          \n"
6184       "vpminuw     %%ymm2,%%ymm6,%%ymm2          \n"
6185       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
6186       "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
6187       "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
6188       "vpsllw      $0x4,%%ymm0,%%ymm0            \n"  // Shift R to target bit
6189       "vpunpckhwd  %%ymm0,%%ymm2,%%ymm3          \n"  // RB
6190       "vpunpcklwd  %%ymm0,%%ymm2,%%ymm0          \n"
6191       "vpunpckhwd  %%ymm5,%%ymm1,%%ymm2          \n"  // AG
6192       "vpunpcklwd  %%ymm5,%%ymm1,%%ymm1          \n"
6193       "vpslld      $0xa,%%ymm1,%%ymm1            \n"  // Shift AG to target bit
6194       "vpslld      $0xa,%%ymm2,%%ymm2            \n"
6195       "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // Combine
6196       "vpor        %%ymm2,%%ymm3,%%ymm3          \n"
6197       "vmovdqu     %%ymm0,(%3)                   \n"
6198       "vmovdqu     %%ymm3,0x20(%3)               \n"
6199       "lea         0x20(%0),%0                   \n"
6200       "lea         0x40(%3),%3                   \n"
6201       "sub         $0x10,%4                      \n"
6202       "jg          1b                            \n"
6203       "vzeroupper                                \n"
6204       : "+r"(src_r),     // %0
6205         "+r"(src_g),     // %1
6206         "+r"(src_b),     // %2
6207         "+r"(dst_ar30),  // %3
6208         "+r"(width)      // %4
6209 #if defined(__i386__)
6210       : "m"(shift)  // %5
6211 #else
6212       : "rm"(shift)           // %5
6213 #endif
6214       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
6215 }
6216 #endif
6217 
#ifdef HAS_MERGEAR64ROW_AVX2
// vpermd control that undoes the lane interleave introduced by the
// in-lane vpunpck* merges below.
static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7};
// Merge planar 16-bit R, G, B and A rows into packed AR64 pixels
// (words stored B,G,R,A in memory), 16 pixels per loop iteration.
// depth is the number of significant bits per channel; each sample is
// clamped to (1 << depth) - 1 then shifted left by (16 - depth) so it
// occupies the full 16-bit range.  width is in pixels.
void MergeAR64Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       const uint16_t* src_a,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;       // up-shift to fill 16 bits
  int mask = (1 << depth) - 1;  // max channel value at this depth
  mask = (mask << 16) + mask;   // duplicate so vbroadcastss fills every word
  asm volatile(

      // Rebase G/B/A pointers as offsets from src_r so a single index
      // register advances all four planes.
      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "sub         %0,%3                         \n"
      "vmovdqa     %8,%%ymm5                     \n"  // permute control
      "vmovd       %6,%%xmm6                     \n"  // shift count
      "vbroadcastss %7,%%ymm7                    \n"  // per-word clamp value

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // R
      "vmovdqu     (%0,%1),%%ymm1                \n"  // G
      "vmovdqu     (%0,%2),%%ymm2                \n"  // B
      "vmovdqu     (%0,%3),%%ymm3                \n"  // A
      "vpminuw     %%ymm0,%%ymm7,%%ymm0          \n"  // clamp to depth
      "vpminuw     %%ymm1,%%ymm7,%%ymm1          \n"
      "vpminuw     %%ymm2,%%ymm7,%%ymm2          \n"
      "vpminuw     %%ymm3,%%ymm7,%%ymm3          \n"
      "vpsllw      %%xmm6,%%ymm0,%%ymm0          \n"  // scale to 16 bits
      "vpsllw      %%xmm6,%%ymm1,%%ymm1          \n"
      "vpsllw      %%xmm6,%%ymm2,%%ymm2          \n"
      "vpsllw      %%xmm6,%%ymm3,%%ymm3          \n"
      "vpermd      %%ymm0,%%ymm5,%%ymm0          \n"  // pre-permute so the
      "vpermd      %%ymm1,%%ymm5,%%ymm1          \n"  // in-lane unpacks below
      "vpermd      %%ymm2,%%ymm5,%%ymm2          \n"  // emit pixels in order
      "vpermd      %%ymm3,%%ymm5,%%ymm3          \n"
      "vpunpcklwd  %%ymm1,%%ymm2,%%ymm4          \n"  // BG(low)
      "vpunpckhwd  %%ymm1,%%ymm2,%%ymm1          \n"  // BG(hi)
      "vpunpcklwd  %%ymm3,%%ymm0,%%ymm2          \n"  // RA(low)
      "vpunpckhwd  %%ymm3,%%ymm0,%%ymm0          \n"  // RA(hi)
      "vpunpckldq  %%ymm2,%%ymm4,%%ymm3          \n"  // BGRA(1)
      "vpunpckhdq  %%ymm2,%%ymm4,%%ymm4          \n"  // BGRA(3)
      "vpunpckldq  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA(2)
      "vpunpckhdq  %%ymm0,%%ymm1,%%ymm1          \n"  // BGRA(4)
      "vmovdqu     %%ymm3,(%4)                   \n"
      "vmovdqu     %%ymm2,0x20(%4)               \n"
      "vmovdqu     %%ymm4,0x40(%4)               \n"
      "vmovdqu     %%ymm1,0x60(%4)               \n"
      "lea         0x20(%0),%0                   \n"  // 16 source samples
      "lea         0x80(%4),%4                   \n"  // 16 x 8-byte pixels
      "subl        $0x10,%5                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_ar64),  // %4
#if defined(__i386__)
        "+m"(width)  // %5
#else
        "+rm"(width)          // %5
#endif
      : "m"(shift),            // %6
        "m"(mask),             // %7
        "m"(MergeAR64Permute)  // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif
6291 
#ifdef HAS_MERGEXR64ROW_AVX2
// Merge planar 16-bit R, G and B rows into packed AR64 pixels with the
// alpha channel forced to 0xffff (opaque), 16 pixels per loop.
// Same clamp/scale scheme as MergeAR64Row_AVX2: samples are clamped to
// (1 << depth) - 1 then shifted left by (16 - depth).
void MergeXR64Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;       // up-shift to fill 16 bits
  int mask = (1 << depth) - 1;  // max channel value at this depth
  mask = (mask << 16) + mask;   // duplicate so vbroadcastss fills every word
  asm volatile(

      // Rebase G/B pointers as offsets from src_r.
      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "vmovdqa     %7,%%ymm5                     \n"  // permute control
      "vmovd       %5,%%xmm6                     \n"  // shift count
      "vbroadcastss %6,%%ymm7                    \n"  // per-word clamp value

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // R
      "vmovdqu     (%0,%1),%%ymm1                \n"  // G
      "vmovdqu     (%0,%2),%%ymm2                \n"  // B
      "vpminuw     %%ymm0,%%ymm7,%%ymm0          \n"  // clamp to depth
      "vpminuw     %%ymm1,%%ymm7,%%ymm1          \n"
      "vpminuw     %%ymm2,%%ymm7,%%ymm2          \n"
      "vpsllw      %%xmm6,%%ymm0,%%ymm0          \n"  // scale to 16 bits
      "vpsllw      %%xmm6,%%ymm1,%%ymm1          \n"
      "vpsllw      %%xmm6,%%ymm2,%%ymm2          \n"
      "vpermd      %%ymm0,%%ymm5,%%ymm0          \n"  // pre-permute for the
      "vpermd      %%ymm1,%%ymm5,%%ymm1          \n"  // in-lane unpacks
      "vpermd      %%ymm2,%%ymm5,%%ymm2          \n"
      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"  // A (0xffff)
      "vpunpcklwd  %%ymm1,%%ymm2,%%ymm4          \n"  // BG(low)
      "vpunpckhwd  %%ymm1,%%ymm2,%%ymm1          \n"  // BG(hi)
      "vpunpcklwd  %%ymm3,%%ymm0,%%ymm2          \n"  // RA(low)
      "vpunpckhwd  %%ymm3,%%ymm0,%%ymm0          \n"  // RA(hi)
      "vpunpckldq  %%ymm2,%%ymm4,%%ymm3          \n"  // BGRA(1)
      "vpunpckhdq  %%ymm2,%%ymm4,%%ymm4          \n"  // BGRA(3)
      "vpunpckldq  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA(2)
      "vpunpckhdq  %%ymm0,%%ymm1,%%ymm1          \n"  // BGRA(4)
      "vmovdqu     %%ymm3,(%3)                   \n"
      "vmovdqu     %%ymm2,0x20(%3)               \n"
      "vmovdqu     %%ymm4,0x40(%3)               \n"
      "vmovdqu     %%ymm1,0x60(%3)               \n"
      "lea         0x20(%0),%0                   \n"  // 16 source samples
      "lea         0x80(%3),%3                   \n"  // 16 x 8-byte pixels
      "subl        $0x10,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_r),           // %0
        "+r"(src_g),           // %1
        "+r"(src_b),           // %2
        "+r"(dst_ar64),        // %3
        "+r"(width)            // %4
      : "m"(shift),            // %5
        "m"(mask),             // %6
        "m"(MergeAR64Permute)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif
6354 
#ifdef HAS_MERGEARGB16TO8ROW_AVX2
// vpshufb control that interleaves the two packed 8-byte halves of a
// vpackuswb result within each lane.
static const uvec8 MergeARGB16To8Shuffle = {0, 8,  1, 9,  2, 10, 3, 11,
                                            4, 12, 5, 13, 6, 14, 7, 15};
// Merge planar 16-bit R, G, B, A rows into packed 8-bit ARGB
// (B,G,R,A byte order in memory), 16 pixels per loop iteration.
// depth is the significant bits per channel; each sample is shifted
// right by (depth - 8) to reduce it to 8 bits (values above the depth
// range saturate via vpackuswb).
void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            const uint16_t* src_a,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = depth - 8;  // down-shift to 8 significant bits
  asm volatile(

      // Rebase G/B/A pointers as offsets from src_r.
      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "sub         %0,%3                         \n"
      "vbroadcastf128 %7,%%ymm5                  \n"  // shuffle control
      "vmovd       %6,%%xmm6                     \n"  // shift count

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // R
      "vmovdqu     (%0,%1),%%ymm1                \n"  // G
      "vmovdqu     (%0,%2),%%ymm2                \n"  // B
      "vmovdqu     (%0,%3),%%ymm3                \n"  // A
      "vpsrlw      %%xmm6,%%ymm0,%%ymm0          \n"  // reduce to 8 bits
      "vpsrlw      %%xmm6,%%ymm1,%%ymm1          \n"
      "vpsrlw      %%xmm6,%%ymm2,%%ymm2          \n"
      "vpsrlw      %%xmm6,%%ymm3,%%ymm3          \n"
      "vpackuswb   %%ymm1,%%ymm2,%%ymm1          \n"  // BG (planar)
      "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"  // RA (planar)
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"  // BG (interleave)
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // RA (interleave)
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"  // fix lane order
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpunpcklwd  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA (low)
      "vpunpckhwd  %%ymm0,%%ymm1,%%ymm0          \n"  // BGRA (hi)
      "vmovdqu     %%ymm2,(%4)                   \n"
      "vmovdqu     %%ymm0,0x20(%4)               \n"
      "lea         0x20(%0),%0                   \n"  // 16 source samples
      "lea         0x40(%4),%4                   \n"  // 16 x 4-byte pixels
      "subl        $0x10,%5                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
#if defined(__i386__)
        "+m"(width)  // %5
#else
        "+rm"(width)          // %5
#endif
      : "m"(shift),                 // %6
        "m"(MergeARGB16To8Shuffle)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
6414 
#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
// Merge planar 16-bit R, G and B rows into packed 8-bit ARGB with the
// alpha byte forced to 0xff (opaque), 16 pixels per loop.
// Same down-shift scheme as MergeARGB16To8Row_AVX2 and reuses its
// shuffle table.
void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = depth - 8;  // down-shift to 8 significant bits
  asm volatile(

      // Rebase G/B pointers as offsets from src_r.
      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "vbroadcastf128 %6,%%ymm5                  \n"  // shuffle control
      "vmovd       %5,%%xmm6                     \n"  // shift count
      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"
      "vpsrlw      $8,%%ymm3,%%ymm3              \n"  // A (0xff)

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // R
      "vmovdqu     (%0,%1),%%ymm1                \n"  // G
      "vmovdqu     (%0,%2),%%ymm2                \n"  // B
      "vpsrlw      %%xmm6,%%ymm0,%%ymm0          \n"  // reduce to 8 bits
      "vpsrlw      %%xmm6,%%ymm1,%%ymm1          \n"
      "vpsrlw      %%xmm6,%%ymm2,%%ymm2          \n"
      "vpackuswb   %%ymm1,%%ymm2,%%ymm1          \n"  // BG (planar)
      "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"  // RA (planar)
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"  // BG (interleave)
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // RA (interleave)
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"  // fix lane order
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpunpcklwd  %%ymm0,%%ymm1,%%ymm2          \n"  // BGRA (low)
      "vpunpckhwd  %%ymm0,%%ymm1,%%ymm0          \n"  // BGRA (hi)
      "vmovdqu     %%ymm2,(%3)                   \n"
      "vmovdqu     %%ymm0,0x20(%3)               \n"
      "lea         0x20(%0),%0                   \n"  // 16 source samples
      "lea         0x40(%3),%3                   \n"  // 16 x 4-byte pixels
      "subl        $0x10,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_r),                // %0
        "+r"(src_g),                // %1
        "+r"(src_b),                // %2
        "+r"(dst_argb),             // %3
        "+r"(width)                 // %4
      : "m"(shift),                 // %5
        "m"(MergeARGB16To8Shuffle)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
6465 
#ifdef HAS_COPYROW_SSE2
// Copy a row of bytes, 32 bytes per iteration.  Uses aligned loads and
// stores (movdqa) when both src and dst are 16-byte aligned, otherwise
// falls back to the unaligned (movdqu) loop at label 2.
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Branch to the unaligned path if either pointer is misaligned.
      "test        $0xf,%0                       \n"
      "jne         2f                            \n"
      "test        $0xf,%1                       \n"
      "jne         2f                            \n"

      LABELALIGN
      "1:                                        \n"  // aligned copy loop
      "movdqa      (%0),%%xmm0                   \n"
      "movdqa      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqa      %%xmm0,(%1)                   \n"
      "movdqa      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "jmp         9f                            \n"

      LABELALIGN
      "2:                                        \n"  // unaligned copy loop
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          2b                            \n"

      LABELALIGN "9:                                        \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_COPYROW_SSE2
6505 
#ifdef HAS_COPYROW_AVX
// Copy a row of bytes with unaligned AVX loads/stores, 64 bytes per
// iteration.  width is in bytes.
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x40,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"  // avoid AVX-SSE transition penalty
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_COPYROW_AVX
6528 
#ifdef HAS_COPYROW_ERMS
// Multiple of 1.
// Byte copy via `rep movsb`; fast on CPUs with Enhanced Rep MovSB.
// src/dst/count are bound to the registers rep movsb requires
// (rsi/rdi/rcx) through the S/D/c constraints.
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
  size_t width_tmp = (size_t)(width);  // rcx must be pointer-sized
  asm volatile(

      "rep         movsb                         \n"
      : "+S"(src),       // %0
        "+D"(dst),       // %1
        "+c"(width_tmp)  // %2
      :
      : "memory", "cc");
}
#endif  // HAS_COPYROW_ERMS
6543 
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
// Copy only the alpha byte of each ARGB pixel from src into dst,
// preserving dst's B, G and R bytes.  8 pixels per iteration.
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm0,%%xmm0                 \n"
      "pslld       $0x18,%%xmm0                  \n"  // xmm0 = 0xff000000 mask (alpha)
      "pcmpeqb     %%xmm1,%%xmm1                 \n"
      "psrld       $0x8,%%xmm1                   \n"  // xmm1 = 0x00ffffff mask (color)

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm2                   \n"  // src pixels
      "movdqu      0x10(%0),%%xmm3               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqu      (%1),%%xmm4                   \n"  // dst pixels
      "movdqu      0x10(%1),%%xmm5               \n"
      "pand        %%xmm0,%%xmm2                 \n"  // keep src alpha
      "pand        %%xmm0,%%xmm3                 \n"
      "pand        %%xmm1,%%xmm4                 \n"  // keep dst color
      "pand        %%xmm1,%%xmm5                 \n"
      "por         %%xmm4,%%xmm2                 \n"  // combine
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "movdqu      %%xmm3,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBCOPYALPHAROW_SSE2
6578 
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
// Copy only the alpha byte of each ARGB pixel from src into dst using a
// byte blend: ymm0 selects dst's color bytes, leaving src's alpha.
// 16 pixels per iteration.
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vpcmpeqb    %%ymm0,%%ymm0,%%ymm0          \n"
      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"  // 0x00ffffff blend mask

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm1                   \n"  // src pixels
      "vmovdqu     0x20(%0),%%ymm2               \n"
      "lea         0x40(%0),%0                   \n"
      "vpblendvb   %%ymm0,(%1),%%ymm1,%%ymm1     \n"  // take dst BGR, src A
      "vpblendvb   %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu     %%ymm1,(%1)                   \n"
      "vmovdqu     %%ymm2,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_ARGBCOPYALPHAROW_AVX2
6606 
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
// Extract the alpha byte of each ARGB pixel into a planar row,
// 8 pixels per iteration.
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0), %%xmm0                  \n"
      "movdqu      0x10(%0), %%xmm1              \n"
      "lea         0x20(%0), %0                  \n"
      "psrld       $0x18, %%xmm0                 \n"  // isolate alpha in each dword
      "psrld       $0x18, %%xmm1                 \n"
      "packssdw    %%xmm1, %%xmm0                \n"  // dwords -> words
      "packuswb    %%xmm0, %%xmm0                \n"  // words -> bytes
      "movq        %%xmm0,(%1)                   \n"  // store 8 alpha bytes
      "lea         0x8(%1), %1                   \n"
      "sub         $0x8, %2                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+rm"(width)     // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
6634 
#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
// vpshufb control that moves byte 3 (alpha) of each dword to the low
// byte and zeroes the rest (0x80 lanes produce zero).
static const uvec8 kShuffleAlphaShort_AVX2 = {
    3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};

// Extract the alpha byte of each ARGB pixel into a planar row,
// 32 pixels per iteration.  kPermdARGBToY_AVX (defined earlier in this
// file) undoes the lane mutation of the in-lane pack instructions.
void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "vmovdqa     %3,%%ymm4                     \n"  // unmutate permute
      "vbroadcastf128 %4,%%ymm5                  \n"  // alpha shuffle

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0), %%ymm0                  \n"
      "vmovdqu     0x20(%0), %%ymm1              \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // vpsrld $0x18, %%ymm0
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x40(%0), %%ymm2              \n"
      "vmovdqu     0x60(%0), %%ymm3              \n"
      "lea         0x80(%0), %0                  \n"
      "vpackssdw   %%ymm1, %%ymm0, %%ymm0        \n"  // mutates
      "vpshufb     %%ymm5,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
      "vpackssdw   %%ymm3, %%ymm2, %%ymm2        \n"  // mutates
      "vpackuswb   %%ymm2,%%ymm0,%%ymm0          \n"  // mutates.
      "vpermd      %%ymm0,%%ymm4,%%ymm0          \n"  // unmutate.
      "vmovdqu     %%ymm0,(%1)                   \n"  // store 32 alpha bytes
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20, %2                     \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),               // %0
        "+r"(dst_a),                  // %1
        "+rm"(width)                  // %2
      : "m"(kPermdARGBToY_AVX),       // %3
        "m"(kShuffleAlphaShort_AVX2)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
6675 
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
// Copy 8 Y (luma) bytes from src into the alpha channel of 8 ARGB dst
// pixels, leaving dst's B, G and R bytes untouched.
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm0,%%xmm0                 \n"
      "pslld       $0x18,%%xmm0                  \n"  // xmm0 = 0xff000000 mask (alpha)
      "pcmpeqb     %%xmm1,%%xmm1                 \n"
      "psrld       $0x8,%%xmm1                   \n"  // xmm1 = 0x00ffffff mask (color)

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm2                   \n"  // 8 Y bytes
      "lea         0x8(%0),%0                    \n"
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // duplicate to words
      // Note: xmm3's previous contents leak into the low words here,
      // but they are cleared by the 0xff000000 pand below, so only the
      // Y byte (top byte of each dword) survives.
      "punpckhwd   %%xmm2,%%xmm3                 \n"
      "punpcklwd   %%xmm2,%%xmm2                 \n"  // expand to dwords
      "movdqu      (%1),%%xmm4                   \n"  // dst pixels
      "movdqu      0x10(%1),%%xmm5               \n"
      "pand        %%xmm0,%%xmm2                 \n"  // keep Y as alpha
      "pand        %%xmm0,%%xmm3                 \n"
      "pand        %%xmm1,%%xmm4                 \n"  // keep dst color
      "pand        %%xmm1,%%xmm5                 \n"
      "por         %%xmm4,%%xmm2                 \n"  // combine
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "movdqu      %%xmm3,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
6712 
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
// Copy 16 Y (luma) bytes from src into the alpha channel of 16 ARGB dst
// pixels per iteration, preserving dst's B, G and R bytes via byte
// blend.
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vpcmpeqb    %%ymm0,%%ymm0,%%ymm0          \n"
      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"  // 0x00ffffff blend mask

      LABELALIGN
      "1:                                        \n"
      "vpmovzxbd   (%0),%%ymm1                   \n"  // zero-extend Y to dwords
      "vpmovzxbd   0x8(%0),%%ymm2                \n"
      "lea         0x10(%0),%0                   \n"
      "vpslld      $0x18,%%ymm1,%%ymm1           \n"  // move Y to alpha byte
      "vpslld      $0x18,%%ymm2,%%ymm2           \n"
      "vpblendvb   %%ymm0,(%1),%%ymm1,%%ymm1     \n"  // take dst BGR, Y as A
      "vpblendvb   %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu     %%ymm1,(%1)                   \n"
      "vmovdqu     %%ymm2,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
6742 
#ifdef HAS_SETROW_X86
// Fill a row with byte v8 using dword `rep stosl`.
// width is in bytes and is truncated to a multiple of 4 (width >> 2
// dword stores); any 1-3 byte remainder is left untouched.
void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width >> 2);            // dword count for rcx
  const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
  asm volatile(

      "rep         stosl                         \n"
      : "+D"(dst),       // %0
        "+c"(width_tmp)  // %1
      : "a"(v32)         // %2
      : "memory", "cc");
}

// Fill a row with byte v8 using `rep stosb`; handles any width exactly.
// Fast on CPUs with Enhanced Rep MovSB/StoSB.
void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width);  // byte count for rcx
  asm volatile(

      "rep         stosb                         \n"
      : "+D"(dst),       // %0
        "+c"(width_tmp)  // %1
      : "a"(v8)          // %2
      : "memory", "cc");
}

// Fill a row of ARGB pixels with the 32-bit value v32.
// width is in pixels (one dword store per pixel).
void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
  size_t width_tmp = (size_t)(width);  // pixel count for rcx
  asm volatile(

      "rep         stosl                         \n"
      : "+D"(dst_argb),  // %0
        "+c"(width_tmp)  // %1
      : "a"(v32)         // %2
      : "memory", "cc");
}
#endif  // HAS_SETROW_X86
6778 
6779 #ifdef HAS_YUY2TOYROW_SSE2
// Extract the Y (even) bytes of a YUY2 row into a planar Y row,
// 16 pixels per iteration.
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"  // 0x00ff mask: keep even bytes

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pand        %%xmm5,%%xmm0                 \n"  // keep Y bytes
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"  // words -> bytes
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6803 
// Extract interleaved UV from a YUY2 row, averaging two source rows
// (this row and the one at stride_yuy2) for vertical subsampling.
// Produces 16 UV bytes (8 pairs) per iteration; width is in pixels.
void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // row 0
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"  // row 1
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm2,%%xmm0                 \n"  // average the two rows
      "pavgb       %%xmm3,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm0                   \n"  // keep odd (UV) bytes
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"  // words -> bytes
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_yuy2),               // %0
                 "+r"(dst_uv),                 // %1
                 "+r"(width)                   // %2
               : "r"((intptr_t)(stride_yuy2))  // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
6830 
// Extract planar U and V from a YUY2 row, averaging two source rows
// (this row and the one at stride_yuy2) for vertical subsampling.
// Produces 8 U and 8 V bytes per iteration; width is in pixels.
void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"  // 0x00ff mask
      "sub         %1,%2                         \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // row 0
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"  // row 1
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm2,%%xmm0                 \n"  // average the two rows
      "pavgb       %%xmm3,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm0                   \n"  // keep odd (UV) bytes
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"  // interleaved UVUV...
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"  // even bytes = U
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"  // odd bytes = V
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"  // store U
      "movq        %%xmm1,0x00(%1,%2,1)          \n"  // store V
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_yuy2),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
6870 
// Extract planar U and V from a single YUY2 row (no vertical
// averaging; 4:2:2 output).  Produces 8 U and 8 V bytes per iteration;
// width is in pixels.
void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"  // 0x00ff mask
      "sub         %1,%2                         \n"  // dst_v as offset from dst_u

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "psrlw       $0x8,%%xmm0                   \n"  // keep odd (UV) bytes
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"  // interleaved UVUV...
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"  // even bytes = U
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"  // odd bytes = V
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"  // store U
      "movq        %%xmm1,0x00(%1,%2,1)          \n"  // store V
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
6905 
// Extract the Y plane from one row of UYVY (U Y0 V Y1 byte order).
// Y occupies the odd (high) byte of each 16-bit pair, so a right shift by 8
// discards the chroma bytes. 16 pixels per iteration; width in pixels.
void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      // Keep the high byte of each word (Y), drop U/V.
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"  // 16 Y bytes
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
6927 
// Extract U and V from UYVY with 2x vertical subsampling: the current row is
// averaged (pavgb, rounding) with the row at src_uyvy + stride_uyvy before
// the chroma is split into separate U and V planes. 16 pixels per iteration.
void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      // xmm5 = 0x00ff per word: keeps the even (low) byte of each pair.
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      // Rebase dst_v relative to dst_u so a single pointer advances.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"  // next row
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm2,%%xmm0                 \n"  // average the two rows
      "pavgb       %%xmm3,%%xmm1                 \n"
      // In UYVY the chroma occupies the even bytes; mask it out and pack.
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"  // xmm0 = UVUV...
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"  // even bytes = U
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"  // odd bytes = V
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"  // store 8 U
      "movq        %%xmm1,0x00(%1,%2,1)          \n"  // store 8 V
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_uyvy),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
6967 
// Extract U and V planes from one row of UYVY (U Y0 V Y1 byte order).
// 4:2:2 output: no vertical averaging. 16 pixels per iteration,
// emitting 8 U + 8 V bytes. width is in pixels.
void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      // xmm5 = 0x00ff per word: keeps the even (low) byte of each pair.
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      // Rebase dst_v relative to dst_u so a single pointer advances.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      // In UYVY the chroma is in the even bytes; mask out the Y bytes.
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"  // xmm0 = UVUV...
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"  // even bytes = U
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"  // odd bytes = V
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"  // store 8 U
      "movq        %%xmm1,0x00(%1,%2,1)          \n"  // store 8 V
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
7002 #endif  // HAS_YUY2TOYROW_SSE2
7003 
7004 #ifdef HAS_YUY2TOYROW_AVX2
// Extract the Y plane from one row of YUY2 (Y0 U Y1 V byte order), AVX2.
// Y occupies the even (low) byte of each 16-bit pair, so mask with 0x00ff.
// Processes 32 pixels (64 bytes) per iteration; width in pixels.
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      // ymm5 = 0x00ff per word: keeps the Y byte of each pair.
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      // vpackuswb works per 128-bit lane; vpermq restores byte order.
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
7030 
// Extract interleaved chroma (UVUV..., NV12-style plane) from YUY2, AVX2.
// The current row is averaged (vpavgb, rounding) with the row at
// src_yuy2 + stride_yuy2, then the Y bytes are shifted out so the
// interleaved U/V bytes remain. 32 pixels per iteration; width in pixels.
void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpavgb      0x00(%0,%3,1),%%ymm0,%%ymm0   \n"  // average with next row
      "vpavgb      0x20(%0,%3,1),%%ymm1,%%ymm1   \n"
      "lea         0x40(%0),%0                   \n"
      // Keep the high byte of each word (the U/V byte), drop Y.
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      // vpackuswb works per 128-bit lane; vpermq restores byte order.
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_yuy2),               // %0
        "+r"(dst_uv),                 // %1
        "+r"(width)                   // %2
      : "r"((intptr_t)(stride_yuy2))  // %3
      : "memory", "cc", "xmm0", "xmm1");
}
7059 
// Extract U and V planes from YUY2 with 2x vertical subsampling, AVX2.
// The current row is averaged (vpavgb) with the row at src_yuy2 + stride,
// then chroma is split into separate 16-byte U and V stores.
// Processes 32 pixels (64 bytes) per iteration; width in pixels.
void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      // ymm5 = 0x00ff per word: even-byte mask.
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      // Rebase dst_v relative to dst_u so a single pointer advances.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"  // average with next row
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "lea         0x40(%0),%0                   \n"
      // Drop Y bytes, leaving interleaved UV in the low byte of each word.
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // fix lane order after pack
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"  // even bytes = U
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"  // odd bytes = V
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"  // store 16 U
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"  // store 16 V
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_yuy2),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
7100 
// Extract U and V planes from one row of YUY2, AVX2. 4:2:2 output: no
// vertical averaging. 32 pixels per iteration, emitting 16 U + 16 V bytes.
void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      // ymm5 = 0x00ff per word: even-byte mask.
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      // Rebase dst_v relative to dst_u so a single pointer advances.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      // Drop Y bytes, leaving interleaved UV in the low byte of each word.
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // fix lane order after pack
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"  // even bytes = U
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"  // odd bytes = V
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"  // store 16 U
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"  // store 16 V
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
7138 
// Extract the Y plane from one row of UYVY, AVX2. Y is the odd (high) byte
// of each 16-bit pair, so a right shift by 8 discards chroma.
// 32 pixels per iteration; width in pixels.
void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      // vpackuswb works per 128-bit lane; vpermq restores byte order.
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
// Extract U and V planes from UYVY with 2x vertical subsampling, AVX2.
// The current row is averaged (vpavgb) with the row at src_uyvy + stride,
// then chroma (the even bytes in UYVY) is split into separate U/V stores.
// 32 pixels per iteration; width in pixels.
void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      // ymm5 = 0x00ff per word: even-byte mask.
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      // Rebase dst_v relative to dst_u so a single pointer advances.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"  // average with next row
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "lea         0x40(%0),%0                   \n"
      // Keep the chroma (even) bytes, drop Y.
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // fix lane order after pack
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"  // even bytes = U
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"  // odd bytes = V
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"  // store 16 U
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"  // store 16 V
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uyvy),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
7202 
// Extract U and V planes from one row of UYVY, AVX2. 4:2:2 output: no
// vertical averaging. 32 pixels per iteration, emitting 16 U + 16 V bytes.
void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      // ymm5 = 0x00ff per word: even-byte mask.
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      // Rebase dst_v relative to dst_u so a single pointer advances.
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      // Keep the chroma (even) bytes, drop Y.
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // fix lane order after pack
      "vpand       %%ymm5,%%ymm0,%%ymm1          \n"  // even bytes = U
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"  // odd bytes = V
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vextractf128 $0x0,%%ymm1,(%1)             \n"  // store 16 U
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"  // store 16 V
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
7240 #endif  // HAS_YUY2TOYROW_AVX2
7241 
7242 #ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
// pshufb control: replicates the alpha byte of each ARGB pixel (source
// offsets 3, 7, 11, 15) into the low byte of each 16-bit lane; 0x80 entries
// zero the corresponding destination byte, yielding alpha as four words
// duplicated per pixel.
static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
7246 
7247 // Blend 8 pixels at a time
// Alpha-blend src_argb over src_argb1 into dst_argb using the alpha channel
// of src_argb; output alpha is forced to 255 (por with xmm4 below).
// Main loop handles 4 pixels at a time; a 1-pixel loop handles the
// remainder, so width need not be a multiple of 4.
void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
                        const uint8_t* src_argb1,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(
      // Constant setup:
      //   xmm7 = 0x0001 per word (rounds 255-a up to 256-a after the xor)
      //   xmm6 = 0x00ff per word (low-byte mask)
      //   xmm5 = 0xff00 per word (high-byte mask)
      //   xmm4 = 0xff000000 per dword (alpha-byte mask)
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $0xf,%%xmm7                   \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "psrlw       $0x8,%%xmm6                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psllw       $0x8,%%xmm5                   \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "pslld       $0x18,%%xmm4                  \n"
      "sub         $0x4,%3                       \n"
      "jl          49f                           \n"

      // 4 pixel loop.
      LABELALIGN
      "40:                                       \n"
      "movdqu      (%0),%%xmm3                   \n"  // src pixels
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm0                 \n"
      "pxor        %%xmm4,%%xmm3                 \n"  // alpha -> 255-alpha
      "movdqu      (%1),%%xmm2                   \n"  // background pixels
      "pshufb      %4,%%xmm3                     \n"  // broadcast 255-a to words
      "pand        %%xmm6,%%xmm2                 \n"  // bg even channels
      "paddw       %%xmm7,%%xmm3                 \n"  // 255-a -> 256-a
      "pmullw      %%xmm3,%%xmm2                 \n"  // bg * (256-a)
      "movdqu      (%1),%%xmm1                   \n"
      "lea         0x10(%1),%1                   \n"
      "psrlw       $0x8,%%xmm1                   \n"  // bg odd channels
      "por         %%xmm4,%%xmm0                 \n"  // src alpha forced to 255
      "pmullw      %%xmm3,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm2                   \n"
      "paddusb     %%xmm2,%%xmm0                 \n"  // saturating add
      "pand        %%xmm5,%%xmm1                 \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%3                       \n"  // restore remainder count
      "jl          99f                           \n"

      // 1 pixel loop (same math as above on a single ARGB pixel).
      "91:                                       \n"
      "movd        (%0),%%xmm3                   \n"
      "lea         0x4(%0),%0                    \n"
      "movdqa      %%xmm3,%%xmm0                 \n"
      "pxor        %%xmm4,%%xmm3                 \n"
      "movd        (%1),%%xmm2                   \n"
      "pshufb      %4,%%xmm3                     \n"
      "pand        %%xmm6,%%xmm2                 \n"
      "paddw       %%xmm7,%%xmm3                 \n"
      "pmullw      %%xmm3,%%xmm2                 \n"
      "movd        (%1),%%xmm1                   \n"
      "lea         0x4(%1),%1                    \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pmullw      %%xmm3,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm2                   \n"
      "paddusb     %%xmm2,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movd        %%xmm0,(%2)                   \n"
      "lea         0x4(%2),%2                    \n"
      "sub         $0x1,%3                       \n"
      "jge         91b                           \n"
      "99:                                       \n"
      : "+r"(src_argb),     // %0
        "+r"(src_argb1),    // %1
        "+r"(dst_argb),     // %2
        "+r"(width)         // %3
      : "m"(kShuffleAlpha)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7327 #endif  // HAS_ARGBBLENDROW_SSSE3
7328 
7329 #ifdef HAS_BLENDPLANEROW_SSSE3
7330 // Blend 8 pixels at a time.
7331 // unsigned version of math
7332 // =((A2*C2)+(B2*(255-C2))+255)/256
7333 // signed version of math
7334 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// Blend two single-channel planes by a per-pixel alpha plane:
//   dst = (src0*alpha + src1*(255-alpha) + 255) / 256   (unsigned form)
// Implemented in the signed domain so pmaddubsw can be used (see the
// formula in the comment above this function). 8 pixels per iteration;
// width is expected to be a multiple of 8.
void BlendPlaneRow_SSSE3(const uint8_t* src0,
                         const uint8_t* src1,
                         const uint8_t* alpha,
                         uint8_t* dst,
                         int width) {
  asm volatile(
      // xmm5 = 0xff00 per word: xor converts a:a pairs into a:(255-a).
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psllw       $0x8,%%xmm5                   \n"
      // xmm6 = 0x80 per byte: bias samples into the signed range.
      "mov         $0x80808080,%%eax             \n"
      "movd        %%eax,%%xmm6                  \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      // xmm7 = 0x807f per word: undoes the bias and adds rounding (+127).
      "mov         $0x807f807f,%%eax             \n"
      "movd        %%eax,%%xmm7                  \n"
      "pshufd      $0x0,%%xmm7,%%xmm7            \n"
      // Rebase src0/src1/dst relative to alpha so one pointer advances.
      "sub         %2,%0                         \n"
      "sub         %2,%1                         \n"
      "sub         %2,%3                         \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq        (%2),%%xmm0                   \n"  // 8 alpha bytes
      "punpcklbw   %%xmm0,%%xmm0                 \n"  // a:a per word
      "pxor        %%xmm5,%%xmm0                 \n"  // a:(255-a)
      "movq        (%0,%2,1),%%xmm1              \n"  // 8 src0 bytes
      "movq        (%1,%2,1),%%xmm2              \n"  // 8 src1 bytes
      "punpcklbw   %%xmm2,%%xmm1                 \n"  // s0:s1 per word
      "psubb       %%xmm6,%%xmm1                 \n"  // bias to signed
      "pmaddubsw   %%xmm1,%%xmm0                 \n"  // a*s0 + (255-a)*s1
      "paddw       %%xmm7,%%xmm0                 \n"  // unbias + round
      "psrlw       $0x8,%%xmm0                   \n"  // /256
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%3,%2,1)              \n"
      "lea         0x8(%2),%2                    \n"
      "sub         $0x8,%4                       \n"
      "jg          1b                            \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
}
7379 #endif  // HAS_BLENDPLANEROW_SSSE3
7380 
7381 #ifdef HAS_BLENDPLANEROW_AVX2
7382 // Blend 32 pixels at a time.
7383 // unsigned version of math
7384 // =((A2*C2)+(B2*(255-C2))+255)/256
7385 // signed version of math
7386 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// AVX2 version of BlendPlaneRow: dst = (src0*a + src1*(255-a) + 255) / 256,
// computed in the signed domain with vpmaddubsw (see formula comment above).
// 32 pixels per iteration; width is expected to be a multiple of 32.
void BlendPlaneRow_AVX2(const uint8_t* src0,
                        const uint8_t* src1,
                        const uint8_t* alpha,
                        uint8_t* dst,
                        int width) {
  asm volatile(
      // ymm5 = 0xff00 per word: xor converts a:a pairs into a:(255-a).
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsllw      $0x8,%%ymm5,%%ymm5            \n"
      // ymm6 = 0x80 per byte: bias samples into the signed range.
      "mov         $0x80808080,%%eax             \n"
      "vmovd       %%eax,%%xmm6                  \n"
      "vbroadcastss %%xmm6,%%ymm6                \n"
      // ymm7 = 0x807f per word: undoes the bias and adds rounding (+127).
      "mov         $0x807f807f,%%eax             \n"
      "vmovd       %%eax,%%xmm7                  \n"
      "vbroadcastss %%xmm7,%%ymm7                \n"
      // Rebase src0/src1/dst relative to alpha so one pointer advances.
      "sub         %2,%0                         \n"
      "sub         %2,%1                         \n"
      "sub         %2,%3                         \n"

      // 32 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%2),%%ymm0                   \n"  // 32 alpha bytes
      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm3          \n"  // a:a per word (high)
      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"  // a:a per word (low)
      "vpxor       %%ymm5,%%ymm3,%%ymm3          \n"  // a:(255-a)
      "vpxor       %%ymm5,%%ymm0,%%ymm0          \n"
      "vmovdqu     (%0,%2,1),%%ymm1              \n"  // src0
      "vmovdqu     (%1,%2,1),%%ymm2              \n"  // src1
      "vpunpckhbw  %%ymm2,%%ymm1,%%ymm4          \n"  // s0:s1 per word
      "vpunpcklbw  %%ymm2,%%ymm1,%%ymm1          \n"
      "vpsubb      %%ymm6,%%ymm4,%%ymm4          \n"  // bias to signed
      "vpsubb      %%ymm6,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"  // a*s0 + (255-a)*s1
      "vpmaddubsw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm7,%%ymm3,%%ymm3          \n"  // unbias + round
      "vpaddw      %%ymm7,%%ymm0,%%ymm0          \n"
      "vpsrlw      $0x8,%%ymm3,%%ymm3            \n"  // /256
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%3,%2,1)              \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x20,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7440 #endif  // HAS_BLENDPLANEROW_AVX2
7441 
7442 #ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
// pshufb control applied to word-expanded pixels: broadcasts the alpha byte
// (at offsets 6 and 14 after punpcklbw/punpckhbw) into the three color
// words and zeroes the alpha word (-128 = 0x80 zeroes the destination byte).
static const vec8 kAttenuateShuffle = {6,    -128, 6,    -128, 6,  -128,
                                       -128, -128, 14,   -128, 14, -128,
                                       14,   -128, -128, -128};
7447 
7448 // Attenuate 4 pixels at a time.
// Premultiply ARGB color channels by alpha: c' = (c * a + 255) >> 8 for
// B/G/R, while the original alpha byte is preserved (pand/por with xmm5).
// 4 pixels per iteration; width is expected to be a multiple of 4.
void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // alpha-dup shuffle
      // xmm5 = 0xff000000 per dword: alpha-byte mask.
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0x18,%%xmm5                  \n"
      "pxor        %%xmm6,%%xmm6                 \n"
      // xmm7 = 0x00ff per word: the +255 rounding term.
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "punpcklbw   %%xmm6,%%xmm7                 \n"
      // Rebase dst relative to src so one pointer advances.
      "sub         %0,%1                         \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqa      %%xmm6,%%xmm0                 \n"
      "movdqa      %%xmm6,%%xmm1                 \n"
      // Expand bytes to words; alpha lane of xmm5 contributes 0xff high bytes.
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpckhbw   %%xmm5,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"
      "pshufb      %%xmm4,%%xmm2                 \n"  // a,a,a,0
      "pshufb      %%xmm4,%%xmm3                 \n"
      "pmullw      %%xmm2,%%xmm0                 \n"  // rgb * alpha
      "pmullw      %%xmm3,%%xmm1                 \n"
      "paddw       %%xmm7,%%xmm0                 \n"  // + 255
      "paddw       %%xmm7,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm6                 \n"  // keep original alpha
      "por         %%xmm6,%%xmm0                 \n"
      "movdqu      %%xmm0,(%0,%1)                \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),         // %0
        "+r"(dst_argb),         // %1
        "+r"(width)             // %2
      : "m"(kAttenuateShuffle)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7494 
7495 #ifdef HAS_ARGBATTENUATEROW_AVX2
7496 
// Shuffle table duplicating alpha.
// 256-bit vpshufb control (per 128-bit lane): broadcasts each pixel's alpha
// byte (offsets 6/14 per lane after unpack-to-words) into the three color
// words and zeroes the alpha word (-128 entries zero the destination byte).
static const lvec8 kAttenuateShuffle_AVX2 = {
    6,    -128, 6,    -128, 6,    -128, -128, -128, 14,   -128, 14,
    -128, 14,   -128, -128, -128, 22,   -128, 22,   -128, 22,   -128,
    -128, -128, 30,   -128, 30,   -128, 30,   -128, -128, -128};
7502 
7503 // Attenuate 8 pixels at a time.
// AVX2 premultiply: c' = (c * a + 255) >> 8 for B/G/R; original alpha byte
// preserved. Note the header comment says 8 pixels, matching the
// `sub $0x8,%2` loop decrement (one 32-byte load = 8 ARGB pixels).
void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      "vmovdqa     %3,%%ymm4                     \n"  // alpha-dup shuffle
      // ymm5 = 0xff000000 per dword: alpha-byte mask.
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpslld      $0x18,%%ymm5,%%ymm5           \n"
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"
      // ymm7 = 0x00ff per word: the +255 rounding term.
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"
      "vpunpcklbw  %%ymm6,%%ymm7,%%ymm7          \n"
      // Rebase dst relative to src so one pointer advances.
      "sub         %0,%1                         \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm6                   \n"
      // Expand bytes to words; alpha lanes pick up 0xff high bytes from ymm5.
      "vpunpcklbw  %%ymm5,%%ymm6,%%ymm0          \n"
      "vpunpckhbw  %%ymm5,%%ymm6,%%ymm1          \n"
      "vpshufb     %%ymm4,%%ymm0,%%ymm2          \n"  // a,a,a,0 per pixel
      "vpshufb     %%ymm4,%%ymm1,%%ymm3          \n"
      "vpmullw     %%ymm2,%%ymm0,%%ymm0          \n"  // rgb * alpha
      "vpmullw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vpaddw      %%ymm7,%%ymm0,%%ymm0          \n"  // + 255
      "vpaddw      %%ymm7,%%ymm1,%%ymm1          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm6,%%ymm1          \n"  // keep original alpha
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),              // %0
        "+r"(dst_argb),              // %1
        "+r"(width)                  // %2
      : "m"(kAttenuateShuffle_AVX2)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
7545 #endif  // HAS_ARGBATTENUATEROW_AVX2
7546 
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// Undoes alpha premultiplication: each B/G/R channel is scaled by a
// fixed-point reciprocal of its alpha, looked up per pixel from
// fixed_invtbl8 (a 256-entry uint32 table defined elsewhere in this file).
// Assumes width is a positive multiple of 4.
void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width) {
  uintptr_t alpha;  // scratch register for per-pixel alpha table index
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movzb       0x03(%0),%3                   \n"  // alpha of pixel 0
      "punpcklbw   %%xmm0,%%xmm0                 \n"  // widen pixels 0-1 (b<<8|b)
      "movd        0x00(%4,%3,4),%%xmm2          \n"  // reciprocal for pixel 0
      "movzb       0x07(%0),%3                   \n"  // alpha of pixel 1
      "movd        0x00(%4,%3,4),%%xmm3          \n"
      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"  // dup reciprocal to B/G/R words
      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
      "movlhps     %%xmm3,%%xmm2                 \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"  // rescale pixels 0-1
      "movdqu      (%0),%%xmm1                   \n"
      "movzb       0x0b(%0),%3                   \n"  // alpha of pixel 2
      "punpckhbw   %%xmm1,%%xmm1                 \n"  // widen pixels 2-3
      "movd        0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x0f(%0),%3                   \n"  // alpha of pixel 3
      "movd        0x00(%4,%3,4),%%xmm3          \n"
      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
      "movlhps     %%xmm3,%%xmm2                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"  // rescale pixels 2-3
      "lea         0x10(%0),%0                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),     // %0
        "+r"(dst_argb),     // %1
        "+r"(width),        // %2
        "=&r"(alpha)        // %3
      : "r"(fixed_invtbl8)  // %4
      // NOTE(review): xmm4/xmm5 are listed but not used by this asm; harmless.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2
7591 
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
// Applied after vpunpcklwd/vpunpckhwd; spreads each pixel's reciprocal
// word across its B/G/R word lanes while keeping the alpha lane's own value.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// Unattenuate 8 pixels at a time.
// AVX2 version of ARGBUnattenuateRow_SSE2: scales B/G/R by a fixed-point
// reciprocal of alpha from fixed_invtbl8. The 8 table loads are done with
// scalar movzb/vmovd pairs instead of VPGATHER (see comment below).
// Assumes width is a positive multiple of 8.
void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width) {
  uintptr_t alpha;  // scratch register for per-pixel alpha table index
  asm volatile(
      "sub         %0,%1                         \n"  // dst addressed as src + (dst-src)
      "vbroadcastf128 %5,%%ymm5                  \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      // replace VPGATHER
      "movzb       0x03(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm0          \n"
      "movzb       0x07(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm1          \n"
      "movzb       0x0b(%0),%3                   \n"
      "vpunpckldq  %%xmm1,%%xmm0,%%xmm6          \n"
      "vmovd       0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x0f(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm3          \n"
      "movzb       0x13(%0),%3                   \n"
      "vpunpckldq  %%xmm3,%%xmm2,%%xmm7          \n"
      "vmovd       0x00(%4,%3,4),%%xmm0          \n"
      "movzb       0x17(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm1          \n"
      "movzb       0x1b(%0),%3                   \n"
      "vpunpckldq  %%xmm1,%%xmm0,%%xmm0          \n"
      "vmovd       0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x1f(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm3          \n"
      "vpunpckldq  %%xmm3,%%xmm2,%%xmm2          \n"
      "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
      "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
      "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"  // ymm3 = 8 reciprocals
      // end of VPGATHER

      "vmovdqu     (%0),%%ymm6                   \n"
      "vpunpcklbw  %%ymm6,%%ymm6,%%ymm0          \n"  // widen pixels to words
      "vpunpckhbw  %%ymm6,%%ymm6,%%ymm1          \n"
      "vpunpcklwd  %%ymm3,%%ymm3,%%ymm2          \n"  // pair reciprocals with pixels
      "vpunpckhwd  %%ymm3,%%ymm3,%%ymm3          \n"
      "vpshufb     %%ymm5,%%ymm2,%%ymm2          \n"  // dup reciprocal across B/G/R
      "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"  // rescale
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),                 // %0
        "+r"(dst_argb),                 // %1
        "+r"(width),                    // %2
        "=&r"(alpha)                    // %3
      : "r"(fixed_invtbl8),             // %4
        "m"(kUnattenShuffleAlpha_AVX2)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBUNATTENUATEROW_AVX2
7659 
#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// NOTE(review): the loop actually reads/writes 32 bytes (8 pixels) per
// iteration. Luma uses the kARGBToYJ full-range coefficients; the original
// alpha channel is preserved. The kSub128 bias (presumably 128 in every
// byte/word lane — defined earlier in this file) keeps pmaddubsw inputs in
// signed range, and the later paddw restores the offset with rounding.
// Assumes width is a positive multiple of 8.
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"  // luma coefficients
      "movdqa      %4,%%xmm5                     \n"  // bias constant

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "psubb       %%xmm5,%%xmm0                 \n"  // bias pixels to signed range
      "psubb       %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"  // weighted B+G, R+A sums
      "movdqu      %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm1,%%xmm0                 \n"
      "phaddw      %%xmm0,%%xmm6                 \n"  // per-pixel luma words
      "paddw       %%xmm5,%%xmm6                 \n"  // undo bias / round
      "psrlw       $0x8,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"  // 8 gray bytes
      "movdqu      (%0),%%xmm2                   \n"
      "movdqu      0x10(%0),%%xmm3               \n"
      "lea         0x20(%0),%0                   \n"
      "psrld       $0x18,%%xmm2                  \n"  // extract alpha bytes
      "psrld       $0x18,%%xmm3                  \n"
      "packuswb    %%xmm3,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm2                 \n"
      "movdqa      %%xmm6,%%xmm3                 \n"
      "punpcklbw   %%xmm6,%%xmm6                 \n"  // G G pairs
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // G A pairs
      "movdqa      %%xmm6,%%xmm1                 \n"
      "punpcklwd   %%xmm3,%%xmm6                 \n"  // GGGA interleave, pixels 0-3
      "punpckhwd   %%xmm3,%%xmm1                 \n"  // pixels 4-7
      "movdqu      %%xmm6,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBGRAYROW_SSSE3
7708 
#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone
static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                                   17, 68, 35, 0, 17, 68, 35, 0};

static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                                   22, 88, 45, 0, 22, 88, 45, 0};

static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                                   24, 98, 50, 0, 24, 98, 50, 0};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// In-place transform: dst_argb is both source and destination. Each output
// channel is a weighted sum of the input B/G/R (alpha weight is 0); the
// original alpha is preserved. Assumes width is a positive multiple of 8.
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  asm volatile(
      "movdqa      %2,%%xmm2                     \n"  // sepia B weights
      "movdqa      %3,%%xmm3                     \n"  // sepia G weights
      "movdqa      %4,%%xmm4                     \n"  // sepia R weights

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "pmaddubsw   %%xmm2,%%xmm0                 \n"  // new B
      "pmaddubsw   %%xmm2,%%xmm6                 \n"
      "phaddw      %%xmm6,%%xmm0                 \n"
      "psrlw       $0x7,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movdqu      (%0),%%xmm5                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "pmaddubsw   %%xmm3,%%xmm5                 \n"  // new G
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "phaddw      %%xmm1,%%xmm5                 \n"
      "psrlw       $0x7,%%xmm5                   \n"
      "packuswb    %%xmm5,%%xmm5                 \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"  // B G pairs
      "movdqu      (%0),%%xmm5                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "pmaddubsw   %%xmm4,%%xmm5                 \n"  // new R
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "phaddw      %%xmm1,%%xmm5                 \n"
      "psrlw       $0x7,%%xmm5                   \n"
      "packuswb    %%xmm5,%%xmm5                 \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "psrld       $0x18,%%xmm6                  \n"  // original alpha
      "psrld       $0x18,%%xmm1                  \n"
      "packuswb    %%xmm1,%%xmm6                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm6,%%xmm5                 \n"  // R A pairs
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklwd   %%xmm5,%%xmm0                 \n"  // BGRA interleave, pixels 0-3
      "punpckhwd   %%xmm5,%%xmm1                 \n"  // pixels 4-7
      "movdqu      %%xmm0,(%0)                   \n"
      "movdqu      %%xmm1,0x10(%0)               \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%1                       \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),      // %0
        "+r"(width)          // %1
      : "m"(kARGBToSepiaB),  // %2
        "m"(kARGBToSepiaG),  // %3
        "m"(kARGBToSepiaR)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBSEPIAROW_SSSE3
7778 
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Tranform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// matrix_argb points to 16 signed bytes: four rows of 4 coefficients, one
// row per output channel (B, G, R, A), applied as
//   out = (b*m0 + g*m1 + r*m2 + a*m3) >> 6, saturated to [0,255].
// Results are signed-saturated via phaddsw before the shift.
// Assumes width is a positive multiple of 8.
void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              const int8_t* matrix_argb,
                              int width) {
  asm volatile(
      "movdqu      (%3),%%xmm5                   \n"
      "pshufd      $0x00,%%xmm5,%%xmm2           \n"  // row 0 -> B coefficients
      "pshufd      $0x55,%%xmm5,%%xmm3           \n"  // row 1 -> G coefficients
      "pshufd      $0xaa,%%xmm5,%%xmm4           \n"  // row 2 -> R coefficients
      "pshufd      $0xff,%%xmm5,%%xmm5           \n"  // row 3 -> A coefficients

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm7               \n"
      "pmaddubsw   %%xmm2,%%xmm0                 \n"  // new B
      "pmaddubsw   %%xmm2,%%xmm7                 \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"  // new G
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "phaddsw     %%xmm7,%%xmm0                 \n"
      "phaddsw     %%xmm1,%%xmm6                 \n"
      "psraw       $0x6,%%xmm0                   \n"
      "psraw       $0x6,%%xmm6                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm6,%%xmm0                 \n"  // B G pairs
      "movdqu      (%0),%%xmm1                   \n"
      "movdqu      0x10(%0),%%xmm7               \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"  // new R
      "pmaddubsw   %%xmm4,%%xmm7                 \n"
      "phaddsw     %%xmm7,%%xmm1                 \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x10(%0),%%xmm7               \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"  // new A
      "pmaddubsw   %%xmm5,%%xmm7                 \n"
      "phaddsw     %%xmm7,%%xmm6                 \n"
      "psraw       $0x6,%%xmm1                   \n"
      "psraw       $0x6,%%xmm6                   \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm6,%%xmm1                 \n"  // R A pairs
      "movdqa      %%xmm0,%%xmm6                 \n"
      "punpcklwd   %%xmm1,%%xmm0                 \n"  // BGRA interleave, pixels 0-3
      "punpckhwd   %%xmm1,%%xmm6                 \n"  // pixels 4-7
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm6,0x10(%1)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      : "r"(matrix_argb)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
7843 
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// In-place posterize: each B/G/R channel becomes
//   ((channel * scale) >> 16) * interval_size + interval_offset,
// saturated to a byte; alpha is preserved. scale is a 16.16-style
// fixed-point factor applied via pmulhuw. Assumes width is a positive
// multiple of 4.
void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile(
      "movd        %2,%%xmm2                     \n"
      "movd        %3,%%xmm3                     \n"
      "movd        %4,%%xmm4                     \n"
      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"  // broadcast scale to words
      "pshufd      $0x44,%%xmm2,%%xmm2           \n"
      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"  // broadcast interval_size
      "pshufd      $0x44,%%xmm3,%%xmm3           \n"
      "pshuflw     $0x40,%%xmm4,%%xmm4           \n"  // broadcast interval_offset
      "pshufd      $0x44,%%xmm4,%%xmm4           \n"
      "pxor        %%xmm5,%%xmm5                 \n"  // zero for unpack
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "pslld       $0x18,%%xmm6                  \n"  // alpha mask 0xff000000

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"  // pixels 0-1 as words
      "pmulhuw     %%xmm2,%%xmm0                 \n"  // (c * scale) >> 16
      "movdqu      (%0),%%xmm1                   \n"
      "punpckhbw   %%xmm5,%%xmm1                 \n"  // pixels 2-3 as words
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "pmullw      %%xmm3,%%xmm0                 \n"  // * interval_size
      "movdqu      (%0),%%xmm7                   \n"
      "pmullw      %%xmm3,%%xmm1                 \n"
      "pand        %%xmm6,%%xmm7                 \n"  // keep original alpha
      "paddw       %%xmm4,%%xmm0                 \n"  // + interval_offset
      "paddw       %%xmm4,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "por         %%xmm7,%%xmm0                 \n"
      "movdqu      %%xmm0,(%0)                   \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x4,%1                       \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),       // %0
        "+r"(width)           // %1
      : "r"(scale),           // %2
        "r"(interval_size),   // %3
        "r"(interval_offset)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
7895 
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Multiplies every channel (including alpha) by the corresponding byte of
// the packed ARGB 'value': dst = (src * value_channel) >> 8 per channel.
// Assumes width is a positive multiple of 4.
void ARGBShadeRow_SSE2(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  asm volatile(
      "movd        %3,%%xmm2                     \n"
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // dup bytes to words
      "punpcklqdq  %%xmm2,%%xmm2                 \n"  // broadcast to all 4 pixels

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"  // widen pixels 0-1
      "punpckhbw   %%xmm1,%%xmm1                 \n"  // widen pixels 2-3
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(value)       // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_ARGBSHADEROW_SSE2
7931 
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Per-channel multiply-blend: dst = (src0 * src1) >> 8 approximately —
// src0 bytes are duplicated into words (b<<8|b) while src1 bytes are
// zero-extended, so pmulhuw yields roughly (a * b) / 255.
// Assumes width is a positive multiple of 4.
void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(

      "pxor        %%xmm5,%%xmm5                 \n"  // zero for unpack

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      (%1),%%xmm2                   \n"
      "lea         0x10(%1),%1                   \n"
      "movdqu      %%xmm0,%%xmm1                 \n"
      "movdqu      %%xmm2,%%xmm3                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"  // src0: byte dup to words
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"  // src1: zero-extend to words
      "punpckhbw   %%xmm5,%%xmm3                 \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "pmulhuw     %%xmm3,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2
7970 
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// AVX2 version of ARGBMultiplyRow_SSE2: per-channel multiply-blend
// dst ~= (src0 * src1) / 255. Assumes width is a positive multiple of 8.
void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(

      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"  // zero for unpack

      // 8 pixel loop.  (NOTE(review): original comment said 4; sub $0x8 below.)
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm1                   \n"
      "lea         0x20(%0),%0                   \n"
      "vmovdqu     (%1),%%ymm3                   \n"
      "lea         0x20(%1),%1                   \n"
      "vpunpcklbw  %%ymm1,%%ymm1,%%ymm0          \n"  // src0: byte dup to words
      "vpunpckhbw  %%ymm1,%%ymm1,%%ymm1          \n"
      "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"  // src1: zero-extend to words
      "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_ARGBMULTIPLYROW_AVX2
8008 
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Per-byte saturating add (paddusb) of the two source rows into dst.
// Assumes width is a positive multiple of 4.
void ARGBAddRow_SSE2(const uint8_t* src_argb,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "lea         0x10(%1),%1                   \n"
      "paddusb     %%xmm1,%%xmm0                 \n"  // saturating byte add
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBADDROW_SSE2
8036 
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// (NOTE(review): original comment said 4 pixels; the loop consumes 32 bytes
// and decrements width by 8.) Per-byte saturating add of the two rows.
// Assumes width is a positive multiple of 8.
void ARGBAddRow_AVX2(const uint8_t* src_argb,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "lea         0x20(%0),%0                   \n"
      "vpaddusb    (%1),%%ymm0,%%ymm0            \n"  // saturating byte add
      "lea         0x20(%1),%1                   \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0");
}
#endif  // HAS_ARGBADDROW_AVX2
8064 
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// Per-byte saturating subtract (psubusb): dst = max(src0 - src1, 0).
// Assumes width is a positive multiple of 4.
void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "lea         0x10(%1),%1                   \n"
      "psubusb     %%xmm1,%%xmm0                 \n"  // saturating byte subtract
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2
8092 
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// Per-byte saturating subtract: dst = max(src0 - src1, 0).
// Assumes width is a positive multiple of 8.
void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.  (NOTE(review): original comment said 4; sub $0x8 below.)
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "lea         0x20(%0),%0                   \n"
      "vpsubusb    (%1),%%ymm0,%%ymm0            \n"  // saturating byte subtract
      "lea         0x20(%1),%1                   \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0");
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2
8120 
#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// Computes the horizontal Sobel response for 8 pixels per iteration:
//   |(r0[x]-r0[x+2]) + 2*(r1[x]-r1[x+2]) + (r2[x]-r2[x+2])|
// saturated to a byte. src_y0/src_y1/src_y2 are three consecutive source
// rows; reads extend 2 bytes past width. Assumes width is a positive
// multiple of 8.
void SobelXRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width) {
  asm volatile(
      "sub         %0,%1                         \n"  // rows addressed relative
      "sub         %0,%2                         \n"  //   to src_y0 so one
      "sub         %0,%3                         \n"  //   counter advances all
      "pxor        %%xmm5,%%xmm5                 \n"  // zero for unpack

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "movq        0x2(%0),%%xmm1                \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "psubw       %%xmm1,%%xmm0                 \n"  // row0 diff
      "movq        0x00(%0,%1,1),%%xmm1          \n"
      "movq        0x02(%0,%1,1),%%xmm2          \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "psubw       %%xmm2,%%xmm1                 \n"  // row1 diff
      "movq        0x00(%0,%2,1),%%xmm2          \n"
      "movq        0x02(%0,%2,1),%%xmm3          \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "punpcklbw   %%xmm5,%%xmm3                 \n"
      "psubw       %%xmm3,%%xmm2                 \n"  // row2 diff
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm1,%%xmm0                 \n"  // row1 added twice
      "paddw       %%xmm1,%%xmm0                 \n"  //   (weight 2)
      "pxor        %%xmm1,%%xmm1                 \n"
      "psubw       %%xmm0,%%xmm1                 \n"
      "pmaxsw      %%xmm1,%%xmm0                 \n"  // absolute value
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,0x00(%0,%3,1)          \n"
      "lea         0x8(%0),%0                    \n"
      "sub         $0x8,%4                       \n"
      "jg          1b                            \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(src_y2),      // %2
        "+r"(dst_sobelx),  // %3
        "+r"(width)        // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SOBELXROW_SSE2
8175 
8176 #ifdef HAS_SOBELYROW_SSE2
8177 // SobelY as a matrix is
8178 // -1 -2 -1
8179 //  0  0  0
8180 //  1  2  1
// Computes the vertical Sobel gradient for 8 pixels per iteration from two
// source rows (the middle matrix row is all zeros, so it needs no input).
// Each output byte is the saturated absolute value of:
//   (r0[x] - r1[x]) + 2*(r0[x+1] - r1[x+1]) + (r0[x+2] - r1[x+2])
void SobelYRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
                    int width) {
  asm volatile(
      // Convert src_y1 and dst_sobely into offsets relative to src_y0.
      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "pxor        %%xmm5,%%xmm5                 \n"  // xmm5 = 0, for unpacking

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "movq        0x00(%0,%1,1),%%xmm1          \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"  // widen bytes to words
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "psubw       %%xmm1,%%xmm0                 \n"  // r0[x] - r1[x]
      "movq        0x1(%0),%%xmm1                \n"
      "movq        0x01(%0,%1,1),%%xmm2          \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "psubw       %%xmm2,%%xmm1                 \n"  // r0[x+1] - r1[x+1]
      "movq        0x2(%0),%%xmm2                \n"
      "movq        0x02(%0,%1,1),%%xmm3          \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "punpcklbw   %%xmm5,%%xmm3                 \n"
      "psubw       %%xmm3,%%xmm2                 \n"  // r0[x+2] - r1[x+2]
      "paddw       %%xmm2,%%xmm0                 \n"  // col0 + col2
      "paddw       %%xmm1,%%xmm0                 \n"  // + col1, added twice
      "paddw       %%xmm1,%%xmm0                 \n"  //   for the 2x weight
      "pxor        %%xmm1,%%xmm1                 \n"  // abs(v) = max(v, -v)
      "psubw       %%xmm0,%%xmm1                 \n"
      "pmaxsw      %%xmm1,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"  // saturate words to bytes
      "movq        %%xmm0,0x00(%0,%2,1)          \n"
      "lea         0x8(%0),%0                    \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(dst_sobely),  // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
8226 #endif  // HAS_SOBELYROW_SSE2
8227 
8228 #ifdef HAS_SOBELROW_SSE2
8229 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
8230 // A = 255
8231 // R = Sobel
8232 // G = Sobel
8233 // B = Sobel
// Adds SobelX and SobelY (saturated) and replicates the result into all of
// B, G and R of each ARGB pixel, with alpha forced to 255.
// Processes 16 pixels per iteration.
void SobelRow_SSE2(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width) {
  asm volatile(
      // Convert src_sobely into an offset relative to src_sobelx.
      "sub         %0,%1                         \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // xmm5 = 0xff000000 per
      "pslld       $0x18,%%xmm5                  \n"  //   pixel (alpha mask)

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "paddusb     %%xmm1,%%xmm0                 \n"  // sobel = x + y, saturated
      // Unpack each byte s to s,s then to s,s,s,s; OR in the alpha mask to
      // form 0xff:s:s:s ARGB pixels.
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpcklbw   %%xmm0,%%xmm2                 \n"
      "punpckhbw   %%xmm0,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "punpcklwd   %%xmm2,%%xmm1                 \n"
      "punpckhwd   %%xmm2,%%xmm2                 \n"
      "por         %%xmm5,%%xmm1                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm3                 \n"
      "punpcklwd   %%xmm0,%%xmm3                 \n"
      "punpckhwd   %%xmm0,%%xmm0                 \n"
      "por         %%xmm5,%%xmm3                 \n"
      "por         %%xmm5,%%xmm0                 \n"
      "movdqu      %%xmm1,(%2)                   \n"
      "movdqu      %%xmm2,0x10(%2)               \n"
      "movdqu      %%xmm3,0x20(%2)               \n"
      "movdqu      %%xmm0,0x30(%2)               \n"
      "lea         0x40(%2),%2                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
8277 #endif  // HAS_SOBELROW_SSE2
8278 
8279 #ifdef HAS_SOBELTOPLANEROW_SSE2
8280 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
SobelToPlaneRow_SSE2(const uint8_t * src_sobelx,const uint8_t * src_sobely,uint8_t * dst_y,int width)8281 void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
8282                           const uint8_t* src_sobely,
8283                           uint8_t* dst_y,
8284                           int width) {
8285   asm volatile(
8286       "sub         %0,%1                         \n"
8287       "pcmpeqb     %%xmm5,%%xmm5                 \n"
8288       "pslld       $0x18,%%xmm5                  \n"
8289 
8290       // 8 pixel loop.
8291       LABELALIGN
8292       "1:                                        \n"
8293       "movdqu      (%0),%%xmm0                   \n"
8294       "movdqu      0x00(%0,%1,1),%%xmm1          \n"
8295       "lea         0x10(%0),%0                   \n"
8296       "paddusb     %%xmm1,%%xmm0                 \n"
8297       "movdqu      %%xmm0,(%2)                   \n"
8298       "lea         0x10(%2),%2                   \n"
8299       "sub         $0x10,%3                      \n"
8300       "jg          1b                            \n"
8301       : "+r"(src_sobelx),  // %0
8302         "+r"(src_sobely),  // %1
8303         "+r"(dst_y),       // %2
8304         "+r"(width)        // %3
8305       :
8306       : "memory", "cc", "xmm0", "xmm1");
8307 }
8308 #endif  // HAS_SOBELTOPLANEROW_SSE2
8309 
8310 #ifdef HAS_SOBELXYROW_SSE2
8311 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
8312 // A = 255
8313 // R = Sobel X
8314 // G = Sobel
8315 // B = Sobel Y
// Mixes SobelX, SobelY and their saturated sum into ARGB:
//   A = 255, R = SobelX, G = SobelX + SobelY (saturated), B = SobelY.
// Processes 16 pixels per iteration.
void SobelXYRow_SSE2(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // Convert src_sobely into an offset relative to src_sobelx.
      "sub         %0,%1                         \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // xmm5 = all 0xff (alpha)

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // xmm0 = sobelx
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"  // xmm1 = sobely
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "paddusb     %%xmm1,%%xmm2                 \n"  // xmm2 = x + y, saturated
      // Interleave pairs (sobelx, 0xff) and (sobely, sum), then interleave
      // words to produce B=sobely, G=sum, R=sobelx, A=0xff pixels.
      "movdqa      %%xmm0,%%xmm3                 \n"
      "punpcklbw   %%xmm5,%%xmm3                 \n"
      "punpckhbw   %%xmm5,%%xmm0                 \n"
      "movdqa      %%xmm1,%%xmm4                 \n"
      "punpcklbw   %%xmm2,%%xmm4                 \n"
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "movdqa      %%xmm4,%%xmm6                 \n"
      "punpcklwd   %%xmm3,%%xmm6                 \n"
      "punpckhwd   %%xmm3,%%xmm4                 \n"
      "movdqa      %%xmm1,%%xmm7                 \n"
      "punpcklwd   %%xmm0,%%xmm7                 \n"
      "punpckhwd   %%xmm0,%%xmm1                 \n"
      "movdqu      %%xmm6,(%2)                   \n"
      "movdqu      %%xmm4,0x10(%2)               \n"
      "movdqu      %%xmm7,0x20(%2)               \n"
      "movdqu      %%xmm1,0x30(%2)               \n"
      "lea         0x40(%2),%2                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
8359 #endif  // HAS_SOBELXYROW_SSE2
8360 
8361 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
8362 // Creates a table of cumulative sums where each value is a sum of all values
8363 // above and to the left of the value, inclusive of the value.
// Computes one row of an integral image: each 32-bit output is the running
// sum of this row's bytes up to and including x, plus the value directly
// above it from previous_cumsum.  Each 4-byte pixel's channels are widened
// to four dwords and accumulated independently in xmm0.
void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
                                  int32_t* cumsum,
                                  const int32_t* previous_cumsum,
                                  int width) {
  asm volatile(
      "pxor        %%xmm0,%%xmm0                 \n"  // running sum = 0
      "pxor        %%xmm1,%%xmm1                 \n"  // zero for unpacking
      "sub         $0x4,%3                       \n"
      "jl          49f                           \n"
      // Take the unrolled 4-pixel path only when cumsum is 16-byte aligned.
      "test        $0xf,%1                       \n"
      "jne         49f                           \n"

      // 4 pixel loop.
      LABELALIGN
      "40:                                       \n"
      "movdqu      (%0),%%xmm2                   \n"  // 4 pixels of 4 bytes
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm2,%%xmm4                 \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"  // widen bytes to dwords:
      "movdqa      %%xmm2,%%xmm3                 \n"  //   xmm2..xmm5 hold the
      "punpcklwd   %%xmm1,%%xmm2                 \n"  //   4 pixels as 4 dwords
      "punpckhwd   %%xmm1,%%xmm3                 \n"  //   each
      "punpckhbw   %%xmm1,%%xmm4                 \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "punpcklwd   %%xmm1,%%xmm4                 \n"
      "punpckhwd   %%xmm1,%%xmm5                 \n"
      // Accumulate each pixel into the running sum, then add the row above.
      "paddd       %%xmm2,%%xmm0                 \n"
      "movdqu      (%2),%%xmm2                   \n"
      "paddd       %%xmm0,%%xmm2                 \n"
      "paddd       %%xmm3,%%xmm0                 \n"
      "movdqu      0x10(%2),%%xmm3               \n"
      "paddd       %%xmm0,%%xmm3                 \n"
      "paddd       %%xmm4,%%xmm0                 \n"
      "movdqu      0x20(%2),%%xmm4               \n"
      "paddd       %%xmm0,%%xmm4                 \n"
      "paddd       %%xmm5,%%xmm0                 \n"
      "movdqu      0x30(%2),%%xmm5               \n"
      "lea         0x40(%2),%2                   \n"
      "paddd       %%xmm0,%%xmm5                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "movdqu      %%xmm3,0x10(%1)               \n"
      "movdqu      %%xmm4,0x20(%1)               \n"
      "movdqu      %%xmm5,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x4,%3                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%3                       \n"  // restore remainder count
      "jl          19f                           \n"

      // 1 pixel loop.
      LABELALIGN
      "10:                                       \n"
      "movd        (%0),%%xmm2                   \n"  // one 4-byte pixel
      "lea         0x4(%0),%0                    \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"
      "punpcklwd   %%xmm1,%%xmm2                 \n"
      "paddd       %%xmm2,%%xmm0                 \n"  // running sum += pixel
      "movdqu      (%2),%%xmm2                   \n"
      "lea         0x10(%2),%2                   \n"
      "paddd       %%xmm0,%%xmm2                 \n"  // + previous row
      "movdqu      %%xmm2,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x1,%3                       \n"
      "jge         10b                           \n"

      "19:                                       \n"
      : "+r"(row),              // %0
        "+r"(cumsum),           // %1
        "+r"(previous_cumsum),  // %2
        "+r"(width)             // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
8439 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
8440 
8441 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Averages boxes of pixels using four corners of a cumulative-sum
// (integral) image:
//   avg = (topleft - topleft[width] - botleft + botleft[width]) / area
// For small areas (<= 128) a 16-bit fixed-point reciprocal with pmulhuw is
// used; larger areas go through a float multiply by rcpss(area).
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
                                    const int32_t* botleft,
                                    int width,
                                    int area,
                                    uint8_t* dst,
                                    int count) {
  asm volatile(
      "movd        %5,%%xmm5                     \n"
      "cvtdq2ps    %%xmm5,%%xmm5                 \n"
      "rcpss       %%xmm5,%%xmm4                 \n"  // xmm4 ~= 1.0f / area
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "sub         $0x4,%3                       \n"
      "jl          49f                           \n"
      "cmpl        $0x80,%5                      \n"  // large area: float path
      "ja          40f                           \n"

      // Build a rounded 16-bit fixed-point scale for pmulhuw:
      // xmm5 = (area + 65535) * (1/area), packed to words.
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "psrld       $0x10,%%xmm6                  \n"  // xmm6 = 65535 per lane
      "cvtdq2ps    %%xmm6,%%xmm6                 \n"
      "addps       %%xmm6,%%xmm5                 \n"
      "mulps       %%xmm4,%%xmm5                 \n"
      "cvtps2dq    %%xmm5,%%xmm5                 \n"
      "packssdw    %%xmm5,%%xmm5                 \n"

      // 4 pixel small loop.
      LABELALIGN
      "4:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"  // - topleft[width]
      "psubd       0x10(%0,%4,4),%%xmm1          \n"
      "psubd       0x20(%0,%4,4),%%xmm2          \n"
      "psubd       0x30(%0,%4,4),%%xmm3          \n"
      "lea         0x40(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"  // - botleft
      "psubd       0x10(%1),%%xmm1               \n"
      "psubd       0x20(%1),%%xmm2               \n"
      "psubd       0x30(%1),%%xmm3               \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"  // + botleft[width]
      "paddd       0x10(%1,%4,4),%%xmm1          \n"
      "paddd       0x20(%1,%4,4),%%xmm2          \n"
      "paddd       0x30(%1,%4,4),%%xmm3          \n"
      "lea         0x40(%1),%1                   \n"
      "packssdw    %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "pmulhuw     %%xmm5,%%xmm0                 \n"  // sum * (65536/area) >> 16
      "pmulhuw     %%xmm5,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         4b                            \n"
      "jmp         49f                           \n"

      // 4 pixel loop
      LABELALIGN
      "40:                                       \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"  // - topleft[width]
      "psubd       0x10(%0,%4,4),%%xmm1          \n"
      "psubd       0x20(%0,%4,4),%%xmm2          \n"
      "psubd       0x30(%0,%4,4),%%xmm3          \n"
      "lea         0x40(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"  // - botleft
      "psubd       0x10(%1),%%xmm1               \n"
      "psubd       0x20(%1),%%xmm2               \n"
      "psubd       0x30(%1),%%xmm3               \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"  // + botleft[width]
      "paddd       0x10(%1,%4,4),%%xmm1          \n"
      "paddd       0x20(%1,%4,4),%%xmm2          \n"
      "paddd       0x30(%1,%4,4),%%xmm3          \n"
      "lea         0x40(%1),%1                   \n"
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"  // average = sum * 1/area
      "cvtdq2ps    %%xmm1,%%xmm1                 \n"
      "mulps       %%xmm4,%%xmm0                 \n"
      "mulps       %%xmm4,%%xmm1                 \n"
      "cvtdq2ps    %%xmm2,%%xmm2                 \n"
      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
      "mulps       %%xmm4,%%xmm2                 \n"
      "mulps       %%xmm4,%%xmm3                 \n"
      "cvtps2dq    %%xmm0,%%xmm0                 \n"
      "cvtps2dq    %%xmm1,%%xmm1                 \n"
      "cvtps2dq    %%xmm2,%%xmm2                 \n"
      "cvtps2dq    %%xmm3,%%xmm3                 \n"
      "packssdw    %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%3                       \n"  // restore remainder count
      "jl          19f                           \n"

      // 1 pixel loop
      LABELALIGN
      "10:                                       \n"
      "movdqu      (%0),%%xmm0                   \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"
      "lea         0x10(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"
      "lea         0x10(%1),%1                   \n"
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
      "mulps       %%xmm4,%%xmm0                 \n"
      "cvtps2dq    %%xmm0,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movd        %%xmm0,(%2)                   \n"
      "lea         0x4(%2),%2                    \n"
      "sub         $0x1,%3                       \n"
      "jge         10b                           \n"
      "19:                                       \n"
      : "+r"(topleft),           // %0
        "+r"(botleft),           // %1
        "+r"(dst),               // %2
        "+rm"(count)             // %3
      : "r"((intptr_t)(width)),  // %4
        "rm"(area)               // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
8571 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
8572 
8573 #ifdef HAS_ARGBAFFINEROW_SSE2
8574 // Copy ARGB pixels from source image with slope to a row of destination.
8575 LIBYUV_API
// Copies a row of ARGB pixels sampled along an affine path through the
// source image.  src_dudv holds the starting (u,v) followed by the
// per-pixel step (du,dv) as floats; each destination pixel is fetched from
// src_argb at the truncated integer (u,v).
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8_t* src_argb,
                        int src_argb_stride,
                        uint8_t* dst_argb,
                        const float* src_dudv,
                        int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile(
      "movq        (%3),%%xmm2                   \n"  // xmm2 = start (u,v)
      "movq        0x08(%3),%%xmm7               \n"  // xmm7 = step (du,dv)
      // Pack the words (4, stride) so pmaddwd on (x,y) shorts yields the
      // byte offset x*4 + y*stride.
      "shl         $0x10,%1                      \n"
      "add         $0x4,%1                       \n"
      "movd        %1,%%xmm5                     \n"
      "sub         $0x4,%4                       \n"
      "jl          49f                           \n"

      // Set up (u,v) for 4 consecutive pixels and a 4-pixel step in xmm4.
      "pshufd      $0x44,%%xmm7,%%xmm7           \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "movdqa      %%xmm2,%%xmm0                 \n"
      "addps       %%xmm7,%%xmm0                 \n"
      "movlhps     %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm7,%%xmm4                 \n"
      "addps       %%xmm4,%%xmm4                 \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "addps       %%xmm4,%%xmm3                 \n"
      "addps       %%xmm4,%%xmm4                 \n"

      // 4 pixel loop
      LABELALIGN
      "40:                                       \n"
      "cvttps2dq   %%xmm2,%%xmm0                 \n"  // x,y float->int first 2
      "cvttps2dq   %%xmm3,%%xmm1                 \n"  // x,y float->int next 2
      "packssdw    %%xmm1,%%xmm0                 \n"  // x, y as 8 shorts
      "pmaddwd     %%xmm5,%%xmm0                 \n"  // off = x*4 + y*stride
      "movd        %%xmm0,%k1                    \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
      "movd        %%xmm0,%k5                    \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
      "movd        0x00(%0,%1,1),%%xmm1          \n"  // gather pixels 0 and 1
      "movd        0x00(%0,%5,1),%%xmm6          \n"
      "punpckldq   %%xmm6,%%xmm1                 \n"
      "addps       %%xmm4,%%xmm2                 \n"  // advance (u,v) by 4
      "movq        %%xmm1,(%2)                   \n"
      "movd        %%xmm0,%k1                    \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
      "movd        %%xmm0,%k5                    \n"
      "movd        0x00(%0,%1,1),%%xmm0          \n"  // gather pixels 2 and 3
      "movd        0x00(%0,%5,1),%%xmm6          \n"
      "punpckldq   %%xmm6,%%xmm0                 \n"
      "addps       %%xmm4,%%xmm3                 \n"
      "movq        %%xmm0,0x08(%2)               \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%4                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%4                       \n"  // restore remainder count
      "jl          19f                           \n"

      // 1 pixel loop
      LABELALIGN
      "10:                                       \n"
      "cvttps2dq   %%xmm2,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "pmaddwd     %%xmm5,%%xmm0                 \n"
      "addps       %%xmm7,%%xmm2                 \n"
      "movd        %%xmm0,%k1                    \n"
      "movd        0x00(%0,%1,1),%%xmm0          \n"
      "movd        %%xmm0,(%2)                   \n"
      "lea         0x04(%2),%2                   \n"
      "sub         $0x1,%4                       \n"
      "jge         10b                           \n"
      "19:                                       \n"
      : "+r"(src_argb),              // %0
        "+r"(src_argb_stride_temp),  // %1
        "+r"(dst_argb),              // %2
        "+r"(src_dudv),              // %3
        "+rm"(width),                // %4
        "=&r"(temp)                  // %5
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
8659 #endif  // HAS_ARGBAFFINEROW_SSE2
8660 
8661 #ifdef HAS_INTERPOLATEROW_SSSE3
8662 // Bilinear filter 16x2 -> 16x1
// Bilinear filter 16x2 -> 16x1: blends src_ptr with the row src_stride
// bytes below it, weighted by source_y_fraction/256.  Fractions 0 (plain
// copy) and 128 (pavgb) take dedicated fast paths.
void InterpolateRow_SSSE3(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int width,
                          int source_y_fraction) {
  asm volatile(
      // Convert dst_ptr into an offset relative to src_ptr.
      "sub         %1,%0                         \n"
      "cmp         $0x0,%3                       \n"
      "je          100f                          \n"
      "cmp         $0x80,%3                      \n"
      "je          50f                           \n"

      // xmm5 = repeated byte pairs (256 - frac, frac) for pmaddubsw.
      "movd        %3,%%xmm0                     \n"
      "neg         %3                            \n"
      "add         $0x100,%3                     \n"
      "movd        %3,%%xmm5                     \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"
      "punpcklwd   %%xmm5,%%xmm5                 \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      // xmm4 = 0x80 bias: recenters pixels to signed range so the
      // unsigned*signed pmaddubsw result rounds correctly.
      "mov         $0x80808080,%%eax             \n"
      "movd        %%eax,%%xmm4                  \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"

      // General purpose row blend.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%1),%%xmm0                   \n"
      "movdqu      0x00(%1,%4,1),%%xmm2          \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"  // interleave both rows
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "psubb       %%xmm4,%%xmm0                 \n"  // bias to signed
      "psubb       %%xmm4,%%xmm1                 \n"
      "movdqa      %%xmm5,%%xmm2                 \n"
      "movdqa      %%xmm5,%%xmm3                 \n"
      "pmaddubsw   %%xmm0,%%xmm2                 \n"  // weighted sum per pixel
      "pmaddubsw   %%xmm1,%%xmm3                 \n"
      "paddw       %%xmm4,%%xmm2                 \n"  // undo bias
      "paddw       %%xmm4,%%xmm3                 \n"
      "psrlw       $0x8,%%xmm2                   \n"  // /256
      "psrlw       $0x8,%%xmm3                   \n"
      "packuswb    %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm2,0x00(%1,%0,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "jmp         99f                           \n"

      // Blend 50 / 50.
      LABELALIGN
      "50:                                       \n"
      "movdqu      (%1),%%xmm0                   \n"
      "movdqu      0x00(%1,%4,1),%%xmm1          \n"
      "pavgb       %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          50b                           \n"
      "jmp         99f                           \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100:                                      \n"
      "movdqu      (%1),%%xmm0                   \n"
      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          100b                          \n"

      "99:                                       \n"
      : "+r"(dst_ptr),               // %0
        "+r"(src_ptr),               // %1
        "+rm"(width),                // %2
        "+r"(source_y_fraction)      // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
8740 #endif  // HAS_INTERPOLATEROW_SSSE3
8741 
8742 #ifdef HAS_INTERPOLATEROW_AVX2
8743 // Bilinear filter 32x2 -> 32x1
// Bilinear filter 32x2 -> 32x1: AVX2 version of InterpolateRow_SSSE3.
// Blends src_ptr with the row src_stride bytes below it, weighted by
// source_y_fraction/256; fractions 0 and 128 take dedicated fast paths.
void InterpolateRow_AVX2(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         int width,
                         int source_y_fraction) {
  asm volatile(
      // Convert dst_ptr into an offset relative to src_ptr.
      "sub         %1,%0                         \n"
      "cmp         $0x0,%3                       \n"
      "je          100f                          \n"
      "cmp         $0x80,%3                      \n"
      "je          50f                           \n"

      // ymm5 = repeated byte pairs (256 - frac, frac) for vpmaddubsw.
      "vmovd       %3,%%xmm0                     \n"
      "neg         %3                            \n"
      "add         $0x100,%3                     \n"
      "vmovd       %3,%%xmm5                     \n"
      "vpunpcklbw  %%xmm0,%%xmm5,%%xmm5          \n"
      "vpunpcklwd  %%xmm5,%%xmm5,%%xmm5          \n"
      "vbroadcastss %%xmm5,%%ymm5                \n"
      // ymm4 = 0x80 bias for signed pmaddubsw rounding.
      "mov         $0x80808080,%%eax             \n"
      "vmovd       %%eax,%%xmm4                  \n"
      "vbroadcastss %%xmm4,%%ymm4                \n"

      // General purpose row blend.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%1),%%ymm0                   \n"
      "vmovdqu     0x00(%1,%4,1),%%ymm2          \n"
      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm1          \n"  // interleave both rows
      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm0          \n"
      "vpsubb      %%ymm4,%%ymm1,%%ymm1          \n"  // bias to signed
      "vpsubb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm1,%%ymm5,%%ymm1          \n"  // weighted sum per pixel
      "vpmaddubsw  %%ymm0,%%ymm5,%%ymm0          \n"
      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // undo bias
      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"  // /256
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "jmp         99f                           \n"

      // Blend 50 / 50.
      LABELALIGN
      "50:                                       \n"
      "vmovdqu     (%1),%%ymm0                   \n"
      "vpavgb      0x00(%1,%4,1),%%ymm0,%%ymm0   \n"
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          50b                           \n"
      "jmp         99f                           \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100:                                      \n"
      "vmovdqu     (%1),%%ymm0                   \n"
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          100b                          \n"

      "99:                                       \n"
      "vzeroupper                                \n"
      : "+r"(dst_ptr),               // %0
        "+r"(src_ptr),               // %1
        "+r"(width),                 // %2
        "+r"(source_y_fraction)      // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
}
8818 #endif  // HAS_INTERPOLATEROW_AVX2
8819 
8820 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
8821 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Permutes the 4 bytes of each ARGB pixel using the 16-byte pshufb control
// mask pointed to by 'shuffler'.  Processes 8 pixels (32 bytes) per loop
// iteration; widths that are not a multiple of 8 are presumably handled by
// the *_Any dispatch wrappers — TODO confirm against callers.
ARGBShuffleRow_SSSE3(const uint8_t * src_argb,uint8_t * dst_argb,const uint8_t * shuffler,int width)8822 void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
8823                           uint8_t* dst_argb,
8824                           const uint8_t* shuffler,
8825                           int width) {
8826   asm volatile(
8827 
      // Load the shuffle control mask once; it is reused for every pixel.
8828       "movdqu      (%3),%%xmm5                   \n"
8829 
8830       LABELALIGN
8831       "1:                                        \n"
8832       "movdqu      (%0),%%xmm0                   \n"
8833       "movdqu      0x10(%0),%%xmm1               \n"
8834       "lea         0x20(%0),%0                   \n"
8835       "pshufb      %%xmm5,%%xmm0                 \n"
8836       "pshufb      %%xmm5,%%xmm1                 \n"
8837       "movdqu      %%xmm0,(%1)                   \n"
8838       "movdqu      %%xmm1,0x10(%1)               \n"
8839       "lea         0x20(%1),%1                   \n"
8840       "sub         $0x8,%2                       \n"
8841       "jg          1b                            \n"
8842       : "+r"(src_argb),  // %0
8843         "+r"(dst_argb),  // %1
8844         "+r"(width)      // %2
8845       : "r"(shuffler)    // %3
8846       : "memory", "cc", "xmm0", "xmm1", "xmm5");
8847 }
8848 #endif  // HAS_ARGBSHUFFLEROW_SSSE3
8849 
8850 #ifdef HAS_ARGBSHUFFLEROW_AVX2
8851 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// AVX2 variant of ARGBShuffleRow: the 16-byte 'shuffler' mask is broadcast
// to both 128-bit lanes (vpshufb operates per-lane, so the per-pixel byte
// permutation is identical in each lane).  Processes 16 pixels (64 bytes)
// per loop iteration.
ARGBShuffleRow_AVX2(const uint8_t * src_argb,uint8_t * dst_argb,const uint8_t * shuffler,int width)8852 void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
8853                          uint8_t* dst_argb,
8854                          const uint8_t* shuffler,
8855                          int width) {
8856   asm volatile(
8857 
      // Replicate the 128-bit shuffle mask into both ymm lanes.
8858       "vbroadcastf128 (%3),%%ymm5                \n"
8859 
8860       LABELALIGN
8861       "1:                                        \n"
8862       "vmovdqu     (%0),%%ymm0                   \n"
8863       "vmovdqu     0x20(%0),%%ymm1               \n"
8864       "lea         0x40(%0),%0                   \n"
8865       "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
8866       "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
8867       "vmovdqu     %%ymm0,(%1)                   \n"
8868       "vmovdqu     %%ymm1,0x20(%1)               \n"
8869       "lea         0x40(%1),%1                   \n"
8870       "sub         $0x10,%2                      \n"
8871       "jg          1b                            \n"
8872       "vzeroupper                                \n"
8873       : "+r"(src_argb),  // %0
8874         "+r"(dst_argb),  // %1
8875         "+r"(width)      // %2
8876       : "r"(shuffler)    // %3
8877       : "memory", "cc", "xmm0", "xmm1", "xmm5");
8878 }
8879 #endif  // HAS_ARGBSHUFFLEROW_AVX2
8880 
8881 #ifdef HAS_I422TOYUY2ROW_SSE2
// Packs planar I422 (full-width Y, half-width U and V) into interleaved
// YUY2 (Y0 U0 Y1 V0 ...).  Emits 16 pixels (32 output bytes) per loop
// iteration.
I422ToYUY2Row_SSE2(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_yuy2,int width)8882 void I422ToYUY2Row_SSE2(const uint8_t* src_y,
8883                         const uint8_t* src_u,
8884                         const uint8_t* src_v,
8885                         uint8_t* dst_yuy2,
8886                         int width) {
8887   asm volatile(
8888 
      // %2 becomes (src_v - src_u) so V is addressed as (%1,%2,1) while
      // only the U pointer is advanced — saves a register increment.
8889       "sub         %1,%2                         \n"
8890 
8891       LABELALIGN
8892       "1:                                        \n"
8893       "movq        (%1),%%xmm2                   \n"
8894       "movq        0x00(%1,%2,1),%%xmm1          \n"
8895       "add         $0x8,%1                       \n"
      // xmm2 = interleaved U0 V0 U1 V1 ...
8896       "punpcklbw   %%xmm1,%%xmm2                 \n"
8897       "movdqu      (%0),%%xmm0                   \n"
8898       "add         $0x10,%0                      \n"
8899       "movdqa      %%xmm0,%%xmm1                 \n"
      // Interleave Y with UV to produce Y U Y V byte order.
8900       "punpcklbw   %%xmm2,%%xmm0                 \n"
8901       "punpckhbw   %%xmm2,%%xmm1                 \n"
8902       "movdqu      %%xmm0,(%3)                   \n"
8903       "movdqu      %%xmm1,0x10(%3)               \n"
8904       "lea         0x20(%3),%3                   \n"
8905       "sub         $0x10,%4                      \n"
8906       "jg          1b                            \n"
8907       : "+r"(src_y),     // %0
8908         "+r"(src_u),     // %1
8909         "+r"(src_v),     // %2
8910         "+r"(dst_yuy2),  // %3
8911         "+rm"(width)     // %4
8912       :
8913       : "memory", "cc", "xmm0", "xmm1", "xmm2");
8914 }
8915 #endif  // HAS_I422TOYUY2ROW_SSE2
8916 
8917 #ifdef HAS_I422TOUYVYROW_SSE2
// Packs planar I422 into interleaved UYVY (U0 Y0 V0 Y1 ...).  Same scheme
// as I422ToYUY2Row_SSE2 but with the unpack operand order swapped so the
// chroma byte precedes the luma byte.  16 pixels per iteration.
I422ToUYVYRow_SSE2(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_uyvy,int width)8918 void I422ToUYVYRow_SSE2(const uint8_t* src_y,
8919                         const uint8_t* src_u,
8920                         const uint8_t* src_v,
8921                         uint8_t* dst_uyvy,
8922                         int width) {
8923   asm volatile(
8924 
      // %2 = src_v - src_u; V is then read at (%1,%2,1).
8925       "sub         %1,%2                         \n"
8926 
8927       LABELALIGN
8928       "1:                                        \n"
8929       "movq        (%1),%%xmm2                   \n"
8930       "movq        0x00(%1,%2,1),%%xmm1          \n"
8931       "add         $0x8,%1                       \n"
8932       "punpcklbw   %%xmm1,%%xmm2                 \n"
8933       "movdqu      (%0),%%xmm0                   \n"
8934       "movdqa      %%xmm2,%%xmm1                 \n"
8935       "add         $0x10,%0                      \n"
      // UV bytes are the low operand here, so output order is U Y V Y.
8936       "punpcklbw   %%xmm0,%%xmm1                 \n"
8937       "punpckhbw   %%xmm0,%%xmm2                 \n"
8938       "movdqu      %%xmm1,(%3)                   \n"
8939       "movdqu      %%xmm2,0x10(%3)               \n"
8940       "lea         0x20(%3),%3                   \n"
8941       "sub         $0x10,%4                      \n"
8942       "jg          1b                            \n"
8943       : "+r"(src_y),     // %0
8944         "+r"(src_u),     // %1
8945         "+r"(src_v),     // %2
8946         "+r"(dst_uyvy),  // %3
8947         "+rm"(width)     // %4
8948       :
8949       : "memory", "cc", "xmm0", "xmm1", "xmm2");
8950 }
8951 #endif  // HAS_I422TOUYVYROW_SSE2
8952 
8953 #ifdef HAS_I422TOYUY2ROW_AVX2
// AVX2 I422 -> YUY2: 32 pixels (64 output bytes) per loop iteration.
// U and V are zero-extended to words, V is shifted into the high byte and
// OR'd with U to form interleaved UV words without a cross-lane shuffle.
I422ToYUY2Row_AVX2(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_yuy2,int width)8954 void I422ToYUY2Row_AVX2(const uint8_t* src_y,
8955                         const uint8_t* src_u,
8956                         const uint8_t* src_v,
8957                         uint8_t* dst_yuy2,
8958                         int width) {
8959   asm volatile(
8960 
      // %2 = src_v - src_u; V is read relative to the U pointer.
8961       "sub         %1,%2                         \n"
8962 
8963       LABELALIGN
8964       "1:                                        \n"
8965       "vpmovzxbw   (%1),%%ymm1                   \n"
8966       "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
8967       "add         $0x10,%1                      \n"
      // ymm2 words = (V << 8) | U, i.e. bytes U,V in memory order.
8968       "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
8969       "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
8970       "vmovdqu     (%0),%%ymm0                   \n"
8971       "add         $0x20,%0                      \n"
8972       "vpunpcklbw  %%ymm2,%%ymm0,%%ymm1          \n"
8973       "vpunpckhbw  %%ymm2,%%ymm0,%%ymm2          \n"
      // Per-lane unpacks mutate ordering; storing the four 128-bit halves
      // in low/high, low/high sequence restores linear output order.
8974       "vextractf128 $0x0,%%ymm1,(%3)             \n"
8975       "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
8976       "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
8977       "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
8978       "lea         0x40(%3),%3                   \n"
8979       "sub         $0x20,%4                      \n"
8980       "jg          1b                            \n"
8981       "vzeroupper                                \n"
8982       : "+r"(src_y),     // %0
8983         "+r"(src_u),     // %1
8984         "+r"(src_v),     // %2
8985         "+r"(dst_yuy2),  // %3
8986         "+rm"(width)     // %4
8987       :
8988       : "memory", "cc", "xmm0", "xmm1", "xmm2");
8989 }
8990 #endif  // HAS_I422TOYUY2ROW_AVX2
8991 
8992 #ifdef HAS_I422TOUYVYROW_AVX2
// AVX2 I422 -> UYVY: identical structure to I422ToYUY2Row_AVX2 except the
// unpack operand order is swapped so chroma bytes precede luma bytes.
// 32 pixels (64 output bytes) per loop iteration.
I422ToUYVYRow_AVX2(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_uyvy,int width)8993 void I422ToUYVYRow_AVX2(const uint8_t* src_y,
8994                         const uint8_t* src_u,
8995                         const uint8_t* src_v,
8996                         uint8_t* dst_uyvy,
8997                         int width) {
8998   asm volatile(
8999 
      // %2 = src_v - src_u; V is read relative to the U pointer.
9000       "sub         %1,%2                         \n"
9001 
9002       LABELALIGN
9003       "1:                                        \n"
9004       "vpmovzxbw   (%1),%%ymm1                   \n"
9005       "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
9006       "add         $0x10,%1                      \n"
9007       "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
9008       "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
9009       "vmovdqu     (%0),%%ymm0                   \n"
9010       "add         $0x20,%0                      \n"
      // UV is the low operand here -> output byte order U Y V Y.
9011       "vpunpcklbw  %%ymm0,%%ymm2,%%ymm1          \n"
9012       "vpunpckhbw  %%ymm0,%%ymm2,%%ymm2          \n"
9013       "vextractf128 $0x0,%%ymm1,(%3)             \n"
9014       "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
9015       "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
9016       "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
9017       "lea         0x40(%3),%3                   \n"
9018       "sub         $0x20,%4                      \n"
9019       "jg          1b                            \n"
9020       "vzeroupper                                \n"
9021       : "+r"(src_y),     // %0
9022         "+r"(src_u),     // %1
9023         "+r"(src_v),     // %2
9024         "+r"(dst_uyvy),  // %3
9025         "+rm"(width)     // %4
9026       :
9027       : "memory", "cc", "xmm0", "xmm1", "xmm2");
9028 }
9029 #endif  // HAS_I422TOUYVYROW_AVX2
9030 
9031 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a cubic polynomial C0 + C1*x + C2*x^2 + C3*x^3 independently to
// every channel of every pixel.  'poly' points to 4 vectors of 4 floats:
// C0 at +0x00, C1 at +0x10, C2 at +0x20, C3 at +0x30, one coefficient per
// B/G/R/A channel.  Results are truncated and saturated back to bytes.
// Processes 2 pixels (8 channels) per loop iteration.
ARGBPolynomialRow_SSE2(const uint8_t * src_argb,uint8_t * dst_argb,const float * poly,int width)9032 void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
9033                             uint8_t* dst_argb,
9034                             const float* poly,
9035                             int width) {
9036   asm volatile(
9037 
9038       "pxor        %%xmm3,%%xmm3                 \n"
9039 
9040       // 2 pixel loop.
9041       LABELALIGN
9042       "1:                                        \n"
      // Widen 8 bytes -> 8 floats (xmm0 = pixel 0, xmm4 = pixel 1).
9043       "movq        (%0),%%xmm0                   \n"
9044       "lea         0x8(%0),%0                    \n"
9045       "punpcklbw   %%xmm3,%%xmm0                 \n"
9046       "movdqa      %%xmm0,%%xmm4                 \n"
9047       "punpcklwd   %%xmm3,%%xmm0                 \n"
9048       "punpckhwd   %%xmm3,%%xmm4                 \n"
9049       "cvtdq2ps    %%xmm0,%%xmm0                 \n"
9050       "cvtdq2ps    %%xmm4,%%xmm4                 \n"
      // Keep x in xmm1/xmm5; accumulate C0 + C1*x in xmm0/xmm4.
9051       "movdqa      %%xmm0,%%xmm1                 \n"
9052       "movdqa      %%xmm4,%%xmm5                 \n"
9053       "mulps       0x10(%3),%%xmm0               \n"
9054       "mulps       0x10(%3),%%xmm4               \n"
9055       "addps       (%3),%%xmm0                   \n"
9056       "addps       (%3),%%xmm4                   \n"
      // xmm2/xmm6 = x^2, xmm1/xmm5 = x^3.
9057       "movdqa      %%xmm1,%%xmm2                 \n"
9058       "movdqa      %%xmm5,%%xmm6                 \n"
9059       "mulps       %%xmm1,%%xmm2                 \n"
9060       "mulps       %%xmm5,%%xmm6                 \n"
9061       "mulps       %%xmm2,%%xmm1                 \n"
9062       "mulps       %%xmm6,%%xmm5                 \n"
9063       "mulps       0x20(%3),%%xmm2               \n"
9064       "mulps       0x20(%3),%%xmm6               \n"
9065       "mulps       0x30(%3),%%xmm1               \n"
9066       "mulps       0x30(%3),%%xmm5               \n"
9067       "addps       %%xmm2,%%xmm0                 \n"
9068       "addps       %%xmm6,%%xmm4                 \n"
9069       "addps       %%xmm1,%%xmm0                 \n"
9070       "addps       %%xmm5,%%xmm4                 \n"
      // Truncate to ints, then double-pack saturates down to bytes.
9071       "cvttps2dq   %%xmm0,%%xmm0                 \n"
9072       "cvttps2dq   %%xmm4,%%xmm4                 \n"
9073       "packuswb    %%xmm4,%%xmm0                 \n"
9074       "packuswb    %%xmm0,%%xmm0                 \n"
9075       "movq        %%xmm0,(%1)                   \n"
9076       "lea         0x8(%1),%1                    \n"
9077       "sub         $0x2,%2                       \n"
9078       "jg          1b                            \n"
9079       : "+r"(src_argb),  // %0
9080         "+r"(dst_argb),  // %1
9081         "+r"(width)      // %2
9082       : "r"(poly)        // %3
9083       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
9084 }
9085 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
9086 
9087 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2/FMA3 variant of ARGBPolynomialRow_SSE2: same cubic per-channel
// polynomial with the four 4-float coefficient vectors broadcast to both
// lanes.  2 pixels (8 channels) per iteration.  Note: xmm1 in the clobber
// list is not actually used — harmless over-declaration.
ARGBPolynomialRow_AVX2(const uint8_t * src_argb,uint8_t * dst_argb,const float * poly,int width)9088 void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
9089                             uint8_t* dst_argb,
9090                             const float* poly,
9091                             int width) {
9092   asm volatile(
9093       "vbroadcastf128 (%3),%%ymm4                \n"
9094       "vbroadcastf128 0x10(%3),%%ymm5            \n"
9095       "vbroadcastf128 0x20(%3),%%ymm6            \n"
9096       "vbroadcastf128 0x30(%3),%%ymm7            \n"
9097 
9098       // 2 pixel loop.
9099       LABELALIGN
9100       "1:                                        \n"
9101       "vpmovzxbd   (%0),%%ymm0                   \n"  // 2 ARGB pixels
9102       "lea         0x8(%0),%0                    \n"
9103       "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
9104       "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
9105       "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
9106       "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
9107       "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
9108       "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X *
9109                                                       // X
      // Truncate to ints, pack to words then bytes; vpermq fixes the
      // per-lane mutation introduced by vpackusdw.
9110       "vcvttps2dq  %%ymm0,%%ymm0                 \n"
9111       "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
9112       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
9113       "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
9114       "vmovq       %%xmm0,(%1)                   \n"
9115       "lea         0x8(%1),%1                    \n"
9116       "sub         $0x2,%2                       \n"
9117       "jg          1b                            \n"
9118       "vzeroupper                                \n"
9119       : "+r"(src_argb),  // %0
9120         "+r"(dst_argb),  // %1
9121         "+r"(width)      // %2
9122       : "r"(poly)        // %3
9123       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
9124         "xmm7");
9125 }
9126 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
9127 
9128 #ifdef HAS_HALFFLOATROW_SSE2
// kScaleBias is 2^-112; pre-multiplying by it rebiases the float exponent
// so that a 13-bit right shift of the float's bit pattern yields the
// IEEE 754 half-float encoding without a dedicated convert instruction.
9129 static float kScaleBias = 1.9259299444e-34f;
// Converts uint16 samples to half-floats, scaled by 'scale'.
// Processes 8 values per loop iteration; dst is addressed relative to src
// via the 'sub %0,%1' trick so only one pointer is advanced.
HalfFloatRow_SSE2(const uint16_t * src,uint16_t * dst,float scale,int width)9130 void HalfFloatRow_SSE2(const uint16_t* src,
9131                        uint16_t* dst,
9132                        float scale,
9133                        int width) {
9134   scale *= kScaleBias;
9135   asm volatile(
9136       "movd        %3,%%xmm4                     \n"
9137       "pshufd      $0x0,%%xmm4,%%xmm4            \n"
9138       "pxor        %%xmm5,%%xmm5                 \n"
9139       "sub         %0,%1                         \n"
9140 
9141       // 16 pixel loop.
9142       LABELALIGN
9143       "1:                                        \n"
9144       "movdqu      (%0),%%xmm2                   \n"  // 8 shorts
9145       "add         $0x10,%0                      \n"
9146       "movdqa      %%xmm2,%%xmm3                 \n"
9147       "punpcklwd   %%xmm5,%%xmm2                 \n"  // 8 ints in xmm2/1
9148       "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 8 floats
9149       "punpckhwd   %%xmm5,%%xmm3                 \n"
9150       "cvtdq2ps    %%xmm3,%%xmm3                 \n"
9151       "mulps       %%xmm4,%%xmm2                 \n"
9152       "mulps       %%xmm4,%%xmm3                 \n"
      // Shift out the low 13 mantissa bits: float bits -> half bits.
9153       "psrld       $0xd,%%xmm2                   \n"
9154       "psrld       $0xd,%%xmm3                   \n"
9155       "packssdw    %%xmm3,%%xmm2                 \n"
9156       "movdqu      %%xmm2,-0x10(%0,%1,1)         \n"
9157       "sub         $0x8,%2                       \n"
9158       "jg          1b                            \n"
9159       : "+r"(src),   // %0
9160         "+r"(dst),   // %1
9161         "+r"(width)  // %2
9162       : "m"(scale)   // %3
9163       : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
9164 }
9165 #endif  // HAS_HALFFLOATROW_SSE2
9166 
9167 #ifdef HAS_HALFFLOATROW_AVX2
// AVX2 variant of HalfFloatRow_SSE2: same 2^-112 exponent-rebias trick,
// 16 values per loop iteration.  The per-lane vpunpck mutation is undone
// by the matching vpackssdw, so output order is preserved.
HalfFloatRow_AVX2(const uint16_t * src,uint16_t * dst,float scale,int width)9168 void HalfFloatRow_AVX2(const uint16_t* src,
9169                        uint16_t* dst,
9170                        float scale,
9171                        int width) {
9172   scale *= kScaleBias;
9173   asm volatile(
9174       "vbroadcastss %3, %%ymm4                   \n"
9175       "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
9176       "sub         %0,%1                         \n"
9177 
9178       // 16 pixel loop.
9179       LABELALIGN
9180       "1:                                        \n"
9181       "vmovdqu     (%0),%%ymm2                   \n"  // 16 shorts
9182       "add         $0x20,%0                      \n"
9183       "vpunpckhwd  %%ymm5,%%ymm2,%%ymm3          \n"  // mutates
9184       "vpunpcklwd  %%ymm5,%%ymm2,%%ymm2          \n"
9185       "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
9186       "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
9187       "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
9188       "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
9189       "vpsrld      $0xd,%%ymm3,%%ymm3            \n"
9190       "vpsrld      $0xd,%%ymm2,%%ymm2            \n"
9191       "vpackssdw   %%ymm3, %%ymm2, %%ymm2        \n"  // unmutates
9192       "vmovdqu     %%ymm2,-0x20(%0,%1,1)         \n"
9193       "sub         $0x10,%2                      \n"
9194       "jg          1b                            \n"
9195 
9196       "vzeroupper                                \n"
9197       : "+r"(src),   // %0
9198         "+r"(dst),   // %1
9199         "+r"(width)  // %2
      // x64 can pass the scalar in an xmm register; x86 reads it from memory.
9200 #if defined(__x86_64__)
9201       : "x"(scale)  // %3
9202 #else
9203       : "m"(scale)            // %3
9204 #endif
9205       : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
9206 }
9207 #endif  // HAS_HALFFLOATROW_AVX2
9208 
9209 #ifdef HAS_HALFFLOATROW_F16C
// F16C variant: converts uint16 samples scaled by 'scale' to half-floats
// using the hardware vcvtps2ph instruction (rounding mode 3 = truncate).
// 16 values per loop iteration; dst addressed relative to src.
HalfFloatRow_F16C(const uint16_t * src,uint16_t * dst,float scale,int width)9210 void HalfFloatRow_F16C(const uint16_t* src,
9211                        uint16_t* dst,
9212                        float scale,
9213                        int width) {
9214   asm volatile(
9215       "vbroadcastss %3, %%ymm4                   \n"
9216       "sub         %0,%1                         \n"
9217 
9218       // 16 pixel loop.
9219       LABELALIGN
9220       "1:                                        \n"
9221       "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
9222       "vpmovzxwd   0x10(%0),%%ymm3               \n"
9223       "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
9224       "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
9225       "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
9226       "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
9227       "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
9228       "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
9229       "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
9230       "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
9231       "add         $0x20,%0                      \n"
9232       "sub         $0x10,%2                      \n"
9233       "jg          1b                            \n"
9234       "vzeroupper                                \n"
9235       : "+r"(src),   // %0
9236         "+r"(dst),   // %1
9237         "+r"(width)  // %2
      // x64 can pass the scalar in an xmm register; x86 reads it from memory.
9238 #if defined(__x86_64__)
9239       : "x"(scale)  // %3
9240 #else
9241       : "m"(scale)            // %3
9242 #endif
9243       : "memory", "cc", "xmm2", "xmm3", "xmm4");
9244 }
9245 #endif  // HAS_HALFFLOATROW_F16C
9246 
9247 #ifdef HAS_HALFFLOATROW_F16C
// Unit-scale specialization of HalfFloatRow_F16C: the unnamed float
// parameter keeps the signature compatible with the function-pointer type
// but is ignored (scale is implicitly 1.0), skipping the vmulps step.
HalfFloat1Row_F16C(const uint16_t * src,uint16_t * dst,float,int width)9248 void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
9249   asm volatile(
9250       "sub         %0,%1                         \n"
9251       // 16 pixel loop.
9252       LABELALIGN
9253       "1:                                        \n"
9254       "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
9255       "vpmovzxwd   0x10(%0),%%ymm3               \n"
9256       "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
9257       "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
9258       "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
9259       "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
9260       "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
9261       "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
9262       "add         $0x20,%0                      \n"
9263       "sub         $0x10,%2                      \n"
9264       "jg          1b                            \n"
9265       "vzeroupper                                \n"
9266       : "+r"(src),   // %0
9267         "+r"(dst),   // %1
9268         "+r"(width)  // %2
9269       :
9270       : "memory", "cc", "xmm2", "xmm3");
9271 }
9272 #endif  // HAS_HALFFLOATROW_F16C
9273 
9274 #ifdef HAS_ARGBCOLORTABLEROW_X86
9275 // Tranform ARGB pixels with color table.
// In-place scalar lookup: each of the four channels of every pixel is
// replaced by table_argb[channel + value * 4], i.e. the table interleaves
// one 256-entry lookup per channel.  1 pixel per loop iteration.
// pixel_temp is pinned to rdx/edx ("=&d") so %b1 names its byte register.
ARGBColorTableRow_X86(uint8_t * dst_argb,const uint8_t * table_argb,int width)9276 void ARGBColorTableRow_X86(uint8_t* dst_argb,
9277                            const uint8_t* table_argb,
9278                            int width) {
9279   uintptr_t pixel_temp;
9280   asm volatile(
9281       // 1 pixel loop.
9282       LABELALIGN
9283       "1:                                        \n"
9284       "movzb       (%0),%1                       \n"
9285       "lea         0x4(%0),%0                    \n"
9286       "movzb       0x00(%3,%1,4),%1              \n"
9287       "mov         %b1,-0x4(%0)                  \n"
9288       "movzb       -0x3(%0),%1                   \n"
9289       "movzb       0x01(%3,%1,4),%1              \n"
9290       "mov         %b1,-0x3(%0)                  \n"
9291       "movzb       -0x2(%0),%1                   \n"
9292       "movzb       0x02(%3,%1,4),%1              \n"
9293       "mov         %b1,-0x2(%0)                  \n"
9294       "movzb       -0x1(%0),%1                   \n"
9295       "movzb       0x03(%3,%1,4),%1              \n"
9296       "mov         %b1,-0x1(%0)                  \n"
9297       "dec         %2                            \n"
9298       "jg          1b                            \n"
9299       : "+r"(dst_argb),     // %0
9300         "=&d"(pixel_temp),  // %1
9301         "+r"(width)         // %2
9302       : "r"(table_argb)     // %3
9303       : "memory", "cc");
9304 }
9305 #endif  // HAS_ARGBCOLORTABLEROW_X86
9306 
9307 #ifdef HAS_RGBCOLORTABLEROW_X86
9308 // Tranform RGB pixels with color table.
// Same lookup scheme as ARGBColorTableRow_X86 but only the first three
// channels (B, G, R) are remapped; the alpha byte at offset 3 is left
// untouched.  1 pixel per loop iteration.
RGBColorTableRow_X86(uint8_t * dst_argb,const uint8_t * table_argb,int width)9309 void RGBColorTableRow_X86(uint8_t* dst_argb,
9310                           const uint8_t* table_argb,
9311                           int width) {
9312   uintptr_t pixel_temp;
9313   asm volatile(
9314       // 1 pixel loop.
9315       LABELALIGN
9316       "1:                                        \n"
9317       "movzb       (%0),%1                       \n"
9318       "lea         0x4(%0),%0                    \n"
9319       "movzb       0x00(%3,%1,4),%1              \n"
9320       "mov         %b1,-0x4(%0)                  \n"
9321       "movzb       -0x3(%0),%1                   \n"
9322       "movzb       0x01(%3,%1,4),%1              \n"
9323       "mov         %b1,-0x3(%0)                  \n"
9324       "movzb       -0x2(%0),%1                   \n"
9325       "movzb       0x02(%3,%1,4),%1              \n"
9326       "mov         %b1,-0x2(%0)                  \n"
9327       "dec         %2                            \n"
9328       "jg          1b                            \n"
9329       : "+r"(dst_argb),     // %0
9330         "=&d"(pixel_temp),  // %1
9331         "+r"(width)         // %2
9332       : "r"(table_argb)     // %3
9333       : "memory", "cc");
9334 }
9335 #endif  // HAS_RGBCOLORTABLEROW_X86
9336 
9337 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
9338 // Tranform RGB pixels with luma table.
// For each pixel, a luma value is computed from B/G/R via pmaddubsw with
// the 'lumacoeff' weights; the luma (masked to its high byte, so it steps
// in 256-byte increments) selects a row of the 'luma' table, and B, G, R
// are each mapped through that 256-entry row.  Alpha is copied unchanged.
// 4 pixels per loop iteration.  %b0 addressing requires pixel_temp in
// rdx/edx ("=&d") and table_temp in rax/eax ("=&a").
ARGBLumaColorTableRow_SSSE3(const uint8_t * src_argb,uint8_t * dst_argb,int width,const uint8_t * luma,uint32_t lumacoeff)9339 void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
9340                                  uint8_t* dst_argb,
9341                                  int width,
9342                                  const uint8_t* luma,
9343                                  uint32_t lumacoeff) {
9344   uintptr_t pixel_temp;
9345   uintptr_t table_temp;
9346   asm volatile(
      // xmm3 = broadcast luma coefficients; xmm4 = 0xff00 word mask.
9347       "movd        %6,%%xmm3                     \n"
9348       "pshufd      $0x0,%%xmm3,%%xmm3            \n"
9349       "pcmpeqb     %%xmm4,%%xmm4                 \n"
9350       "psllw       $0x8,%%xmm4                   \n"
9351       "pxor        %%xmm5,%%xmm5                 \n"
9352 
9353       // 4 pixel loop.
9354       LABELALIGN
9355       "1:                                        \n"
      // Compute 4 luma values; xmm0 then holds one 32-bit table offset per
      // pixel, rotated out one at a time with pshufd $0x39.
9356       "movdqu      (%2),%%xmm0                   \n"
9357       "pmaddubsw   %%xmm3,%%xmm0                 \n"
9358       "phaddw      %%xmm0,%%xmm0                 \n"
9359       "pand        %%xmm4,%%xmm0                 \n"
9360       "punpcklwd   %%xmm5,%%xmm0                 \n"
9361       "movd        %%xmm0,%k1                    \n"  // 32 bit offset
9362       "add         %5,%1                         \n"
9363       "pshufd      $0x39,%%xmm0,%%xmm0           \n"
9364 
      // Pixel 0: map B, G, R through the selected table row; copy A.
9365       "movzb       (%2),%0                       \n"
9366       "movzb       0x00(%1,%0,1),%0              \n"
9367       "mov         %b0,(%3)                      \n"
9368       "movzb       0x1(%2),%0                    \n"
9369       "movzb       0x00(%1,%0,1),%0              \n"
9370       "mov         %b0,0x1(%3)                   \n"
9371       "movzb       0x2(%2),%0                    \n"
9372       "movzb       0x00(%1,%0,1),%0              \n"
9373       "mov         %b0,0x2(%3)                   \n"
9374       "movzb       0x3(%2),%0                    \n"
9375       "mov         %b0,0x3(%3)                   \n"
9376 
9377       "movd        %%xmm0,%k1                    \n"  // 32 bit offset
9378       "add         %5,%1                         \n"
9379       "pshufd      $0x39,%%xmm0,%%xmm0           \n"
9380 
      // Pixel 1.
9381       "movzb       0x4(%2),%0                    \n"
9382       "movzb       0x00(%1,%0,1),%0              \n"
9383       "mov         %b0,0x4(%3)                   \n"
9384       "movzb       0x5(%2),%0                    \n"
9385       "movzb       0x00(%1,%0,1),%0              \n"
9386       "mov         %b0,0x5(%3)                   \n"
9387       "movzb       0x6(%2),%0                    \n"
9388       "movzb       0x00(%1,%0,1),%0              \n"
9389       "mov         %b0,0x6(%3)                   \n"
9390       "movzb       0x7(%2),%0                    \n"
9391       "mov         %b0,0x7(%3)                   \n"
9392 
9393       "movd        %%xmm0,%k1                    \n"  // 32 bit offset
9394       "add         %5,%1                         \n"
9395       "pshufd      $0x39,%%xmm0,%%xmm0           \n"
9396 
      // Pixel 2.
9397       "movzb       0x8(%2),%0                    \n"
9398       "movzb       0x00(%1,%0,1),%0              \n"
9399       "mov         %b0,0x8(%3)                   \n"
9400       "movzb       0x9(%2),%0                    \n"
9401       "movzb       0x00(%1,%0,1),%0              \n"
9402       "mov         %b0,0x9(%3)                   \n"
9403       "movzb       0xa(%2),%0                    \n"
9404       "movzb       0x00(%1,%0,1),%0              \n"
9405       "mov         %b0,0xa(%3)                   \n"
9406       "movzb       0xb(%2),%0                    \n"
9407       "mov         %b0,0xb(%3)                   \n"
9408 
9409       "movd        %%xmm0,%k1                    \n"  // 32 bit offset
9410       "add         %5,%1                         \n"
9411 
      // Pixel 3.
9412       "movzb       0xc(%2),%0                    \n"
9413       "movzb       0x00(%1,%0,1),%0              \n"
9414       "mov         %b0,0xc(%3)                   \n"
9415       "movzb       0xd(%2),%0                    \n"
9416       "movzb       0x00(%1,%0,1),%0              \n"
9417       "mov         %b0,0xd(%3)                   \n"
9418       "movzb       0xe(%2),%0                    \n"
9419       "movzb       0x00(%1,%0,1),%0              \n"
9420       "mov         %b0,0xe(%3)                   \n"
9421       "movzb       0xf(%2),%0                    \n"
9422       "mov         %b0,0xf(%3)                   \n"
9423       "lea         0x10(%2),%2                   \n"
9424       "lea         0x10(%3),%3                   \n"
9425       "sub         $0x4,%4                       \n"
9426       "jg          1b                            \n"
9427       : "=&d"(pixel_temp),  // %0
9428         "=&a"(table_temp),  // %1
9429         "+r"(src_argb),     // %2
9430         "+r"(dst_argb),     // %3
9431         "+rm"(width)        // %4
9432       : "r"(luma),          // %5
9433         "rm"(lumacoeff)     // %6
9434       : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
9435 }
9436 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
9437 
// Three pshufb control masks used by NV21ToYUV24Row_SSSE3 to weave
// Y bytes (indices 0..7) with VU bytes (indices 8..15) from the shufps
// combined registers into 3x16 bytes of packed VUY output.
9438 static const uvec8 kYUV24Shuffle[3] = {
9439     {8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12},
9440     {9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15},
9441     {2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7}};
9442 
9443 // Convert biplanar NV21 to packed YUV24
9444 // NV21 has VU in memory for chroma.
9445 // YUV24 is VUY in memory
// 16 pixels (48 output bytes) per loop iteration.  The VU plane is
// addressed relative to the Y pointer (sub %0,%1), so only the Y pointer
// is advanced.  Each shufps pairs 8 Y bytes with the 4 VU pairs they
// share, then the kYUV24Shuffle masks weave them into VUY triplets.
NV21ToYUV24Row_SSSE3(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_yuv24,int width)9446 void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
9447                           const uint8_t* src_vu,
9448                           uint8_t* dst_yuv24,
9449                           int width) {
9450   asm volatile(
9451       "sub         %0,%1                         \n"
9452       "movdqa      (%4),%%xmm4                   \n"  // 3 shuffler constants
9453       "movdqa      16(%4),%%xmm5                 \n"
9454       "movdqa      32(%4),%%xmm6                 \n"
9455       "1:                                        \n"
9456       "movdqu      (%0),%%xmm2                   \n"  // load 16 Y values
9457       "movdqu      (%0,%1),%%xmm3                \n"  // load 8 VU values
9458       "lea         16(%0),%0                     \n"
9459       "movdqa      %%xmm2,%%xmm0                 \n"
9460       "movdqa      %%xmm2,%%xmm1                 \n"
9461       "shufps      $0x44,%%xmm3,%%xmm0           \n"  // Y 0..7,  UV 0..3
9462       "shufps      $0x99,%%xmm3,%%xmm1           \n"  // Y 4..11, UV 2..5
9463       "shufps      $0xee,%%xmm3,%%xmm2           \n"  // Y 8..15, UV 4..7
9464       "pshufb      %%xmm4, %%xmm0                \n"  // weave into YUV24
9465       "pshufb      %%xmm5, %%xmm1                \n"
9466       "pshufb      %%xmm6, %%xmm2                \n"
9467       "movdqu      %%xmm0,(%2)                   \n"
9468       "movdqu      %%xmm1,16(%2)                 \n"
9469       "movdqu      %%xmm2,32(%2)                 \n"
9470       "lea         48(%2),%2                     \n"
9471       "sub         $16,%3                        \n"  // 16 pixels per loop
9472       "jg          1b                            \n"
9473       : "+r"(src_y),            // %0
9474         "+r"(src_vu),           // %1
9475         "+r"(dst_yuv24),        // %2
9476         "+r"(width)             // %3
9477       : "r"(&kYUV24Shuffle[0])  // %4
9478       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
9479 }
9480 
// Convert biplanar NV21 to packed YUV24
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory
// AVX2 variant of NV21ToYUV24Row_SSSE3: 32 pixels per loop. Shuffle masks
// are broadcast into both 128-bit lanes; vperm2i128 then reorders the
// per-lane results so the three 32-byte stores are sequential output.
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
                         int width) {
  asm volatile(
      // Turn %1 into the byte offset from src_y to src_vu so advancing
      // %0 alone walks both planes in lockstep.
      "sub         %0,%1                         \n"
      "vbroadcastf128 (%4),%%ymm4                \n"  // 3 shuffler constants
      "vbroadcastf128 16(%4),%%ymm5              \n"
      "vbroadcastf128 32(%4),%%ymm6              \n"

      "1:                                        \n"
      "vmovdqu     (%0),%%ymm2                   \n"  // load 32 Y values
      "vmovdqu     (%0,%1),%%ymm3                \n"  // load 16 VU values
      "lea         32(%0),%0                     \n"
      "vshufps     $0x44,%%ymm3,%%ymm2,%%ymm0    \n"  // Y 0..7,  UV 0..3
      "vshufps     $0x99,%%ymm3,%%ymm2,%%ymm1    \n"  // Y 4..11, UV 2..5
      "vshufps     $0xee,%%ymm3,%%ymm2,%%ymm2    \n"  // Y 8..15, UV 4..7
      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"  // weave into YUV24
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
      // Recombine 128-bit lanes into sequential order for the stores.
      "vperm2i128  $0x20,%%ymm1,%%ymm0,%%ymm3    \n"
      "vperm2i128  $0x30,%%ymm0,%%ymm2,%%ymm0    \n"
      "vperm2i128  $0x31,%%ymm2,%%ymm1,%%ymm1    \n"
      "vmovdqu     %%ymm3,(%2)                   \n"
      "vmovdqu     %%ymm0,32(%2)                 \n"
      "vmovdqu     %%ymm1,64(%2)                 \n"
      "lea         96(%2),%2                     \n"
      "sub         $32,%3                        \n"  // 32 pixels per loop
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_y),            // %0
        "+r"(src_vu),           // %1
        "+r"(dst_yuv24),        // %2
        "+r"(width)             // %3
      : "r"(&kYUV24Shuffle[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
9521 
9522 #ifdef HAS_NV21ToYUV24ROW_AVX512
// The following VBMI VEX256 code tests okay with the Intel SDE emulator.
// vpermt2b index tables for NV21ToYUV24Row_AVX512: indices 0..31 select
// from the Y register, 32..63 from the VU register, producing 96 bytes of
// packed VUY directly without lane fixup.
static const lvec8 kYUV24Perm[3] = {
    {32, 33, 0,  32, 33, 1,  34, 35, 2,  34, 35, 3,  36, 37, 4,  36,
     37, 5,  38, 39, 6,  38, 39, 7,  40, 41, 8,  40, 41, 9,  42, 43},
    {10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
     48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52},
    {53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
     26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31}};
9531 
// Convert biplanar NV21 to packed YUV24 (VUY in memory), 32 pixels per
// loop, using vpermt2b (AVX512 VBMI, 256-bit encoding) to pick bytes from
// the Y and VU registers in a single instruction per output vector.
void NV21ToYUV24Row_AVX512(const uint8_t* src_y,
                           const uint8_t* src_vu,
                           uint8_t* dst_yuv24,
                           int width) {
  asm volatile(
      // Turn %1 into the byte offset from src_y to src_vu so advancing
      // %0 alone walks both planes in lockstep.
      "sub         %0,%1                         \n"
      "vmovdqa     (%4),%%ymm4                   \n"  // 3 shuffler constants
      "vmovdqa     32(%4),%%ymm5                 \n"
      "vmovdqa     64(%4),%%ymm6                 \n" LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm2                   \n"  // load 32 Y values
      "vmovdqu     (%0,%1),%%ymm3                \n"  // load 16 VU values
      "lea         32(%0),%0                     \n"
      // vpermt2b overwrites its first source, so copy the Y register for
      // the first two permutes and let the third consume ymm2 itself.
      "vmovdqa     %%ymm2, %%ymm0                \n"
      "vmovdqa     %%ymm2, %%ymm1                \n"
      "vpermt2b    %%ymm3,%%ymm4,%%ymm0          \n"
      "vpermt2b    %%ymm3,%%ymm5,%%ymm1          \n"
      "vpermt2b    %%ymm3,%%ymm6,%%ymm2          \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "vmovdqu     %%ymm1,32(%2)                 \n"
      "vmovdqu     %%ymm2,64(%2)                 \n"
      "lea         96(%2),%2                     \n"
      "sub         $32,%3                        \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_y),         // %0
        "+r"(src_vu),        // %1
        "+r"(dst_yuv24),     // %2
        "+r"(width)          // %3
      : "r"(&kYUV24Perm[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
9564 
9565 #endif  // HAS_NV21ToYUV24ROW_AVX512
9566 
9567 #ifdef HAS_SWAPUVROW_SSSE3
9568 
// Shuffle table that swaps each adjacent byte pair (U<->V within a pixel).
static const uvec8 kShuffleUVToVU = {1u, 0u, 3u,  2u,  5u,  4u,  7u,  6u,
                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
9572 
// Convert UV plane of NV12 to VU of NV21.
// Swaps each adjacent byte pair with pshufb; processes 16 UV pixels
// (32 bytes) per loop iteration.
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      "movdqu      %3,%%xmm5                     \n"  // byte-pair swap mask

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pshufb      %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_uv),        // %0
        "+r"(dst_vu),        // %1
        "+r"(width)          // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
9597 #endif  // HAS_SWAPUVROW_SSSE3
9598 
9599 #ifdef HAS_SWAPUVROW_AVX2
// Convert UV plane of NV12 to VU of NV21 (AVX2 variant of SwapUVRow_SSSE3).
// The 16-byte swap mask is broadcast into both lanes; 32 UV pixels
// (64 bytes) per loop iteration.
void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm5                  \n"  // byte-pair swap mask

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uv),        // %0
        "+r"(dst_vu),        // %1
        "+r"(width)          // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
9624 #endif  // HAS_SWAPUVROW_AVX2
9625 
// Merge separate U and V planes into interleaved UV at half resolution:
// each output byte is the rounded average of a 2x2 block (two adjacent
// columns from two adjacent rows). Consumes 16 source pixels per loop,
// producing 8 UV pairs.
void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
                          int src_stride_u,
                          const uint8_t* src_v,
                          int src_stride_v,
                          uint8_t* dst_uv,
                          int width) {
  asm volatile(
      // Build xmm4 = 16 bytes of 0x01; pmaddubsw with it sums each
      // adjacent byte pair into a 16-bit word.
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"  // zero, used for rounding

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // load 16 U values
      "movdqu      (%1),%%xmm1                   \n"  // load 16 V values
      "movdqu      0(%0,%4,1),%%xmm2             \n"  // 16 from next row
      "movdqu      0(%1,%5,1),%%xmm3             \n"
      "lea         0x10(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"  // half size
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "lea         0x10(%1),%1                   \n"
      "paddw       %%xmm2,%%xmm0                 \n"  // add the two rows
      "paddw       %%xmm3,%%xmm1                 \n"
      "psrlw       $0x1,%%xmm0                   \n"
      "psrlw       $0x1,%%xmm1                   \n"
      "pavgw       %%xmm5,%%xmm0                 \n"  // avg with 0: +1 then >>1
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "punpcklbw   %%xmm1,%%xmm0                 \n"  // interleave U and V
      "movdqu      %%xmm0,(%2)                   \n"  // store 8 UV pixels
      "lea         0x10(%2),%2                   \n"
      "sub         $0x10,%3                      \n"  // 16 src pixels per loop
      "jg          1b                            \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
9671 
// AVX2 variant of HalfMergeUVRow_SSSE3: merge separate U and V planes into
// interleaved UV at half resolution, averaging 2x2 blocks with rounding.
// Consumes 32 source pixels per loop, producing 16 UV pairs.
void HalfMergeUVRow_AVX2(const uint8_t* src_u,
                         int src_stride_u,
                         const uint8_t* src_v,
                         int src_stride_v,
                         uint8_t* dst_uv,
                         int width) {
  asm volatile(
      // Build ymm4 = 32 bytes of 0x01; vpmaddubsw with it sums each
      // adjacent byte pair into a 16-bit word.
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"  // zero, used for rounding

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // load 32 U values
      "vmovdqu     (%1),%%ymm1                   \n"  // load 32 V values
      "vmovdqu     0(%0,%4,1),%%ymm2             \n"  // 32 from next row
      "vmovdqu     0(%1,%5,1),%%ymm3             \n"
      "lea         0x20(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // half size
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "lea         0x20(%1),%1                   \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"  // add the two rows
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"  // avg with 0: +1 then >>1
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"  // interleave U and V
      "vmovdqu     %%ymm0,(%2)                   \n"  // store 16 UV pixels
      "lea         0x20(%2),%2                   \n"
      "sub         $0x20,%3                      \n"  // 32 src pixels per loop
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
9718 
ClampFloatToZero_SSE2(const float * src_x,float * dst_y,int width)9719 void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
9720   asm volatile(
9721       "pxor        %%xmm1,%%xmm1                 \n"
9722 
9723       LABELALIGN
9724       "1:                                        \n"
9725       "movd        (%0),%%xmm0                   \n"  // load float
9726       "maxss       %%xmm1, %%xmm0                \n"  // clamp to zero
9727       "add         4, %0                         \n"
9728       "movd        %%xmm0, (%1)                  \n"  // store float
9729       "add         4, %1                         \n"
9730       "sub         $0x4,%2                       \n"  // 1 float per loop
9731       "jg          1b                            \n"
9732       : "+r"(src_x),  // %0
9733         "+r"(dst_y),  // %1
9734         "+r"(width)   // %2
9735       :
9736       : "memory", "cc", "xmm0", "xmm1");
9737 }
9738 
9739 #endif  // defined(__x86_64__) || defined(__i386__)
9740 
9741 #ifdef __cplusplus
9742 }  // extern "C"
9743 }  // namespace libyuv
9744 #endif
9745