/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
                               25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};

// JPEG full range.
static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
                                29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};

static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
                                0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
                              -18, -94, 112, 0, -18, -94, 112, 0};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
                               0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
                               66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
                               0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
                               0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};

static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                               0x8080u, 0x8080u, 0x8080u, 0x8080u};

#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
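
// Illustrative scalar equivalents of the constants above (a sketch, not part
// of libyuv; helper names are hypothetical). The coefficient byte order
// matches little-endian ARGB memory order (B, G, R, A); kAddY16 (0x7e80) and
// kSub128 (0x8080) supply the rounding/bias terms the SIMD paths fold in.
static inline uint8_t ScalarARGBToY(uint8_t b, uint8_t g, uint8_t r) {
  // BT.601 limited range: 16 + 0.257R + 0.504G + 0.098B, in 8.8 fixed point.
  return (uint8_t)(((66 * r + 129 * g + 25 * b + 128) >> 8) + 16);
}
static inline uint8_t ScalarARGBToU(uint8_t b, uint8_t g, uint8_t r) {
  // 0x8080 is the 16-bit lane bias: +128 chroma offset plus 0.5 rounding.
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}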

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
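
// How the shuffle tables are consumed (an illustrative sketch, not libyuv
// code): pshufb selects dst[i] = src[mask[i] & 15], or 0 when mask[i] has the
// high bit set (the 128u entries below). For kShuffleMaskRGB24ToARGB, indices
// 12-15 land in the alpha positions, which the row functions then force to
// 0xff by OR'ing with an alpha mask.
static inline void ScalarShuffle16(const uint8_t* src,
                                   const uint8_t* mask,
                                   uint8_t* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}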

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGBA.
static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u,  4u,  3u,
                                            14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};

// Shuffle table for converting RAW to RGB24.  First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_J400TOARGBROW_SSE2
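// J400 (grey) to ARGB: each Y byte is replicated into B, G and R with
// punpcklbw/punpcklwd, then an opaque alpha (the 0xff000000 mask built in
// xmm5) is OR'd in. 8 pixels in, 32 bytes of ARGB out per loop iteration.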
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0x18,%%xmm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "lea         0x8(%0),%0                    \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklwd   %%xmm0,%%xmm0                 \n"
      "punpckhwd   %%xmm1,%%xmm1                 \n"
      "por         %%xmm5,%%xmm0                 \n"
      "por         %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_J400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff000000
      "pslld       $0x18,%%xmm5                  \n"
      "movdqa      %3,%%xmm4                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm3               \n"
      "lea         0x30(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "palignr     $0x8,%%xmm1,%%xmm2            \n"
      "pshufb      %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "por         %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "palignr     $0x4,%%xmm3,%%xmm3            \n"
      "pshufb      %%xmm4,%%xmm3                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm3,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_rgb24),              // %0
        "+r"(dst_argb),               // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskRGB24ToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0xff000000
      "pslld       $0x18,%%xmm5                  \n"
      "movdqa      %3,%%xmm4                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm3               \n"
      "lea         0x30(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "palignr     $0x8,%%xmm1,%%xmm2            \n"
      "pshufb      %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "por         %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "palignr     $0x4,%%xmm3,%%xmm3            \n"
      "pshufb      %%xmm4,%%xmm3                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm3,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_raw),              // %0
        "+r"(dst_argb),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskRAWToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// Same code as RAWToARGBRow with a different shuffler and A in the low bits.
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // 0x000000ff
      "psrld       $0x18,%%xmm5                  \n"
      "movdqa      %3,%%xmm4                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm3               \n"
      "lea         0x30(%0),%0                   \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "palignr     $0x8,%%xmm1,%%xmm2            \n"
      "pshufb      %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "por         %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "palignr     $0x4,%%xmm3,%%xmm3            \n"
      "pshufb      %%xmm4,%%xmm3                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm3,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_raw),              // %0
        "+r"(dst_rgba),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskRAWToRGBA)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "movdqa      %3,%%xmm3                     \n"
      "movdqa      %4,%%xmm4                     \n"
      "movdqa      %5,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x4(%0),%%xmm1                \n"
      "movdqu      0x8(%0),%%xmm2                \n"
      "lea         0x18(%0),%0                   \n"
      "pshufb      %%xmm3,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm2                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x8(%1)                \n"
      "movq        %%xmm2,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_raw),                  // %0
        "+r"(dst_rgb24),                // %1
        "+r"(width)                     // %2
      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
        "m"(kShuffleMaskRAWToRGB24_1),  // %4
        "m"(kShuffleMaskRAWToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

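// RGB565 to ARGB. A note on the constants below (editorial, an assumption
// read from the code, not an upstream comment): with a 5-bit value x in the
// top bits of a 16-bit lane, pmulhuw by 0x0108 yields (x << 3) | (x >> 2),
// and with the 6-bit green masked in place, 0x2080 yields (g << 2) | (g >> 4),
// i.e. each field is expanded to 8 bits with bit replication. The two
// "sub %0,%1" instructions bias dst by -2 * src so the (%1,%0,2) addressing
// mode can cover both buffers with one index register.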
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0x1080108,%%eax              \n"
      "movd        %%eax,%%xmm5                  \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "mov         $0x20802080,%%eax             \n"
      "movd        %%eax,%%xmm6                  \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "psllw       $0xb,%%xmm3                   \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psllw       $0xa,%%xmm4                   \n"
      "psrlw       $0x5,%%xmm4                   \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psllw       $0x8,%%xmm7                   \n"
      "sub         %0,%1                         \n"
      "sub         %0,%1                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pand        %%xmm3,%%xmm1                 \n"
      "psllw       $0xb,%%xmm2                   \n"
      "pmulhuw     %%xmm5,%%xmm1                 \n"
      "pmulhuw     %%xmm5,%%xmm2                 \n"
      "psllw       $0x8,%%xmm1                   \n"
      "por         %%xmm2,%%xmm1                 \n"
      "pand        %%xmm4,%%xmm0                 \n"
      "pmulhuw     %%xmm6,%%xmm0                 \n"
      "por         %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm1,%%xmm2                 \n"
      "punpcklbw   %%xmm0,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm2                 \n"
      "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
      "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0x1080108,%%eax              \n"
      "movd        %%eax,%%xmm5                  \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "mov         $0x42004200,%%eax             \n"
      "movd        %%eax,%%xmm6                  \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "psllw       $0xb,%%xmm3                   \n"
      "movdqa      %%xmm3,%%xmm4                 \n"
      "psrlw       $0x6,%%xmm4                   \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psllw       $0x8,%%xmm7                   \n"
      "sub         %0,%1                         \n"
      "sub         %0,%1                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "psllw       $0x1,%%xmm1                   \n"
      "psllw       $0xb,%%xmm2                   \n"
      "pand        %%xmm3,%%xmm1                 \n"
      "pmulhuw     %%xmm5,%%xmm2                 \n"
      "pmulhuw     %%xmm5,%%xmm1                 \n"
      "psllw       $0x8,%%xmm1                   \n"
      "por         %%xmm2,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pand        %%xmm4,%%xmm0                 \n"
      "psraw       $0x8,%%xmm2                   \n"
      "pmulhuw     %%xmm6,%%xmm0                 \n"
      "pand        %%xmm7,%%xmm2                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqa      %%xmm1,%%xmm2                 \n"
      "punpcklbw   %%xmm0,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm2                 \n"
      "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
      "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

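// ARGB4444 to ARGB: each 4-bit channel n expands to (n << 4) | n. The
// 0x0f0f0f0f mask and its shifted copy split low and high nibbles, which are
// replicated with shifts and interleaved back with punpcklbw/punpckhbw.
// (Editorial note describing the code below, not an upstream comment.)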
void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0xf0f0f0f,%%eax              \n"
      "movd        %%eax,%%xmm4                  \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "pslld       $0x4,%%xmm5                   \n"
      "sub         %0,%1                         \n"
      "sub         %0,%1                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pand        %%xmm4,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "psllw       $0x4,%%xmm1                   \n"
      "psrlw       $0x4,%%xmm3                   \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm3,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "movdqu      %%xmm0,0x00(%1,%0,2)          \n"
      "movdqu      %%xmm1,0x10(%1,%0,2)          \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa      %3,%%xmm6                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "lea         0x40(%0),%0                   \n"
      "pshufb      %%xmm6,%%xmm0                 \n"
      "pshufb      %%xmm6,%%xmm1                 \n"
      "pshufb      %%xmm6,%%xmm2                 \n"
      "pshufb      %%xmm6,%%xmm3                 \n"
      "movdqa      %%xmm1,%%xmm4                 \n"
      "psrldq      $0x4,%%xmm1                   \n"
      "pslldq      $0xc,%%xmm4                   \n"
      "movdqa      %%xmm2,%%xmm5                 \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pslldq      $0x8,%%xmm5                   \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "psrldq      $0x8,%%xmm2                   \n"
      "pslldq      $0x4,%%xmm3                   \n"
      "por         %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "lea         0x30(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskARGBToRGB24)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa      %3,%%xmm6                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "lea         0x40(%0),%0                   \n"
      "pshufb      %%xmm6,%%xmm0                 \n"
      "pshufb      %%xmm6,%%xmm1                 \n"
      "pshufb      %%xmm6,%%xmm2                 \n"
      "pshufb      %%xmm6,%%xmm3                 \n"
      "movdqa      %%xmm1,%%xmm4                 \n"
      "psrldq      $0x4,%%xmm1                   \n"
      "pslldq      $0xc,%%xmm4                   \n"
      "movdqa      %%xmm2,%%xmm5                 \n"
      "por         %%xmm4,%%xmm0                 \n"
      "pslldq      $0x8,%%xmm5                   \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "por         %%xmm5,%%xmm1                 \n"
      "psrldq      $0x8,%%xmm2                   \n"
      "pslldq      $0x4,%%xmm3                   \n"
      "por         %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "movdqu      %%xmm2,0x20(%1)               \n"
      "lea         0x30(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskARGBToRAW)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTORGB24ROW_AVX2
// vpermd for 12+12 to 24
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};

void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6                  \n"
      "vmovdqa     %4,%%ymm7                     \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "lea         0x80(%0),%0                   \n"
      "vpshufb     %%ymm6,%%ymm0,%%ymm0          \n"  // xxx0yyy0
      "vpshufb     %%ymm6,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm6,%%ymm3,%%ymm3          \n"
      "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // pack to 24 bytes
      "vpermd      %%ymm1,%%ymm7,%%ymm1          \n"
      "vpermd      %%ymm2,%%ymm7,%%ymm2          \n"
      "vpermd      %%ymm3,%%ymm7,%%ymm3          \n"
      "vpermq      $0x3f,%%ymm1,%%ymm4           \n"  // combine 24 + 8
      "vpor        %%ymm4,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vpermq      $0xf9,%%ymm1,%%ymm1           \n"  // combine 16 + 16
      "vpermq      $0x4f,%%ymm2,%%ymm4           \n"
      "vpor        %%ymm4,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "vpermq      $0xfe,%%ymm2,%%ymm2           \n"  // combine 8 + 24
      "vpermq      $0x93,%%ymm3,%%ymm3           \n"
      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
      "vmovdqu     %%ymm2,0x40(%1)               \n"
      "lea         0x60(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),                     // %0
        "+r"(dst),                     // %1
        "+r"(width)                    // %2
      : "m"(kShuffleMaskARGBToRGB24),  // %3
        "m"(kPermdRGB24_AVX)           // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
// Shuffle table for converting ARGBToRGB24
static const ulvec8 kPermARGBToRGB24_0 = {
    0u,  1u,  2u,  4u,  5u,  6u,  8u,  9u,  10u, 12u, 13u,
    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
static const ulvec8 kPermARGBToRGB24_1 = {
    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
static const ulvec8 kPermARGBToRGB24_2 = {
    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};

void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vmovdqa     %3,%%ymm5                     \n"
      "vmovdqa     %4,%%ymm6                     \n"
      "vmovdqa     %5,%%ymm7                     \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "lea         0x80(%0),%0                   \n"
      "vpermt2b    %%ymm1,%%ymm5,%%ymm0          \n"
      "vpermt2b    %%ymm2,%%ymm6,%%ymm1          \n"
      "vpermt2b    %%ymm3,%%ymm7,%%ymm2          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "vmovdqu     %%ymm2,0x40(%1)               \n"
      "lea         0x60(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),                // %0
        "+r"(dst),                // %1
        "+r"(width)               // %2
      : "m"(kPermARGBToRGB24_0),  // %3
        "m"(kPermARGBToRGB24_1),  // %4
        "m"(kPermARGBToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
}
#endif

#ifdef HAS_ARGBTORAWROW_AVX2
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6                  \n"
      "vmovdqa     %4,%%ymm7                     \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "lea         0x80(%0),%0                   \n"
      "vpshufb     %%ymm6,%%ymm0,%%ymm0          \n"  // xxx0yyy0
      "vpshufb     %%ymm6,%%ymm1,%%ymm1          \n"
      "vpshufb     %%ymm6,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm6,%%ymm3,%%ymm3          \n"
      "vpermd      %%ymm0,%%ymm7,%%ymm0          \n"  // pack to 24 bytes
      "vpermd      %%ymm1,%%ymm7,%%ymm1          \n"
      "vpermd      %%ymm2,%%ymm7,%%ymm2          \n"
      "vpermd      %%ymm3,%%ymm7,%%ymm3          \n"
      "vpermq      $0x3f,%%ymm1,%%ymm4           \n"  // combine 24 + 8
      "vpor        %%ymm4,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vpermq      $0xf9,%%ymm1,%%ymm1           \n"  // combine 16 + 16
      "vpermq      $0x4f,%%ymm2,%%ymm4           \n"
      "vpor        %%ymm4,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "vpermq      $0xfe,%%ymm2,%%ymm2           \n"  // combine 8 + 24
      "vpermq      $0x93,%%ymm3,%%ymm3           \n"
      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
      "vmovdqu     %%ymm2,0x40(%1)               \n"
      "lea         0x60(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),                   // %0
        "+r"(dst),                   // %1
        "+r"(width)                  // %2
      : "m"(kShuffleMaskARGBToRAW),  // %3
        "m"(kPermdRGB24_AVX)         // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "psrld       $0x1b,%%xmm3                  \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrld       $0x1a,%%xmm4                  \n"
      "pslld       $0x5,%%xmm4                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0xb,%%xmm5                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pslld       $0x8,%%xmm0                   \n"
      "psrld       $0x3,%%xmm1                   \n"
      "psrld       $0x5,%%xmm2                   \n"
      "psrad       $0x10,%%xmm0                  \n"
      "pand        %%xmm3,%%xmm1                 \n"
      "pand        %%xmm4,%%xmm2                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "por         %%xmm2,%%xmm1                 \n"
      "por         %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "movd        %3,%%xmm6                     \n"
      "punpcklbw   %%xmm6,%%xmm6                 \n"
      "movdqa      %%xmm6,%%xmm7                 \n"
      "punpcklwd   %%xmm6,%%xmm6                 \n"
      "punpckhwd   %%xmm7,%%xmm7                 \n"
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "psrld       $0x1b,%%xmm3                  \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrld       $0x1a,%%xmm4                  \n"
      "pslld       $0x5,%%xmm4                   \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0xb,%%xmm5                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "paddusb     %%xmm6,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "pslld       $0x8,%%xmm0                   \n"
      "psrld       $0x3,%%xmm1                   \n"
      "psrld       $0x5,%%xmm2                   \n"
      "psrad       $0x10,%%xmm0                  \n"
      "pand        %%xmm3,%%xmm1                 \n"
      "pand        %%xmm4,%%xmm2                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "por         %%xmm2,%%xmm1                 \n"
      "por         %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "vbroadcastss %3,%%xmm6                    \n"
      "vpunpcklbw  %%xmm6,%%xmm6,%%xmm6          \n"
      "vpermq      $0xd8,%%ymm6,%%ymm6           \n"
      "vpunpcklwd  %%ymm6,%%ymm6,%%ymm6          \n"
      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"
      "vpsrld      $0x1b,%%ymm3,%%ymm3           \n"
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrld      $0x1a,%%ymm4,%%ymm4           \n"
      "vpslld      $0x5,%%ymm4,%%ymm4            \n"
      "vpslld      $0xb,%%ymm3,%%ymm5            \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vpaddusb    %%ymm6,%%ymm0,%%ymm0          \n"
      "vpsrld      $0x5,%%ymm0,%%ymm2            \n"
      "vpsrld      $0x3,%%ymm0,%%ymm1            \n"
      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"
      "vpand       %%ymm4,%%ymm2,%%ymm2          \n"
      "vpand       %%ymm3,%%ymm1,%%ymm1          \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpor        %%ymm2,%%ymm1,%%ymm1          \n"
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"
      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "lea         0x20(%0),%0                   \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrld       $0x1b,%%xmm4                  \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "pslld       $0x5,%%xmm5                   \n"
      "movdqa      %%xmm4,%%xmm6                 \n"
      "pslld       $0xa,%%xmm6                   \n"
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "pslld       $0xf,%%xmm7                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm3                 \n"
      "psrad       $0x10,%%xmm0                  \n"
      "psrld       $0x3,%%xmm1                   \n"
      "psrld       $0x6,%%xmm2                   \n"
      "psrld       $0x9,%%xmm3                   \n"
      "pand        %%xmm7,%%xmm0                 \n"
      "pand        %%xmm4,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm2                 \n"
      "pand        %%xmm6,%%xmm3                 \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm3,%%xmm2                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}

void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psllw       $0xc,%%xmm4                   \n"
      "movdqa      %%xmm4,%%xmm3                 \n"
      "psrlw       $0x8,%%xmm3                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm3,%%xmm0                 \n"
      "pand        %%xmm4,%%xmm1                 \n"
      "psrlq       $0x4,%%xmm0                   \n"
      "psrlq       $0x8,%%xmm1                   \n"
      "por         %%xmm1,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "lea         0x10(%0),%0                   \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_RGB24TOARGBROW_SSSE3

/*

ARGBToAR30Row:

Red Blue
With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
wanted for the blue channel. The red needs to be shifted 4 left, so multiply
by (1024+4)*16 for red.

Alpha Green
Alpha and Green are already in the high bits, so vpand can zero out the other
bits, keeping just the 2 upper bits of alpha and the 8 bits of green. The same
multiplier, (1024+4), could be used for green, putting the 10 bit green in the
lsb. Alpha needs a simple multiplier to shift it into position: it wants a gap
of 10 bits above the green. Green is 10 bits, so there are 6 bits left in the
low short; 4 more are needed, so a multiplier of 4 gets the 2 alpha bits into
the upper 16 bits, and a further shift of 4 is a multiply by 16, giving
(4*16) = 64. Then shift the result left 10 to position the A and G channels.
*/
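
// Worked example of the (1024+4) multiplier trick described above (an
// illustrative sketch, not part of libyuv; the helper name is hypothetical):
// an 8-bit value v in the high byte of a 16-bit lane is (v << 8), and
// pmulhuw keeps the top 16 bits of the 32-bit product, so
//   (((v << 8) * 1028) >> 16) == ((v * 1028) >> 8) == (v << 2) | (v >> 6),
// which expands 8 bits to 10 bits exactly (0 -> 0, 255 -> 1023).
static inline uint16_t Expand8To10(uint8_t v) {
  return (uint16_t)((((uint32_t)v << 8) * 1028) >> 16);
}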

// Shuffle table for isolating the B and R channels into the high byte of
// each 16 bit lane (ARGB to AR30).
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u,  128u, 4u,  128u, 6u,
                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};

// As above with R and B swapped (ABGR to AR30).
static const uvec8 kShuffleBR30 = {128u, 2u,  128u, 0u, 128u, 6u,  128u, 4u,
                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};

static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
static const uint32_t kMaskRB10 = 0x3ff003ff;
static const uint32_t kMaskAG10 = 0xc000ff00;
static const uint32_t kMulAG10 = 64 * 65536 + 1028;

void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm2                     \n"  // shuffler for RB
      "movd        %4,%%xmm3                     \n"  // multiplier for RB
      "movd        %5,%%xmm4                     \n"  // mask for R10 B10
      "movd        %6,%%xmm5                     \n"  // mask for AG
      "movd        %7,%%xmm6                     \n"  // multiplier for AG
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "sub         %0,%1                         \n"

      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"  // R0B0
      "pand        %%xmm5,%%xmm0                 \n"  // A0G0
      "pmulhuw     %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
      "pmulhuw     %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
      "pand        %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
      "pslld       $10,%%xmm0                    \n"  // A2 x10 G10 x10
      "por         %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
      "movdqu      %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
      "add         $0x10,%0                      \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm2                     \n"  // shuffler for RB
      "movd        %4,%%xmm3                     \n"  // multiplier for RB
      "movd        %5,%%xmm4                     \n"  // mask for R10 B10
      "movd        %6,%%xmm5                     \n"  // mask for AG
      "movd        %7,%%xmm6                     \n"  // multiplier for AG
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "sub         %0,%1                         \n"

      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // fetch 4 ABGR pixels
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"  // R0B0
      "pand        %%xmm5,%%xmm0                 \n"  // A0G0
      "pmulhuw     %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
      "pmulhuw     %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
      "pand        %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
      "pslld       $10,%%xmm0                    \n"  // A2 x10 G10 x10
      "por         %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
      "movdqu      %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
      "add         $0x10,%0                      \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3  reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTOAR30ROW_AVX2
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3                    \n"  // multiplier for RB
      "vbroadcastss %5,%%ymm4                    \n"  // mask for R10 B10
      "vbroadcastss %6,%%ymm5                    \n"  // mask for AG
      "vbroadcastss %7,%%ymm6                    \n"  // multiplier for AG
      "sub         %0,%1                         \n"

      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 8 ARGB pixels
      "vpshufb     %%ymm2,%%ymm0,%%ymm1          \n"  // R0B0
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"  // A0G0
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"  // X2 R16 X4  B10
      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"  // X10 A2 X10 G10
      "vpand       %%ymm4,%%ymm1,%%ymm1          \n"  // X2 R10 X10 B10
      "vpslld      $10,%%ymm0,%%ymm0             \n"  // A2 x10 G10 x10
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // A2 R10 G10 B10
      "vmovdqu     %%ymm0,(%1,%0)                \n"  // store 8 AR30 pixels
      "add         $0x20,%0                      \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#ifdef HAS_ABGRTOAR30ROW_AVX2
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3                    \n"  // multiplier for RB
      "vbroadcastss %5,%%ymm4                    \n"  // mask for R10 B10
      "vbroadcastss %6,%%ymm5                    \n"  // mask for AG
      "vbroadcastss %7,%%ymm6                    \n"  // multiplier for AG
      "sub         %0,%1                         \n"

      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // fetch 8 ABGR pixels
      "vpshufb     %%ymm2,%%ymm0,%%ymm1          \n"  // R0B0
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"  // A0G0
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"  // X2 R16 X4  B10
      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"  // X10 A2 X10 G10
      "vpand       %%ymm4,%%ymm1,%%ymm1          \n"  // X2 R10 X10 B10
      "vpslld      $10,%%ymm0,%%ymm0             \n"  // A2 x10 G10 x10
      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"  // A2 R10 G10 B10
      "vmovdqu     %%ymm0,(%1,%0)                \n"  // store 8 AR30 pixels
      "add         $0x20,%0                      \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3  reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

// clang-format off

// TODO(mraptis): Consider passing R, G, B multipliers as parameters.
// The round parameter is the register containing the value to add before
// the shift.
#define RGBTOY(round)                            \
  "1:                                        \n" \
  "movdqu    (%0),%%xmm0                     \n" \
  "movdqu    0x10(%0),%%xmm1                 \n" \
  "movdqu    0x20(%0),%%xmm2                 \n" \
  "movdqu    0x30(%0),%%xmm3                 \n" \
  "psubb     %%xmm5,%%xmm0                   \n" \
  "psubb     %%xmm5,%%xmm1                   \n" \
  "psubb     %%xmm5,%%xmm2                   \n" \
  "psubb     %%xmm5,%%xmm3                   \n" \
  "movdqu    %%xmm4,%%xmm6                   \n" \
  "pmaddubsw %%xmm0,%%xmm6                   \n" \
  "movdqu    %%xmm4,%%xmm0                   \n" \
  "pmaddubsw %%xmm1,%%xmm0                   \n" \
  "movdqu    %%xmm4,%%xmm1                   \n" \
  "pmaddubsw %%xmm2,%%xmm1                   \n" \
  "movdqu    %%xmm4,%%xmm2                   \n" \
  "pmaddubsw %%xmm3,%%xmm2                   \n" \
  "lea       0x40(%0),%0                     \n" \
  "phaddw    %%xmm0,%%xmm6                   \n" \
  "phaddw    %%xmm2,%%xmm1                   \n" \
  "prefetcht0 1280(%0)                       \n" \
  "paddw     %%" #round ",%%xmm6             \n" \
  "paddw     %%" #round ",%%xmm1             \n" \
  "psrlw     $0x8,%%xmm6                     \n" \
  "psrlw     $0x8,%%xmm1                     \n" \
  "packuswb  %%xmm1,%%xmm6                   \n" \
  "movdqu    %%xmm6,(%1)                     \n" \
  "lea       0x10(%1),%1                     \n" \
  "sub       $0x10,%2                        \n" \
  "jg        1b                              \n"

#define RGBTOY_AVX2(round)                                       \
  "1:                                        \n"                 \
  "vmovdqu    (%0),%%ymm0                    \n"                 \
  "vmovdqu    0x20(%0),%%ymm1                \n"                 \
  "vmovdqu    0x40(%0),%%ymm2                \n"                 \
  "vmovdqu    0x60(%0),%%ymm3                \n"                 \
  "vpsubb     %%ymm5, %%ymm0, %%ymm0         \n"                 \
  "vpsubb     %%ymm5, %%ymm1, %%ymm1         \n"                 \
  "vpsubb     %%ymm5, %%ymm2, %%ymm2         \n"                 \
  "vpsubb     %%ymm5, %%ymm3, %%ymm3         \n"                 \
  "vpmaddubsw %%ymm0,%%ymm4,%%ymm0           \n"                 \
  "vpmaddubsw %%ymm1,%%ymm4,%%ymm1           \n"                 \
  "vpmaddubsw %%ymm2,%%ymm4,%%ymm2           \n"                 \
  "vpmaddubsw %%ymm3,%%ymm4,%%ymm3           \n"                 \
  "lea       0x80(%0),%0                     \n"                 \
  "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n" /* mutates. */  \
  "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"                 \
  "prefetcht0 1280(%0)                       \n"                 \
  "vpaddw     %%" #round ",%%ymm0,%%ymm0     \n" /* Add .5 for rounding. */ \
  "vpaddw     %%" #round ",%%ymm2,%%ymm2     \n"                 \
  "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"                 \
  "vpsrlw     $0x8,%%ymm2,%%ymm2             \n"                 \
  "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n" /* mutates. */  \
  "vpermd     %%ymm0,%%ymm6,%%ymm0           \n" /* unmutate. */ \
  "vmovdqu    %%ymm0,(%1)                    \n"                 \
  "lea       0x20(%1),%1                     \n"                 \
  "sub       $0x20,%2                        \n"                 \
  "jg        1b                              \n"                 \
  "vzeroupper                                \n"

// clang-format on

#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBTOYJROW_SSSE3

#ifdef HAS_RGBATOYJROW_SSSE3
// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYJRow but with RGBA-ordered coefficients, no add 16.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_RGBATOYJROW_SSSE3

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd index vector to restore dword order after vphaddw + vpackuswb,
// which each interleave the two 128-bit lanes.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vbroadcastf128 %4,%%ymm5                  \n"
      "vbroadcastf128 %5,%%ymm7                  \n"
      "vmovdqu     %6,%%ymm6                     \n"

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vbroadcastf128 %4,%%ymm5                  \n"
      "vbroadcastf128 %5,%%ymm7                  \n"
      "vmovdqu     %6,%%ymm6                     \n"

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_abgr),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kABGRToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vbroadcastf128 %4,%%ymm5                  \n"
      "vmovdqu     %5,%%ymm6                     \n"

      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kPermdARGBToY_AVX)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYJROW_AVX2

#ifdef HAS_RGBATOYJROW_AVX2
// Convert 32 RGBA pixels (128 bytes) to 32 Y values.
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vbroadcastf128 %4,%%ymm5                  \n"
      "vmovdqu     %5,%%ymm6                     \n"

      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_rgba),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kRGBAToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kPermdARGBToY_AVX)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_RGBATOYJROW_AVX2

#ifdef HAS_ARGBTOUVROW_SSSE3
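// Convert 16 ARGB pixels from two rows to 8 chroma pairs, sampled 2x2:
// pavgb averages vertically with the next row, then shufps $0x88/$0xdd
// split even and odd pixels which pavgb averages horizontally.  pmaddubsw
// against kARGBToU/kARGBToV, psraw $0x8 and a paddb of kAddUV128 produce
// biased chroma, roughly U = ((112*B - 74*G - 38*R) >> 8) + 128 and
// V = ((-18*B - 94*G + 112*R) >> 8) + 128 per 2x2 block.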
void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb0),                   // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kARGBToV),                     // %5
        "m"(kARGBToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVROW_SSSE3

#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb word shuffler to restore sequential order after vphaddw +
// vpacksswb (applied per 128-bit lane).
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"
      "vbroadcastf128 %6,%%ymm6                  \n"
      "vbroadcastf128 %7,%%ymm7                  \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
      "lea         0x80(%0),%0                   \n"
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"

      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %8,%%ymm0,%%ymm0              \n"
      "vpaddb      %%ymm5,%%ymm0,%%ymm0          \n"

      "vextractf128 $0x0,%%ymm0,(%1)             \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb0),                   // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kAddUV128),                    // %5
        "m"(kARGBToV),                     // %6
        "m"(kARGBToU),                     // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVROW_AVX2

#ifdef HAS_ABGRTOUVROW_AVX2
void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"
      "vbroadcastf128 %6,%%ymm6                  \n"
      "vbroadcastf128 %7,%%ymm7                  \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
      "lea         0x80(%0),%0                   \n"
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"

      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %8,%%ymm0,%%ymm0              \n"
      "vpaddb      %%ymm5,%%ymm0,%%ymm0          \n"

      "vextractf128 $0x0,%%ymm0,(%1)             \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_abgr0),                   // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kAddUV128),                    // %5
        "m"(kABGRToV),                     // %6
        "m"(kABGRToU),                     // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOUVROW_AVX2

#ifdef HAS_ARGBTOUVJROW_AVX2
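// Full-range (JPeg) UV variant: same flow as ARGBToUVRow_AVX2 but with the
// UJ/VJ coefficients, and the +128 offset and rounding are folded into a
// single vpaddw of kSub128 (0x8080 per word = (128 << 8) + 0x80) before the
// arithmetic shift, instead of a vpaddb afterwards.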
void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5                  \n"
      "vbroadcastf128 %6,%%ymm6                  \n"
      "vbroadcastf128 %7,%%ymm7                  \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x40(%0),%%ymm2               \n"
      "vmovdqu     0x60(%0),%%ymm3               \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2   \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3   \n"
      "lea         0x80(%0),%0                   \n"
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0    \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4    \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2    \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2          \n"

      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3          \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2          \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0            \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpshufb     %8,%%ymm0,%%ymm0              \n"

      "vextractf128 $0x0,%%ymm0,(%1)             \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb0),                   // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kSub128),                      // %5
        "m"(kARGBToVJ),                    // %6
        "m"(kARGBToUJ),                    // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVJROW_AVX2

#ifdef HAS_ARGBTOUVJROW_SSSE3
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
                        int src_stride_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "paddw       %%xmm5,%%xmm0                 \n"
      "paddw       %%xmm5,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb0),                   // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kARGBToVJ),                    // %5
        "m"(kARGBToUJ),                    // %6
        "m"(kSub128)                       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVJROW_SSSE3

#ifdef HAS_ARGBTOUV444ROW_SSSE3
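// 4:4:4 variant: one U and one V per pixel, no subsampling or averaging.
// The same 16 pixels are read twice, first multiplied by the U coefficients
// and written to dst_u, then by the V coefficients and written to dst_v.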
void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  asm volatile(
      "movdqa      %4,%%xmm3                     \n"
      "movdqa      %5,%%xmm4                     \n"
      "movdqa      %6,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm6                 \n"
      "phaddw      %%xmm1,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm2                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm2                   \n"
      "packsswb    %%xmm2,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "pmaddubsw   %%xmm3,%%xmm0                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm1,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm2                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm2                   \n"
      "packsswb    %%xmm2,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "lea         0x40(%0),%0                   \n"
      "movdqu      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+rm"(width)     // %3
      : "m"(kARGBToV),   // %4
        "m"(kARGBToU),   // %5
        "m"(kAddUV128)   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBTOUV444ROW_SSSE3

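// The BGRA, ABGR and RGBA variants below use the same algorithms as the
// ARGB versions; only the coefficient tables differ, permuted to match
// each format's byte order.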
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_bgra),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kBGRAToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
                       int src_stride_bgra,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_bgra0),                   // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_bgra)),  // %4
        "m"(kBGRAToV),                     // %5
        "m"(kBGRAToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kABGRToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
                       int src_stride_abgr,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_abgr0),                   // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kABGRToV),                     // %5
        "m"(kABGRToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
                       int src_stride_rgba,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3                     \n"
      "movdqa      %6,%%xmm4                     \n"
      "movdqa      %7,%%xmm5                     \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm1                 \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqu      0x30(%0),%%xmm6               \n"
      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"

      "lea         0x40(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm7                 \n"
      "shufps      $0x88,%%xmm1,%%xmm0           \n"
      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm7                 \n"
      "shufps      $0x88,%%xmm6,%%xmm2           \n"
      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "phaddw      %%xmm2,%%xmm0                 \n"
      "phaddw      %%xmm6,%%xmm1                 \n"
      "psraw       $0x8,%%xmm0                   \n"
      "psraw       $0x8,%%xmm1                   \n"
      "packsswb    %%xmm1,%%xmm0                 \n"
      "paddb       %%xmm5,%%xmm0                 \n"
      "movlps      %%xmm0,(%1)                   \n"
      "movhps      %%xmm0,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_rgba0),                   // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_rgba)),  // %4
        "m"(kRGBAToV),                     // %5
        "m"(kRGBAToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

// Read 8 UV from 444
#define READYUV444                                                \
  "movq       (%[u_buf]),%%xmm0                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422                                                \
  "movd       (%[u_buf]),%%xmm0                               \n" \
  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
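// In READYUV422 above, "punpcklwd %%xmm0,%%xmm0" duplicates each UV pair so
// 4 chroma samples cover 8 pixels; in both readers "punpcklbw %%xmm4,%%xmm4"
// replicates every Y byte into both halves of its word (y * 0x101), leaving
// luma in 8.8 fixed point for the pmulhuw against the Y gain in YUVTORGB16
// below.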

// Read 4 UV from 422 10 bit, upsample to 8 UV
// TODO(fbarchard): Consider shufb to replace pack/unpack
// TODO(fbarchard): Consider pmulhuw to replace psraw
// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
#define READYUV210                                                \
  "movq       (%[u_buf]),%%xmm0                               \n" \
  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
  "punpcklwd  %%xmm1,%%xmm0                                   \n" \
  "psraw      $0x2,%%xmm0                                     \n" \
  "packuswb   %%xmm0,%%xmm0                                   \n" \
  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
  "movdqu     (%[y_buf]),%%xmm4                               \n" \
  "psllw      $0x6,%%xmm4                                     \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                         \n"
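// For the 10-bit input above, UV words are scaled down to 8 bits
// (psraw $0x2 then packuswb) before the usual duplication, while Y keeps
// its extra precision: psllw $0x6 moves the 10-bit luma to the top of the
// 16-bit word, the same scale the 8-bit paths reach by byte replication.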

// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
#define READYUVA422                                               \
  "movd       (%[u_buf]),%%xmm0                               \n" \
  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
  "movq       (%[a_buf]),%%xmm5                               \n" \
  "lea        0x8(%[a_buf]),%[a_buf]                          \n"

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12                                                  \
  "movq       (%[uv_buf]),%%xmm0                              \n" \
  "lea        0x8(%[uv_buf]),%[uv_buf]                        \n" \
  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"

// Read 4 VU from NV21, upsample to 8 UV
#define READNV21                                                  \
  "movq       (%[vu_buf]),%%xmm0                              \n" \
  "lea        0x8(%[vu_buf]),%[vu_buf]                        \n" \
  "pshufb     %[kShuffleNV21], %%xmm0                         \n" \
  "movq       (%[y_buf]),%%xmm4                               \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        0x8(%[y_buf]),%[y_buf]                          \n"

// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
#define READYUY2                                                  \
  "movdqu     (%[yuy2_buf]),%%xmm4                            \n" \
  "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n" \
  "movdqu     (%[yuy2_buf]),%%xmm0                            \n" \
  "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n" \
  "lea        0x10(%[yuy2_buf]),%[yuy2_buf]                   \n"

// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
#define READUYVY                                                  \
  "movdqu     (%[uyvy_buf]),%%xmm4                            \n" \
  "pshufb     %[kShuffleUYVYY], %%xmm4                        \n" \
  "movdqu     (%[uyvy_buf]),%%xmm0                            \n" \
  "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n" \
  "lea        0x10(%[uyvy_buf]),%[uyvy_buf]                   \n"

#if defined(__x86_64__)
#define YUVTORGB_SETUP(yuvconstants)                              \
  "movdqa     (%[yuvconstants]),%%xmm8                        \n" \
  "movdqa     32(%[yuvconstants]),%%xmm9                      \n" \
  "movdqa     64(%[yuvconstants]),%%xmm10                     \n" \
  "movdqa     96(%[yuvconstants]),%%xmm11                     \n" \
  "movdqa     128(%[yuvconstants]),%%xmm12                    \n" \
  "movdqa     160(%[yuvconstants]),%%xmm13                    \n" \
  "movdqa     192(%[yuvconstants]),%%xmm14                    \n"
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants)                                  \
  "movdqa     %%xmm0,%%xmm1                                   \n" \
  "movdqa     %%xmm0,%%xmm2                                   \n" \
  "movdqa     %%xmm0,%%xmm3                                   \n" \
  "movdqa     %%xmm11,%%xmm0                                  \n" \
  "pmaddubsw  %%xmm8,%%xmm1                                   \n" \
  "psubw      %%xmm1,%%xmm0                                   \n" \
  "movdqa     %%xmm12,%%xmm1                                  \n" \
  "pmaddubsw  %%xmm9,%%xmm2                                   \n" \
  "psubw      %%xmm2,%%xmm1                                   \n" \
  "movdqa     %%xmm13,%%xmm2                                  \n" \
  "pmaddubsw  %%xmm10,%%xmm3                                  \n" \
  "psubw      %%xmm3,%%xmm2                                   \n" \
  "pmulhuw    %%xmm14,%%xmm4                                  \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm1                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n"
#define YUVTORGB_REGS \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants)                                  \
  "movdqa     %%xmm0,%%xmm1                                   \n" \
  "movdqa     %%xmm0,%%xmm2                                   \n" \
  "movdqa     %%xmm0,%%xmm3                                   \n" \
  "movdqa     96(%[yuvconstants]),%%xmm0                      \n" \
  "pmaddubsw  (%[yuvconstants]),%%xmm1                        \n" \
  "psubw      %%xmm1,%%xmm0                                   \n" \
  "movdqa     128(%[yuvconstants]),%%xmm1                     \n" \
  "pmaddubsw  32(%[yuvconstants]),%%xmm2                      \n" \
  "psubw      %%xmm2,%%xmm1                                   \n" \
  "movdqa     160(%[yuvconstants]),%%xmm2                     \n" \
  "pmaddubsw  64(%[yuvconstants]),%%xmm3                      \n" \
  "psubw      %%xmm3,%%xmm2                                   \n" \
  "pmulhuw    192(%[yuvconstants]),%%xmm4                     \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm1                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n"
#define YUVTORGB_REGS
#endif
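
// The yuvconstants offsets referenced above (0, 32, 64, 96, 128, 160, 192)
// correspond to the members of struct YuvConstants (kUVToB, kUVToG, kUVToR,
// kUVBiasB, kUVBiasG, kUVBiasR, kYToRgb).  Per pixel, YUVTORGB16 leaves
// 16-bit values carrying 6 fractional bits, roughly
//   b = bias_b - (u * ub + v * vb) + y16
//   g = bias_g - (u * ug + v * vg) + y16
//   r = bias_r - (u * ur + v * vr) + y16
// with y16 = (y * 0x101 * yg) >> 16; YUVTORGB below shifts these down by 6
// and saturates to 8 bits.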

#define YUVTORGB(yuvconstants)                                    \
  YUVTORGB16(yuvconstants)                                        \
  "psraw      $0x6,%%xmm0                                     \n" \
  "psraw      $0x6,%%xmm1                                     \n" \
  "psraw      $0x6,%%xmm2                                     \n" \
  "packuswb   %%xmm0,%%xmm0                                   \n" \
  "packuswb   %%xmm1,%%xmm1                                   \n" \
  "packuswb   %%xmm2,%%xmm2                                   \n"

// Store 8 ARGB values.
#define STOREARGB                                                  \
  "punpcklbw  %%xmm1,%%xmm0                                    \n" \
  "punpcklbw  %%xmm5,%%xmm2                                    \n" \
  "movdqa     %%xmm0,%%xmm1                                    \n" \
  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
  "punpckhwd  %%xmm2,%%xmm1                                    \n" \
  "movdqu     %%xmm0,(%[dst_argb])                             \n" \
  "movdqu     %%xmm1,0x10(%[dst_argb])                         \n" \
  "lea        0x20(%[dst_argb]), %[dst_argb]                   \n"

// Store 8 RGBA values.
#define STORERGBA                                                  \
  "pcmpeqb   %%xmm5,%%xmm5                                     \n" \
  "punpcklbw %%xmm2,%%xmm1                                     \n" \
  "punpcklbw %%xmm0,%%xmm5                                     \n" \
  "movdqa    %%xmm5,%%xmm0                                     \n" \
  "punpcklwd %%xmm1,%%xmm5                                     \n" \
  "punpckhwd %%xmm1,%%xmm0                                     \n" \
  "movdqu    %%xmm5,(%[dst_rgba])                              \n" \
  "movdqu    %%xmm0,0x10(%[dst_rgba])                          \n" \
  "lea       0x20(%[dst_rgba]),%[dst_rgba]                     \n"

// Store 8 AR30 values.
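// STOREAR30 packs the 16-bit YUVTORGB16 results (6 fractional bits) into
// 2:10:10:10 AR30: psraw $0x4 rescales each channel to the 10-bit range,
// pminsw/pmaxsw clamp it between the 1023 and 0 constants kept in xmm7 and
// xmm6, and the word interleaves and shifts assemble
// b | (g << 10) | (r << 20) | (alpha << 30) per dword.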
#define STOREAR30                                                  \
  "psraw      $0x4,%%xmm0                                      \n" \
  "psraw      $0x4,%%xmm1                                      \n" \
  "psraw      $0x4,%%xmm2                                      \n" \
  "pminsw     %%xmm7,%%xmm0                                    \n" \
  "pminsw     %%xmm7,%%xmm1                                    \n" \
  "pminsw     %%xmm7,%%xmm2                                    \n" \
  "pmaxsw     %%xmm6,%%xmm0                                    \n" \
  "pmaxsw     %%xmm6,%%xmm1                                    \n" \
  "pmaxsw     %%xmm6,%%xmm2                                    \n" \
  "psllw      $0x4,%%xmm2                                      \n" \
  "movdqa     %%xmm0,%%xmm3                                    \n" \
  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
  "punpckhwd  %%xmm2,%%xmm3                                    \n" \
  "movdqa     %%xmm1,%%xmm2                                    \n" \
  "punpcklwd  %%xmm5,%%xmm1                                    \n" \
  "punpckhwd  %%xmm5,%%xmm2                                    \n" \
  "pslld      $0xa,%%xmm1                                      \n" \
  "pslld      $0xa,%%xmm2                                      \n" \
  "por        %%xmm1,%%xmm0                                    \n" \
  "por        %%xmm2,%%xmm3                                    \n" \
  "movdqu     %%xmm0,(%[dst_ar30])                             \n" \
  "movdqu     %%xmm3,0x10(%[dst_ar30])                         \n" \
  "lea        0x20(%[dst_ar30]), %[dst_ar30]                   \n"

void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
                                 uint8_t* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "movdqa      %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
      "movdqa      %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
      "punpcklbw   %%xmm1,%%xmm0                 \n"
      "punpcklbw   %%xmm2,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklwd   %%xmm2,%%xmm0                 \n"
      "punpckhwd   %%xmm2,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm6,%%xmm1                 \n"
      "palignr     $0xc,%%xmm0,%%xmm1            \n"
      "movq        %%xmm0,(%[dst_rgb24])         \n"
      "movdqu      %%xmm1,0x8(%[dst_rgb24])      \n"
      "lea         0x18(%[dst_rgb24]),%[dst_rgb24] \n"
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"  // AR30 constants
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// 10 bit YUV to ARGB
void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV210
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// 10 bit YUV to AR30
void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $14,%%xmm5                    \n"
      "psllw       $4,%%xmm5                     \n"  // 2 alpha bits
      "pxor        %%xmm6,%%xmm6                 \n"  // 0 for min
      "pcmpeqb     %%xmm7,%%xmm7                 \n"
      "psrlw       $6,%%xmm7                     \n"  // 1023 for max

    LABELALIGN
      "1:                                        \n"
    READYUV210
    YUVTORGB16(yuvconstants)
    STOREAR30
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

#ifdef HAS_I422ALPHATOARGBROW_SSSE3
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     const uint8_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
      "subl        $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_SSSE3

void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READNV12
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* vu_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READNV21
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUY2
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READUYVY
    YUVTORGB(yuvconstants)
    STOREARGB
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGBA
      "sub         $0x8,%[width]                 \n"
      "jg          1b                            \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#endif  // HAS_I422TOARGBROW_SSSE3

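// Illustrative note (not part of the original source): each of the row
// functions above converts exactly one scanline of `width` pixels per call.
// A caller is expected to loop over rows and advance every plane by its own
// stride; a minimal sketch for a 4:2:2 source (stride names hypothetical,
// U/V advance on every row at 4:2:2):
//
//   for (int y = 0; y < height; ++y) {
//     I422ToARGBRow_SSSE3(src_y, src_u, src_v, dst_argb, yuvconstants, width);
//     src_y += src_stride_y;
//     src_u += src_stride_u;
//     src_v += src_stride_v;
//     dst_argb += dst_stride_argb;
//   }
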
// Read 16 UV from 444
#define READYUV444_AVX2                                               \
  "vmovdqu    (%[u_buf]),%%xmm0                                   \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"

// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2                                               \
  "vmovq      (%[u_buf]),%%xmm0                                   \n" \
  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"

// Read 8 UV from 210 10 bit, upsample to 16 UV
// TODO(fbarchard): Consider vshufb to replace pack/unpack
// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
#define READYUV210_AVX2                                            \
  "vmovdqu    (%[u_buf]),%%xmm0                                \n" \
  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
  "vpermq     $0xd8,%%ymm0,%%ymm0                              \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
  "vpunpcklwd %%ymm1,%%ymm0,%%ymm0                             \n" \
  "vpsraw     $0x2,%%ymm0,%%ymm0                               \n" \
  "vpackuswb  %%ymm0,%%ymm0,%%ymm0                             \n" \
  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                             \n" \
  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
  "vpsllw     $0x6,%%ymm4,%%ymm4                               \n" \
  "lea        0x20(%[y_buf]),%[y_buf]                          \n"

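// Illustrative note (not part of the original source): in READYUV210_AVX2 the
// 10-bit UV samples are narrowed and Y is widened. vpsraw $2 keeps the top
// 8 bits of each UV word (e.g. 0x3FF >> 2 = 0xFF), matching the 8-bit UV
// path; vpsllw $6 maps Y from 0..1023 to 0..65472, roughly the same 16-bit
// scale the 8-bit path gets by duplicating each byte (y * 257), so the same
// vpmulhuw luma multiply applies to both.
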
// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
#define READYUVA422_AVX2                                              \
  "vmovq      (%[u_buf]),%%xmm0                                   \n" \
  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n" \
  "vmovdqu    (%[a_buf]),%%xmm5                                   \n" \
  "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n" \
  "lea        0x10(%[a_buf]),%[a_buf]                             \n"

// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2                                                 \
  "vmovdqu    (%[uv_buf]),%%xmm0                                  \n" \
  "lea        0x10(%[uv_buf]),%[uv_buf]                           \n" \
  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"

// Read 8 VU from NV21, upsample to 16 UV.
#define READNV21_AVX2                                                 \
  "vmovdqu    (%[vu_buf]),%%xmm0                                  \n" \
  "lea        0x10(%[vu_buf]),%[vu_buf]                           \n" \
  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
  "vpshufb    %[kShuffleNV21], %%ymm0, %%ymm0                     \n" \
  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
  "lea        0x10(%[y_buf]),%[y_buf]                             \n"

// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2                                                 \
  "vmovdqu    (%[yuy2_buf]),%%ymm4                                \n" \
  "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n" \
  "vmovdqu    (%[yuy2_buf]),%%ymm0                                \n" \
  "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n" \
  "lea        0x20(%[yuy2_buf]),%[yuy2_buf]                       \n"

// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2                                                 \
  "vmovdqu    (%[uyvy_buf]),%%ymm4                                \n" \
  "vpshufb    %[kShuffleUYVYY], %%ymm4, %%ymm4                    \n" \
  "vmovdqu    (%[uyvy_buf]),%%ymm0                                \n" \
  "vpshufb    %[kShuffleUYVYUV], %%ymm0, %%ymm0                   \n" \
  "lea        0x20(%[uyvy_buf]),%[uyvy_buf]                       \n"

#if defined(__x86_64__)
#define YUVTORGB_SETUP_AVX2(yuvconstants)                            \
  "vmovdqa     (%[yuvconstants]),%%ymm8                          \n" \
  "vmovdqa     32(%[yuvconstants]),%%ymm9                        \n" \
  "vmovdqa     64(%[yuvconstants]),%%ymm10                       \n" \
  "vmovdqa     96(%[yuvconstants]),%%ymm11                       \n" \
  "vmovdqa     128(%[yuvconstants]),%%ymm12                      \n" \
  "vmovdqa     160(%[yuvconstants]),%%ymm13                      \n" \
  "vmovdqa     192(%[yuvconstants]),%%ymm14                      \n"

#define YUVTORGB16_AVX2(yuvconstants)                                 \
  "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n" \
  "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n" \
  "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n" \
  "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n" \
  "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n" \
  "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n" \
  "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n" \
  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"

#define YUVTORGB_REGS_AVX2 \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else  // Convert 16 pixels: 16 UV and 16 Y.

#define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB16_AVX2(yuvconstants)                                 \
  "vpmaddubsw  64(%[yuvconstants]),%%ymm0,%%ymm2                  \n" \
  "vpmaddubsw  32(%[yuvconstants]),%%ymm0,%%ymm1                  \n" \
  "vpmaddubsw  (%[yuvconstants]),%%ymm0,%%ymm0                    \n" \
  "vmovdqu     160(%[yuvconstants]),%%ymm3                        \n" \
  "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n" \
  "vmovdqu     128(%[yuvconstants]),%%ymm3                        \n" \
  "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n" \
  "vmovdqu     96(%[yuvconstants]),%%ymm3                         \n" \
  "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n" \
  "vpmulhuw    192(%[yuvconstants]),%%ymm4,%%ymm4                 \n" \
  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
#define YUVTORGB_REGS_AVX2
#endif

#define YUVTORGB_AVX2(yuvconstants)                                   \
  YUVTORGB16_AVX2(yuvconstants)                                       \
  "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
  "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
  "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
  "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
  "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
  "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"

// Store 16 ARGB values.
#define STOREARGB_AVX2                                                \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
  "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n" \
  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
  "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n" \
  "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n" \
  "vmovdqu    %%ymm1,(%[dst_argb])                                \n" \
  "vmovdqu    %%ymm0,0x20(%[dst_argb])                            \n" \
  "lea       0x40(%[dst_argb]), %[dst_argb]                       \n"

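// Illustrative note (not part of the original source): STOREARGB_AVX2 weaves
// the B, G and R bytes in ymm0/ymm1/ymm2 with the 0xFF alpha bytes prepared
// in ymm5 (vpcmpeqb) into little-endian BGRA byte order, i.e. per pixel:
//
//   dst_argb[0] = B;  dst_argb[1] = G;  dst_argb[2] = R;  dst_argb[3] = A;
//
// which reads back as 0xAARRGGBB when loaded as a uint32_t.
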
// Store 16 AR30 values.
#define STOREAR30_AVX2                                                \
  "vpsraw     $0x4,%%ymm0,%%ymm0                                  \n" \
  "vpsraw     $0x4,%%ymm1,%%ymm1                                  \n" \
  "vpsraw     $0x4,%%ymm2,%%ymm2                                  \n" \
  "vpminsw    %%ymm7,%%ymm0,%%ymm0                                \n" \
  "vpminsw    %%ymm7,%%ymm1,%%ymm1                                \n" \
  "vpminsw    %%ymm7,%%ymm2,%%ymm2                                \n" \
  "vpmaxsw    %%ymm6,%%ymm0,%%ymm0                                \n" \
  "vpmaxsw    %%ymm6,%%ymm1,%%ymm1                                \n" \
  "vpmaxsw    %%ymm6,%%ymm2,%%ymm2                                \n" \
  "vpsllw     $0x4,%%ymm2,%%ymm2                                  \n" \
  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
  "vpunpckhwd %%ymm2,%%ymm0,%%ymm3                                \n" \
  "vpunpcklwd %%ymm2,%%ymm0,%%ymm0                                \n" \
  "vpunpckhwd %%ymm5,%%ymm1,%%ymm2                                \n" \
  "vpunpcklwd %%ymm5,%%ymm1,%%ymm1                                \n" \
  "vpslld     $0xa,%%ymm1,%%ymm1                                  \n" \
  "vpslld     $0xa,%%ymm2,%%ymm2                                  \n" \
  "vpor       %%ymm1,%%ymm0,%%ymm0                                \n" \
  "vpor       %%ymm2,%%ymm3,%%ymm3                                \n" \
  "vmovdqu    %%ymm0,(%[dst_ar30])                                \n" \
  "vmovdqu    %%ymm3,0x20(%[dst_ar30])                            \n" \
  "lea        0x40(%[dst_ar30]), %[dst_ar30]                      \n"

#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READYUV444_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I444TOARGBROW_AVX2

#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TOARGBROW_AVX2

#if defined(HAS_I422TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_I422TOAR30ROW_AVX2

#if defined(HAS_I210TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READYUV210_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I210TOARGBROW_AVX2

#if defined(HAS_I210TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"  // AR30 constants
      "vpsrlw      $14,%%ymm5,%%ymm5             \n"
      "vpsllw      $4,%%ymm5,%%ymm5              \n"  // 2 alpha bits
      "vpxor       %%ymm6,%%ymm6,%%ymm6          \n"  // 0 for min
      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"  // 1023 for max
      "vpsrlw      $6,%%ymm7,%%ymm7              \n"

    LABELALIGN
      "1:                                        \n"
    READYUV210_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_I210TOAR30ROW_AVX2

#if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                    const uint8_t* u_buf,
                                    const uint8_t* v_buf,
                                    const uint8_t* a_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"

    LABELALIGN
      "1:                                        \n"
    READYUVA422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "subl        $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_AVX2

#if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "sub         %[u_buf],%[v_buf]             \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)

    // Step 3: Weave into RGBA
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
    "vmovdqu    %%ymm0,(%[dst_argb])           \n"
    "vmovdqu    %%ymm1,0x20(%[dst_argb])       \n"
    "lea        0x40(%[dst_argb]),%[dst_argb]  \n"
    "sub        $0x10,%[width]                 \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TORGBAROW_AVX2

#if defined(HAS_NV12TOARGBROW_AVX2)
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* uv_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READNV12_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_NV12TOARGBROW_AVX2

#if defined(HAS_NV21TOARGBROW_AVX2)
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* vu_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READNV21_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
    : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_NV21TOARGBROW_AVX2

#if defined(HAS_YUY2TOARGBROW_AVX2)
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READYUY2_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
    : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_YUY2TOARGBROW_AVX2

#if defined(HAS_UYVYTOARGBROW_AVX2)
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"

    LABELALIGN
      "1:                                        \n"
    READUYVY_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
      "sub         $0x10,%[width]                \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS_AVX2
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_UYVYTOARGBROW_AVX2

#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8_t* y_buf,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      "movdqa      192(%3),%%xmm2                \n"  // yg = 18997 = 1.164
      "movdqa      224(%3),%%xmm3                \n"  // ygb = -1160 = 1.164 * 16
3097       "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 0xff000000
3098       "pslld       $0x18,%%xmm4                  \n"
3099 
3100       LABELALIGN
3101       "1:                                        \n"
3102       // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
3103       "movq      (%0),%%xmm0                     \n"
3104       "lea       0x8(%0),%0                      \n"
3105       "punpcklbw %%xmm0,%%xmm0                   \n"
3106       "pmulhuw   %%xmm2,%%xmm0                   \n"
3107       "paddsw    %%xmm3,%%xmm0                   \n"
3108       "psraw     $6, %%xmm0                      \n"
3109       "packuswb  %%xmm0,%%xmm0                   \n"
3110 
3111       // Step 2: Weave into ARGB
3112       "punpcklbw %%xmm0,%%xmm0                   \n"
3113       "movdqa    %%xmm0,%%xmm1                   \n"
3114       "punpcklwd %%xmm0,%%xmm0                   \n"
3115       "punpckhwd %%xmm1,%%xmm1                   \n"
3116       "por       %%xmm4,%%xmm0                   \n"
3117       "por       %%xmm4,%%xmm1                   \n"
3118       "movdqu    %%xmm0,(%1)                     \n"
3119       "movdqu    %%xmm1,0x10(%1)                 \n"
3120       "lea       0x20(%1),%1                     \n"
3121 
3122       "sub       $0x8,%2                         \n"
3123       "jg        1b                              \n"
3124       : "+r"(y_buf),       // %0
3125         "+r"(dst_argb),    // %1
3126         "+rm"(width)       // %2
3127       : "r"(yuvconstants)  // %3
3128       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
3129 }
3130 #endif  // HAS_I400TOARGBROW_SSE2
3131 
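// Illustrative note (not part of the original source): a worked example of
// the luma scaling used by I400ToARGBRow above, assuming yg = 18997 and
// ygb = -1160 as noted in the comments. punpcklbw duplicates y into both
// bytes of a word (y * 257), pmulhuw keeps the high 16 bits of the product,
// then the bias and a >> 6 produce the 8-bit result:
//
//   y = 16:  (16 * 257 * 18997) >> 16 = 1191;   (1191 - 1160) >> 6 = 0
//   y = 235: (235 * 257 * 18997) >> 16 = 17506; (17506 - 1160) >> 6 = 255
//
// i.e. studio-range Y [16..235] expands to full-range gray [0..255].
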
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
void I400ToARGBRow_AVX2(const uint8_t* y_buf,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      "vmovdqa     192(%3),%%ymm2                \n"  // yg = 18997 = 1.164
      "vmovdqa     224(%3),%%ymm3                \n"  // ygb = -1160 = 1.164 * 16
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 0xff000000
      "vpslld      $0x18,%%ymm4,%%ymm4           \n"

      LABELALIGN
      "1:                                        \n"
      // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
      "vmovdqu    (%0),%%xmm0                    \n"
      "lea        0x10(%0),%0                    \n"
      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
      "vpaddsw    %%ymm3,%%ymm0,%%ymm0           \n"
      "vpsraw     $0x6,%%ymm0,%%ymm0             \n"
      "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
      "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
      "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
      "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
      "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
      "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
      "vmovdqu    %%ymm0,(%1)                    \n"
      "vmovdqu    %%ymm1,0x20(%1)                \n"
      "lea        0x40(%1),%1                    \n"
      "sub        $0x10,%2                       \n"
      "jg         1b                             \n"
      "vzeroupper                                \n"
      : "+r"(y_buf),       // %0
        "+r"(dst_argb),    // %1
        "+rm"(width)       // %2
      : "r"(yuvconstants)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_I400TOARGBROW_AVX2

#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "movdqa      %3,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      -0x10(%0,%2,1),%%xmm0         \n"
      "pshufb      %%xmm5,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),           // %0
        "+r"(dst),           // %1
        "+r"(temp_width)     // %2
      : "m"(kShuffleMirror)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORROW_SSSE3

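// Illustrative note (not part of the original source): a scalar C reference
// for the mirror rows above and below (name hypothetical); the SSSE3/AVX2
// versions do the same thing 16 or 32 bytes at a time by loading from the
// tail of the row and reversing byte order with pshufb/vpshufb:
//
//   static void MirrorRowSketch_C(const uint8_t* src, uint8_t* dst, int width) {
//     for (int x = 0; x < width; ++x) {
//       dst[x] = src[width - 1 - x];  // last source byte becomes first
//     }
//   }
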
#ifdef HAS_MIRRORROW_AVX2
void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "vbroadcastf128 %3,%%ymm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     -0x20(%0,%2,1),%%ymm0         \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),           // %0
        "+r"(dst),           // %1
        "+r"(temp_width)     // %2
      : "m"(kShuffleMirror)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORROW_AVX2

#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the UV.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
                                       6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};

void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "movdqa      %3,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      -0x10(%0,%2,2),%%xmm0         \n"
      "pshufb      %%xmm5,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_uv),          // %0
        "+r"(dst_uv),          // %1
        "+r"(temp_width)       // %2
      : "m"(kShuffleMirrorUV)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORUVROW_SSSE3

#ifdef HAS_MIRRORUVROW_AVX2
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "vbroadcastf128 %3,%%ymm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     -0x20(%0,%2,2),%%ymm0         \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uv),          // %0
        "+r"(dst_uv),          // %1
        "+r"(temp_width)       // %2
      : "m"(kShuffleMirrorUV)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORUVROW_AVX2

3280 #ifdef HAS_MIRRORSPLITUVROW_SSSE3
3281 // Shuffle table for reversing the bytes of UV channels.
3282 static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
3283                                             15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
MirrorSplitUVRow_SSSE3(const uint8_t * src,uint8_t * dst_u,uint8_t * dst_v,int width)3284 void MirrorSplitUVRow_SSSE3(const uint8_t* src,
3285                             uint8_t* dst_u,
3286                             uint8_t* dst_v,
3287                             int width) {
3288   intptr_t temp_width = (intptr_t)(width);
3289   asm volatile(
3290       "movdqa      %4,%%xmm1                     \n"
3291       "lea         -0x10(%0,%3,2),%0             \n"
3292       "sub         %1,%2                         \n"
3293 
3294       LABELALIGN
3295       "1:                                        \n"
3296       "movdqu      (%0),%%xmm0                   \n"
3297       "lea         -0x10(%0),%0                  \n"
3298       "pshufb      %%xmm1,%%xmm0                 \n"
3299       "movlpd      %%xmm0,(%1)                   \n"
3300       "movhpd      %%xmm0,0x00(%1,%2,1)          \n"
3301       "lea         0x8(%1),%1                    \n"
3302       "sub         $8,%3                         \n"
3303       "jg          1b                            \n"
3304       : "+r"(src),                  // %0
3305         "+r"(dst_u),                // %1
3306         "+r"(dst_v),                // %2
3307         "+r"(temp_width)            // %3
3308       : "m"(kShuffleMirrorSplitUV)  // %4
3309       : "memory", "cc", "xmm0", "xmm1");
3310 }
3311 #endif  // HAS_MIRRORSPLITUVROW_SSSE3

#ifdef HAS_RGB24MIRRORROW_SSSE3

// Shuffle first 5 pixels to last 5 mirrored. First byte zero.
static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
                                         7u,   8u,  3u,  4u,  5u, 0u,  1u,  2u};

// Shuffle last 5 pixels to first 5 mirrored. Last byte zero.
static const uvec8 kShuffleMirrorRGB1 = {
    13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};

// Shuffle 5 pixels at a time (15 bytes).
void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_rgb24,
                          int width) {
  intptr_t temp_width = (intptr_t)(width);
  src_rgb24 += width * 3 - 48;
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // first 5
      "movdqu      15(%0),%%xmm1                 \n"  // next 5
      "movdqu      30(%0),%%xmm2                 \n"  // next 5
      "movdqu      32(%0),%%xmm3                 \n"  // last 1 special
      "pshufb      %%xmm4,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "pshufb      %%xmm4,%%xmm2                 \n"
      "pshufb      %%xmm5,%%xmm3                 \n"
      "lea         -0x30(%0),%0                  \n"
      "movdqu      %%xmm0,32(%1)                 \n"  // last 5
      "movdqu      %%xmm1,17(%1)                 \n"  // next 5
      "movdqu      %%xmm2,2(%1)                  \n"  // next 5
      "movlpd      %%xmm3,0(%1)                  \n"  // first 1
      "lea         0x30(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_rgb24),          // %0
        "+r"(dst_rgb24),          // %1
        "+r"(temp_width)          // %2
      : "m"(kShuffleMirrorRGB0),  // %3
        "m"(kShuffleMirrorRGB1)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_RGB24MIRRORROW_SSSE3
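
// A scalar sketch (not part of libyuv) of RGB24 mirroring: 3-byte pixels are
// written in reverse pixel order while each pixel keeps its byte order. The
// SSSE3 version above handles 16 pixels per iteration with shuffle tables;
// this hedged equivalent, under a hypothetical guard, is one pixel at a time.
#ifdef LIBYUV_REFERENCE_SKETCHES
static void RGB24MirrorRow_C_Sketch(const uint8_t* src_rgb24,
                                    uint8_t* dst_rgb24,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint8_t* p = src_rgb24 + (width - 1 - x) * 3;  // mirrored pixel
    dst_rgb24[x * 3 + 0] = p[0];
    dst_rgb24[x * 3 + 1] = p[1];
    dst_rgb24[x * 3 + 2] = p[2];
  }
}
#endif  // LIBYUV_REFERENCE_SKETCHES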

#ifdef HAS_ARGBMIRRORROW_SSE2

void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "lea         -0x10(%0,%2,4),%0             \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "pshufd      $0x1b,%%xmm0,%%xmm0           \n"
      "lea         -0x10(%0),%0                  \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),        // %0
        "+r"(dst),        // %1
        "+r"(temp_width)  // %2
      :
      : "memory", "cc", "xmm0");
}
#endif  // HAS_ARGBMIRRORROW_SSE2

#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the ARGB pixels (32-bit lanes).
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "vmovdqu     %3,%%ymm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "vpermd      -0x20(%0,%2,4),%%ymm5,%%ymm0  \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(temp_width)              // %2
      : "m"(kARGBShuffleMirror_AVX2)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_ARGBMIRRORROW_AVX2

#ifdef HAS_SPLITUVROW_AVX2
void SplitUVRow_AVX2(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm2            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm3            \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpackuswb   %%ymm3,%%ymm2,%%ymm2          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm2,0x00(%1,%2,1)          \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SPLITUVROW_AVX2

#ifdef HAS_SPLITUVROW_SSE2
void SplitUVRow_SSE2(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm2                   \n"
      "psrlw       $0x8,%%xmm3                   \n"
      "packuswb    %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm2,0x00(%1,%2,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SPLITUVROW_SSE2

#ifdef HAS_MERGEUVROW_AVX2
void MergeUVRow_AVX2(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(

      "sub         %0,%1                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x00(%0,%1,1),%%ymm1          \n"
      "lea         0x20(%0),%0                   \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm2          \n"
      "vpunpckhbw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vextractf128 $0x0,%%ymm2,(%2)             \n"
      "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
      "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
      "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
      "lea         0x40(%2),%2                   \n"
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_u),   // %0
        "+r"(src_v),   // %1
        "+r"(dst_uv),  // %2
        "+r"(width)    // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGEUVROW_AVX2

#ifdef HAS_MERGEUVROW_SSE2
void MergeUVRow_SSE2(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(

      "sub         %0,%1                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpcklbw   %%xmm1,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm2                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "movdqu      %%xmm2,0x10(%2)               \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_u),   // %0
        "+r"(src_v),   // %1
        "+r"(dst_uv),  // %2
        "+r"(width)    // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGEUVROW_SSE2

// Use scale to convert lsb formats to msb, depending on how many bits there
// are:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits
#ifdef HAS_MERGEUVROW_16_AVX2
void MergeUVRow_16_AVX2(const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint16_t* dst_uv,
                        int scale,
                        int width) {
  // clang-format off
  asm volatile (
      "vmovd       %4,%%xmm3                     \n"
      "vpunpcklwd  %%xmm3,%%xmm3,%%xmm3          \n"
      "vbroadcastss %%xmm3,%%ymm3                \n"
      "sub         %0,%1                         \n"

    // 16 pixels per loop.
    LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     (%0,%1,1),%%ymm1              \n"
      "add         $0x20,%0                      \n"

      "vpmullw     %%ymm3,%%ymm0,%%ymm0          \n"
      "vpmullw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm2          \n"  // mutates
      "vpunpckhwd  %%ymm1,%%ymm0,%%ymm0          \n"
      "vextractf128 $0x0,%%ymm2,(%2)             \n"
      "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
      "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
      "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
      "add         $0x40,%2                      \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : "+r"(src_u),   // %0
    "+r"(src_v),   // %1
    "+r"(dst_uv),  // %2
    "+r"(width)    // %3
  : "r"(scale)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
  // clang-format on
}
#endif  // HAS_MERGEUVROW_16_AVX2
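
// Scalar sketch (not part of libyuv) of the scale math above: vpmullw keeps
// the low 16 bits of value * scale, so e.g. 10-bit samples times 64 land in
// the top of the 16-bit word. Hypothetical guard, off by default.
#ifdef LIBYUV_REFERENCE_SKETCHES
static void MergeUVRow_16_C_Sketch(const uint16_t* src_u,
                                   const uint16_t* src_v,
                                   uint16_t* dst_uv,
                                   int scale,
                                   int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[x * 2 + 0] = (uint16_t)(src_u[x] * scale);  // low 16 bits, as vpmullw
    dst_uv[x * 2 + 1] = (uint16_t)(src_v[x] * scale);
  }
}
#endif  // LIBYUV_REFERENCE_SKETCHES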

// Use scale to convert lsb formats to msb, depending on how many bits there
// are:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits
#ifdef HAS_MULTIPLYROW_16_AVX2
void MultiplyRow_16_AVX2(const uint16_t* src_y,
                         uint16_t* dst_y,
                         int scale,
                         int width) {
  // clang-format off
  asm volatile (
      "vmovd       %3,%%xmm3                     \n"
      "vpunpcklwd  %%xmm3,%%xmm3,%%xmm3          \n"
      "vbroadcastss %%xmm3,%%ymm3                \n"
      "sub         %0,%1                         \n"

    // 32 pixels per loop.
    LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vpmullw     %%ymm3,%%ymm0,%%ymm0          \n"
      "vpmullw     %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%0,%1)                \n"
      "vmovdqu     %%ymm1,0x20(%0,%1)            \n"
      "add         $0x40,%0                      \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm3");
  // clang-format on
}
#endif  // HAS_MULTIPLYROW_16_AVX2
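
// The same low-16-bit multiply, applied to a single plane; a hedged scalar
// equivalent (hypothetical guard, not part of libyuv):
#ifdef LIBYUV_REFERENCE_SKETCHES
static void MultiplyRow_16_C_Sketch(const uint16_t* src_y,
                                    uint16_t* dst_y,
                                    int scale,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = (uint16_t)(src_y[x] * scale);  // e.g. scale 64 for 10-bit input
  }
}
#endif  // LIBYUV_REFERENCE_SKETCHES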

// Use scale to convert lsb formats to msb, depending on how many bits there
// are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
void Convert16To8Row_SSSE3(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
                           int width) {
  // clang-format off
  asm volatile (
      "movd        %3,%%xmm2                     \n"
      "punpcklwd   %%xmm2,%%xmm2                 \n"
      "pshufd      $0x0,%%xmm2,%%xmm2            \n"

    // 16 pixels per loop.
    LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "add         $0x20,%0                      \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "add         $0x10,%1                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}

#ifdef HAS_CONVERT16TO8ROW_AVX2
void Convert16To8Row_AVX2(const uint16_t* src_y,
                          uint8_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
      "vmovd       %3,%%xmm2                     \n"
      "vpunpcklwd  %%xmm2,%%xmm2,%%xmm2          \n"
      "vbroadcastss %%xmm2,%%ymm2                \n"

    // 32 pixels per loop.
    LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "add         $0x40,%0                      \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"  // mutates
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "add         $0x20,%1                      \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
#endif  // HAS_CONVERT16TO8ROW_AVX2
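
// Scalar sketch (not part of libyuv) of the 16-to-8 scaling: pmulhuw keeps
// the high 16 bits of value * scale, so scale 16384 maps 10-bit input down to
// 8 bits, and packuswb clamps to 255. Hypothetical guard.
#ifdef LIBYUV_REFERENCE_SKETCHES
static void Convert16To8Row_C_Sketch(const uint16_t* src_y,
                                     uint8_t* dst_y,
                                     int scale,
                                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t v = ((uint32_t)src_y[x] * (uint32_t)scale) >> 16;  // pmulhuw
    dst_y[x] = (uint8_t)(v > 255 ? 255 : v);                    // packuswb
  }
}
#endif  // LIBYUV_REFERENCE_SKETCHES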

// Use scale to convert to lsb formats, depending on how many bits there are:
// 512 = 9 bits
// 1024 = 10 bits
// 4096 = 12 bits
// TODO(fbarchard): reduce to SSE2
void Convert8To16Row_SSE2(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
      "movd        %3,%%xmm2                     \n"
      "punpcklwd   %%xmm2,%%xmm2                 \n"
      "pshufd      $0x0,%%xmm2,%%xmm2            \n"

    // 16 pixels per loop.
    LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "add         $0x10,%0                      \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "add         $0x20,%1                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}

#ifdef HAS_CONVERT8TO16ROW_AVX2
void Convert8To16Row_AVX2(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
      "vmovd       %3,%%xmm2                     \n"
      "vpunpcklwd  %%xmm2,%%xmm2,%%xmm2          \n"
      "vbroadcastss %%xmm2,%%ymm2                \n"

    // 32 pixels per loop.
    LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "add         $0x20,%0                      \n"
      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"
      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "add         $0x40,%1                      \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
#endif  // HAS_CONVERT8TO16ROW_AVX2
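
// Scalar sketch (not part of libyuv) of the 8-to-16 widening: punpcklbw with
// itself replicates each byte into a 16-bit word (v * 0x101 = 0xVVVV), and
// pmulhuw by scale keeps the top bits, e.g. scale 1024 maps 255 to 1023.
// Hypothetical guard.
#ifdef LIBYUV_REFERENCE_SKETCHES
static void Convert8To16Row_C_Sketch(const uint8_t* src_y,
                                     uint16_t* dst_y,
                                     int scale,
                                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t v = (uint32_t)src_y[x] * 0x0101u;           // punpcklbw: 0xVVVV
    dst_y[x] = (uint16_t)((v * (uint32_t)scale) >> 16);  // pmulhuw
  }
}
#endif  // LIBYUV_REFERENCE_SKETCHES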

#ifdef HAS_SPLITRGBROW_SSSE3

// Shuffle tables for converting RGB to planar.
static const uvec8 kShuffleMaskRGBToR0 = {0u,   3u,   6u,   9u,   12u,  15u,
                                          128u, 128u, 128u, 128u, 128u, 128u,
                                          128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
                                          2u,   5u,   8u,   11u,  14u,  128u,
                                          128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
                                          128u, 128u, 128u, 128u, 128u, 1u,
                                          4u,   7u,   10u,  13u};

static const uvec8 kShuffleMaskRGBToG0 = {1u,   4u,   7u,   10u,  13u,  128u,
                                          128u, 128u, 128u, 128u, 128u, 128u,
                                          128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
                                          3u,   6u,   9u,   12u,  15u,  128u,
                                          128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
                                          128u, 128u, 128u, 128u, 128u, 2u,
                                          5u,   8u,   11u,  14u};

static const uvec8 kShuffleMaskRGBToB0 = {2u,   5u,   8u,   11u,  14u,  128u,
                                          128u, 128u, 128u, 128u, 128u, 128u,
                                          128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
                                          4u,   7u,   10u,  13u,  128u, 128u,
                                          128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
                                          128u, 128u, 128u, 128u, 0u,   3u,
                                          6u,   9u,   12u,  15u};

void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "pshufb      %5, %%xmm0                    \n"
      "pshufb      %6, %%xmm1                    \n"
      "pshufb      %7, %%xmm2                    \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"

      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "pshufb      %8, %%xmm0                    \n"
      "pshufb      %9, %%xmm1                    \n"
      "pshufb      %10, %%xmm2                   \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"

      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "pshufb      %11, %%xmm0                   \n"
      "pshufb      %12, %%xmm1                   \n"
      "pshufb      %13, %%xmm2                   \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%3)                   \n"
      "lea         0x10(%3),%3                   \n"
      "lea         0x30(%0),%0                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_rgb),             // %0
        "+r"(dst_r),               // %1
        "+r"(dst_g),               // %2
        "+r"(dst_b),               // %3
        "+r"(width)                // %4
      : "m"(kShuffleMaskRGBToR0),  // %5
        "m"(kShuffleMaskRGBToR1),  // %6
        "m"(kShuffleMaskRGBToR2),  // %7
        "m"(kShuffleMaskRGBToG0),  // %8
        "m"(kShuffleMaskRGBToG1),  // %9
        "m"(kShuffleMaskRGBToG2),  // %10
        "m"(kShuffleMaskRGBToB0),  // %11
        "m"(kShuffleMaskRGBToB1),  // %12
        "m"(kShuffleMaskRGBToB2)   // %13
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_SPLITRGBROW_SSSE3
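
// Scalar sketch (not part of libyuv) of the split: each pshufb above gathers
// one channel's bytes from a 16-byte chunk and the por ops merge the three
// partial results; the plain C equivalent is a strided copy. Hypothetical
// guard.
#ifdef LIBYUV_REFERENCE_SKETCHES
static void SplitRGBRow_C_Sketch(const uint8_t* src_rgb,
                                 uint8_t* dst_r,
                                 uint8_t* dst_g,
                                 uint8_t* dst_b,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_r[x] = src_rgb[x * 3 + 0];
    dst_g[x] = src_rgb[x * 3 + 1];
    dst_b[x] = src_rgb[x * 3 + 2];
  }
}
#endif  // LIBYUV_REFERENCE_SKETCHES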

#ifdef HAS_MERGERGBROW_SSSE3

// Shuffle tables for converting planar to RGB.
static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
                                          2u, 128u, 128u, 3u, 128u, 128u,
                                          4u, 128u, 128u, 5u};
static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
                                          128u, 2u, 128u, 128u, 3u, 128u,
                                          128u, 4u, 128u, 128u};
static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
                                          128u, 128u, 2u, 128u, 128u, 3u,
                                          128u, 128u, 4u, 128u};

static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
                                          7u, 128u, 128u, 8u, 128u, 128u,
                                          9u, 128u, 128u, 10u};
static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
                                          128u, 7u, 128u, 128u, 8u, 128u,
                                          128u, 9u, 128u, 128u};
static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u,  128u, 128u, 7u,
                                          128u, 128u, 8u,  128u, 128u, 9u,
                                          128u, 128u, 10u, 128u};

static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
                                          12u, 128u, 128u, 13u, 128u, 128u,
                                          14u, 128u, 128u, 15u};
static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
                                          128u, 13u, 128u, 128u, 14u, 128u,
                                          128u, 15u, 128u, 128u};
static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
                                          128u, 128u, 13u, 128u, 128u, 14u,
                                          128u, 128u, 15u, 128u};

void MergeRGBRow_SSSE3(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_rgb,
                       int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "movdqu      (%2),%%xmm2                   \n"
      "pshufb      %5, %%xmm0                    \n"
      "pshufb      %6, %%xmm1                    \n"
      "pshufb      %7, %%xmm2                    \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%3)                   \n"

      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "movdqu      (%2),%%xmm2                   \n"
      "pshufb      %8, %%xmm0                    \n"
      "pshufb      %9, %%xmm1                    \n"
      "pshufb      %10, %%xmm2                   \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,16(%3)                 \n"

      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "movdqu      (%2),%%xmm2                   \n"
      "pshufb      %11, %%xmm0                   \n"
      "pshufb      %12, %%xmm1                   \n"
      "pshufb      %13, %%xmm2                   \n"
      "por         %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,32(%3)                 \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x10(%1),%1                   \n"
      "lea         0x10(%2),%2                   \n"
      "lea         0x30(%3),%3                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_r),               // %0
        "+r"(src_g),               // %1
        "+r"(src_b),               // %2
        "+r"(dst_rgb),             // %3
        "+r"(width)                // %4
      : "m"(kShuffleMaskRToRGB0),  // %5
        "m"(kShuffleMaskGToRGB0),  // %6
        "m"(kShuffleMaskBToRGB0),  // %7
        "m"(kShuffleMaskRToRGB1),  // %8
        "m"(kShuffleMaskGToRGB1),  // %9
        "m"(kShuffleMaskBToRGB1),  // %10
        "m"(kShuffleMaskRToRGB2),  // %11
        "m"(kShuffleMaskGToRGB2),  // %12
        "m"(kShuffleMaskBToRGB2)   // %13
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGERGBROW_SSSE3
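
// The inverse of the split sketch above, again hedged and under the same
// hypothetical guard: interleave one byte from each plane per output pixel.
#ifdef LIBYUV_REFERENCE_SKETCHES
static void MergeRGBRow_C_Sketch(const uint8_t* src_r,
                                 const uint8_t* src_g,
                                 const uint8_t* src_b,
                                 uint8_t* dst_rgb,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_rgb[x * 3 + 0] = src_r[x];
    dst_rgb[x * 3 + 1] = src_g[x];
    dst_rgb[x * 3 + 2] = src_b[x];
  }
}
#endif  // LIBYUV_REFERENCE_SKETCHES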

#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "test        $0xf,%0                       \n"
      "jne         2f                            \n"
      "test        $0xf,%1                       \n"
      "jne         2f                            \n"

      LABELALIGN
      "1:                                        \n"
      "movdqa      (%0),%%xmm0                   \n"
      "movdqa      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqa      %%xmm0,(%1)                   \n"
      "movdqa      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "jmp         9f                            \n"

      LABELALIGN
      "2:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          2b                            \n"

      LABELALIGN
      "9:                                        \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_AVX
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x40,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_COPYROW_AVX

#ifdef HAS_COPYROW_ERMS
// Multiple of 1.
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile(

      "rep         movsb                         \n"
      : "+S"(src),       // %0
        "+D"(dst),       // %1
        "+c"(width_tmp)  // %2
      :
      : "memory", "cc");
}
#endif  // HAS_COPYROW_ERMS

#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm0,%%xmm0                 \n"
      "pslld       $0x18,%%xmm0                  \n"
      "pcmpeqb     %%xmm1,%%xmm1                 \n"
      "psrld       $0x8,%%xmm1                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm2                   \n"
      "movdqu      0x10(%0),%%xmm3               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqu      (%1),%%xmm4                   \n"
      "movdqu      0x10(%1),%%xmm5               \n"
      "pand        %%xmm0,%%xmm2                 \n"
      "pand        %%xmm0,%%xmm3                 \n"
      "pand        %%xmm1,%%xmm4                 \n"
      "pand        %%xmm1,%%xmm5                 \n"
      "por         %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "movdqu      %%xmm3,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBCOPYALPHAROW_SSE2

#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vpcmpeqb    %%ymm0,%%ymm0,%%ymm0          \n"
      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm1                   \n"
      "vmovdqu     0x20(%0),%%ymm2               \n"
      "lea         0x40(%0),%0                   \n"
      "vpblendvb   %%ymm0,(%1),%%ymm1,%%ymm1     \n"
      "vpblendvb   %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu     %%ymm1,(%1)                   \n"
      "vmovdqu     %%ymm2,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_ARGBCOPYALPHAROW_AVX2

#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0), %%xmm0                  \n"
      "movdqu      0x10(%0), %%xmm1              \n"
      "lea         0x20(%0), %0                  \n"
      "psrld       $0x18, %%xmm0                 \n"
      "psrld       $0x18, %%xmm1                 \n"
      "packssdw    %%xmm1, %%xmm0                \n"
      "packuswb    %%xmm0, %%xmm0                \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1), %1                   \n"
      "sub         $0x8, %2                      \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+rm"(width)     // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2

#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
static const uvec8 kShuffleAlphaShort_AVX2 = {
    3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};

void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "vmovdqa     %3,%%ymm4                     \n"
      "vbroadcastf128 %4,%%ymm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0), %%ymm0                  \n"
      "vmovdqu     0x20(%0), %%ymm1              \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"  // vpsrld $0x18, %%ymm0
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x40(%0), %%ymm2              \n"
      "vmovdqu     0x60(%0), %%ymm3              \n"
      "lea         0x80(%0), %0                  \n"
      "vpackssdw   %%ymm1, %%ymm0, %%ymm0        \n"  // mutates
      "vpshufb     %%ymm5,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
      "vpackssdw   %%ymm3, %%ymm2, %%ymm2        \n"  // mutates
      "vpackuswb   %%ymm2,%%ymm0,%%ymm0          \n"  // mutates
      "vpermd      %%ymm0,%%ymm4,%%ymm0          \n"  // unmutates
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20, %2                     \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),               // %0
        "+r"(dst_a),                  // %1
        "+rm"(width)                  // %2
      : "m"(kPermdARGBToY_AVX),       // %3
        "m"(kShuffleAlphaShort_AVX2)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2

#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm0,%%xmm0                 \n"
      "pslld       $0x18,%%xmm0                  \n"
      "pcmpeqb     %%xmm1,%%xmm1                 \n"
      "psrld       $0x8,%%xmm1                   \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm2                   \n"
      "lea         0x8(%0),%0                    \n"
      "punpcklbw   %%xmm2,%%xmm2                 \n"
      "punpckhwd   %%xmm2,%%xmm3                 \n"  // stale xmm3 bytes masked by xmm0
      "punpcklwd   %%xmm2,%%xmm2                 \n"
      "movdqu      (%1),%%xmm4                   \n"
      "movdqu      0x10(%1),%%xmm5               \n"
      "pand        %%xmm0,%%xmm2                 \n"
      "pand        %%xmm0,%%xmm3                 \n"
      "pand        %%xmm1,%%xmm4                 \n"
      "pand        %%xmm1,%%xmm5                 \n"
      "por         %%xmm4,%%xmm2                 \n"
      "por         %%xmm5,%%xmm3                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "movdqu      %%xmm3,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vpcmpeqb    %%ymm0,%%ymm0,%%ymm0          \n"
      "vpsrld      $0x8,%%ymm0,%%ymm0            \n"

      LABELALIGN
      "1:                                        \n"
      "vpmovzxbd   (%0),%%ymm1                   \n"
      "vpmovzxbd   0x8(%0),%%ymm2                \n"
      "lea         0x10(%0),%0                   \n"
      "vpslld      $0x18,%%ymm1,%%ymm1           \n"
      "vpslld      $0x18,%%ymm2,%%ymm2           \n"
      "vpblendvb   %%ymm0,(%1),%%ymm1,%%ymm1     \n"
      "vpblendvb   %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu     %%ymm1,(%1)                   \n"
      "vmovdqu     %%ymm2,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2

#ifdef HAS_SETROW_X86
void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width >> 2);
  const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
  asm volatile(

      "rep         stosl                         \n"
      : "+D"(dst),       // %0
        "+c"(width_tmp)  // %1
      : "a"(v32)         // %2
      : "memory", "cc");
}

void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile(

      "rep         stosb                         \n"
      : "+D"(dst),       // %0
        "+c"(width_tmp)  // %1
      : "a"(v8)          // %2
      : "memory", "cc");
}

void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile(

      "rep         stosl                         \n"
      : "+D"(dst_argb),  // %0
        "+c"(width_tmp)  // %1
      : "a"(v32)         // %2
      : "memory", "cc");
}
#endif  // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm2,%%xmm0                 \n"
      "pavgb       %%xmm3,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_yuy2),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}

void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}

void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm2,%%xmm0                 \n"
      "pavgb       %%xmm3,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_uyvy),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}

void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrlw       $0x8,%%xmm5                   \n"
      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x00(%1,%2,1)          \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_YUY2TOYROW_SSE2
4480 
4481 #ifdef HAS_YUY2TOYROW_AVX2
YUY2ToYRow_AVX2(const uint8_t * src_yuy2,uint8_t * dst_y,int width)4482 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
4483   asm volatile(
4484       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4485       "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
4486 
4487       LABELALIGN
4488       "1:                                        \n"
4489       "vmovdqu     (%0),%%ymm0                   \n"
4490       "vmovdqu     0x20(%0),%%ymm1               \n"
4491       "lea         0x40(%0),%0                   \n"
4492       "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
4493       "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
4494       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
4495       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4496       "vmovdqu     %%ymm0,(%1)                   \n"
4497       "lea         0x20(%1),%1                   \n"
4498       "sub         $0x20,%2                      \n"
4499       "jg          1b                            \n"
4500       "vzeroupper                                \n"
4501       : "+r"(src_yuy2),  // %0
4502         "+r"(dst_y),     // %1
4503         "+r"(width)      // %2
4504       :
4505       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4506 }
4507 
YUY2ToUVRow_AVX2(const uint8_t * src_yuy2,int stride_yuy2,uint8_t * dst_u,uint8_t * dst_v,int width)4508 void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
4509                       int stride_yuy2,
4510                       uint8_t* dst_u,
4511                       uint8_t* dst_v,
4512                       int width) {
4513   asm volatile(
4514       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4515       "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
4516       "sub         %1,%2                         \n"
4517 
4518       LABELALIGN
4519       "1:                                        \n"
4520       "vmovdqu     (%0),%%ymm0                   \n"
4521       "vmovdqu     0x20(%0),%%ymm1               \n"
4522       "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
4523       "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
4524       "lea         0x40(%0),%0                   \n"
4525       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
4526       "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
4527       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
4528       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4529       "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
4530       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
4531       "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
4532       "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
4533       "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
4534       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4535       "vextractf128 $0x0,%%ymm1,(%1)             \n"
4536       "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
4537       "lea         0x10(%1),%1                   \n"
4538       "sub         $0x20,%3                      \n"
4539       "jg          1b                            \n"
4540       "vzeroupper                                \n"
4541       : "+r"(src_yuy2),               // %0
4542         "+r"(dst_u),                  // %1
4543         "+r"(dst_v),                  // %2
4544         "+r"(width)                   // %3
4545       : "r"((intptr_t)(stride_yuy2))  // %4
4546       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4547 }
4548 
YUY2ToUV422Row_AVX2(const uint8_t * src_yuy2,uint8_t * dst_u,uint8_t * dst_v,int width)4549 void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
4550                          uint8_t* dst_u,
4551                          uint8_t* dst_v,
4552                          int width) {
4553   asm volatile(
4554       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4555       "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
4556       "sub         %1,%2                         \n"
4557 
4558       LABELALIGN
4559       "1:                                        \n"
4560       "vmovdqu     (%0),%%ymm0                   \n"
4561       "vmovdqu     0x20(%0),%%ymm1               \n"
4562       "lea         0x40(%0),%0                   \n"
4563       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
4564       "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
4565       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
4566       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4567       "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
4568       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
4569       "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
4570       "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
4571       "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
4572       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4573       "vextractf128 $0x0,%%ymm1,(%1)             \n"
4574       "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
4575       "lea         0x10(%1),%1                   \n"
4576       "sub         $0x20,%3                      \n"
4577       "jg          1b                            \n"
4578       "vzeroupper                                \n"
4579       : "+r"(src_yuy2),  // %0
4580         "+r"(dst_u),     // %1
4581         "+r"(dst_v),     // %2
4582         "+r"(width)      // %3
4583       :
4584       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4585 }
4586 
UYVYToYRow_AVX2(const uint8_t * src_uyvy,uint8_t * dst_y,int width)4587 void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
4588   asm volatile(
4589 
4590       LABELALIGN
4591       "1:                                        \n"
4592       "vmovdqu     (%0),%%ymm0                   \n"
4593       "vmovdqu     0x20(%0),%%ymm1               \n"
4594       "lea         0x40(%0),%0                   \n"
4595       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
4596       "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
4597       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
4598       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4599       "vmovdqu     %%ymm0,(%1)                   \n"
4600       "lea         0x20(%1),%1                   \n"
4601       "sub         $0x20,%2                      \n"
4602       "jg          1b                            \n"
4603       "vzeroupper                                \n"
4604       : "+r"(src_uyvy),  // %0
4605         "+r"(dst_y),     // %1
4606         "+r"(width)      // %2
4607       :
4608       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4609 }
UYVYToUVRow_AVX2(const uint8_t * src_uyvy,int stride_uyvy,uint8_t * dst_u,uint8_t * dst_v,int width)4610 void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
4611                       int stride_uyvy,
4612                       uint8_t* dst_u,
4613                       uint8_t* dst_v,
4614                       int width) {
4615   asm volatile(
4616       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4617       "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
4618       "sub         %1,%2                         \n"
4619 
4620       LABELALIGN
4621       "1:                                        \n"
4622       "vmovdqu     (%0),%%ymm0                   \n"
4623       "vmovdqu     0x20(%0),%%ymm1               \n"
4624       "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0   \n"
4625       "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1   \n"
4626       "lea         0x40(%0),%0                   \n"
4627       "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
4628       "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
4629       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
4630       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4631       "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
4632       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
4633       "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
4634       "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
4635       "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
4636       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4637       "vextractf128 $0x0,%%ymm1,(%1)             \n"
4638       "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
4639       "lea         0x10(%1),%1                   \n"
4640       "sub         $0x20,%3                      \n"
4641       "jg          1b                            \n"
4642       "vzeroupper                                \n"
4643       : "+r"(src_uyvy),               // %0
4644         "+r"(dst_u),                  // %1
4645         "+r"(dst_v),                  // %2
4646         "+r"(width)                   // %3
4647       : "r"((intptr_t)(stride_uyvy))  // %4
4648       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4649 }
4650 
UYVYToUV422Row_AVX2(const uint8_t * src_uyvy,uint8_t * dst_u,uint8_t * dst_v,int width)4651 void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
4652                          uint8_t* dst_u,
4653                          uint8_t* dst_v,
4654                          int width) {
4655   asm volatile(
4656       "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
4657       "vpsrlw      $0x8,%%ymm5,%%ymm5            \n"
4658       "sub         %1,%2                         \n"
4659 
4660       LABELALIGN
4661       "1:                                        \n"
4662       "vmovdqu     (%0),%%ymm0                   \n"
4663       "vmovdqu     0x20(%0),%%ymm1               \n"
4664       "lea         0x40(%0),%0                   \n"
4665       "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
4666       "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
4667       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
4668       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4669       "vpand       %%ymm5,%%ymm0,%%ymm1          \n"
4670       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
4671       "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
4672       "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
4673       "vpermq      $0xd8,%%ymm1,%%ymm1           \n"
4674       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
4675       "vextractf128 $0x0,%%ymm1,(%1)             \n"
4676       "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
4677       "lea         0x10(%1),%1                   \n"
4678       "sub         $0x20,%3                      \n"
4679       "jg          1b                            \n"
4680       "vzeroupper                                \n"
4681       : "+r"(src_uyvy),  // %0
4682         "+r"(dst_u),     // %1
4683         "+r"(dst_v),     // %2
4684         "+r"(width)      // %3
4685       :
4686       : "memory", "cc", "xmm0", "xmm1", "xmm5");
4687 }
4688 #endif  // HAS_YUY2TOYROW_AVX2
4689 
4690 #ifdef HAS_ARGBBLENDROW_SSSE3
4691 // Shuffle table for isolating alpha.
4692 static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
4693                                     11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
4694 
4695 // Blend 8 pixels at a time
ARGBBlendRow_SSSE3(const uint8_t * src_argb0,const uint8_t * src_argb1,uint8_t * dst_argb,int width)4696 void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
4697                         const uint8_t* src_argb1,
4698                         uint8_t* dst_argb,
4699                         int width) {
4700   asm volatile(
4701       "pcmpeqb     %%xmm7,%%xmm7                 \n"
4702       "psrlw       $0xf,%%xmm7                   \n"
4703       "pcmpeqb     %%xmm6,%%xmm6                 \n"
4704       "psrlw       $0x8,%%xmm6                   \n"
4705       "pcmpeqb     %%xmm5,%%xmm5                 \n"
4706       "psllw       $0x8,%%xmm5                   \n"
4707       "pcmpeqb     %%xmm4,%%xmm4                 \n"
4708       "pslld       $0x18,%%xmm4                  \n"
4709       "sub         $0x4,%3                       \n"
4710       "jl          49f                           \n"
4711 
4712       // 4 pixel loop.
4713       LABELALIGN
4714       "40:                                       \n"
4715       "movdqu      (%0),%%xmm3                   \n"
4716       "lea         0x10(%0),%0                   \n"
4717       "movdqa      %%xmm3,%%xmm0                 \n"
4718       "pxor        %%xmm4,%%xmm3                 \n"
4719       "movdqu      (%1),%%xmm2                   \n"
4720       "pshufb      %4,%%xmm3                     \n"
4721       "pand        %%xmm6,%%xmm2                 \n"
4722       "paddw       %%xmm7,%%xmm3                 \n"
4723       "pmullw      %%xmm3,%%xmm2                 \n"
4724       "movdqu      (%1),%%xmm1                   \n"
4725       "lea         0x10(%1),%1                   \n"
4726       "psrlw       $0x8,%%xmm1                   \n"
4727       "por         %%xmm4,%%xmm0                 \n"
4728       "pmullw      %%xmm3,%%xmm1                 \n"
4729       "psrlw       $0x8,%%xmm2                   \n"
4730       "paddusb     %%xmm2,%%xmm0                 \n"
4731       "pand        %%xmm5,%%xmm1                 \n"
4732       "paddusb     %%xmm1,%%xmm0                 \n"
4733       "movdqu      %%xmm0,(%2)                   \n"
4734       "lea         0x10(%2),%2                   \n"
4735       "sub         $0x4,%3                       \n"
4736       "jge         40b                           \n"
4737 
4738       "49:                                       \n"
4739       "add         $0x3,%3                       \n"
4740       "jl          99f                           \n"
4741 
4742       // 1 pixel loop.
4743       "91:                                       \n"
4744       "movd        (%0),%%xmm3                   \n"
4745       "lea         0x4(%0),%0                    \n"
4746       "movdqa      %%xmm3,%%xmm0                 \n"
4747       "pxor        %%xmm4,%%xmm3                 \n"
4748       "movd        (%1),%%xmm2                   \n"
4749       "pshufb      %4,%%xmm3                     \n"
4750       "pand        %%xmm6,%%xmm2                 \n"
4751       "paddw       %%xmm7,%%xmm3                 \n"
4752       "pmullw      %%xmm3,%%xmm2                 \n"
4753       "movd        (%1),%%xmm1                   \n"
4754       "lea         0x4(%1),%1                    \n"
4755       "psrlw       $0x8,%%xmm1                   \n"
4756       "por         %%xmm4,%%xmm0                 \n"
4757       "pmullw      %%xmm3,%%xmm1                 \n"
4758       "psrlw       $0x8,%%xmm2                   \n"
4759       "paddusb     %%xmm2,%%xmm0                 \n"
4760       "pand        %%xmm5,%%xmm1                 \n"
4761       "paddusb     %%xmm1,%%xmm0                 \n"
4762       "movd        %%xmm0,(%2)                   \n"
4763       "lea         0x4(%2),%2                    \n"
4764       "sub         $0x1,%3                       \n"
4765       "jge         91b                           \n"
4766       "99:                                       \n"
4767       : "+r"(src_argb0),    // %0
4768         "+r"(src_argb1),    // %1
4769         "+r"(dst_argb),     // %2
4770         "+r"(width)         // %3
4771       : "m"(kShuffleAlpha)  // %4
4772       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
4773         "xmm7");
4774 }
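
// A scalar sketch of the blend math above (illustrative; not part of
// libyuv's build): the attenuated foreground is added at full strength and
// the background is scaled by (256 - alpha), which is what the XOR-with-255
// plus add-1 alpha trick computes in the SIMD loops.
static inline void ARGBBlendRow_Sketch_C(const uint8_t* src_argb0,
                                         const uint8_t* src_argb1,
                                         uint8_t* dst_argb,
                                         int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t a = src_argb0[3];  // Foreground alpha.
    int c;
    for (c = 0; c < 3; ++c) {  // B, G, R: fg + bg * (256 - a) / 256.
      uint32_t v = src_argb0[c] + ((src_argb1[c] * (256 - a)) >> 8);
      dst_argb[c] = (uint8_t)(v > 255 ? 255 : v);  // paddusb saturates too.
    }
    dst_argb[3] = 255;  // Output alpha is forced opaque (por with xmm4).
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}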
#endif  // HAS_ARGBBLENDROW_SSSE3

#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
void BlendPlaneRow_SSSE3(const uint8_t* src0,
                         const uint8_t* src1,
                         const uint8_t* alpha,
                         uint8_t* dst,
                         int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psllw       $0x8,%%xmm5                   \n"
      "mov         $0x80808080,%%eax             \n"
      "movd        %%eax,%%xmm6                  \n"
      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
      "mov         $0x807f807f,%%eax             \n"
      "movd        %%eax,%%xmm7                  \n"
      "pshufd      $0x0,%%xmm7,%%xmm7            \n"
      "sub         %2,%0                         \n"
      "sub         %2,%1                         \n"
      "sub         %2,%3                         \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq        (%2),%%xmm0                   \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "pxor        %%xmm5,%%xmm0                 \n"
      "movq        (%0,%2,1),%%xmm1              \n"
      "movq        (%1,%2,1),%%xmm2              \n"
      "punpcklbw   %%xmm2,%%xmm1                 \n"
      "psubb       %%xmm6,%%xmm1                 \n"
      "pmaddubsw   %%xmm1,%%xmm0                 \n"
      "paddw       %%xmm7,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%3,%2,1)              \n"
      "lea         0x8(%2),%2                    \n"
      "sub         $0x8,%4                       \n"
      "jg          1b                            \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
}
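
// A scalar sketch of the unsigned formula quoted above (illustrative; not
// part of libyuv): dst = (src0 * a + src1 * (255 - a) + 255) / 256.
static inline void BlendPlaneRow_Sketch_C(const uint8_t* src0,
                                          const uint8_t* src1,
                                          const uint8_t* alpha,
                                          uint8_t* dst,
                                          int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t a = alpha[x];
    dst[x] = (uint8_t)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}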
#endif  // HAS_BLENDPLANEROW_SSSE3

#ifdef HAS_BLENDPLANEROW_AVX2
// Blend 32 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
void BlendPlaneRow_AVX2(const uint8_t* src0,
                        const uint8_t* src1,
                        const uint8_t* alpha,
                        uint8_t* dst,
                        int width) {
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsllw      $0x8,%%ymm5,%%ymm5            \n"
      "mov         $0x80808080,%%eax             \n"
      "vmovd       %%eax,%%xmm6                  \n"
      "vbroadcastss %%xmm6,%%ymm6                \n"
      "mov         $0x807f807f,%%eax             \n"
      "vmovd       %%eax,%%xmm7                  \n"
      "vbroadcastss %%xmm7,%%ymm7                \n"
      "sub         %2,%0                         \n"
      "sub         %2,%1                         \n"
      "sub         %2,%3                         \n"

      // 32 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%2),%%ymm0                   \n"
      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm3          \n"
      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
      "vpxor       %%ymm5,%%ymm3,%%ymm3          \n"
      "vpxor       %%ymm5,%%ymm0,%%ymm0          \n"
      "vmovdqu     (%0,%2,1),%%ymm1              \n"
      "vmovdqu     (%1,%2,1),%%ymm2              \n"
      "vpunpckhbw  %%ymm2,%%ymm1,%%ymm4          \n"
      "vpunpcklbw  %%ymm2,%%ymm1,%%ymm1          \n"
      "vpsubb      %%ymm6,%%ymm4,%%ymm4          \n"
      "vpsubb      %%ymm6,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpmaddubsw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm7,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm7,%%ymm0,%%ymm0          \n"
      "vpsrlw      $0x8,%%ymm3,%%ymm3            \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm3,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%3,%2,1)              \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x20,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_BLENDPLANEROW_AVX2

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
                                     7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
                                     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
// Attenuate 4 pixels at a time.
void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "pcmpeqb     %%xmm3,%%xmm3                 \n"
      "pslld       $0x18,%%xmm3                  \n"
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "movdqu      (%0),%%xmm1                   \n"
      "punpcklbw   %%xmm1,%%xmm1                 \n"
      "pmulhuw     %%xmm1,%%xmm0                 \n"
      "movdqu      (%0),%%xmm1                   \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "movdqu      (%0),%%xmm2                   \n"
      "punpckhbw   %%xmm2,%%xmm2                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "movdqu      (%0),%%xmm2                   \n"
      "lea         0x10(%0),%0                   \n"
      "pand        %%xmm3,%%xmm2                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "por         %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),       // %0
        "+r"(dst_argb),       // %1
        "+r"(width)           // %2
      : "m"(kShuffleAlpha0),  // %3
        "m"(kShuffleAlpha1)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
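
// A scalar sketch of attenuation (illustrative; not libyuv's exact
// reference): each color channel is premultiplied by alpha.  The SIMD path
// approximates c * a / 255 with 16-bit fixed-point multiplies; plain integer
// division is used here for clarity.
static inline void ARGBAttenuateRow_Sketch_C(const uint8_t* src_argb,
                                             uint8_t* dst_argb,
                                             int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t a = src_argb[x * 4 + 3];
    dst_argb[x * 4 + 0] = (uint8_t)(src_argb[x * 4 + 0] * a / 255);
    dst_argb[x * 4 + 1] = (uint8_t)(src_argb[x * 4 + 1] * a / 255);
    dst_argb[x * 4 + 2] = (uint8_t)(src_argb[x * 4 + 2] * a / 255);
    dst_argb[x * 4 + 3] = (uint8_t)a;  // Alpha passes through unchanged.
  }
}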
#endif  // HAS_ARGBATTENUATEROW_SSSE3

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
                                         128u, 128u, 14u,  15u, 14u, 15u,
                                         14u,  15u,  128u, 128u};
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4                  \n"
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpslld      $0x18,%%ymm5,%%ymm5           \n"
      "sub         %0,%1                         \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm6                   \n"
      "vpunpcklbw  %%ymm6,%%ymm6,%%ymm0          \n"
      "vpunpckhbw  %%ymm6,%%ymm6,%%ymm1          \n"
      "vpshufb     %%ymm4,%%ymm0,%%ymm2          \n"
      "vpshufb     %%ymm4,%%ymm1,%%ymm3          \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
      "vpand       %%ymm5,%%ymm6,%%ymm6          \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpor        %%ymm6,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),          // %0
        "+r"(dst_argb),          // %1
        "+r"(width)              // %2
      : "m"(kShuffleAlpha_AVX2)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width) {
  uintptr_t alpha;
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movzb       0x03(%0),%3                   \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "movd        0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x07(%0),%3                   \n"
      "movd        0x00(%4,%3,4),%%xmm3          \n"
      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
      "movlhps     %%xmm3,%%xmm2                 \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "movdqu      (%0),%%xmm1                   \n"
      "movzb       0x0b(%0),%3                   \n"
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "movd        0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x0f(%0),%3                   \n"
      "movd        0x00(%4,%3,4),%%xmm3          \n"
      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
      "movlhps     %%xmm3,%%xmm2                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "lea         0x10(%0),%0                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),     // %0
        "+r"(dst_argb),     // %1
        "+r"(width),        // %2
        "=&r"(alpha)        // %3
      : "r"(fixed_invtbl8)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
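
// A scalar sketch of unattenuation (illustrative; not part of libyuv):
// fixed_invtbl8 holds fixed-point reciprocals of alpha so the SIMD loop can
// multiply instead of divide; the equivalent division is written out here.
static inline void ARGBUnattenuateRow_Sketch_C(const uint8_t* src_argb,
                                               uint8_t* dst_argb,
                                               int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t a = src_argb[x * 4 + 3];
    int c;
    for (c = 0; c < 3; ++c) {
      uint32_t v = src_argb[x * 4 + c];
      if (a) {
        v = v * 255 / a;  // Undo the premultiply.
        if (v > 255) {
          v = 255;  // Clamp, as packuswb does in the SIMD loop.
        }
      }
      dst_argb[x * 4 + c] = (uint8_t)v;
    }
    dst_argb[x * 4 + 3] = (uint8_t)a;
  }
}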
#endif  // HAS_ARGBUNATTENUATEROW_SSE2

#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// Unattenuate 8 pixels at a time.
void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int width) {
  uintptr_t alpha;
  asm volatile(
      "sub         %0,%1                         \n"
      "vbroadcastf128 %5,%%ymm5                  \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      // replace VPGATHER
      "movzb       0x03(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm0          \n"
      "movzb       0x07(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm1          \n"
      "movzb       0x0b(%0),%3                   \n"
      "vpunpckldq  %%xmm1,%%xmm0,%%xmm6          \n"
      "vmovd       0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x0f(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm3          \n"
      "movzb       0x13(%0),%3                   \n"
      "vpunpckldq  %%xmm3,%%xmm2,%%xmm7          \n"
      "vmovd       0x00(%4,%3,4),%%xmm0          \n"
      "movzb       0x17(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm1          \n"
      "movzb       0x1b(%0),%3                   \n"
      "vpunpckldq  %%xmm1,%%xmm0,%%xmm0          \n"
      "vmovd       0x00(%4,%3,4),%%xmm2          \n"
      "movzb       0x1f(%0),%3                   \n"
      "vmovd       0x00(%4,%3,4),%%xmm3          \n"
      "vpunpckldq  %%xmm3,%%xmm2,%%xmm2          \n"
      "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
      "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
      "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
      // end of VPGATHER

      "vmovdqu     (%0),%%ymm6                   \n"
      "vpunpcklbw  %%ymm6,%%ymm6,%%ymm0          \n"
      "vpunpckhbw  %%ymm6,%%ymm6,%%ymm1          \n"
      "vpunpcklwd  %%ymm3,%%ymm3,%%ymm2          \n"
      "vpunpckhwd  %%ymm3,%%ymm3,%%ymm3          \n"
      "vpshufb     %%ymm5,%%ymm2,%%ymm2          \n"
      "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,0x00(%0,%1,1)          \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),                 // %0
        "+r"(dst_argb),                 // %1
        "+r"(width),                    // %2
        "=&r"(alpha)                    // %3
      : "r"(fixed_invtbl8),             // %4
        "m"(kUnattenShuffleAlpha_AVX2)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBUNATTENUATEROW_AVX2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "psubb       %%xmm5,%%xmm0                 \n"
      "psubb       %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
      "movdqu      %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm1,%%xmm0                 \n"
      "phaddw      %%xmm0,%%xmm6                 \n"
      "paddw       %%xmm5,%%xmm6                 \n"
      "psrlw       $0x8,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movdqu      (%0),%%xmm2                   \n"
      "movdqu      0x10(%0),%%xmm3               \n"
      "lea         0x20(%0),%0                   \n"
      "psrld       $0x18,%%xmm2                  \n"
      "psrld       $0x18,%%xmm3                  \n"
      "packuswb    %%xmm3,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm2                 \n"
      "movdqa      %%xmm6,%%xmm3                 \n"
      "punpcklbw   %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm2,%%xmm3                 \n"
      "movdqa      %%xmm6,%%xmm1                 \n"
      "punpcklwd   %%xmm3,%%xmm6                 \n"
      "punpckhwd   %%xmm3,%%xmm1                 \n"
      "movdqu      %%xmm6,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
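
// A scalar sketch of the gray conversion (illustrative; not part of libyuv):
// B, G and R are replaced with the JPEG luma
// (B * 29 + G * 150 + R * 77 + 128) >> 8, matching kARGBToYJ; alpha is kept.
static inline void ARGBGrayRow_Sketch_C(const uint8_t* src_argb,
                                        uint8_t* dst_argb,
                                        int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t y = (uint8_t)((src_argb[x * 4 + 0] * 29 +
                           src_argb[x * 4 + 1] * 150 +
                           src_argb[x * 4 + 2] * 77 + 128) >> 8);
    dst_argb[x * 4 + 0] = y;
    dst_argb[x * 4 + 1] = y;
    dst_argb[x * 4 + 2] = y;
    dst_argb[x * 4 + 3] = src_argb[x * 4 + 3];  // Alpha passes through.
  }
}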
#endif  // HAS_ARGBGRAYROW_SSSE3

#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone
static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                                   17, 68, 35, 0, 17, 68, 35, 0};

static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                                   22, 88, 45, 0, 22, 88, 45, 0};

static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                                   24, 98, 50, 0, 24, 98, 50, 0};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  asm volatile(
      "movdqa      %2,%%xmm2                     \n"
      "movdqa      %3,%%xmm3                     \n"
      "movdqa      %4,%%xmm4                     \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "pmaddubsw   %%xmm2,%%xmm0                 \n"
      "pmaddubsw   %%xmm2,%%xmm6                 \n"
      "phaddw      %%xmm6,%%xmm0                 \n"
      "psrlw       $0x7,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movdqu      (%0),%%xmm5                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "pmaddubsw   %%xmm3,%%xmm5                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "phaddw      %%xmm1,%%xmm5                 \n"
      "psrlw       $0x7,%%xmm5                   \n"
      "packuswb    %%xmm5,%%xmm5                 \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "movdqu      (%0),%%xmm5                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "pmaddubsw   %%xmm4,%%xmm5                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "phaddw      %%xmm1,%%xmm5                 \n"
      "psrlw       $0x7,%%xmm5                   \n"
      "packuswb    %%xmm5,%%xmm5                 \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "psrld       $0x18,%%xmm6                  \n"
      "psrld       $0x18,%%xmm1                  \n"
      "packuswb    %%xmm1,%%xmm6                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm6,%%xmm5                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklwd   %%xmm5,%%xmm0                 \n"
      "punpckhwd   %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%0)                   \n"
      "movdqu      %%xmm1,0x10(%0)               \n"
      "lea         0x20(%0),%0                   \n"
      "sub         $0x8,%1                       \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),      // %0
        "+r"(width)          // %1
      : "m"(kARGBToSepiaB),  // %2
        "m"(kARGBToSepiaG),  // %3
        "m"(kARGBToSepiaR)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
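
// A scalar sketch of the sepia formulas quoted above (illustrative; not part
// of libyuv).  Values can exceed 255, so clamp as packuswb does.
static inline void ARGBSepiaRow_Sketch_C(uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int b = dst_argb[x * 4 + 0];
    int g = dst_argb[x * 4 + 1];
    int r = dst_argb[x * 4 + 2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[x * 4 + 0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[x * 4 + 1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[x * 4 + 2] = (uint8_t)(sr > 255 ? 255 : sr);
    // Alpha (byte 3) is left as-is, matching the SIMD path.
  }
}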
#endif  // HAS_ARGBSEPIAROW_SSSE3

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              const int8_t* matrix_argb,
                              int width) {
  asm volatile(
      "movdqu      (%3),%%xmm5                   \n"
      "pshufd      $0x00,%%xmm5,%%xmm2           \n"
      "pshufd      $0x55,%%xmm5,%%xmm3           \n"
      "pshufd      $0xaa,%%xmm5,%%xmm4           \n"
      "pshufd      $0xff,%%xmm5,%%xmm5           \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm7               \n"
      "pmaddubsw   %%xmm2,%%xmm0                 \n"
      "pmaddubsw   %%xmm2,%%xmm7                 \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "pmaddubsw   %%xmm3,%%xmm6                 \n"
      "pmaddubsw   %%xmm3,%%xmm1                 \n"
      "phaddsw     %%xmm7,%%xmm0                 \n"
      "phaddsw     %%xmm1,%%xmm6                 \n"
      "psraw       $0x6,%%xmm0                   \n"
      "psraw       $0x6,%%xmm6                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm6,%%xmm0                 \n"
      "movdqu      (%0),%%xmm1                   \n"
      "movdqu      0x10(%0),%%xmm7               \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm7                 \n"
      "phaddsw     %%xmm7,%%xmm1                 \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x10(%0),%%xmm7               \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
      "pmaddubsw   %%xmm5,%%xmm7                 \n"
      "phaddsw     %%xmm7,%%xmm6                 \n"
      "psraw       $0x6,%%xmm1                   \n"
      "psraw       $0x6,%%xmm6                   \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "punpcklbw   %%xmm6,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm6                 \n"
      "punpcklwd   %%xmm1,%%xmm0                 \n"
      "punpckhwd   %%xmm1,%%xmm6                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm6,0x10(%1)               \n"
      "lea         0x20(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      : "r"(matrix_argb)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
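
// A scalar sketch of the color matrix transform (illustrative; not part of
// libyuv, and it ignores the 16-bit intermediate saturation of pmaddubsw):
// each output channel is a signed dot product of the B,G,R,A bytes with one
// 4-entry row of matrix_argb, shifted right by 6 and clamped.
static inline void ARGBColorMatrixRow_Sketch_C(const uint8_t* src_argb,
                                               uint8_t* dst_argb,
                                               const int8_t* matrix_argb,
                                               int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int c;
    for (c = 0; c < 4; ++c) {
      const int8_t* m = matrix_argb + c * 4;
      int v = (src_argb[x * 4 + 0] * m[0] + src_argb[x * 4 + 1] * m[1] +
               src_argb[x * 4 + 2] * m[2] + src_argb[x * 4 + 3] * m[3]) >> 6;
      dst_argb[x * 4 + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}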
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile(
      "movd        %2,%%xmm2                     \n"
      "movd        %3,%%xmm3                     \n"
      "movd        %4,%%xmm4                     \n"
      "pshuflw     $0x40,%%xmm2,%%xmm2           \n"
      "pshufd      $0x44,%%xmm2,%%xmm2           \n"
      "pshuflw     $0x40,%%xmm3,%%xmm3           \n"
      "pshufd      $0x44,%%xmm3,%%xmm3           \n"
      "pshuflw     $0x40,%%xmm4,%%xmm4           \n"
      "pshufd      $0x44,%%xmm4,%%xmm4           \n"
      "pxor        %%xmm5,%%xmm5                 \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "pslld       $0x18,%%xmm6                  \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "movdqu      (%0),%%xmm1                   \n"
      "punpckhbw   %%xmm5,%%xmm1                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "pmullw      %%xmm3,%%xmm0                 \n"
      "movdqu      (%0),%%xmm7                   \n"
      "pmullw      %%xmm3,%%xmm1                 \n"
      "pand        %%xmm6,%%xmm7                 \n"
      "paddw       %%xmm4,%%xmm0                 \n"
      "paddw       %%xmm4,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "por         %%xmm7,%%xmm0                 \n"
      "movdqu      %%xmm0,(%0)                   \n"
      "lea         0x10(%0),%0                   \n"
      "sub         $0x4,%1                       \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),       // %0
        "+r"(width)           // %1
      : "r"(scale),           // %2
        "r"(interval_size),   // %3
        "r"(interval_offset)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
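
// A scalar sketch of the quantization (illustrative; not part of libyuv):
// v = (v * scale >> 16) * interval_size + interval_offset for B, G and R,
// with alpha preserved, matching the pmulhuw/pmullw/paddw sequence above.
static inline void ARGBQuantizeRow_Sketch_C(uint8_t* dst_argb,
                                            int scale,
                                            int interval_size,
                                            int interval_offset,
                                            int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int c;
    for (c = 0; c < 3; ++c) {  // Alpha (byte 3) is left untouched.
      int v = dst_argb[x * 4 + c];
      dst_argb[x * 4 + c] =
          (uint8_t)((v * scale >> 16) * interval_size + interval_offset);
    }
  }
}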
#endif  // HAS_ARGBQUANTIZEROW_SSE2

#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
void ARGBShadeRow_SSE2(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  asm volatile(
      "movd        %3,%%xmm2                     \n"
      "punpcklbw   %%xmm2,%%xmm2                 \n"
      "punpcklqdq  %%xmm2,%%xmm2                 \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "pmulhuw     %%xmm2,%%xmm1                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(value)       // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
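
// A scalar sketch of the shade operation (illustrative; not part of libyuv):
// every channel is multiplied by the matching byte of 'value' using the same
// doubled-byte fixed point as the SIMD loop, approximating c * shade / 255.
static inline void ARGBShadeRow_Sketch_C(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width,
                                         uint32_t value) {
  int x;
  for (x = 0; x < width; ++x) {
    int c;
    for (c = 0; c < 4; ++c) {
      uint32_t p = src_argb[x * 4 + c] * 0x101u;          // Byte -> 16 bits.
      uint32_t s = ((value >> (c * 8)) & 0xff) * 0x101u;  // Shade channel.
      dst_argb[x * 4 + c] = (uint8_t)((p * s) >> 24);
    }
  }
}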
#endif  // HAS_ARGBSHADEROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(

      "pxor        %%xmm5,%%xmm5                 \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      (%1),%%xmm2                   \n"
      "lea         0x10(%1),%1                   \n"
      "movdqu      %%xmm0,%%xmm1                 \n"
      "movdqu      %%xmm2,%%xmm3                 \n"
      "punpcklbw   %%xmm0,%%xmm0                 \n"
      "punpckhbw   %%xmm1,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "punpckhbw   %%xmm5,%%xmm3                 \n"
      "pmulhuw     %%xmm2,%%xmm0                 \n"
      "pmulhuw     %%xmm3,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
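
// A scalar sketch of the multiply (illustrative; not part of libyuv):
// one source is widened to a * 257 and the other left as-is, so pmulhuw
// yields (a * 257 * b) >> 16, a close approximation of a * b / 255.
static inline void ARGBMultiplyRow_Sketch_C(const uint8_t* src_argb0,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {  // All 4 channels, alpha included.
    dst_argb[i] = (uint8_t)((src_argb0[i] * 0x101u * src_argb1[i]) >> 16);
  }
}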
#endif  // HAS_ARGBMULTIPLYROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(

      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm1                   \n"
      "lea         0x20(%0),%0                   \n"
      "vmovdqu     (%1),%%ymm3                   \n"
      "lea         0x20(%1),%1                   \n"
      "vpunpcklbw  %%ymm1,%%ymm1,%%ymm0          \n"
      "vpunpckhbw  %%ymm1,%%ymm1,%%ymm1          \n"
      "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
      "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc"
#if defined(__AVX2__)
        ,
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBMULTIPLYROW_AVX2

#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBAddRow_SSE2(const uint8_t* src_argb0,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "lea         0x10(%1),%1                   \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBADDROW_SSE2

#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_AVX2(const uint8_t* src_argb0,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "lea         0x20(%0),%0                   \n"
      "vpaddusb    (%1),%%ymm0,%%ymm0            \n"
      "lea         0x20(%1),%1                   \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0");
}
#endif  // HAS_ARGBADDROW_AVX2

#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      (%1),%%xmm1                   \n"
      "lea         0x10(%1),%1                   \n"
      "psubusb     %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2

#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "lea         0x20(%0),%0                   \n"
      "vpsubusb    (%1),%%ymm0,%%ymm0            \n"
      "lea         0x20(%1),%1                   \n"
      "vmovdqu     %%ymm0,(%2)                   \n"
      "lea         0x20(%2),%2                   \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0");
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
void SobelXRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "sub         %0,%3                         \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "movq        0x2(%0),%%xmm1                \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "psubw       %%xmm1,%%xmm0                 \n"
      "movq        0x00(%0,%1,1),%%xmm1          \n"
      "movq        0x02(%0,%1,1),%%xmm2          \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "psubw       %%xmm2,%%xmm1                 \n"
      "movq        0x00(%0,%2,1),%%xmm2          \n"
      "movq        0x02(%0,%2,1),%%xmm3          \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "punpcklbw   %%xmm5,%%xmm3                 \n"
      "psubw       %%xmm3,%%xmm2                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm1,%%xmm0                 \n"
      "paddw       %%xmm1,%%xmm0                 \n"
      "pxor        %%xmm1,%%xmm1                 \n"
      "psubw       %%xmm0,%%xmm1                 \n"
      "pmaxsw      %%xmm1,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,0x00(%0,%3,1)          \n"
      "lea         0x8(%0),%0                    \n"
      "sub         $0x8,%4                       \n"
      "jg          1b                            \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(src_y2),      // %2
        "+r"(dst_sobelx),  // %3
        "+r"(width)        // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
5614 #endif  // HAS_SOBELXROW_SSE2
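
// A scalar sketch of the loop above: horizontal differences of the three
// rows, middle row weighted twice, then the absolute value saturated to 255.
// Hypothetical helper for illustration only; not part of the build.
static void SobelXRow_Sketch_C(const uint8_t* src_y0,
                               const uint8_t* src_y1,
                               const uint8_t* src_y2,
                               uint8_t* dst_sobelx,
                               int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + b + b + c;      // Matches the three paddw above.
    if (sobel < 0) sobel = -sobel;  // pxor/psubw/pmaxsw absolute value.
    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);  // packuswb.
  }
}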

#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
void SobelYRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
                    int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      "sub         %0,%2                         \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "movq        0x00(%0,%1,1),%%xmm1          \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "psubw       %%xmm1,%%xmm0                 \n"
      "movq        0x1(%0),%%xmm1                \n"
      "movq        0x01(%0,%1,1),%%xmm2          \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "psubw       %%xmm2,%%xmm1                 \n"
      "movq        0x2(%0),%%xmm2                \n"
      "movq        0x02(%0,%1,1),%%xmm3          \n"
      "punpcklbw   %%xmm5,%%xmm2                 \n"
      "punpcklbw   %%xmm5,%%xmm3                 \n"
      "psubw       %%xmm3,%%xmm2                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm1,%%xmm0                 \n"
      "paddw       %%xmm1,%%xmm0                 \n"
      "pxor        %%xmm1,%%xmm1                 \n"
      "psubw       %%xmm0,%%xmm1                 \n"
      "pmaxsw      %%xmm1,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,0x00(%0,%2,1)          \n"
      "lea         0x8(%0),%0                    \n"
      "sub         $0x8,%3                       \n"
      "jg          1b                            \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(dst_sobely),  // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SOBELYROW_SSE2
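
// SobelY is the transpose: vertical differences between the top and bottom
// rows, with the middle column weighted twice. A scalar sketch (hypothetical
// helper for illustration only):
static void SobelYRow_Sketch_C(const uint8_t* src_y0,
                               const uint8_t* src_y1,
                               uint8_t* dst_sobely,
                               int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i + 0] - src_y1[i + 0];
    int b = src_y0[i + 1] - src_y1[i + 1];
    int c = src_y0[i + 2] - src_y1[i + 2];
    int sobel = a + b + b + c;
    if (sobel < 0) sobel = -sobel;
    dst_sobely[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
  }
}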

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_SSE2(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "pslld       $0x18,%%xmm5                  \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpcklbw   %%xmm0,%%xmm2                 \n"
      "punpckhbw   %%xmm0,%%xmm0                 \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "punpcklwd   %%xmm2,%%xmm1                 \n"
      "punpckhwd   %%xmm2,%%xmm2                 \n"
      "por         %%xmm5,%%xmm1                 \n"
      "por         %%xmm5,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm3                 \n"
      "punpcklwd   %%xmm0,%%xmm3                 \n"
      "punpckhwd   %%xmm0,%%xmm0                 \n"
      "por         %%xmm5,%%xmm3                 \n"
      "por         %%xmm5,%%xmm0                 \n"
      "movdqu      %%xmm1,(%2)                   \n"
      "movdqu      %%xmm2,0x10(%2)               \n"
      "movdqu      %%xmm3,0x20(%2)               \n"
      "movdqu      %%xmm0,0x30(%2)               \n"
      "lea         0x40(%2),%2                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SOBELROW_SSE2
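
// Scalar sketch of the loop above: the X and Y gradients are added with
// unsigned saturation (paddusb), then the byte is replicated into B, G and R
// with alpha forced to 255 by the xmm5 mask. Hypothetical helper for
// illustration only.
static void SobelRow_Sketch_C(const uint8_t* src_sobelx,
                              const uint8_t* src_sobely,
                              uint8_t* dst_argb,
                              int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    uint8_t v = (uint8_t)(s > 255 ? 255 : s);
    dst_argb[i * 4 + 0] = v;    // B
    dst_argb[i * 4 + 1] = v;    // G
    dst_argb[i * 4 + 2] = v;    // R
    dst_argb[i * 4 + 3] = 255;  // A
  }
}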

#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
                          int width) {
  asm volatile(
      "sub         %0,%1                         \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_y),       // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_SOBELTOPLANEROW_SSE2

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_SSE2(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      "pcmpeqb     %%xmm5,%%xmm5                 \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm0,%%xmm2                 \n"
      "paddusb     %%xmm1,%%xmm2                 \n"
      "movdqa      %%xmm0,%%xmm3                 \n"
      "punpcklbw   %%xmm5,%%xmm3                 \n"
      "punpckhbw   %%xmm5,%%xmm0                 \n"
      "movdqa      %%xmm1,%%xmm4                 \n"
      "punpcklbw   %%xmm2,%%xmm4                 \n"
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "movdqa      %%xmm4,%%xmm6                 \n"
      "punpcklwd   %%xmm3,%%xmm6                 \n"
      "punpckhwd   %%xmm3,%%xmm4                 \n"
      "movdqa      %%xmm1,%%xmm7                 \n"
      "punpcklwd   %%xmm0,%%xmm7                 \n"
      "punpckhwd   %%xmm0,%%xmm1                 \n"
      "movdqu      %%xmm6,(%2)                   \n"
      "movdqu      %%xmm4,0x10(%2)               \n"
      "movdqu      %%xmm7,0x20(%2)               \n"
      "movdqu      %%xmm1,0x30(%2)               \n"
      "lea         0x40(%2),%2                   \n"
      "sub         $0x10,%3                      \n"
      "jg          1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_SOBELXYROW_SSE2
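
// Scalar sketch of the interleave above: R carries Sobel X, B carries
// Sobel Y, G their saturated sum, and A is 255. Hypothetical helper for
// illustration only.
static void SobelXYRow_Sketch_C(const uint8_t* src_sobelx,
                                const uint8_t* src_sobely,
                                uint8_t* dst_argb,
                                int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    dst_argb[i * 4 + 0] = src_sobely[i];                 // B = Sobel Y
    dst_argb[i * 4 + 1] = (uint8_t)(s > 255 ? 255 : s);  // G = Sobel
    dst_argb[i * 4 + 2] = src_sobelx[i];                 // R = Sobel X
    dst_argb[i * 4 + 3] = 255;                           // A
  }
}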

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
                                  int32_t* cumsum,
                                  const int32_t* previous_cumsum,
                                  int width) {
  asm volatile(
      "pxor        %%xmm0,%%xmm0                 \n"
      "pxor        %%xmm1,%%xmm1                 \n"
      "sub         $0x4,%3                       \n"
      "jl          49f                           \n"
      "test        $0xf,%1                       \n"
      "jne         49f                           \n"

      // 4 pixel loop.
      LABELALIGN
      "40:                                       \n"
      "movdqu      (%0),%%xmm2                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqa      %%xmm2,%%xmm4                 \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "punpcklwd   %%xmm1,%%xmm2                 \n"
      "punpckhwd   %%xmm1,%%xmm3                 \n"
      "punpckhbw   %%xmm1,%%xmm4                 \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "punpcklwd   %%xmm1,%%xmm4                 \n"
      "punpckhwd   %%xmm1,%%xmm5                 \n"
      "paddd       %%xmm2,%%xmm0                 \n"
      "movdqu      (%2),%%xmm2                   \n"
      "paddd       %%xmm0,%%xmm2                 \n"
      "paddd       %%xmm3,%%xmm0                 \n"
      "movdqu      0x10(%2),%%xmm3               \n"
      "paddd       %%xmm0,%%xmm3                 \n"
      "paddd       %%xmm4,%%xmm0                 \n"
      "movdqu      0x20(%2),%%xmm4               \n"
      "paddd       %%xmm0,%%xmm4                 \n"
      "paddd       %%xmm5,%%xmm0                 \n"
      "movdqu      0x30(%2),%%xmm5               \n"
      "lea         0x40(%2),%2                   \n"
      "paddd       %%xmm0,%%xmm5                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "movdqu      %%xmm3,0x10(%1)               \n"
      "movdqu      %%xmm4,0x20(%1)               \n"
      "movdqu      %%xmm5,0x30(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x4,%3                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%3                       \n"
      "jl          19f                           \n"

      // 1 pixel loop.
      LABELALIGN
      "10:                                       \n"
      "movd        (%0),%%xmm2                   \n"
      "lea         0x4(%0),%0                    \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"
      "punpcklwd   %%xmm1,%%xmm2                 \n"
      "paddd       %%xmm2,%%xmm0                 \n"
      "movdqu      (%2),%%xmm2                   \n"
      "lea         0x10(%2),%2                   \n"
      "paddd       %%xmm0,%%xmm2                 \n"
      "movdqu      %%xmm2,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x1,%3                       \n"
      "jge         10b                           \n"

      "19:                                       \n"
      : "+r"(row),              // %0
        "+r"(cumsum),           // %1
        "+r"(previous_cumsum),  // %2
        "+r"(width)             // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
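
// In scalar form, the row pass keeps a running per-channel sum of this row
// and adds the row above's cumulative sums, producing one row of a
// summed-area (integral) table. Hypothetical helper for illustration only.
static void ComputeCumulativeSumRow_Sketch_C(const uint8_t* row,
                                             int32_t* cumsum,
                                             const int32_t* previous_cumsum,
                                             int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];  // Running sum along the row (xmm0 above).
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}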

#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
                                    const int32_t* botleft,
                                    int width,
                                    int area,
                                    uint8_t* dst,
                                    int count) {
  asm volatile(
      "movd        %5,%%xmm5                     \n"
      "cvtdq2ps    %%xmm5,%%xmm5                 \n"
      "rcpss       %%xmm5,%%xmm4                 \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "sub         $0x4,%3                       \n"
      "jl          49f                           \n"
      "cmpl        $0x80,%5                      \n"
      "ja          40f                           \n"

      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "pcmpeqb     %%xmm6,%%xmm6                 \n"
      "psrld       $0x10,%%xmm6                  \n"
      "cvtdq2ps    %%xmm6,%%xmm6                 \n"
      "addps       %%xmm6,%%xmm5                 \n"
      "mulps       %%xmm4,%%xmm5                 \n"
      "cvtps2dq    %%xmm5,%%xmm5                 \n"
      "packssdw    %%xmm5,%%xmm5                 \n"

      // 4 pixel small loop.
      LABELALIGN
      "4:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"
      "psubd       0x10(%0,%4,4),%%xmm1          \n"
      "psubd       0x20(%0,%4,4),%%xmm2          \n"
      "psubd       0x30(%0,%4,4),%%xmm3          \n"
      "lea         0x40(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"
      "psubd       0x10(%1),%%xmm1               \n"
      "psubd       0x20(%1),%%xmm2               \n"
      "psubd       0x30(%1),%%xmm3               \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"
      "paddd       0x10(%1,%4,4),%%xmm1          \n"
      "paddd       0x20(%1,%4,4),%%xmm2          \n"
      "paddd       0x30(%1,%4,4),%%xmm3          \n"
      "lea         0x40(%1),%1                   \n"
      "packssdw    %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "pmulhuw     %%xmm5,%%xmm0                 \n"
      "pmulhuw     %%xmm5,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         4b                            \n"
      "jmp         49f                           \n"

      // 4 pixel loop
      LABELALIGN
      "40:                                       \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x20(%0),%%xmm2               \n"
      "movdqu      0x30(%0),%%xmm3               \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"
      "psubd       0x10(%0,%4,4),%%xmm1          \n"
      "psubd       0x20(%0,%4,4),%%xmm2          \n"
      "psubd       0x30(%0,%4,4),%%xmm3          \n"
      "lea         0x40(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"
      "psubd       0x10(%1),%%xmm1               \n"
      "psubd       0x20(%1),%%xmm2               \n"
      "psubd       0x30(%1),%%xmm3               \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"
      "paddd       0x10(%1,%4,4),%%xmm1          \n"
      "paddd       0x20(%1,%4,4),%%xmm2          \n"
      "paddd       0x30(%1,%4,4),%%xmm3          \n"
      "lea         0x40(%1),%1                   \n"
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
      "cvtdq2ps    %%xmm1,%%xmm1                 \n"
      "mulps       %%xmm4,%%xmm0                 \n"
      "mulps       %%xmm4,%%xmm1                 \n"
      "cvtdq2ps    %%xmm2,%%xmm2                 \n"
      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
      "mulps       %%xmm4,%%xmm2                 \n"
      "mulps       %%xmm4,%%xmm3                 \n"
      "cvtps2dq    %%xmm0,%%xmm0                 \n"
      "cvtps2dq    %%xmm1,%%xmm1                 \n"
      "cvtps2dq    %%xmm2,%%xmm2                 \n"
      "cvtps2dq    %%xmm3,%%xmm3                 \n"
      "packssdw    %%xmm1,%%xmm0                 \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "packuswb    %%xmm2,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%3                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%3                       \n"
      "jl          19f                           \n"

      // 1 pixel loop
      LABELALIGN
      "10:                                       \n"
      "movdqu      (%0),%%xmm0                   \n"
      "psubd       0x00(%0,%4,4),%%xmm0          \n"
      "lea         0x10(%0),%0                   \n"
      "psubd       (%1),%%xmm0                   \n"
      "paddd       0x00(%1,%4,4),%%xmm0          \n"
      "lea         0x10(%1),%1                   \n"
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
      "mulps       %%xmm4,%%xmm0                 \n"
      "cvtps2dq    %%xmm0,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movd        %%xmm0,(%2)                   \n"
      "lea         0x4(%2),%2                    \n"
      "sub         $0x1,%3                       \n"
      "jge         10b                           \n"
      "19:                                       \n"
      : "+r"(topleft),           // %0
        "+r"(botleft),           // %1
        "+r"(dst),               // %2
        "+rm"(count)             // %3
      : "r"((intptr_t)(width)),  // %4
        "rm"(area)               // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
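
// Scalar sketch of the averaging pass: the classic four-corner summed-area
// lookup yields the box sum, which is scaled by 1/area. This assumes width
// is pre-scaled to int32 elements (4 per pixel), which is what the (%N,%4,4)
// addressing above implies; the fixed-point pmulhuw fast path for small
// areas is omitted. Hypothetical helper for illustration only.
static void CumulativeSumToAverageRow_Sketch_C(const int32_t* topleft,
                                               const int32_t* botleft,
                                               int width,
                                               int area,
                                               uint8_t* dst,
                                               int count) {
  float ooa = 1.0f / (float)area;
  int i, c;
  for (i = 0; i < count; ++i) {
    for (c = 0; c < 4; ++c) {
      int32_t sum = topleft[c] - topleft[width + c] - botleft[c] +
                    botleft[width + c];
      dst[c] = (uint8_t)((float)sum * ooa);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}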

#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8_t* src_argb,
                        int src_argb_stride,
                        uint8_t* dst_argb,
                        const float* src_dudv,
                        int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile(
      "movq        (%3),%%xmm2                   \n"
      "movq        0x08(%3),%%xmm7               \n"
      "shl         $0x10,%1                      \n"
      "add         $0x4,%1                       \n"
      "movd        %1,%%xmm5                     \n"
      "sub         $0x4,%4                       \n"
      "jl          49f                           \n"

      "pshufd      $0x44,%%xmm7,%%xmm7           \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "movdqa      %%xmm2,%%xmm0                 \n"
      "addps       %%xmm7,%%xmm0                 \n"
      "movlhps     %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm7,%%xmm4                 \n"
      "addps       %%xmm4,%%xmm4                 \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "addps       %%xmm4,%%xmm3                 \n"
      "addps       %%xmm4,%%xmm4                 \n"

      // 4 pixel loop
      LABELALIGN
      "40:                                       \n"
      "cvttps2dq   %%xmm2,%%xmm0                 \n"  // x,y float->int first 2
      "cvttps2dq   %%xmm3,%%xmm1                 \n"  // x,y float->int next 2
      "packssdw    %%xmm1,%%xmm0                 \n"  // x, y as 8 shorts
      "pmaddwd     %%xmm5,%%xmm0                 \n"  // off = x*4 + y*stride
      "movd        %%xmm0,%k1                    \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
      "movd        %%xmm0,%k5                    \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
      "movd        0x00(%0,%1,1),%%xmm1          \n"
      "movd        0x00(%0,%5,1),%%xmm6          \n"
      "punpckldq   %%xmm6,%%xmm1                 \n"
      "addps       %%xmm4,%%xmm2                 \n"
      "movq        %%xmm1,(%2)                   \n"
      "movd        %%xmm0,%k1                    \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"
      "movd        %%xmm0,%k5                    \n"
      "movd        0x00(%0,%1,1),%%xmm0          \n"
      "movd        0x00(%0,%5,1),%%xmm6          \n"
      "punpckldq   %%xmm6,%%xmm0                 \n"
      "addps       %%xmm4,%%xmm3                 \n"
      "movq        %%xmm0,0x08(%2)               \n"
      "lea         0x10(%2),%2                   \n"
      "sub         $0x4,%4                       \n"
      "jge         40b                           \n"

      "49:                                       \n"
      "add         $0x3,%4                       \n"
      "jl          19f                           \n"

      // 1 pixel loop
      LABELALIGN
      "10:                                       \n"
      "cvttps2dq   %%xmm2,%%xmm0                 \n"
      "packssdw    %%xmm0,%%xmm0                 \n"
      "pmaddwd     %%xmm5,%%xmm0                 \n"
      "addps       %%xmm7,%%xmm2                 \n"
      "movd        %%xmm0,%k1                    \n"
      "movd        0x00(%0,%1,1),%%xmm0          \n"
      "movd        %%xmm0,(%2)                   \n"
      "lea         0x04(%2),%2                   \n"
      "sub         $0x1,%4                       \n"
      "jge         10b                           \n"
      "19:                                       \n"
      : "+r"(src_argb),              // %0
        "+r"(src_argb_stride_temp),  // %1
        "+r"(dst_argb),              // %2
        "+r"(src_dudv),              // %3
        "+rm"(width),                // %4
        "=&r"(temp)                  // %5
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBAFFINEROW_SSE2
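
// Scalar sketch of the affine row copy: (u, v) starts at src_dudv[0..1] and
// steps by src_dudv[2..3] per destination pixel; each output pixel is fetched
// at the truncated integer coordinates. The pmaddwd trick above folds
// x*4 + y*stride into one multiply-add. Hypothetical helper for illustration
// only.
static void ARGBAffineRow_Sketch_C(const uint8_t* src_argb,
                                   int src_argb_stride,
                                   uint8_t* dst_argb,
                                   const float* src_dudv,
                                   int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)u;  // Truncate, like cvttps2dq.
    int y = (int)v;
    const uint8_t* p = src_argb + x * 4 + y * src_argb_stride;
    dst_argb[i * 4 + 0] = p[0];
    dst_argb[i * 4 + 1] = p[1];
    dst_argb[i * 4 + 2] = p[2];
    dst_argb[i * 4 + 3] = p[3];
    u += src_dudv[2];
    v += src_dudv[3];
  }
}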

#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSSE3(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile(
      "sub         %1,%0                         \n"
      "cmp         $0x0,%3                       \n"
      "je          100f                          \n"
      "cmp         $0x80,%3                      \n"
      "je          50f                           \n"

      "movd        %3,%%xmm0                     \n"
      "neg         %3                            \n"
      "add         $0x100,%3                     \n"
      "movd        %3,%%xmm5                     \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"
      "punpcklwd   %%xmm5,%%xmm5                 \n"
      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
      "mov         $0x80808080,%%eax             \n"
      "movd        %%eax,%%xmm4                  \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"

      // General purpose row blend.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%1),%%xmm0                   \n"
      "movdqu      0x00(%1,%4,1),%%xmm2          \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "psubb       %%xmm4,%%xmm0                 \n"
      "psubb       %%xmm4,%%xmm1                 \n"
      "movdqa      %%xmm5,%%xmm2                 \n"
      "movdqa      %%xmm5,%%xmm3                 \n"
      "pmaddubsw   %%xmm0,%%xmm2                 \n"
      "pmaddubsw   %%xmm1,%%xmm3                 \n"
      "paddw       %%xmm4,%%xmm2                 \n"
      "paddw       %%xmm4,%%xmm3                 \n"
      "psrlw       $0x8,%%xmm2                   \n"
      "psrlw       $0x8,%%xmm3                   \n"
      "packuswb    %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm2,0x00(%1,%0,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "jmp         99f                           \n"

      // Blend 50 / 50.
      LABELALIGN
      "50:                                       \n"
      "movdqu      (%1),%%xmm0                   \n"
      "movdqu      0x00(%1,%4,1),%%xmm1          \n"
      "pavgb       %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          50b                           \n"
      "jmp         99f                           \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100:                                      \n"
      "movdqu      (%1),%%xmm0                   \n"
      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          100b                          \n"

      "99:                                       \n"
      : "+r"(dst_ptr),               // %0
        "+r"(src_ptr),               // %1
        "+rm"(dst_width),            // %2
        "+r"(source_y_fraction)      // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_INTERPOLATEROW_SSSE3
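
// Scalar sketch of the general blend path: each output byte is
// (src * (256 - f) + src_next_row * f + 128) >> 8 with f = source_y_fraction;
// f == 0 degenerates to a copy and f == 128 to pavgb. The 0x80808080 bias in
// xmm4 above keeps pmaddubsw's signed operand in range and restores the +128
// rounding. Hypothetical helper for illustration only.
static void InterpolateRow_Sketch_C(uint8_t* dst_ptr,
                                    const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    int width,
                                    int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  int i;
  for (i = 0; i < width; ++i) {
    dst_ptr[i] =
        (uint8_t)((src_ptr[i] * y0_fraction + src_ptr1[i] * y1_fraction +
                   128) >> 8);
  }
}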

#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
void InterpolateRow_AVX2(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  asm volatile(
      "cmp         $0x0,%3                       \n"
      "je          100f                          \n"
      "sub         %1,%0                         \n"
      "cmp         $0x80,%3                      \n"
      "je          50f                           \n"

      "vmovd       %3,%%xmm0                     \n"
      "neg         %3                            \n"
      "add         $0x100,%3                     \n"
      "vmovd       %3,%%xmm5                     \n"
      "vpunpcklbw  %%xmm0,%%xmm5,%%xmm5          \n"
      "vpunpcklwd  %%xmm5,%%xmm5,%%xmm5          \n"
      "vbroadcastss %%xmm5,%%ymm5                \n"
      "mov         $0x80808080,%%eax             \n"
      "vmovd       %%eax,%%xmm4                  \n"
      "vbroadcastss %%xmm4,%%ymm4                \n"

      // General purpose row blend.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%1),%%ymm0                   \n"
      "vmovdqu     0x00(%1,%4,1),%%ymm2          \n"
      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm1          \n"
      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm0          \n"
      "vpsubb      %%ymm4,%%ymm1,%%ymm1          \n"
      "vpsubb      %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm1,%%ymm5,%%ymm1          \n"
      "vpmaddubsw  %%ymm0,%%ymm5,%%ymm0          \n"
      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"
      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "jmp         99f                           \n"

      // Blend 50 / 50.
      LABELALIGN
      "50:                                       \n"
      "vmovdqu     (%1),%%ymm0                   \n"
      "vpavgb      0x00(%1,%4,1),%%ymm0,%%ymm0   \n"
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          50b                           \n"
      "jmp         99f                           \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100:                                      \n"
      "rep         movsb                         \n"
      "jmp         999f                          \n"

      "99:                                       \n"
      "vzeroupper                                \n"
      "999:                                      \n"
      : "+D"(dst_ptr),               // %0
        "+S"(src_ptr),               // %1
        "+cm"(dst_width),            // %2
        "+r"(source_y_fraction)      // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
}
#endif  // HAS_INTERPOLATEROW_AVX2

#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
  asm volatile(

      "movdqu      (%3),%%xmm5                   \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pshufb      %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_ARGBSHUFFLEROW_SSSE3
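
// Scalar sketch of the pshufb reorder: for the channel-swap masks this row
// is used with, the 16-byte control repeats every 4 bytes, so each output
// channel j takes source byte shuffler[j] & 3 within the pixel (the & 3 is
// an assumption of this sketch). Hypothetical helper for illustration only.
static void ARGBShuffleRow_Sketch_C(const uint8_t* src_argb,
                                    uint8_t* dst_argb,
                                    const uint8_t* shuffler,
                                    int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      dst_argb[i * 4 + j] = src_argb[i * 4 + (shuffler[j] & 3)];
    }
  }
}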

#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile(

      "vbroadcastf128 (%3),%%ymm5                \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

#ifdef HAS_I422TOYUY2ROW_SSE2
void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%1),%%xmm2                   \n"
      "movq        0x00(%1,%2,1),%%xmm1          \n"
      "add         $0x8,%1                       \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"
      "movdqu      (%0),%%xmm0                   \n"
      "add         $0x10,%0                      \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "punpcklbw   %%xmm2,%%xmm0                 \n"
      "punpckhbw   %%xmm2,%%xmm1                 \n"
      "movdqu      %%xmm0,(%3)                   \n"
      "movdqu      %%xmm1,0x10(%3)               \n"
      "lea         0x20(%3),%3                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOYUY2ROW_SSE2
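
// Scalar sketch of the packing above (even widths assumed): YUY2 interleaves
// Y0, U, Y1, V, so each U/V pair is shared by two Y samples. Hypothetical
// helper for illustration only.
static void I422ToYUY2Row_Sketch_C(const uint8_t* src_y,
                                   const uint8_t* src_u,
                                   const uint8_t* src_v,
                                   uint8_t* dst_yuy2,
                                   int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_yuy2[0] = src_y[0];
    dst_yuy2[1] = src_u[0];
    dst_yuy2[2] = src_y[1];
    dst_yuy2[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_yuy2 += 4;
  }
}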

#ifdef HAS_I422TOUYVYROW_SSE2
void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%1),%%xmm2                   \n"
      "movq        0x00(%1,%2,1),%%xmm1          \n"
      "add         $0x8,%1                       \n"
      "punpcklbw   %%xmm1,%%xmm2                 \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "add         $0x10,%0                      \n"
      "punpcklbw   %%xmm0,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm2                 \n"
      "movdqu      %%xmm1,(%3)                   \n"
      "movdqu      %%xmm2,0x10(%3)               \n"
      "lea         0x20(%3),%3                   \n"
      "sub         $0x10,%4                      \n"
      "jg          1b                            \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_SSE2

#ifdef HAS_I422TOYUY2ROW_AVX2
void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vpmovzxbw   (%1),%%ymm1                   \n"
      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
      "add         $0x10,%1                      \n"
      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "add         $0x20,%0                      \n"
      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm1          \n"
      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm2          \n"
      "vextractf128 $0x0,%%ymm1,(%3)             \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
      "lea         0x40(%3),%3                   \n"
      "sub         $0x20,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOYUY2ROW_AVX2

#ifdef HAS_I422TOUYVYROW_AVX2
void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub         %1,%2                         \n"

      LABELALIGN
      "1:                                        \n"
      "vpmovzxbw   (%1),%%ymm1                   \n"
      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
      "add         $0x10,%1                      \n"
      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "add         $0x20,%0                      \n"
      "vpunpcklbw  %%ymm0,%%ymm2,%%ymm1          \n"
      "vpunpckhbw  %%ymm0,%%ymm2,%%ymm2          \n"
      "vextractf128 $0x0,%%ymm1,(%3)             \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
      "lea         0x40(%3),%3                   \n"
      "sub         $0x20,%4                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_AVX2

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(

      "pxor        %%xmm3,%%xmm3                 \n"

      // 2 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"
      "lea         0x8(%0),%0                    \n"
      "punpcklbw   %%xmm3,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm4                 \n"
      "punpcklwd   %%xmm3,%%xmm0                 \n"
      "punpckhwd   %%xmm3,%%xmm4                 \n"
      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
      "cvtdq2ps    %%xmm4,%%xmm4                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "mulps       0x10(%3),%%xmm0               \n"
      "mulps       0x10(%3),%%xmm4               \n"
      "addps       (%3),%%xmm0                   \n"
      "addps       (%3),%%xmm4                   \n"
      "movdqa      %%xmm1,%%xmm2                 \n"
      "movdqa      %%xmm5,%%xmm6                 \n"
      "mulps       %%xmm1,%%xmm2                 \n"
      "mulps       %%xmm5,%%xmm6                 \n"
      "mulps       %%xmm2,%%xmm1                 \n"
      "mulps       %%xmm6,%%xmm5                 \n"
      "mulps       0x20(%3),%%xmm2               \n"
      "mulps       0x20(%3),%%xmm6               \n"
      "mulps       0x30(%3),%%xmm1               \n"
      "mulps       0x30(%3),%%xmm5               \n"
      "addps       %%xmm2,%%xmm0                 \n"
      "addps       %%xmm6,%%xmm4                 \n"
      "addps       %%xmm1,%%xmm0                 \n"
      "addps       %%xmm5,%%xmm4                 \n"
      "cvttps2dq   %%xmm0,%%xmm0                 \n"
      "cvttps2dq   %%xmm4,%%xmm4                 \n"
      "packuswb    %%xmm4,%%xmm0                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x2,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
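
// The polynomial applied per channel, in scalar form: with coefficient rows
// C0..C3 at poly[0..15] (one coefficient per channel in each group of 4
// floats), every byte v becomes C0 + C1*v + C2*v^2 + C3*v^3, truncated and
// saturated back to 8 bits as the packs above do. Hypothetical helper for
// illustration only.
static void ARGBPolynomialRow_Sketch_C(const uint8_t* src_argb,
                                       uint8_t* dst_argb,
                                       const float* poly,
                                       int width) {
  int i, c;
  for (i = 0; i < width; ++i) {
    for (c = 0; c < 4; ++c) {
      float v = (float)src_argb[i * 4 + c];
      float r = poly[c] + poly[c + 4] * v + poly[c + 8] * v * v +
                poly[c + 12] * v * v * v;
      if (r < 0.f) r = 0.f;      // packuswb saturates low.
      if (r > 255.f) r = 255.f;  // packuswb saturates high.
      dst_argb[i * 4 + c] = (uint8_t)r;
    }
  }
}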

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(
      "vbroadcastf128 (%3),%%ymm4                \n"
      "vbroadcastf128 0x10(%3),%%ymm5            \n"
      "vbroadcastf128 0x20(%3),%%ymm6            \n"
      "vbroadcastf128 0x30(%3),%%ymm7            \n"

      // 2 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vpmovzxbd   (%0),%%ymm0                   \n"  // 2 ARGB pixels
      "lea         0x8(%0),%0                    \n"
      "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
      "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
      "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X *
                                                      // X
      "vcvttps2dq  %%ymm0,%%ymm0                 \n"
      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
      "vmovq       %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x2,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_HALFFLOATROW_SSE2
static const float kScaleBias = 1.9259299444e-34f;
void HalfFloatRow_SSE2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "movd        %3,%%xmm4                     \n"
      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
      "pxor        %%xmm5,%%xmm5                 \n"
      "sub         %0,%1                         \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm2                   \n"  // 8 shorts
      "add         $0x10,%0                      \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "punpcklwd   %%xmm5,%%xmm2                 \n"  // 8 ints in xmm2/xmm3
      "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 8 floats
      "punpckhwd   %%xmm5,%%xmm3                 \n"
      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
      "mulps       %%xmm4,%%xmm2                 \n"
      "mulps       %%xmm4,%%xmm3                 \n"
      "psrld       $0xd,%%xmm2                   \n"
      "psrld       $0xd,%%xmm3                   \n"
      "packssdw    %%xmm3,%%xmm2                 \n"
      "movdqu      %%xmm2,-0x10(%0,%1,1)         \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(scale)   // %3
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_SSE2
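
// Why kScaleBias works: 1.9259299444e-34f is 2^-112, the gap between the
// binary32 exponent bias (127) and the binary16 bias (15). Multiplying by
// scale * 2^-112 re-biases the exponent so the binary32 bit pattern, shifted
// right by 13 (23-bit vs 10-bit mantissa), is the binary16 encoding. A scalar
// sketch of one conversion (hypothetical helper for illustration only; no
// rounding or NaN/Inf handling beyond what the shift gives):
static uint16_t HalfFromFloat_Sketch(float value, float scale) {
  union {
    float f;
    uint32_t u;
  } bits;
  bits.f = value * scale * 1.9259299444e-34f;  // scale * 2^-112
  return (uint16_t)(bits.u >> 13);  // psrld $0xd on the float bits.
}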

#ifdef HAS_HALFFLOATROW_AVX2
void HalfFloatRow_AVX2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "vbroadcastss %3, %%ymm4                   \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
      "sub         %0,%1                         \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm2                   \n"  // 16 shorts
      "add         $0x20,%0                      \n"
      "vpunpckhwd  %%ymm5,%%ymm2,%%ymm3          \n"  // mutates
      "vpunpcklwd  %%ymm5,%%ymm2,%%ymm2          \n"
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
      "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
      "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
      "vpsrld      $0xd,%%ymm3,%%ymm3            \n"
      "vpsrld      $0xd,%%ymm2,%%ymm2            \n"
      "vpackssdw   %%ymm3, %%ymm2, %%ymm2        \n"  // unmutates
      "vmovdqu     %%ymm2,-0x20(%0,%1,1)         \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"

      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_AVX2

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloatRow_F16C(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "vbroadcastss %3, %%ymm4                   \n"
      "sub         %0,%1                         \n"

      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
      "vpmovzxwd   0x10(%0),%%ymm3               \n"
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
      "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
      "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
      "add         $0x20,%0                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
  asm volatile(
      "sub         %0,%1                         \n"
      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
      "vpmovzxwd   0x10(%0),%%ymm3               \n"
      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
      "add         $0x20,%0                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm2", "xmm3");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8_t* dst_argb,
                           const uint8_t* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movzb       (%0),%1                       \n"
      "lea         0x4(%0),%0                    \n"
      "movzb       0x00(%3,%1,4),%1              \n"
      "mov         %b1,-0x4(%0)                  \n"
      "movzb       -0x3(%0),%1                   \n"
      "movzb       0x01(%3,%1,4),%1              \n"
      "mov         %b1,-0x3(%0)                  \n"
      "movzb       -0x2(%0),%1                   \n"
      "movzb       0x02(%3,%1,4),%1              \n"
      "mov         %b1,-0x2(%0)                  \n"
      "movzb       -0x1(%0),%1                   \n"
      "movzb       0x03(%3,%1,4),%1              \n"
      "mov         %b1,-0x1(%0)                  \n"
      "dec         %2                            \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86
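
// Scalar sketch of the table transform above: each channel is replaced via
// an in-place lookup in a table whose four per-channel entries are
// interleaved, i.e. table_argb[v * 4 + channel]. Hypothetical helper for
// illustration only.
static void ARGBColorTableRow_Sketch_C(uint8_t* dst_argb,
                                       const uint8_t* table_argb,
                                       int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_argb[i * 4 + 0] = table_argb[dst_argb[i * 4 + 0] * 4 + 0];  // B
    dst_argb[i * 4 + 1] = table_argb[dst_argb[i * 4 + 1] * 4 + 1];  // G
    dst_argb[i * 4 + 2] = table_argb[dst_argb[i * 4 + 2] * 4 + 2];  // R
    dst_argb[i * 4 + 3] = table_argb[dst_argb[i * 4 + 3] * 4 + 3];  // A
  }
}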

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8_t* dst_argb,
                          const uint8_t* table_argb,
                          int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movzb       (%0),%1                       \n"
      "lea         0x4(%0),%0                    \n"
      "movzb       0x00(%3,%1,4),%1              \n"
      "mov         %b1,-0x4(%0)                  \n"
      "movzb       -0x3(%0),%1                   \n"
      "movzb       0x01(%3,%1,4),%1              \n"
      "mov         %b1,-0x3(%0)                  \n"
      "movzb       -0x2(%0),%1                   \n"
      "movzb       0x02(%3,%1,4),%1              \n"
      "mov         %b1,-0x2(%0)                  \n"
      "dec         %2                            \n"
      "jg          1b                            \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform ARGB pixels with luma table.
void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int width,
                                 const uint8_t* luma,
                                 uint32_t lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile(
      "movd        %6,%%xmm3                     \n"
      "pshufd      $0x0,%%xmm3,%%xmm3            \n"
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psllw       $0x8,%%xmm4                   \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      // 4 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%2),%%xmm0                   \n"
      "pmaddubsw   %%xmm3,%%xmm0                 \n"
      "phaddw      %%xmm0,%%xmm0                 \n"
      "pand        %%xmm4,%%xmm0                 \n"
      "punpcklwd   %%xmm5,%%xmm0                 \n"
      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"

      "movzb       (%2),%0                       \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,(%3)                      \n"
      "movzb       0x1(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x1(%3)                   \n"
      "movzb       0x2(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x2(%3)                   \n"
      "movzb       0x3(%2),%0                    \n"
      "mov         %b0,0x3(%3)                   \n"

      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"

      "movzb       0x4(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x4(%3)                   \n"
      "movzb       0x5(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x5(%3)                   \n"
      "movzb       0x6(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x6(%3)                   \n"
      "movzb       0x7(%2),%0                    \n"
      "mov         %b0,0x7(%3)                   \n"

      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"
      "pshufd      $0x39,%%xmm0,%%xmm0           \n"

      "movzb       0x8(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x8(%3)                   \n"
      "movzb       0x9(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0x9(%3)                   \n"
      "movzb       0xa(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xa(%3)                   \n"
      "movzb       0xb(%2),%0                    \n"
      "mov         %b0,0xb(%3)                   \n"

      "movd        %%xmm0,%k1                    \n"  // 32 bit offset
      "add         %5,%1                         \n"

      "movzb       0xc(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xc(%3)                   \n"
      "movzb       0xd(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xd(%3)                   \n"
      "movzb       0xe(%2),%0                    \n"
      "movzb       0x00(%1,%0,1),%0              \n"
      "mov         %b0,0xe(%3)                   \n"
      "movzb       0xf(%2),%0                    \n"
      "mov         %b0,0xf(%3)                   \n"
      "lea         0x10(%2),%2                   \n"
      "lea         0x10(%3),%3                   \n"
      "sub         $0x4,%4                       \n"
      "jg          1b                            \n"
      : "=&d"(pixel_temp),  // %0
        "=&a"(table_temp),  // %1
        "+r"(src_argb),     // %2
        "+r"(dst_argb),     // %3
        "+rm"(width)        // %4
      : "r"(luma),          // %5
        "rm"(lumacoeff)     // %6
      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
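
// Scalar sketch of the per-pixel behavior above (editor's addition, not
// original libyuv code): a weighted luma, masked to a multiple of 256,
// selects a 256-byte row of the luma table, which then remaps B, G and R;
// alpha is copied through. The top coefficient byte is assumed zero, as in
// the SSSE3 path, and pmaddubsw's 16-bit saturation is ignored here.
static inline void ARGBLumaColorTableRow_Sketch(const uint8_t* src_argb,
                                                uint8_t* dst_argb,
                                                int width,
                                                const uint8_t* luma,
                                                uint32_t lumacoeff) {
  const uint32_t bc = lumacoeff & 0xff;
  const uint32_t gc = (lumacoeff >> 8) & 0xff;
  const uint32_t rc = (lumacoeff >> 16) & 0xff;
  for (int x = 0; x < width; ++x) {
    const uint8_t* table =
        luma +
        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0xFF00u);
    dst_argb[0] = table[src_argb[0]];  // B
    dst_argb[1] = table[src_argb[1]];  // G
    dst_argb[2] = table[src_argb[2]];  // R
    dst_argb[3] = src_argb[3];         // alpha unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}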

#ifdef HAS_NV21TOYUV24ROW_AVX2

// Constants for NV21ToYUV24Row_AVX2.
static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
                               0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};

static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};

static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};

static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
                              0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};

static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
                              0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};

static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
                              0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};

static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
                              0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};

static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
                              0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};

static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
                              0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};

// NV21ToYUV24Row_AVX2
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
                         int width) {
  uint64_t src_offset = 0;
  uint64_t width64 = width;

  asm volatile(
      "vmovdqu     %5, %%ymm0                    \n"  // init blend value
      "vmovdqu     %6, %%ymm1                    \n"  // init blend value
      "vmovdqu     %7, %%ymm2                    \n"  // init blend value
      //  "sub $0x20, %3 \n"  // sub 32 from width for final loop

      LABELALIGN
      "1:                                        \n"  // label 1
      "vmovdqu     (%0,%4), %%ymm3               \n"  // src_y
      "vmovdqu     1(%1,%4), %%ymm4              \n"  // src_vu+1
      "vmovdqu     (%1,%4), %%ymm5               \n"  // src_vu
      "vpshufb     %8, %%ymm3, %%ymm13           \n"  // y, kSHUF0 for shuf
      "vpshufb     %9, %%ymm4, %%ymm14           \n"  // vu+1, kSHUF1 for shuf
      "vpshufb     %10, %%ymm5, %%ymm15          \n"  // vu, kSHUF2 for shuf
      "vpshufb     %11, %%ymm3, %%ymm3           \n"  // y, kSHUF3 for shuf
      "vpshufb     %12, %%ymm4, %%ymm4           \n"  // vu+1, kSHUF4 for shuf
      "vpblendvb   %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n"  // blend 0
      "vpblendvb   %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n"  // blend 0
      "vpblendvb   %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n"  // blend 2
      "vpblendvb   %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n"  // blend 1
      "vpshufb     %13, %%ymm5, %%ymm15          \n"  // shuffle const
      "vpor        %%ymm4, %%ymm3, %%ymm5        \n"  // get results
      "vmovdqu     %%ymm12, 0x20(%2)             \n"  // store dst_yuv+0x20
      "vpor        %%ymm15, %%ymm5, %%ymm3       \n"  // get results
      "add         $0x20, %4                     \n"  // advance src offset
      "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n"  // insert
      "vperm2i128  $0x31, %%ymm13, %%ymm3, %%ymm5 \n"  // insert
      "vmovdqu     %%ymm4, (%2)                  \n"  // store dst_yuv
      "vmovdqu     %%ymm5, 0x40(%2)              \n"  // store dst_yuv+0x40
      "add         $0x60,%2                      \n"  // advance dst pointer
      //  "cmp %3, %4 \n"  // compare (width64 - 32 bytes) and src_offset
      "sub         $0x20,%3                      \n"  // 32 pixels per loop
      "jg          1b                            \n"
      "vzeroupper                                \n"  // SSE-AVX2 transitions

      : "+r"(src_y),      // %0
        "+r"(src_vu),     // %1
        "+r"(dst_yuv24),  // %2
        "+r"(width64),    // %3
        "+r"(src_offset)  // %4
      : "m"(kBLEND0),     // %5
        "m"(kBLEND1),     // %6
        "m"(kBLEND2),     // %7
        "m"(kSHUF0),      // %8
        "m"(kSHUF1),      // %9
        "m"(kSHUF2),      // %10
        "m"(kSHUF3),      // %11
        "m"(kSHUF4),      // %12
        "m"(kSHUF5)       // %13
      : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
        "xmm13", "xmm14", "xmm15");
}
#endif  // HAS_NV21TOYUV24ROW_AVX2
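
// Scalar sketch of the conversion above (editor's addition, not original
// libyuv code): each output pixel is a 3-byte triplet, with one VU pair from
// the half-width src_vu plane shared by two adjacent pixels. The V, U, Y byte
// order is my reading of the blend/shuffle tables and is an assumption here;
// width is assumed even.
static inline void NV21ToYUV24Row_Sketch(const uint8_t* src_y,
                                         const uint8_t* src_vu,
                                         uint8_t* dst_yuv24,
                                         int width) {
  for (int x = 0; x < width; ++x) {
    dst_yuv24[0] = src_vu[(x & ~1) + 0];  // V (NV21 stores V first)
    dst_yuv24[1] = src_vu[(x & ~1) + 1];  // U
    dst_yuv24[2] = src_y[x];              // Y
    dst_yuv24 += 3;
  }
}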

#ifdef HAS_SWAPUVROW_SSSE3

// Shuffle table for swapping the bytes of each 16-bit UV pair.
static const uvec8 kShuffleUVToVU = {1u, 0u, 3u,  2u,  5u,  4u,  7u,  6u,
                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};

// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      "movdqu      %3,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pshufb      %%xmm5,%%xmm0                 \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "movdqu      %%xmm1,0x10(%1)               \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_uv),        // %0
        "+r"(dst_vu),        // %1
        "+r"(width)          // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_SWAPUVROW_SSSE3

#ifdef HAS_SWAPUVROW_AVX2
void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "vmovdqu     %%ymm1,0x20(%1)               \n"
      "lea         0x40(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_uv),        // %0
        "+r"(dst_vu),        // %1
        "+r"(width)          // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_SWAPUVROW_AVX2
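
// Scalar sketch of both SwapUVRow kernels above (editor's addition, not
// original libyuv code): swap the two bytes of each UV pair.
static inline void SwapUVRow_Sketch(const uint8_t* src_uv,
                                    uint8_t* dst_vu,
                                    int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t u = src_uv[0];  // read both bytes first so in-place use works
    uint8_t v = src_uv[1];
    dst_vu[0] = v;
    dst_vu[1] = u;
    src_uv += 2;
    dst_vu += 2;
  }
}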

void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
                          int src_stride_u,
                          const uint8_t* src_v,
                          int src_stride_v,
                          uint8_t* dst_uv,
                          int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"  // load 16 U values
      "movdqu      (%1),%%xmm1                   \n"  // load 16 V values
      "movdqu      0(%0,%4,1),%%xmm2             \n"  // 16 from next row
      "movdqu      0(%1,%5,1),%%xmm3             \n"
      "lea         0x10(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"  // half size
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "lea         0x10(%1),%1                   \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "psrlw       $0x1,%%xmm0                   \n"
      "psrlw       $0x1,%%xmm1                   \n"
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "punpcklbw   %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%2)                   \n"  // store 8 UV pixels
      "lea         0x10(%2),%2                   \n"
      "sub         $0x10,%3                      \n"  // 16 src pixels per loop
      "jg          1b                            \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void HalfMergeUVRow_AVX2(const uint8_t* src_u,
                         int src_stride_u,
                         const uint8_t* src_v,
                         int src_stride_v,
                         uint8_t* dst_uv,
                         int width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"  // load 32 U values
      "vmovdqu     (%1),%%ymm1                   \n"  // load 32 V values
      "vmovdqu     0(%0,%4,1),%%ymm2             \n"  // 32 from next row
      "vmovdqu     0(%1,%5,1),%%ymm3             \n"
      "lea         0x20(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // half size
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "lea         0x20(%1),%1                   \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpackuswb   %%ymm1,%%ymm1,%%ymm1          \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%2)                   \n"  // store 16 UV pixels
      "lea         0x20(%2),%2                   \n"
      "sub         $0x20,%3                      \n"  // 32 src pixels per loop
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
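
// Scalar sketch of the two HalfMergeUVRow kernels above (editor's addition,
// not original libyuv code): 2x2 box filter each plane, then interleave U and
// V. The vector code rounds in two steps (shift, then pavgw); (sum + 2) >> 2
// is the exact single-step equivalent. Width is assumed even; strides are in
// bytes, usable directly as element offsets for these 8-bit planes.
static inline void HalfMergeUVRow_Sketch(const uint8_t* src_u,
                                         int src_stride_u,
                                         const uint8_t* src_v,
                                         int src_stride_v,
                                         uint8_t* dst_uv,
                                         int width) {
  for (int x = 0; x < width; x += 2) {
    dst_uv[0] = (src_u[x] + src_u[x + 1] + src_u[x + src_stride_u] +
                 src_u[x + src_stride_u + 1] + 2) >> 2;
    dst_uv[1] = (src_v[x] + src_v[x + 1] + src_v[x + src_stride_v] +
                 src_v[x + src_stride_v + 1] + 2) >> 2;
    dst_uv += 2;
  }
}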

void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
  asm volatile(
      "pxor        %%xmm1,%%xmm1                 \n"

      LABELALIGN
      "1:                                        \n"
      "movd        (%0),%%xmm0                   \n"  // load float
      "maxss       %%xmm1, %%xmm0                \n"  // clamp to zero
      "add         $0x4,%0                       \n"
      "movd        %%xmm0, (%1)                  \n"  // store float
      "add         $0x4,%1                       \n"
      "sub         $0x4,%2                       \n"  // 1 float (4 bytes) per loop
      "jg          1b                            \n"
      : "+r"(src_x),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}

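// Scalar sketch of the kernel above (editor's addition, not original libyuv
// code). Two details worth noting: maxss with a +0.0f source also maps NaN
// inputs to 0.0f, which this sketch mirrors, and the asm decrements its
// counter by 4 per float; here width is taken as a plain float count, which
// is an assumption.
static inline void ClampFloatToZero_Sketch(const float* src_x,
                                           float* dst_y,
                                           int width) {
  for (int x = 0; x < width; ++x) {
    float v = src_x[x];
    dst_y[x] = (v > 0.0f) ? v : 0.0f;  // NaN compares false, so it becomes 0
  }
}
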
#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif