/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

// This module is for Visual C 32/64 bit
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
    !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))

#if defined(_M_ARM64EC)
#include <intrin.h>
#elif defined(_M_X64)
#include <emmintrin.h>
#include <tmmintrin.h>  // For _mm_maddubs_epi16
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// 64 bit
#if defined(_M_X64)

// Read 8 UV from 444
#define READYUV444                                    \
  xmm3 = _mm_loadl_epi64((__m128i*)u_buf);            \
  xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
  xmm3 = _mm_unpacklo_epi8(xmm3, xmm1);               \
  u_buf += 8;                                         \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);            \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);               \
  y_buf += 8;

// Read 8 UV from 444, With 8 Alpha.
#define READYUVA444                                   \
  xmm3 = _mm_loadl_epi64((__m128i*)u_buf);            \
  xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
  xmm3 = _mm_unpacklo_epi8(xmm3, xmm1);               \
  u_buf += 8;                                         \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);            \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);               \
  y_buf += 8;                                         \
  xmm5 = _mm_loadl_epi64((__m128i*)a_buf);            \
  a_buf += 8;

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422                                        \
  xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);            \
  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
  xmm3 = _mm_unpacklo_epi8(xmm3, xmm1);                   \
  xmm3 = _mm_unpacklo_epi16(xmm3, xmm3);                  \
  u_buf += 4;                                             \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                   \
  y_buf += 8;

// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
#define READYUVA422                                       \
  xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);            \
  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
  xmm3 = _mm_unpacklo_epi8(xmm3, xmm1);                   \
  xmm3 = _mm_unpacklo_epi16(xmm3, xmm3);                  \
  u_buf += 4;                                             \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                   \
  y_buf += 8;                                             \
  xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                \
  a_buf += 8;

// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(yuvconstants)                                      \
  xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8((char)0x80));             \
  xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);   \
  xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \
  xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3);  \
  xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3);  \
  xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3);  \
  xmm0 = _mm_adds_epi16(xmm4, xmm0);                                \
  xmm1 = _mm_subs_epi16(xmm4, xmm1);                                \
  xmm2 = _mm_adds_epi16(xmm4, xmm2);                                \
  xmm0 = _mm_srai_epi16(xmm0, 6);                                   \
  xmm1 = _mm_srai_epi16(xmm1, 6);                                   \
  xmm2 = _mm_srai_epi16(xmm2, 6);                                   \
  xmm0 = _mm_packus_epi16(xmm0, xmm0);                              \
  xmm1 = _mm_packus_epi16(xmm1, xmm1);                              \
  xmm2 = _mm_packus_epi16(xmm2, xmm2);
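
// Illustrative scalar sketch of the fixed point math in YUVTORGB (a hedged
// reference, not part of the build; yg, ybias, ub, ug, vg and vr stand for
// the lane values of the kYToRgb, kYBiasToRgb and kUVTo* constants above):
//   int y1 = ((((y << 8) | y) * yg) >> 16) + ybias;
//   int b = clamp0to255((y1 + ub * (u - 128)) >> 6);
//   int g = clamp0to255((y1 - (ug * (u - 128) + vg * (v - 128))) >> 6);
//   int r = clamp0to255((y1 + vr * (v - 128)) >> 6);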

// Store 8 ARGB values.
#define STOREARGB                                    \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);              \
  xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);              \
  xmm1 = _mm_loadu_si128(&xmm0);                     \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);             \
  xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);             \
  _mm_storeu_si128((__m128i*)dst_argb, xmm0);        \
  _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
  dst_argb += 32;

#if defined(HAS_I422TOARGBROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
                         const uint8_t* u_buf,
                         const uint8_t* v_buf,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
  const __m128i xmm5 = _mm_set1_epi8(-1);
  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
  while (width > 0) {
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                              const uint8_t* u_buf,
                              const uint8_t* v_buf,
                              const uint8_t* a_buf,
                              uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
  while (width > 0) {
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

#if defined(HAS_I444TOARGBROW_SSSE3)
void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
                         const uint8_t* u_buf,
                         const uint8_t* v_buf,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
  const __m128i xmm5 = _mm_set1_epi8(-1);
  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
  while (width > 0) {
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                              const uint8_t* u_buf,
                              const uint8_t* v_buf,
                              const uint8_t* a_buf,
                              uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
  while (width > 0) {
    READYUVA444
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

// 32 bit
#else  // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                              13, 65, 33, 0, 13, 65, 33, 0};

// JPEG full range.
static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                               15, 75, 38, 0, 15, 75, 38, 0};

static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {
    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};

// Constants for BGRA.
static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                              0, 33, 65, 13, 0, 33, 65, 13};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR.
static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                              33, 65, 13, 0, 33, 65, 13, 0};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                              0, 13, 65, 33, 0, 13, 65, 33};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

// 8 bit fixed point 0.5, for bias of UV.
static const ulvec8 kBiasUV128 = {
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24.  First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};

// Duplicates gray value 3 times and fills in alpha opaque.
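// Scalar equivalent for one gray value y (illustrative):
//   uint32_t argb = 0xff000000u | ((uint32_t)y << 16) | ((uint32_t)y << 8) | y;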
__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    mov        eax, [esp + 4]  // src_y
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm5, xmm5  // generate mask 0xff000000
    pslld      xmm5, 24

  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    mov         eax, [esp + 4]  // src_y
    mov         edx, [esp + 8]  // dst_argb
    mov         ecx, [esp + 12]  // width
    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff000000
    vpslld      ymm5, ymm5, 24

  convertloop:
    vmovdqu     xmm0, [eax]
    lea         eax,  [eax + 16]
    vpermq      ymm0, ymm0, 0xd8
    vpunpcklbw  ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhwd  ymm1, ymm0, ymm0
    vpunpcklwd  ymm0, ymm0, ymm0
    vpor        ymm0, ymm0, ymm5
    vpor        ymm1, ymm1, ymm5
    vmovdqu     [edx], ymm0
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 16
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_J400TOARGBROW_AVX2

__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    mov       eax, [esp + 4]  // src_rgb24
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:7] xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12  // xmm1 = { xmm1[0:11] xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    mov       eax, [esp + 4]  // src_raw
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:7] xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12  // xmm1 = { xmm1[0:11] xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                                           uint8_t* dst_rgb24,
                                           int width) {
  __asm {
    mov       eax, [esp + 4]  // src_raw
    mov       edx, [esp + 8]  // dst_rgb24
    mov       ecx, [esp + 12]  // width
    movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
    movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2

 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 4]
    movdqu    xmm2, [eax + 8]
    lea       eax, [eax + 24]
    pshufb    xmm0, xmm3
    pshufb    xmm1, xmm4
    pshufb    xmm2, xmm5
    movq      qword ptr [edx], xmm0
    movq      qword ptr [edx + 8], xmm1
    movq      qword ptr [edx + 16], xmm2
    lea       edx, [edx + 24]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
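// Scalar view of the expansion (illustrative): a 5 bit value v widens to
// (v << 3) | (v >> 2), e.g. v = 0x1f -> 0xff and v = 0x10 -> 0x84; pmulhuw
// by 0x0108 on v placed in the top 5 bits of a 16 bit lane computes this.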
__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    pcmpeqb   xmm4, xmm4  // generate mask 0x07e007e0 for Green
    psllw     xmm4, 10
    psrlw     xmm4, 5
    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]  // src_rgb565
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]  // fetch 8 pixels of bgr565
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    pand      xmm1, xmm3  // R in upper 5 bits
    psllw     xmm2, 11  // B in upper 5 bits
    pmulhuw   xmm1, xmm5  // * (256 + 8)
    pmulhuw   xmm2, xmm5  // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2  // RB
    pand      xmm0, xmm4  // G in middle 6 bits
    pmulhuw   xmm0, xmm6  // << 5 * (256 + 4)
    por       xmm0, xmm7  // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

#ifdef HAS_RGB565TOARGBROW_AVX2
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd      xmm5, eax
    vbroadcastss ymm5, xmm5
    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw     ymm3, ymm3, 11
    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
    vpsllw     ymm4, ymm4, 10
    vpsrlw     ymm4, ymm4, 5
    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw     ymm7, ymm7, 8

    mov        eax, [esp + 4]  // src_rgb565
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax
    sub        edx, eax

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgr565
    vpand      ymm1, ymm0, ymm3  // R in upper 5 bits
    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
    vpsllw     ymm1, ymm1, 8
    vpor       ymm1, ymm1, ymm2  // RB
    vpand      ymm0, ymm0, ymm4  // G in middle 6 bits
    vpmulhuw   ymm0, ymm0, ymm6  // << 5 * (256 + 4)
    vpor       ymm0, ymm0, ymm7  // AG
    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq     ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea       eax, [eax + 32]
    sub       ecx, 16
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_RGB565TOARGBROW_AVX2

#ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd      xmm5, eax
    vbroadcastss ymm5, xmm5
    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw     ymm3, ymm3, 11
    vpsrlw     ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw     ymm7, ymm7, 8

    mov        eax,  [esp + 4]  // src_argb1555
    mov        edx,  [esp + 8]  // dst_argb
    mov        ecx,  [esp + 12]  // width
    sub        edx,  eax
    sub        edx,  eax

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 16 pixels of 1555
    vpsllw     ymm1, ymm0, 1  // R in upper 5 bits
    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
    vpand      ymm1, ymm1, ymm3
    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
    vpsllw     ymm1, ymm1, 8
    vpor       ymm1, ymm1, ymm2  // RB
    vpsraw     ymm2, ymm0, 8  // A
    vpand      ymm0, ymm0, ymm4  // G in middle 5 bits
    vpmulhuw   ymm0, ymm0, ymm6  // << 6 * (256 + 8)
    vpand      ymm2, ymm2, ymm7
    vpor       ymm0, ymm0, ymm2  // AG
    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq     ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea       eax, [eax + 32]
    sub       ecx, 16
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB1555TOARGBROW_AVX2

#ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
    vmovd     xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld    ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
    mov       eax,  [esp + 4]  // src_argb4444
    mov       edx,  [esp + 8]  // dst_argb
    mov       ecx,  [esp + 12]  // width
    sub       edx,  eax
    sub       edx,  eax

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgra4444
    vpand      ymm2, ymm0, ymm5  // mask high nibbles
    vpand      ymm0, ymm0, ymm4  // mask low nibbles
    vpsrlw     ymm3, ymm2, 4
    vpsllw     ymm1, ymm0, 4
    vpor       ymm2, ymm2, ymm3
    vpor       ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq     ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2
    vpunpcklbw ymm0, ymm0, ymm2
    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
    lea       eax, [eax + 32]
    sub       ecx, 16
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB4444TOARGBROW_AVX2

// 24 instructions
__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    movdqa    xmm4, xmm3  // generate mask 0x03e003e0 for Green
    psrlw     xmm4, 6
    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]  // src_argb1555
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]  // fetch 8 pixels of 1555
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    psllw     xmm1, 1  // R in upper 5 bits
    psllw     xmm2, 11  // B in upper 5 bits
    pand      xmm1, xmm3
    pmulhuw   xmm2, xmm5  // * (256 + 8)
    pmulhuw   xmm1, xmm5  // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2  // RB
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4  // G in middle 5 bits
    psraw     xmm2, 8  // A
    pmulhuw   xmm0, xmm6  // << 6 * (256 + 8)
    pand      xmm2, xmm7
    por       xmm0, xmm2  // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd      xmm4, eax
    pshufd    xmm4, xmm4, 0
    movdqa    xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
    pslld     xmm5, 4
    mov       eax, [esp + 4]  // src_argb4444
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax

 convertloop:
    movdqu    xmm0, [eax]  // fetch 8 pixels of bgra4444
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4  // mask low nibbles
    pand      xmm2, xmm5  // mask high nibbles
    movdqa    xmm1, xmm0
    movdqa    xmm3, xmm2
    psllw     xmm1, 4
    psrlw     xmm3, 4
    por       xmm0, xmm1
    por       xmm2, xmm3
    movdqa    xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24

 convertloop:
    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq    xmm1, 4  // 8 bytes from 1
    pslldq    xmm4, 12  // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
    por       xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq    xmm5, 8  // 8 bytes from 2 for 1
    movdqu    [edx], xmm0  // store 0
    por       xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq    xmm2, 8  // 4 bytes from 2
    pslldq    xmm3, 4  // 12 bytes from 3 for 2
    por       xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1  // store 1
    movdqu    [edx + 32], xmm2  // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
                                          uint8_t* dst_rgb,
                                          int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW

 convertloop:
    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq    xmm1, 4  // 8 bytes from 1
    pslldq    xmm4, 12  // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
    por       xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq    xmm5, 8  // 8 bytes from 2 for 1
    movdqu    [edx], xmm0  // store 0
    por       xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq    xmm2, 8  // 4 bytes from 2
    pslldq    xmm3, 4  // 12 bytes from 3 for 2
    por       xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1  // store 1
    movdqu    [edx + 32], xmm2  // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
    pslld     xmm5, 11

 convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    movdqa    xmm1, xmm0  // B
    movdqa    xmm2, xmm0  // G
    pslld     xmm0, 8  // R
    psrld     xmm1, 3  // B
    psrld     xmm2, 5  // G
    psrad     xmm0, 16  // R
    pand      xmm1, xmm3  // B
    pand      xmm2, xmm4  // G
    pand      xmm0, xmm5  // R
    por       xmm1, xmm2  // BG
    por       xmm0, xmm1  // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
                                                  uint8_t* dst_rgb,
                                                  uint32_t dither4,
                                                  int width) {
  __asm {

    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    movd      xmm6, [esp + 12]  // dither4
    mov       ecx, [esp + 16]  // width
    punpcklbw xmm6, xmm6  // make dither 16 bytes
    movdqa    xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
    pslld     xmm5, 11

 convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    paddusb   xmm0, xmm6  // add dither
    movdqa    xmm1, xmm0  // B
    movdqa    xmm2, xmm0  // G
    pslld     xmm0, 8  // R
    psrld     xmm1, 3  // B
    psrld     xmm2, 5  // G
    psrad     xmm0, 16  // R
    pand      xmm1, xmm3  // B
    pand      xmm2, xmm4  // G
    pand      xmm0, xmm5  // R
    por       xmm1, xmm2  // BG
    por       xmm0, xmm1  // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
                                                  uint8_t* dst_rgb,
                                                  uint32_t dither4,
                                                  int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_rgb
    vbroadcastss xmm6, [esp + 12]  // dither4
    mov        ecx, [esp + 16]  // width
    vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
    vpermq     ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
    vpaddusb   ymm0, ymm0, ymm6  // add dither
    vpsrld     ymm2, ymm0, 5  // G
    vpsrld     ymm1, ymm0, 3  // B
    vpsrld     ymm0, ymm0, 8  // R
    vpand      ymm2, ymm2, ymm4  // G
    vpand      ymm1, ymm1, ymm3  // B
    vpand      ymm0, ymm0, ymm5  // R
    vpor       ymm1, ymm1, ymm2  // BG
    vpor       ymm0, ymm0, ymm1  // BGR
    vpackusdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm4, xmm4  // generate mask 0x0000001f
    psrld     xmm4, 27
    movdqa    xmm5, xmm4  // generate mask 0x000003e0
    pslld     xmm5, 5
    movdqa    xmm6, xmm4  // generate mask 0x00007c00
    pslld     xmm6, 10
    pcmpeqb   xmm7, xmm7  // generate mask 0xffff8000
    pslld     xmm7, 15

 convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    movdqa    xmm1, xmm0  // B
    movdqa    xmm2, xmm0  // G
    movdqa    xmm3, xmm0  // R
    psrad     xmm0, 16  // A
    psrld     xmm1, 3  // B
    psrld     xmm2, 6  // G
    psrld     xmm3, 9  // R
    pand      xmm0, xmm7  // A
    pand      xmm1, xmm4  // B
    pand      xmm2, xmm5  // G
    pand      xmm3, xmm6  // R
    por       xmm0, xmm1  // BA
    por       xmm2, xmm3  // GR
    por       xmm0, xmm2  // BGRA
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm4, xmm4  // generate mask 0xf000f000
    psllw     xmm4, 12
    movdqa    xmm3, xmm4  // generate mask 0x00f000f0
    psrlw     xmm3, 8

 convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    movdqa    xmm1, xmm0
    pand      xmm0, xmm3  // low nibble
    pand      xmm1, xmm4  // high nibble
    psrld     xmm0, 4
    psrld     xmm1, 8
    por       xmm0, xmm1
    packuswb  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565ROW_AVX2
__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_rgb
    mov        ecx, [esp + 12]  // width
    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld     ymm2, ymm0, 5  // G
    vpsrld     ymm1, ymm0, 3  // B
    vpsrld     ymm0, ymm0, 8  // R
    vpand      ymm2, ymm2, ymm4  // G
    vpand      ymm1, ymm1, ymm3  // B
    vpand      ymm0, ymm0, ymm5  // R
    vpor       ymm1, ymm1, ymm2  // BG
    vpor       ymm0, ymm0, ymm1  // BGR
    vpackusdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565ROW_AVX2

#ifdef HAS_ARGBTOARGB1555ROW_AVX2
__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_rgb
    mov        ecx, [esp + 12]  // width
    vpcmpeqb   ymm4, ymm4, ymm4
    vpsrld     ymm4, ymm4, 27  // generate mask 0x0000001f
    vpslld     ymm5, ymm4, 5  // generate mask 0x000003e0
    vpslld     ymm6, ymm4, 10  // generate mask 0x00007c00
    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xffff8000
    vpslld     ymm7, ymm7, 15

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld     ymm3, ymm0, 9  // R
    vpsrld     ymm2, ymm0, 6  // G
    vpsrld     ymm1, ymm0, 3  // B
    vpsrad     ymm0, ymm0, 16  // A
    vpand      ymm3, ymm3, ymm6  // R
    vpand      ymm2, ymm2, ymm5  // G
    vpand      ymm1, ymm1, ymm4  // B
    vpand      ymm0, ymm0, ymm7  // A
    vpor       ymm0, ymm0, ymm1  // BA
    vpor       ymm2, ymm2, ymm3  // GR
    vpor       ymm0, ymm0, ymm2  // BGRA
    vpackssdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB1555
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB1555ROW_AVX2

#ifdef HAS_ARGBTOARGB4444ROW_AVX2
__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_rgb
    mov        ecx, [esp + 12]  // width
    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xf000f000
    vpsllw     ymm4, ymm4, 12
    vpsrlw     ymm3, ymm4, 8  // generate mask 0x00f000f0

 convertloop:
    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
    vpand      ymm1, ymm0, ymm4  // high nibble
    vpand      ymm0, ymm0, ymm3  // low nibble
    vpsrld     ymm1, ymm1, 8
    vpsrld     ymm0, ymm0, 4
    vpor       ymm0, ymm0, ymm1
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB4444
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB4444ROW_AVX2

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
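// Scalar sketch of the per-pixel math (illustrative; b, g, r are the first
// three bytes of each ARGB pixel, per kARGBToY and kAddY16 above):
//   y = ((13 * b + 65 * g + 33 * r) >> 7) + 16;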
__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kARGBToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but with different coefficients: no add of 16, and
// rounding is applied.
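// Scalar sketch (illustrative, per kARGBToYJ with kAddYJ64 as the rounding
// bias):
//   yj = (15 * b + 75 * g + 38 * r + 64) >> 7;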
__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
                                         uint8_t* dst_y,
                                         int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kARGBToYJ
    movdqa     xmm5, xmmword ptr kAddYJ64

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5  // Add .5 for rounding.
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
                                       uint8_t* dst_y,
                                       int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    vbroadcastf128 ymm4, xmmword ptr kARGBToY
    vbroadcastf128 ymm5, xmmword ptr kAddY16
    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX

 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  //  HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
    vbroadcastf128 ymm5, xmmword ptr kAddYJ64
    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX

 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
    vpaddw     ymm2, ymm2, ymm5
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  //  HAS_ARGBTOYJROW_AVX2

__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kBGRAToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kABGRToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kRGBAToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

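// Convert 16x2 ARGB pixels to 8 U and 8 V values, 2x2 subsampled.
// Scalar sketch of the per-sample math (illustrative; b, g, r are the 2x2
// averaged channels, per kARGBToU/kARGBToV and kBiasUV128 above):
//   u = ((112 * b - 74 * g - 38 * r) >> 8) + 128;
//   v = ((-18 * b - 94 * g + 112 * r) >> 8) + 128;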
1429 __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
1430                                          int src_stride_argb,
1431                                          uint8_t* dst_u,
1432                                          uint8_t* dst_v,
1433                                          int width) {
1434   __asm {
1435     push       esi
1436     push       edi
1437     mov        eax, [esp + 8 + 4]  // src_argb
1438     mov        esi, [esp + 8 + 8]  // src_stride_argb
1439     mov        edx, [esp + 8 + 12]  // dst_u
1440     mov        edi, [esp + 8 + 16]  // dst_v
1441     mov        ecx, [esp + 8 + 20]  // width
1442     movdqa     xmm5, xmmword ptr kBiasUV128
1443     movdqa     xmm6, xmmword ptr kARGBToV
1444     movdqa     xmm7, xmmword ptr kARGBToU
1445     sub        edi, edx  // stride from u to v
1446 
1447  convertloop:
1448          /* step 1 - subsample 16x2 argb pixels to 8x1 */
1449     movdqu     xmm0, [eax]
1450     movdqu     xmm4, [eax + esi]
1451     pavgb      xmm0, xmm4
1452     movdqu     xmm1, [eax + 16]
1453     movdqu     xmm4, [eax + esi + 16]
1454     pavgb      xmm1, xmm4
1455     movdqu     xmm2, [eax + 32]
1456     movdqu     xmm4, [eax + esi + 32]
1457     pavgb      xmm2, xmm4
1458     movdqu     xmm3, [eax + 48]
1459     movdqu     xmm4, [eax + esi + 48]
1460     pavgb      xmm3, xmm4
1461 
1462     lea        eax,  [eax + 64]
1463     movdqa     xmm4, xmm0
1464     shufps     xmm0, xmm1, 0x88
1465     shufps     xmm4, xmm1, 0xdd
1466     pavgb      xmm0, xmm4
1467     movdqa     xmm4, xmm2
1468     shufps     xmm2, xmm3, 0x88
1469     shufps     xmm4, xmm3, 0xdd
1470     pavgb      xmm2, xmm4
1471 
1472         // step 2 - convert to U and V
1473         // from here down is very similar to Y code except
1474         // instead of 16 different pixels, its 8 pixels of U and 8 of V
1475     movdqa     xmm1, xmm0
1476     movdqa     xmm3, xmm2
1477     pmaddubsw  xmm0, xmm7  // U
1478     pmaddubsw  xmm2, xmm7
1479     pmaddubsw  xmm1, xmm6  // V
1480     pmaddubsw  xmm3, xmm6
1481     phaddw     xmm0, xmm2
1482     phaddw     xmm1, xmm3
1483     psraw      xmm0, 8
1484     psraw      xmm1, 8
1485     packsswb   xmm0, xmm1
1486     paddb      xmm0, xmm5  // -> unsigned
1487 
1488         // step 3 - store 8 U and 8 V values
1489     movlps     qword ptr [edx], xmm0  // U
1490     movhps     qword ptr [edx + edi], xmm0  // V
1491     lea        edx, [edx + 8]
1492     sub        ecx, 16
1493     jg         convertloop
1494 
1495     pop        edi
1496     pop        esi
1497     ret
1498   }
1499 }
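
// A scalar sketch of the two-step structure above (illustrative; the signed
// coefficients live in kARGBToU/kARGBToV and the bias in kBiasUV128).  Step 1
// box-filters each 2x2 block with pavgb (a vertical average, then a
// shufps-paired horizontal average); step 2 applies the U and V matrices to
// the averaged b/g/r and re-centers the signed result:
//
//   dst_u[i] = (uint8_t)(((ub * b + ug * g + ur * r) >> 8) + 128);
//   dst_v[i] = (uint8_t)(((vb * b + vg * g + vr * r) >> 8) + 128);
//
// where ub..vr name the coefficient bytes; psraw/packsswb perform the signed
// >>8 and the paddb of kBiasUV128 adds the 128.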

__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
                                          int src_stride_argb,
                                          uint8_t* dst_u,
                                          uint8_t* dst_v,
                                          int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_argb
    mov        esi, [esp + 8 + 8]  // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm5, xmmword ptr kBiasUV128
    movdqa     xmm6, xmmword ptr kARGBToVJ
    movdqa     xmm7, xmmword ptr kARGBToUJ
    sub        edi, edx  // stride from u to v

 convertloop:
         /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4

    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

        // step 2 - convert to U and V
        // from here down is very similar to Y code except
        // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
    paddw      xmm1, xmm5
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1

        // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0  // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
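
// ARGBToUVJRow_SSSE3 above differs from ARGBToUVRow_SSSE3 only in its
// constants (the full-range kARGBToUJ/kARGBToVJ) and in where the bias is
// applied: kBiasUV128 is added as words *before* the arithmetic shift, which,
// as the inline comment notes, folds the +0.5 rounding and the re-centering
// to unsigned into one add, so no paddb is needed after the pack.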

#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
                                        int src_stride_argb,
                                        uint8_t* dst_u,
                                        uint8_t* dst_v,
                                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_argb
    mov        esi, [esp + 8 + 8]  // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    vbroadcastf128 ymm5, xmmword ptr kBiasUV128
    vbroadcastf128 ymm6, xmmword ptr kARGBToV
    vbroadcastf128 ymm7, xmmword ptr kARGBToU
    sub        edi, edx   // stride from u to v

 convertloop:
        /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    vpavgb     ymm2, ymm2, [eax + esi + 64]
    vpavgb     ymm3, ymm3, [eax + esi + 96]
    lea        eax,  [eax + 128]
    vshufps    ymm4, ymm0, ymm1, 0x88
    vshufps    ymm0, ymm0, ymm1, 0xdd
    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps    ymm4, ymm2, ymm3, 0x88
    vshufps    ymm2, ymm2, ymm3, 0xdd
    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps

        // step 2 - convert to U and V
        // from here down is very similar to Y code except
        // instead of 32 different pixels, it's 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw    ymm1, ymm1, ymm3  // mutates
    vphaddw    ymm0, ymm0, ymm2
    vpsraw     ymm1, ymm1, 8
    vpsraw     ymm0, ymm0, 8
    vpacksswb  ymm0, ymm1, ymm0  // mutates
    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
    vpaddb     ymm0, ymm0, ymm5  // -> unsigned

        // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0  // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVROW_AVX2
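
// Note on the AVX2 UV kernels: vshufps, vphaddw and vpacksswb all operate
// within each 128-bit lane, so intermediate results come out in a "mutated"
// order.  The vpermq with immediate 0xd8 (qword order 0,2,1,3) plus the
// kShufARGBToUV_AVX byte shuffle restore linear pixel order before the two
// vextractf128 stores.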

#ifdef HAS_ARGBTOUVJROW_AVX2
__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_argb
    mov        esi, [esp + 8 + 8]  // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    vbroadcastf128 ymm5, xmmword ptr kBiasUV128
    vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
    vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
    sub        edi, edx   // stride from u to v

 convertloop:
        /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    vpavgb     ymm2, ymm2, [eax + esi + 64]
    vpavgb     ymm3, ymm3, [eax + esi + 96]
    lea        eax,  [eax + 128]
    vshufps    ymm4, ymm0, ymm1, 0x88
    vshufps    ymm0, ymm0, ymm1, 0xdd
    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps    ymm4, ymm2, ymm3, 0x88
    vshufps    ymm2, ymm2, ymm3, 0xdd
    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps

        // step 2 - convert to U and V
        // from here down is very similar to Y code except
        // instead of 32 different pixels, it's 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw    ymm1, ymm1, ymm3  // mutates
    vphaddw    ymm0, ymm0, ymm2
    vpaddw     ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
    vpaddw     ymm0, ymm0, ymm5
    vpsraw     ymm1, ymm1, 8
    vpsraw     ymm0, ymm0, 8
    vpacksswb  ymm0, ymm1, ymm0  // mutates
    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
    vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw

        // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0  // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVJROW_AVX2

__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
                                            uint8_t* dst_u,
                                            uint8_t* dst_v,
                                            int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        edx, [esp + 4 + 8]  // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // width
    movdqa     xmm5, xmmword ptr kBiasUV128
    movdqa     xmm6, xmmword ptr kARGBToV
    movdqa     xmm7, xmmword ptr kARGBToU
    sub        edi, edx    // stride from u to v

 convertloop:
        /* convert to U and V */
    movdqu     xmm0, [eax]  // U
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0

    movdqu     xmm0, [eax]  // V
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm6
    pmaddubsw  xmm1, xmm6
    pmaddubsw  xmm2, xmm6
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    lea        eax,  [eax + 64]
    movdqu     [edx + edi], xmm0
    lea        edx,  [edx + 16]
    sub        ecx,  16
    jg         convertloop

    pop        edi
    ret
  }
}
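
// ARGBToUV444Row_SSSE3 above skips the subsampling step entirely: every input
// pixel yields one U and one V sample, so the kernel is simply two passes of
// the step-2 matrix math over the same 16 pixels, one with kARGBToU and one
// with kARGBToV.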

__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_argb
    mov        esi, [esp + 8 + 8]  // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm5, xmmword ptr kBiasUV128
    movdqa     xmm6, xmmword ptr kBGRAToV
    movdqa     xmm7, xmmword ptr kBGRAToU
    sub        edi, edx  // stride from u to v

 convertloop:
         /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4

    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

        // step 2 - convert to U and V
        // from here down is very similar to Y code except
        // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

        // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0  // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_argb
    mov        esi, [esp + 8 + 8]  // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm5, xmmword ptr kBiasUV128
    movdqa     xmm6, xmmword ptr kABGRToV
    movdqa     xmm7, xmmword ptr kABGRToU
    sub        edi, edx  // stride from u to v

 convertloop:
         /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4

    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

        // step 2 - convert to U and V
        // from here down is very similar to Y code except
        // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

        // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0  // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_argb
    mov        esi, [esp + 8 + 8]  // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm5, xmmword ptr kBiasUV128
    movdqa     xmm6, xmmword ptr kRGBAToV
    movdqa     xmm7, xmmword ptr kRGBAToU
    sub        edi, edx  // stride from u to v

 convertloop:
         /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm1, [eax + 16]
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm2, [eax + 32]
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4

    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

        // step 2 - convert to U and V
        // from here down is very similar to Y code except
        // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

        // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0  // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBTOYROW_SSSE3

// Read 16 UV from 444
#define READYUV444_AVX2 \
  __asm {                                                                      \
    __asm vmovdqu    xmm3, [esi] /* U */                                       \
    __asm vmovdqu    xmm1, [esi + edi] /* V */                                 \
    __asm lea        esi,  [esi + 16]                                          \
    __asm vpermq     ymm3, ymm3, 0xd8                                          \
    __asm vpermq     ymm1, ymm1, 0xd8                                          \
    __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */                                 \
    __asm vmovdqu    xmm4, [eax] /* Y */                                       \
    __asm vpermq     ymm4, ymm4, 0xd8                                          \
    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
    __asm lea        eax, [eax + 16]}

// Read 16 UV from 444.  With 16 Alpha.
#define READYUVA444_AVX2 \
  __asm {                                                                      \
    __asm vmovdqu    xmm3, [esi] /* U */                                       \
    __asm vmovdqu    xmm1, [esi + edi] /* V */                                 \
    __asm lea        esi,  [esi + 16]                                          \
    __asm vpermq     ymm3, ymm3, 0xd8                                          \
    __asm vpermq     ymm1, ymm1, 0xd8                                          \
    __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */                                 \
    __asm vmovdqu    xmm4, [eax] /* Y */                                       \
    __asm vpermq     ymm4, ymm4, 0xd8                                          \
    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
    __asm lea        eax, [eax + 16]                                           \
    __asm vmovdqu    xmm5, [ebp] /* A */                                       \
    __asm vpermq     ymm5, ymm5, 0xd8                                          \
    __asm lea        ebp, [ebp + 16]}

// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 \
  __asm {                                                                      \
    __asm vmovq      xmm3, qword ptr [esi] /* U */                             \
    __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                       \
    __asm lea        esi,  [esi + 8]                                           \
    __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */                                 \
    __asm vpermq     ymm3, ymm3, 0xd8                                          \
    __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */                    \
    __asm vmovdqu    xmm4, [eax] /* Y */                                       \
    __asm vpermq     ymm4, ymm4, 0xd8                                          \
    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
    __asm lea        eax, [eax + 16]}

// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
#define READYUVA422_AVX2 \
  __asm {                                                                      \
    __asm vmovq      xmm3, qword ptr [esi] /* U */                             \
    __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                       \
    __asm lea        esi,  [esi + 8]                                           \
    __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */                                 \
    __asm vpermq     ymm3, ymm3, 0xd8                                          \
    __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */                    \
    __asm vmovdqu    xmm4, [eax] /* Y */                                       \
    __asm vpermq     ymm4, ymm4, 0xd8                                          \
    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
    __asm lea        eax, [eax + 16]                                           \
    __asm vmovdqu    xmm5, [ebp] /* A */                                       \
    __asm vpermq     ymm5, ymm5, 0xd8                                          \
    __asm lea        ebp, [ebp + 16]}

// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 \
  __asm {                                                                      \
    __asm vmovdqu    xmm3, [esi] /* UV */                                      \
    __asm lea        esi,  [esi + 16]                                          \
    __asm vpermq     ymm3, ymm3, 0xd8                                          \
    __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */                    \
    __asm vmovdqu    xmm4, [eax] /* Y */                                       \
    __asm vpermq     ymm4, ymm4, 0xd8                                          \
    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
    __asm lea        eax, [eax + 16]}

// Read 8 UV from NV21, upsample to 16 UV.
#define READNV21_AVX2 \
  __asm {                                                                      \
    __asm vmovdqu    xmm3, [esi] /* UV */                                      \
    __asm lea        esi,  [esi + 16]                                          \
    __asm vpermq     ymm3, ymm3, 0xd8                                          \
    __asm vpshufb    ymm3, ymm3, ymmword ptr kShuffleNV21                      \
    __asm vmovdqu    xmm4, [eax] /* Y */                                       \
    __asm vpermq     ymm4, ymm4, 0xd8                                          \
    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
    __asm lea        eax, [eax + 16]}

// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2 \
  __asm {                                                                      \
    __asm vmovdqu    ymm4, [eax] /* YUY2 */                                    \
    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
    __asm vmovdqu    ymm3, [eax] /* UV */                                      \
    __asm vpshufb    ymm3, ymm3, ymmword ptr kShuffleYUY2UV                    \
    __asm lea        eax, [eax + 32]}

// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2 \
  __asm {                                                                      \
    __asm vmovdqu    ymm4, [eax] /* UYVY */                                    \
    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
    __asm vmovdqu    ymm3, [eax] /* UV */                                      \
    __asm vpshufb    ymm3, ymm3, ymmword ptr kShuffleUYVYUV                    \
    __asm lea        eax, [eax + 32]}

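// Each READ*_AVX2 macro above leaves the 16 chroma samples as interleaved
// U,V byte pairs in ymm3 and the 16 luma samples duplicated into both bytes
// of each word lane of ymm4 -- exactly the layouts that YUVTORGB_AVX2 below
// feeds to vpmaddubsw and vpmulhuw.
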
// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) \
  __asm {                                                                      \
    __asm vpsubb     ymm3, ymm3, ymmword ptr kBiasUV128                        \
    __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
    __asm vmovdqa    ymm0, ymmword ptr [YuvConstants + KUVTOB]                 \
    __asm vmovdqa    ymm1, ymmword ptr [YuvConstants + KUVTOG]                 \
    __asm vmovdqa    ymm2, ymmword ptr [YuvConstants + KUVTOR]                 \
    __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */                               \
    __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */                               \
    __asm vpmaddubsw ymm2, ymm2, ymm3 /* R UV */                               \
    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KYBIASTORGB]            \
    __asm vpaddw     ymm4, ymm3, ymm4                                          \
    __asm vpaddsw    ymm0, ymm0, ymm4                                          \
    __asm vpsubsw    ymm1, ymm4, ymm1                                          \
    __asm vpaddsw    ymm2, ymm2, ymm4                                          \
    __asm vpsraw     ymm0, ymm0, 6                                             \
    __asm vpsraw     ymm1, ymm1, 6                                             \
    __asm vpsraw     ymm2, ymm2, 6                                             \
    __asm vpackuswb  ymm0, ymm0, ymm0                                          \
    __asm vpackuswb  ymm1, ymm1, ymm1                                          \
    __asm vpackuswb  ymm2, ymm2, ymm2}
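
// A sketch of the fixed-point math in YUVTORGB_AVX2 above, assuming the
// conventional layout of the YuvConstants tables (u,v here are the
// bias-corrected chroma bytes and y the luma word):
//
//   y1 = ((y * yg) >> 16) + ybias;        // vpmulhuw + vpaddw
//   b  = clamp((y1 + ub_dot_uv) >> 6);    // vpaddsw, vpsraw, vpackuswb
//   g  = clamp((y1 - ug_dot_uv) >> 6);    // vpsubsw
//   r  = clamp((y1 + vr_dot_uv) >> 6);
//
// where each *_dot_uv is the vpmaddubsw dot product of a KUVTO{B,G,R} row
// with the interleaved UV bytes, and clamp() is the unsigned-byte saturation
// of vpackuswb.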

// Store 16 ARGB values.
#define STOREARGB_AVX2 \
  __asm {                                                                      \
    __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */                                 \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */                                 \
    __asm vpermq     ymm2, ymm2, 0xd8                                          \
    __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */                \
    __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */                 \
    __asm vmovdqu    0[edx], ymm1                                              \
    __asm vmovdqu    32[edx], ymm0                                             \
    __asm lea        edx,  [edx + 64]}

// Store 16 RGBA values.
#define STORERGBA_AVX2 \
  __asm {                                                                      \
    __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */                                 \
    __asm vpermq     ymm1, ymm1, 0xd8                                          \
    __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */                                 \
    __asm vpermq     ymm2, ymm2, 0xd8                                          \
    __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */                \
    __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */                 \
    __asm vmovdqu    [edx], ymm0                                               \
    __asm vmovdqu    [edx + 32], ymm1                                          \
    __asm lea        edx,  [edx + 64]}

#ifdef HAS_I422TOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void I422ToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]  // Y
    mov        esi, [esp + 12 + 8]  // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi
    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422TOARGBROW_AVX2

#ifdef HAS_I422ALPHATOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
__declspec(naked) void I422AlphaToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    const uint8_t* a_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        eax, [esp + 16 + 4]  // Y
    mov        esi, [esp + 16 + 8]  // U
    mov        edi, [esp + 16 + 12]  // V
    mov        ebp, [esp + 16 + 16]  // A
    mov        edx, [esp + 16 + 20]  // argb
    mov        ebx, [esp + 16 + 24]  // yuvconstants
    mov        ecx, [esp + 16 + 28]  // width
    sub        edi, esi

 convertloop:
    READYUVA422_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422ALPHATOARGBROW_AVX2

#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void I444ToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]  // Y
    mov        esi, [esp + 12 + 8]  // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi
    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 convertloop:
    READYUV444_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I444TOARGBROW_AVX2

#ifdef HAS_I444ALPHATOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y and 16 A producing 16 ARGB (64 bytes).
__declspec(naked) void I444AlphaToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    const uint8_t* a_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        eax, [esp + 16 + 4]  // Y
    mov        esi, [esp + 16 + 8]  // U
    mov        edi, [esp + 16 + 12]  // V
    mov        ebp, [esp + 16 + 16]  // A
    mov        edx, [esp + 16 + 20]  // argb
    mov        ebx, [esp + 16 + 24]  // yuvconstants
    mov        ecx, [esp + 16 + 28]  // width
    sub        edi, esi

 convertloop:
    READYUVA444_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I444ALPHATOARGBROW_AVX2

#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void NV12ToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* uv_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       ebx
    mov        eax, [esp + 8 + 4]  // Y
    mov        esi, [esp + 8 + 8]  // UV
    mov        edx, [esp + 8 + 12]  // argb
    mov        ebx, [esp + 8 + 16]  // yuvconstants
    mov        ecx, [esp + 8 + 20]  // width
    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV12_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_NV12TOARGBROW_AVX2

#ifdef HAS_NV21TOARGBROW_AVX2
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void NV21ToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* vu_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       ebx
    mov        eax, [esp + 8 + 4]  // Y
    mov        esi, [esp + 8 + 8]  // VU
    mov        edx, [esp + 8 + 12]  // argb
    mov        ebx, [esp + 8 + 16]  // yuvconstants
    mov        ecx, [esp + 8 + 20]  // width
    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV21_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_NV21TOARGBROW_AVX2
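
// NV21ToARGBRow_AVX2 is identical to NV12ToARGBRow_AVX2 except for the read:
// READNV21_AVX2 runs the interleaved bytes through kShuffleNV21, which swaps
// each V,U pair into U,V order and duplicates it for the 4:2:0 upsample in a
// single vpshufb, where the NV12 path uses vpunpcklwd instead.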

#ifdef HAS_YUY2TOARGBROW_AVX2
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
__declspec(naked) void YUY2ToARGBRow_AVX2(
    const uint8_t* src_yuy2,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]  // yuy2
    mov        edx, [esp + 4 + 8]  // argb
    mov        ebx, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUY2_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    vzeroupper
    ret
  }
}
#endif  // HAS_YUY2TOARGBROW_AVX2

#ifdef HAS_UYVYTOARGBROW_AVX2
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
__declspec(naked) void UYVYToARGBRow_AVX2(
    const uint8_t* src_uyvy,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]  // uyvy
    mov        edx, [esp + 4 + 8]  // argb
    mov        ebx, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READUYVY_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    vzeroupper
    ret
  }
}
#endif  // HAS_UYVYTOARGBROW_AVX2

#ifdef HAS_I422TORGBAROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
__declspec(naked) void I422ToRGBARow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]  // Y
    mov        esi, [esp + 12 + 8]  // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // rgba
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi
    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(ebx)
    STORERGBA_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422TORGBAROW_AVX2

#if defined(HAS_I422TOARGBROW_SSSE3)
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
// Allows a conversion with half size scaling.

// Read 8 UV from 444.
#define READYUV444 \
  __asm {                                                                      \
    __asm movq       xmm3, qword ptr [esi] /* U */                             \
    __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklbw  xmm3, xmm1 /* UV */                                       \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]}

// Read 8 UV from 444.  With 8 Alpha.
#define READYUVA444 \
  __asm {                                                                      \
    __asm movq       xmm3, qword ptr [esi] /* U */                             \
    __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklbw  xmm3, xmm1 /* UV */                                       \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]                                            \
    __asm movq       xmm5, qword ptr [ebp] /* A */                             \
    __asm lea        ebp, [ebp + 8]}

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
  __asm {                                                                      \
    __asm movd       xmm3, [esi] /* U */                                       \
    __asm movd       xmm1, [esi + edi] /* V */                                 \
    __asm lea        esi,  [esi + 4]                                           \
    __asm punpcklbw  xmm3, xmm1 /* UV */                                       \
    __asm punpcklwd  xmm3, xmm3 /* UVUV (upsample) */                          \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]}

// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
#define READYUVA422 \
  __asm {                                                                      \
    __asm movd       xmm3, [esi] /* U */                                       \
    __asm movd       xmm1, [esi + edi] /* V */                                 \
    __asm lea        esi,  [esi + 4]                                           \
    __asm punpcklbw  xmm3, xmm1 /* UV */                                       \
    __asm punpcklwd  xmm3, xmm3 /* UVUV (upsample) */                          \
    __asm movq       xmm4, qword ptr [eax] /* Y */                             \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]                                            \
    __asm movq       xmm5, qword ptr [ebp] /* A */                             \
    __asm lea        ebp, [ebp + 8]}

// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 \
  __asm {                                                                      \
    __asm movq       xmm3, qword ptr [esi] /* UV */                            \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklwd  xmm3, xmm3 /* UVUV (upsample) */                          \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]}

// Read 4 VU from NV21, upsample to 8 UV.
#define READNV21 \
  __asm {                                                                      \
    __asm movq       xmm3, qword ptr [esi] /* UV */                            \
    __asm lea        esi,  [esi + 8]                                           \
    __asm pshufb     xmm3, xmmword ptr kShuffleNV21                            \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]}

// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
#define READYUY2 \
  __asm {                                                                      \
    __asm movdqu     xmm4, [eax] /* YUY2 */                                    \
    __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
    __asm movdqu     xmm3, [eax] /* UV */                                      \
    __asm pshufb     xmm3, xmmword ptr kShuffleYUY2UV                          \
    __asm lea        eax, [eax + 16]}

// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
#define READUYVY \
  __asm {                                                                      \
    __asm movdqu     xmm4, [eax] /* UYVY */                                    \
    __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
    __asm movdqu     xmm3, [eax] /* UV */                                      \
    __asm pshufb     xmm3, xmmword ptr kShuffleUYVYUV                          \
    __asm lea        eax, [eax + 16]}

// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(YuvConstants) \
  __asm {                                                                      \
    __asm psubb      xmm3, xmmword ptr kBiasUV128                              \
    __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
    __asm movdqa     xmm0, xmmword ptr [YuvConstants + KUVTOB]                 \
    __asm movdqa     xmm1, xmmword ptr [YuvConstants + KUVTOG]                 \
    __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVTOR]                 \
    __asm pmaddubsw  xmm0, xmm3                                                \
    __asm pmaddubsw  xmm1, xmm3                                                \
    __asm pmaddubsw  xmm2, xmm3                                                \
    __asm movdqa     xmm3, xmmword ptr [YuvConstants + KYBIASTORGB]            \
    __asm paddw      xmm4, xmm3                                                \
    __asm paddsw     xmm0, xmm4                                                \
    __asm paddsw     xmm2, xmm4                                                \
    __asm psubsw     xmm4, xmm1                                                \
    __asm movdqa     xmm1, xmm4                                                \
    __asm psraw      xmm0, 6                                                   \
    __asm psraw      xmm1, 6                                                   \
    __asm psraw      xmm2, 6                                                   \
    __asm packuswb   xmm0, xmm0 /* B */                                        \
    __asm packuswb   xmm1, xmm1 /* G */                                        \
    __asm packuswb   xmm2, xmm2 /* R */             \
  }
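
// YUVTORGB above is the 8-pixel SSSE3 analogue of YUVTORGB_AVX2: the same
// fixed-point math on xmm registers.  The only structural difference is the
// G term: the two-operand psubsw overwrites its destination, so the kernel
// computes xmm4 = y1 - g_uv and then copies it back into xmm1.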

// Store 8 ARGB values.
#define STOREARGB \
  __asm {                                                                      \
    __asm punpcklbw  xmm0, xmm1 /* BG */                                       \
    __asm punpcklbw  xmm2, xmm5 /* RA */                                       \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2 /* BGRA first 4 pixels */                      \
    __asm punpckhwd  xmm1, xmm2 /* BGRA next 4 pixels */                       \
    __asm movdqu     0[edx], xmm0                                              \
    __asm movdqu     16[edx], xmm1                                             \
    __asm lea        edx,  [edx + 32]}

// Store 8 BGRA values.
#define STOREBGRA \
  __asm {                                                                      \
    __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */            \
    __asm punpcklbw  xmm1, xmm0 /* GB */                                       \
    __asm punpcklbw  xmm5, xmm2 /* AR */                                       \
    __asm movdqa     xmm0, xmm5                                                \
    __asm punpcklwd  xmm5, xmm1 /* BGRA first 4 pixels */                      \
    __asm punpckhwd  xmm0, xmm1 /* BGRA next 4 pixels */                       \
    __asm movdqu     0[edx], xmm5                                              \
    __asm movdqu     16[edx], xmm0                                             \
    __asm lea        edx,  [edx + 32]}

// Store 8 RGBA values.
#define STORERGBA \
  __asm {                                                                      \
    __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */            \
    __asm punpcklbw  xmm1, xmm2 /* GR */                                       \
    __asm punpcklbw  xmm5, xmm0 /* AB */                                       \
    __asm movdqa     xmm0, xmm5                                                \
    __asm punpcklwd  xmm5, xmm1 /* RGBA first 4 pixels */                      \
    __asm punpckhwd  xmm0, xmm1 /* RGBA next 4 pixels */                       \
    __asm movdqu     0[edx], xmm5                                              \
    __asm movdqu     16[edx], xmm0                                             \
    __asm lea        edx,  [edx + 32]}

// Store 8 RGB24 values.
#define STORERGB24 \
  __asm {/* Weave into RRGB */                                                 \
    __asm punpcklbw  xmm0, xmm1 /* BG */                                       \
    __asm punpcklbw  xmm2, xmm2 /* RR */                                       \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */                      \
    __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */   \
    __asm pshufb     xmm0, xmm5 /* Pack first 8 and last 4 bytes. */           \
    __asm pshufb     xmm1, xmm6 /* Pack first 12 bytes. */                     \
    __asm palignr    xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */       \
    __asm movq       qword ptr 0[edx], xmm0 /* First 8 bytes */                \
    __asm movdqu     8[edx], xmm1 /* Last 16 bytes */                          \
    __asm lea        edx,  [edx + 24]}
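
// STORERGB24 packs the woven BGRR words down to 24-bit pixels: the two
// pshufb masks drop the spare R bytes from each half, palignr splices the
// 4 leftover bytes of xmm0 in front of xmm1's 12, and the movq + movdqu pair
// stores the resulting 8 + 16 = 24 output bytes.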

// Store 8 RGB565 values.
#define STORERGB565 \
  __asm {/* Weave into RRGB */                                                 \
    __asm punpcklbw  xmm0, xmm1 /* BG */                                       \
    __asm punpcklbw  xmm2, xmm2 /* RR */                                       \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */                      \
    __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */  \
    __asm movdqa     xmm3, xmm0 /* B  first 4 pixels of argb */                \
    __asm movdqa     xmm2, xmm0 /* G */                                        \
    __asm pslld      xmm0, 8 /* R */                                           \
    __asm psrld      xmm3, 3 /* B */                                           \
    __asm psrld      xmm2, 5 /* G */                                           \
    __asm psrad      xmm0, 16 /* R */                                          \
    __asm pand       xmm3, xmm5 /* B */                                        \
    __asm pand       xmm2, xmm6 /* G */                                        \
    __asm pand       xmm0, xmm7 /* R */                                        \
    __asm por        xmm3, xmm2 /* BG */                                       \
    __asm por        xmm0, xmm3 /* BGR */                                      \
    __asm movdqa     xmm3, xmm1 /* B  next 4 pixels of argb */                 \
    __asm movdqa     xmm2, xmm1 /* G */                                        \
    __asm pslld      xmm1, 8 /* R */                                           \
    __asm psrld      xmm3, 3 /* B */                                           \
    __asm psrld      xmm2, 5 /* G */                                           \
    __asm psrad      xmm1, 16 /* R */                                          \
    __asm pand       xmm3, xmm5 /* B */                                        \
    __asm pand       xmm2, xmm6 /* G */                                        \
    __asm pand       xmm1, xmm7 /* R */                                        \
    __asm por        xmm3, xmm2 /* BG */                                       \
    __asm por        xmm1, xmm3 /* BGR */                                      \
    __asm packssdw   xmm0, xmm1                                                \
    __asm movdqu     0[edx], xmm0 /* store 8 pixels of RGB565 */               \
    __asm lea        edx, [edx + 16]}
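
// A scalar sketch of the 565 packing STORERGB565 performs per pixel (the
// masks it consumes are built in I422ToRGB565Row_SSSE3 below:
// 0x0000001f for B, 0x000007e0 for G, 0xfffff800 for R):
//
//   uint16_t rgb565 =
//       (uint16_t)((b >> 3) | (((uint16_t)(g >> 2)) << 5) |
//                  (((uint16_t)(r >> 3)) << 11));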

// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void I444ToARGBRow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]  // Y
    mov        esi, [esp + 12 + 8]  // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READYUV444
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels.
// 8 UV values, mixed with 8 Y and 8 A producing 8 ARGB (32 bytes).
__declspec(naked) void I444AlphaToARGBRow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    const uint8_t* a_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        eax, [esp + 16 + 4]  // Y
    mov        esi, [esp + 16 + 8]  // U
    mov        edi, [esp + 16 + 12]  // V
    mov        ebp, [esp + 16 + 16]  // A
    mov        edx, [esp + 16 + 20]  // argb
    mov        ebx, [esp + 16 + 24]  // yuvconstants
    mov        ecx, [esp + 16 + 28]  // width
    sub        edi, esi

 convertloop:
    READYUVA444
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}
2755 
2756 // 8 pixels.
2757 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2758 __declspec(naked) void I422ToRGB24Row_SSSE3(
2759     const uint8_t* y_buf,
2760     const uint8_t* u_buf,
2761     const uint8_t* v_buf,
2762     uint8_t* dst_rgb24,
2763     const struct YuvConstants* yuvconstants,
2764     int width) {
2765   __asm {
2766     push       esi
2767     push       edi
2768     push       ebx
2769     mov        eax, [esp + 12 + 4]  // Y
2770     mov        esi, [esp + 12 + 8]  // U
2771     mov        edi, [esp + 12 + 12]  // V
2772     mov        edx, [esp + 12 + 16]  // rgb24
2773     mov        ebx, [esp + 12 + 20]  // yuvconstants
2774     mov        ecx, [esp + 12 + 24]  // width
2775     sub        edi, esi
2776     movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
2777     movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24
2778 
2779  convertloop:
2780     READYUV422
2781     YUVTORGB(ebx)
2782     STORERGB24
2783 
2784     sub        ecx, 8
2785     jg         convertloop
2786 
2787     pop        ebx
2788     pop        edi
2789     pop        esi
2790     ret
2791   }
2792 }
2793 
2794 // 8 pixels.
2795 // 8 UV values, mixed with 8 Y producing 8 RGB24 (24 bytes).
2796 __declspec(naked) void I444ToRGB24Row_SSSE3(
2797     const uint8_t* y_buf,
2798     const uint8_t* u_buf,
2799     const uint8_t* v_buf,
2800     uint8_t* dst_rgb24,
2801     const struct YuvConstants* yuvconstants,
2802     int width) {
2803   __asm {
2804     push       esi
2805     push       edi
2806     push       ebx
2807     mov        eax, [esp + 12 + 4]  // Y
2808     mov        esi, [esp + 12 + 8]  // U
2809     mov        edi, [esp + 12 + 12]  // V
2810     mov        edx, [esp + 12 + 16]  // rgb24
2811     mov        ebx, [esp + 12 + 20]  // yuvconstants
2812     mov        ecx, [esp + 12 + 24]  // width
2813     sub        edi, esi
2814     movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
2815     movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24
2816 
2817  convertloop:
2818     READYUV444
2819     YUVTORGB(ebx)
2820     STORERGB24
2821 
2822     sub        ecx, 8
2823     jg         convertloop
2824 
2825     pop        ebx
2826     pop        edi
2827     pop        esi
2828     ret
2829   }
2830 }
2831 
2832 // 8 pixels.
2833 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2834 __declspec(naked) void I422ToRGB565Row_SSSE3(
2835     const uint8_t* y_buf,
2836     const uint8_t* u_buf,
2837     const uint8_t* v_buf,
2838     uint8_t* rgb565_buf,
2839     const struct YuvConstants* yuvconstants,
2840     int width) {
2841   __asm {
2842     push       esi
2843     push       edi
2844     push       ebx
2845     mov        eax, [esp + 12 + 4]  // Y
2846     mov        esi, [esp + 12 + 8]  // U
2847     mov        edi, [esp + 12 + 12]  // V
2848     mov        edx, [esp + 12 + 16]  // rgb565
2849     mov        ebx, [esp + 12 + 20]  // yuvconstants
2850     mov        ecx, [esp + 12 + 24]  // width
2851     sub        edi, esi
2852     pcmpeqb    xmm5, xmm5  // generate mask 0x0000001f
2853     psrld      xmm5, 27
2854     pcmpeqb    xmm6, xmm6  // generate mask 0x000007e0
2855     psrld      xmm6, 26
2856     pslld      xmm6, 5
2857     pcmpeqb    xmm7, xmm7  // generate mask 0xfffff800
2858     pslld      xmm7, 11
2859 
2860  convertloop:
2861     READYUV422
2862     YUVTORGB(ebx)
2863     STORERGB565
2864 
2865     sub        ecx, 8
2866     jg         convertloop
2867 
2868     pop        ebx
2869     pop        edi
2870     pop        esi
2871     ret
2872   }
2873 }
2874 
2875 // 8 pixels.
2876 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2877 __declspec(naked) void I422ToARGBRow_SSSE3(
2878     const uint8_t* y_buf,
2879     const uint8_t* u_buf,
2880     const uint8_t* v_buf,
2881     uint8_t* dst_argb,
2882     const struct YuvConstants* yuvconstants,
2883     int width) {
2884   __asm {
2885     push       esi
2886     push       edi
2887     push       ebx
2888     mov        eax, [esp + 12 + 4]  // Y
2889     mov        esi, [esp + 12 + 8]  // U
2890     mov        edi, [esp + 12 + 12]  // V
2891     mov        edx, [esp + 12 + 16]  // argb
2892     mov        ebx, [esp + 12 + 20]  // yuvconstants
2893     mov        ecx, [esp + 12 + 24]  // width
2894     sub        edi, esi
2895     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2896 
2897  convertloop:
2898     READYUV422
2899     YUVTORGB(ebx)
2900     STOREARGB
2901 
2902     sub        ecx, 8
2903     jg         convertloop
2904 
2905     pop        ebx
2906     pop        edi
2907     pop        esi
2908     ret
2909   }
2910 }
2911 
2912 // 8 pixels.
2913 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
2914 __declspec(naked) void I422AlphaToARGBRow_SSSE3(
2915     const uint8_t* y_buf,
2916     const uint8_t* u_buf,
2917     const uint8_t* v_buf,
2918     const uint8_t* a_buf,
2919     uint8_t* dst_argb,
2920     const struct YuvConstants* yuvconstants,
2921     int width) {
2922   __asm {
2923     push       esi
2924     push       edi
2925     push       ebx
2926     push       ebp
2927     mov        eax, [esp + 16 + 4]  // Y
2928     mov        esi, [esp + 16 + 8]  // U
2929     mov        edi, [esp + 16 + 12]  // V
2930     mov        ebp, [esp + 16 + 16]  // A
2931     mov        edx, [esp + 16 + 20]  // argb
2932     mov        ebx, [esp + 16 + 24]  // yuvconstants
2933     mov        ecx, [esp + 16 + 28]  // width
2934     sub        edi, esi
2935 
2936  convertloop:
2937     READYUVA422
2938     YUVTORGB(ebx)
2939     STOREARGB
2940 
2941     sub        ecx, 8
2942     jg         convertloop
2943 
2944     pop        ebp
2945     pop        ebx
2946     pop        edi
2947     pop        esi
2948     ret
2949   }
2950 }
2951 
2952 // 8 pixels.
2953 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2954 __declspec(naked) void NV12ToARGBRow_SSSE3(
2955     const uint8_t* y_buf,
2956     const uint8_t* uv_buf,
2957     uint8_t* dst_argb,
2958     const struct YuvConstants* yuvconstants,
2959     int width) {
2960   __asm {
2961     push       esi
2962     push       ebx
2963     mov        eax, [esp + 8 + 4]  // Y
2964     mov        esi, [esp + 8 + 8]  // UV
2965     mov        edx, [esp + 8 + 12]  // argb
2966     mov        ebx, [esp + 8 + 16]  // yuvconstants
2967     mov        ecx, [esp + 8 + 20]  // width
2968     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
2969 
2970  convertloop:
2971     READNV12
2972     YUVTORGB(ebx)
2973     STOREARGB
2974 
2975     sub        ecx, 8
2976     jg         convertloop
2977 
2978     pop        ebx
2979     pop        esi
2980     ret
2981   }
2982 }
2983 
2984 // 8 pixels.
2985 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
2986 __declspec(naked) void NV21ToARGBRow_SSSE3(
2987     const uint8_t* y_buf,
2988     const uint8_t* vu_buf,
2989     uint8_t* dst_argb,
2990     const struct YuvConstants* yuvconstants,
2991     int width) {
2992   __asm {
2993     push       esi
2994     push       ebx
2995     mov        eax, [esp + 8 + 4]  // Y
2996     mov        esi, [esp + 8 + 8]  // VU
2997     mov        edx, [esp + 8 + 12]  // argb
2998     mov        ebx, [esp + 8 + 16]  // yuvconstants
2999     mov        ecx, [esp + 8 + 20]  // width
3000     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
3001 
3002  convertloop:
3003     READNV21
3004     YUVTORGB(ebx)
3005     STOREARGB
3006 
3007     sub        ecx, 8
3008     jg         convertloop
3009 
3010     pop        ebx
3011     pop        esi
3012     ret
3013   }
3014 }
3015 
3016 // 8 pixels.
3017 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
3018 __declspec(naked) void YUY2ToARGBRow_SSSE3(
3019     const uint8_t* src_yuy2,
3020     uint8_t* dst_argb,
3021     const struct YuvConstants* yuvconstants,
3022     int width) {
3023   __asm {
3024     push       ebx
3025     mov        eax, [esp + 4 + 4]  // yuy2
3026     mov        edx, [esp + 4 + 8]  // argb
3027     mov        ebx, [esp + 4 + 12]  // yuvconstants
3028     mov        ecx, [esp + 4 + 16]  // width
3029     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
3030 
3031  convertloop:
3032     READYUY2
3033     YUVTORGB(ebx)
3034     STOREARGB
3035 
3036     sub        ecx, 8
3037     jg         convertloop
3038 
3039     pop        ebx
3040     ret
3041   }
3042 }
3043 
3044 // 8 pixels.
3045 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
3046 __declspec(naked) void UYVYToARGBRow_SSSE3(
3047     const uint8_t* src_uyvy,
3048     uint8_t* dst_argb,
3049     const struct YuvConstants* yuvconstants,
3050     int width) {
3051   __asm {
3052     push       ebx
3053     mov        eax, [esp + 4 + 4]  // uyvy
3054     mov        edx, [esp + 4 + 8]  // argb
3055     mov        ebx, [esp + 4 + 12]  // yuvconstants
3056     mov        ecx, [esp + 4 + 16]  // width
3057     pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
3058 
3059  convertloop:
3060     READUYVY
3061     YUVTORGB(ebx)
3062     STOREARGB
3063 
3064     sub        ecx, 8
3065     jg         convertloop
3066 
3067     pop        ebx
3068     ret
3069   }
3070 }
3071 
3072 __declspec(naked) void I422ToRGBARow_SSSE3(
3073     const uint8_t* y_buf,
3074     const uint8_t* u_buf,
3075     const uint8_t* v_buf,
3076     uint8_t* dst_rgba,
3077     const struct YuvConstants* yuvconstants,
3078     int width) {
3079   __asm {
3080     push       esi
3081     push       edi
3082     push       ebx
3083     mov        eax, [esp + 12 + 4]  // Y
3084     mov        esi, [esp + 12 + 8]  // U
3085     mov        edi, [esp + 12 + 12]  // V
3086     mov        edx, [esp + 12 + 16]  // rgba
3087     mov        ebx, [esp + 12 + 20]  // yuvconstants
3088     mov        ecx, [esp + 12 + 24]  // width
3089     sub        edi, esi
3090 
3091  convertloop:
3092     READYUV422
3093     YUVTORGB(ebx)
3094     STORERGBA
3095 
3096     sub        ecx, 8
3097     jg         convertloop
3098 
3099     pop        ebx
3100     pop        edi
3101     pop        esi
3102     ret
3103   }
3104 }
3105 #endif  // HAS_I422TOARGBROW_SSSE3
3106 
3107 // I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter
3108 #ifdef HAS_I400TOARGBROW_SSE2
3109 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
3110 __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
3111                                           uint8_t* rgb_buf,
3112                                           const struct YuvConstants*,
3113                                           int width) {
3114   __asm {
3115     mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
3116     movd       xmm2, eax
3117     pshufd     xmm2, xmm2, 0
3118     mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16) - 32 so the >> 6 rounds
3119     movd       xmm3, eax
3120     pshufd     xmm3, xmm3, 0
3121     pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
3122     pslld      xmm4, 24
3123 
3124     mov        eax, [esp + 4]  // Y
3125     mov        edx, [esp + 8]  // rgb
3126     mov        ecx, [esp + 12]  // width
3127 
3128  convertloop:
3129         // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
3130     movq       xmm0, qword ptr [eax]
3131     lea        eax, [eax + 8]
3132     punpcklbw  xmm0, xmm0  // Y.Y
3133     pmulhuw    xmm0, xmm2
3134     psubusw    xmm0, xmm3
3135     psrlw      xmm0, 6
3136     packuswb   xmm0, xmm0        // G
3137 
3138         // Step 2: Weave into ARGB
3139     punpcklbw  xmm0, xmm0  // GG
3140     movdqa     xmm1, xmm0
3141     punpcklwd  xmm0, xmm0  // BGRA first 4 pixels
3142     punpckhwd  xmm1, xmm1  // BGRA next 4 pixels
3143     por        xmm0, xmm4
3144     por        xmm1, xmm4
3145     movdqu     [edx], xmm0
3146     movdqu     [edx + 16], xmm1
3147     lea        edx,  [edx + 32]
3148     sub        ecx, 8
3149     jg         convertloop
3150     ret
3151   }
3152 }
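
// Scalar model of the fixed-point luma expansion above (illustrative only).
// punpcklbw duplicates Y into both bytes of each word, i.e. multiplies by
// 257, which is why the multiplier folds in a 256/257 correction:
static inline uint8_t ScalarI400ToG(uint8_t y) {
  uint32_t g = ((uint32_t)y * 257u * 18997u) >> 16;  // pmulhuw
  g = g < 1160u ? 0u : g - 1160u;                    // psubusw (bias + round)
  g >>= 6;                                           // psrlw
  return (uint8_t)(g > 255u ? 255u : g);             // packuswb saturates
}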
3153 #endif  // HAS_I400TOARGBROW_SSE2
3154 
3155 #ifdef HAS_I400TOARGBROW_AVX2
3156 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
3157 // note: vpunpcklbw mutates and vpackuswb unmutates.
3158 __declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
3159                                           uint8_t* rgb_buf,
3160                                           const struct YuvConstants*,
3161                                           int width) {
3162   __asm {
3163     mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
3164     vmovd      xmm2, eax
3165     vbroadcastss ymm2, xmm2
3166     mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16) - 32 so the >> 6 rounds
3167     vmovd      xmm3, eax
3168     vbroadcastss ymm3, xmm3
3169     vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xff000000
3170     vpslld     ymm4, ymm4, 24
3171 
3172     mov        eax, [esp + 4]  // Y
3173     mov        edx, [esp + 8]  // rgb
3174     mov        ecx, [esp + 12]  // width
3175 
3176  convertloop:
3177         // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
3178     vmovdqu    xmm0, [eax]
3179     lea        eax, [eax + 16]
3180     vpermq     ymm0, ymm0, 0xd8  // vpunpcklbw mutates
3181     vpunpcklbw ymm0, ymm0, ymm0  // Y.Y
3182     vpmulhuw   ymm0, ymm0, ymm2
3183     vpsubusw   ymm0, ymm0, ymm3
3184     vpsrlw     ymm0, ymm0, 6
3185     vpackuswb  ymm0, ymm0, ymm0        // G.  still mutated: 3120
3186 
3187         // TODO(fbarchard): Weave alpha with unpack.
3188         // Step 2: Weave into ARGB
3189     vpunpcklbw ymm1, ymm0, ymm0  // GG - mutates
3190     vpermq     ymm1, ymm1, 0xd8
3191     vpunpcklwd ymm0, ymm1, ymm1  // GGGG first 8 pixels
3192     vpunpckhwd ymm1, ymm1, ymm1  // GGGG next 8 pixels
3193     vpor       ymm0, ymm0, ymm4
3194     vpor       ymm1, ymm1, ymm4
3195     vmovdqu    [edx], ymm0
3196     vmovdqu    [edx + 32], ymm1
3197     lea        edx,  [edx + 64]
3198     sub        ecx, 16
3199     jg         convertloop
3200     vzeroupper
3201     ret
3202   }
3203 }
3204 #endif  // HAS_I400TOARGBROW_AVX2
3205 
3206 #ifdef HAS_MIRRORROW_SSSE3
3207 // Shuffle table for reversing the bytes.
3208 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
3209                                      7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
3210 
3211 // TODO(fbarchard): Replace lea with -16 offset.
3212 __declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
3213                                        uint8_t* dst,
3214                                        int width) {
3215   __asm {
3216     mov       eax, [esp + 4]  // src
3217     mov       edx, [esp + 8]  // dst
3218     mov       ecx, [esp + 12]  // width
3219     movdqa    xmm5, xmmword ptr kShuffleMirror
3220 
3221  convertloop:
3222     movdqu    xmm0, [eax - 16 + ecx]
3223     pshufb    xmm0, xmm5
3224     movdqu    [edx], xmm0
3225     lea       edx, [edx + 16]
3226     sub       ecx, 16
3227     jg        convertloop
3228     ret
3229   }
3230 }
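
// Scalar model (illustrative only): each iteration above reads 16 bytes
// ending at src[width] and stores them byte-reversed, walking ecx to 0.
static void ScalarMirrorRow(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}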
3231 #endif  // HAS_MIRRORROW_SSSE3
3232 
3233 #ifdef HAS_MIRRORROW_AVX2
3234 __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
3235                                       uint8_t* dst,
3236                                       int width) {
3237   __asm {
3238     mov       eax, [esp + 4]  // src
3239     mov       edx, [esp + 8]  // dst
3240     mov       ecx, [esp + 12]  // width
3241     vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
3242 
3243  convertloop:
3244     vmovdqu   ymm0, [eax - 32 + ecx]
3245     vpshufb   ymm0, ymm0, ymm5
3246     vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
3247     vmovdqu   [edx], ymm0
3248     lea       edx, [edx + 32]
3249     sub       ecx, 32
3250     jg        convertloop
3251     vzeroupper
3252     ret
3253   }
3254 }
3255 #endif  // HAS_MIRRORROW_AVX2
3256 
3257 #ifdef HAS_MIRRORSPLITUVROW_SSSE3
3258 // Shuffle table for reversing and splitting interleaved UV bytes.
3259 static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
3260                                        15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
3261 
3262 __declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
3263                                               uint8_t* dst_u,
3264                                               uint8_t* dst_v,
3265                                               int width) {
3266   __asm {
3267     push      edi
3268     mov       eax, [esp + 4 + 4]  // src
3269     mov       edx, [esp + 4 + 8]  // dst_u
3270     mov       edi, [esp + 4 + 12]  // dst_v
3271     mov       ecx, [esp + 4 + 16]  // width
3272     movdqa    xmm1, xmmword ptr kShuffleMirrorUV
3273     lea       eax, [eax + ecx * 2 - 16]
3274     sub       edi, edx
3275 
3276  convertloop:
3277     movdqu    xmm0, [eax]
3278     lea       eax, [eax - 16]
3279     pshufb    xmm0, xmm1
3280     movlpd    qword ptr [edx], xmm0
3281     movhpd    qword ptr [edx + edi], xmm0
3282     lea       edx, [edx + 8]
3283     sub       ecx, 8
3284     jg        convertloop
3285 
3286     pop       edi
3287     ret
3288   }
3289 }
3290 #endif  // HAS_MIRRORSPLITUVROW_SSSE3
3291 
3292 #ifdef HAS_ARGBMIRRORROW_SSE2
3293 __declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
3294                                           uint8_t* dst,
3295                                           int width) {
3296   __asm {
3297     mov       eax, [esp + 4]  // src
3298     mov       edx, [esp + 8]  // dst
3299     mov       ecx, [esp + 12]  // width
3300     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
3301 
3302  convertloop:
3303     movdqu    xmm0, [eax]
3304     lea       eax, [eax - 16]
3305     pshufd    xmm0, xmm0, 0x1b
3306     movdqu    [edx], xmm0
3307     lea       edx, [edx + 16]
3308     sub       ecx, 4
3309     jg        convertloop
3310     ret
3311   }
3312 }
3313 #endif  // HAS_ARGBMIRRORROW_SSE2
3314 
3315 #ifdef HAS_ARGBMIRRORROW_AVX2
3316 // Permute table for reversing the pixels (dwords).
3317 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3318 
3319 __declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
3320                                           uint8_t* dst,
3321                                           int width) {
3322   __asm {
3323     mov       eax, [esp + 4]  // src
3324     mov       edx, [esp + 8]  // dst
3325     mov       ecx, [esp + 12]  // width
3326     vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
3327 
3328  convertloop:
3329     vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
3330     vmovdqu   [edx], ymm0
3331     lea       edx, [edx + 32]
3332     sub       ecx, 8
3333     jg        convertloop
3334     vzeroupper
3335     ret
3336   }
3337 }
3338 #endif  // HAS_ARGBMIRRORROW_AVX2
3339 
3340 #ifdef HAS_SPLITUVROW_SSE2
3341 __declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
3342                                        uint8_t* dst_u,
3343                                        uint8_t* dst_v,
3344                                        int width) {
3345   __asm {
3346     push       edi
3347     mov        eax, [esp + 4 + 4]  // src_uv
3348     mov        edx, [esp + 4 + 8]  // dst_u
3349     mov        edi, [esp + 4 + 12]  // dst_v
3350     mov        ecx, [esp + 4 + 16]  // width
3351     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
3352     psrlw      xmm5, 8
3353     sub        edi, edx
3354 
3355   convertloop:
3356     movdqu     xmm0, [eax]
3357     movdqu     xmm1, [eax + 16]
3358     lea        eax,  [eax + 32]
3359     movdqa     xmm2, xmm0
3360     movdqa     xmm3, xmm1
3361     pand       xmm0, xmm5  // even bytes
3362     pand       xmm1, xmm5
3363     packuswb   xmm0, xmm1
3364     psrlw      xmm2, 8  // odd bytes
3365     psrlw      xmm3, 8
3366     packuswb   xmm2, xmm3
3367     movdqu     [edx], xmm0
3368     movdqu     [edx + edi], xmm2
3369     lea        edx, [edx + 16]
3370     sub        ecx, 16
3371     jg         convertloop
3372 
3373     pop        edi
3374     ret
3375   }
3376 }
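
// Scalar model of SplitUVRow (illustrative only): even bytes are U, odd V.
static void ScalarSplitUVRow(const uint8_t* src_uv, uint8_t* dst_u,
                             uint8_t* dst_v, int width) {
  for (int i = 0; i < width; ++i) {
    dst_u[i] = src_uv[2 * i];
    dst_v[i] = src_uv[2 * i + 1];
  }
}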
3377 
3378 #endif  // HAS_SPLITUVROW_SSE2
3379 
3380 #ifdef HAS_SPLITUVROW_AVX2
3381 __declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
3382                                        uint8_t* dst_u,
3383                                        uint8_t* dst_v,
3384                                        int width) {
3385   __asm {
3386     push       edi
3387     mov        eax, [esp + 4 + 4]  // src_uv
3388     mov        edx, [esp + 4 + 8]  // dst_u
3389     mov        edi, [esp + 4 + 12]  // dst_v
3390     mov        ecx, [esp + 4 + 16]  // width
3391     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3392     vpsrlw     ymm5, ymm5, 8
3393     sub        edi, edx
3394 
3395   convertloop:
3396     vmovdqu    ymm0, [eax]
3397     vmovdqu    ymm1, [eax + 32]
3398     lea        eax,  [eax + 64]
3399     vpsrlw     ymm2, ymm0, 8  // odd bytes
3400     vpsrlw     ymm3, ymm1, 8
3401     vpand      ymm0, ymm0, ymm5  // even bytes
3402     vpand      ymm1, ymm1, ymm5
3403     vpackuswb  ymm0, ymm0, ymm1
3404     vpackuswb  ymm2, ymm2, ymm3
3405     vpermq     ymm0, ymm0, 0xd8
3406     vpermq     ymm2, ymm2, 0xd8
3407     vmovdqu    [edx], ymm0
3408     vmovdqu    [edx + edi], ymm2
3409     lea        edx, [edx + 32]
3410     sub        ecx, 32
3411     jg         convertloop
3412 
3413     pop        edi
3414     vzeroupper
3415     ret
3416   }
3417 }
3418 #endif  // HAS_SPLITUVROW_AVX2
3419 
3420 #ifdef HAS_MERGEUVROW_SSE2
3421 __declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
3422                                        const uint8_t* src_v,
3423                                        uint8_t* dst_uv,
3424                                        int width) {
3425   __asm {
3426     push       edi
3427     mov        eax, [esp + 4 + 4]  // src_u
3428     mov        edx, [esp + 4 + 8]  // src_v
3429     mov        edi, [esp + 4 + 12]  // dst_uv
3430     mov        ecx, [esp + 4 + 16]  // width
3431     sub        edx, eax
3432 
3433   convertloop:
3434     movdqu     xmm0, [eax]  // read 16 U's
3435     movdqu     xmm1, [eax + edx]  // and 16 V's
3436     lea        eax,  [eax + 16]
3437     movdqa     xmm2, xmm0
3438     punpcklbw  xmm0, xmm1  // first 8 UV pairs
3439     punpckhbw  xmm2, xmm1  // next 8 UV pairs
3440     movdqu     [edi], xmm0
3441     movdqu     [edi + 16], xmm2
3442     lea        edi, [edi + 32]
3443     sub        ecx, 16
3444     jg         convertloop
3445 
3446     pop        edi
3447     ret
3448   }
3449 }
3450 #endif  //  HAS_MERGEUVROW_SSE2
3451 
3452 #ifdef HAS_MERGEUVROW_AVX2
3453 __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
3454                                        const uint8_t* src_v,
3455                                        uint8_t* dst_uv,
3456                                        int width) {
3457   __asm {
3458     push       edi
3459     mov        eax, [esp + 4 + 4]  // src_u
3460     mov        edx, [esp + 4 + 8]  // src_v
3461     mov        edi, [esp + 4 + 12]  // dst_uv
3462     mov        ecx, [esp + 4 + 16]  // width
3463     sub        edx, eax
3464 
3465   convertloop:
3466     vpmovzxbw  ymm0, [eax]
3467     vpmovzxbw  ymm1, [eax + edx]
3468     lea        eax,  [eax + 16]
3469     vpsllw     ymm1, ymm1, 8
3470     vpor       ymm2, ymm1, ymm0
3471     vmovdqu    [edi], ymm2
3472     lea        edi, [edi + 32]
3473     sub        ecx, 16
3474     jg         convertloop
3475 
3476     pop        edi
3477     vzeroupper
3478     ret
3479   }
3480 }
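
// Design note: vpmovzxbw zero-extends 16 U bytes into word lanes in order,
// so shifting V left 8 and OR-ing interleaves UV without the cross-lane
// vpermq fixup an unpack-based AVX2 path would need.  Scalar model
// (illustrative only):
static void ScalarMergeUVRow(const uint8_t* src_u, const uint8_t* src_v,
                             uint8_t* dst_uv, int width) {
  for (int i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = src_u[i];
    dst_uv[2 * i + 1] = src_v[i];
  }
}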
3481 #endif  //  HAS_MERGEUVROW_AVX2
3482 
3483 #ifdef HAS_COPYROW_SSE2
3484 // CopyRow copies 'width' bytes using 16 byte loads/stores, 32 bytes at a time.
3485 __declspec(naked) void CopyRow_SSE2(const uint8_t* src,
3486                                     uint8_t* dst,
3487                                     int width) {
3488   __asm {
3489     mov        eax, [esp + 4]  // src
3490     mov        edx, [esp + 8]  // dst
3491     mov        ecx, [esp + 12]  // width
3492     test       eax, 15
3493     jne        convertloopu
3494     test       edx, 15
3495     jne        convertloopu
3496 
3497   convertloopa:
3498     movdqa     xmm0, [eax]
3499     movdqa     xmm1, [eax + 16]
3500     lea        eax, [eax + 32]
3501     movdqa     [edx], xmm0
3502     movdqa     [edx + 16], xmm1
3503     lea        edx, [edx + 32]
3504     sub        ecx, 32
3505     jg         convertloopa
3506     ret
3507 
3508   convertloopu:
3509     movdqu     xmm0, [eax]
3510     movdqu     xmm1, [eax + 16]
3511     lea        eax, [eax + 32]
3512     movdqu     [edx], xmm0
3513     movdqu     [edx + 16], xmm1
3514     lea        edx, [edx + 32]
3515     sub        ecx, 32
3516     jg         convertloopu
3517     ret
3518   }
3519 }
3520 #endif  // HAS_COPYROW_SSE2
3521 
3522 #ifdef HAS_COPYROW_AVX
3523 // CopyRow copies 'width' bytes using 32 byte loads/stores, 64 bytes at a time.
3524 __declspec(naked) void CopyRow_AVX(const uint8_t* src,
3525                                    uint8_t* dst,
3526                                    int width) {
3527   __asm {
3528     mov        eax, [esp + 4]  // src
3529     mov        edx, [esp + 8]  // dst
3530     mov        ecx, [esp + 12]  // width
3531 
3532   convertloop:
3533     vmovdqu    ymm0, [eax]
3534     vmovdqu    ymm1, [eax + 32]
3535     lea        eax, [eax + 64]
3536     vmovdqu    [edx], ymm0
3537     vmovdqu    [edx + 32], ymm1
3538     lea        edx, [edx + 64]
3539     sub        ecx, 64
3540     jg         convertloop
3541 
3542     vzeroupper
3543     ret
3544   }
3545 }
3546 #endif  // HAS_COPYROW_AVX
3547 
3548 // Any width (multiple of 1 byte); uses enhanced rep movsb (ERMS).
3549 __declspec(naked) void CopyRow_ERMS(const uint8_t* src,
3550                                     uint8_t* dst,
3551                                     int width) {
3552   __asm {
3553     mov        eax, esi  // save esi, edi in scratch regs
3554     mov        edx, edi
3555     mov        esi, [esp + 4]  // src
3556     mov        edi, [esp + 8]  // dst
3557     mov        ecx, [esp + 12]  // width
3558     rep movsb
3559     mov        edi, edx
3560     mov        esi, eax
3561     ret
3562   }
3563 }
3564 
3565 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3566 // width in pixels
3567 __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
3568                                              uint8_t* dst,
3569                                              int width) {
3570   __asm {
3571     mov        eax, [esp + 4]  // src
3572     mov        edx, [esp + 8]  // dst
3573     mov        ecx, [esp + 12]  // width
3574     pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
3575     pslld      xmm0, 24
3576     pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
3577     psrld      xmm1, 8
3578 
3579   convertloop:
3580     movdqu     xmm2, [eax]
3581     movdqu     xmm3, [eax + 16]
3582     lea        eax, [eax + 32]
3583     movdqu     xmm4, [edx]
3584     movdqu     xmm5, [edx + 16]
3585     pand       xmm2, xmm0
3586     pand       xmm3, xmm0
3587     pand       xmm4, xmm1
3588     pand       xmm5, xmm1
3589     por        xmm2, xmm4
3590     por        xmm3, xmm5
3591     movdqu     [edx], xmm2
3592     movdqu     [edx + 16], xmm3
3593     lea        edx, [edx + 32]
3594     sub        ecx, 8
3595     jg         convertloop
3596 
3597     ret
3598   }
3599 }
3600 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
3601 
3602 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3603 // width in pixels
3604 __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
3605                                              uint8_t* dst,
3606                                              int width) {
3607   __asm {
3608     mov        eax, [esp + 4]  // src
3609     mov        edx, [esp + 8]  // dst
3610     mov        ecx, [esp + 12]  // width
3611     vpcmpeqb   ymm0, ymm0, ymm0
3612     vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
3613 
3614   convertloop:
3615     vmovdqu    ymm1, [eax]
3616     vmovdqu    ymm2, [eax + 32]
3617     lea        eax, [eax + 64]
3618     vpblendvb  ymm1, ymm1, [edx], ymm0
3619     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3620     vmovdqu    [edx], ymm1
3621     vmovdqu    [edx + 32], ymm2
3622     lea        edx, [edx + 64]
3623     sub        ecx, 16
3624     jg         convertloop
3625 
3626     vzeroupper
3627     ret
3628   }
3629 }
3630 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
3631 
3632 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
3633 // width in pixels
3634 __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
3635                                                 uint8_t* dst_a,
3636                                                 int width) {
3637   __asm {
3638     mov        eax, [esp + 4]  // src_argb
3639     mov        edx, [esp + 8]  // dst_a
3640     mov        ecx, [esp + 12]  // width
3641 
3642   extractloop:
3643     movdqu     xmm0, [eax]
3644     movdqu     xmm1, [eax + 16]
3645     lea        eax, [eax + 32]
3646     psrld      xmm0, 24
3647     psrld      xmm1, 24
3648     packssdw   xmm0, xmm1
3649     packuswb   xmm0, xmm0
3650     movq       qword ptr [edx], xmm0
3651     lea        edx, [edx + 8]
3652     sub        ecx, 8
3653     jg         extractloop
3654 
3655     ret
3656   }
3657 }
3658 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
3659 
3660 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
3661 // width in pixels
3662 __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
3663                                                 uint8_t* dst_a,
3664                                                 int width) {
3665   __asm {
3666     mov        eax, [esp + 4]  // src_argb
3667     mov        edx, [esp + 8]  // dst_a
3668     mov        ecx, [esp + 12]  // width
3669     vmovdqa    ymm4, ymmword ptr kPermdARGBToY_AVX
3670 
3671   extractloop:
3672     vmovdqu    ymm0, [eax]
3673     vmovdqu    ymm1, [eax + 32]
3674     vpsrld     ymm0, ymm0, 24
3675     vpsrld     ymm1, ymm1, 24
3676     vmovdqu    ymm2, [eax + 64]
3677     vmovdqu    ymm3, [eax + 96]
3678     lea        eax, [eax + 128]
3679     vpackssdw  ymm0, ymm0, ymm1  // mutates
3680     vpsrld     ymm2, ymm2, 24
3681     vpsrld     ymm3, ymm3, 24
3682     vpackssdw  ymm2, ymm2, ymm3  // mutates
3683     vpackuswb  ymm0, ymm0, ymm2  // mutates
3684     vpermd     ymm0, ymm4, ymm0  // unmutate
3685     vmovdqu    [edx], ymm0
3686     lea        edx, [edx + 32]
3687     sub        ecx, 32
3688     jg         extractloop
3689 
3690     vzeroupper
3691     ret
3692   }
3693 }
3694 #endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
3695 
3696 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3697 // width in pixels
3698 __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
3699                                                 uint8_t* dst,
3700                                                 int width) {
3701   __asm {
3702     mov        eax, [esp + 4]  // src
3703     mov        edx, [esp + 8]  // dst
3704     mov        ecx, [esp + 12]  // width
3705     pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
3706     pslld      xmm0, 24
3707     pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
3708     psrld      xmm1, 8
3709 
3710   convertloop:
3711     movq       xmm2, qword ptr [eax]  // 8 Y's
3712     lea        eax, [eax + 8]
3713     punpcklbw  xmm2, xmm2
3714     punpckhwd  xmm3, xmm2  // stale xmm3 bytes land in lanes masked off below
3715     punpcklwd  xmm2, xmm2
3716     movdqu     xmm4, [edx]
3717     movdqu     xmm5, [edx + 16]
3718     pand       xmm2, xmm0
3719     pand       xmm3, xmm0
3720     pand       xmm4, xmm1
3721     pand       xmm5, xmm1
3722     por        xmm2, xmm4
3723     por        xmm3, xmm5
3724     movdqu     [edx], xmm2
3725     movdqu     [edx + 16], xmm3
3726     lea        edx, [edx + 32]
3727     sub        ecx, 8
3728     jg         convertloop
3729 
3730     ret
3731   }
3732 }
3733 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3734 
3735 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3736 // width in pixels
3737 __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
3738                                                 uint8_t* dst,
3739                                                 int width) {
3740   __asm {
3741     mov        eax, [esp + 4]  // src
3742     mov        edx, [esp + 8]  // dst
3743     mov        ecx, [esp + 12]  // width
3744     vpcmpeqb   ymm0, ymm0, ymm0
3745     vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
3746 
3747   convertloop:
3748     vpmovzxbd  ymm1, qword ptr [eax]
3749     vpmovzxbd  ymm2, qword ptr [eax + 8]
3750     lea        eax, [eax + 16]
3751     vpslld     ymm1, ymm1, 24
3752     vpslld     ymm2, ymm2, 24
3753     vpblendvb  ymm1, ymm1, [edx], ymm0
3754     vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3755     vmovdqu    [edx], ymm1
3756     vmovdqu    [edx + 32], ymm2
3757     lea        edx, [edx + 64]
3758     sub        ecx, 16
3759     jg         convertloop
3760 
3761     vzeroupper
3762     ret
3763   }
3764 }
3765 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3766 
3767 #ifdef HAS_SETROW_X86
3768 // Write 'width' bytes using an 8 bit value repeated.
3769 // width should be multiple of 4.
3770 __declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
3771   __asm {
3772     movzx      eax, byte ptr [esp + 8]  // v8
3773     mov        edx, 0x01010101  // Duplicate byte to all bytes.
3774     mul        edx  // overwrites edx with upper part of result.
3775     mov        edx, edi
3776     mov        edi, [esp + 4]  // dst
3777     mov        ecx, [esp + 12]  // width
3778     shr        ecx, 2
3779     rep stosd
3780     mov        edi, edx
3781     ret
3782   }
3783 }
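
// The mul above splats the byte: v8 * 0x01010101 replicates it into all
// four byte lanes (e.g. 0x5a becomes 0x5a5a5a5a).  Scalar model
// (illustrative only), assuming width is a multiple of 4 as documented:
static void ScalarSetRow(uint8_t* dst, uint8_t v8, int width) {
  uint32_t v32 = (uint32_t)v8 * 0x01010101u;  // duplicate byte to dword
  for (int i = 0; i < width; i += 4) {
    *(uint32_t*)(dst + i) = v32;  // like rep stosd
  }
}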
3784 
3785 // Write 'width' bytes using an 8 bit value repeated.
3786 __declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
3787   __asm {
3788     mov        edx, edi
3789     mov        edi, [esp + 4]  // dst
3790     mov        eax, [esp + 8]  // v8
3791     mov        ecx, [esp + 12]  // width
3792     rep stosb
3793     mov        edi, edx
3794     ret
3795   }
3796 }
3797 
3798 // Write 'width' 32 bit values.
3799 __declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
3800                                       uint32_t v32,
3801                                       int width) {
3802   __asm {
3803     mov        edx, edi
3804     mov        edi, [esp + 4]  // dst
3805     mov        eax, [esp + 8]  // v32
3806     mov        ecx, [esp + 12]  // width
3807     rep stosd
3808     mov        edi, edx
3809     ret
3810   }
3811 }
3812 #endif  // HAS_SETROW_X86
3813 
3814 #ifdef HAS_YUY2TOYROW_AVX2
3815 __declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
3816                                        uint8_t* dst_y,
3817                                        int width) {
3818   __asm {
3819     mov        eax, [esp + 4]  // src_yuy2
3820     mov        edx, [esp + 8]  // dst_y
3821     mov        ecx, [esp + 12]  // width
3822     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3823     vpsrlw     ymm5, ymm5, 8
3824 
3825   convertloop:
3826     vmovdqu    ymm0, [eax]
3827     vmovdqu    ymm1, [eax + 32]
3828     lea        eax,  [eax + 64]
3829     vpand      ymm0, ymm0, ymm5  // even bytes are Y
3830     vpand      ymm1, ymm1, ymm5
3831     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3832     vpermq     ymm0, ymm0, 0xd8
3833     vmovdqu    [edx], ymm0
3834     lea        edx, [edx + 32]
3835     sub        ecx, 32
3836     jg         convertloop
3837     vzeroupper
3838     ret
3839   }
3840 }
3841 
3842 __declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
3843                                         int stride_yuy2,
3844                                         uint8_t* dst_u,
3845                                         uint8_t* dst_v,
3846                                         int width) {
3847   __asm {
3848     push       esi
3849     push       edi
3850     mov        eax, [esp + 8 + 4]  // src_yuy2
3851     mov        esi, [esp + 8 + 8]  // stride_yuy2
3852     mov        edx, [esp + 8 + 12]  // dst_u
3853     mov        edi, [esp + 8 + 16]  // dst_v
3854     mov        ecx, [esp + 8 + 20]  // width
3855     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3856     vpsrlw     ymm5, ymm5, 8
3857     sub        edi, edx
3858 
3859   convertloop:
3860     vmovdqu    ymm0, [eax]
3861     vmovdqu    ymm1, [eax + 32]
3862     vpavgb     ymm0, ymm0, [eax + esi]
3863     vpavgb     ymm1, ymm1, [eax + esi + 32]
3864     lea        eax,  [eax + 64]
3865     vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
3866     vpsrlw     ymm1, ymm1, 8
3867     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3868     vpermq     ymm0, ymm0, 0xd8
3869     vpand      ymm1, ymm0, ymm5  // U
3870     vpsrlw     ymm0, ymm0, 8  // V
3871     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3872     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3873     vpermq     ymm1, ymm1, 0xd8
3874     vpermq     ymm0, ymm0, 0xd8
3875     vextractf128 [edx], ymm1, 0  // U
3876     vextractf128 [edx + edi], ymm0, 0  // V
3877     lea        edx, [edx + 16]
3878     sub        ecx, 32
3879     jg         convertloop
3880 
3881     pop        edi
3882     pop        esi
3883     vzeroupper
3884     ret
3885   }
3886 }
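
// Scalar model of YUY2ToUVRow (illustrative only): YUY2 packs Y0 U Y1 V for
// each pixel pair, and chroma is averaged with the next row for 4:2:0.
static void ScalarYUY2ToUVRow(const uint8_t* src_yuy2, int stride_yuy2,
                              uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int i = 0; i < width; i += 2) {
    const uint8_t* p = src_yuy2 + i * 2;
    dst_u[i / 2] = (uint8_t)((p[1] + p[stride_yuy2 + 1] + 1) >> 1);  // pavgb
    dst_v[i / 2] = (uint8_t)((p[3] + p[stride_yuy2 + 3] + 1) >> 1);
  }
}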
3887 
3888 __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
3889                                            uint8_t* dst_u,
3890                                            uint8_t* dst_v,
3891                                            int width) {
3892   __asm {
3893     push       edi
3894     mov        eax, [esp + 4 + 4]  // src_yuy2
3895     mov        edx, [esp + 4 + 8]  // dst_u
3896     mov        edi, [esp + 4 + 12]  // dst_v
3897     mov        ecx, [esp + 4 + 16]  // width
3898     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3899     vpsrlw     ymm5, ymm5, 8
3900     sub        edi, edx
3901 
3902   convertloop:
3903     vmovdqu    ymm0, [eax]
3904     vmovdqu    ymm1, [eax + 32]
3905     lea        eax,  [eax + 64]
3906     vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
3907     vpsrlw     ymm1, ymm1, 8
3908     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3909     vpermq     ymm0, ymm0, 0xd8
3910     vpand      ymm1, ymm0, ymm5  // U
3911     vpsrlw     ymm0, ymm0, 8  // V
3912     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3913     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3914     vpermq     ymm1, ymm1, 0xd8
3915     vpermq     ymm0, ymm0, 0xd8
3916     vextractf128 [edx], ymm1, 0  // U
3917     vextractf128 [edx + edi], ymm0, 0  // V
3918     lea        edx, [edx + 16]
3919     sub        ecx, 32
3920     jg         convertloop
3921 
3922     pop        edi
3923     vzeroupper
3924     ret
3925   }
3926 }
3927 
3928 __declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
3929                                        uint8_t* dst_y,
3930                                        int width) {
3931   __asm {
3932     mov        eax, [esp + 4]  // src_uyvy
3933     mov        edx, [esp + 8]  // dst_y
3934     mov        ecx, [esp + 12]  // width
3935 
3936   convertloop:
3937     vmovdqu    ymm0, [eax]
3938     vmovdqu    ymm1, [eax + 32]
3939     lea        eax,  [eax + 64]
3940     vpsrlw     ymm0, ymm0, 8  // odd bytes are Y
3941     vpsrlw     ymm1, ymm1, 8
3942     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3943     vpermq     ymm0, ymm0, 0xd8
3944     vmovdqu    [edx], ymm0
3945     lea        edx, [edx + 32]
3946     sub        ecx, 32
3947     jg         convertloop
3948     vzeroupper
3949     ret
3950   }
3951 }
3952 
3953 __declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
3954                                         int stride_uyvy,
3955                                         uint8_t* dst_u,
3956                                         uint8_t* dst_v,
3957                                         int width) {
3958   __asm {
3959     push       esi
3960     push       edi
3961     mov        eax, [esp + 8 + 4]  // src_uyvy
3962     mov        esi, [esp + 8 + 8]  // stride_uyvy
3963     mov        edx, [esp + 8 + 12]  // dst_u
3964     mov        edi, [esp + 8 + 16]  // dst_v
3965     mov        ecx, [esp + 8 + 20]  // width
3966     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3967     vpsrlw     ymm5, ymm5, 8
3968     sub        edi, edx
3969 
3970   convertloop:
3971     vmovdqu    ymm0, [eax]
3972     vmovdqu    ymm1, [eax + 32]
3973     vpavgb     ymm0, ymm0, [eax + esi]
3974     vpavgb     ymm1, ymm1, [eax + esi + 32]
3975     lea        eax,  [eax + 64]
3976     vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
3977     vpand      ymm1, ymm1, ymm5
3978     vpackuswb  ymm0, ymm0, ymm1  // mutates.
3979     vpermq     ymm0, ymm0, 0xd8
3980     vpand      ymm1, ymm0, ymm5  // U
3981     vpsrlw     ymm0, ymm0, 8  // V
3982     vpackuswb  ymm1, ymm1, ymm1  // mutates.
3983     vpackuswb  ymm0, ymm0, ymm0  // mutates.
3984     vpermq     ymm1, ymm1, 0xd8
3985     vpermq     ymm0, ymm0, 0xd8
3986     vextractf128 [edx], ymm1, 0  // U
3987     vextractf128 [edx + edi], ymm0, 0  // V
3988     lea        edx, [edx + 16]
3989     sub        ecx, 32
3990     jg         convertloop
3991 
3992     pop        edi
3993     pop        esi
3994     vzeroupper
3995     ret
3996   }
3997 }
3998 
3999 __declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
4000                                            uint8_t* dst_u,
4001                                            uint8_t* dst_v,
4002                                            int width) {
4003   __asm {
4004     push       edi
4005     mov        eax, [esp + 4 + 4]  // src_uyvy
4006     mov        edx, [esp + 4 + 8]  // dst_u
4007     mov        edi, [esp + 4 + 12]  // dst_v
4008     mov        ecx, [esp + 4 + 16]  // width
4009     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
4010     vpsrlw     ymm5, ymm5, 8
4011     sub        edi, edx
4012 
4013   convertloop:
4014     vmovdqu    ymm0, [eax]
4015     vmovdqu    ymm1, [eax + 32]
4016     lea        eax,  [eax + 64]
4017     vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
4018     vpand      ymm1, ymm1, ymm5
4019     vpackuswb  ymm0, ymm0, ymm1  // mutates.
4020     vpermq     ymm0, ymm0, 0xd8
4021     vpand      ymm1, ymm0, ymm5  // U
4022     vpsrlw     ymm0, ymm0, 8  // V
4023     vpackuswb  ymm1, ymm1, ymm1  // mutates.
4024     vpackuswb  ymm0, ymm0, ymm0  // mutates.
4025     vpermq     ymm1, ymm1, 0xd8
4026     vpermq     ymm0, ymm0, 0xd8
4027     vextractf128 [edx], ymm1, 0  // U
4028     vextractf128 [edx + edi], ymm0, 0  // V
4029     lea        edx, [edx + 16]
4030     sub        ecx, 32
4031     jg         convertloop
4032 
4033     pop        edi
4034     vzeroupper
4035     ret
4036   }
4037 }
4038 #endif  // HAS_YUY2TOYROW_AVX2
4039 
4040 #ifdef HAS_YUY2TOYROW_SSE2
4041 __declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
4042                                        uint8_t* dst_y,
4043                                        int width) {
4044   __asm {
4045     mov        eax, [esp + 4]  // src_yuy2
4046     mov        edx, [esp + 8]  // dst_y
4047     mov        ecx, [esp + 12]  // width
4048     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
4049     psrlw      xmm5, 8
4050 
4051   convertloop:
4052     movdqu     xmm0, [eax]
4053     movdqu     xmm1, [eax + 16]
4054     lea        eax,  [eax + 32]
4055     pand       xmm0, xmm5  // even bytes are Y
4056     pand       xmm1, xmm5
4057     packuswb   xmm0, xmm1
4058     movdqu     [edx], xmm0
4059     lea        edx, [edx + 16]
4060     sub        ecx, 16
4061     jg         convertloop
4062     ret
4063   }
4064 }
4065 
4066 __declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
4067                                         int stride_yuy2,
4068                                         uint8_t* dst_u,
4069                                         uint8_t* dst_v,
4070                                         int width) {
4071   __asm {
4072     push       esi
4073     push       edi
4074     mov        eax, [esp + 8 + 4]  // src_yuy2
4075     mov        esi, [esp + 8 + 8]  // stride_yuy2
4076     mov        edx, [esp + 8 + 12]  // dst_u
4077     mov        edi, [esp + 8 + 16]  // dst_v
4078     mov        ecx, [esp + 8 + 20]  // width
4079     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
4080     psrlw      xmm5, 8
4081     sub        edi, edx
4082 
4083   convertloop:
4084     movdqu     xmm0, [eax]
4085     movdqu     xmm1, [eax + 16]
4086     movdqu     xmm2, [eax + esi]
4087     movdqu     xmm3, [eax + esi + 16]
4088     lea        eax,  [eax + 32]
4089     pavgb      xmm0, xmm2
4090     pavgb      xmm1, xmm3
4091     psrlw      xmm0, 8  // YUYV -> UVUV
4092     psrlw      xmm1, 8
4093     packuswb   xmm0, xmm1
4094     movdqa     xmm1, xmm0
4095     pand       xmm0, xmm5  // U
4096     packuswb   xmm0, xmm0
4097     psrlw      xmm1, 8  // V
4098     packuswb   xmm1, xmm1
4099     movq       qword ptr [edx], xmm0
4100     movq       qword ptr [edx + edi], xmm1
4101     lea        edx, [edx + 8]
4102     sub        ecx, 16
4103     jg         convertloop
4104 
4105     pop        edi
4106     pop        esi
4107     ret
4108   }
4109 }
4110 
4111 __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
4112                                            uint8_t* dst_u,
4113                                            uint8_t* dst_v,
4114                                            int width) {
4115   __asm {
4116     push       edi
4117     mov        eax, [esp + 4 + 4]  // src_yuy2
4118     mov        edx, [esp + 4 + 8]  // dst_u
4119     mov        edi, [esp + 4 + 12]  // dst_v
4120     mov        ecx, [esp + 4 + 16]  // width
4121     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
4122     psrlw      xmm5, 8
4123     sub        edi, edx
4124 
4125   convertloop:
4126     movdqu     xmm0, [eax]
4127     movdqu     xmm1, [eax + 16]
4128     lea        eax,  [eax + 32]
4129     psrlw      xmm0, 8  // YUYV -> UVUV
4130     psrlw      xmm1, 8
4131     packuswb   xmm0, xmm1
4132     movdqa     xmm1, xmm0
4133     pand       xmm0, xmm5  // U
4134     packuswb   xmm0, xmm0
4135     psrlw      xmm1, 8  // V
4136     packuswb   xmm1, xmm1
4137     movq       qword ptr [edx], xmm0
4138     movq       qword ptr [edx + edi], xmm1
4139     lea        edx, [edx + 8]
4140     sub        ecx, 16
4141     jg         convertloop
4142 
4143     pop        edi
4144     ret
4145   }
4146 }
4147 
4148 __declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
4149                                        uint8_t* dst_y,
4150                                        int width) {
4151   __asm {
4152     mov        eax, [esp + 4]  // src_uyvy
4153     mov        edx, [esp + 8]  // dst_y
4154     mov        ecx, [esp + 12]  // width
4155 
4156   convertloop:
4157     movdqu     xmm0, [eax]
4158     movdqu     xmm1, [eax + 16]
4159     lea        eax,  [eax + 32]
4160     psrlw      xmm0, 8  // odd bytes are Y
4161     psrlw      xmm1, 8
4162     packuswb   xmm0, xmm1
4163     movdqu     [edx], xmm0
4164     lea        edx, [edx + 16]
4165     sub        ecx, 16
4166     jg         convertloop
4167     ret
4168   }
4169 }
4170 
4171 __declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
4172                                         int stride_uyvy,
4173                                         uint8_t* dst_u,
4174                                         uint8_t* dst_v,
4175                                         int width) {
4176   __asm {
4177     push       esi
4178     push       edi
4179     mov        eax, [esp + 8 + 4]  // src_uyvy
4180     mov        esi, [esp + 8 + 8]  // stride_uyvy
4181     mov        edx, [esp + 8 + 12]  // dst_u
4182     mov        edi, [esp + 8 + 16]  // dst_v
4183     mov        ecx, [esp + 8 + 20]  // width
4184     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
4185     psrlw      xmm5, 8
4186     sub        edi, edx
4187 
4188   convertloop:
4189     movdqu     xmm0, [eax]
4190     movdqu     xmm1, [eax + 16]
4191     movdqu     xmm2, [eax + esi]
4192     movdqu     xmm3, [eax + esi + 16]
4193     lea        eax,  [eax + 32]
4194     pavgb      xmm0, xmm2
4195     pavgb      xmm1, xmm3
4196     pand       xmm0, xmm5  // UYVY -> UVUV
4197     pand       xmm1, xmm5
4198     packuswb   xmm0, xmm1
4199     movdqa     xmm1, xmm0
4200     pand       xmm0, xmm5  // U
4201     packuswb   xmm0, xmm0
4202     psrlw      xmm1, 8  // V
4203     packuswb   xmm1, xmm1
4204     movq       qword ptr [edx], xmm0
4205     movq       qword ptr [edx + edi], xmm1
4206     lea        edx, [edx + 8]
4207     sub        ecx, 16
4208     jg         convertloop
4209 
4210     pop        edi
4211     pop        esi
4212     ret
4213   }
4214 }
4215 
4216 __declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
4217                                            uint8_t* dst_u,
4218                                            uint8_t* dst_v,
4219                                            int width) {
4220   __asm {
4221     push       edi
4222     mov        eax, [esp + 4 + 4]  // src_uyvy
4223     mov        edx, [esp + 4 + 8]  // dst_u
4224     mov        edi, [esp + 4 + 12]  // dst_v
4225     mov        ecx, [esp + 4 + 16]  // width
4226     pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
4227     psrlw      xmm5, 8
4228     sub        edi, edx
4229 
4230   convertloop:
4231     movdqu     xmm0, [eax]
4232     movdqu     xmm1, [eax + 16]
4233     lea        eax,  [eax + 32]
4234     pand       xmm0, xmm5  // UYVY -> UVUV
4235     pand       xmm1, xmm5
4236     packuswb   xmm0, xmm1
4237     movdqa     xmm1, xmm0
4238     pand       xmm0, xmm5  // U
4239     packuswb   xmm0, xmm0
4240     psrlw      xmm1, 8  // V
4241     packuswb   xmm1, xmm1
4242     movq       qword ptr [edx], xmm0
4243     movq       qword ptr [edx + edi], xmm1
4244     lea        edx, [edx + 8]
4245     sub        ecx, 16
4246     jg         convertloop
4247 
4248     pop        edi
4249     ret
4250   }
4251 }
4252 #endif  // HAS_YUY2TOYROW_SSE2
4253 
4254 #ifdef HAS_BLENDPLANEROW_SSSE3
4255 // Blend 8 pixels at a time.
4256 // unsigned version of math
4257 // =((A2*C2)+(B2*(255-C2))+255)/256
4258 // signed version of math
4259 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4260 __declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
4261                                            const uint8_t* src1,
4262                                            const uint8_t* alpha,
4263                                            uint8_t* dst,
4264                                            int width) {
4265   __asm {
4266     push       esi
4267     push       edi
4268     pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
4269     psllw      xmm5, 8
4270     mov        eax, 0x80808080  // 128 for biasing image to signed.
4271     movd       xmm6, eax
4272     pshufd     xmm6, xmm6, 0x00
4273 
4274     mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
4275     movd       xmm7, eax
4276     pshufd     xmm7, xmm7, 0x00
4277     mov        eax, [esp + 8 + 4]  // src0
4278     mov        edx, [esp + 8 + 8]  // src1
4279     mov        esi, [esp + 8 + 12]  // alpha
4280     mov        edi, [esp + 8 + 16]  // dst
4281     mov        ecx, [esp + 8 + 20]  // width
4282     sub        eax, esi
4283     sub        edx, esi
4284     sub        edi, esi
4285 
4286         // 8 pixel loop.
4287   convertloop8:
4288     movq       xmm0, qword ptr [esi]  // alpha
4289     punpcklbw  xmm0, xmm0
4290     pxor       xmm0, xmm5  // a, 255-a
4291     movq       xmm1, qword ptr [eax + esi]  // src0
4292     movq       xmm2, qword ptr [edx + esi]  // src1
4293     punpcklbw  xmm1, xmm2
4294     psubb      xmm1, xmm6  // bias src0/1 - 128
4295     pmaddubsw  xmm0, xmm1
4296     paddw      xmm0, xmm7  // unbias result - 32768 and round.
4297     psrlw      xmm0, 8
4298     packuswb   xmm0, xmm0
4299     movq       qword ptr [edi + esi], xmm0
4300     lea        esi, [esi + 8]
4301     sub        ecx, 8
4302     jg         convertloop8
4303 
4304     pop        edi
4305     pop        esi
4306     ret
4307   }
4308 }
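
// Both formulations above reduce to the same scalar expression; the signed
// variant only exists so pmaddubsw (unsigned * signed operands) can be
// used, with 0x807f undoing the 128 bias and adding the rounding term.
// Illustrative model:
static inline uint8_t ScalarBlendPlane(uint8_t s0, uint8_t s1, uint8_t a) {
  return (uint8_t)((s0 * a + s1 * (255 - a) + 255) >> 8);
}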
4309 #endif  // HAS_BLENDPLANEROW_SSSE3
4310 
4311 #ifdef HAS_BLENDPLANEROW_AVX2
4312 // Blend 32 pixels at a time.
4313 // unsigned version of math
4314 // =((A2*C2)+(B2*(255-C2))+255)/256
4315 // signed version of math
4316 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4317 __declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
4318                                           const uint8_t* src1,
4319                                           const uint8_t* alpha,
4320                                           uint8_t* dst,
4321                                           int width) {
4322   __asm {
4323     push        esi
4324     push        edi
4325     vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff00ff00
4326     vpsllw      ymm5, ymm5, 8
4327     mov         eax, 0x80808080  // 128 for biasing image to signed.
4328     vmovd       xmm6, eax
4329     vbroadcastss ymm6, xmm6
4330     mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
4331     vmovd       xmm7, eax
4332     vbroadcastss ymm7, xmm7
4333     mov         eax, [esp + 8 + 4]  // src0
4334     mov         edx, [esp + 8 + 8]  // src1
4335     mov         esi, [esp + 8 + 12]  // alpha
4336     mov         edi, [esp + 8 + 16]  // dst
4337     mov         ecx, [esp + 8 + 20]  // width
4338     sub         eax, esi
4339     sub         edx, esi
4340     sub         edi, esi
4341 
4342         // 32 pixel loop.
4343   convertloop32:
4344     vmovdqu     ymm0, [esi]  // alpha
4345     vpunpckhbw  ymm3, ymm0, ymm0  // 8..15, 24..31
4346     vpunpcklbw  ymm0, ymm0, ymm0  // 0..7, 16..23
4347     vpxor       ymm3, ymm3, ymm5  // a, 255-a
4348     vpxor       ymm0, ymm0, ymm5  // a, 255-a
4349     vmovdqu     ymm1, [eax + esi]  // src0
4350     vmovdqu     ymm2, [edx + esi]  // src1
4351     vpunpckhbw  ymm4, ymm1, ymm2
4352     vpunpcklbw  ymm1, ymm1, ymm2
4353     vpsubb      ymm4, ymm4, ymm6  // bias src0/1 - 128
4354     vpsubb      ymm1, ymm1, ymm6  // bias src0/1 - 128
4355     vpmaddubsw  ymm3, ymm3, ymm4
4356     vpmaddubsw  ymm0, ymm0, ymm1
4357     vpaddw      ymm3, ymm3, ymm7  // unbias result - 32768 and round.
4358     vpaddw      ymm0, ymm0, ymm7  // unbias result - 32768 and round.
4359     vpsrlw      ymm3, ymm3, 8
4360     vpsrlw      ymm0, ymm0, 8
4361     vpackuswb   ymm0, ymm0, ymm3
4362     vmovdqu     [edi + esi], ymm0
4363     lea         esi, [esi + 32]
4364     sub         ecx, 32
4365     jg          convertloop32
4366 
4367     pop         edi
4368     pop         esi
4369     vzeroupper
4370     ret
4371   }
4372 }
4373 #endif  // HAS_BLENDPLANEROW_AVX2
4374 
4375 #ifdef HAS_ARGBBLENDROW_SSSE3
4376 // Shuffle table for isolating alpha.
4377 static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
4378                                     11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
4379 
4380 // Blend 8 pixels at a time.
__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
                                          const uint8_t* src_argb1,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm7, xmm7  // generate constant 0x0001
    psrlw      xmm7, 15
    pcmpeqb    xmm6, xmm6  // generate mask 0x00ff00ff
    psrlw      xmm6, 8
    pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
    psllw      xmm5, 8
    pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
    pslld      xmm4, 24
    sub        ecx, 4
    jl         convertloop4b  // less than 4 pixels?

        // 4 pixel loop.
  convertloop4:
    movdqu     xmm3, [eax]  // src argb
    lea        eax, [eax + 16]
    movdqa     xmm0, xmm3  // src argb
    pxor       xmm3, xmm4  // ~alpha
    movdqu     xmm2, [esi]  // _r_b
    pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
    pand       xmm2, xmm6  // _r_b
    paddw      xmm3, xmm7  // 256 - alpha
    pmullw     xmm2, xmm3  // _r_b * alpha
    movdqu     xmm1, [esi]  // _a_g
    lea        esi, [esi + 16]
    psrlw      xmm1, 8  // _a_g
    por        xmm0, xmm4  // set alpha to 255
    pmullw     xmm1, xmm3  // _a_g * alpha
    psrlw      xmm2, 8  // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2  // + src argb
    pand       xmm1, xmm5  // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1  // + src argb
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jge        convertloop4

  convertloop4b:
    add        ecx, 4 - 1
    jl         convertloop1b

            // 1 pixel loop.
  convertloop1:
    movd       xmm3, [eax]  // src argb
    lea        eax, [eax + 4]
    movdqa     xmm0, xmm3  // src argb
    pxor       xmm3, xmm4  // ~alpha
    movd       xmm2, [esi]  // _r_b
    pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
    pand       xmm2, xmm6  // _r_b
    paddw      xmm3, xmm7  // 256 - alpha
    pmullw     xmm2, xmm3  // _r_b * alpha
    movd       xmm1, [esi]  // _a_g
    lea        esi, [esi + 4]
    psrlw      xmm1, 8  // _a_g
    por        xmm0, xmm4  // set alpha to 255
    pmullw     xmm1, xmm3  // _a_g * alpha
    psrlw      xmm2, 8  // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2  // + src argb
    pand       xmm1, xmm5  // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1  // + src argb
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    sub        ecx, 1
    jge        convertloop1

  convertloop1b:
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBBLENDROW_SSSE3
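
// ARGBBlendRow_SSSE3 above implements, per color channel,
//   dst = min(255, src + ((dst * (256 - src_alpha)) >> 8))
// with the result alpha forced to 255. A scalar sketch (hypothetical _Sketch
// name; illustrative only, not the shipped C reference):
#if 0
static void ARGBBlendRow_Sketch(const uint8_t* src_argb,
                                const uint8_t* src_argb1,
                                uint8_t* dst_argb,
                                int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    uint32_t a = src_argb[3];  // foreground alpha
    for (j = 0; j < 3; ++j) {  // B, G, R
      uint32_t v = src_argb[j] + ((src_argb1[j] * (256 - a)) >> 8);
      dst_argb[j] = (uint8_t)(v > 255 ? 255 : v);
    }
    dst_argb[3] = 255;  // alpha is forced opaque, as in the asm.
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
#endif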

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {
    3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static const uvec8 kShuffleAlpha1 = {
    11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
    15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm3, xmm3  // generate mask 0xff000000
    pslld      xmm3, 24
    movdqa     xmm4, xmmword ptr kShuffleAlpha0
    movdqa     xmm5, xmmword ptr kShuffleAlpha1

 convertloop:
    movdqu     xmm0, [eax]  // read 4 pixels
    pshufb     xmm0, xmm4  // isolate first 2 alphas
    movdqu     xmm1, [eax]  // read 4 pixels
    punpcklbw  xmm1, xmm1  // first 2 pixel rgbs
    pmulhuw    xmm0, xmm1  // rgb * a
    movdqu     xmm1, [eax]  // read 4 pixels
    pshufb     xmm1, xmm5  // isolate next 2 alphas
    movdqu     xmm2, [eax]  // read 4 pixels
    punpckhbw  xmm2, xmm2  // next 2 pixel rgbs
    pmulhuw    xmm1, xmm2  // rgb * a
    movdqu     xmm2, [eax]  // mask original alpha
    lea        eax, [eax + 16]
    pand       xmm2, xmm3
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    por        xmm0, xmm2  // copy original alpha
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    ret
  }
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3
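
// The attenuate versions (SSSE3 above, AVX2 below) widen each byte to
// v * 257 and multiply by the duplicated alpha with pmulhuw, which is
// approximately (v * a) / 255 per channel. A rough scalar model
// (hypothetical _Sketch name; illustrative only, rounding differs slightly):
#if 0
static void ARGBAttenuateRow_Sketch(const uint8_t* src_argb,
                                    uint8_t* dst_argb,
                                    int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32_t a = src_argb[3];
    dst_argb[0] = (uint8_t)(src_argb[0] * a / 255);  // B
    dst_argb[1] = (uint8_t)(src_argb[1] * a / 255);  // G
    dst_argb[2] = (uint8_t)(src_argb[2] * a / 255);  // R
    dst_argb[3] = (uint8_t)a;  // alpha is copied unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif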

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
                                         128u, 128u, 14u,  15u, 14u, 15u,
                                         14u,  15u,  128u, 128u};
__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
                                             uint8_t* dst_argb,
                                             int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax
    vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
    vpslld     ymm5, ymm5, 24

 convertloop:
    vmovdqu    ymm6, [eax]  // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
    vpand      ymm6, ymm6, ymm5  // isolate alpha
    vpsrlw     ymm0, ymm0, 8
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
    vpor       ymm0, ymm0, ymm6  // copy original alpha
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
                                               uint8_t* dst_argb,
                                               int width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]  // src_argb
    mov        edx, [esp + 12 + 8]  // dst_argb
    mov        ecx, [esp + 12 + 12]  // width
    lea        ebx, fixed_invtbl8

 convertloop:
    movdqu     xmm0, [eax]  // read 4 pixels
    movzx      esi, byte ptr [eax + 3]  // first alpha
    movzx      edi, byte ptr [eax + 7]  // second alpha
    punpcklbw  xmm0, xmm0  // first 2
    movd       xmm2, dword ptr [ebx + esi * 4]
    movd       xmm3, dword ptr [ebx + edi * 4]
    pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words.  1, a, a, a
    pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm0, xmm2  // rgb * a

    movdqu     xmm1, [eax]  // read 4 pixels
    movzx      esi, byte ptr [eax + 11]  // third alpha
    movzx      edi, byte ptr [eax + 15]  // fourth alpha
    punpckhbw  xmm1, xmm1  // next 2
    movd       xmm2, dword ptr [ebx + esi * 4]
    movd       xmm3, dword ptr [ebx + edi * 4]
    pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words
    pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm1, xmm2  // rgb * a
    lea        eax, [eax + 16]
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2
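
// Unattenuate multiplies each color channel by a fixed-point reciprocal of
// alpha from fixed_invtbl8, avoiding a per-pixel divide. A rough scalar
// model (hypothetical _Sketch name; illustrative only, table rounding
// differs):
#if 0
static void ARGBUnattenuateRow_Sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    uint32_t a = src_argb[3];
    for (j = 0; j < 3; ++j) {  // B, G, R
      uint32_t v = a ? (src_argb[j] * 255 / a) : src_argb[j];
      dst_argb[j] = (uint8_t)(v > 255 ? 255 : v);
    }
    dst_argb[3] = (uint8_t)a;
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif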

#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, due to being a slow instruction.
#ifdef USE_GATHER
__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
                                               uint8_t* dst_argb,
                                               int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax
    vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2

 convertloop:
    vmovdqu    ymm6, [eax]  // read 8 pixels.
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
    vpsrld     ymm2, ymm6, 24  // alpha in low 8 bits.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop

    vzeroupper
    ret
  }
}
#else   // USE_GATHER
__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
                                               uint8_t* dst_argb,
                                               int width) {
  __asm {

    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]  // src_argb
    mov        edx, [esp + 12 + 8]  // dst_argb
    mov        ecx, [esp + 12 + 12]  // width
    sub        edx, eax
    lea        ebx, fixed_invtbl8
    vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2

 convertloop:
        // replace VPGATHER
    movzx      esi, byte ptr [eax + 3]  // alpha0
    movzx      edi, byte ptr [eax + 7]  // alpha1
    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
    movzx      esi, byte ptr [eax + 11]  // alpha2
    movzx      edi, byte ptr [eax + 15]  // alpha3
    vpunpckldq xmm6, xmm0, xmm1  // [1,a1,1,a0]
    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
    movzx      esi, byte ptr [eax + 19]  // alpha4
    movzx      edi, byte ptr [eax + 23]  // alpha5
    vpunpckldq xmm7, xmm2, xmm3  // [1,a3,1,a2]
    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
    movzx      esi, byte ptr [eax + 27]  // alpha6
    movzx      edi, byte ptr [eax + 31]  // alpha7
    vpunpckldq xmm0, xmm0, xmm1  // [1,a5,1,a4]
    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
    vpunpckldq xmm2, xmm2, xmm3  // [1,a7,1,a6]
    vpunpcklqdq xmm3, xmm6, xmm7  // [1,a3,1,a2,1,a1,1,a0]
    vpunpcklqdq xmm0, xmm0, xmm2  // [1,a7,1,a6,1,a5,1,a4]
    vinserti128 ymm3, ymm3, xmm0, 1  // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
    // end of VPGATHER

    vmovdqu    ymm6, [eax]  // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    pop        ebx
    vzeroupper
    ret
  }
}
#endif  // USE_GATHER
#endif  // HAS_ARGBUNATTENUATEROW_AVX2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_argb */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kARGBToYJ
    movdqa     xmm5, xmmword ptr kAddYJ64

 convertloop:
    movdqu     xmm0, [eax]  // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm0, xmm1
    paddw      xmm0, xmm5  // Add .5 for rounding.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0  // 8 G bytes
    movdqu     xmm2, [eax]  // A
    movdqu     xmm3, [eax + 16]
    lea        eax, [eax + 32]
    psrld      xmm2, 24
    psrld      xmm3, 24
    packuswb   xmm2, xmm3
    packuswb   xmm2, xmm2  // 8 A bytes
    movdqa     xmm3, xmm0  // Weave into GG, GA, then GGGA
    punpcklbw  xmm0, xmm0  // 8 GG words
    punpcklbw  xmm3, xmm2  // 8 GA words
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm3  // GGGA first 4
    punpckhwd  xmm1, xmm3  // GGGA next 4
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBGRAYROW_SSSE3
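
// The gray conversion above is a weighted sum via kARGBToYJ; the sketch
// below assumes the usual 7-bit BT.601 full-range weights {15, 75, 38} and
// kAddYJ64 = 64 for rounding. Illustrative only (hypothetical _Sketch name):
#if 0
static void ARGBGrayRow_Sketch(const uint8_t* src_argb,
                               uint8_t* dst_argb,
                               int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint8_t gray = (uint8_t)((src_argb[0] * 15 + src_argb[1] * 75 +
                              src_argb[2] * 38 + 64) >> 7);
    dst_argb[0] = gray;  // B
    dst_argb[1] = gray;  // G
    dst_argb[2] = gray;  // R
    dst_argb[3] = src_argb[3];  // A preserved
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif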

#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constants for ARGB color to sepia tone.
static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                                   17, 68, 35, 0, 17, 68, 35, 0};

static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                                   22, 88, 45, 0, 22, 88, 45, 0};

static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                                   24, 98, 50, 0, 24, 98, 50, 0};
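
// Scalar restatement of the formulas above (illustrative only; the asm
// saturates via packuswb, clamped here explicitly):
#if 0
static void ARGBSepiaRow_Sketch(uint8_t* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int b = dst_argb[0];
    int g = dst_argb[1];
    int r = dst_argb[2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8_t)(sr > 255 ? 255 : sr);  // alpha untouched.
    dst_argb += 4;
  }
}
#endif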

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4] /* dst_argb */
    mov        ecx, [esp + 8] /* width */
    movdqa     xmm2, xmmword ptr kARGBToSepiaB
    movdqa     xmm3, xmmword ptr kARGBToSepiaG
    movdqa     xmm4, xmmword ptr kARGBToSepiaR

 convertloop:
    movdqu     xmm0, [eax]  // B
    movdqu     xmm6, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm6, xmm2
    phaddw     xmm0, xmm6
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0  // 8 B values
    movdqu     xmm5, [eax]  // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm3
    pmaddubsw  xmm1, xmm3
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5  // 8 G values
    punpcklbw  xmm0, xmm5  // 8 BG values
    movdqu     xmm5, [eax]  // R
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5  // 8 R values
    movdqu     xmm6, [eax]  // A
    movdqu     xmm1, [eax + 16]
    psrld      xmm6, 24
    psrld      xmm1, 24
    packuswb   xmm6, xmm1
    packuswb   xmm6, xmm6  // 8 A values
    punpcklbw  xmm5, xmm6  // 8 RA values
    movdqa     xmm1, xmm0  // Weave BG, RA together
    punpcklwd  xmm0, xmm5  // BGRA first 4
    punpckhwd  xmm1, xmm5  // BGRA next 4
    movdqu     [eax], xmm0
    movdqu     [eax + 16], xmm1
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBSEPIAROW_SSSE3

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
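
// Each output channel is a dot product of the BGRA input with one row of
// the 4x4 matrix, >> 6, clamped. A scalar sketch (hypothetical _Sketch name;
// illustrative only):
#if 0
static void ARGBColorMatrixRow_Sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      const int8_t* m,
                                      int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {  // B, G, R, A outputs
      int v = (src_argb[0] * m[j * 4 + 0] + src_argb[1] * m[j * 4 + 1] +
               src_argb[2] * m[j * 4 + 2] + src_argb[3] * m[j * 4 + 3]) >> 6;
      dst_argb[j] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif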
__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
                                                uint8_t* dst_argb,
                                                const int8_t* matrix_argb,
                                                int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_argb */
    mov        ecx, [esp + 12] /* matrix_argb */
    movdqu     xmm5, [ecx]
    pshufd     xmm2, xmm5, 0x00
    pshufd     xmm3, xmm5, 0x55
    pshufd     xmm4, xmm5, 0xaa
    pshufd     xmm5, xmm5, 0xff
    mov        ecx, [esp + 16] /* width */

 convertloop:
    movdqu     xmm0, [eax]  // B
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm7, xmm2
    movdqu     xmm6, [eax]  // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm6, xmm3
    pmaddubsw  xmm1, xmm3
    phaddsw    xmm0, xmm7  // B
    phaddsw    xmm6, xmm1  // G
    psraw      xmm0, 6  // B
    psraw      xmm6, 6  // G
    packuswb   xmm0, xmm0  // 8 B values
    packuswb   xmm6, xmm6  // 8 G values
    punpcklbw  xmm0, xmm6  // 8 BG values
    movdqu     xmm1, [eax]  // R
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm7, xmm4
    phaddsw    xmm1, xmm7  // R
    movdqu     xmm6, [eax]  // A
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm6, xmm5
    pmaddubsw  xmm7, xmm5
    phaddsw    xmm6, xmm7  // A
    psraw      xmm1, 6  // R
    psraw      xmm6, 6  // A
    packuswb   xmm1, xmm1  // 8 R values
    packuswb   xmm6, xmm6  // 8 A values
    punpcklbw  xmm1, xmm6  // 8 RA values
    movdqa     xmm6, xmm0  // Weave BG, RA together
    punpcklwd  xmm0, xmm1  // BGRA first 4
    punpckhwd  xmm6, xmm1  // BGRA next 4
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm6
    lea        eax, [eax + 32]
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
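
// The quantize math below is v = (v * scale >> 16) * interval_size +
// interval_offset per color channel, with alpha preserved. A scalar sketch
// (hypothetical _Sketch name; illustrative only):
#if 0
static void ARGBQuantizeRow_Sketch(uint8_t* dst_argb,
                                   int scale,
                                   int interval_size,
                                   int interval_offset,
                                   int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 3; ++j) {  // B, G, R; alpha is masked in and kept.
      int v = (dst_argb[j] * scale >> 16) * interval_size + interval_offset;
      dst_argb[j] = (uint8_t)(v > 255 ? 255 : v);
    }
    dst_argb += 4;
  }
}
#endif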
__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
                                            int scale,
                                            int interval_size,
                                            int interval_offset,
                                            int width) {
  __asm {
    mov        eax, [esp + 4] /* dst_argb */
    movd       xmm2, [esp + 8] /* scale */
    movd       xmm3, [esp + 12] /* interval_size */
    movd       xmm4, [esp + 16] /* interval_offset */
    mov        ecx, [esp + 20] /* width */
    pshuflw    xmm2, xmm2, 040h
    pshufd     xmm2, xmm2, 044h
    pshuflw    xmm3, xmm3, 040h
    pshufd     xmm3, xmm3, 044h
    pshuflw    xmm4, xmm4, 040h
    pshufd     xmm4, xmm4, 044h
    pxor       xmm5, xmm5  // constant 0
    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
    pslld      xmm6, 24

 convertloop:
    movdqu     xmm0, [eax]  // read 4 pixels
    punpcklbw  xmm0, xmm5  // first 2 pixels
    pmulhuw    xmm0, xmm2  // pixel * scale >> 16
    movdqu     xmm1, [eax]  // read 4 pixels
    punpckhbw  xmm1, xmm5  // next 2 pixels
    pmulhuw    xmm1, xmm2
    pmullw     xmm0, xmm3  // * interval_size
    movdqu     xmm7, [eax]  // read 4 pixels
    pmullw     xmm1, xmm3
    pand       xmm7, xmm6  // mask alpha
    paddw      xmm0, xmm4  // + interval_offset
    paddw      xmm1, xmm4
    packuswb   xmm0, xmm1
    por        xmm0, xmm7
    movdqu     [eax], xmm0
    lea        eax, [eax + 16]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2

#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
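
// The asm below widens pixel and shade bytes to v * 257 words and takes the
// pmulhuw high half, roughly (v * shade) / 255 per channel. A rough scalar
// model (hypothetical _Sketch name; illustrative only):
#if 0
static void ARGBShadeRow_Sketch(const uint8_t* src_argb,
                                uint8_t* dst_argb,
                                int width,
                                uint32_t value) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      uint32_t s = (value >> (j * 8)) & 0xff;  // per-channel factor
      dst_argb[j] = (uint8_t)(src_argb[j] * s / 255);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif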
__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width,
                                         uint32_t value) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // width
    movd       xmm2, [esp + 16]  // value
    punpcklbw  xmm2, xmm2
    punpcklqdq xmm2, xmm2

 convertloop:
    movdqu     xmm0, [eax]  // read 4 pixels
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0  // first 2
    punpckhbw  xmm1, xmm1  // next 2
    pmulhuw    xmm0, xmm2  // argb * value
    pmulhuw    xmm1, xmm2  // argb * value
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    ret
  }
}
#endif  // HAS_ARGBSHADEROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
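
// One source is widened to v * 257 and multiplied by the other with
// pmulhuw, i.e. dst = (p * q * 257) >> 16, a close approximation of
// p * q / 255. A scalar restatement (hypothetical _Sketch name;
// illustrative only):
#if 0
static void ARGBMultiplyRow_Sketch(const uint8_t* src_argb,
                                   const uint8_t* src_argb1,
                                   uint8_t* dst_argb,
                                   int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {  // every byte, including alpha
    dst_argb[i] = (uint8_t)((src_argb[i] * src_argb1[i] * 257) >> 16);
  }
}
#endif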
__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    pxor       xmm5, xmm5  // constant 0

 convertloop:
    movdqu     xmm0, [eax]  // read 4 pixels from src_argb
    movdqu     xmm2, [esi]  // read 4 pixels from src_argb1
    movdqu     xmm1, xmm0
    movdqu     xmm3, xmm2
    punpcklbw  xmm0, xmm0  // first 2
    punpckhbw  xmm1, xmm1  // next 2
    punpcklbw  xmm2, xmm5  // first 2
    punpckhbw  xmm3, xmm5  // next 2
    pmulhuw    xmm0, xmm2  // src_argb * src_argb1 first 2
    pmulhuw    xmm1, xmm3  // src_argb * src_argb1 next 2
    lea        eax, [eax + 16]
    lea        esi, [esi + 16]
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2

#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
                                       const uint8_t* src_argb1,
                                       uint8_t* dst_argb,
                                       int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

    sub        ecx, 4
    jl         convertloop49

 convertloop4:
    movdqu     xmm0, [eax]  // read 4 pixels from src_argb
    lea        eax, [eax + 16]
    movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
    lea        esi, [esi + 16]
    paddusb    xmm0, xmm1  // src_argb + src_argb1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jge        convertloop4

 convertloop49:
    add        ecx, 4 - 1
    jl         convertloop19

 convertloop1:
    movd       xmm0, [eax]  // read 1 pixel from src_argb
    lea        eax, [eax + 4]
    movd       xmm1, [esi]  // read 1 pixel from src_argb1
    lea        esi, [esi + 4]
    paddusb    xmm0, xmm1  // src_argb + src_argb1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    sub        ecx, 1
    jge        convertloop1

 convertloop19:
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBADDROW_SSE2

#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract one row of ARGB pixels from another, 4 pixels at a time.
__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

 convertloop:
    movdqu     xmm0, [eax]  // read 4 pixels from src_argb
    lea        eax, [eax + 16]
    movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
    lea        esi, [esi + 16]
    psubusb    xmm0, xmm1  // src_argb - src_argb1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    vpxor      ymm5, ymm5, ymm5  // constant 0

 convertloop:
    vmovdqu    ymm1, [eax]  // read 8 pixels from src_argb
    lea        eax, [eax + 32]
    vmovdqu    ymm3, [esi]  // read 8 pixels from src_argb1
    lea        esi, [esi + 32]
    vpunpcklbw ymm0, ymm1, ymm1  // low 4
    vpunpckhbw ymm1, ymm1, ymm1  // high 4
    vpunpcklbw ymm2, ymm3, ymm5  // low 4
    vpunpckhbw ymm3, ymm3, ymm5  // high 4
    vpmulhuw   ymm0, ymm0, ymm2  // src_argb * src_argb1 low 4
    vpmulhuw   ymm1, ymm1, ymm3  // src_argb * src_argb1 high 4
    vpackuswb  ymm0, ymm0, ymm1
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBMULTIPLYROW_AVX2

#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb,
                                       const uint8_t* src_argb1,
                                       uint8_t* dst_argb,
                                       int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

 convertloop:
    vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb
    lea        eax, [eax + 32]
    vpaddusb   ymm0, ymm0, [esi]  // add 8 pixels from src_argb1
    lea        esi, [esi + 32]
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBADDROW_AVX2

#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract one row of ARGB pixels from another, 8 pixels at a time.
__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

 convertloop:
    vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb
    lea        eax, [eax + 32]
    vpsubusb   ymm0, ymm0, [esi]  // src_argb - src_argb1
    lea        esi, [esi + 32]
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
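
// Scalar restatement of the kernel (illustrative only; SobelYRow_SSE2 below
// applies the transposed kernel the same way):
#if 0
static void SobelXRow_Sketch(const uint8_t* src_y0,  // row above
                             const uint8_t* src_y1,  // center row
                             const uint8_t* src_y2,  // row below
                             uint8_t* dst_sobelx,
                             int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + b * 2 + c;
    if (sobel < 0) sobel = -sobel;  // abs, as max(x, -x) in the asm
    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
  }
}
#endif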
__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
                                      const uint8_t* src_y1,
                                      const uint8_t* src_y2,
                                      uint8_t* dst_sobelx,
                                      int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_y0
    mov        esi, [esp + 8 + 8]  // src_y1
    mov        edi, [esp + 8 + 12]  // src_y2
    mov        edx, [esp + 8 + 16]  // dst_sobelx
    mov        ecx, [esp + 8 + 20]  // width
    sub        esi, eax
    sub        edi, eax
    sub        edx, eax
    pxor       xmm5, xmm5  // constant 0

 convertloop:
    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
    movq       xmm1, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    psubw      xmm0, xmm1
    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm2, xmm5
    psubw      xmm1, xmm2
    movq       xmm2, qword ptr [eax + edi]  // read 8 pixels from src_y2[0]
    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
    punpcklbw  xmm2, xmm5
    punpcklbw  xmm3, xmm5
    psubw      xmm2, xmm3
    paddw      xmm0, xmm2
    paddw      xmm0, xmm1
    paddw      xmm0, xmm1
    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
    psubw      xmm1, xmm0
    pmaxsw     xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [eax + edx], xmm0
    lea        eax, [eax + 8]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_SOBELXROW_SSE2

#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
                                      const uint8_t* src_y1,
                                      uint8_t* dst_sobely,
                                      int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_y0
    mov        esi, [esp + 4 + 8]  // src_y1
    mov        edx, [esp + 4 + 12]  // dst_sobely
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax
    sub        edx, eax
    pxor       xmm5, xmm5  // constant 0

 convertloop:
    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    psubw      xmm0, xmm1
    movq       xmm1, qword ptr [eax + 1]  // read 8 pixels from src_y0[1]
    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm2, xmm5
    psubw      xmm1, xmm2
    movq       xmm2, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw  xmm2, xmm5
    punpcklbw  xmm3, xmm5
    psubw      xmm2, xmm3
    paddw      xmm0, xmm2
    paddw      xmm0, xmm1
    paddw      xmm0, xmm1
    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
    psubw      xmm1, xmm0
    pmaxsw     xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [eax + edx], xmm0
    lea        eax, [eax + 8]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELYROW_SSE2

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
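
// Scalar model of the packing below (illustrative only):
#if 0
static void SobelRow_Sketch(const uint8_t* src_sobelx,
                            const uint8_t* src_sobely,
                            uint8_t* dst_argb,
                            int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    uint8_t g = (uint8_t)(s > 255 ? 255 : s);  // saturated add
    dst_argb[0] = g;                           // B
    dst_argb[1] = g;                           // G
    dst_argb[2] = g;                           // R
    dst_argb[3] = 255;                         // A
    dst_argb += 4;
  }
}
#endif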
__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
                                     const uint8_t* src_sobely,
                                     uint8_t* dst_argb,
                                     int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_sobelx
    mov        esi, [esp + 4 + 8]  // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax
    pcmpeqb    xmm5, xmm5  // alpha 255
    pslld      xmm5, 24  // 0xff000000

 convertloop:
    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    paddusb    xmm0, xmm1  // sobel = sobelx + sobely
    movdqa     xmm2, xmm0  // GG
    punpcklbw  xmm2, xmm0  // First 8
    punpckhbw  xmm0, xmm0  // Next 8
    movdqa     xmm1, xmm2  // GGGG
    punpcklwd  xmm1, xmm2  // First 4
    punpckhwd  xmm2, xmm2  // Next 4
    por        xmm1, xmm5  // GGGA
    por        xmm2, xmm5
    movdqa     xmm3, xmm0  // GGGG
    punpcklwd  xmm3, xmm0  // Next 4
    punpckhwd  xmm0, xmm0  // Last 4
    por        xmm3, xmm5  // GGGA
    por        xmm0, xmm5
    movdqu     [edx], xmm1
    movdqu     [edx + 16], xmm2
    movdqu     [edx + 32], xmm3
    movdqu     [edx + 48], xmm0
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELROW_SSE2

#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
                                            const uint8_t* src_sobely,
                                            uint8_t* dst_y,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_sobelx
    mov        esi, [esp + 4 + 8]  // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_y
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax

 convertloop:
    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    paddusb    xmm0, xmm1  // sobel = sobelx + sobely
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELTOPLANEROW_SSE2

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
                                       const uint8_t* src_sobely,
                                       uint8_t* dst_argb,
                                       int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_sobelx
    mov        esi, [esp + 4 + 8]  // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax
    pcmpeqb    xmm5, xmm5  // alpha 255

 convertloop:
    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    movdqa     xmm2, xmm0
    paddusb    xmm2, xmm1  // sobel = sobelx + sobely
    movdqa     xmm3, xmm0  // XA
    punpcklbw  xmm3, xmm5
    punpckhbw  xmm0, xmm5
    movdqa     xmm4, xmm1  // YS
    punpcklbw  xmm4, xmm2
    punpckhbw  xmm1, xmm2
    movdqa     xmm6, xmm4  // YSXA
    punpcklwd  xmm6, xmm3  // First 4
    punpckhwd  xmm4, xmm3  // Next 4
    movdqa     xmm7, xmm1  // YSXA
    punpcklwd  xmm7, xmm0  // Next 4
    punpckhwd  xmm1, xmm0  // Last 4
    movdqu     [edx], xmm6
    movdqu     [edx + 16], xmm4
    movdqu     [edx + 32], xmm7
    movdqu     [edx + 48], xmm1
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELXYROW_SSE2

#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at a time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
// Convert cumulative sum for an area to an average for 1 pixel.
// topleft is pointer to top left of CumulativeSum buffer for area.
// botleft is pointer to bottom left of CumulativeSum buffer.
// width is offset from left to right of area in CumulativeSum buffer measured
//   in number of ints.
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time.
// This function requires alignment on accumulation buffer pointers.
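
// Scalar restatement of the integral-image average (illustrative only; the
// asm uses a float reciprocal, so rounding differs slightly):
#if 0
static void CumulativeSumToAverageRow_Sketch(const int32_t* topleft,
                                             const int32_t* botleft,
                                             int width,  // in ints
                                             int area,
                                             uint8_t* dst,
                                             int count) {
  int i;
  for (i = 0; i < count * 4; ++i) {  // 4 ints (ARGB) per pixel
    int32_t sum = topleft[i] - topleft[i + width] - botleft[i] +
                  botleft[i + width];
    dst[i] = (uint8_t)(sum / area);
  }
}
#endif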
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
                                    const int32_t* botleft,
                                    int width,
                                    int area,
                                    uint8_t* dst,
                                    int count) {
  __asm {
    mov        eax, topleft  // eax topleft
    mov        esi, botleft  // esi botleft
    mov        edx, width
    movd       xmm5, area
    mov        edi, dst
    mov        ecx, count
    cvtdq2ps   xmm5, xmm5
    rcpss      xmm4, xmm5  // 1.0f / area
    pshufd     xmm4, xmm4, 0
    sub        ecx, 4
    jl         l4b

    cmp        area, 128  // 128 pixels will not overflow 15 bits.
    ja         l4

    pshufd     xmm5, xmm5, 0  // area
    pcmpeqb    xmm6, xmm6  // constant of 65536.0 - 1 = 65535.0
    psrld      xmm6, 16
    cvtdq2ps   xmm6, xmm6
    addps      xmm5, xmm6  // (65536.0 + area - 1)
    mulps      xmm5, xmm4  // (65536.0 + area - 1) * 1 / area
    cvtps2dq   xmm5, xmm5  // 0.16 fixed point
    packssdw   xmm5, xmm5  // 16 bit shorts

        // 4 pixel loop small blocks.
  s4:
        // top left
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]

    // - top right
    psubd      xmm0, [eax + edx * 4]
    psubd      xmm1, [eax + edx * 4 + 16]
    psubd      xmm2, [eax + edx * 4 + 32]
    psubd      xmm3, [eax + edx * 4 + 48]
    lea        eax, [eax + 64]

    // - bottom left
    psubd      xmm0, [esi]
    psubd      xmm1, [esi + 16]
    psubd      xmm2, [esi + 32]
    psubd      xmm3, [esi + 48]

    // + bottom right
    paddd      xmm0, [esi + edx * 4]
    paddd      xmm1, [esi + edx * 4 + 16]
    paddd      xmm2, [esi + edx * 4 + 32]
    paddd      xmm3, [esi + edx * 4 + 48]
    lea        esi, [esi + 64]

    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
    packssdw   xmm2, xmm3

    pmulhuw    xmm0, xmm5
    pmulhuw    xmm2, xmm5

    packuswb   xmm0, xmm2
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4
    jge        s4

    jmp        l4b

            // 4 pixel loop
  l4:
        // top left
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]

    // - top right
    psubd      xmm0, [eax + edx * 4]
    psubd      xmm1, [eax + edx * 4 + 16]
    psubd      xmm2, [eax + edx * 4 + 32]
    psubd      xmm3, [eax + edx * 4 + 48]
    lea        eax, [eax + 64]

    // - bottom left
    psubd      xmm0, [esi]
    psubd      xmm1, [esi + 16]
    psubd      xmm2, [esi + 32]
    psubd      xmm3, [esi + 48]

    // + bottom right
    paddd      xmm0, [esi + edx * 4]
    paddd      xmm1, [esi + edx * 4 + 16]
    paddd      xmm2, [esi + edx * 4 + 32]
    paddd      xmm3, [esi + edx * 4 + 48]
    lea        esi, [esi + 64]

    cvtdq2ps   xmm0, xmm0  // Average = Sum * 1 / Area
    cvtdq2ps   xmm1, xmm1
    mulps      xmm0, xmm4
    mulps      xmm1, xmm4
    cvtdq2ps   xmm2, xmm2
    cvtdq2ps   xmm3, xmm3
    mulps      xmm2, xmm4
    mulps      xmm3, xmm4
    cvtps2dq   xmm0, xmm0
    cvtps2dq   xmm1, xmm1
    cvtps2dq   xmm2, xmm2
    cvtps2dq   xmm3, xmm3
    packssdw   xmm0, xmm1
    packssdw   xmm2, xmm3
    packuswb   xmm0, xmm2
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

            // 1 pixel loop
  l1:
    movdqu     xmm0, [eax]
    psubd      xmm0, [eax + edx * 4]
    lea        eax, [eax + 16]
    psubd      xmm0, [esi]
    paddd      xmm0, [esi + edx * 4]
    lea        esi, [esi + 16]
    cvtdq2ps   xmm0, xmm0
    mulps      xmm0, xmm4
    cvtps2dq   xmm0, xmm0
    packssdw   xmm0, xmm0
    packuswb   xmm0, xmm0
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 1
    jge        l1
  l1b:
  }
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
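
// Scalar restatement (illustrative only): a running sum along the row plus
// the cumulative row above yields the 2D cumulative sum.
#if 0
static void ComputeCumulativeSumRow_Sketch(const uint8_t* row,
                                           int32_t* cumsum,
                                           const int32_t* previous_cumsum,
                                           int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {  // B, G, R, A
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}
#endif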
void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
                                  int32_t* cumsum,
                                  const int32_t* previous_cumsum,
                                  int width) {
  __asm {
    mov        eax, row
    mov        edx, cumsum
    mov        esi, previous_cumsum
    mov        ecx, width
    pxor       xmm0, xmm0
    pxor       xmm1, xmm1

    sub        ecx, 4
    jl         l4b
    test       edx, 15
    jne        l4b

        // 4 pixel loop
  l4:
    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
    lea        eax, [eax + 16]
    movdqa     xmm4, xmm2

    punpcklbw  xmm2, xmm1
    movdqa     xmm3, xmm2
    punpcklwd  xmm2, xmm1
    punpckhwd  xmm3, xmm1

    punpckhbw  xmm4, xmm1
    movdqa     xmm5, xmm4
    punpcklwd  xmm4, xmm1
    punpckhwd  xmm5, xmm1

    paddd      xmm0, xmm2
    movdqu     xmm2, [esi]  // previous row above.
    paddd      xmm2, xmm0

    paddd      xmm0, xmm3
    movdqu     xmm3, [esi + 16]
    paddd      xmm3, xmm0

    paddd      xmm0, xmm4
    movdqu     xmm4, [esi + 32]
    paddd      xmm4, xmm0

    paddd      xmm0, xmm5
    movdqu     xmm5, [esi + 48]
    lea        esi, [esi + 64]
    paddd      xmm5, xmm0

    movdqu     [edx], xmm2
    movdqu     [edx + 16], xmm3
    movdqu     [edx + 32], xmm4
    movdqu     [edx + 48], xmm5

    lea        edx, [edx + 64]
    sub        ecx, 4
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

            // 1 pixel loop
  l1:
    movd       xmm2, dword ptr [eax]  // 1 argb pixel
    lea        eax, [eax + 4]
    punpcklbw  xmm2, xmm1
    punpcklwd  xmm2, xmm1
    paddd      xmm0, xmm2
    movdqu     xmm2, [esi]
    lea        esi, [esi + 16]
    paddd      xmm2, xmm0
    movdqu     [edx], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 1
    jge        l1

 l1b:
  }
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2

#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
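
// The row walks (u, v) by (du, dv) and fetches one source pixel per
// destination pixel. A scalar sketch (hypothetical _Sketch name;
// illustrative only):
#if 0
static void ARGBAffineRow_Sketch(const uint8_t* src_argb,
                                 int src_argb_stride,
                                 uint8_t* dst_argb,
                                 const float* uv_dudv,  // {u, v, du, dv}
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)u;  // truncate, as cvttps2dq does
    int y = (int)v;
    *(uint32_t*)(dst_argb + i * 4) =
        *(const uint32_t*)(src_argb + x * 4 + y * src_argb_stride);
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}
#endif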
__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
                                                     int src_argb_stride,
                                                     uint8_t* dst_argb,
                                                     const float* uv_dudv,
                                                     int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 12]  // src_argb
    mov        esi, [esp + 16]  // stride
    mov        edx, [esp + 20]  // dst_argb
    mov        ecx, [esp + 24]  // pointer to uv_dudv
    movq       xmm2, qword ptr [ecx]  // uv
    movq       xmm7, qword ptr [ecx + 8]  // dudv
    mov        ecx, [esp + 28]  // width
    shl        esi, 16  // 4, stride
    add        esi, 4
    movd       xmm5, esi
    sub        ecx, 4
    jl         l4b

        // setup for 4 pixel loop
    pshufd     xmm7, xmm7, 0x44  // dup dudv
    pshufd     xmm5, xmm5, 0  // dup 4, stride
    movdqa     xmm0, xmm2  // x0, y0, x1, y1
    addps      xmm0, xmm7
    movlhps    xmm2, xmm0
    movdqa     xmm4, xmm7
    addps      xmm4, xmm4  // dudv *= 2
    movdqa     xmm3, xmm2  // x2, y2, x3, y3
    addps      xmm3, xmm4
    addps      xmm4, xmm4  // dudv *= 4

        // 4 pixel loop
  l4:
    cvttps2dq  xmm0, xmm2  // x, y float to int first 2
    cvttps2dq  xmm1, xmm3  // x, y float to int next 2
    packssdw   xmm0, xmm1  // x, y as 8 shorts
    pmaddwd    xmm0, xmm5  // offsets = x * 4 + y * stride.
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       edi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       xmm1, [eax + esi]  // read pixel 0
    movd       xmm6, [eax + edi]  // read pixel 1
    punpckldq  xmm1, xmm6  // combine pixel 0 and 1
    addps      xmm2, xmm4  // x, y += dx, dy first 2
    movq       qword ptr [edx], xmm1
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       edi, xmm0
    movd       xmm6, [eax + esi]  // read pixel 2
    movd       xmm0, [eax + edi]  // read pixel 3
    punpckldq  xmm6, xmm0  // combine pixel 2 and 3
    addps      xmm3, xmm4  // x, y += dx, dy next 2
    movq       qword ptr [edx + 8], xmm6
    lea        edx, [edx + 16]
    sub        ecx, 4
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

            // 1 pixel loop
  l1:
    cvttps2dq  xmm0, xmm2  // x, y float to int
    packssdw   xmm0, xmm0  // x, y as shorts
    pmaddwd    xmm0, xmm5  // offset = x * 4 + y * stride
    addps      xmm2, xmm7  // x, y += dx, dy
    movd       esi, xmm0
    movd       xmm0, [eax + esi]  // copy a pixel
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    sub        ecx, 1
    jge        l1
  l1b:
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBAFFINEROW_SSE2

#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
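
// A vertical lerp between two rows: dst = (src * (256 - f) + src1 * f +
// 128) >> 8, with the 0 and 128 fractions dispatched to copy and pavgb
// fast paths. A scalar sketch (hypothetical _Sketch name; illustrative
// only):
#if 0
static void InterpolateRow_Sketch(uint8_t* dst_ptr,
                                  const uint8_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  int width,
                                  int source_y_fraction) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  int f = source_y_fraction;  // 0..255
  int i;
  for (i = 0; i < width; ++i) {
    dst_ptr[i] =
        (uint8_t)((src_ptr[i] * (256 - f) + src_ptr1[i] * f + 128) >> 8);
  }
}
#endif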
__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
                                           const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           int dst_width,
                                           int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]  // dst_ptr
    mov        esi, [esp + 8 + 8]  // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256.  Blend 100 / 0.
    sub        edi, esi
    cmp        eax, 128
    je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.

    vmovd      xmm0, eax  // high fraction 0..255
    neg        eax
    add        eax, 256
    vmovd      xmm5, eax  // low fraction 256..1
    vpunpcklbw xmm5, xmm5, xmm0
    vpunpcklwd xmm5, xmm5, xmm5
    vbroadcastss ymm5, xmm5

    mov        eax, 0x80808080  // 128b for bias and rounding.
    vmovd      xmm4, eax
    vbroadcastss ymm4, xmm4

  xloop:
    vmovdqu    ymm0, [esi]
    vmovdqu    ymm2, [esi + edx]
    vpunpckhbw ymm1, ymm0, ymm2  // mutates
    vpunpcklbw ymm0, ymm0, ymm2
    vpsubb     ymm1, ymm1, ymm4  // bias to signed image
    vpsubb     ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm5, ymm1
    vpmaddubsw ymm0, ymm5, ymm0
    vpaddw     ymm1, ymm1, ymm4  // unbias and round
    vpaddw     ymm0, ymm0, ymm4
    vpsrlw     ymm1, ymm1, 8
    vpsrlw     ymm0, ymm0, 8
    vpackuswb  ymm0, ymm0, ymm1  // unmutates
    vmovdqu    [esi + edi], ymm0
    lea        esi, [esi + 32]
    sub        ecx, 32
    jg         xloop
    jmp        xloop99

        // Blend 50 / 50.
  xloop50:
    vmovdqu    ymm0, [esi]
    vpavgb     ymm0, ymm0, [esi + edx]
    vmovdqu    [esi + edi], ymm0
    lea        esi, [esi + 32]
    sub        ecx, 32
    jg         xloop50
    jmp        xloop99

        // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    rep movsb

  xloop99:
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_INTERPOLATEROW_AVX2

// Bilinear filter 16x2 -> 16x1
// TODO(fbarchard): Consider allowing 256 using memcpy.
__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
                                            const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            int dst_width,
                                            int source_y_fraction) {
  __asm {
    push       esi
    push       edi

    mov        edi, [esp + 8 + 4]  // dst_ptr
    mov        esi, [esp + 8 + 8]  // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
        // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256.  Blend 100 / 0.
    cmp        eax, 128
    je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.

    movd       xmm0, eax  // high fraction 0..255
    neg        eax
    add        eax, 256
    movd       xmm5, eax  // low fraction 255..1
    punpcklbw  xmm5, xmm0
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x80808080  // 128 for biasing image to signed.
    movd       xmm4, eax
    pshufd     xmm4, xmm4, 0x00

  xloop:
    movdqu     xmm0, [esi]
    movdqu     xmm2, [esi + edx]
    movdqu     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    psubb      xmm0, xmm4  // bias image by -128
    psubb      xmm1, xmm4
    movdqa     xmm2, xmm5
    movdqa     xmm3, xmm5
    pmaddubsw  xmm2, xmm0
    pmaddubsw  xmm3, xmm1
    paddw      xmm2, xmm4
    paddw      xmm3, xmm4
    psrlw      xmm2, 8
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqu     [esi + edi], xmm2
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop
    jmp        xloop99

        // Blend 50 / 50.
  xloop50:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop50
    jmp        xloop99

        // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    movdqu     xmm0, [esi]
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop100

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
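
// The shuffler is a 16-byte pshufb table covering 4 pixels; for these
// swizzles the pattern repeats per 4-byte pixel. A scalar sketch
// (hypothetical _Sketch name; illustrative only):
#if 0
static void ARGBShuffleRow_Sketch(const uint8_t* src_argb,
                                  uint8_t* dst_argb,
                                  const uint8_t* shuffler,
                                  int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      dst_argb[j] = src_argb[shuffler[j] & 3];  // index within the pixel
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif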
5924 __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
5925                                             uint8_t* dst_argb,
5926                                             const uint8_t* shuffler,
5927                                             int width) {
5928   __asm {
5929     mov        eax, [esp + 4]  // src_argb
5930     mov        edx, [esp + 8]  // dst_argb
5931     mov        ecx, [esp + 12]  // shuffler
5932     movdqu     xmm5, [ecx]
5933     mov        ecx, [esp + 16]  // width
5934 
5935   wloop:
5936     movdqu     xmm0, [eax]
5937     movdqu     xmm1, [eax + 16]
5938     lea        eax, [eax + 32]
5939     pshufb     xmm0, xmm5
5940     pshufb     xmm1, xmm5
5941     movdqu     [edx], xmm0
5942     movdqu     [edx + 16], xmm1
5943     lea        edx, [edx + 32]
5944     sub        ecx, 8
5945     jg         wloop
5946     ret
5947   }
5948 }
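
// A scalar sketch of the shuffle semantics (illustrative only; the name and
// the assumption that the 16-byte mask repeats the same 4-byte permutation
// for every pixel are mine, not the original file's).
static void ARGBShuffleRow_Sketch_C(const uint8_t* src_argb,
                                    uint8_t* dst_argb,
                                    const uint8_t* shuffler,
                                    int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 4; ++j) {
      // E.g. shuffler = {3, 2, 1, 0, ...} reverses each 4-byte pixel.
      dst_argb[i * 4 + j] = src_argb[i * 4 + (shuffler[j] & 3)];
    }
  }
}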

#ifdef HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                                           uint8_t* dst_argb,
                                           const uint8_t* shuffler,
                                           int width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // shuffler
    vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
    mov        ecx, [esp + 16]  // width

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpshufb    ymm0, ymm0, ymm5
    vpshufb    ymm1, ymm1, ymm5
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         wloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....

// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1

__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                                          const uint8_t* src_u,
                                          const uint8_t* src_v,
                                          uint8_t* dst_frame,
                                          int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_y
    mov        esi, [esp + 8 + 8]  // src_u
    mov        edx, [esp + 8 + 12]  // src_v
    mov        edi, [esp + 8 + 16]  // dst_frame
    mov        ecx, [esp + 8 + 20]  // width
    sub        edx, esi

  convertloop:
    movq       xmm2, qword ptr [esi]  // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3  // UV
    movdqu     xmm0, [eax]  // Y
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2  // YUYV
    punpckhbw  xmm1, xmm2
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm1
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

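// A scalar sketch of the packing above (illustrative only, not part of the
// original file): each YUY2 macro-pixel carries two Y samples and one U/V
// pair.  Odd widths are ignored here for brevity.
static void I422ToYUY2Row_Sketch_C(const uint8_t* src_y,
                                   const uint8_t* src_u,
                                   const uint8_t* src_v,
                                   uint8_t* dst_frame,
                                   int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_frame[0] = src_y[0];  // Y0
    dst_frame[1] = src_u[0];  // U
    dst_frame[2] = src_y[1];  // Y1
    dst_frame[3] = src_v[0];  // V
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_frame += 4;
  }
}
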
__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                                          const uint8_t* src_u,
                                          const uint8_t* src_v,
                                          uint8_t* dst_frame,
                                          int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_y
    mov        esi, [esp + 8 + 8]  // src_u
    mov        edx, [esp + 8 + 12]  // src_v
    mov        edi, [esp + 8 + 16]  // dst_frame
    mov        ecx, [esp + 8 + 20]  // width
    sub        edx, esi

  convertloop:
    movq       xmm2, qword ptr [esi]  // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3  // UV
    movdqu     xmm0, [eax]  // Y
    movdqa     xmm1, xmm2
    lea        eax, [eax + 16]
    punpcklbw  xmm1, xmm0  // UYVY
    punpckhbw  xmm2, xmm0
    movdqu     [edi], xmm1
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_argb,
                                              const float* poly,
                                              int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4] /* src_argb */
    mov        edx, [esp + 4 + 8] /* dst_argb */
    mov        esi, [esp + 4 + 12] /* poly */
    mov        ecx, [esp + 4 + 16] /* width */
    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.

    // 2 pixel loop.
  convertloop:
    // pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel (SSE4.1 alternative)
    // pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
    movq       xmm0, qword ptr [eax]  // BGRABGRA
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm3
    movdqa     xmm4, xmm0
    punpcklwd  xmm0, xmm3  // pixel 0
    punpckhwd  xmm4, xmm3  // pixel 1
    cvtdq2ps   xmm0, xmm0  // 4 floats
    cvtdq2ps   xmm4, xmm4
    movdqa     xmm1, xmm0  // X
    movdqa     xmm5, xmm4
    mulps      xmm0, [esi + 16]  // C1 * X
    mulps      xmm4, [esi + 16]
    addps      xmm0, [esi]  // result = C0 + C1 * X
    addps      xmm4, [esi]
    movdqa     xmm2, xmm1
    movdqa     xmm6, xmm5
    mulps      xmm2, xmm1  // X * X
    mulps      xmm6, xmm5
    mulps      xmm1, xmm2  // X * X * X
    mulps      xmm5, xmm6
    mulps      xmm2, [esi + 32]  // C2 * X * X
    mulps      xmm6, [esi + 32]
    mulps      xmm1, [esi + 48]  // C3 * X * X * X
    mulps      xmm5, [esi + 48]
    addps      xmm0, xmm2  // result += C2 * X * X
    addps      xmm4, xmm6
    addps      xmm0, xmm1  // result += C3 * X * X * X
    addps      xmm4, xmm5
    cvttps2dq  xmm0, xmm0
    cvttps2dq  xmm4, xmm4
    packuswb   xmm0, xmm4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 2
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

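// A scalar sketch of the cubic evaluated above (illustrative only, not part
// of the original file).  poly points at 4 vectors of 4 floats, C0..C3, one
// coefficient per BGRA channel; the SIMD versions clamp via saturating packs.
static void ARGBPolynomialRow_Sketch_C(const uint8_t* src_argb,
                                       uint8_t* dst_argb,
                                       const float* poly,
                                       int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 4; ++j) {
      float x = (float)src_argb[i * 4 + j];
      float v = poly[j] + poly[j + 4] * x + poly[j + 8] * x * x +
                poly[j + 12] * x * x * x;
      if (v < 0.f) v = 0.f;  // clamp like the saturating packs
      if (v > 255.f) v = 255.f;
      dst_argb[i * 4 + j] = (uint8_t)v;
    }
  }
}
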
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_argb,
                                              const float* poly,
                                              int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_argb */
    mov        ecx, [esp + 12] /* poly */
    vbroadcastf128 ymm4, [ecx]  // C0
    vbroadcastf128 ymm5, [ecx + 16]  // C1
    vbroadcastf128 ymm6, [ecx + 32]  // C2
    vbroadcastf128 ymm7, [ecx + 48]  // C3
    mov        ecx, [esp + 16] /* width */

    // 2 pixel loop.
  convertloop:
    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
    lea         eax, [eax + 8]
    vcvtdq2ps   ymm0, ymm0  // X: 8 floats
    vmulps      ymm2, ymm0, ymm0  // X * X
    vmulps      ymm3, ymm0, ymm7  // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
    vcvttps2dq  ymm0, ymm0
    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
    vmovq       qword ptr [edx], xmm0
    lea         edx, [edx + 8]
    sub         ecx, 2
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_HALFFLOATROW_SSE2
static float kExpBias = 1.9259299444e-34f;  // 2^-112, rebiases the float exponent.
__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
                                         uint16_t* dst,
                                         float scale,
                                         int width) {
  __asm {
    mov        eax, [esp + 4] /* src */
    mov        edx, [esp + 8] /* dst */
    movd       xmm4, dword ptr [esp + 12] /* scale */
    mov        ecx, [esp + 16] /* width */
    mulss      xmm4, kExpBias
    pshufd     xmm4, xmm4, 0
    pxor       xmm5, xmm5
    sub        edx, eax

    // 8 pixel loop.
  convertloop:
    movdqu      xmm2, xmmword ptr [eax]  // 8 shorts
    add         eax, 16
    movdqa      xmm3, xmm2
    punpcklwd   xmm2, xmm5  // low 4 shorts to ints
    cvtdq2ps    xmm2, xmm2  // convert 4 ints to floats
    punpckhwd   xmm3, xmm5  // high 4 shorts to ints
    cvtdq2ps    xmm3, xmm3
    mulps       xmm2, xmm4  // scale and rebias exponent
    mulps       xmm3, xmm4
    psrld       xmm2, 13  // float bits shifted into half float position
    psrld       xmm3, 13
    packssdw    xmm2, xmm3
    movdqu      [eax + edx - 16], xmm2
    sub         ecx, 8
    jg          convertloop
    ret
  }
}
#endif  // HAS_HALFFLOATROW_SSE2

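// A scalar sketch of the exponent-rebias trick above (illustrative only, not
// part of the original file).  Multiplying by scale * 2^-112 moves the value
// into the half-float exponent range, so the float's bit pattern shifted
// right by 13 is the half-float encoding with a truncated mantissa.
static uint16_t HalfFromUint16_Sketch(uint16_t v, float scale) {
  union {
    float f;
    uint32_t u;
  } bits;
  bits.f = v * scale * 1.9259299444e-34f;  // kExpBias = 2^-112
  return (uint16_t)(bits.u >> 13);  // same as the vector psrld by 13
}
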
#ifdef HAS_HALFFLOATROW_AVX2
__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
                                         uint16_t* dst,
                                         float scale,
                                         int width) {
  __asm {
    mov        eax, [esp + 4] /* src */
    mov        edx, [esp + 8] /* dst */
    movd       xmm4, dword ptr [esp + 12] /* scale */
    mov        ecx, [esp + 16] /* width */

    vmulss     xmm4, xmm4, kExpBias
    vbroadcastss ymm4, xmm4
    vpxor      ymm5, ymm5, ymm5
    sub        edx, eax

    // 16 pixel loop.
  convertloop:
    vmovdqu     ymm2, [eax]  // 16 shorts
    add         eax, 32
    vpunpckhwd  ymm3, ymm2, ymm5  // high 8 shorts to ints
    vpunpcklwd  ymm2, ymm2, ymm5  // low 8 shorts to ints
    vcvtdq2ps   ymm3, ymm3  // convert to floats
    vcvtdq2ps   ymm2, ymm2
    vmulps      ymm3, ymm3, ymm4  // scale to adjust exponent for 5 bit range.
    vmulps      ymm2, ymm2, ymm4
    vpsrld      ymm3, ymm3, 13  // float bits shifted into half float position
    vpsrld      ymm2, ymm2, 13
    vpackssdw   ymm2, ymm2, ymm3
    vmovdqu     [eax + edx - 32], ymm2
    sub         ecx, 16
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_HALFFLOATROW_AVX2

#ifdef HAS_HALFFLOATROW_F16C
__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
                                         uint16_t* dst,
                                         float scale,
                                         int width) {
  __asm {
    mov        eax, [esp + 4] /* src */
    mov        edx, [esp + 8] /* dst */
    vbroadcastss ymm4, [esp + 12] /* scale */
    mov        ecx, [esp + 16] /* width */
    sub        edx, eax

    // 16 pixel loop.
  convertloop:
    vpmovzxwd   ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
    vpmovzxwd   ymm3, xmmword ptr [eax + 16]  // 8 more shorts
    add         eax, 32
    vcvtdq2ps   ymm2, ymm2  // convert 8 ints to floats
    vcvtdq2ps   ymm3, ymm3
    vmulps      ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
    vmulps      ymm3, ymm3, ymm4
    vcvtps2ph   xmm2, ymm2, 3  // float convert to 8 half floats truncate
    vcvtps2ph   xmm3, ymm3, 3
    vmovdqu     [eax + edx - 32], xmm2  // eax was advanced by 32 above
    vmovdqu     [eax + edx - 32 + 16], xmm3
    sub         ecx, 16
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_HALFFLOATROW_F16C

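// An intrinsics sketch of the same conversion (illustrative only, not part of
// the original file; assumes AVX2 + F16C, <immintrin.h>, and that width is a
// multiple of 8).
static void HalfFloatRow_F16C_Sketch(const uint16_t* src,
                                     uint16_t* dst,
                                     float scale,
                                     int width) {
  __m256 vscale = _mm256_set1_ps(scale);
  for (int i = 0; i < width; i += 8) {
    __m128i s = _mm_loadu_si128((const __m128i*)(src + i));   // 8 uint16
    __m256 f = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s));  // widen, to float
    f = _mm256_mul_ps(f, vscale);                             // apply scale
    __m128i h = _mm256_cvtps_ph(f, 3);                        // truncate to half
    _mm_storeu_si128((__m128i*)(dst + i), h);
  }
}
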
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
                                             const uint8_t* table_argb,
                                             int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4] /* dst_argb */
    mov        esi, [esp + 4 + 8] /* table_argb */
    mov        ecx, [esp + 4 + 12] /* width */

    // 1 pixel loop.
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    movzx      edx, byte ptr [eax - 4 + 3]
    movzx      edx, byte ptr [esi + edx * 4 + 3]
    mov        byte ptr [eax - 4 + 3], dl
    dec        ecx
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

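// A scalar sketch of the in-place lookup above (illustrative only, not part
// of the original file): each channel indexes its own column of the
// interleaved 256-entry BGRA table.
static void ARGBColorTableRow_Sketch_C(uint8_t* dst_argb,
                                       const uint8_t* table_argb,
                                       int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}
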
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table; alpha is left unchanged.
__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
                                            const uint8_t* table_argb,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4] /* dst_argb */
    mov        esi, [esp + 4 + 8] /* table_argb */
    mov        ecx, [esp + 4 + 12] /* width */

    // 1 pixel loop.
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    dec        ecx
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB channels of ARGB pixels with luma table; alpha is copied.
__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                                   uint8_t* dst_argb,
                                                   int width,
                                                   const uint8_t* luma,
                                                   uint32_t lumacoeff) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4] /* src_argb */
    mov        edi, [esp + 8 + 8] /* dst_argb */
    mov        ecx, [esp + 8 + 12] /* width */
    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
    pshufd     xmm2, xmm2, 0
    pshufd     xmm3, xmm3, 0
    pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
    psllw      xmm4, 8
    pxor       xmm5, xmm5

    // 4 pixel loop.
  convertloop:
    movdqu     xmm0, xmmword ptr [eax]  // generate luma ptr
    pmaddubsw  xmm0, xmm3
    phaddw     xmm0, xmm0
    pand       xmm0, xmm4  // mask out low bits
    punpcklwd  xmm0, xmm5
    paddd      xmm0, xmm2  // add table base
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi], dl
    movzx      edx, byte ptr [eax + 1]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 1], dl
    movzx      edx, byte ptr [eax + 2]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 2], dl
    movzx      edx, byte ptr [eax + 3]  // copy alpha.
    mov        byte ptr [edi + 3], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 4]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 4], dl
    movzx      edx, byte ptr [eax + 5]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 5], dl
    movzx      edx, byte ptr [eax + 6]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 6], dl
    movzx      edx, byte ptr [eax + 7]  // copy alpha.
    mov        byte ptr [edi + 7], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 8]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 8], dl
    movzx      edx, byte ptr [eax + 9]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 9], dl
    movzx      edx, byte ptr [eax + 10]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 10], dl
    movzx      edx, byte ptr [eax + 11]  // copy alpha.
    mov        byte ptr [edi + 11], dl

    movd       esi, xmm0

    movzx      edx, byte ptr [eax + 12]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 12], dl
    movzx      edx, byte ptr [eax + 13]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 13], dl
    movzx      edx, byte ptr [eax + 14]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 14], dl
    movzx      edx, byte ptr [eax + 15]  // copy alpha.
    mov        byte ptr [edi + 15], dl

    lea        eax, [eax + 16]
    lea        edi, [edi + 16]
    sub        ecx, 4
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

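// A scalar sketch of the luma lookup above (illustrative only, not part of
// the original file): a weighted sum of B, G and R, masked down to a multiple
// of 256, selects one 256-byte row of the luma table, and each color channel
// is then looked up in that row.  Alpha is copied through.
static void ARGBLumaColorTableRow_Sketch_C(const uint8_t* src_argb,
                                           uint8_t* dst_argb,
                                           int width,
                                           const uint8_t* luma,
                                           uint32_t lumacoeff) {
  const uint32_t bc = lumacoeff & 0xff;
  const uint32_t gc = (lumacoeff >> 8) & 0xff;
  const uint32_t rc = (lumacoeff >> 16) & 0xff;
  for (int i = 0; i < width; ++i) {
    const uint8_t* row =
        luma +
        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0xFF00u);
    dst_argb[0] = row[src_argb[0]];  // B
    dst_argb[1] = row[src_argb[1]];  // G
    dst_argb[2] = row[src_argb[2]];  // R
    dst_argb[3] = src_argb[3];  // copy alpha
    src_argb += 4;
    dst_argb += 4;
  }
}
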
#endif  // defined(_M_X64)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))