1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 // This module is for Visual C 32/64 bit
14 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
15 !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
16
17 #if defined(_M_ARM64EC)
18 #include <intrin.h>
19 #elif defined(_M_X64)
20 #include <emmintrin.h>
21 #include <tmmintrin.h> // For _mm_maddubs_epi16
22 #endif
23
24 #ifdef __cplusplus
25 namespace libyuv {
26 extern "C" {
27 #endif
28
29 // 64 bit
30 #if defined(_M_X64)
31
32 // Read 8 UV from 444
33 #define READYUV444 \
34 xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \
35 xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
36 xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
37 u_buf += 8; \
38 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
39 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
40 y_buf += 8;
41
42 // Read 8 UV from 444, With 8 Alpha.
43 #define READYUVA444 \
44 xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \
45 xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \
46 xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
47 u_buf += 8; \
48 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
49 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
50 y_buf += 8; \
51 xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
52 a_buf += 8;
53
54 // Read 4 UV from 422, upsample to 8 UV.
55 #define READYUV422 \
56 xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
57 xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
58 xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
59 xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
60 u_buf += 4; \
61 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
62 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
63 y_buf += 8;
64
65 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
66 #define READYUVA422 \
67 xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
68 xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
69 xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
70 xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
71 u_buf += 4; \
72 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
73 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
74 y_buf += 8; \
75 xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
76 a_buf += 8;
77
78 // Convert 8 pixels: 8 UV and 8 Y.
79 #define YUVTORGB(yuvconstants) \
80 xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8((char)0x80)); \
81 xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
82 xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \
83 xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \
84 xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \
85 xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \
86 xmm0 = _mm_adds_epi16(xmm4, xmm0); \
87 xmm1 = _mm_subs_epi16(xmm4, xmm1); \
88 xmm2 = _mm_adds_epi16(xmm4, xmm2); \
89 xmm0 = _mm_srai_epi16(xmm0, 6); \
90 xmm1 = _mm_srai_epi16(xmm1, 6); \
91 xmm2 = _mm_srai_epi16(xmm2, 6); \
92 xmm0 = _mm_packus_epi16(xmm0, xmm0); \
93 xmm1 = _mm_packus_epi16(xmm1, xmm1); \
94 xmm2 = _mm_packus_epi16(xmm2, xmm2);
95
96 // Store 8 ARGB values.
97 #define STOREARGB \
98 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
99 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
100 xmm1 = _mm_loadu_si128(&xmm0); \
101 xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
102 xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
103 _mm_storeu_si128((__m128i*)dst_argb, xmm0); \
104 _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
105 dst_argb += 32;
106
107 #if defined(HAS_I422TOARGBROW_SSSE3)
I422ToARGBRow_SSSE3(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)108 void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
109 const uint8_t* u_buf,
110 const uint8_t* v_buf,
111 uint8_t* dst_argb,
112 const struct YuvConstants* yuvconstants,
113 int width) {
114 __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
115 const __m128i xmm5 = _mm_set1_epi8(-1);
116 const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
117 while (width > 0) {
118 READYUV422
119 YUVTORGB(yuvconstants)
120 STOREARGB
121 width -= 8;
122 }
123 }
124 #endif
125
126 #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
I422AlphaToARGBRow_SSSE3(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,const uint8_t * a_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)127 void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
128 const uint8_t* u_buf,
129 const uint8_t* v_buf,
130 const uint8_t* a_buf,
131 uint8_t* dst_argb,
132 const struct YuvConstants* yuvconstants,
133 int width) {
134 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
135 const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
136 while (width > 0) {
137 READYUVA422
138 YUVTORGB(yuvconstants)
139 STOREARGB
140 width -= 8;
141 }
142 }
143 #endif
144
145 #if defined(HAS_I444TOARGBROW_SSSE3)
I444ToARGBRow_SSSE3(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)146 void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
147 const uint8_t* u_buf,
148 const uint8_t* v_buf,
149 uint8_t* dst_argb,
150 const struct YuvConstants* yuvconstants,
151 int width) {
152 __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
153 const __m128i xmm5 = _mm_set1_epi8(-1);
154 const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
155 while (width > 0) {
156 READYUV444
157 YUVTORGB(yuvconstants)
158 STOREARGB
159 width -= 8;
160 }
161 }
162 #endif
163
164 #if defined(HAS_I444ALPHATOARGBROW_SSSE3)
I444AlphaToARGBRow_SSSE3(const uint8_t * y_buf,const uint8_t * u_buf,const uint8_t * v_buf,const uint8_t * a_buf,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)165 void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
166 const uint8_t* u_buf,
167 const uint8_t* v_buf,
168 const uint8_t* a_buf,
169 uint8_t* dst_argb,
170 const struct YuvConstants* yuvconstants,
171 int width) {
172 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
173 const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
174 while (width > 0) {
175 READYUVA444
176 YUVTORGB(yuvconstants)
177 STOREARGB
178 width -= 8;
179 }
180 }
181 #endif
182
183 // 32 bit
184 #else // defined(_M_X64)
185 #ifdef HAS_ARGBTOYROW_SSSE3
186
187 // Constants for ARGB.
188 static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
189 13, 65, 33, 0, 13, 65, 33, 0};
190
191 // JPeg full range.
192 static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
193 15, 75, 38, 0, 15, 75, 38, 0};
194
195 static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
196 112, -74, -38, 0, 112, -74, -38, 0};
197
198 static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
199 127, -84, -43, 0, 127, -84, -43, 0};
200
201 static const vec8 kARGBToV = {
202 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
203 };
204
205 static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
206 -20, -107, 127, 0, -20, -107, 127, 0};
207
208 // vpshufb for vphaddw + vpackuswb packed to shorts.
209 static const lvec8 kShufARGBToUV_AVX = {
210 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
211 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
212
213 // Constants for BGRA.
214 static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
215 0, 33, 65, 13, 0, 33, 65, 13};
216
217 static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
218 0, -38, -74, 112, 0, -38, -74, 112};
219
220 static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
221 0, 112, -94, -18, 0, 112, -94, -18};
222
223 // Constants for ABGR.
224 static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
225 33, 65, 13, 0, 33, 65, 13, 0};
226
227 static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
228 -38, -74, 112, 0, -38, -74, 112, 0};
229
230 static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
231 112, -94, -18, 0, 112, -94, -18, 0};
232
233 // Constants for RGBA.
234 static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
235 0, 13, 65, 33, 0, 13, 65, 33};
236
237 static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
238 0, 112, -74, -38, 0, 112, -74, -38};
239
240 static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
241 0, -18, -94, 112, 0, -18, -94, 112};
242
243 static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
244 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
245
246 // 7 bit fixed point 0.5.
247 static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
248
249 // 8 bit fixed point 0.5, for bias of UV.
250 static const ulvec8 kBiasUV128 = {
251 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
252 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
253 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
254
255 // Shuffle table for converting RGB24 to ARGB.
256 static const uvec8 kShuffleMaskRGB24ToARGB = {
257 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
258
259 // Shuffle table for converting RAW to ARGB.
260 static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
261 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
262
263 // Shuffle table for converting RAW to RGB24. First 8.
264 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
265 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
266 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
267
268 // Shuffle table for converting RAW to RGB24. Middle 8.
269 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
270 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
271 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
272
273 // Shuffle table for converting RAW to RGB24. Last 8.
274 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
275 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
276 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
277
278 // Shuffle table for converting ARGB to RGB24.
279 static const uvec8 kShuffleMaskARGBToRGB24 = {
280 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
281
282 // Shuffle table for converting ARGB to RAW.
283 static const uvec8 kShuffleMaskARGBToRAW = {
284 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
285
286 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
287 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
288 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
289
290 // YUY2 shuf 16 Y to 32 Y.
291 static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
292 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
293 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
294
295 // YUY2 shuf 8 UV to 16 UV.
296 static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
297 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
298 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
299
300 // UYVY shuf 16 Y to 32 Y.
301 static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
302 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
303 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
304
305 // UYVY shuf 8 UV to 16 UV.
306 static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
307 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
308 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
309
310 // NV21 shuf 8 VU to 16 UV.
311 static const lvec8 kShuffleNV21 = {
312 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
313 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
314 };
315
316 // Duplicates gray value 3 times and fills in alpha opaque.
317 __declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
318 uint8_t* dst_argb,
319 int width) {
320 __asm {
321 mov eax, [esp + 4] // src_y
322 mov edx, [esp + 8] // dst_argb
323 mov ecx, [esp + 12] // width
324 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
325 pslld xmm5, 24
326
327 convertloop:
328 movq xmm0, qword ptr [eax]
329 lea eax, [eax + 8]
330 punpcklbw xmm0, xmm0
331 movdqa xmm1, xmm0
332 punpcklwd xmm0, xmm0
333 punpckhwd xmm1, xmm1
334 por xmm0, xmm5
335 por xmm1, xmm5
336 movdqu [edx], xmm0
337 movdqu [edx + 16], xmm1
338 lea edx, [edx + 32]
339 sub ecx, 8
340 jg convertloop
341 ret
342 }
343 }
344
345 #ifdef HAS_J400TOARGBROW_AVX2
346 // Duplicates gray value 3 times and fills in alpha opaque.
347 __declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
348 uint8_t* dst_argb,
349 int width) {
350 __asm {
351 mov eax, [esp + 4] // src_y
352 mov edx, [esp + 8] // dst_argb
353 mov ecx, [esp + 12] // width
354 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
355 vpslld ymm5, ymm5, 24
356
357 convertloop:
358 vmovdqu xmm0, [eax]
359 lea eax, [eax + 16]
360 vpermq ymm0, ymm0, 0xd8
361 vpunpcklbw ymm0, ymm0, ymm0
362 vpermq ymm0, ymm0, 0xd8
363 vpunpckhwd ymm1, ymm0, ymm0
364 vpunpcklwd ymm0, ymm0, ymm0
365 vpor ymm0, ymm0, ymm5
366 vpor ymm1, ymm1, ymm5
367 vmovdqu [edx], ymm0
368 vmovdqu [edx + 32], ymm1
369 lea edx, [edx + 64]
370 sub ecx, 16
371 jg convertloop
372 vzeroupper
373 ret
374 }
375 }
376 #endif // HAS_J400TOARGBROW_AVX2
377
378 __declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
379 uint8_t* dst_argb,
380 int width) {
381 __asm {
382 mov eax, [esp + 4] // src_rgb24
383 mov edx, [esp + 8] // dst_argb
384 mov ecx, [esp + 12] // width
385 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
386 pslld xmm5, 24
387 movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
388
389 convertloop:
390 movdqu xmm0, [eax]
391 movdqu xmm1, [eax + 16]
392 movdqu xmm3, [eax + 32]
393 lea eax, [eax + 48]
394 movdqa xmm2, xmm3
395 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
396 pshufb xmm2, xmm4
397 por xmm2, xmm5
398 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
399 pshufb xmm0, xmm4
400 movdqu [edx + 32], xmm2
401 por xmm0, xmm5
402 pshufb xmm1, xmm4
403 movdqu [edx], xmm0
404 por xmm1, xmm5
405 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
406 pshufb xmm3, xmm4
407 movdqu [edx + 16], xmm1
408 por xmm3, xmm5
409 movdqu [edx + 48], xmm3
410 lea edx, [edx + 64]
411 sub ecx, 16
412 jg convertloop
413 ret
414 }
415 }
416
417 __declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
418 uint8_t* dst_argb,
419 int width) {
420 __asm {
421 mov eax, [esp + 4] // src_raw
422 mov edx, [esp + 8] // dst_argb
423 mov ecx, [esp + 12] // width
424 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
425 pslld xmm5, 24
426 movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
427
428 convertloop:
429 movdqu xmm0, [eax]
430 movdqu xmm1, [eax + 16]
431 movdqu xmm3, [eax + 32]
432 lea eax, [eax + 48]
433 movdqa xmm2, xmm3
434 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
435 pshufb xmm2, xmm4
436 por xmm2, xmm5
437 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
438 pshufb xmm0, xmm4
439 movdqu [edx + 32], xmm2
440 por xmm0, xmm5
441 pshufb xmm1, xmm4
442 movdqu [edx], xmm0
443 por xmm1, xmm5
444 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
445 pshufb xmm3, xmm4
446 movdqu [edx + 16], xmm1
447 por xmm3, xmm5
448 movdqu [edx + 48], xmm3
449 lea edx, [edx + 64]
450 sub ecx, 16
451 jg convertloop
452 ret
453 }
454 }
455
456 __declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
457 uint8_t* dst_rgb24,
458 int width) {
459 __asm {
460 mov eax, [esp + 4] // src_raw
461 mov edx, [esp + 8] // dst_rgb24
462 mov ecx, [esp + 12] // width
463 movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
464 movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
465 movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
466
467 convertloop:
468 movdqu xmm0, [eax]
469 movdqu xmm1, [eax + 4]
470 movdqu xmm2, [eax + 8]
471 lea eax, [eax + 24]
472 pshufb xmm0, xmm3
473 pshufb xmm1, xmm4
474 pshufb xmm2, xmm5
475 movq qword ptr [edx], xmm0
476 movq qword ptr [edx + 8], xmm1
477 movq qword ptr [edx + 16], xmm2
478 lea edx, [edx + 24]
479 sub ecx, 8
480 jg convertloop
481 ret
482 }
483 }
484
485 // pmul method to replicate bits.
486 // Math to replicate bits:
487 // (v << 8) | (v << 3)
488 // v * 256 + v * 8
489 // v * (256 + 8)
490 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
491 // 20 instructions.
492 __declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
493 uint8_t* dst_argb,
494 int width) {
495 __asm {
496 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
497 movd xmm5, eax
498 pshufd xmm5, xmm5, 0
499 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
500 movd xmm6, eax
501 pshufd xmm6, xmm6, 0
502 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
503 psllw xmm3, 11
504 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
505 psllw xmm4, 10
506 psrlw xmm4, 5
507 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
508 psllw xmm7, 8
509
510 mov eax, [esp + 4] // src_rgb565
511 mov edx, [esp + 8] // dst_argb
512 mov ecx, [esp + 12] // width
513 sub edx, eax
514 sub edx, eax
515
516 convertloop:
517 movdqu xmm0, [eax] // fetch 8 pixels of bgr565
518 movdqa xmm1, xmm0
519 movdqa xmm2, xmm0
520 pand xmm1, xmm3 // R in upper 5 bits
521 psllw xmm2, 11 // B in upper 5 bits
522 pmulhuw xmm1, xmm5 // * (256 + 8)
523 pmulhuw xmm2, xmm5 // * (256 + 8)
524 psllw xmm1, 8
525 por xmm1, xmm2 // RB
526 pand xmm0, xmm4 // G in middle 6 bits
527 pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
528 por xmm0, xmm7 // AG
529 movdqa xmm2, xmm1
530 punpcklbw xmm1, xmm0
531 punpckhbw xmm2, xmm0
532 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
533 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
534 lea eax, [eax + 16]
535 sub ecx, 8
536 jg convertloop
537 ret
538 }
539 }
540
541 #ifdef HAS_RGB565TOARGBROW_AVX2
542 // pmul method to replicate bits.
543 // Math to replicate bits:
544 // (v << 8) | (v << 3)
545 // v * 256 + v * 8
546 // v * (256 + 8)
547 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
548 __declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
549 uint8_t* dst_argb,
550 int width) {
551 __asm {
552 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
553 vmovd xmm5, eax
554 vbroadcastss ymm5, xmm5
555 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
556 vmovd xmm6, eax
557 vbroadcastss ymm6, xmm6
558 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
559 vpsllw ymm3, ymm3, 11
560 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
561 vpsllw ymm4, ymm4, 10
562 vpsrlw ymm4, ymm4, 5
563 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
564 vpsllw ymm7, ymm7, 8
565
566 mov eax, [esp + 4] // src_rgb565
567 mov edx, [esp + 8] // dst_argb
568 mov ecx, [esp + 12] // width
569 sub edx, eax
570 sub edx, eax
571
572 convertloop:
573 vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
574 vpand ymm1, ymm0, ymm3 // R in upper 5 bits
575 vpsllw ymm2, ymm0, 11 // B in upper 5 bits
576 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
577 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
578 vpsllw ymm1, ymm1, 8
579 vpor ymm1, ymm1, ymm2 // RB
580 vpand ymm0, ymm0, ymm4 // G in middle 6 bits
581 vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
582 vpor ymm0, ymm0, ymm7 // AG
583 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
584 vpermq ymm1, ymm1, 0xd8
585 vpunpckhbw ymm2, ymm1, ymm0
586 vpunpcklbw ymm1, ymm1, ymm0
587 vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB
588 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB
589 lea eax, [eax + 32]
590 sub ecx, 16
591 jg convertloop
592 vzeroupper
593 ret
594 }
595 }
596 #endif // HAS_RGB565TOARGBROW_AVX2
597
598 #ifdef HAS_ARGB1555TOARGBROW_AVX2
599 __declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
600 uint8_t* dst_argb,
601 int width) {
602 __asm {
603 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
604 vmovd xmm5, eax
605 vbroadcastss ymm5, xmm5
606 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
607 vmovd xmm6, eax
608 vbroadcastss ymm6, xmm6
609 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
610 vpsllw ymm3, ymm3, 11
611 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
612 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
613 vpsllw ymm7, ymm7, 8
614
615 mov eax, [esp + 4] // src_argb1555
616 mov edx, [esp + 8] // dst_argb
617 mov ecx, [esp + 12] // width
618 sub edx, eax
619 sub edx, eax
620
621 convertloop:
622 vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
623 vpsllw ymm1, ymm0, 1 // R in upper 5 bits
624 vpsllw ymm2, ymm0, 11 // B in upper 5 bits
625 vpand ymm1, ymm1, ymm3
626 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
627 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
628 vpsllw ymm1, ymm1, 8
629 vpor ymm1, ymm1, ymm2 // RB
630 vpsraw ymm2, ymm0, 8 // A
631 vpand ymm0, ymm0, ymm4 // G in middle 5 bits
632 vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
633 vpand ymm2, ymm2, ymm7
634 vpor ymm0, ymm0, ymm2 // AG
635 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
636 vpermq ymm1, ymm1, 0xd8
637 vpunpckhbw ymm2, ymm1, ymm0
638 vpunpcklbw ymm1, ymm1, ymm0
639 vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
640 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
641 lea eax, [eax + 32]
642 sub ecx, 16
643 jg convertloop
644 vzeroupper
645 ret
646 }
647 }
648 #endif // HAS_ARGB1555TOARGBROW_AVX2
649
650 #ifdef HAS_ARGB4444TOARGBROW_AVX2
651 __declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
652 uint8_t* dst_argb,
653 int width) {
654 __asm {
655 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
656 vmovd xmm4, eax
657 vbroadcastss ymm4, xmm4
658 vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
659 mov eax, [esp + 4] // src_argb4444
660 mov edx, [esp + 8] // dst_argb
661 mov ecx, [esp + 12] // width
662 sub edx, eax
663 sub edx, eax
664
665 convertloop:
666 vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
667 vpand ymm2, ymm0, ymm5 // mask high nibbles
668 vpand ymm0, ymm0, ymm4 // mask low nibbles
669 vpsrlw ymm3, ymm2, 4
670 vpsllw ymm1, ymm0, 4
671 vpor ymm2, ymm2, ymm3
672 vpor ymm0, ymm0, ymm1
673 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
674 vpermq ymm2, ymm2, 0xd8
675 vpunpckhbw ymm1, ymm0, ymm2
676 vpunpcklbw ymm0, ymm0, ymm2
677 vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
678 vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
679 lea eax, [eax + 32]
680 sub ecx, 16
681 jg convertloop
682 vzeroupper
683 ret
684 }
685 }
686 #endif // HAS_ARGB4444TOARGBROW_AVX2
687
688 // 24 instructions
689 __declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
690 uint8_t* dst_argb,
691 int width) {
692 __asm {
693 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
694 movd xmm5, eax
695 pshufd xmm5, xmm5, 0
696 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
697 movd xmm6, eax
698 pshufd xmm6, xmm6, 0
699 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
700 psllw xmm3, 11
701 movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
702 psrlw xmm4, 6
703 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
704 psllw xmm7, 8
705
706 mov eax, [esp + 4] // src_argb1555
707 mov edx, [esp + 8] // dst_argb
708 mov ecx, [esp + 12] // width
709 sub edx, eax
710 sub edx, eax
711
712 convertloop:
713 movdqu xmm0, [eax] // fetch 8 pixels of 1555
714 movdqa xmm1, xmm0
715 movdqa xmm2, xmm0
716 psllw xmm1, 1 // R in upper 5 bits
717 psllw xmm2, 11 // B in upper 5 bits
718 pand xmm1, xmm3
719 pmulhuw xmm2, xmm5 // * (256 + 8)
720 pmulhuw xmm1, xmm5 // * (256 + 8)
721 psllw xmm1, 8
722 por xmm1, xmm2 // RB
723 movdqa xmm2, xmm0
724 pand xmm0, xmm4 // G in middle 5 bits
725 psraw xmm2, 8 // A
726 pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
727 pand xmm2, xmm7
728 por xmm0, xmm2 // AG
729 movdqa xmm2, xmm1
730 punpcklbw xmm1, xmm0
731 punpckhbw xmm2, xmm0
732 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
733 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
734 lea eax, [eax + 16]
735 sub ecx, 8
736 jg convertloop
737 ret
738 }
739 }
740
741 // 18 instructions.
742 __declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
743 uint8_t* dst_argb,
744 int width) {
745 __asm {
746 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
747 movd xmm4, eax
748 pshufd xmm4, xmm4, 0
749 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
750 pslld xmm5, 4
751 mov eax, [esp + 4] // src_argb4444
752 mov edx, [esp + 8] // dst_argb
753 mov ecx, [esp + 12] // width
754 sub edx, eax
755 sub edx, eax
756
757 convertloop:
758 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
759 movdqa xmm2, xmm0
760 pand xmm0, xmm4 // mask low nibbles
761 pand xmm2, xmm5 // mask high nibbles
762 movdqa xmm1, xmm0
763 movdqa xmm3, xmm2
764 psllw xmm1, 4
765 psrlw xmm3, 4
766 por xmm0, xmm1
767 por xmm2, xmm3
768 movdqa xmm1, xmm0
769 punpcklbw xmm0, xmm2
770 punpckhbw xmm1, xmm2
771 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
772 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
773 lea eax, [eax + 16]
774 sub ecx, 8
775 jg convertloop
776 ret
777 }
778 }
779
780 __declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
781 uint8_t* dst_rgb,
782 int width) {
783 __asm {
784 mov eax, [esp + 4] // src_argb
785 mov edx, [esp + 8] // dst_rgb
786 mov ecx, [esp + 12] // width
787 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
788
789 convertloop:
790 movdqu xmm0, [eax] // fetch 16 pixels of argb
791 movdqu xmm1, [eax + 16]
792 movdqu xmm2, [eax + 32]
793 movdqu xmm3, [eax + 48]
794 lea eax, [eax + 64]
795 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
796 pshufb xmm1, xmm6
797 pshufb xmm2, xmm6
798 pshufb xmm3, xmm6
799 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
800 psrldq xmm1, 4 // 8 bytes from 1
801 pslldq xmm4, 12 // 4 bytes from 1 for 0
802 movdqa xmm5, xmm2 // 8 bytes from 2 for 1
803 por xmm0, xmm4 // 4 bytes from 1 for 0
804 pslldq xmm5, 8 // 8 bytes from 2 for 1
805 movdqu [edx], xmm0 // store 0
806 por xmm1, xmm5 // 8 bytes from 2 for 1
807 psrldq xmm2, 8 // 4 bytes from 2
808 pslldq xmm3, 4 // 12 bytes from 3 for 2
809 por xmm2, xmm3 // 12 bytes from 3 for 2
810 movdqu [edx + 16], xmm1 // store 1
811 movdqu [edx + 32], xmm2 // store 2
812 lea edx, [edx + 48]
813 sub ecx, 16
814 jg convertloop
815 ret
816 }
817 }
818
819 __declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
820 uint8_t* dst_rgb,
821 int width) {
822 __asm {
823 mov eax, [esp + 4] // src_argb
824 mov edx, [esp + 8] // dst_rgb
825 mov ecx, [esp + 12] // width
826 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
827
828 convertloop:
829 movdqu xmm0, [eax] // fetch 16 pixels of argb
830 movdqu xmm1, [eax + 16]
831 movdqu xmm2, [eax + 32]
832 movdqu xmm3, [eax + 48]
833 lea eax, [eax + 64]
834 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
835 pshufb xmm1, xmm6
836 pshufb xmm2, xmm6
837 pshufb xmm3, xmm6
838 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
839 psrldq xmm1, 4 // 8 bytes from 1
840 pslldq xmm4, 12 // 4 bytes from 1 for 0
841 movdqa xmm5, xmm2 // 8 bytes from 2 for 1
842 por xmm0, xmm4 // 4 bytes from 1 for 0
843 pslldq xmm5, 8 // 8 bytes from 2 for 1
844 movdqu [edx], xmm0 // store 0
845 por xmm1, xmm5 // 8 bytes from 2 for 1
846 psrldq xmm2, 8 // 4 bytes from 2
847 pslldq xmm3, 4 // 12 bytes from 3 for 2
848 por xmm2, xmm3 // 12 bytes from 3 for 2
849 movdqu [edx + 16], xmm1 // store 1
850 movdqu [edx + 32], xmm2 // store 2
851 lea edx, [edx + 48]
852 sub ecx, 16
853 jg convertloop
854 ret
855 }
856 }
857
858 __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
859 uint8_t* dst_rgb,
860 int width) {
861 __asm {
862 mov eax, [esp + 4] // src_argb
863 mov edx, [esp + 8] // dst_rgb
864 mov ecx, [esp + 12] // width
865 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
866 psrld xmm3, 27
867 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
868 psrld xmm4, 26
869 pslld xmm4, 5
870 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
871 pslld xmm5, 11
872
873 convertloop:
874 movdqu xmm0, [eax] // fetch 4 pixels of argb
875 movdqa xmm1, xmm0 // B
876 movdqa xmm2, xmm0 // G
877 pslld xmm0, 8 // R
878 psrld xmm1, 3 // B
879 psrld xmm2, 5 // G
880 psrad xmm0, 16 // R
881 pand xmm1, xmm3 // B
882 pand xmm2, xmm4 // G
883 pand xmm0, xmm5 // R
884 por xmm1, xmm2 // BG
885 por xmm0, xmm1 // BGR
886 packssdw xmm0, xmm0
887 lea eax, [eax + 16]
888 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
889 lea edx, [edx + 8]
890 sub ecx, 4
891 jg convertloop
892 ret
893 }
894 }
895
896 __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
897 uint8_t* dst_rgb,
898 uint32_t dither4,
899 int width) {
900 __asm {
901
902 mov eax, [esp + 4] // src_argb
903 mov edx, [esp + 8] // dst_rgb
904 movd xmm6, [esp + 12] // dither4
905 mov ecx, [esp + 16] // width
906 punpcklbw xmm6, xmm6 // make dither 16 bytes
907 movdqa xmm7, xmm6
908 punpcklwd xmm6, xmm6
909 punpckhwd xmm7, xmm7
910 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
911 psrld xmm3, 27
912 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
913 psrld xmm4, 26
914 pslld xmm4, 5
915 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
916 pslld xmm5, 11
917
918 convertloop:
919 movdqu xmm0, [eax] // fetch 4 pixels of argb
920 paddusb xmm0, xmm6 // add dither
921 movdqa xmm1, xmm0 // B
922 movdqa xmm2, xmm0 // G
923 pslld xmm0, 8 // R
924 psrld xmm1, 3 // B
925 psrld xmm2, 5 // G
926 psrad xmm0, 16 // R
927 pand xmm1, xmm3 // B
928 pand xmm2, xmm4 // G
929 pand xmm0, xmm5 // R
930 por xmm1, xmm2 // BG
931 por xmm0, xmm1 // BGR
932 packssdw xmm0, xmm0
933 lea eax, [eax + 16]
934 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
935 lea edx, [edx + 8]
936 sub ecx, 4
937 jg convertloop
938 ret
939 }
940 }
941
942 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
943 __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
944 uint8_t* dst_rgb,
945 uint32_t dither4,
946 int width) {
947 __asm {
948 mov eax, [esp + 4] // src_argb
949 mov edx, [esp + 8] // dst_rgb
950 vbroadcastss xmm6, [esp + 12] // dither4
951 mov ecx, [esp + 16] // width
952 vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
953 vpermq ymm6, ymm6, 0xd8
954 vpunpcklwd ymm6, ymm6, ymm6
955 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
956 vpsrld ymm3, ymm3, 27
957 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
958 vpsrld ymm4, ymm4, 26
959 vpslld ymm4, ymm4, 5
960 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
961
962 convertloop:
963 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
964 vpaddusb ymm0, ymm0, ymm6 // add dither
965 vpsrld ymm2, ymm0, 5 // G
966 vpsrld ymm1, ymm0, 3 // B
967 vpsrld ymm0, ymm0, 8 // R
968 vpand ymm2, ymm2, ymm4 // G
969 vpand ymm1, ymm1, ymm3 // B
970 vpand ymm0, ymm0, ymm5 // R
971 vpor ymm1, ymm1, ymm2 // BG
972 vpor ymm0, ymm0, ymm1 // BGR
973 vpackusdw ymm0, ymm0, ymm0
974 vpermq ymm0, ymm0, 0xd8
975 lea eax, [eax + 32]
976 vmovdqu [edx], xmm0 // store 8 pixels of RGB565
977 lea edx, [edx + 16]
978 sub ecx, 8
979 jg convertloop
980 vzeroupper
981 ret
982 }
983 }
984 #endif // HAS_ARGBTORGB565DITHERROW_AVX2
985
986 // TODO(fbarchard): Improve sign extension/packing.
987 __declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
988 uint8_t* dst_rgb,
989 int width) {
990 __asm {
991 mov eax, [esp + 4] // src_argb
992 mov edx, [esp + 8] // dst_rgb
993 mov ecx, [esp + 12] // width
994 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
995 psrld xmm4, 27
996 movdqa xmm5, xmm4 // generate mask 0x000003e0
997 pslld xmm5, 5
998 movdqa xmm6, xmm4 // generate mask 0x00007c00
999 pslld xmm6, 10
1000 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
1001 pslld xmm7, 15
1002
1003 convertloop:
1004 movdqu xmm0, [eax] // fetch 4 pixels of argb
1005 movdqa xmm1, xmm0 // B
1006 movdqa xmm2, xmm0 // G
1007 movdqa xmm3, xmm0 // R
1008 psrad xmm0, 16 // A
1009 psrld xmm1, 3 // B
1010 psrld xmm2, 6 // G
1011 psrld xmm3, 9 // R
1012 pand xmm0, xmm7 // A
1013 pand xmm1, xmm4 // B
1014 pand xmm2, xmm5 // G
1015 pand xmm3, xmm6 // R
1016 por xmm0, xmm1 // BA
1017 por xmm2, xmm3 // GR
1018 por xmm0, xmm2 // BGRA
1019 packssdw xmm0, xmm0
1020 lea eax, [eax + 16]
1021 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
1022 lea edx, [edx + 8]
1023 sub ecx, 4
1024 jg convertloop
1025 ret
1026 }
1027 }
1028
1029 __declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
1030 uint8_t* dst_rgb,
1031 int width) {
1032 __asm {
1033 mov eax, [esp + 4] // src_argb
1034 mov edx, [esp + 8] // dst_rgb
1035 mov ecx, [esp + 12] // width
1036 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
1037 psllw xmm4, 12
1038 movdqa xmm3, xmm4 // generate mask 0x00f000f0
1039 psrlw xmm3, 8
1040
1041 convertloop:
1042 movdqu xmm0, [eax] // fetch 4 pixels of argb
1043 movdqa xmm1, xmm0
1044 pand xmm0, xmm3 // low nibble
1045 pand xmm1, xmm4 // high nibble
1046 psrld xmm0, 4
1047 psrld xmm1, 8
1048 por xmm0, xmm1
1049 packuswb xmm0, xmm0
1050 lea eax, [eax + 16]
1051 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
1052 lea edx, [edx + 8]
1053 sub ecx, 4
1054 jg convertloop
1055 ret
1056 }
1057 }
1058
1059 #ifdef HAS_ARGBTORGB565ROW_AVX2
1060 __declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
1061 uint8_t* dst_rgb,
1062 int width) {
1063 __asm {
1064 mov eax, [esp + 4] // src_argb
1065 mov edx, [esp + 8] // dst_rgb
1066 mov ecx, [esp + 12] // width
1067 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
1068 vpsrld ymm3, ymm3, 27
1069 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
1070 vpsrld ymm4, ymm4, 26
1071 vpslld ymm4, ymm4, 5
1072 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
1073
1074 convertloop:
1075 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1076 vpsrld ymm2, ymm0, 5 // G
1077 vpsrld ymm1, ymm0, 3 // B
1078 vpsrld ymm0, ymm0, 8 // R
1079 vpand ymm2, ymm2, ymm4 // G
1080 vpand ymm1, ymm1, ymm3 // B
1081 vpand ymm0, ymm0, ymm5 // R
1082 vpor ymm1, ymm1, ymm2 // BG
1083 vpor ymm0, ymm0, ymm1 // BGR
1084 vpackusdw ymm0, ymm0, ymm0
1085 vpermq ymm0, ymm0, 0xd8
1086 lea eax, [eax + 32]
1087 vmovdqu [edx], xmm0 // store 8 pixels of RGB565
1088 lea edx, [edx + 16]
1089 sub ecx, 8
1090 jg convertloop
1091 vzeroupper
1092 ret
1093 }
1094 }
1095 #endif // HAS_ARGBTORGB565ROW_AVX2
1096
1097 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
1098 __declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
1099 uint8_t* dst_rgb,
1100 int width) {
1101 __asm {
1102 mov eax, [esp + 4] // src_argb
1103 mov edx, [esp + 8] // dst_rgb
1104 mov ecx, [esp + 12] // width
1105 vpcmpeqb ymm4, ymm4, ymm4
1106 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
1107 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
1108 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
1109 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
1110 vpslld ymm7, ymm7, 15
1111
1112 convertloop:
1113 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1114 vpsrld ymm3, ymm0, 9 // R
1115 vpsrld ymm2, ymm0, 6 // G
1116 vpsrld ymm1, ymm0, 3 // B
1117 vpsrad ymm0, ymm0, 16 // A
1118 vpand ymm3, ymm3, ymm6 // R
1119 vpand ymm2, ymm2, ymm5 // G
1120 vpand ymm1, ymm1, ymm4 // B
1121 vpand ymm0, ymm0, ymm7 // A
1122 vpor ymm0, ymm0, ymm1 // BA
1123 vpor ymm2, ymm2, ymm3 // GR
1124 vpor ymm0, ymm0, ymm2 // BGRA
1125 vpackssdw ymm0, ymm0, ymm0
1126 vpermq ymm0, ymm0, 0xd8
1127 lea eax, [eax + 32]
1128 vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
1129 lea edx, [edx + 16]
1130 sub ecx, 8
1131 jg convertloop
1132 vzeroupper
1133 ret
1134 }
1135 }
1136 #endif // HAS_ARGBTOARGB1555ROW_AVX2
1137
1138 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
1139 __declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
1140 uint8_t* dst_rgb,
1141 int width) {
1142 __asm {
1143 mov eax, [esp + 4] // src_argb
1144 mov edx, [esp + 8] // dst_rgb
1145 mov ecx, [esp + 12] // width
1146 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
1147 vpsllw ymm4, ymm4, 12
1148 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
1149
1150 convertloop:
1151 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1152 vpand ymm1, ymm0, ymm4 // high nibble
1153 vpand ymm0, ymm0, ymm3 // low nibble
1154 vpsrld ymm1, ymm1, 8
1155 vpsrld ymm0, ymm0, 4
1156 vpor ymm0, ymm0, ymm1
1157 vpackuswb ymm0, ymm0, ymm0
1158 vpermq ymm0, ymm0, 0xd8
1159 lea eax, [eax + 32]
1160 vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
1161 lea edx, [edx + 16]
1162 sub ecx, 8
1163 jg convertloop
1164 vzeroupper
1165 ret
1166 }
1167 }
1168 #endif // HAS_ARGBTOARGB4444ROW_AVX2
1169
1170 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
1171 __declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
1172 uint8_t* dst_y,
1173 int width) {
1174 __asm {
1175 mov eax, [esp + 4] /* src_argb */
1176 mov edx, [esp + 8] /* dst_y */
1177 mov ecx, [esp + 12] /* width */
1178 movdqa xmm4, xmmword ptr kARGBToY
1179 movdqa xmm5, xmmword ptr kAddY16
1180
1181 convertloop:
1182 movdqu xmm0, [eax]
1183 movdqu xmm1, [eax + 16]
1184 movdqu xmm2, [eax + 32]
1185 movdqu xmm3, [eax + 48]
1186 pmaddubsw xmm0, xmm4
1187 pmaddubsw xmm1, xmm4
1188 pmaddubsw xmm2, xmm4
1189 pmaddubsw xmm3, xmm4
1190 lea eax, [eax + 64]
1191 phaddw xmm0, xmm1
1192 phaddw xmm2, xmm3
1193 psrlw xmm0, 7
1194 psrlw xmm2, 7
1195 packuswb xmm0, xmm2
1196 paddb xmm0, xmm5
1197 movdqu [edx], xmm0
1198 lea edx, [edx + 16]
1199 sub ecx, 16
1200 jg convertloop
1201 ret
1202 }
1203 }
1204
1205 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
1206 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
1207 __declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
1208 uint8_t* dst_y,
1209 int width) {
1210 __asm {
1211 mov eax, [esp + 4] /* src_argb */
1212 mov edx, [esp + 8] /* dst_y */
1213 mov ecx, [esp + 12] /* width */
1214 movdqa xmm4, xmmword ptr kARGBToYJ
1215 movdqa xmm5, xmmword ptr kAddYJ64
1216
1217 convertloop:
1218 movdqu xmm0, [eax]
1219 movdqu xmm1, [eax + 16]
1220 movdqu xmm2, [eax + 32]
1221 movdqu xmm3, [eax + 48]
1222 pmaddubsw xmm0, xmm4
1223 pmaddubsw xmm1, xmm4
1224 pmaddubsw xmm2, xmm4
1225 pmaddubsw xmm3, xmm4
1226 lea eax, [eax + 64]
1227 phaddw xmm0, xmm1
1228 phaddw xmm2, xmm3
1229 paddw xmm0, xmm5 // Add .5 for rounding.
1230 paddw xmm2, xmm5
1231 psrlw xmm0, 7
1232 psrlw xmm2, 7
1233 packuswb xmm0, xmm2
1234 movdqu [edx], xmm0
1235 lea edx, [edx + 16]
1236 sub ecx, 16
1237 jg convertloop
1238 ret
1239 }
1240 }
1241
1242 #ifdef HAS_ARGBTOYROW_AVX2
1243 // vpermd for vphaddw + vpackuswb vpermd.
1244 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
1245
1246 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1247 __declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
1248 uint8_t* dst_y,
1249 int width) {
1250 __asm {
1251 mov eax, [esp + 4] /* src_argb */
1252 mov edx, [esp + 8] /* dst_y */
1253 mov ecx, [esp + 12] /* width */
1254 vbroadcastf128 ymm4, xmmword ptr kARGBToY
1255 vbroadcastf128 ymm5, xmmword ptr kAddY16
1256 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
1257
1258 convertloop:
1259 vmovdqu ymm0, [eax]
1260 vmovdqu ymm1, [eax + 32]
1261 vmovdqu ymm2, [eax + 64]
1262 vmovdqu ymm3, [eax + 96]
1263 vpmaddubsw ymm0, ymm0, ymm4
1264 vpmaddubsw ymm1, ymm1, ymm4
1265 vpmaddubsw ymm2, ymm2, ymm4
1266 vpmaddubsw ymm3, ymm3, ymm4
1267 lea eax, [eax + 128]
1268 vphaddw ymm0, ymm0, ymm1 // mutates.
1269 vphaddw ymm2, ymm2, ymm3
1270 vpsrlw ymm0, ymm0, 7
1271 vpsrlw ymm2, ymm2, 7
1272 vpackuswb ymm0, ymm0, ymm2 // mutates.
1273 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
1274 vpaddb ymm0, ymm0, ymm5 // add 16 for Y
1275 vmovdqu [edx], ymm0
1276 lea edx, [edx + 32]
1277 sub ecx, 32
1278 jg convertloop
1279 vzeroupper
1280 ret
1281 }
1282 }
1283 #endif // HAS_ARGBTOYROW_AVX2
1284
1285 #ifdef HAS_ARGBTOYJROW_AVX2
1286 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1287 __declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
1288 uint8_t* dst_y,
1289 int width) {
1290 __asm {
1291 mov eax, [esp + 4] /* src_argb */
1292 mov edx, [esp + 8] /* dst_y */
1293 mov ecx, [esp + 12] /* width */
1294 vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
1295 vbroadcastf128 ymm5, xmmword ptr kAddYJ64
1296 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
1297
1298 convertloop:
1299 vmovdqu ymm0, [eax]
1300 vmovdqu ymm1, [eax + 32]
1301 vmovdqu ymm2, [eax + 64]
1302 vmovdqu ymm3, [eax + 96]
1303 vpmaddubsw ymm0, ymm0, ymm4
1304 vpmaddubsw ymm1, ymm1, ymm4
1305 vpmaddubsw ymm2, ymm2, ymm4
1306 vpmaddubsw ymm3, ymm3, ymm4
1307 lea eax, [eax + 128]
1308 vphaddw ymm0, ymm0, ymm1 // mutates.
1309 vphaddw ymm2, ymm2, ymm3
1310 vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
1311 vpaddw ymm2, ymm2, ymm5
1312 vpsrlw ymm0, ymm0, 7
1313 vpsrlw ymm2, ymm2, 7
1314 vpackuswb ymm0, ymm0, ymm2 // mutates.
1315 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
1316 vmovdqu [edx], ymm0
1317 lea edx, [edx + 32]
1318 sub ecx, 32
1319 jg convertloop
1320
1321 vzeroupper
1322 ret
1323 }
1324 }
1325 #endif // HAS_ARGBTOYJROW_AVX2
1326
1327 __declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
1328 uint8_t* dst_y,
1329 int width) {
1330 __asm {
1331 mov eax, [esp + 4] /* src_argb */
1332 mov edx, [esp + 8] /* dst_y */
1333 mov ecx, [esp + 12] /* width */
1334 movdqa xmm4, xmmword ptr kBGRAToY
1335 movdqa xmm5, xmmword ptr kAddY16
1336
1337 convertloop:
1338 movdqu xmm0, [eax]
1339 movdqu xmm1, [eax + 16]
1340 movdqu xmm2, [eax + 32]
1341 movdqu xmm3, [eax + 48]
1342 pmaddubsw xmm0, xmm4
1343 pmaddubsw xmm1, xmm4
1344 pmaddubsw xmm2, xmm4
1345 pmaddubsw xmm3, xmm4
1346 lea eax, [eax + 64]
1347 phaddw xmm0, xmm1
1348 phaddw xmm2, xmm3
1349 psrlw xmm0, 7
1350 psrlw xmm2, 7
1351 packuswb xmm0, xmm2
1352 paddb xmm0, xmm5
1353 movdqu [edx], xmm0
1354 lea edx, [edx + 16]
1355 sub ecx, 16
1356 jg convertloop
1357 ret
1358 }
1359 }
1360
1361 __declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
1362 uint8_t* dst_y,
1363 int width) {
1364 __asm {
1365 mov eax, [esp + 4] /* src_argb */
1366 mov edx, [esp + 8] /* dst_y */
1367 mov ecx, [esp + 12] /* width */
1368 movdqa xmm4, xmmword ptr kABGRToY
1369 movdqa xmm5, xmmword ptr kAddY16
1370
1371 convertloop:
1372 movdqu xmm0, [eax]
1373 movdqu xmm1, [eax + 16]
1374 movdqu xmm2, [eax + 32]
1375 movdqu xmm3, [eax + 48]
1376 pmaddubsw xmm0, xmm4
1377 pmaddubsw xmm1, xmm4
1378 pmaddubsw xmm2, xmm4
1379 pmaddubsw xmm3, xmm4
1380 lea eax, [eax + 64]
1381 phaddw xmm0, xmm1
1382 phaddw xmm2, xmm3
1383 psrlw xmm0, 7
1384 psrlw xmm2, 7
1385 packuswb xmm0, xmm2
1386 paddb xmm0, xmm5
1387 movdqu [edx], xmm0
1388 lea edx, [edx + 16]
1389 sub ecx, 16
1390 jg convertloop
1391 ret
1392 }
1393 }
1394
1395 __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
1396 uint8_t* dst_y,
1397 int width) {
1398 __asm {
1399 mov eax, [esp + 4] /* src_argb */
1400 mov edx, [esp + 8] /* dst_y */
1401 mov ecx, [esp + 12] /* width */
1402 movdqa xmm4, xmmword ptr kRGBAToY
1403 movdqa xmm5, xmmword ptr kAddY16
1404
1405 convertloop:
1406 movdqu xmm0, [eax]
1407 movdqu xmm1, [eax + 16]
1408 movdqu xmm2, [eax + 32]
1409 movdqu xmm3, [eax + 48]
1410 pmaddubsw xmm0, xmm4
1411 pmaddubsw xmm1, xmm4
1412 pmaddubsw xmm2, xmm4
1413 pmaddubsw xmm3, xmm4
1414 lea eax, [eax + 64]
1415 phaddw xmm0, xmm1
1416 phaddw xmm2, xmm3
1417 psrlw xmm0, 7
1418 psrlw xmm2, 7
1419 packuswb xmm0, xmm2
1420 paddb xmm0, xmm5
1421 movdqu [edx], xmm0
1422 lea edx, [edx + 16]
1423 sub ecx, 16
1424 jg convertloop
1425 ret
1426 }
1427 }
1428
1429 __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
1430 int src_stride_argb,
1431 uint8_t* dst_u,
1432 uint8_t* dst_v,
1433 int width) {
1434 __asm {
1435 push esi
1436 push edi
1437 mov eax, [esp + 8 + 4] // src_argb
1438 mov esi, [esp + 8 + 8] // src_stride_argb
1439 mov edx, [esp + 8 + 12] // dst_u
1440 mov edi, [esp + 8 + 16] // dst_v
1441 mov ecx, [esp + 8 + 20] // width
1442 movdqa xmm5, xmmword ptr kBiasUV128
1443 movdqa xmm6, xmmword ptr kARGBToV
1444 movdqa xmm7, xmmword ptr kARGBToU
1445 sub edi, edx // stride from u to v
1446
1447 convertloop:
1448 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1449 movdqu xmm0, [eax]
1450 movdqu xmm4, [eax + esi]
1451 pavgb xmm0, xmm4
1452 movdqu xmm1, [eax + 16]
1453 movdqu xmm4, [eax + esi + 16]
1454 pavgb xmm1, xmm4
1455 movdqu xmm2, [eax + 32]
1456 movdqu xmm4, [eax + esi + 32]
1457 pavgb xmm2, xmm4
1458 movdqu xmm3, [eax + 48]
1459 movdqu xmm4, [eax + esi + 48]
1460 pavgb xmm3, xmm4
1461
1462 lea eax, [eax + 64]
1463 movdqa xmm4, xmm0
1464 shufps xmm0, xmm1, 0x88
1465 shufps xmm4, xmm1, 0xdd
1466 pavgb xmm0, xmm4
1467 movdqa xmm4, xmm2
1468 shufps xmm2, xmm3, 0x88
1469 shufps xmm4, xmm3, 0xdd
1470 pavgb xmm2, xmm4
1471
1472 // step 2 - convert to U and V
1473 // from here down is very similar to Y code except
1474 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1475 movdqa xmm1, xmm0
1476 movdqa xmm3, xmm2
1477 pmaddubsw xmm0, xmm7 // U
1478 pmaddubsw xmm2, xmm7
1479 pmaddubsw xmm1, xmm6 // V
1480 pmaddubsw xmm3, xmm6
1481 phaddw xmm0, xmm2
1482 phaddw xmm1, xmm3
1483 psraw xmm0, 8
1484 psraw xmm1, 8
1485 packsswb xmm0, xmm1
1486 paddb xmm0, xmm5 // -> unsigned
1487
1488 // step 3 - store 8 U and 8 V values
1489 movlps qword ptr [edx], xmm0 // U
1490 movhps qword ptr [edx + edi], xmm0 // V
1491 lea edx, [edx + 8]
1492 sub ecx, 16
1493 jg convertloop
1494
1495 pop edi
1496 pop esi
1497 ret
1498 }
1499 }
1500
1501 __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
1502 int src_stride_argb,
1503 uint8_t* dst_u,
1504 uint8_t* dst_v,
1505 int width) {
1506 __asm {
1507 push esi
1508 push edi
1509 mov eax, [esp + 8 + 4] // src_argb
1510 mov esi, [esp + 8 + 8] // src_stride_argb
1511 mov edx, [esp + 8 + 12] // dst_u
1512 mov edi, [esp + 8 + 16] // dst_v
1513 mov ecx, [esp + 8 + 20] // width
1514 movdqa xmm5, xmmword ptr kBiasUV128
1515 movdqa xmm6, xmmword ptr kARGBToVJ
1516 movdqa xmm7, xmmword ptr kARGBToUJ
1517 sub edi, edx // stride from u to v
1518
1519 convertloop:
1520 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1521 movdqu xmm0, [eax]
1522 movdqu xmm4, [eax + esi]
1523 pavgb xmm0, xmm4
1524 movdqu xmm1, [eax + 16]
1525 movdqu xmm4, [eax + esi + 16]
1526 pavgb xmm1, xmm4
1527 movdqu xmm2, [eax + 32]
1528 movdqu xmm4, [eax + esi + 32]
1529 pavgb xmm2, xmm4
1530 movdqu xmm3, [eax + 48]
1531 movdqu xmm4, [eax + esi + 48]
1532 pavgb xmm3, xmm4
1533
1534 lea eax, [eax + 64]
1535 movdqa xmm4, xmm0
1536 shufps xmm0, xmm1, 0x88
1537 shufps xmm4, xmm1, 0xdd
1538 pavgb xmm0, xmm4
1539 movdqa xmm4, xmm2
1540 shufps xmm2, xmm3, 0x88
1541 shufps xmm4, xmm3, 0xdd
1542 pavgb xmm2, xmm4
1543
1544 // step 2 - convert to U and V
1545 // from here down is very similar to Y code except
1546 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1547 movdqa xmm1, xmm0
1548 movdqa xmm3, xmm2
1549 pmaddubsw xmm0, xmm7 // U
1550 pmaddubsw xmm2, xmm7
1551 pmaddubsw xmm1, xmm6 // V
1552 pmaddubsw xmm3, xmm6
1553 phaddw xmm0, xmm2
1554 phaddw xmm1, xmm3
1555 paddw xmm0, xmm5 // +.5 rounding -> unsigned
1556 paddw xmm1, xmm5
1557 psraw xmm0, 8
1558 psraw xmm1, 8
1559 packsswb xmm0, xmm1
1560
1561 // step 3 - store 8 U and 8 V values
1562 movlps qword ptr [edx], xmm0 // U
1563 movhps qword ptr [edx + edi], xmm0 // V
1564 lea edx, [edx + 8]
1565 sub ecx, 16
1566 jg convertloop
1567
1568 pop edi
1569 pop esi
1570 ret
1571 }
1572 }
1573
1574 #ifdef HAS_ARGBTOUVROW_AVX2
1575 __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
1576 int src_stride_argb,
1577 uint8_t* dst_u,
1578 uint8_t* dst_v,
1579 int width) {
1580 __asm {
1581 push esi
1582 push edi
1583 mov eax, [esp + 8 + 4] // src_argb
1584 mov esi, [esp + 8 + 8] // src_stride_argb
1585 mov edx, [esp + 8 + 12] // dst_u
1586 mov edi, [esp + 8 + 16] // dst_v
1587 mov ecx, [esp + 8 + 20] // width
1588 vbroadcastf128 ymm5, xmmword ptr kBiasUV128
1589 vbroadcastf128 ymm6, xmmword ptr kARGBToV
1590 vbroadcastf128 ymm7, xmmword ptr kARGBToU
1591 sub edi, edx // stride from u to v
1592
1593 convertloop:
1594 /* step 1 - subsample 32x2 argb pixels to 16x1 */
1595 vmovdqu ymm0, [eax]
1596 vmovdqu ymm1, [eax + 32]
1597 vmovdqu ymm2, [eax + 64]
1598 vmovdqu ymm3, [eax + 96]
1599 vpavgb ymm0, ymm0, [eax + esi]
1600 vpavgb ymm1, ymm1, [eax + esi + 32]
1601 vpavgb ymm2, ymm2, [eax + esi + 64]
1602 vpavgb ymm3, ymm3, [eax + esi + 96]
1603 lea eax, [eax + 128]
1604 vshufps ymm4, ymm0, ymm1, 0x88
1605 vshufps ymm0, ymm0, ymm1, 0xdd
1606 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
1607 vshufps ymm4, ymm2, ymm3, 0x88
1608 vshufps ymm2, ymm2, ymm3, 0xdd
1609 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
1610
1611 // step 2 - convert to U and V
1612 // from here down is very similar to Y code except
1613 // instead of 32 different pixels, its 16 pixels of U and 16 of V
1614 vpmaddubsw ymm1, ymm0, ymm7 // U
1615 vpmaddubsw ymm3, ymm2, ymm7
1616 vpmaddubsw ymm0, ymm0, ymm6 // V
1617 vpmaddubsw ymm2, ymm2, ymm6
1618 vphaddw ymm1, ymm1, ymm3 // mutates
1619 vphaddw ymm0, ymm0, ymm2
1620 vpsraw ymm1, ymm1, 8
1621 vpsraw ymm0, ymm0, 8
1622 vpacksswb ymm0, ymm1, ymm0 // mutates
1623 vpermq ymm0, ymm0, 0xd8 // For vpacksswb
1624 vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
1625 vpaddb ymm0, ymm0, ymm5 // -> unsigned
1626
1627 // step 3 - store 16 U and 16 V values
1628 vextractf128 [edx], ymm0, 0 // U
1629 vextractf128 [edx + edi], ymm0, 1 // V
1630 lea edx, [edx + 16]
1631 sub ecx, 32
1632 jg convertloop
1633
1634 pop edi
1635 pop esi
1636 vzeroupper
1637 ret
1638 }
1639 }
1640 #endif // HAS_ARGBTOUVROW_AVX2
1641
1642 #ifdef HAS_ARGBTOUVJROW_AVX2
1643 __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
1644 int src_stride_argb,
1645 uint8_t* dst_u,
1646 uint8_t* dst_v,
1647 int width) {
1648 __asm {
1649 push esi
1650 push edi
1651 mov eax, [esp + 8 + 4] // src_argb
1652 mov esi, [esp + 8 + 8] // src_stride_argb
1653 mov edx, [esp + 8 + 12] // dst_u
1654 mov edi, [esp + 8 + 16] // dst_v
1655 mov ecx, [esp + 8 + 20] // width
1656 vbroadcastf128 ymm5, xmmword ptr kBiasUV128
1657 vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
1658 vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
1659 sub edi, edx // stride from u to v
1660
1661 convertloop:
1662 /* step 1 - subsample 32x2 argb pixels to 16x1 */
1663 vmovdqu ymm0, [eax]
1664 vmovdqu ymm1, [eax + 32]
1665 vmovdqu ymm2, [eax + 64]
1666 vmovdqu ymm3, [eax + 96]
1667 vpavgb ymm0, ymm0, [eax + esi]
1668 vpavgb ymm1, ymm1, [eax + esi + 32]
1669 vpavgb ymm2, ymm2, [eax + esi + 64]
1670 vpavgb ymm3, ymm3, [eax + esi + 96]
1671 lea eax, [eax + 128]
1672 vshufps ymm4, ymm0, ymm1, 0x88
1673 vshufps ymm0, ymm0, ymm1, 0xdd
1674 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
1675 vshufps ymm4, ymm2, ymm3, 0x88
1676 vshufps ymm2, ymm2, ymm3, 0xdd
1677 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
1678
1679 // step 2 - convert to U and V
1680 // from here down is very similar to Y code except
1681 // instead of 32 different pixels, its 16 pixels of U and 16 of V
1682 vpmaddubsw ymm1, ymm0, ymm7 // U
1683 vpmaddubsw ymm3, ymm2, ymm7
1684 vpmaddubsw ymm0, ymm0, ymm6 // V
1685 vpmaddubsw ymm2, ymm2, ymm6
1686 vphaddw ymm1, ymm1, ymm3 // mutates
1687 vphaddw ymm0, ymm0, ymm2
1688 vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
1689 vpaddw ymm0, ymm0, ymm5
1690 vpsraw ymm1, ymm1, 8
1691 vpsraw ymm0, ymm0, 8
1692 vpacksswb ymm0, ymm1, ymm0 // mutates
1693 vpermq ymm0, ymm0, 0xd8 // For vpacksswb
1694 vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
1695
1696 // step 3 - store 16 U and 16 V values
1697 vextractf128 [edx], ymm0, 0 // U
1698 vextractf128 [edx + edi], ymm0, 1 // V
1699 lea edx, [edx + 16]
1700 sub ecx, 32
1701 jg convertloop
1702
1703 pop edi
1704 pop esi
1705 vzeroupper
1706 ret
1707 }
1708 }
1709 #endif // HAS_ARGBTOUVJROW_AVX2
1710
1711 __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
1712 uint8_t* dst_u,
1713 uint8_t* dst_v,
1714 int width) {
1715 __asm {
1716 push edi
1717 mov eax, [esp + 4 + 4] // src_argb
1718 mov edx, [esp + 4 + 8] // dst_u
1719 mov edi, [esp + 4 + 12] // dst_v
1720 mov ecx, [esp + 4 + 16] // width
1721 movdqa xmm5, xmmword ptr kBiasUV128
1722 movdqa xmm6, xmmword ptr kARGBToV
1723 movdqa xmm7, xmmword ptr kARGBToU
1724 sub edi, edx // stride from u to v
1725
1726 convertloop:
1727 /* convert to U and V */
1728 movdqu xmm0, [eax] // U
1729 movdqu xmm1, [eax + 16]
1730 movdqu xmm2, [eax + 32]
1731 movdqu xmm3, [eax + 48]
1732 pmaddubsw xmm0, xmm7
1733 pmaddubsw xmm1, xmm7
1734 pmaddubsw xmm2, xmm7
1735 pmaddubsw xmm3, xmm7
1736 phaddw xmm0, xmm1
1737 phaddw xmm2, xmm3
1738 psraw xmm0, 8
1739 psraw xmm2, 8
1740 packsswb xmm0, xmm2
1741 paddb xmm0, xmm5
1742 movdqu [edx], xmm0
1743
1744 movdqu xmm0, [eax] // V
1745 movdqu xmm1, [eax + 16]
1746 movdqu xmm2, [eax + 32]
1747 movdqu xmm3, [eax + 48]
1748 pmaddubsw xmm0, xmm6
1749 pmaddubsw xmm1, xmm6
1750 pmaddubsw xmm2, xmm6
1751 pmaddubsw xmm3, xmm6
1752 phaddw xmm0, xmm1
1753 phaddw xmm2, xmm3
1754 psraw xmm0, 8
1755 psraw xmm2, 8
1756 packsswb xmm0, xmm2
1757 paddb xmm0, xmm5
1758 lea eax, [eax + 64]
1759 movdqu [edx + edi], xmm0
1760 lea edx, [edx + 16]
1761 sub ecx, 16
1762 jg convertloop
1763
1764 pop edi
1765 ret
1766 }
1767 }
1768
1769 __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb,
1770 int src_stride_argb,
1771 uint8_t* dst_u,
1772 uint8_t* dst_v,
1773 int width) {
1774 __asm {
1775 push esi
1776 push edi
1777 mov eax, [esp + 8 + 4] // src_argb
1778 mov esi, [esp + 8 + 8] // src_stride_argb
1779 mov edx, [esp + 8 + 12] // dst_u
1780 mov edi, [esp + 8 + 16] // dst_v
1781 mov ecx, [esp + 8 + 20] // width
1782 movdqa xmm5, xmmword ptr kBiasUV128
1783 movdqa xmm6, xmmword ptr kBGRAToV
1784 movdqa xmm7, xmmword ptr kBGRAToU
1785 sub edi, edx // stride from u to v
1786
1787 convertloop:
1788 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1789 movdqu xmm0, [eax]
1790 movdqu xmm4, [eax + esi]
1791 pavgb xmm0, xmm4
1792 movdqu xmm1, [eax + 16]
1793 movdqu xmm4, [eax + esi + 16]
1794 pavgb xmm1, xmm4
1795 movdqu xmm2, [eax + 32]
1796 movdqu xmm4, [eax + esi + 32]
1797 pavgb xmm2, xmm4
1798 movdqu xmm3, [eax + 48]
1799 movdqu xmm4, [eax + esi + 48]
1800 pavgb xmm3, xmm4
1801
1802 lea eax, [eax + 64]
1803 movdqa xmm4, xmm0
1804 shufps xmm0, xmm1, 0x88
1805 shufps xmm4, xmm1, 0xdd
1806 pavgb xmm0, xmm4
1807 movdqa xmm4, xmm2
1808 shufps xmm2, xmm3, 0x88
1809 shufps xmm4, xmm3, 0xdd
1810 pavgb xmm2, xmm4
1811
1812 // step 2 - convert to U and V
1813 // from here down is very similar to Y code except
1814 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1815 movdqa xmm1, xmm0
1816 movdqa xmm3, xmm2
1817 pmaddubsw xmm0, xmm7 // U
1818 pmaddubsw xmm2, xmm7
1819 pmaddubsw xmm1, xmm6 // V
1820 pmaddubsw xmm3, xmm6
1821 phaddw xmm0, xmm2
1822 phaddw xmm1, xmm3
1823 psraw xmm0, 8
1824 psraw xmm1, 8
1825 packsswb xmm0, xmm1
1826 paddb xmm0, xmm5 // -> unsigned
1827
1828 // step 3 - store 8 U and 8 V values
1829 movlps qword ptr [edx], xmm0 // U
1830 movhps qword ptr [edx + edi], xmm0 // V
1831 lea edx, [edx + 8]
1832 sub ecx, 16
1833 jg convertloop
1834
1835 pop edi
1836 pop esi
1837 ret
1838 }
1839 }
1840
1841 __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb,
1842 int src_stride_argb,
1843 uint8_t* dst_u,
1844 uint8_t* dst_v,
1845 int width) {
1846 __asm {
1847 push esi
1848 push edi
1849 mov eax, [esp + 8 + 4] // src_argb
1850 mov esi, [esp + 8 + 8] // src_stride_argb
1851 mov edx, [esp + 8 + 12] // dst_u
1852 mov edi, [esp + 8 + 16] // dst_v
1853 mov ecx, [esp + 8 + 20] // width
1854 movdqa xmm5, xmmword ptr kBiasUV128
1855 movdqa xmm6, xmmword ptr kABGRToV
1856 movdqa xmm7, xmmword ptr kABGRToU
1857 sub edi, edx // stride from u to v
1858
1859 convertloop:
1860 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1861 movdqu xmm0, [eax]
1862 movdqu xmm4, [eax + esi]
1863 pavgb xmm0, xmm4
1864 movdqu xmm1, [eax + 16]
1865 movdqu xmm4, [eax + esi + 16]
1866 pavgb xmm1, xmm4
1867 movdqu xmm2, [eax + 32]
1868 movdqu xmm4, [eax + esi + 32]
1869 pavgb xmm2, xmm4
1870 movdqu xmm3, [eax + 48]
1871 movdqu xmm4, [eax + esi + 48]
1872 pavgb xmm3, xmm4
1873
1874 lea eax, [eax + 64]
1875 movdqa xmm4, xmm0
1876 shufps xmm0, xmm1, 0x88
1877 shufps xmm4, xmm1, 0xdd
1878 pavgb xmm0, xmm4
1879 movdqa xmm4, xmm2
1880 shufps xmm2, xmm3, 0x88
1881 shufps xmm4, xmm3, 0xdd
1882 pavgb xmm2, xmm4
1883
1884 // step 2 - convert to U and V
1885 // from here down is very similar to Y code except
1886 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1887 movdqa xmm1, xmm0
1888 movdqa xmm3, xmm2
1889 pmaddubsw xmm0, xmm7 // U
1890 pmaddubsw xmm2, xmm7
1891 pmaddubsw xmm1, xmm6 // V
1892 pmaddubsw xmm3, xmm6
1893 phaddw xmm0, xmm2
1894 phaddw xmm1, xmm3
1895 psraw xmm0, 8
1896 psraw xmm1, 8
1897 packsswb xmm0, xmm1
1898 paddb xmm0, xmm5 // -> unsigned
1899
1900 // step 3 - store 8 U and 8 V values
1901 movlps qword ptr [edx], xmm0 // U
1902 movhps qword ptr [edx + edi], xmm0 // V
1903 lea edx, [edx + 8]
1904 sub ecx, 16
1905 jg convertloop
1906
1907 pop edi
1908 pop esi
1909 ret
1910 }
1911 }
1912
1913 __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
1914 int src_stride_argb,
1915 uint8_t* dst_u,
1916 uint8_t* dst_v,
1917 int width) {
1918 __asm {
1919 push esi
1920 push edi
1921 mov eax, [esp + 8 + 4] // src_argb
1922 mov esi, [esp + 8 + 8] // src_stride_argb
1923 mov edx, [esp + 8 + 12] // dst_u
1924 mov edi, [esp + 8 + 16] // dst_v
1925 mov ecx, [esp + 8 + 20] // width
1926 movdqa xmm5, xmmword ptr kBiasUV128
1927 movdqa xmm6, xmmword ptr kRGBAToV
1928 movdqa xmm7, xmmword ptr kRGBAToU
1929 sub edi, edx // stride from u to v
1930
1931 convertloop:
1932 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1933 movdqu xmm0, [eax]
1934 movdqu xmm4, [eax + esi]
1935 pavgb xmm0, xmm4
1936 movdqu xmm1, [eax + 16]
1937 movdqu xmm4, [eax + esi + 16]
1938 pavgb xmm1, xmm4
1939 movdqu xmm2, [eax + 32]
1940 movdqu xmm4, [eax + esi + 32]
1941 pavgb xmm2, xmm4
1942 movdqu xmm3, [eax + 48]
1943 movdqu xmm4, [eax + esi + 48]
1944 pavgb xmm3, xmm4
1945
1946 lea eax, [eax + 64]
1947 movdqa xmm4, xmm0
1948 shufps xmm0, xmm1, 0x88
1949 shufps xmm4, xmm1, 0xdd
1950 pavgb xmm0, xmm4
1951 movdqa xmm4, xmm2
1952 shufps xmm2, xmm3, 0x88
1953 shufps xmm4, xmm3, 0xdd
1954 pavgb xmm2, xmm4
1955
1956 // step 2 - convert to U and V
1957 // from here down is very similar to Y code except
1958 // instead of 16 different pixels, its 8 pixels of U and 8 of V
1959 movdqa xmm1, xmm0
1960 movdqa xmm3, xmm2
1961 pmaddubsw xmm0, xmm7 // U
1962 pmaddubsw xmm2, xmm7
1963 pmaddubsw xmm1, xmm6 // V
1964 pmaddubsw xmm3, xmm6
1965 phaddw xmm0, xmm2
1966 phaddw xmm1, xmm3
1967 psraw xmm0, 8
1968 psraw xmm1, 8
1969 packsswb xmm0, xmm1
1970 paddb xmm0, xmm5 // -> unsigned
1971
1972 // step 3 - store 8 U and 8 V values
1973 movlps qword ptr [edx], xmm0 // U
1974 movhps qword ptr [edx + edi], xmm0 // V
1975 lea edx, [edx + 8]
1976 sub ecx, 16
1977 jg convertloop
1978
1979 pop edi
1980 pop esi
1981 ret
1982 }
1983 }
1984 #endif // HAS_ARGBTOYROW_SSSE3
1985
1986 // Read 16 UV from 444
1987 #define READYUV444_AVX2 \
1988 __asm { \
1989 __asm vmovdqu xmm3, [esi] /* U */ \
1990 __asm vmovdqu xmm1, [esi + edi] /* V */ \
1991 __asm lea esi, [esi + 16] \
1992 __asm vpermq ymm3, ymm3, 0xd8 \
1993 __asm vpermq ymm1, ymm1, 0xd8 \
1994 __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
1995 __asm vmovdqu xmm4, [eax] /* Y */ \
1996 __asm vpermq ymm4, ymm4, 0xd8 \
1997 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1998 __asm lea eax, [eax + 16]}
1999
2000 // Read 16 UV from 444. With 16 Alpha.
2001 #define READYUVA444_AVX2 \
2002 __asm { \
2003 __asm vmovdqu xmm3, [esi] /* U */ \
2004 __asm vmovdqu xmm1, [esi + edi] /* V */ \
2005 __asm lea esi, [esi + 16] \
2006 __asm vpermq ymm3, ymm3, 0xd8 \
2007 __asm vpermq ymm1, ymm1, 0xd8 \
2008 __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
2009 __asm vmovdqu xmm4, [eax] /* Y */ \
2010 __asm vpermq ymm4, ymm4, 0xd8 \
2011 __asm vpunpcklbw ymm4, ymm4, ymm4 \
2012 __asm lea eax, [eax + 16] \
2013 __asm vmovdqu xmm5, [ebp] /* A */ \
2014 __asm vpermq ymm5, ymm5, 0xd8 \
2015 __asm lea ebp, [ebp + 16]}
2016
2017 // Read 8 UV from 422, upsample to 16 UV.
2018 #define READYUV422_AVX2 \
2019 __asm { \
2020 __asm vmovq xmm3, qword ptr [esi] /* U */ \
2021 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
2022 __asm lea esi, [esi + 8] \
2023 __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
2024 __asm vpermq ymm3, ymm3, 0xd8 \
2025 __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
2026 __asm vmovdqu xmm4, [eax] /* Y */ \
2027 __asm vpermq ymm4, ymm4, 0xd8 \
2028 __asm vpunpcklbw ymm4, ymm4, ymm4 \
2029 __asm lea eax, [eax + 16]}
2030
2031 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
2032 #define READYUVA422_AVX2 \
2033 __asm { \
2034 __asm vmovq xmm3, qword ptr [esi] /* U */ \
2035 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
2036 __asm lea esi, [esi + 8] \
2037 __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
2038 __asm vpermq ymm3, ymm3, 0xd8 \
2039 __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
2040 __asm vmovdqu xmm4, [eax] /* Y */ \
2041 __asm vpermq ymm4, ymm4, 0xd8 \
2042 __asm vpunpcklbw ymm4, ymm4, ymm4 \
2043 __asm lea eax, [eax + 16] \
2044 __asm vmovdqu xmm5, [ebp] /* A */ \
2045 __asm vpermq ymm5, ymm5, 0xd8 \
2046 __asm lea ebp, [ebp + 16]}
2047
2048 // Read 8 UV from NV12, upsample to 16 UV.
2049 #define READNV12_AVX2 \
2050 __asm { \
2051 __asm vmovdqu xmm3, [esi] /* UV */ \
2052 __asm lea esi, [esi + 16] \
2053 __asm vpermq ymm3, ymm3, 0xd8 \
2054 __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
2055 __asm vmovdqu xmm4, [eax] /* Y */ \
2056 __asm vpermq ymm4, ymm4, 0xd8 \
2057 __asm vpunpcklbw ymm4, ymm4, ymm4 \
2058 __asm lea eax, [eax + 16]}
2059
2060 // Read 8 UV from NV21, upsample to 16 UV.
2061 #define READNV21_AVX2 \
2062 __asm { \
2063 __asm vmovdqu xmm3, [esi] /* UV */ \
2064 __asm lea esi, [esi + 16] \
2065 __asm vpermq ymm3, ymm3, 0xd8 \
2066 __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \
2067 __asm vmovdqu xmm4, [eax] /* Y */ \
2068 __asm vpermq ymm4, ymm4, 0xd8 \
2069 __asm vpunpcklbw ymm4, ymm4, ymm4 \
2070 __asm lea eax, [eax + 16]}
2071
2072 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
2073 #define READYUY2_AVX2 \
2074 __asm { \
2075 __asm vmovdqu ymm4, [eax] /* YUY2 */ \
2076 __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
2077 __asm vmovdqu ymm3, [eax] /* UV */ \
2078 __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \
2079 __asm lea eax, [eax + 32]}
2080
2081 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2082 #define READUYVY_AVX2 \
2083 __asm { \
2084 __asm vmovdqu ymm4, [eax] /* UYVY */ \
2085 __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
2086 __asm vmovdqu ymm3, [eax] /* UV */ \
2087 __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \
2088 __asm lea eax, [eax + 32]}
2089
2090 // Convert 16 pixels: 16 UV and 16 Y.
2091 #define YUVTORGB_AVX2(YuvConstants) \
2092 __asm { \
2093 __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \
2094 __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
2095 __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \
2096 __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \
2097 __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \
2098 __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \
2099 __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \
2100 __asm vpmaddubsw ymm2, ymm2, ymm3 /* B UV */ \
2101 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \
2102 __asm vpaddw ymm4, ymm3, ymm4 \
2103 __asm vpaddsw ymm0, ymm0, ymm4 \
2104 __asm vpsubsw ymm1, ymm4, ymm1 \
2105 __asm vpaddsw ymm2, ymm2, ymm4 \
2106 __asm vpsraw ymm0, ymm0, 6 \
2107 __asm vpsraw ymm1, ymm1, 6 \
2108 __asm vpsraw ymm2, ymm2, 6 \
2109 __asm vpackuswb ymm0, ymm0, ymm0 \
2110 __asm vpackuswb ymm1, ymm1, ymm1 \
2111 __asm vpackuswb ymm2, ymm2, ymm2}
2112
2113 // Store 16 ARGB values.
2114 #define STOREARGB_AVX2 \
2115 __asm { \
2116 __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
2117 __asm vpermq ymm0, ymm0, 0xd8 \
2118 __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
2119 __asm vpermq ymm2, ymm2, 0xd8 \
2120 __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
2121 __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
2122 __asm vmovdqu 0[edx], ymm1 \
2123 __asm vmovdqu 32[edx], ymm0 \
2124 __asm lea edx, [edx + 64]}
2125
2126 // Store 16 RGBA values.
2127 #define STORERGBA_AVX2 \
2128 __asm { \
2129 __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
2130 __asm vpermq ymm1, ymm1, 0xd8 \
2131 __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
2132 __asm vpermq ymm2, ymm2, 0xd8 \
2133 __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
2134 __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
2135 __asm vmovdqu [edx], ymm0 \
2136 __asm vmovdqu [edx + 32], ymm1 \
2137 __asm lea edx, [edx + 64]}
2138
2139 #ifdef HAS_I422TOARGBROW_AVX2
2140 // 16 pixels
2141 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2142 __declspec(naked) void I422ToARGBRow_AVX2(
2143 const uint8_t* y_buf,
2144 const uint8_t* u_buf,
2145 const uint8_t* v_buf,
2146 uint8_t* dst_argb,
2147 const struct YuvConstants* yuvconstants,
2148 int width) {
2149 __asm {
2150 push esi
2151 push edi
2152 push ebx
2153 mov eax, [esp + 12 + 4] // Y
2154 mov esi, [esp + 12 + 8] // U
2155 mov edi, [esp + 12 + 12] // V
2156 mov edx, [esp + 12 + 16] // argb
2157 mov ebx, [esp + 12 + 20] // yuvconstants
2158 mov ecx, [esp + 12 + 24] // width
2159 sub edi, esi
2160 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2161
2162 convertloop:
2163 READYUV422_AVX2
2164 YUVTORGB_AVX2(ebx)
2165 STOREARGB_AVX2
2166
2167 sub ecx, 16
2168 jg convertloop
2169
2170 pop ebx
2171 pop edi
2172 pop esi
2173 vzeroupper
2174 ret
2175 }
2176 }
2177 #endif // HAS_I422TOARGBROW_AVX2
2178
2179 #ifdef HAS_I422ALPHATOARGBROW_AVX2
2180 // 16 pixels
2181 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2182 __declspec(naked) void I422AlphaToARGBRow_AVX2(
2183 const uint8_t* y_buf,
2184 const uint8_t* u_buf,
2185 const uint8_t* v_buf,
2186 const uint8_t* a_buf,
2187 uint8_t* dst_argb,
2188 const struct YuvConstants* yuvconstants,
2189 int width) {
2190 __asm {
2191 push esi
2192 push edi
2193 push ebx
2194 push ebp
2195 mov eax, [esp + 16 + 4] // Y
2196 mov esi, [esp + 16 + 8] // U
2197 mov edi, [esp + 16 + 12] // V
2198 mov ebp, [esp + 16 + 16] // A
2199 mov edx, [esp + 16 + 20] // argb
2200 mov ebx, [esp + 16 + 24] // yuvconstants
2201 mov ecx, [esp + 16 + 28] // width
2202 sub edi, esi
2203
2204 convertloop:
2205 READYUVA422_AVX2
2206 YUVTORGB_AVX2(ebx)
2207 STOREARGB_AVX2
2208
2209 sub ecx, 16
2210 jg convertloop
2211
2212 pop ebp
2213 pop ebx
2214 pop edi
2215 pop esi
2216 vzeroupper
2217 ret
2218 }
2219 }
2220 #endif // HAS_I422ALPHATOARGBROW_AVX2
2221
2222 #ifdef HAS_I444TOARGBROW_AVX2
2223 // 16 pixels
2224 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2225 __declspec(naked) void I444ToARGBRow_AVX2(
2226 const uint8_t* y_buf,
2227 const uint8_t* u_buf,
2228 const uint8_t* v_buf,
2229 uint8_t* dst_argb,
2230 const struct YuvConstants* yuvconstants,
2231 int width) {
2232 __asm {
2233 push esi
2234 push edi
2235 push ebx
2236 mov eax, [esp + 12 + 4] // Y
2237 mov esi, [esp + 12 + 8] // U
2238 mov edi, [esp + 12 + 12] // V
2239 mov edx, [esp + 12 + 16] // argb
2240 mov ebx, [esp + 12 + 20] // yuvconstants
2241 mov ecx, [esp + 12 + 24] // width
2242 sub edi, esi
2243 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2244 convertloop:
2245 READYUV444_AVX2
2246 YUVTORGB_AVX2(ebx)
2247 STOREARGB_AVX2
2248
2249 sub ecx, 16
2250 jg convertloop
2251
2252 pop ebx
2253 pop edi
2254 pop esi
2255 vzeroupper
2256 ret
2257 }
2258 }
2259 #endif // HAS_I444TOARGBROW_AVX2
2260
2261 #ifdef HAS_I444ALPHATOARGBROW_AVX2
2262 // 16 pixels
2263 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2264 __declspec(naked) void I444AlphaToARGBRow_AVX2(
2265 const uint8_t* y_buf,
2266 const uint8_t* u_buf,
2267 const uint8_t* v_buf,
2268 const uint8_t* a_buf,
2269 uint8_t* dst_argb,
2270 const struct YuvConstants* yuvconstants,
2271 int width) {
2272 __asm {
2273 push esi
2274 push edi
2275 push ebx
2276 push ebp
2277 mov eax, [esp + 16 + 4] // Y
2278 mov esi, [esp + 16 + 8] // U
2279 mov edi, [esp + 16 + 12] // V
2280 mov ebp, [esp + 16 + 16] // A
2281 mov edx, [esp + 16 + 20] // argb
2282 mov ebx, [esp + 16 + 24] // yuvconstants
2283 mov ecx, [esp + 16 + 28] // width
2284 sub edi, esi
2285 convertloop:
2286 READYUVA444_AVX2
2287 YUVTORGB_AVX2(ebx)
2288 STOREARGB_AVX2
2289
2290 sub ecx, 16
2291 jg convertloop
2292
2293 pop ebp
2294 pop ebx
2295 pop edi
2296 pop esi
2297 vzeroupper
2298 ret
2299 }
2300 }
2301 #endif // HAS_I444AlphaTOARGBROW_AVX2
2302
2303 #ifdef HAS_NV12TOARGBROW_AVX2
2304 // 16 pixels.
2305 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2306 __declspec(naked) void NV12ToARGBRow_AVX2(
2307 const uint8_t* y_buf,
2308 const uint8_t* uv_buf,
2309 uint8_t* dst_argb,
2310 const struct YuvConstants* yuvconstants,
2311 int width) {
2312 __asm {
2313 push esi
2314 push ebx
2315 mov eax, [esp + 8 + 4] // Y
2316 mov esi, [esp + 8 + 8] // UV
2317 mov edx, [esp + 8 + 12] // argb
2318 mov ebx, [esp + 8 + 16] // yuvconstants
2319 mov ecx, [esp + 8 + 20] // width
2320 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2321
2322 convertloop:
2323 READNV12_AVX2
2324 YUVTORGB_AVX2(ebx)
2325 STOREARGB_AVX2
2326
2327 sub ecx, 16
2328 jg convertloop
2329
2330 pop ebx
2331 pop esi
2332 vzeroupper
2333 ret
2334 }
2335 }
2336 #endif // HAS_NV12TOARGBROW_AVX2
2337
2338 #ifdef HAS_NV21TOARGBROW_AVX2
2339 // 16 pixels.
2340 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2341 __declspec(naked) void NV21ToARGBRow_AVX2(
2342 const uint8_t* y_buf,
2343 const uint8_t* vu_buf,
2344 uint8_t* dst_argb,
2345 const struct YuvConstants* yuvconstants,
2346 int width) {
2347 __asm {
2348 push esi
2349 push ebx
2350 mov eax, [esp + 8 + 4] // Y
2351 mov esi, [esp + 8 + 8] // VU
2352 mov edx, [esp + 8 + 12] // argb
2353 mov ebx, [esp + 8 + 16] // yuvconstants
2354 mov ecx, [esp + 8 + 20] // width
2355 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2356
2357 convertloop:
2358 READNV21_AVX2
2359 YUVTORGB_AVX2(ebx)
2360 STOREARGB_AVX2
2361
2362 sub ecx, 16
2363 jg convertloop
2364
2365 pop ebx
2366 pop esi
2367 vzeroupper
2368 ret
2369 }
2370 }
2371 #endif // HAS_NV21TOARGBROW_AVX2
2372
2373 #ifdef HAS_YUY2TOARGBROW_AVX2
2374 // 16 pixels.
2375 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2376 __declspec(naked) void YUY2ToARGBRow_AVX2(
2377 const uint8_t* src_yuy2,
2378 uint8_t* dst_argb,
2379 const struct YuvConstants* yuvconstants,
2380 int width) {
2381 __asm {
2382 push ebx
2383 mov eax, [esp + 4 + 4] // yuy2
2384 mov edx, [esp + 4 + 8] // argb
2385 mov ebx, [esp + 4 + 12] // yuvconstants
2386 mov ecx, [esp + 4 + 16] // width
2387 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2388
2389 convertloop:
2390 READYUY2_AVX2
2391 YUVTORGB_AVX2(ebx)
2392 STOREARGB_AVX2
2393
2394 sub ecx, 16
2395 jg convertloop
2396
2397 pop ebx
2398 vzeroupper
2399 ret
2400 }
2401 }
2402 #endif // HAS_YUY2TOARGBROW_AVX2
2403
2404 #ifdef HAS_UYVYTOARGBROW_AVX2
2405 // 16 pixels.
2406 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2407 __declspec(naked) void UYVYToARGBRow_AVX2(
2408 const uint8_t* src_uyvy,
2409 uint8_t* dst_argb,
2410 const struct YuvConstants* yuvconstants,
2411 int width) {
2412 __asm {
2413 push ebx
2414 mov eax, [esp + 4 + 4] // uyvy
2415 mov edx, [esp + 4 + 8] // argb
2416 mov ebx, [esp + 4 + 12] // yuvconstants
2417 mov ecx, [esp + 4 + 16] // width
2418 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2419
2420 convertloop:
2421 READUYVY_AVX2
2422 YUVTORGB_AVX2(ebx)
2423 STOREARGB_AVX2
2424
2425 sub ecx, 16
2426 jg convertloop
2427
2428 pop ebx
2429 vzeroupper
2430 ret
2431 }
2432 }
2433 #endif // HAS_UYVYTOARGBROW_AVX2
2434
2435 #ifdef HAS_I422TORGBAROW_AVX2
2436 // 16 pixels
2437 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2438 __declspec(naked) void I422ToRGBARow_AVX2(
2439 const uint8_t* y_buf,
2440 const uint8_t* u_buf,
2441 const uint8_t* v_buf,
2442 uint8_t* dst_argb,
2443 const struct YuvConstants* yuvconstants,
2444 int width) {
2445 __asm {
2446 push esi
2447 push edi
2448 push ebx
2449 mov eax, [esp + 12 + 4] // Y
2450 mov esi, [esp + 12 + 8] // U
2451 mov edi, [esp + 12 + 12] // V
2452 mov edx, [esp + 12 + 16] // abgr
2453 mov ebx, [esp + 12 + 20] // yuvconstants
2454 mov ecx, [esp + 12 + 24] // width
2455 sub edi, esi
2456 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2457
2458 convertloop:
2459 READYUV422_AVX2
2460 YUVTORGB_AVX2(ebx)
2461 STORERGBA_AVX2
2462
2463 sub ecx, 16
2464 jg convertloop
2465
2466 pop ebx
2467 pop edi
2468 pop esi
2469 vzeroupper
2470 ret
2471 }
2472 }
2473 #endif // HAS_I422TORGBAROW_AVX2
2474
2475 #if defined(HAS_I422TOARGBROW_SSSE3)
2476 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2477 // Allows a conversion with half size scaling.
2478
2479 // Read 8 UV from 444.
2480 #define READYUV444 \
2481 __asm { \
2482 __asm movq xmm3, qword ptr [esi] /* U */ \
2483 __asm movq xmm1, qword ptr [esi + edi] /* V */ \
2484 __asm lea esi, [esi + 8] \
2485 __asm punpcklbw xmm3, xmm1 /* UV */ \
2486 __asm movq xmm4, qword ptr [eax] \
2487 __asm punpcklbw xmm4, xmm4 \
2488 __asm lea eax, [eax + 8]}
2489
2490 // Read 4 UV from 444. With 8 Alpha.
2491 #define READYUVA444 \
2492 __asm { \
2493 __asm movq xmm3, qword ptr [esi] /* U */ \
2494 __asm movq xmm1, qword ptr [esi + edi] /* V */ \
2495 __asm lea esi, [esi + 8] \
2496 __asm punpcklbw xmm3, xmm1 /* UV */ \
2497 __asm movq xmm4, qword ptr [eax] \
2498 __asm punpcklbw xmm4, xmm4 \
2499 __asm lea eax, [eax + 8] \
2500 __asm movq xmm5, qword ptr [ebp] /* A */ \
2501 __asm lea ebp, [ebp + 8]}
2502
2503 // Read 4 UV from 422, upsample to 8 UV.
2504 #define READYUV422 \
2505 __asm { \
2506 __asm movd xmm3, [esi] /* U */ \
2507 __asm movd xmm1, [esi + edi] /* V */ \
2508 __asm lea esi, [esi + 4] \
2509 __asm punpcklbw xmm3, xmm1 /* UV */ \
2510 __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
2511 __asm movq xmm4, qword ptr [eax] \
2512 __asm punpcklbw xmm4, xmm4 \
2513 __asm lea eax, [eax + 8]}
2514
2515 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
2516 #define READYUVA422 \
2517 __asm { \
2518 __asm movd xmm3, [esi] /* U */ \
2519 __asm movd xmm1, [esi + edi] /* V */ \
2520 __asm lea esi, [esi + 4] \
2521 __asm punpcklbw xmm3, xmm1 /* UV */ \
2522 __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
2523 __asm movq xmm4, qword ptr [eax] /* Y */ \
2524 __asm punpcklbw xmm4, xmm4 \
2525 __asm lea eax, [eax + 8] \
2526 __asm movq xmm5, qword ptr [ebp] /* A */ \
2527 __asm lea ebp, [ebp + 8]}
2528
2529 // Read 4 UV from NV12, upsample to 8 UV.
2530 #define READNV12 \
2531 __asm { \
2532 __asm movq xmm3, qword ptr [esi] /* UV */ \
2533 __asm lea esi, [esi + 8] \
2534 __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
2535 __asm movq xmm4, qword ptr [eax] \
2536 __asm punpcklbw xmm4, xmm4 \
2537 __asm lea eax, [eax + 8]}
2538
2539 // Read 4 VU from NV21, upsample to 8 UV.
2540 #define READNV21 \
2541 __asm { \
2542 __asm movq xmm3, qword ptr [esi] /* UV */ \
2543 __asm lea esi, [esi + 8] \
2544 __asm pshufb xmm3, xmmword ptr kShuffleNV21 \
2545 __asm movq xmm4, qword ptr [eax] \
2546 __asm punpcklbw xmm4, xmm4 \
2547 __asm lea eax, [eax + 8]}
2548
2549 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
2550 #define READYUY2 \
2551 __asm { \
2552 __asm movdqu xmm4, [eax] /* YUY2 */ \
2553 __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
2554 __asm movdqu xmm3, [eax] /* UV */ \
2555 __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \
2556 __asm lea eax, [eax + 16]}
2557
2558 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
2559 #define READUYVY \
2560 __asm { \
2561 __asm movdqu xmm4, [eax] /* UYVY */ \
2562 __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
2563 __asm movdqu xmm3, [eax] /* UV */ \
2564 __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \
2565 __asm lea eax, [eax + 16]}
2566
2567 // Convert 8 pixels: 8 UV and 8 Y.
2568 #define YUVTORGB(YuvConstants) \
2569 __asm { \
2570 __asm psubb xmm3, xmmword ptr kBiasUV128 \
2571 __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
2572 __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \
2573 __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \
2574 __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \
2575 __asm pmaddubsw xmm0, xmm3 \
2576 __asm pmaddubsw xmm1, xmm3 \
2577 __asm pmaddubsw xmm2, xmm3 \
2578 __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \
2579 __asm paddw xmm4, xmm3 \
2580 __asm paddsw xmm0, xmm4 \
2581 __asm paddsw xmm2, xmm4 \
2582 __asm psubsw xmm4, xmm1 \
2583 __asm movdqa xmm1, xmm4 \
2584 __asm psraw xmm0, 6 \
2585 __asm psraw xmm1, 6 \
2586 __asm psraw xmm2, 6 \
2587 __asm packuswb xmm0, xmm0 /* B */ \
2588 __asm packuswb xmm1, xmm1 /* G */ \
2589 __asm packuswb xmm2, xmm2 /* R */ \
2590 }
2591
2592 // Store 8 ARGB values.
2593 #define STOREARGB \
2594 __asm { \
2595 __asm punpcklbw xmm0, xmm1 /* BG */ \
2596 __asm punpcklbw xmm2, xmm5 /* RA */ \
2597 __asm movdqa xmm1, xmm0 \
2598 __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
2599 __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
2600 __asm movdqu 0[edx], xmm0 \
2601 __asm movdqu 16[edx], xmm1 \
2602 __asm lea edx, [edx + 32]}
2603
2604 // Store 8 BGRA values.
2605 #define STOREBGRA \
2606 __asm { \
2607 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
2608 __asm punpcklbw xmm1, xmm0 /* GB */ \
2609 __asm punpcklbw xmm5, xmm2 /* AR */ \
2610 __asm movdqa xmm0, xmm5 \
2611 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
2612 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
2613 __asm movdqu 0[edx], xmm5 \
2614 __asm movdqu 16[edx], xmm0 \
2615 __asm lea edx, [edx + 32]}
2616
2617 // Store 8 RGBA values.
2618 #define STORERGBA \
2619 __asm { \
2620 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
2621 __asm punpcklbw xmm1, xmm2 /* GR */ \
2622 __asm punpcklbw xmm5, xmm0 /* AB */ \
2623 __asm movdqa xmm0, xmm5 \
2624 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
2625 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
2626 __asm movdqu 0[edx], xmm5 \
2627 __asm movdqu 16[edx], xmm0 \
2628 __asm lea edx, [edx + 32]}
2629
2630 // Store 8 RGB24 values.
2631 #define STORERGB24 \
2632 __asm {/* Weave into RRGB */ \
2633 __asm punpcklbw xmm0, xmm1 /* BG */ \
2634 __asm punpcklbw xmm2, xmm2 /* RR */ \
2635 __asm movdqa xmm1, xmm0 \
2636 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2637 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
2638 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
2639 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
2640 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
2641 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
2642 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
2643 __asm lea edx, [edx + 24]}
2644
2645 // Store 8 RGB565 values.
2646 #define STORERGB565 \
2647 __asm {/* Weave into RRGB */ \
2648 __asm punpcklbw xmm0, xmm1 /* BG */ \
2649 __asm punpcklbw xmm2, xmm2 /* RR */ \
2650 __asm movdqa xmm1, xmm0 \
2651 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2652 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
2653 __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
2654 __asm movdqa xmm2, xmm0 /* G */ \
2655 __asm pslld xmm0, 8 /* R */ \
2656 __asm psrld xmm3, 3 /* B */ \
2657 __asm psrld xmm2, 5 /* G */ \
2658 __asm psrad xmm0, 16 /* R */ \
2659 __asm pand xmm3, xmm5 /* B */ \
2660 __asm pand xmm2, xmm6 /* G */ \
2661 __asm pand xmm0, xmm7 /* R */ \
2662 __asm por xmm3, xmm2 /* BG */ \
2663 __asm por xmm0, xmm3 /* BGR */ \
2664 __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
2665 __asm movdqa xmm2, xmm1 /* G */ \
2666 __asm pslld xmm1, 8 /* R */ \
2667 __asm psrld xmm3, 3 /* B */ \
2668 __asm psrld xmm2, 5 /* G */ \
2669 __asm psrad xmm1, 16 /* R */ \
2670 __asm pand xmm3, xmm5 /* B */ \
2671 __asm pand xmm2, xmm6 /* G */ \
2672 __asm pand xmm1, xmm7 /* R */ \
2673 __asm por xmm3, xmm2 /* BG */ \
2674 __asm por xmm1, xmm3 /* BGR */ \
2675 __asm packssdw xmm0, xmm1 \
2676 __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
2677 __asm lea edx, [edx + 16]}
2678
2679 // 8 pixels.
2680 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2681 __declspec(naked) void I444ToARGBRow_SSSE3(
2682 const uint8_t* y_buf,
2683 const uint8_t* u_buf,
2684 const uint8_t* v_buf,
2685 uint8_t* dst_argb,
2686 const struct YuvConstants* yuvconstants,
2687 int width) {
2688 __asm {
2689 push esi
2690 push edi
2691 push ebx
2692 mov eax, [esp + 12 + 4] // Y
2693 mov esi, [esp + 12 + 8] // U
2694 mov edi, [esp + 12 + 12] // V
2695 mov edx, [esp + 12 + 16] // argb
2696 mov ebx, [esp + 12 + 20] // yuvconstants
2697 mov ecx, [esp + 12 + 24] // width
2698 sub edi, esi
2699 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2700
2701 convertloop:
2702 READYUV444
2703 YUVTORGB(ebx)
2704 STOREARGB
2705
2706 sub ecx, 8
2707 jg convertloop
2708
2709 pop ebx
2710 pop edi
2711 pop esi
2712 ret
2713 }
2714 }
2715
2716 // 8 pixels.
2717 // 8 UV values, mixed with 8 Y and 8A producing 8 ARGB (32 bytes).
2718 __declspec(naked) void I444AlphaToARGBRow_SSSE3(
2719 const uint8_t* y_buf,
2720 const uint8_t* u_buf,
2721 const uint8_t* v_buf,
2722 const uint8_t* a_buf,
2723 uint8_t* dst_argb,
2724 const struct YuvConstants* yuvconstants,
2725 int width) {
2726 __asm {
2727 push esi
2728 push edi
2729 push ebx
2730 push ebp
2731 mov eax, [esp + 16 + 4] // Y
2732 mov esi, [esp + 16 + 8] // U
2733 mov edi, [esp + 16 + 12] // V
2734 mov ebp, [esp + 16 + 16] // A
2735 mov edx, [esp + 16 + 20] // argb
2736 mov ebx, [esp + 16 + 24] // yuvconstants
2737 mov ecx, [esp + 16 + 28] // width
2738 sub edi, esi
2739
2740 convertloop:
2741 READYUVA444
2742 YUVTORGB(ebx)
2743 STOREARGB
2744
2745 sub ecx, 8
2746 jg convertloop
2747
2748 pop ebp
2749 pop ebx
2750 pop edi
2751 pop esi
2752 ret
2753 }
2754 }
2755
2756 // 8 pixels.
2757 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2758 __declspec(naked) void I422ToRGB24Row_SSSE3(
2759 const uint8_t* y_buf,
2760 const uint8_t* u_buf,
2761 const uint8_t* v_buf,
2762 uint8_t* dst_rgb24,
2763 const struct YuvConstants* yuvconstants,
2764 int width) {
2765 __asm {
2766 push esi
2767 push edi
2768 push ebx
2769 mov eax, [esp + 12 + 4] // Y
2770 mov esi, [esp + 12 + 8] // U
2771 mov edi, [esp + 12 + 12] // V
2772 mov edx, [esp + 12 + 16] // argb
2773 mov ebx, [esp + 12 + 20] // yuvconstants
2774 mov ecx, [esp + 12 + 24] // width
2775 sub edi, esi
2776 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
2777 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
2778
2779 convertloop:
2780 READYUV422
2781 YUVTORGB(ebx)
2782 STORERGB24
2783
2784 sub ecx, 8
2785 jg convertloop
2786
2787 pop ebx
2788 pop edi
2789 pop esi
2790 ret
2791 }
2792 }
2793
2794 // 8 pixels.
2795 // 8 UV values, mixed with 8 Y producing 8 RGB24 (24 bytes).
2796 __declspec(naked) void I444ToRGB24Row_SSSE3(
2797 const uint8_t* y_buf,
2798 const uint8_t* u_buf,
2799 const uint8_t* v_buf,
2800 uint8_t* dst_rgb24,
2801 const struct YuvConstants* yuvconstants,
2802 int width) {
2803 __asm {
2804 push esi
2805 push edi
2806 push ebx
2807 mov eax, [esp + 12 + 4] // Y
2808 mov esi, [esp + 12 + 8] // U
2809 mov edi, [esp + 12 + 12] // V
2810 mov edx, [esp + 12 + 16] // argb
2811 mov ebx, [esp + 12 + 20] // yuvconstants
2812 mov ecx, [esp + 12 + 24] // width
2813 sub edi, esi
2814 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
2815 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
2816
2817 convertloop:
2818 READYUV444
2819 YUVTORGB(ebx)
2820 STORERGB24
2821
2822 sub ecx, 8
2823 jg convertloop
2824
2825 pop ebx
2826 pop edi
2827 pop esi
2828 ret
2829 }
2830 }
2831
2832 // 8 pixels
2833 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2834 __declspec(naked) void I422ToRGB565Row_SSSE3(
2835 const uint8_t* y_buf,
2836 const uint8_t* u_buf,
2837 const uint8_t* v_buf,
2838 uint8_t* rgb565_buf,
2839 const struct YuvConstants* yuvconstants,
2840 int width) {
2841 __asm {
2842 push esi
2843 push edi
2844 push ebx
2845 mov eax, [esp + 12 + 4] // Y
2846 mov esi, [esp + 12 + 8] // U
2847 mov edi, [esp + 12 + 12] // V
2848 mov edx, [esp + 12 + 16] // argb
2849 mov ebx, [esp + 12 + 20] // yuvconstants
2850 mov ecx, [esp + 12 + 24] // width
2851 sub edi, esi
2852 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
2853 psrld xmm5, 27
2854 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
2855 psrld xmm6, 26
2856 pslld xmm6, 5
2857 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
2858 pslld xmm7, 11
2859
2860 convertloop:
2861 READYUV422
2862 YUVTORGB(ebx)
2863 STORERGB565
2864
2865 sub ecx, 8
2866 jg convertloop
2867
2868 pop ebx
2869 pop edi
2870 pop esi
2871 ret
2872 }
2873 }
2874
2875 // 8 pixels.
2876 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2877 __declspec(naked) void I422ToARGBRow_SSSE3(
2878 const uint8_t* y_buf,
2879 const uint8_t* u_buf,
2880 const uint8_t* v_buf,
2881 uint8_t* dst_argb,
2882 const struct YuvConstants* yuvconstants,
2883 int width) {
2884 __asm {
2885 push esi
2886 push edi
2887 push ebx
2888 mov eax, [esp + 12 + 4] // Y
2889 mov esi, [esp + 12 + 8] // U
2890 mov edi, [esp + 12 + 12] // V
2891 mov edx, [esp + 12 + 16] // argb
2892 mov ebx, [esp + 12 + 20] // yuvconstants
2893 mov ecx, [esp + 12 + 24] // width
2894 sub edi, esi
2895 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2896
2897 convertloop:
2898 READYUV422
2899 YUVTORGB(ebx)
2900 STOREARGB
2901
2902 sub ecx, 8
2903 jg convertloop
2904
2905 pop ebx
2906 pop edi
2907 pop esi
2908 ret
2909 }
2910 }
2911
2912 // 8 pixels.
2913 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
2914 __declspec(naked) void I422AlphaToARGBRow_SSSE3(
2915 const uint8_t* y_buf,
2916 const uint8_t* u_buf,
2917 const uint8_t* v_buf,
2918 const uint8_t* a_buf,
2919 uint8_t* dst_argb,
2920 const struct YuvConstants* yuvconstants,
2921 int width) {
2922 __asm {
2923 push esi
2924 push edi
2925 push ebx
2926 push ebp
2927 mov eax, [esp + 16 + 4] // Y
2928 mov esi, [esp + 16 + 8] // U
2929 mov edi, [esp + 16 + 12] // V
2930 mov ebp, [esp + 16 + 16] // A
2931 mov edx, [esp + 16 + 20] // argb
2932 mov ebx, [esp + 16 + 24] // yuvconstants
2933 mov ecx, [esp + 16 + 28] // width
2934 sub edi, esi
2935
2936 convertloop:
2937 READYUVA422
2938 YUVTORGB(ebx)
2939 STOREARGB
2940
2941 sub ecx, 8
2942 jg convertloop
2943
2944 pop ebp
2945 pop ebx
2946 pop edi
2947 pop esi
2948 ret
2949 }
2950 }
2951
2952 // 8 pixels.
2953 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2954 __declspec(naked) void NV12ToARGBRow_SSSE3(
2955 const uint8_t* y_buf,
2956 const uint8_t* uv_buf,
2957 uint8_t* dst_argb,
2958 const struct YuvConstants* yuvconstants,
2959 int width) {
2960 __asm {
2961 push esi
2962 push ebx
2963 mov eax, [esp + 8 + 4] // Y
2964 mov esi, [esp + 8 + 8] // UV
2965 mov edx, [esp + 8 + 12] // argb
2966 mov ebx, [esp + 8 + 16] // yuvconstants
2967 mov ecx, [esp + 8 + 20] // width
2968 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2969
2970 convertloop:
2971 READNV12
2972 YUVTORGB(ebx)
2973 STOREARGB
2974
2975 sub ecx, 8
2976 jg convertloop
2977
2978 pop ebx
2979 pop esi
2980 ret
2981 }
2982 }
2983
2984 // 8 pixels.
2985 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2986 __declspec(naked) void NV21ToARGBRow_SSSE3(
2987 const uint8_t* y_buf,
2988 const uint8_t* vu_buf,
2989 uint8_t* dst_argb,
2990 const struct YuvConstants* yuvconstants,
2991 int width) {
2992 __asm {
2993 push esi
2994 push ebx
2995 mov eax, [esp + 8 + 4] // Y
2996 mov esi, [esp + 8 + 8] // VU
2997 mov edx, [esp + 8 + 12] // argb
2998 mov ebx, [esp + 8 + 16] // yuvconstants
2999 mov ecx, [esp + 8 + 20] // width
3000 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3001
3002 convertloop:
3003 READNV21
3004 YUVTORGB(ebx)
3005 STOREARGB
3006
3007 sub ecx, 8
3008 jg convertloop
3009
3010 pop ebx
3011 pop esi
3012 ret
3013 }
3014 }
3015
3016 // 8 pixels.
3017 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
3018 __declspec(naked) void YUY2ToARGBRow_SSSE3(
3019 const uint8_t* src_yuy2,
3020 uint8_t* dst_argb,
3021 const struct YuvConstants* yuvconstants,
3022 int width) {
3023 __asm {
3024 push ebx
3025 mov eax, [esp + 4 + 4] // yuy2
3026 mov edx, [esp + 4 + 8] // argb
3027 mov ebx, [esp + 4 + 12] // yuvconstants
3028 mov ecx, [esp + 4 + 16] // width
3029 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3030
3031 convertloop:
3032 READYUY2
3033 YUVTORGB(ebx)
3034 STOREARGB
3035
3036 sub ecx, 8
3037 jg convertloop
3038
3039 pop ebx
3040 ret
3041 }
3042 }
3043
3044 // 8 pixels.
3045 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
3046 __declspec(naked) void UYVYToARGBRow_SSSE3(
3047 const uint8_t* src_uyvy,
3048 uint8_t* dst_argb,
3049 const struct YuvConstants* yuvconstants,
3050 int width) {
3051 __asm {
3052 push ebx
3053 mov eax, [esp + 4 + 4] // uyvy
3054 mov edx, [esp + 4 + 8] // argb
3055 mov ebx, [esp + 4 + 12] // yuvconstants
3056 mov ecx, [esp + 4 + 16] // width
3057 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3058
3059 convertloop:
3060 READUYVY
3061 YUVTORGB(ebx)
3062 STOREARGB
3063
3064 sub ecx, 8
3065 jg convertloop
3066
3067 pop ebx
3068 ret
3069 }
3070 }
3071
3072 __declspec(naked) void I422ToRGBARow_SSSE3(
3073 const uint8_t* y_buf,
3074 const uint8_t* u_buf,
3075 const uint8_t* v_buf,
3076 uint8_t* dst_rgba,
3077 const struct YuvConstants* yuvconstants,
3078 int width) {
3079 __asm {
3080 push esi
3081 push edi
3082 push ebx
3083 mov eax, [esp + 12 + 4] // Y
3084 mov esi, [esp + 12 + 8] // U
3085 mov edi, [esp + 12 + 12] // V
3086 mov edx, [esp + 12 + 16] // argb
3087 mov ebx, [esp + 12 + 20] // yuvconstants
3088 mov ecx, [esp + 12 + 24] // width
3089 sub edi, esi
3090
3091 convertloop:
3092 READYUV422
3093 YUVTORGB(ebx)
3094 STORERGBA
3095
3096 sub ecx, 8
3097 jg convertloop
3098
3099 pop ebx
3100 pop edi
3101 pop esi
3102 ret
3103 }
3104 }
3105 #endif // HAS_I422TOARGBROW_SSSE3
3106
3107 // I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter
3108 #ifdef HAS_I400TOARGBROW_SSE2
3109 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
3110 __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
3111 uint8_t* rgb_buf,
3112 const struct YuvConstants*,
3113 int width) {
3114 __asm {
3115 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
3116 movd xmm2, eax
3117 pshufd xmm2, xmm2,0
3118 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
3119 movd xmm3, eax
3120 pshufd xmm3, xmm3, 0
3121 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
3122 pslld xmm4, 24
3123
3124 mov eax, [esp + 4] // Y
3125 mov edx, [esp + 8] // rgb
3126 mov ecx, [esp + 12] // width
3127
3128 convertloop:
3129 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
3130 movq xmm0, qword ptr [eax]
3131 lea eax, [eax + 8]
3132 punpcklbw xmm0, xmm0 // Y.Y
3133 pmulhuw xmm0, xmm2
3134 psubusw xmm0, xmm3
3135 psrlw xmm0, 6
3136 packuswb xmm0, xmm0 // G
3137
3138 // Step 2: Weave into ARGB
3139 punpcklbw xmm0, xmm0 // GG
3140 movdqa xmm1, xmm0
3141 punpcklwd xmm0, xmm0 // BGRA first 4 pixels
3142 punpckhwd xmm1, xmm1 // BGRA next 4 pixels
3143 por xmm0, xmm4
3144 por xmm1, xmm4
3145 movdqu [edx], xmm0
3146 movdqu [edx + 16], xmm1
3147 lea edx, [edx + 32]
3148 sub ecx, 8
3149 jg convertloop
3150 ret
3151 }
3152 }
3153 #endif // HAS_I400TOARGBROW_SSE2
3154
3155 #ifdef HAS_I400TOARGBROW_AVX2
3156 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
3157 // note: vpunpcklbw mutates and vpackuswb unmutates.
3158 __declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
3159 uint8_t* rgb_buf,
3160 const struct YuvConstants*,
3161 int width) {
3162 __asm {
3163 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
3164 vmovd xmm2, eax
3165 vbroadcastss ymm2, xmm2
3166 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
3167 vmovd xmm3, eax
3168 vbroadcastss ymm3, xmm3
3169 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
3170 vpslld ymm4, ymm4, 24
3171
3172 mov eax, [esp + 4] // Y
3173 mov edx, [esp + 8] // rgb
3174 mov ecx, [esp + 12] // width
3175
3176 convertloop:
3177 // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
3178 vmovdqu xmm0, [eax]
3179 lea eax, [eax + 16]
3180 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
3181 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
3182 vpmulhuw ymm0, ymm0, ymm2
3183 vpsubusw ymm0, ymm0, ymm3
3184 vpsrlw ymm0, ymm0, 6
3185 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
3186
3187 // TODO(fbarchard): Weave alpha with unpack.
3188 // Step 2: Weave into ARGB
3189 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
3190 vpermq ymm1, ymm1, 0xd8
3191 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
3192 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
3193 vpor ymm0, ymm0, ymm4
3194 vpor ymm1, ymm1, ymm4
3195 vmovdqu [edx], ymm0
3196 vmovdqu [edx + 32], ymm1
3197 lea edx, [edx + 64]
3198 sub ecx, 16
3199 jg convertloop
3200 vzeroupper
3201 ret
3202 }
3203 }
3204 #endif // HAS_I400TOARGBROW_AVX2
3205
3206 #ifdef HAS_MIRRORROW_SSSE3
3207 // Shuffle table for reversing the bytes.
3208 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
3209 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3210
3211 // TODO(fbarchard): Replace lea with -16 offset.
3212 __declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
3213 uint8_t* dst,
3214 int width) {
3215 __asm {
3216 mov eax, [esp + 4] // src
3217 mov edx, [esp + 8] // dst
3218 mov ecx, [esp + 12] // width
3219 movdqa xmm5, xmmword ptr kShuffleMirror
3220
3221 convertloop:
3222 movdqu xmm0, [eax - 16 + ecx]
3223 pshufb xmm0, xmm5
3224 movdqu [edx], xmm0
3225 lea edx, [edx + 16]
3226 sub ecx, 16
3227 jg convertloop
3228 ret
3229 }
3230 }
3231 #endif // HAS_MIRRORROW_SSSE3
3232
3233 #ifdef HAS_MIRRORROW_AVX2
3234 __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
3235 uint8_t* dst,
3236 int width) {
3237 __asm {
3238 mov eax, [esp + 4] // src
3239 mov edx, [esp + 8] // dst
3240 mov ecx, [esp + 12] // width
3241 vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
3242
3243 convertloop:
3244 vmovdqu ymm0, [eax - 32 + ecx]
3245 vpshufb ymm0, ymm0, ymm5
3246 vpermq ymm0, ymm0, 0x4e // swap high and low halfs
3247 vmovdqu [edx], ymm0
3248 lea edx, [edx + 32]
3249 sub ecx, 32
3250 jg convertloop
3251 vzeroupper
3252 ret
3253 }
3254 }
3255 #endif // HAS_MIRRORROW_AVX2
3256
3257 #ifdef HAS_MIRRORSPLITUVROW_SSSE3
3258 // Shuffle table for reversing the bytes of UV channels.
3259 static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
3260 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
3261
3262 __declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
3263 uint8_t* dst_u,
3264 uint8_t* dst_v,
3265 int width) {
3266 __asm {
3267 push edi
3268 mov eax, [esp + 4 + 4] // src
3269 mov edx, [esp + 4 + 8] // dst_u
3270 mov edi, [esp + 4 + 12] // dst_v
3271 mov ecx, [esp + 4 + 16] // width
3272 movdqa xmm1, xmmword ptr kShuffleMirrorUV
3273 lea eax, [eax + ecx * 2 - 16]
3274 sub edi, edx
3275
3276 convertloop:
3277 movdqu xmm0, [eax]
3278 lea eax, [eax - 16]
3279 pshufb xmm0, xmm1
3280 movlpd qword ptr [edx], xmm0
3281 movhpd qword ptr [edx + edi], xmm0
3282 lea edx, [edx + 8]
3283 sub ecx, 8
3284 jg convertloop
3285
3286 pop edi
3287 ret
3288 }
3289 }
3290 #endif // HAS_MIRRORSPLITUVROW_SSSE3
3291
3292 #ifdef HAS_ARGBMIRRORROW_SSE2
3293 __declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
3294 uint8_t* dst,
3295 int width) {
3296 __asm {
3297 mov eax, [esp + 4] // src
3298 mov edx, [esp + 8] // dst
3299 mov ecx, [esp + 12] // width
3300 lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
3301
3302 convertloop:
3303 movdqu xmm0, [eax]
3304 lea eax, [eax - 16]
3305 pshufd xmm0, xmm0, 0x1b
3306 movdqu [edx], xmm0
3307 lea edx, [edx + 16]
3308 sub ecx, 4
3309 jg convertloop
3310 ret
3311 }
3312 }
3313 #endif // HAS_ARGBMIRRORROW_SSE2
3314
3315 #ifdef HAS_ARGBMIRRORROW_AVX2
3316 // Shuffle table for reversing the bytes.
3317 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3318
3319 __declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
3320 uint8_t* dst,
3321 int width) {
3322 __asm {
3323 mov eax, [esp + 4] // src
3324 mov edx, [esp + 8] // dst
3325 mov ecx, [esp + 12] // width
3326 vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
3327
3328 convertloop:
3329 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
3330 vmovdqu [edx], ymm0
3331 lea edx, [edx + 32]
3332 sub ecx, 8
3333 jg convertloop
3334 vzeroupper
3335 ret
3336 }
3337 }
3338 #endif // HAS_ARGBMIRRORROW_AVX2
3339
3340 #ifdef HAS_SPLITUVROW_SSE2
3341 __declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
3342 uint8_t* dst_u,
3343 uint8_t* dst_v,
3344 int width) {
3345 __asm {
3346 push edi
3347 mov eax, [esp + 4 + 4] // src_uv
3348 mov edx, [esp + 4 + 8] // dst_u
3349 mov edi, [esp + 4 + 12] // dst_v
3350 mov ecx, [esp + 4 + 16] // width
3351 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3352 psrlw xmm5, 8
3353 sub edi, edx
3354
3355 convertloop:
3356 movdqu xmm0, [eax]
3357 movdqu xmm1, [eax + 16]
3358 lea eax, [eax + 32]
3359 movdqa xmm2, xmm0
3360 movdqa xmm3, xmm1
3361 pand xmm0, xmm5 // even bytes
3362 pand xmm1, xmm5
3363 packuswb xmm0, xmm1
3364 psrlw xmm2, 8 // odd bytes
3365 psrlw xmm3, 8
3366 packuswb xmm2, xmm3
3367 movdqu [edx], xmm0
3368 movdqu [edx + edi], xmm2
3369 lea edx, [edx + 16]
3370 sub ecx, 16
3371 jg convertloop
3372
3373 pop edi
3374 ret
3375 }
3376 }
3377
3378 #endif // HAS_SPLITUVROW_SSE2
3379
3380 #ifdef HAS_SPLITUVROW_AVX2
3381 __declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
3382 uint8_t* dst_u,
3383 uint8_t* dst_v,
3384 int width) {
3385 __asm {
3386 push edi
3387 mov eax, [esp + 4 + 4] // src_uv
3388 mov edx, [esp + 4 + 8] // dst_u
3389 mov edi, [esp + 4 + 12] // dst_v
3390 mov ecx, [esp + 4 + 16] // width
3391 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3392 vpsrlw ymm5, ymm5, 8
3393 sub edi, edx
3394
3395 convertloop:
3396 vmovdqu ymm0, [eax]
3397 vmovdqu ymm1, [eax + 32]
3398 lea eax, [eax + 64]
3399 vpsrlw ymm2, ymm0, 8 // odd bytes
3400 vpsrlw ymm3, ymm1, 8
3401 vpand ymm0, ymm0, ymm5 // even bytes
3402 vpand ymm1, ymm1, ymm5
3403 vpackuswb ymm0, ymm0, ymm1
3404 vpackuswb ymm2, ymm2, ymm3
3405 vpermq ymm0, ymm0, 0xd8
3406 vpermq ymm2, ymm2, 0xd8
3407 vmovdqu [edx], ymm0
3408 vmovdqu [edx + edi], ymm2
3409 lea edx, [edx + 32]
3410 sub ecx, 32
3411 jg convertloop
3412
3413 pop edi
3414 vzeroupper
3415 ret
3416 }
3417 }
3418 #endif // HAS_SPLITUVROW_AVX2
3419
3420 #ifdef HAS_MERGEUVROW_SSE2
3421 __declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
3422 const uint8_t* src_v,
3423 uint8_t* dst_uv,
3424 int width) {
3425 __asm {
3426 push edi
3427 mov eax, [esp + 4 + 4] // src_u
3428 mov edx, [esp + 4 + 8] // src_v
3429 mov edi, [esp + 4 + 12] // dst_uv
3430 mov ecx, [esp + 4 + 16] // width
3431 sub edx, eax
3432
3433 convertloop:
3434 movdqu xmm0, [eax] // read 16 U's
3435 movdqu xmm1, [eax + edx] // and 16 V's
3436 lea eax, [eax + 16]
3437 movdqa xmm2, xmm0
3438 punpcklbw xmm0, xmm1 // first 8 UV pairs
3439 punpckhbw xmm2, xmm1 // next 8 UV pairs
3440 movdqu [edi], xmm0
3441 movdqu [edi + 16], xmm2
3442 lea edi, [edi + 32]
3443 sub ecx, 16
3444 jg convertloop
3445
3446 pop edi
3447 ret
3448 }
3449 }
3450 #endif // HAS_MERGEUVROW_SSE2
3451
3452 #ifdef HAS_MERGEUVROW_AVX2
3453 __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
3454 const uint8_t* src_v,
3455 uint8_t* dst_uv,
3456 int width) {
3457 __asm {
3458 push edi
3459 mov eax, [esp + 4 + 4] // src_u
3460 mov edx, [esp + 4 + 8] // src_v
3461 mov edi, [esp + 4 + 12] // dst_uv
3462 mov ecx, [esp + 4 + 16] // width
3463 sub edx, eax
3464
3465 convertloop:
3466 vpmovzxbw ymm0, [eax]
3467 vpmovzxbw ymm1, [eax + edx]
3468 lea eax, [eax + 16]
3469 vpsllw ymm1, ymm1, 8
3470 vpor ymm2, ymm1, ymm0
3471 vmovdqu [edi], ymm2
3472 lea edi, [edi + 32]
3473 sub ecx, 16
3474 jg convertloop
3475
3476 pop edi
3477 vzeroupper
3478 ret
3479 }
3480 }
3481 #endif // HAS_MERGEUVROW_AVX2
3482
3483 #ifdef HAS_COPYROW_SSE2
3484 // CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time.
3485 __declspec(naked) void CopyRow_SSE2(const uint8_t* src,
3486 uint8_t* dst,
3487 int width) {
3488 __asm {
3489 mov eax, [esp + 4] // src
3490 mov edx, [esp + 8] // dst
3491 mov ecx, [esp + 12] // width
3492 test eax, 15
3493 jne convertloopu
3494 test edx, 15
3495 jne convertloopu
3496
3497 convertloopa:
3498 movdqa xmm0, [eax]
3499 movdqa xmm1, [eax + 16]
3500 lea eax, [eax + 32]
3501 movdqa [edx], xmm0
3502 movdqa [edx + 16], xmm1
3503 lea edx, [edx + 32]
3504 sub ecx, 32
3505 jg convertloopa
3506 ret
3507
3508 convertloopu:
3509 movdqu xmm0, [eax]
3510 movdqu xmm1, [eax + 16]
3511 lea eax, [eax + 32]
3512 movdqu [edx], xmm0
3513 movdqu [edx + 16], xmm1
3514 lea edx, [edx + 32]
3515 sub ecx, 32
3516 jg convertloopu
3517 ret
3518 }
3519 }
3520 #endif // HAS_COPYROW_SSE2
3521
3522 #ifdef HAS_COPYROW_AVX
3523 // CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time.
3524 __declspec(naked) void CopyRow_AVX(const uint8_t* src,
3525 uint8_t* dst,
3526 int width) {
3527 __asm {
3528 mov eax, [esp + 4] // src
3529 mov edx, [esp + 8] // dst
3530 mov ecx, [esp + 12] // width
3531
3532 convertloop:
3533 vmovdqu ymm0, [eax]
3534 vmovdqu ymm1, [eax + 32]
3535 lea eax, [eax + 64]
3536 vmovdqu [edx], ymm0
3537 vmovdqu [edx + 32], ymm1
3538 lea edx, [edx + 64]
3539 sub ecx, 64
3540 jg convertloop
3541
3542 vzeroupper
3543 ret
3544 }
3545 }
3546 #endif // HAS_COPYROW_AVX
3547
3548 // Multiple of 1.
3549 __declspec(naked) void CopyRow_ERMS(const uint8_t* src,
3550 uint8_t* dst,
3551 int width) {
3552 __asm {
3553 mov eax, esi
3554 mov edx, edi
3555 mov esi, [esp + 4] // src
3556 mov edi, [esp + 8] // dst
3557 mov ecx, [esp + 12] // width
3558 rep movsb
3559 mov edi, edx
3560 mov esi, eax
3561 ret
3562 }
3563 }
3564
3565 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3566 // width in pixels
3567 __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
3568 uint8_t* dst,
3569 int width) {
3570 __asm {
3571 mov eax, [esp + 4] // src
3572 mov edx, [esp + 8] // dst
3573 mov ecx, [esp + 12] // width
3574 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3575 pslld xmm0, 24
3576 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3577 psrld xmm1, 8
3578
3579 convertloop:
3580 movdqu xmm2, [eax]
3581 movdqu xmm3, [eax + 16]
3582 lea eax, [eax + 32]
3583 movdqu xmm4, [edx]
3584 movdqu xmm5, [edx + 16]
3585 pand xmm2, xmm0
3586 pand xmm3, xmm0
3587 pand xmm4, xmm1
3588 pand xmm5, xmm1
3589 por xmm2, xmm4
3590 por xmm3, xmm5
3591 movdqu [edx], xmm2
3592 movdqu [edx + 16], xmm3
3593 lea edx, [edx + 32]
3594 sub ecx, 8
3595 jg convertloop
3596
3597 ret
3598 }
3599 }
3600 #endif // HAS_ARGBCOPYALPHAROW_SSE2
3601
3602 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3603 // width in pixels
3604 __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
3605 uint8_t* dst,
3606 int width) {
3607 __asm {
3608 mov eax, [esp + 4] // src
3609 mov edx, [esp + 8] // dst
3610 mov ecx, [esp + 12] // width
3611 vpcmpeqb ymm0, ymm0, ymm0
3612 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3613
3614 convertloop:
3615 vmovdqu ymm1, [eax]
3616 vmovdqu ymm2, [eax + 32]
3617 lea eax, [eax + 64]
3618 vpblendvb ymm1, ymm1, [edx], ymm0
3619 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3620 vmovdqu [edx], ymm1
3621 vmovdqu [edx + 32], ymm2
3622 lea edx, [edx + 64]
3623 sub ecx, 16
3624 jg convertloop
3625
3626 vzeroupper
3627 ret
3628 }
3629 }
3630 #endif // HAS_ARGBCOPYALPHAROW_AVX2
3631
3632 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
3633 // width in pixels
3634 __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
3635 uint8_t* dst_a,
3636 int width) {
3637 __asm {
3638 mov eax, [esp + 4] // src_argb
3639 mov edx, [esp + 8] // dst_a
3640 mov ecx, [esp + 12] // width
3641
3642 extractloop:
3643 movdqu xmm0, [eax]
3644 movdqu xmm1, [eax + 16]
3645 lea eax, [eax + 32]
3646 psrld xmm0, 24
3647 psrld xmm1, 24
3648 packssdw xmm0, xmm1
3649 packuswb xmm0, xmm0
3650 movq qword ptr [edx], xmm0
3651 lea edx, [edx + 8]
3652 sub ecx, 8
3653 jg extractloop
3654
3655 ret
3656 }
3657 }
3658 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
3659
3660 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
3661 // width in pixels
3662 __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
3663 uint8_t* dst_a,
3664 int width) {
3665 __asm {
3666 mov eax, [esp + 4] // src_argb
3667 mov edx, [esp + 8] // dst_a
3668 mov ecx, [esp + 12] // width
3669 vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX
3670
3671 extractloop:
3672 vmovdqu ymm0, [eax]
3673 vmovdqu ymm1, [eax + 32]
3674 vpsrld ymm0, ymm0, 24
3675 vpsrld ymm1, ymm1, 24
3676 vmovdqu ymm2, [eax + 64]
3677 vmovdqu ymm3, [eax + 96]
3678 lea eax, [eax + 128]
3679 vpackssdw ymm0, ymm0, ymm1 // mutates
3680 vpsrld ymm2, ymm2, 24
3681 vpsrld ymm3, ymm3, 24
3682 vpackssdw ymm2, ymm2, ymm3 // mutates
3683 vpackuswb ymm0, ymm0, ymm2 // mutates
3684 vpermd ymm0, ymm4, ymm0 // unmutate
3685 vmovdqu [edx], ymm0
3686 lea edx, [edx + 32]
3687 sub ecx, 32
3688 jg extractloop
3689
3690 vzeroupper
3691 ret
3692 }
3693 }
3694 #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
3695
3696 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3697 // width in pixels
3698 __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
3699 uint8_t* dst,
3700 int width) {
3701 __asm {
3702 mov eax, [esp + 4] // src
3703 mov edx, [esp + 8] // dst
3704 mov ecx, [esp + 12] // width
3705 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3706 pslld xmm0, 24
3707 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3708 psrld xmm1, 8
3709
3710 convertloop:
3711 movq xmm2, qword ptr [eax] // 8 Y's
3712 lea eax, [eax + 8]
3713 punpcklbw xmm2, xmm2
3714 punpckhwd xmm3, xmm2
3715 punpcklwd xmm2, xmm2
3716 movdqu xmm4, [edx]
3717 movdqu xmm5, [edx + 16]
3718 pand xmm2, xmm0
3719 pand xmm3, xmm0
3720 pand xmm4, xmm1
3721 pand xmm5, xmm1
3722 por xmm2, xmm4
3723 por xmm3, xmm5
3724 movdqu [edx], xmm2
3725 movdqu [edx + 16], xmm3
3726 lea edx, [edx + 32]
3727 sub ecx, 8
3728 jg convertloop
3729
3730 ret
3731 }
3732 }
3733 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3734
3735 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3736 // width in pixels
3737 __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
3738 uint8_t* dst,
3739 int width) {
3740 __asm {
3741 mov eax, [esp + 4] // src
3742 mov edx, [esp + 8] // dst
3743 mov ecx, [esp + 12] // width
3744 vpcmpeqb ymm0, ymm0, ymm0
3745 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3746
3747 convertloop:
3748 vpmovzxbd ymm1, qword ptr [eax]
3749 vpmovzxbd ymm2, qword ptr [eax + 8]
3750 lea eax, [eax + 16]
3751 vpslld ymm1, ymm1, 24
3752 vpslld ymm2, ymm2, 24
3753 vpblendvb ymm1, ymm1, [edx], ymm0
3754 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3755 vmovdqu [edx], ymm1
3756 vmovdqu [edx + 32], ymm2
3757 lea edx, [edx + 64]
3758 sub ecx, 16
3759 jg convertloop
3760
3761 vzeroupper
3762 ret
3763 }
3764 }
3765 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3766
3767 #ifdef HAS_SETROW_X86
3768 // Write 'width' bytes using an 8 bit value repeated.
3769 // width should be multiple of 4.
3770 __declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
3771 __asm {
3772 movzx eax, byte ptr [esp + 8] // v8
3773 mov edx, 0x01010101 // Duplicate byte to all bytes.
3774 mul edx // overwrites edx with upper part of result.
3775 mov edx, edi
3776 mov edi, [esp + 4] // dst
3777 mov ecx, [esp + 12] // width
3778 shr ecx, 2
3779 rep stosd
3780 mov edi, edx
3781 ret
3782 }
3783 }
3784
3785 // Write 'width' bytes using an 8 bit value repeated.
3786 __declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
3787 __asm {
3788 mov edx, edi
3789 mov edi, [esp + 4] // dst
3790 mov eax, [esp + 8] // v8
3791 mov ecx, [esp + 12] // width
3792 rep stosb
3793 mov edi, edx
3794 ret
3795 }
3796 }
3797
3798 // Write 'width' 32 bit values.
3799 __declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
3800 uint32_t v32,
3801 int width) {
3802 __asm {
3803 mov edx, edi
3804 mov edi, [esp + 4] // dst
3805 mov eax, [esp + 8] // v32
3806 mov ecx, [esp + 12] // width
3807 rep stosd
3808 mov edi, edx
3809 ret
3810 }
3811 }
3812 #endif // HAS_SETROW_X86
3813
3814 #ifdef HAS_YUY2TOYROW_AVX2
3815 __declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
3816 uint8_t* dst_y,
3817 int width) {
3818 __asm {
3819 mov eax, [esp + 4] // src_yuy2
3820 mov edx, [esp + 8] // dst_y
3821 mov ecx, [esp + 12] // width
3822 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3823 vpsrlw ymm5, ymm5, 8
3824
3825 convertloop:
3826 vmovdqu ymm0, [eax]
3827 vmovdqu ymm1, [eax + 32]
3828 lea eax, [eax + 64]
3829 vpand ymm0, ymm0, ymm5 // even bytes are Y
3830 vpand ymm1, ymm1, ymm5
3831 vpackuswb ymm0, ymm0, ymm1 // mutates.
3832 vpermq ymm0, ymm0, 0xd8
3833 vmovdqu [edx], ymm0
3834 lea edx, [edx + 32]
3835 sub ecx, 32
3836 jg convertloop
3837 vzeroupper
3838 ret
3839 }
3840 }
3841
3842 __declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
3843 int stride_yuy2,
3844 uint8_t* dst_u,
3845 uint8_t* dst_v,
3846 int width) {
3847 __asm {
3848 push esi
3849 push edi
3850 mov eax, [esp + 8 + 4] // src_yuy2
3851 mov esi, [esp + 8 + 8] // stride_yuy2
3852 mov edx, [esp + 8 + 12] // dst_u
3853 mov edi, [esp + 8 + 16] // dst_v
3854 mov ecx, [esp + 8 + 20] // width
3855 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3856 vpsrlw ymm5, ymm5, 8
3857 sub edi, edx
3858
3859 convertloop:
3860 vmovdqu ymm0, [eax]
3861 vmovdqu ymm1, [eax + 32]
3862 vpavgb ymm0, ymm0, [eax + esi]
3863 vpavgb ymm1, ymm1, [eax + esi + 32]
3864 lea eax, [eax + 64]
3865 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3866 vpsrlw ymm1, ymm1, 8
3867 vpackuswb ymm0, ymm0, ymm1 // mutates.
3868 vpermq ymm0, ymm0, 0xd8
3869 vpand ymm1, ymm0, ymm5 // U
3870 vpsrlw ymm0, ymm0, 8 // V
3871 vpackuswb ymm1, ymm1, ymm1 // mutates.
3872 vpackuswb ymm0, ymm0, ymm0 // mutates.
3873 vpermq ymm1, ymm1, 0xd8
3874 vpermq ymm0, ymm0, 0xd8
3875 vextractf128 [edx], ymm1, 0 // U
3876 vextractf128 [edx + edi], ymm0, 0 // V
3877 lea edx, [edx + 16]
3878 sub ecx, 32
3879 jg convertloop
3880
3881 pop edi
3882 pop esi
3883 vzeroupper
3884 ret
3885 }
3886 }
3887
3888 __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
3889 uint8_t* dst_u,
3890 uint8_t* dst_v,
3891 int width) {
3892 __asm {
3893 push edi
3894 mov eax, [esp + 4 + 4] // src_yuy2
3895 mov edx, [esp + 4 + 8] // dst_u
3896 mov edi, [esp + 4 + 12] // dst_v
3897 mov ecx, [esp + 4 + 16] // width
3898 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3899 vpsrlw ymm5, ymm5, 8
3900 sub edi, edx
3901
3902 convertloop:
3903 vmovdqu ymm0, [eax]
3904 vmovdqu ymm1, [eax + 32]
3905 lea eax, [eax + 64]
3906 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3907 vpsrlw ymm1, ymm1, 8
3908 vpackuswb ymm0, ymm0, ymm1 // mutates.
3909 vpermq ymm0, ymm0, 0xd8
3910 vpand ymm1, ymm0, ymm5 // U
3911 vpsrlw ymm0, ymm0, 8 // V
3912 vpackuswb ymm1, ymm1, ymm1 // mutates.
3913 vpackuswb ymm0, ymm0, ymm0 // mutates.
3914 vpermq ymm1, ymm1, 0xd8
3915 vpermq ymm0, ymm0, 0xd8
3916 vextractf128 [edx], ymm1, 0 // U
3917 vextractf128 [edx + edi], ymm0, 0 // V
3918 lea edx, [edx + 16]
3919 sub ecx, 32
3920 jg convertloop
3921
3922 pop edi
3923 vzeroupper
3924 ret
3925 }
3926 }
3927
3928 __declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
3929 uint8_t* dst_y,
3930 int width) {
3931 __asm {
3932 mov eax, [esp + 4] // src_uyvy
3933 mov edx, [esp + 8] // dst_y
3934 mov ecx, [esp + 12] // width
3935
3936 convertloop:
3937 vmovdqu ymm0, [eax]
3938 vmovdqu ymm1, [eax + 32]
3939 lea eax, [eax + 64]
3940 vpsrlw ymm0, ymm0, 8 // odd bytes are Y
3941 vpsrlw ymm1, ymm1, 8
3942 vpackuswb ymm0, ymm0, ymm1 // mutates.
3943 vpermq ymm0, ymm0, 0xd8
3944 vmovdqu [edx], ymm0
3945 lea edx, [edx + 32]
3946 sub ecx, 32
3947 jg convertloop
3948 vzeroupper
3949 ret
3950 }
3951 }
3952
3953 __declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
3954 int stride_uyvy,
3955 uint8_t* dst_u,
3956 uint8_t* dst_v,
3957 int width) {
3958 __asm {
3959 push esi
3960 push edi
3961 mov eax, [esp + 8 + 4] // src_yuy2
3962 mov esi, [esp + 8 + 8] // stride_yuy2
3963 mov edx, [esp + 8 + 12] // dst_u
3964 mov edi, [esp + 8 + 16] // dst_v
3965 mov ecx, [esp + 8 + 20] // width
3966 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3967 vpsrlw ymm5, ymm5, 8
3968 sub edi, edx
3969
3970 convertloop:
3971 vmovdqu ymm0, [eax]
3972 vmovdqu ymm1, [eax + 32]
3973 vpavgb ymm0, ymm0, [eax + esi]
3974 vpavgb ymm1, ymm1, [eax + esi + 32]
3975 lea eax, [eax + 64]
3976 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3977 vpand ymm1, ymm1, ymm5
3978 vpackuswb ymm0, ymm0, ymm1 // mutates.
3979 vpermq ymm0, ymm0, 0xd8
3980 vpand ymm1, ymm0, ymm5 // U
3981 vpsrlw ymm0, ymm0, 8 // V
3982 vpackuswb ymm1, ymm1, ymm1 // mutates.
3983 vpackuswb ymm0, ymm0, ymm0 // mutates.
3984 vpermq ymm1, ymm1, 0xd8
3985 vpermq ymm0, ymm0, 0xd8
3986 vextractf128 [edx], ymm1, 0 // U
3987 vextractf128 [edx + edi], ymm0, 0 // V
3988 lea edx, [edx + 16]
3989 sub ecx, 32
3990 jg convertloop
3991
3992 pop edi
3993 pop esi
3994 vzeroupper
3995 ret
3996 }
3997 }
3998
3999 __declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
4000 uint8_t* dst_u,
4001 uint8_t* dst_v,
4002 int width) {
4003 __asm {
4004 push edi
4005 mov eax, [esp + 4 + 4] // src_yuy2
4006 mov edx, [esp + 4 + 8] // dst_u
4007 mov edi, [esp + 4 + 12] // dst_v
4008 mov ecx, [esp + 4 + 16] // width
4009 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
4010 vpsrlw ymm5, ymm5, 8
4011 sub edi, edx
4012
4013 convertloop:
4014 vmovdqu ymm0, [eax]
4015 vmovdqu ymm1, [eax + 32]
4016 lea eax, [eax + 64]
4017 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
4018 vpand ymm1, ymm1, ymm5
4019 vpackuswb ymm0, ymm0, ymm1 // mutates.
4020 vpermq ymm0, ymm0, 0xd8
4021 vpand ymm1, ymm0, ymm5 // U
4022 vpsrlw ymm0, ymm0, 8 // V
4023 vpackuswb ymm1, ymm1, ymm1 // mutates.
4024 vpackuswb ymm0, ymm0, ymm0 // mutates.
4025 vpermq ymm1, ymm1, 0xd8
4026 vpermq ymm0, ymm0, 0xd8
4027 vextractf128 [edx], ymm1, 0 // U
4028 vextractf128 [edx + edi], ymm0, 0 // V
4029 lea edx, [edx + 16]
4030 sub ecx, 32
4031 jg convertloop
4032
4033 pop edi
4034 vzeroupper
4035 ret
4036 }
4037 }
4038 #endif // HAS_YUY2TOYROW_AVX2
4039
4040 #ifdef HAS_YUY2TOYROW_SSE2
4041 __declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
4042 uint8_t* dst_y,
4043 int width) {
4044 __asm {
4045 mov eax, [esp + 4] // src_yuy2
4046 mov edx, [esp + 8] // dst_y
4047 mov ecx, [esp + 12] // width
4048 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4049 psrlw xmm5, 8
4050
4051 convertloop:
4052 movdqu xmm0, [eax]
4053 movdqu xmm1, [eax + 16]
4054 lea eax, [eax + 32]
4055 pand xmm0, xmm5 // even bytes are Y
4056 pand xmm1, xmm5
4057 packuswb xmm0, xmm1
4058 movdqu [edx], xmm0
4059 lea edx, [edx + 16]
4060 sub ecx, 16
4061 jg convertloop
4062 ret
4063 }
4064 }
4065
4066 __declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
4067 int stride_yuy2,
4068 uint8_t* dst_u,
4069 uint8_t* dst_v,
4070 int width) {
4071 __asm {
4072 push esi
4073 push edi
4074 mov eax, [esp + 8 + 4] // src_yuy2
4075 mov esi, [esp + 8 + 8] // stride_yuy2
4076 mov edx, [esp + 8 + 12] // dst_u
4077 mov edi, [esp + 8 + 16] // dst_v
4078 mov ecx, [esp + 8 + 20] // width
4079 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4080 psrlw xmm5, 8
4081 sub edi, edx
4082
4083 convertloop:
4084 movdqu xmm0, [eax]
4085 movdqu xmm1, [eax + 16]
4086 movdqu xmm2, [eax + esi]
4087 movdqu xmm3, [eax + esi + 16]
4088 lea eax, [eax + 32]
4089 pavgb xmm0, xmm2
4090 pavgb xmm1, xmm3
4091 psrlw xmm0, 8 // YUYV -> UVUV
4092 psrlw xmm1, 8
4093 packuswb xmm0, xmm1
4094 movdqa xmm1, xmm0
4095 pand xmm0, xmm5 // U
4096 packuswb xmm0, xmm0
4097 psrlw xmm1, 8 // V
4098 packuswb xmm1, xmm1
4099 movq qword ptr [edx], xmm0
4100 movq qword ptr [edx + edi], xmm1
4101 lea edx, [edx + 8]
4102 sub ecx, 16
4103 jg convertloop
4104
4105 pop edi
4106 pop esi
4107 ret
4108 }
4109 }
4110
4111 __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
4112 uint8_t* dst_u,
4113 uint8_t* dst_v,
4114 int width) {
4115 __asm {
4116 push edi
4117 mov eax, [esp + 4 + 4] // src_yuy2
4118 mov edx, [esp + 4 + 8] // dst_u
4119 mov edi, [esp + 4 + 12] // dst_v
4120 mov ecx, [esp + 4 + 16] // width
4121 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4122 psrlw xmm5, 8
4123 sub edi, edx
4124
4125 convertloop:
4126 movdqu xmm0, [eax]
4127 movdqu xmm1, [eax + 16]
4128 lea eax, [eax + 32]
4129 psrlw xmm0, 8 // YUYV -> UVUV
4130 psrlw xmm1, 8
4131 packuswb xmm0, xmm1
4132 movdqa xmm1, xmm0
4133 pand xmm0, xmm5 // U
4134 packuswb xmm0, xmm0
4135 psrlw xmm1, 8 // V
4136 packuswb xmm1, xmm1
4137 movq qword ptr [edx], xmm0
4138 movq qword ptr [edx + edi], xmm1
4139 lea edx, [edx + 8]
4140 sub ecx, 16
4141 jg convertloop
4142
4143 pop edi
4144 ret
4145 }
4146 }
4147
4148 __declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
4149 uint8_t* dst_y,
4150 int width) {
4151 __asm {
4152 mov eax, [esp + 4] // src_uyvy
4153 mov edx, [esp + 8] // dst_y
4154 mov ecx, [esp + 12] // width
4155
4156 convertloop:
4157 movdqu xmm0, [eax]
4158 movdqu xmm1, [eax + 16]
4159 lea eax, [eax + 32]
4160 psrlw xmm0, 8 // odd bytes are Y
4161 psrlw xmm1, 8
4162 packuswb xmm0, xmm1
4163 movdqu [edx], xmm0
4164 lea edx, [edx + 16]
4165 sub ecx, 16
4166 jg convertloop
4167 ret
4168 }
4169 }
4170
4171 __declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
4172 int stride_uyvy,
4173 uint8_t* dst_u,
4174 uint8_t* dst_v,
4175 int width) {
4176 __asm {
4177 push esi
4178 push edi
4179 mov eax, [esp + 8 + 4] // src_yuy2
4180 mov esi, [esp + 8 + 8] // stride_yuy2
4181 mov edx, [esp + 8 + 12] // dst_u
4182 mov edi, [esp + 8 + 16] // dst_v
4183 mov ecx, [esp + 8 + 20] // width
4184 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4185 psrlw xmm5, 8
4186 sub edi, edx
4187
4188 convertloop:
4189 movdqu xmm0, [eax]
4190 movdqu xmm1, [eax + 16]
4191 movdqu xmm2, [eax + esi]
4192 movdqu xmm3, [eax + esi + 16]
4193 lea eax, [eax + 32]
4194 pavgb xmm0, xmm2
4195 pavgb xmm1, xmm3
4196 pand xmm0, xmm5 // UYVY -> UVUV
4197 pand xmm1, xmm5
4198 packuswb xmm0, xmm1
4199 movdqa xmm1, xmm0
4200 pand xmm0, xmm5 // U
4201 packuswb xmm0, xmm0
4202 psrlw xmm1, 8 // V
4203 packuswb xmm1, xmm1
4204 movq qword ptr [edx], xmm0
4205 movq qword ptr [edx + edi], xmm1
4206 lea edx, [edx + 8]
4207 sub ecx, 16
4208 jg convertloop
4209
4210 pop edi
4211 pop esi
4212 ret
4213 }
4214 }
4215
4216 __declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
4217 uint8_t* dst_u,
4218 uint8_t* dst_v,
4219 int width) {
4220 __asm {
4221 push edi
4222 mov eax, [esp + 4 + 4] // src_yuy2
4223 mov edx, [esp + 4 + 8] // dst_u
4224 mov edi, [esp + 4 + 12] // dst_v
4225 mov ecx, [esp + 4 + 16] // width
4226 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4227 psrlw xmm5, 8
4228 sub edi, edx
4229
4230 convertloop:
4231 movdqu xmm0, [eax]
4232 movdqu xmm1, [eax + 16]
4233 lea eax, [eax + 32]
4234 pand xmm0, xmm5 // UYVY -> UVUV
4235 pand xmm1, xmm5
4236 packuswb xmm0, xmm1
4237 movdqa xmm1, xmm0
4238 pand xmm0, xmm5 // U
4239 packuswb xmm0, xmm0
4240 psrlw xmm1, 8 // V
4241 packuswb xmm1, xmm1
4242 movq qword ptr [edx], xmm0
4243 movq qword ptr [edx + edi], xmm1
4244 lea edx, [edx + 8]
4245 sub ecx, 16
4246 jg convertloop
4247
4248 pop edi
4249 ret
4250 }
4251 }
4252 #endif // HAS_YUY2TOYROW_SSE2
4253
4254 #ifdef HAS_BLENDPLANEROW_SSSE3
4255 // Blend 8 pixels at a time.
4256 // unsigned version of math
4257 // =((A2*C2)+(B2*(255-C2))+255)/256
4258 // signed version of math
4259 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4260 __declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
4261 const uint8_t* src1,
4262 const uint8_t* alpha,
4263 uint8_t* dst,
4264 int width) {
4265 __asm {
4266 push esi
4267 push edi
4268 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4269 psllw xmm5, 8
4270 mov eax, 0x80808080 // 128 for biasing image to signed.
4271 movd xmm6, eax
4272 pshufd xmm6, xmm6, 0x00
4273
4274 mov eax, 0x807f807f // 32768 + 127 for unbias and round.
4275 movd xmm7, eax
4276 pshufd xmm7, xmm7, 0x00
4277 mov eax, [esp + 8 + 4] // src0
4278 mov edx, [esp + 8 + 8] // src1
4279 mov esi, [esp + 8 + 12] // alpha
4280 mov edi, [esp + 8 + 16] // dst
4281 mov ecx, [esp + 8 + 20] // width
4282 sub eax, esi
4283 sub edx, esi
4284 sub edi, esi
4285
4286 // 8 pixel loop.
4287 convertloop8:
4288 movq xmm0, qword ptr [esi] // alpha
4289 punpcklbw xmm0, xmm0
4290 pxor xmm0, xmm5 // a, 255-a
4291 movq xmm1, qword ptr [eax + esi] // src0
4292 movq xmm2, qword ptr [edx + esi] // src1
4293 punpcklbw xmm1, xmm2
4294 psubb xmm1, xmm6 // bias src0/1 - 128
4295 pmaddubsw xmm0, xmm1
4296 paddw xmm0, xmm7 // unbias result - 32768 and round.
4297 psrlw xmm0, 8
4298 packuswb xmm0, xmm0
4299 movq qword ptr [edi + esi], xmm0
4300 lea esi, [esi + 8]
4301 sub ecx, 8
4302 jg convertloop8
4303
4304 pop edi
4305 pop esi
4306 ret
4307 }
4308 }
4309 #endif // HAS_BLENDPLANEROW_SSSE3
4310
4311 #ifdef HAS_BLENDPLANEROW_AVX2
4312 // Blend 32 pixels at a time.
4313 // unsigned version of math
4314 // =((A2*C2)+(B2*(255-C2))+255)/256
4315 // signed version of math
4316 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4317 __declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
4318 const uint8_t* src1,
4319 const uint8_t* alpha,
4320 uint8_t* dst,
4321 int width) {
4322 __asm {
4323 push esi
4324 push edi
4325 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
4326 vpsllw ymm5, ymm5, 8
4327 mov eax, 0x80808080 // 128 for biasing image to signed.
4328 vmovd xmm6, eax
4329 vbroadcastss ymm6, xmm6
4330 mov eax, 0x807f807f // 32768 + 127 for unbias and round.
4331 vmovd xmm7, eax
4332 vbroadcastss ymm7, xmm7
4333 mov eax, [esp + 8 + 4] // src0
4334 mov edx, [esp + 8 + 8] // src1
4335 mov esi, [esp + 8 + 12] // alpha
4336 mov edi, [esp + 8 + 16] // dst
4337 mov ecx, [esp + 8 + 20] // width
4338 sub eax, esi
4339 sub edx, esi
4340 sub edi, esi
4341
4342 // 32 pixel loop.
4343 convertloop32:
4344 vmovdqu ymm0, [esi] // alpha
4345 vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
4346 vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
4347 vpxor ymm3, ymm3, ymm5 // a, 255-a
4348 vpxor ymm0, ymm0, ymm5 // a, 255-a
4349 vmovdqu ymm1, [eax + esi] // src0
4350 vmovdqu ymm2, [edx + esi] // src1
4351 vpunpckhbw ymm4, ymm1, ymm2
4352 vpunpcklbw ymm1, ymm1, ymm2
4353 vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
4354 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
4355 vpmaddubsw ymm3, ymm3, ymm4
4356 vpmaddubsw ymm0, ymm0, ymm1
4357 vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
4358 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
4359 vpsrlw ymm3, ymm3, 8
4360 vpsrlw ymm0, ymm0, 8
4361 vpackuswb ymm0, ymm0, ymm3
4362 vmovdqu [edi + esi], ymm0
4363 lea esi, [esi + 32]
4364 sub ecx, 32
4365 jg convertloop32
4366
4367 pop edi
4368 pop esi
4369 vzeroupper
4370 ret
4371 }
4372 }
4373 #endif // HAS_BLENDPLANEROW_AVX2
4374
4375 #ifdef HAS_ARGBBLENDROW_SSSE3
4376 // Shuffle table for isolating alpha.
4377 static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4378 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
4379
4380 // Blend 8 pixels at a time.
4381 __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
4382 const uint8_t* src_argb1,
4383 uint8_t* dst_argb,
4384 int width) {
4385 __asm {
4386 push esi
4387 mov eax, [esp + 4 + 4] // src_argb
4388 mov esi, [esp + 4 + 8] // src_argb1
4389 mov edx, [esp + 4 + 12] // dst_argb
4390 mov ecx, [esp + 4 + 16] // width
4391 pcmpeqb xmm7, xmm7 // generate constant 0x0001
4392 psrlw xmm7, 15
4393 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4394 psrlw xmm6, 8
4395 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4396 psllw xmm5, 8
4397 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4398 pslld xmm4, 24
4399 sub ecx, 4
4400 jl convertloop4b // less than 4 pixels?
4401
4402 // 4 pixel loop.
4403 convertloop4:
4404 movdqu xmm3, [eax] // src argb
4405 lea eax, [eax + 16]
4406 movdqa xmm0, xmm3 // src argb
4407 pxor xmm3, xmm4 // ~alpha
4408 movdqu xmm2, [esi] // _r_b
4409 pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
4410 pand xmm2, xmm6 // _r_b
4411 paddw xmm3, xmm7 // 256 - alpha
4412 pmullw xmm2, xmm3 // _r_b * alpha
4413 movdqu xmm1, [esi] // _a_g
4414 lea esi, [esi + 16]
4415 psrlw xmm1, 8 // _a_g
4416 por xmm0, xmm4 // set alpha to 255
4417 pmullw xmm1, xmm3 // _a_g * alpha
4418 psrlw xmm2, 8 // _r_b convert to 8 bits again
4419 paddusb xmm0, xmm2 // + src argb
4420 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4421 paddusb xmm0, xmm1 // + src argb
4422 movdqu [edx], xmm0
4423 lea edx, [edx + 16]
4424 sub ecx, 4
4425 jge convertloop4
4426
4427 convertloop4b:
4428 add ecx, 4 - 1
4429 jl convertloop1b
4430
4431 // 1 pixel loop.
4432 convertloop1:
4433 movd xmm3, [eax] // src argb
4434 lea eax, [eax + 4]
4435 movdqa xmm0, xmm3 // src argb
4436 pxor xmm3, xmm4 // ~alpha
4437 movd xmm2, [esi] // _r_b
4438 pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
4439 pand xmm2, xmm6 // _r_b
4440 paddw xmm3, xmm7 // 256 - alpha
4441 pmullw xmm2, xmm3 // _r_b * alpha
4442 movd xmm1, [esi] // _a_g
4443 lea esi, [esi + 4]
4444 psrlw xmm1, 8 // _a_g
4445 por xmm0, xmm4 // set alpha to 255
4446 pmullw xmm1, xmm3 // _a_g * alpha
4447 psrlw xmm2, 8 // _r_b convert to 8 bits again
4448 paddusb xmm0, xmm2 // + src argb
4449 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4450 paddusb xmm0, xmm1 // + src argb
4451 movd [edx], xmm0
4452 lea edx, [edx + 4]
4453 sub ecx, 1
4454 jge convertloop1
4455
4456 convertloop1b:
4457 pop esi
4458 ret
4459 }
4460 }
4461 #endif // HAS_ARGBBLENDROW_SSSE3
4462
4463 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4464 // Shuffle table duplicating alpha.
4465 static const uvec8 kShuffleAlpha0 = {
4466 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4467 };
4468 static const uvec8 kShuffleAlpha1 = {
4469 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4470 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4471 };
4472 __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
4473 uint8_t* dst_argb,
4474 int width) {
4475 __asm {
4476 mov eax, [esp + 4] // src_argb
4477 mov edx, [esp + 8] // dst_argb
4478 mov ecx, [esp + 12] // width
4479 pcmpeqb xmm3, xmm3 // generate mask 0xff000000
4480 pslld xmm3, 24
4481 movdqa xmm4, xmmword ptr kShuffleAlpha0
4482 movdqa xmm5, xmmword ptr kShuffleAlpha1
4483
4484 convertloop:
4485 movdqu xmm0, [eax] // read 4 pixels
4486 pshufb xmm0, xmm4 // isolate first 2 alphas
4487 movdqu xmm1, [eax] // read 4 pixels
4488 punpcklbw xmm1, xmm1 // first 2 pixel rgbs
4489 pmulhuw xmm0, xmm1 // rgb * a
4490 movdqu xmm1, [eax] // read 4 pixels
4491 pshufb xmm1, xmm5 // isolate next 2 alphas
4492 movdqu xmm2, [eax] // read 4 pixels
4493 punpckhbw xmm2, xmm2 // next 2 pixel rgbs
4494 pmulhuw xmm1, xmm2 // rgb * a
4495 movdqu xmm2, [eax] // mask original alpha
4496 lea eax, [eax + 16]
4497 pand xmm2, xmm3
4498 psrlw xmm0, 8
4499 psrlw xmm1, 8
4500 packuswb xmm0, xmm1
4501 por xmm0, xmm2 // copy original alpha
4502 movdqu [edx], xmm0
4503 lea edx, [edx + 16]
4504 sub ecx, 4
4505 jg convertloop
4506
4507 ret
4508 }
4509 }
4510 #endif // HAS_ARGBATTENUATEROW_SSSE3
4511
4512 #ifdef HAS_ARGBATTENUATEROW_AVX2
4513 // Shuffle table duplicating alpha.
4514 static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
4515 128u, 128u, 14u, 15u, 14u, 15u,
4516 14u, 15u, 128u, 128u};
4517 __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
4518 uint8_t* dst_argb,
4519 int width) {
4520 __asm {
4521 mov eax, [esp + 4] // src_argb
4522 mov edx, [esp + 8] // dst_argb
4523 mov ecx, [esp + 12] // width
4524 sub edx, eax
4525 vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
4526 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
4527 vpslld ymm5, ymm5, 24
4528
4529 convertloop:
4530 vmovdqu ymm6, [eax] // read 8 pixels.
4531 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4532 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4533 vpshufb ymm2, ymm0, ymm4 // low 4 alphas
4534 vpshufb ymm3, ymm1, ymm4 // high 4 alphas
4535 vpmulhuw ymm0, ymm0, ymm2 // rgb * a
4536 vpmulhuw ymm1, ymm1, ymm3 // rgb * a
4537 vpand ymm6, ymm6, ymm5 // isolate alpha
4538 vpsrlw ymm0, ymm0, 8
4539 vpsrlw ymm1, ymm1, 8
4540 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4541 vpor ymm0, ymm0, ymm6 // copy original alpha
4542 vmovdqu [eax + edx], ymm0
4543 lea eax, [eax + 32]
4544 sub ecx, 8
4545 jg convertloop
4546
4547 vzeroupper
4548 ret
4549 }
4550 }
4551 #endif // HAS_ARGBATTENUATEROW_AVX2
4552
4553 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4554 // Unattenuate 4 pixels at a time.
4555 __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
4556 uint8_t* dst_argb,
4557 int width) {
4558 __asm {
4559 push ebx
4560 push esi
4561 push edi
4562 mov eax, [esp + 12 + 4] // src_argb
4563 mov edx, [esp + 12 + 8] // dst_argb
4564 mov ecx, [esp + 12 + 12] // width
4565 lea ebx, fixed_invtbl8
4566
4567 convertloop:
4568 movdqu xmm0, [eax] // read 4 pixels
4569 movzx esi, byte ptr [eax + 3] // first alpha
4570 movzx edi, byte ptr [eax + 7] // second alpha
4571 punpcklbw xmm0, xmm0 // first 2
4572 movd xmm2, dword ptr [ebx + esi * 4]
4573 movd xmm3, dword ptr [ebx + edi * 4]
4574 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
4575 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4576 movlhps xmm2, xmm3
4577 pmulhuw xmm0, xmm2 // rgb * a
4578
4579 movdqu xmm1, [eax] // read 4 pixels
4580 movzx esi, byte ptr [eax + 11] // third alpha
4581 movzx edi, byte ptr [eax + 15] // forth alpha
4582 punpckhbw xmm1, xmm1 // next 2
4583 movd xmm2, dword ptr [ebx + esi * 4]
4584 movd xmm3, dword ptr [ebx + edi * 4]
4585 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
4586 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4587 movlhps xmm2, xmm3
4588 pmulhuw xmm1, xmm2 // rgb * a
4589 lea eax, [eax + 16]
4590 packuswb xmm0, xmm1
4591 movdqu [edx], xmm0
4592 lea edx, [edx + 16]
4593 sub ecx, 4
4594 jg convertloop
4595
4596 pop edi
4597 pop esi
4598 pop ebx
4599 ret
4600 }
4601 }
4602 #endif // HAS_ARGBUNATTENUATEROW_SSE2
4603
4604 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
4605 // Shuffle table duplicating alpha.
4606 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
4607 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
4608 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4609 // USE_GATHER is not on by default, due to being a slow instruction.
4610 #ifdef USE_GATHER
4611 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
4612 uint8_t* dst_argb,
4613 int width) {
4614 __asm {
4615 mov eax, [esp + 4] // src_argb
4616 mov edx, [esp + 8] // dst_argb
4617 mov ecx, [esp + 12] // width
4618 sub edx, eax
4619 vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
4620
4621 convertloop:
4622 vmovdqu ymm6, [eax] // read 8 pixels.
4623 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
4624 vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
4625 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4626 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4627 vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
4628 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
4629 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
4630 vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
4631 vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
4632 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
4633 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
4634 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4635 vmovdqu [eax + edx], ymm0
4636 lea eax, [eax + 32]
4637 sub ecx, 8
4638 jg convertloop
4639
4640 vzeroupper
4641 ret
4642 }
4643 }
4644 #else // USE_GATHER
4645 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
4646 uint8_t* dst_argb,
4647 int width) {
4648 __asm {
4649
4650 push ebx
4651 push esi
4652 push edi
4653 mov eax, [esp + 12 + 4] // src_argb
4654 mov edx, [esp + 12 + 8] // dst_argb
4655 mov ecx, [esp + 12 + 12] // width
4656 sub edx, eax
4657 lea ebx, fixed_invtbl8
4658 vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
4659
4660 convertloop:
4661 // replace VPGATHER
4662 movzx esi, byte ptr [eax + 3] // alpha0
4663 movzx edi, byte ptr [eax + 7] // alpha1
4664 vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
4665 vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
4666 movzx esi, byte ptr [eax + 11] // alpha2
4667 movzx edi, byte ptr [eax + 15] // alpha3
4668 vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
4669 vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
4670 vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
4671 movzx esi, byte ptr [eax + 19] // alpha4
4672 movzx edi, byte ptr [eax + 23] // alpha5
4673 vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
4674 vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
4675 vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
4676 movzx esi, byte ptr [eax + 27] // alpha6
4677 movzx edi, byte ptr [eax + 31] // alpha7
4678 vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
4679 vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
4680 vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
4681 vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
4682 vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
4683 vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
4684 vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4685 // end of VPGATHER
4686
4687 vmovdqu ymm6, [eax] // read 8 pixels.
4688 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4689 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4690 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
4691 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
4692 vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
4693 vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
4694 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
4695 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
4696 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4697 vmovdqu [eax + edx], ymm0
4698 lea eax, [eax + 32]
4699 sub ecx, 8
4700 jg convertloop
4701
4702 pop edi
4703 pop esi
4704 pop ebx
4705 vzeroupper
4706 ret
4707 }
4708 }
4709 #endif // USE_GATHER
4710 #endif // HAS_ARGBATTENUATEROW_AVX2
4711
4712 #ifdef HAS_ARGBGRAYROW_SSSE3
4713 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
4714 __declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
4715 uint8_t* dst_argb,
4716 int width) {
4717 __asm {
4718 mov eax, [esp + 4] /* src_argb */
4719 mov edx, [esp + 8] /* dst_argb */
4720 mov ecx, [esp + 12] /* width */
4721 movdqa xmm4, xmmword ptr kARGBToYJ
4722 movdqa xmm5, xmmword ptr kAddYJ64
4723
4724 convertloop:
4725 movdqu xmm0, [eax] // G
4726 movdqu xmm1, [eax + 16]
4727 pmaddubsw xmm0, xmm4
4728 pmaddubsw xmm1, xmm4
4729 phaddw xmm0, xmm1
4730 paddw xmm0, xmm5 // Add .5 for rounding.
4731 psrlw xmm0, 7
4732 packuswb xmm0, xmm0 // 8 G bytes
4733 movdqu xmm2, [eax] // A
4734 movdqu xmm3, [eax + 16]
4735 lea eax, [eax + 32]
4736 psrld xmm2, 24
4737 psrld xmm3, 24
4738 packuswb xmm2, xmm3
4739 packuswb xmm2, xmm2 // 8 A bytes
4740 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
4741 punpcklbw xmm0, xmm0 // 8 GG words
4742 punpcklbw xmm3, xmm2 // 8 GA words
4743 movdqa xmm1, xmm0
4744 punpcklwd xmm0, xmm3 // GGGA first 4
4745 punpckhwd xmm1, xmm3 // GGGA next 4
4746 movdqu [edx], xmm0
4747 movdqu [edx + 16], xmm1
4748 lea edx, [edx + 32]
4749 sub ecx, 8
4750 jg convertloop
4751 ret
4752 }
4753 }
4754 #endif // HAS_ARGBGRAYROW_SSSE3
4755
4756 #ifdef HAS_ARGBSEPIAROW_SSSE3
4757 // b = (r * 35 + g * 68 + b * 17) >> 7
4758 // g = (r * 45 + g * 88 + b * 22) >> 7
4759 // r = (r * 50 + g * 98 + b * 24) >> 7
4760 // Constant for ARGB color to sepia tone.
4761 static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
4762 17, 68, 35, 0, 17, 68, 35, 0};
4763
4764 static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
4765 22, 88, 45, 0, 22, 88, 45, 0};
4766
4767 static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
4768 24, 98, 50, 0, 24, 98, 50, 0};
4769
4770 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
4771 __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
4772 __asm {
4773 mov eax, [esp + 4] /* dst_argb */
4774 mov ecx, [esp + 8] /* width */
4775 movdqa xmm2, xmmword ptr kARGBToSepiaB
4776 movdqa xmm3, xmmword ptr kARGBToSepiaG
4777 movdqa xmm4, xmmword ptr kARGBToSepiaR
4778
4779 convertloop:
4780 movdqu xmm0, [eax] // B
4781 movdqu xmm6, [eax + 16]
4782 pmaddubsw xmm0, xmm2
4783 pmaddubsw xmm6, xmm2
4784 phaddw xmm0, xmm6
4785 psrlw xmm0, 7
4786 packuswb xmm0, xmm0 // 8 B values
4787 movdqu xmm5, [eax] // G
4788 movdqu xmm1, [eax + 16]
4789 pmaddubsw xmm5, xmm3
4790 pmaddubsw xmm1, xmm3
4791 phaddw xmm5, xmm1
4792 psrlw xmm5, 7
4793 packuswb xmm5, xmm5 // 8 G values
4794 punpcklbw xmm0, xmm5 // 8 BG values
4795 movdqu xmm5, [eax] // R
4796 movdqu xmm1, [eax + 16]
4797 pmaddubsw xmm5, xmm4
4798 pmaddubsw xmm1, xmm4
4799 phaddw xmm5, xmm1
4800 psrlw xmm5, 7
4801 packuswb xmm5, xmm5 // 8 R values
4802 movdqu xmm6, [eax] // A
4803 movdqu xmm1, [eax + 16]
4804 psrld xmm6, 24
4805 psrld xmm1, 24
4806 packuswb xmm6, xmm1
4807 packuswb xmm6, xmm6 // 8 A values
4808 punpcklbw xmm5, xmm6 // 8 RA values
4809 movdqa xmm1, xmm0 // Weave BG, RA together
4810 punpcklwd xmm0, xmm5 // BGRA first 4
4811 punpckhwd xmm1, xmm5 // BGRA next 4
4812 movdqu [eax], xmm0
4813 movdqu [eax + 16], xmm1
4814 lea eax, [eax + 32]
4815 sub ecx, 8
4816 jg convertloop
4817 ret
4818 }
4819 }
4820 #endif // HAS_ARGBSEPIAROW_SSSE3
4821
4822 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4823 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
4824 // Same as Sepia except matrix is provided.
4825 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
4826 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
4827 __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
4828 uint8_t* dst_argb,
4829 const int8_t* matrix_argb,
4830 int width) {
4831 __asm {
4832 mov eax, [esp + 4] /* src_argb */
4833 mov edx, [esp + 8] /* dst_argb */
4834 mov ecx, [esp + 12] /* matrix_argb */
4835 movdqu xmm5, [ecx]
4836 pshufd xmm2, xmm5, 0x00
4837 pshufd xmm3, xmm5, 0x55
4838 pshufd xmm4, xmm5, 0xaa
4839 pshufd xmm5, xmm5, 0xff
4840 mov ecx, [esp + 16] /* width */
4841
4842 convertloop:
4843 movdqu xmm0, [eax] // B
4844 movdqu xmm7, [eax + 16]
4845 pmaddubsw xmm0, xmm2
4846 pmaddubsw xmm7, xmm2
4847 movdqu xmm6, [eax] // G
4848 movdqu xmm1, [eax + 16]
4849 pmaddubsw xmm6, xmm3
4850 pmaddubsw xmm1, xmm3
4851 phaddsw xmm0, xmm7 // B
4852 phaddsw xmm6, xmm1 // G
4853 psraw xmm0, 6 // B
4854 psraw xmm6, 6 // G
4855 packuswb xmm0, xmm0 // 8 B values
4856 packuswb xmm6, xmm6 // 8 G values
4857 punpcklbw xmm0, xmm6 // 8 BG values
4858 movdqu xmm1, [eax] // R
4859 movdqu xmm7, [eax + 16]
4860 pmaddubsw xmm1, xmm4
4861 pmaddubsw xmm7, xmm4
4862 phaddsw xmm1, xmm7 // R
4863 movdqu xmm6, [eax] // A
4864 movdqu xmm7, [eax + 16]
4865 pmaddubsw xmm6, xmm5
4866 pmaddubsw xmm7, xmm5
4867 phaddsw xmm6, xmm7 // A
4868 psraw xmm1, 6 // R
4869 psraw xmm6, 6 // A
4870 packuswb xmm1, xmm1 // 8 R values
4871 packuswb xmm6, xmm6 // 8 A values
4872 punpcklbw xmm1, xmm6 // 8 RA values
4873 movdqa xmm6, xmm0 // Weave BG, RA together
4874 punpcklwd xmm0, xmm1 // BGRA first 4
4875 punpckhwd xmm6, xmm1 // BGRA next 4
4876 movdqu [edx], xmm0
4877 movdqu [edx + 16], xmm6
4878 lea eax, [eax + 32]
4879 lea edx, [edx + 32]
4880 sub ecx, 8
4881 jg convertloop
4882 ret
4883 }
4884 }
4885 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
4886
4887 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4888 // Quantize 4 ARGB pixels (16 bytes).
4889 __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
4890 int scale,
4891 int interval_size,
4892 int interval_offset,
4893 int width) {
4894 __asm {
4895 mov eax, [esp + 4] /* dst_argb */
4896 movd xmm2, [esp + 8] /* scale */
4897 movd xmm3, [esp + 12] /* interval_size */
4898 movd xmm4, [esp + 16] /* interval_offset */
4899 mov ecx, [esp + 20] /* width */
4900 pshuflw xmm2, xmm2, 040h
4901 pshufd xmm2, xmm2, 044h
4902 pshuflw xmm3, xmm3, 040h
4903 pshufd xmm3, xmm3, 044h
4904 pshuflw xmm4, xmm4, 040h
4905 pshufd xmm4, xmm4, 044h
4906 pxor xmm5, xmm5 // constant 0
4907 pcmpeqb xmm6, xmm6 // generate mask 0xff000000
4908 pslld xmm6, 24
4909
4910 convertloop:
4911 movdqu xmm0, [eax] // read 4 pixels
4912 punpcklbw xmm0, xmm5 // first 2 pixels
4913 pmulhuw xmm0, xmm2 // pixel * scale >> 16
4914 movdqu xmm1, [eax] // read 4 pixels
4915 punpckhbw xmm1, xmm5 // next 2 pixels
4916 pmulhuw xmm1, xmm2
4917 pmullw xmm0, xmm3 // * interval_size
4918 movdqu xmm7, [eax] // read 4 pixels
4919 pmullw xmm1, xmm3
4920 pand xmm7, xmm6 // mask alpha
4921 paddw xmm0, xmm4 // + interval_size / 2
4922 paddw xmm1, xmm4
4923 packuswb xmm0, xmm1
4924 por xmm0, xmm7
4925 movdqu [eax], xmm0
4926 lea eax, [eax + 16]
4927 sub ecx, 4
4928 jg convertloop
4929 ret
4930 }
4931 }
4932 #endif // HAS_ARGBQUANTIZEROW_SSE2
4933
4934 #ifdef HAS_ARGBSHADEROW_SSE2
4935 // Shade 4 pixels at a time by specified value.
4936 __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
4937 uint8_t* dst_argb,
4938 int width,
4939 uint32_t value) {
4940 __asm {
4941 mov eax, [esp + 4] // src_argb
4942 mov edx, [esp + 8] // dst_argb
4943 mov ecx, [esp + 12] // width
4944 movd xmm2, [esp + 16] // value
4945 punpcklbw xmm2, xmm2
4946 punpcklqdq xmm2, xmm2
4947
4948 convertloop:
4949 movdqu xmm0, [eax] // read 4 pixels
4950 lea eax, [eax + 16]
4951 movdqa xmm1, xmm0
4952 punpcklbw xmm0, xmm0 // first 2
4953 punpckhbw xmm1, xmm1 // next 2
4954 pmulhuw xmm0, xmm2 // argb * value
4955 pmulhuw xmm1, xmm2 // argb * value
4956 psrlw xmm0, 8
4957 psrlw xmm1, 8
4958 packuswb xmm0, xmm1
4959 movdqu [edx], xmm0
4960 lea edx, [edx + 16]
4961 sub ecx, 4
4962 jg convertloop
4963
4964 ret
4965 }
4966 }
4967 #endif // HAS_ARGBSHADEROW_SSE2
4968
4969 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4970 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
4971 __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
4972 const uint8_t* src_argb1,
4973 uint8_t* dst_argb,
4974 int width) {
4975 __asm {
4976 push esi
4977 mov eax, [esp + 4 + 4] // src_argb
4978 mov esi, [esp + 4 + 8] // src_argb1
4979 mov edx, [esp + 4 + 12] // dst_argb
4980 mov ecx, [esp + 4 + 16] // width
4981 pxor xmm5, xmm5 // constant 0
4982
4983 convertloop:
4984 movdqu xmm0, [eax] // read 4 pixels from src_argb
4985 movdqu xmm2, [esi] // read 4 pixels from src_argb1
4986 movdqu xmm1, xmm0
4987 movdqu xmm3, xmm2
4988 punpcklbw xmm0, xmm0 // first 2
4989 punpckhbw xmm1, xmm1 // next 2
4990 punpcklbw xmm2, xmm5 // first 2
4991 punpckhbw xmm3, xmm5 // next 2
4992 pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2
4993 pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2
4994 lea eax, [eax + 16]
4995 lea esi, [esi + 16]
4996 packuswb xmm0, xmm1
4997 movdqu [edx], xmm0
4998 lea edx, [edx + 16]
4999 sub ecx, 4
5000 jg convertloop
5001
5002 pop esi
5003 ret
5004 }
5005 }
5006 #endif // HAS_ARGBMULTIPLYROW_SSE2
5007
5008 #ifdef HAS_ARGBADDROW_SSE2
5009 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
5010 // TODO(fbarchard): Port this to posix, neon and other math functions.
5011 __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
5012 const uint8_t* src_argb1,
5013 uint8_t* dst_argb,
5014 int width) {
5015 __asm {
5016 push esi
5017 mov eax, [esp + 4 + 4] // src_argb
5018 mov esi, [esp + 4 + 8] // src_argb1
5019 mov edx, [esp + 4 + 12] // dst_argb
5020 mov ecx, [esp + 4 + 16] // width
5021
5022 sub ecx, 4
5023 jl convertloop49
5024
5025 convertloop4:
5026 movdqu xmm0, [eax] // read 4 pixels from src_argb
5027 lea eax, [eax + 16]
5028 movdqu xmm1, [esi] // read 4 pixels from src_argb1
5029 lea esi, [esi + 16]
5030 paddusb xmm0, xmm1 // src_argb + src_argb1
5031 movdqu [edx], xmm0
5032 lea edx, [edx + 16]
5033 sub ecx, 4
5034 jge convertloop4
5035
5036 convertloop49:
5037 add ecx, 4 - 1
5038 jl convertloop19
5039
5040 convertloop1:
5041 movd xmm0, [eax] // read 1 pixels from src_argb
5042 lea eax, [eax + 4]
5043 movd xmm1, [esi] // read 1 pixels from src_argb1
5044 lea esi, [esi + 4]
5045 paddusb xmm0, xmm1 // src_argb + src_argb1
5046 movd [edx], xmm0
5047 lea edx, [edx + 4]
5048 sub ecx, 1
5049 jge convertloop1
5050
5051 convertloop19:
5052 pop esi
5053 ret
5054 }
5055 }
5056 #endif // HAS_ARGBADDROW_SSE2
5057
5058 #ifdef HAS_ARGBSUBTRACTROW_SSE2
5059 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
5060 __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
5061 const uint8_t* src_argb1,
5062 uint8_t* dst_argb,
5063 int width) {
5064 __asm {
5065 push esi
5066 mov eax, [esp + 4 + 4] // src_argb
5067 mov esi, [esp + 4 + 8] // src_argb1
5068 mov edx, [esp + 4 + 12] // dst_argb
5069 mov ecx, [esp + 4 + 16] // width
5070
5071 convertloop:
5072 movdqu xmm0, [eax] // read 4 pixels from src_argb
5073 lea eax, [eax + 16]
5074 movdqu xmm1, [esi] // read 4 pixels from src_argb1
5075 lea esi, [esi + 16]
5076 psubusb xmm0, xmm1 // src_argb - src_argb1
5077 movdqu [edx], xmm0
5078 lea edx, [edx + 16]
5079 sub ecx, 4
5080 jg convertloop
5081
5082 pop esi
5083 ret
5084 }
5085 }
5086 #endif // HAS_ARGBSUBTRACTROW_SSE2
5087
5088 #ifdef HAS_ARGBMULTIPLYROW_AVX2
5089 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
5090 __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
5091 const uint8_t* src_argb1,
5092 uint8_t* dst_argb,
5093 int width) {
5094 __asm {
5095 push esi
5096 mov eax, [esp + 4 + 4] // src_argb
5097 mov esi, [esp + 4 + 8] // src_argb1
5098 mov edx, [esp + 4 + 12] // dst_argb
5099 mov ecx, [esp + 4 + 16] // width
5100 vpxor ymm5, ymm5, ymm5 // constant 0
5101
5102 convertloop:
5103 vmovdqu ymm1, [eax] // read 8 pixels from src_argb
5104 lea eax, [eax + 32]
5105 vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
5106 lea esi, [esi + 32]
5107 vpunpcklbw ymm0, ymm1, ymm1 // low 4
5108 vpunpckhbw ymm1, ymm1, ymm1 // high 4
5109 vpunpcklbw ymm2, ymm3, ymm5 // low 4
5110 vpunpckhbw ymm3, ymm3, ymm5 // high 4
5111 vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4
5112 vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4
5113 vpackuswb ymm0, ymm0, ymm1
5114 vmovdqu [edx], ymm0
5115 lea edx, [edx + 32]
5116 sub ecx, 8
5117 jg convertloop
5118
5119 pop esi
5120 vzeroupper
5121 ret
5122 }
5123 }
5124 #endif // HAS_ARGBMULTIPLYROW_AVX2
5125
5126 #ifdef HAS_ARGBADDROW_AVX2
5127 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
5128 __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb,
5129 const uint8_t* src_argb1,
5130 uint8_t* dst_argb,
5131 int width) {
5132 __asm {
5133 push esi
5134 mov eax, [esp + 4 + 4] // src_argb
5135 mov esi, [esp + 4 + 8] // src_argb1
5136 mov edx, [esp + 4 + 12] // dst_argb
5137 mov ecx, [esp + 4 + 16] // width
5138
5139 convertloop:
5140 vmovdqu ymm0, [eax] // read 8 pixels from src_argb
5141 lea eax, [eax + 32]
5142 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
5143 lea esi, [esi + 32]
5144 vmovdqu [edx], ymm0
5145 lea edx, [edx + 32]
5146 sub ecx, 8
5147 jg convertloop
5148
5149 pop esi
5150 vzeroupper
5151 ret
5152 }
5153 }
5154 #endif // HAS_ARGBADDROW_AVX2
5155
5156 #ifdef HAS_ARGBSUBTRACTROW_AVX2
5157 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
5158 __declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
5159 const uint8_t* src_argb1,
5160 uint8_t* dst_argb,
5161 int width) {
5162 __asm {
5163 push esi
5164 mov eax, [esp + 4 + 4] // src_argb
5165 mov esi, [esp + 4 + 8] // src_argb1
5166 mov edx, [esp + 4 + 12] // dst_argb
5167 mov ecx, [esp + 4 + 16] // width
5168
5169 convertloop:
5170 vmovdqu ymm0, [eax] // read 8 pixels from src_argb
5171 lea eax, [eax + 32]
5172 vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1
5173 lea esi, [esi + 32]
5174 vmovdqu [edx], ymm0
5175 lea edx, [edx + 32]
5176 sub ecx, 8
5177 jg convertloop
5178
5179 pop esi
5180 vzeroupper
5181 ret
5182 }
5183 }
5184 #endif // HAS_ARGBSUBTRACTROW_AVX2
5185
5186 #ifdef HAS_SOBELXROW_SSE2
5187 // SobelX as a matrix is
5188 // -1 0 1
5189 // -2 0 2
5190 // -1 0 1
5191 __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
5192 const uint8_t* src_y1,
5193 const uint8_t* src_y2,
5194 uint8_t* dst_sobelx,
5195 int width) {
5196 __asm {
5197 push esi
5198 push edi
5199 mov eax, [esp + 8 + 4] // src_y0
5200 mov esi, [esp + 8 + 8] // src_y1
5201 mov edi, [esp + 8 + 12] // src_y2
5202 mov edx, [esp + 8 + 16] // dst_sobelx
5203 mov ecx, [esp + 8 + 20] // width
5204 sub esi, eax
5205 sub edi, eax
5206 sub edx, eax
5207 pxor xmm5, xmm5 // constant 0
5208
5209 convertloop:
5210 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
5211 movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
5212 punpcklbw xmm0, xmm5
5213 punpcklbw xmm1, xmm5
5214 psubw xmm0, xmm1
5215 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5216 movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5217 punpcklbw xmm1, xmm5
5218 punpcklbw xmm2, xmm5
5219 psubw xmm1, xmm2
5220 movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
5221 movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
5222 punpcklbw xmm2, xmm5
5223 punpcklbw xmm3, xmm5
5224 psubw xmm2, xmm3
5225 paddw xmm0, xmm2
5226 paddw xmm0, xmm1
5227 paddw xmm0, xmm1
5228 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5229 psubw xmm1, xmm0
5230 pmaxsw xmm0, xmm1
5231 packuswb xmm0, xmm0
5232 movq qword ptr [eax + edx], xmm0
5233 lea eax, [eax + 8]
5234 sub ecx, 8
5235 jg convertloop
5236
5237 pop edi
5238 pop esi
5239 ret
5240 }
5241 }
5242 #endif // HAS_SOBELXROW_SSE2
5243
5244 #ifdef HAS_SOBELYROW_SSE2
5245 // SobelY as a matrix is
5246 // -1 -2 -1
5247 // 0 0 0
5248 // 1 2 1
5249 __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
5250 const uint8_t* src_y1,
5251 uint8_t* dst_sobely,
5252 int width) {
5253 __asm {
5254 push esi
5255 mov eax, [esp + 4 + 4] // src_y0
5256 mov esi, [esp + 4 + 8] // src_y1
5257 mov edx, [esp + 4 + 12] // dst_sobely
5258 mov ecx, [esp + 4 + 16] // width
5259 sub esi, eax
5260 sub edx, eax
5261 pxor xmm5, xmm5 // constant 0
5262
5263 convertloop:
5264 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
5265 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5266 punpcklbw xmm0, xmm5
5267 punpcklbw xmm1, xmm5
5268 psubw xmm0, xmm1
5269 movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
5270 movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
5271 punpcklbw xmm1, xmm5
5272 punpcklbw xmm2, xmm5
5273 psubw xmm1, xmm2
5274 movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
5275 movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5276 punpcklbw xmm2, xmm5
5277 punpcklbw xmm3, xmm5
5278 psubw xmm2, xmm3
5279 paddw xmm0, xmm2
5280 paddw xmm0, xmm1
5281 paddw xmm0, xmm1
5282 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5283 psubw xmm1, xmm0
5284 pmaxsw xmm0, xmm1
5285 packuswb xmm0, xmm0
5286 movq qword ptr [eax + edx], xmm0
5287 lea eax, [eax + 8]
5288 sub ecx, 8
5289 jg convertloop
5290
5291 pop esi
5292 ret
5293 }
5294 }
5295 #endif // HAS_SOBELYROW_SSE2
5296
5297 #ifdef HAS_SOBELROW_SSE2
5298 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5299 // A = 255
5300 // R = Sobel
5301 // G = Sobel
5302 // B = Sobel
5303 __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
5304 const uint8_t* src_sobely,
5305 uint8_t* dst_argb,
5306 int width) {
5307 __asm {
5308 push esi
5309 mov eax, [esp + 4 + 4] // src_sobelx
5310 mov esi, [esp + 4 + 8] // src_sobely
5311 mov edx, [esp + 4 + 12] // dst_argb
5312 mov ecx, [esp + 4 + 16] // width
5313 sub esi, eax
5314 pcmpeqb xmm5, xmm5 // alpha 255
5315 pslld xmm5, 24 // 0xff000000
5316
5317 convertloop:
5318 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5319 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5320 lea eax, [eax + 16]
5321 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5322 movdqa xmm2, xmm0 // GG
5323 punpcklbw xmm2, xmm0 // First 8
5324 punpckhbw xmm0, xmm0 // Next 8
5325 movdqa xmm1, xmm2 // GGGG
5326 punpcklwd xmm1, xmm2 // First 4
5327 punpckhwd xmm2, xmm2 // Next 4
5328 por xmm1, xmm5 // GGGA
5329 por xmm2, xmm5
5330 movdqa xmm3, xmm0 // GGGG
5331 punpcklwd xmm3, xmm0 // Next 4
5332 punpckhwd xmm0, xmm0 // Last 4
5333 por xmm3, xmm5 // GGGA
5334 por xmm0, xmm5
5335 movdqu [edx], xmm1
5336 movdqu [edx + 16], xmm2
5337 movdqu [edx + 32], xmm3
5338 movdqu [edx + 48], xmm0
5339 lea edx, [edx + 64]
5340 sub ecx, 16
5341 jg convertloop
5342
5343 pop esi
5344 ret
5345 }
5346 }
5347 #endif // HAS_SOBELROW_SSE2
5348
5349 #ifdef HAS_SOBELTOPLANEROW_SSE2
5350 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
5351 __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
5352 const uint8_t* src_sobely,
5353 uint8_t* dst_y,
5354 int width) {
5355 __asm {
5356 push esi
5357 mov eax, [esp + 4 + 4] // src_sobelx
5358 mov esi, [esp + 4 + 8] // src_sobely
5359 mov edx, [esp + 4 + 12] // dst_argb
5360 mov ecx, [esp + 4 + 16] // width
5361 sub esi, eax
5362
5363 convertloop:
5364 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5365 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5366 lea eax, [eax + 16]
5367 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5368 movdqu [edx], xmm0
5369 lea edx, [edx + 16]
5370 sub ecx, 16
5371 jg convertloop
5372
5373 pop esi
5374 ret
5375 }
5376 }
5377 #endif // HAS_SOBELTOPLANEROW_SSE2
5378
5379 #ifdef HAS_SOBELXYROW_SSE2
5380 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5381 // A = 255
5382 // R = Sobel X
5383 // G = Sobel
5384 // B = Sobel Y
5385 __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
5386 const uint8_t* src_sobely,
5387 uint8_t* dst_argb,
5388 int width) {
5389 __asm {
5390 push esi
5391 mov eax, [esp + 4 + 4] // src_sobelx
5392 mov esi, [esp + 4 + 8] // src_sobely
5393 mov edx, [esp + 4 + 12] // dst_argb
5394 mov ecx, [esp + 4 + 16] // width
5395 sub esi, eax
5396 pcmpeqb xmm5, xmm5 // alpha 255
5397
5398 convertloop:
5399 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5400 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5401 lea eax, [eax + 16]
5402 movdqa xmm2, xmm0
5403 paddusb xmm2, xmm1 // sobel = sobelx + sobely
5404 movdqa xmm3, xmm0 // XA
5405 punpcklbw xmm3, xmm5
5406 punpckhbw xmm0, xmm5
5407 movdqa xmm4, xmm1 // YS
5408 punpcklbw xmm4, xmm2
5409 punpckhbw xmm1, xmm2
5410 movdqa xmm6, xmm4 // YSXA
5411 punpcklwd xmm6, xmm3 // First 4
5412 punpckhwd xmm4, xmm3 // Next 4
5413 movdqa xmm7, xmm1 // YSXA
5414 punpcklwd xmm7, xmm0 // Next 4
5415 punpckhwd xmm1, xmm0 // Last 4
5416 movdqu [edx], xmm6
5417 movdqu [edx + 16], xmm4
5418 movdqu [edx + 32], xmm7
5419 movdqu [edx + 48], xmm1
5420 lea edx, [edx + 64]
5421 sub ecx, 16
5422 jg convertloop
5423
5424 pop esi
5425 ret
5426 }
5427 }
5428 #endif // HAS_SOBELXYROW_SSE2
5429
5430 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5431 // Consider float CumulativeSum.
5432 // Consider calling CumulativeSum one row at time as needed.
5433 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5434 // Convert cumulative sum for an area to an average for 1 pixel.
5435 // topleft is pointer to top left of CumulativeSum buffer for area.
5436 // botleft is pointer to bottom left of CumulativeSum buffer.
5437 // width is offset from left to right of area in CumulativeSum buffer measured
5438 // in number of ints.
5439 // area is the number of pixels in the area being averaged.
5440 // dst points to pixel to store result to.
5441 // count is number of averaged pixels to produce.
5442 // Does 4 pixels at a time.
5443 // This function requires alignment on accumulation buffer pointers.
5444 void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
5445 const int32_t* botleft,
5446 int width,
5447 int area,
5448 uint8_t* dst,
5449 int count) {
5450 __asm {
5451 mov eax, topleft // eax topleft
5452 mov esi, botleft // esi botleft
5453 mov edx, width
5454 movd xmm5, area
5455 mov edi, dst
5456 mov ecx, count
5457 cvtdq2ps xmm5, xmm5
5458 rcpss xmm4, xmm5 // 1.0f / area
5459 pshufd xmm4, xmm4, 0
5460 sub ecx, 4
5461 jl l4b
5462
5463 cmp area, 128 // 128 pixels will not overflow 15 bits.
5464 ja l4
5465
5466 pshufd xmm5, xmm5, 0 // area
5467 pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
5468 psrld xmm6, 16
5469 cvtdq2ps xmm6, xmm6
5470 addps xmm5, xmm6 // (65536.0 + area - 1)
5471 mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
5472 cvtps2dq xmm5, xmm5 // 0.16 fixed point
5473 packssdw xmm5, xmm5 // 16 bit shorts
5474
5475 // 4 pixel loop small blocks.
5476 s4:
5477 // top left
5478 movdqu xmm0, [eax]
5479 movdqu xmm1, [eax + 16]
5480 movdqu xmm2, [eax + 32]
5481 movdqu xmm3, [eax + 48]
5482
5483 // - top right
5484 psubd xmm0, [eax + edx * 4]
5485 psubd xmm1, [eax + edx * 4 + 16]
5486 psubd xmm2, [eax + edx * 4 + 32]
5487 psubd xmm3, [eax + edx * 4 + 48]
5488 lea eax, [eax + 64]
5489
5490 // - bottom left
5491 psubd xmm0, [esi]
5492 psubd xmm1, [esi + 16]
5493 psubd xmm2, [esi + 32]
5494 psubd xmm3, [esi + 48]
5495
5496 // + bottom right
5497 paddd xmm0, [esi + edx * 4]
5498 paddd xmm1, [esi + edx * 4 + 16]
5499 paddd xmm2, [esi + edx * 4 + 32]
5500 paddd xmm3, [esi + edx * 4 + 48]
5501 lea esi, [esi + 64]
5502
5503 packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
5504 packssdw xmm2, xmm3
5505
5506 pmulhuw xmm0, xmm5
5507 pmulhuw xmm2, xmm5
5508
5509 packuswb xmm0, xmm2
5510 movdqu [edi], xmm0
5511 lea edi, [edi + 16]
5512 sub ecx, 4
5513 jge s4
5514
5515 jmp l4b
5516
5517 // 4 pixel loop
5518 l4:
5519 // top left
5520 movdqu xmm0, [eax]
5521 movdqu xmm1, [eax + 16]
5522 movdqu xmm2, [eax + 32]
5523 movdqu xmm3, [eax + 48]
5524
5525 // - top right
5526 psubd xmm0, [eax + edx * 4]
5527 psubd xmm1, [eax + edx * 4 + 16]
5528 psubd xmm2, [eax + edx * 4 + 32]
5529 psubd xmm3, [eax + edx * 4 + 48]
5530 lea eax, [eax + 64]
5531
5532 // - bottom left
5533 psubd xmm0, [esi]
5534 psubd xmm1, [esi + 16]
5535 psubd xmm2, [esi + 32]
5536 psubd xmm3, [esi + 48]
5537
5538 // + bottom right
5539 paddd xmm0, [esi + edx * 4]
5540 paddd xmm1, [esi + edx * 4 + 16]
5541 paddd xmm2, [esi + edx * 4 + 32]
5542 paddd xmm3, [esi + edx * 4 + 48]
5543 lea esi, [esi + 64]
5544
5545 cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
5546 cvtdq2ps xmm1, xmm1
5547 mulps xmm0, xmm4
5548 mulps xmm1, xmm4
5549 cvtdq2ps xmm2, xmm2
5550 cvtdq2ps xmm3, xmm3
5551 mulps xmm2, xmm4
5552 mulps xmm3, xmm4
5553 cvtps2dq xmm0, xmm0
5554 cvtps2dq xmm1, xmm1
5555 cvtps2dq xmm2, xmm2
5556 cvtps2dq xmm3, xmm3
5557 packssdw xmm0, xmm1
5558 packssdw xmm2, xmm3
5559 packuswb xmm0, xmm2
5560 movdqu [edi], xmm0
5561 lea edi, [edi + 16]
5562 sub ecx, 4
5563 jge l4
5564
5565 l4b:
5566 add ecx, 4 - 1
5567 jl l1b
5568
5569 // 1 pixel loop
5570 l1:
5571 movdqu xmm0, [eax]
5572 psubd xmm0, [eax + edx * 4]
5573 lea eax, [eax + 16]
5574 psubd xmm0, [esi]
5575 paddd xmm0, [esi + edx * 4]
5576 lea esi, [esi + 16]
5577 cvtdq2ps xmm0, xmm0
5578 mulps xmm0, xmm4
5579 cvtps2dq xmm0, xmm0
5580 packssdw xmm0, xmm0
5581 packuswb xmm0, xmm0
5582 movd dword ptr [edi], xmm0
5583 lea edi, [edi + 4]
5584 sub ecx, 1
5585 jge l1
5586 l1b:
5587 }
5588 }
5589 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5590
5591 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5592 // Creates a table of cumulative sums where each value is a sum of all values
5593 // above and to the left of the value.
5594 void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
5595 int32_t* cumsum,
5596 const int32_t* previous_cumsum,
5597 int width) {
5598 __asm {
5599 mov eax, row
5600 mov edx, cumsum
5601 mov esi, previous_cumsum
5602 mov ecx, width
5603 pxor xmm0, xmm0
5604 pxor xmm1, xmm1
5605
5606 sub ecx, 4
5607 jl l4b
5608 test edx, 15
5609 jne l4b
5610
5611 // 4 pixel loop
5612 l4:
5613 movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
5614 lea eax, [eax + 16]
5615 movdqa xmm4, xmm2
5616
5617 punpcklbw xmm2, xmm1
5618 movdqa xmm3, xmm2
5619 punpcklwd xmm2, xmm1
5620 punpckhwd xmm3, xmm1
5621
5622 punpckhbw xmm4, xmm1
5623 movdqa xmm5, xmm4
5624 punpcklwd xmm4, xmm1
5625 punpckhwd xmm5, xmm1
5626
5627 paddd xmm0, xmm2
5628 movdqu xmm2, [esi] // previous row above.
5629 paddd xmm2, xmm0
5630
5631 paddd xmm0, xmm3
5632 movdqu xmm3, [esi + 16]
5633 paddd xmm3, xmm0
5634
5635 paddd xmm0, xmm4
5636 movdqu xmm4, [esi + 32]
5637 paddd xmm4, xmm0
5638
5639 paddd xmm0, xmm5
5640 movdqu xmm5, [esi + 48]
5641 lea esi, [esi + 64]
5642 paddd xmm5, xmm0
5643
5644 movdqu [edx], xmm2
5645 movdqu [edx + 16], xmm3
5646 movdqu [edx + 32], xmm4
5647 movdqu [edx + 48], xmm5
5648
5649 lea edx, [edx + 64]
5650 sub ecx, 4
5651 jge l4
5652
5653 l4b:
5654 add ecx, 4 - 1
5655 jl l1b
5656
5657 // 1 pixel loop
5658 l1:
5659 movd xmm2, dword ptr [eax] // 1 argb pixel
5660 lea eax, [eax + 4]
5661 punpcklbw xmm2, xmm1
5662 punpcklwd xmm2, xmm1
5663 paddd xmm0, xmm2
5664 movdqu xmm2, [esi]
5665 lea esi, [esi + 16]
5666 paddd xmm2, xmm0
5667 movdqu [edx], xmm2
5668 lea edx, [edx + 16]
5669 sub ecx, 1
5670 jge l1
5671
5672 l1b:
5673 }
5674 }
5675 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
5676
5677 #ifdef HAS_ARGBAFFINEROW_SSE2
5678 // Copy ARGB pixels from source image with slope to a row of destination.
5679 __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
5680 int src_argb_stride,
5681 uint8_t* dst_argb,
5682 const float* uv_dudv,
5683 int width) {
5684 __asm {
5685 push esi
5686 push edi
5687 mov eax, [esp + 12] // src_argb
5688 mov esi, [esp + 16] // stride
5689 mov edx, [esp + 20] // dst_argb
5690 mov ecx, [esp + 24] // pointer to uv_dudv
5691 movq xmm2, qword ptr [ecx] // uv
5692 movq xmm7, qword ptr [ecx + 8] // dudv
5693 mov ecx, [esp + 28] // width
5694 shl esi, 16 // 4, stride
5695 add esi, 4
5696 movd xmm5, esi
5697 sub ecx, 4
5698 jl l4b
5699
5700 // setup for 4 pixel loop
5701 pshufd xmm7, xmm7, 0x44 // dup dudv
5702 pshufd xmm5, xmm5, 0 // dup 4, stride
5703 movdqa xmm0, xmm2 // x0, y0, x1, y1
5704 addps xmm0, xmm7
5705 movlhps xmm2, xmm0
5706 movdqa xmm4, xmm7
5707 addps xmm4, xmm4 // dudv *= 2
5708 movdqa xmm3, xmm2 // x2, y2, x3, y3
5709 addps xmm3, xmm4
5710 addps xmm4, xmm4 // dudv *= 4
5711
5712 // 4 pixel loop
5713 l4:
5714 cvttps2dq xmm0, xmm2 // x, y float to int first 2
5715 cvttps2dq xmm1, xmm3 // x, y float to int next 2
5716 packssdw xmm0, xmm1 // x, y as 8 shorts
5717 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
5718 movd esi, xmm0
5719 pshufd xmm0, xmm0, 0x39 // shift right
5720 movd edi, xmm0
5721 pshufd xmm0, xmm0, 0x39 // shift right
5722 movd xmm1, [eax + esi] // read pixel 0
5723 movd xmm6, [eax + edi] // read pixel 1
5724 punpckldq xmm1, xmm6 // combine pixel 0 and 1
5725 addps xmm2, xmm4 // x, y += dx, dy first 2
5726 movq qword ptr [edx], xmm1
5727 movd esi, xmm0
5728 pshufd xmm0, xmm0, 0x39 // shift right
5729 movd edi, xmm0
5730 movd xmm6, [eax + esi] // read pixel 2
5731 movd xmm0, [eax + edi] // read pixel 3
5732 punpckldq xmm6, xmm0 // combine pixel 2 and 3
5733 addps xmm3, xmm4 // x, y += dx, dy next 2
5734 movq qword ptr 8[edx], xmm6
5735 lea edx, [edx + 16]
5736 sub ecx, 4
5737 jge l4
5738
5739 l4b:
5740 add ecx, 4 - 1
5741 jl l1b
5742
5743 // 1 pixel loop
5744 l1:
5745 cvttps2dq xmm0, xmm2 // x, y float to int
5746 packssdw xmm0, xmm0 // x, y as shorts
5747 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
5748 addps xmm2, xmm7 // x, y += dx, dy
5749 movd esi, xmm0
5750 movd xmm0, [eax + esi] // copy a pixel
5751 movd [edx], xmm0
5752 lea edx, [edx + 4]
5753 sub ecx, 1
5754 jge l1
5755 l1b:
5756 pop edi
5757 pop esi
5758 ret
5759 }
5760 }
5761 #endif // HAS_ARGBAFFINEROW_SSE2
5762
5763 #ifdef HAS_INTERPOLATEROW_AVX2
5764 // Bilinear filter 32x2 -> 32x1
5765 __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
5766 const uint8_t* src_ptr,
5767 ptrdiff_t src_stride,
5768 int dst_width,
5769 int source_y_fraction) {
5770 __asm {
5771 push esi
5772 push edi
5773 mov edi, [esp + 8 + 4] // dst_ptr
5774 mov esi, [esp + 8 + 8] // src_ptr
5775 mov edx, [esp + 8 + 12] // src_stride
5776 mov ecx, [esp + 8 + 16] // dst_width
5777 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5778 // Dispatch to specialized filters if applicable.
5779 cmp eax, 0
5780 je xloop100 // 0 / 256. Blend 100 / 0.
5781 sub edi, esi
5782 cmp eax, 128
5783 je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
5784
5785 vmovd xmm0, eax // high fraction 0..255
5786 neg eax
5787 add eax, 256
5788 vmovd xmm5, eax // low fraction 256..1
5789 vpunpcklbw xmm5, xmm5, xmm0
5790 vpunpcklwd xmm5, xmm5, xmm5
5791 vbroadcastss ymm5, xmm5
5792
5793 mov eax, 0x80808080 // 128b for bias and rounding.
5794 vmovd xmm4, eax
5795 vbroadcastss ymm4, xmm4
5796
5797 xloop:
5798 vmovdqu ymm0, [esi]
5799 vmovdqu ymm2, [esi + edx]
5800 vpunpckhbw ymm1, ymm0, ymm2 // mutates
5801 vpunpcklbw ymm0, ymm0, ymm2
5802 vpsubb ymm1, ymm1, ymm4 // bias to signed image
5803 vpsubb ymm0, ymm0, ymm4
5804 vpmaddubsw ymm1, ymm5, ymm1
5805 vpmaddubsw ymm0, ymm5, ymm0
5806 vpaddw ymm1, ymm1, ymm4 // unbias and round
5807 vpaddw ymm0, ymm0, ymm4
5808 vpsrlw ymm1, ymm1, 8
5809 vpsrlw ymm0, ymm0, 8
5810 vpackuswb ymm0, ymm0, ymm1 // unmutates
5811 vmovdqu [esi + edi], ymm0
5812 lea esi, [esi + 32]
5813 sub ecx, 32
5814 jg xloop
5815 jmp xloop99
5816
5817 // Blend 50 / 50.
5818 xloop50:
5819 vmovdqu ymm0, [esi]
5820 vpavgb ymm0, ymm0, [esi + edx]
5821 vmovdqu [esi + edi], ymm0
5822 lea esi, [esi + 32]
5823 sub ecx, 32
5824 jg xloop50
5825 jmp xloop99
5826
5827 // Blend 100 / 0 - Copy row unchanged.
5828 xloop100:
5829 rep movsb
5830
5831 xloop99:
5832 pop edi
5833 pop esi
5834 vzeroupper
5835 ret
5836 }
5837 }
5838 #endif // HAS_INTERPOLATEROW_AVX2
5839
5840 // Bilinear filter 16x2 -> 16x1
5841 // TODO(fbarchard): Consider allowing 256 using memcpy.
5842 __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
5843 const uint8_t* src_ptr,
5844 ptrdiff_t src_stride,
5845 int dst_width,
5846 int source_y_fraction) {
5847 __asm {
5848 push esi
5849 push edi
5850
5851 mov edi, [esp + 8 + 4] // dst_ptr
5852 mov esi, [esp + 8 + 8] // src_ptr
5853 mov edx, [esp + 8 + 12] // src_stride
5854 mov ecx, [esp + 8 + 16] // dst_width
5855 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5856 sub edi, esi
5857 // Dispatch to specialized filters if applicable.
5858 cmp eax, 0
5859 je xloop100 // 0 /256. Blend 100 / 0.
5860 cmp eax, 128
5861 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
5862
5863 movd xmm0, eax // high fraction 0..255
5864 neg eax
5865 add eax, 256
5866 movd xmm5, eax // low fraction 255..1
5867 punpcklbw xmm5, xmm0
5868 punpcklwd xmm5, xmm5
5869 pshufd xmm5, xmm5, 0
5870 mov eax, 0x80808080 // 128 for biasing image to signed.
5871 movd xmm4, eax
5872 pshufd xmm4, xmm4, 0x00
5873
5874 xloop:
5875 movdqu xmm0, [esi]
5876 movdqu xmm2, [esi + edx]
5877 movdqu xmm1, xmm0
5878 punpcklbw xmm0, xmm2
5879 punpckhbw xmm1, xmm2
5880 psubb xmm0, xmm4 // bias image by -128
5881 psubb xmm1, xmm4
5882 movdqa xmm2, xmm5
5883 movdqa xmm3, xmm5
5884 pmaddubsw xmm2, xmm0
5885 pmaddubsw xmm3, xmm1
5886 paddw xmm2, xmm4
5887 paddw xmm3, xmm4
5888 psrlw xmm2, 8
5889 psrlw xmm3, 8
5890 packuswb xmm2, xmm3
5891 movdqu [esi + edi], xmm2
5892 lea esi, [esi + 16]
5893 sub ecx, 16
5894 jg xloop
5895 jmp xloop99
5896
5897 // Blend 50 / 50.
5898 xloop50:
5899 movdqu xmm0, [esi]
5900 movdqu xmm1, [esi + edx]
5901 pavgb xmm0, xmm1
5902 movdqu [esi + edi], xmm0
5903 lea esi, [esi + 16]
5904 sub ecx, 16
5905 jg xloop50
5906 jmp xloop99
5907
5908 // Blend 100 / 0 - Copy row unchanged.
5909 xloop100:
5910 movdqu xmm0, [esi]
5911 movdqu [esi + edi], xmm0
5912 lea esi, [esi + 16]
5913 sub ecx, 16
5914 jg xloop100
5915
5916 xloop99:
5917 pop edi
5918 pop esi
5919 ret
5920 }
5921 }
5922
5923 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5924 __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
5925 uint8_t* dst_argb,
5926 const uint8_t* shuffler,
5927 int width) {
5928 __asm {
5929 mov eax, [esp + 4] // src_argb
5930 mov edx, [esp + 8] // dst_argb
5931 mov ecx, [esp + 12] // shuffler
5932 movdqu xmm5, [ecx]
5933 mov ecx, [esp + 16] // width
5934
5935 wloop:
5936 movdqu xmm0, [eax]
5937 movdqu xmm1, [eax + 16]
5938 lea eax, [eax + 32]
5939 pshufb xmm0, xmm5
5940 pshufb xmm1, xmm5
5941 movdqu [edx], xmm0
5942 movdqu [edx + 16], xmm1
5943 lea edx, [edx + 32]
5944 sub ecx, 8
5945 jg wloop
5946 ret
5947 }
5948 }
5949
5950 #ifdef HAS_ARGBSHUFFLEROW_AVX2
5951 __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
5952 uint8_t* dst_argb,
5953 const uint8_t* shuffler,
5954 int width) {
5955 __asm {
5956 mov eax, [esp + 4] // src_argb
5957 mov edx, [esp + 8] // dst_argb
5958 mov ecx, [esp + 12] // shuffler
5959 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
5960 mov ecx, [esp + 16] // width
5961
5962 wloop:
5963 vmovdqu ymm0, [eax]
5964 vmovdqu ymm1, [eax + 32]
5965 lea eax, [eax + 64]
5966 vpshufb ymm0, ymm0, ymm5
5967 vpshufb ymm1, ymm1, ymm5
5968 vmovdqu [edx], ymm0
5969 vmovdqu [edx + 32], ymm1
5970 lea edx, [edx + 64]
5971 sub ecx, 16
5972 jg wloop
5973
5974 vzeroupper
5975 ret
5976 }
5977 }
5978 #endif // HAS_ARGBSHUFFLEROW_AVX2
5979
5980 // YUY2 - Macro-pixel = 2 image pixels
5981 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
5982
5983 // UYVY - Macro-pixel = 2 image pixels
5984 // U0Y0V0Y1
5985
5986 __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
5987 const uint8_t* src_u,
5988 const uint8_t* src_v,
5989 uint8_t* dst_frame,
5990 int width) {
5991 __asm {
5992 push esi
5993 push edi
5994 mov eax, [esp + 8 + 4] // src_y
5995 mov esi, [esp + 8 + 8] // src_u
5996 mov edx, [esp + 8 + 12] // src_v
5997 mov edi, [esp + 8 + 16] // dst_frame
5998 mov ecx, [esp + 8 + 20] // width
5999 sub edx, esi
6000
6001 convertloop:
6002 movq xmm2, qword ptr [esi] // U
6003 movq xmm3, qword ptr [esi + edx] // V
6004 lea esi, [esi + 8]
6005 punpcklbw xmm2, xmm3 // UV
6006 movdqu xmm0, [eax] // Y
6007 lea eax, [eax + 16]
6008 movdqa xmm1, xmm0
6009 punpcklbw xmm0, xmm2 // YUYV
6010 punpckhbw xmm1, xmm2
6011 movdqu [edi], xmm0
6012 movdqu [edi + 16], xmm1
6013 lea edi, [edi + 32]
6014 sub ecx, 16
6015 jg convertloop
6016
6017 pop edi
6018 pop esi
6019 ret
6020 }
6021 }
6022
6023 __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
6024 const uint8_t* src_u,
6025 const uint8_t* src_v,
6026 uint8_t* dst_frame,
6027 int width) {
6028 __asm {
6029 push esi
6030 push edi
6031 mov eax, [esp + 8 + 4] // src_y
6032 mov esi, [esp + 8 + 8] // src_u
6033 mov edx, [esp + 8 + 12] // src_v
6034 mov edi, [esp + 8 + 16] // dst_frame
6035 mov ecx, [esp + 8 + 20] // width
6036 sub edx, esi
6037
6038 convertloop:
6039 movq xmm2, qword ptr [esi] // U
6040 movq xmm3, qword ptr [esi + edx] // V
6041 lea esi, [esi + 8]
6042 punpcklbw xmm2, xmm3 // UV
6043 movdqu xmm0, [eax] // Y
6044 movdqa xmm1, xmm2
6045 lea eax, [eax + 16]
6046 punpcklbw xmm1, xmm0 // UYVY
6047 punpckhbw xmm2, xmm0
6048 movdqu [edi], xmm1
6049 movdqu [edi + 16], xmm2
6050 lea edi, [edi + 32]
6051 sub ecx, 16
6052 jg convertloop
6053
6054 pop edi
6055 pop esi
6056 ret
6057 }
6058 }
6059
6060 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
6061 __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
6062 uint8_t* dst_argb,
6063 const float* poly,
6064 int width) {
6065 __asm {
6066 push esi
6067 mov eax, [esp + 4 + 4] /* src_argb */
6068 mov edx, [esp + 4 + 8] /* dst_argb */
6069 mov esi, [esp + 4 + 12] /* poly */
6070 mov ecx, [esp + 4 + 16] /* width */
6071 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
6072
6073 // 2 pixel loop.
6074 convertloop:
6075 // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
6076 // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
6077 movq xmm0, qword ptr [eax] // BGRABGRA
6078 lea eax, [eax + 8]
6079 punpcklbw xmm0, xmm3
6080 movdqa xmm4, xmm0
6081 punpcklwd xmm0, xmm3 // pixel 0
6082 punpckhwd xmm4, xmm3 // pixel 1
6083 cvtdq2ps xmm0, xmm0 // 4 floats
6084 cvtdq2ps xmm4, xmm4
6085 movdqa xmm1, xmm0 // X
6086 movdqa xmm5, xmm4
6087 mulps xmm0, [esi + 16] // C1 * X
6088 mulps xmm4, [esi + 16]
6089 addps xmm0, [esi] // result = C0 + C1 * X
6090 addps xmm4, [esi]
6091 movdqa xmm2, xmm1
6092 movdqa xmm6, xmm5
6093 mulps xmm2, xmm1 // X * X
6094 mulps xmm6, xmm5
6095 mulps xmm1, xmm2 // X * X * X
6096 mulps xmm5, xmm6
6097 mulps xmm2, [esi + 32] // C2 * X * X
6098 mulps xmm6, [esi + 32]
6099 mulps xmm1, [esi + 48] // C3 * X * X * X
6100 mulps xmm5, [esi + 48]
6101 addps xmm0, xmm2 // result += C2 * X * X
6102 addps xmm4, xmm6
6103 addps xmm0, xmm1 // result += C3 * X * X * X
6104 addps xmm4, xmm5
6105 cvttps2dq xmm0, xmm0
6106 cvttps2dq xmm4, xmm4
6107 packuswb xmm0, xmm4
6108 packuswb xmm0, xmm0
6109 movq qword ptr [edx], xmm0
6110 lea edx, [edx + 8]
6111 sub ecx, 2
6112 jg convertloop
6113 pop esi
6114 ret
6115 }
6116 }
6117 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
6118
6119 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
6120 __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
6121 uint8_t* dst_argb,
6122 const float* poly,
6123 int width) {
6124 __asm {
6125 mov eax, [esp + 4] /* src_argb */
6126 mov edx, [esp + 8] /* dst_argb */
6127 mov ecx, [esp + 12] /* poly */
6128 vbroadcastf128 ymm4, [ecx] // C0
6129 vbroadcastf128 ymm5, [ecx + 16] // C1
6130 vbroadcastf128 ymm6, [ecx + 32] // C2
6131 vbroadcastf128 ymm7, [ecx + 48] // C3
6132 mov ecx, [esp + 16] /* width */
6133
6134 // 2 pixel loop.
6135 convertloop:
6136 vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
6137 lea eax, [eax + 8]
6138 vcvtdq2ps ymm0, ymm0 // X 8 floats
6139 vmulps ymm2, ymm0, ymm0 // X * X
6140 vmulps ymm3, ymm0, ymm7 // C3 * X
6141 vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
6142 vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
6143 vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
6144 vcvttps2dq ymm0, ymm0
6145 vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
6146 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
6147 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
6148 vmovq qword ptr [edx], xmm0
6149 lea edx, [edx + 8]
6150 sub ecx, 2
6151 jg convertloop
6152 vzeroupper
6153 ret
6154 }
6155 }
6156 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
6157
6158 #ifdef HAS_HALFFLOATROW_SSE2
6159 static float kExpBias = 1.9259299444e-34f;
6160 __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
6161 uint16_t* dst,
6162 float scale,
6163 int width) {
6164 __asm {
6165 mov eax, [esp + 4] /* src */
6166 mov edx, [esp + 8] /* dst */
6167 movd xmm4, dword ptr [esp + 12] /* scale */
6168 mov ecx, [esp + 16] /* width */
6169 mulss xmm4, kExpBias
6170 pshufd xmm4, xmm4, 0
6171 pxor xmm5, xmm5
6172 sub edx, eax
6173
6174 // 8 pixel loop.
6175 convertloop:
6176 movdqu xmm2, xmmword ptr [eax] // 8 shorts
6177 add eax, 16
6178 movdqa xmm3, xmm2
6179 punpcklwd xmm2, xmm5
6180 cvtdq2ps xmm2, xmm2 // convert 8 ints to floats
6181 punpckhwd xmm3, xmm5
6182 cvtdq2ps xmm3, xmm3
6183 mulps xmm2, xmm4
6184 mulps xmm3, xmm4
6185 psrld xmm2, 13
6186 psrld xmm3, 13
6187 packssdw xmm2, xmm3
6188 movdqu [eax + edx - 16], xmm2
6189 sub ecx, 8
6190 jg convertloop
6191 ret
6192 }
6193 }
6194 #endif // HAS_HALFFLOATROW_SSE2
6195
6196 #ifdef HAS_HALFFLOATROW_AVX2
6197 __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
6198 uint16_t* dst,
6199 float scale,
6200 int width) {
6201 __asm {
6202 mov eax, [esp + 4] /* src */
6203 mov edx, [esp + 8] /* dst */
6204 movd xmm4, dword ptr [esp + 12] /* scale */
6205 mov ecx, [esp + 16] /* width */
6206
6207 vmulss xmm4, xmm4, kExpBias
6208 vbroadcastss ymm4, xmm4
6209 vpxor ymm5, ymm5, ymm5
6210 sub edx, eax
6211
6212 // 16 pixel loop.
6213 convertloop:
6214 vmovdqu ymm2, [eax] // 16 shorts
6215 add eax, 32
6216 vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
6217 vpunpcklwd ymm2, ymm2, ymm5
6218 vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
6219 vcvtdq2ps ymm2, ymm2
6220 vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
6221 vmulps ymm2, ymm2, ymm4
6222 vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
6223 vpsrld ymm2, ymm2, 13
6224 vpackssdw ymm2, ymm2, ymm3
6225 vmovdqu [eax + edx - 32], ymm2
6226 sub ecx, 16
6227 jg convertloop
6228 vzeroupper
6229 ret
6230 }
6231 }
6232 #endif // HAS_HALFFLOATROW_AVX2
6233
6234 #ifdef HAS_HALFFLOATROW_F16C
6235 __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
6236 uint16_t* dst,
6237 float scale,
6238 int width) {
6239 __asm {
6240 mov eax, [esp + 4] /* src */
6241 mov edx, [esp + 8] /* dst */
6242 vbroadcastss ymm4, [esp + 12] /* scale */
6243 mov ecx, [esp + 16] /* width */
6244 sub edx, eax
6245
6246 // 16 pixel loop.
6247 convertloop:
6248 vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
6249 vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
6250 add eax, 32
6251 vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
6252 vcvtdq2ps ymm3, ymm3
6253 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
6254 vmulps ymm3, ymm3, ymm4
6255 vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
6256 vcvtps2ph xmm3, ymm3, 3
6257 vmovdqu [eax + edx + 32], xmm2
6258 vmovdqu [eax + edx + 32 + 16], xmm3
6259 sub ecx, 16
6260 jg convertloop
6261 vzeroupper
6262 ret
6263 }
6264 }
6265 #endif // HAS_HALFFLOATROW_F16C
6266
6267 #ifdef HAS_ARGBCOLORTABLEROW_X86
6268 // Tranform ARGB pixels with color table.
6269 __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
6270 const uint8_t* table_argb,
6271 int width) {
6272 __asm {
6273 push esi
6274 mov eax, [esp + 4 + 4] /* dst_argb */
6275 mov esi, [esp + 4 + 8] /* table_argb */
6276 mov ecx, [esp + 4 + 12] /* width */
6277
6278 // 1 pixel loop.
6279 convertloop:
6280 movzx edx, byte ptr [eax]
6281 lea eax, [eax + 4]
6282 movzx edx, byte ptr [esi + edx * 4]
6283 mov byte ptr [eax - 4], dl
6284 movzx edx, byte ptr [eax - 4 + 1]
6285 movzx edx, byte ptr [esi + edx * 4 + 1]
6286 mov byte ptr [eax - 4 + 1], dl
6287 movzx edx, byte ptr [eax - 4 + 2]
6288 movzx edx, byte ptr [esi + edx * 4 + 2]
6289 mov byte ptr [eax - 4 + 2], dl
6290 movzx edx, byte ptr [eax - 4 + 3]
6291 movzx edx, byte ptr [esi + edx * 4 + 3]
6292 mov byte ptr [eax - 4 + 3], dl
6293 dec ecx
6294 jg convertloop
6295 pop esi
6296 ret
6297 }
6298 }
6299 #endif // HAS_ARGBCOLORTABLEROW_X86
6300
6301 #ifdef HAS_RGBCOLORTABLEROW_X86
6302 // Tranform RGB pixels with color table.
6303 __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
6304 const uint8_t* table_argb,
6305 int width) {
6306 __asm {
6307 push esi
6308 mov eax, [esp + 4 + 4] /* dst_argb */
6309 mov esi, [esp + 4 + 8] /* table_argb */
6310 mov ecx, [esp + 4 + 12] /* width */
6311
6312 // 1 pixel loop.
6313 convertloop:
6314 movzx edx, byte ptr [eax]
6315 lea eax, [eax + 4]
6316 movzx edx, byte ptr [esi + edx * 4]
6317 mov byte ptr [eax - 4], dl
6318 movzx edx, byte ptr [eax - 4 + 1]
6319 movzx edx, byte ptr [esi + edx * 4 + 1]
6320 mov byte ptr [eax - 4 + 1], dl
6321 movzx edx, byte ptr [eax - 4 + 2]
6322 movzx edx, byte ptr [esi + edx * 4 + 2]
6323 mov byte ptr [eax - 4 + 2], dl
6324 dec ecx
6325 jg convertloop
6326
6327 pop esi
6328 ret
6329 }
6330 }
6331 #endif // HAS_RGBCOLORTABLEROW_X86
6332
6333 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6334 // Tranform RGB pixels with luma table.
6335 __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
6336 uint8_t* dst_argb,
6337 int width,
6338 const uint8_t* luma,
6339 uint32_t lumacoeff) {
6340 __asm {
6341 push esi
6342 push edi
6343 mov eax, [esp + 8 + 4] /* src_argb */
6344 mov edi, [esp + 8 + 8] /* dst_argb */
6345 mov ecx, [esp + 8 + 12] /* width */
6346 movd xmm2, dword ptr [esp + 8 + 16] // luma table
6347 movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
6348 pshufd xmm2, xmm2, 0
6349 pshufd xmm3, xmm3, 0
6350 pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
6351 psllw xmm4, 8
6352 pxor xmm5, xmm5
6353
6354 // 4 pixel loop.
6355 convertloop:
6356 movdqu xmm0, xmmword ptr [eax] // generate luma ptr
6357 pmaddubsw xmm0, xmm3
6358 phaddw xmm0, xmm0
6359 pand xmm0, xmm4 // mask out low bits
6360 punpcklwd xmm0, xmm5
6361 paddd xmm0, xmm2 // add table base
6362 movd esi, xmm0
6363 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6364
6365 movzx edx, byte ptr [eax]
6366 movzx edx, byte ptr [esi + edx]
6367 mov byte ptr [edi], dl
6368 movzx edx, byte ptr [eax + 1]
6369 movzx edx, byte ptr [esi + edx]
6370 mov byte ptr [edi + 1], dl
6371 movzx edx, byte ptr [eax + 2]
6372 movzx edx, byte ptr [esi + edx]
6373 mov byte ptr [edi + 2], dl
6374 movzx edx, byte ptr [eax + 3] // copy alpha.
6375 mov byte ptr [edi + 3], dl
6376
6377 movd esi, xmm0
6378 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6379
6380 movzx edx, byte ptr [eax + 4]
6381 movzx edx, byte ptr [esi + edx]
6382 mov byte ptr [edi + 4], dl
6383 movzx edx, byte ptr [eax + 5]
6384 movzx edx, byte ptr [esi + edx]
6385 mov byte ptr [edi + 5], dl
6386 movzx edx, byte ptr [eax + 6]
6387 movzx edx, byte ptr [esi + edx]
6388 mov byte ptr [edi + 6], dl
6389 movzx edx, byte ptr [eax + 7] // copy alpha.
6390 mov byte ptr [edi + 7], dl
6391
6392 movd esi, xmm0
6393 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6394
6395 movzx edx, byte ptr [eax + 8]
6396 movzx edx, byte ptr [esi + edx]
6397 mov byte ptr [edi + 8], dl
6398 movzx edx, byte ptr [eax + 9]
6399 movzx edx, byte ptr [esi + edx]
6400 mov byte ptr [edi + 9], dl
6401 movzx edx, byte ptr [eax + 10]
6402 movzx edx, byte ptr [esi + edx]
6403 mov byte ptr [edi + 10], dl
6404 movzx edx, byte ptr [eax + 11] // copy alpha.
6405 mov byte ptr [edi + 11], dl
6406
6407 movd esi, xmm0
6408
6409 movzx edx, byte ptr [eax + 12]
6410 movzx edx, byte ptr [esi + edx]
6411 mov byte ptr [edi + 12], dl
6412 movzx edx, byte ptr [eax + 13]
6413 movzx edx, byte ptr [esi + edx]
6414 mov byte ptr [edi + 13], dl
6415 movzx edx, byte ptr [eax + 14]
6416 movzx edx, byte ptr [esi + edx]
6417 mov byte ptr [edi + 14], dl
6418 movzx edx, byte ptr [eax + 15] // copy alpha.
6419 mov byte ptr [edi + 15], dl
6420
6421 lea eax, [eax + 16]
6422 lea edi, [edi + 16]
6423 sub ecx, 4
6424 jg convertloop
6425
6426 pop edi
6427 pop esi
6428 ret
6429 }
6430 }
6431 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6432
6433 #endif // defined(_M_X64)
6434
6435 #ifdef __cplusplus
6436 } // extern "C"
6437 } // namespace libyuv
6438 #endif
6439
6440 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6441