1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for Visual C x86.
19 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
20
21 #ifdef HAS_ARGBTOYROW_SSSE3
22
23 // Constants for ARGB.
24 static const vec8 kARGBToY = {
25 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
26 };
27
// JPEG full range.
29 static const vec8 kARGBToYJ = {
30 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
31 };
32
33 static const vec8 kARGBToU = {
34 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
35 };
36
37 static const vec8 kARGBToUJ = {
38 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
39 };
40
41 static const vec8 kARGBToV = {
42 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
43 };
44
45 static const vec8 kARGBToVJ = {
46 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
47 };
48
// vpermd permutation to undo the lane interleave left by vphaddw + vpackuswb.
50 static const lvec32 kPermdARGBToY_AVX = {
51 0, 4, 1, 5, 2, 6, 3, 7
52 };
53
// vpshufb shuffle to restore the order of shorts mutated by vphaddw + vpackuswb.
55 static const lvec8 kShufARGBToUV_AVX = {
56 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
57 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
58 };
59
60 // Constants for BGRA.
61 static const vec8 kBGRAToY = {
62 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
63 };
64
65 static const vec8 kBGRAToU = {
66 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
67 };
68
69 static const vec8 kBGRAToV = {
70 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
71 };
72
73 // Constants for ABGR.
74 static const vec8 kABGRToY = {
75 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
76 };
77
78 static const vec8 kABGRToU = {
79 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
80 };
81
82 static const vec8 kABGRToV = {
83 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
84 };
85
86 // Constants for RGBA.
87 static const vec8 kRGBAToY = {
88 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
89 };
90
91 static const vec8 kRGBAToU = {
92 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
93 };
94
95 static const vec8 kRGBAToV = {
96 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
97 };
98
99 static const uvec8 kAddY16 = {
100 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
101 };
102
103 static const vec16 kAddYJ64 = {
104 64, 64, 64, 64, 64, 64, 64, 64
105 };
106
107 static const uvec8 kAddUV128 = {
108 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
109 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
110 };
111
112 static const uvec16 kAddUVJ128 = {
113 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
114 };
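// The constants above are the per-byte weights fed to pmaddubsw. As an
// illustration only (not part of the original file, helper names are
// hypothetical), this is a minimal scalar sketch of what the BT.601 SSSE3
// rows below compute for one ARGB pixel stored in memory as B, G, R, A.
// The J (JPEG, full range) variants use kARGBToYJ/kARGBToUJ/kARGBToVJ plus
// the kAddYJ64/kAddUVJ128 rounding constants instead of the +16/+128 offsets.
#if 0  // Reference sketch only; kept out of the build.
static __inline int ARGBToYReference(uint8 b, uint8 g, uint8 r) {
  return ((13 * b + 65 * g + 33 * r) >> 7) + 16;    // kARGBToY, kAddY16
}
static __inline int ARGBToUReference(uint8 b, uint8 g, uint8 r) {
  return ((112 * b - 74 * g - 38 * r) >> 8) + 128;  // kARGBToU, kAddUV128
}
static __inline int ARGBToVReference(uint8 b, uint8 g, uint8 r) {
  return ((112 * r - 94 * g - 18 * b) >> 8) + 128;  // kARGBToV, kAddUV128
}
#endif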
115
116 // Shuffle table for converting RGB24 to ARGB.
117 static const uvec8 kShuffleMaskRGB24ToARGB = {
118 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
119 };
120
121 // Shuffle table for converting RAW to ARGB.
122 static const uvec8 kShuffleMaskRAWToARGB = {
123 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
124 };
125
126 // Shuffle table for converting ARGB to RGB24.
127 static const uvec8 kShuffleMaskARGBToRGB24 = {
128 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
129 };
130
131 // Shuffle table for converting ARGB to RAW.
132 static const uvec8 kShuffleMaskARGBToRAW = {
133 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
134 };
135
136 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
137 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
138 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
139 };
140
// Shuffle table for converting ARGB to RAW for I422ToRAW. First 8 + next 4
142 static const uvec8 kShuffleMaskARGBToRAW_0 = {
143 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
144 };
145
146 // Duplicates gray value 3 times and fills in alpha opaque.
147 __declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
149 __asm {
150 mov eax, [esp + 4] // src_y
151 mov edx, [esp + 8] // dst_argb
152 mov ecx, [esp + 12] // pix
153 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
154 pslld xmm5, 24
155
156 align 4
157 convertloop:
158 movq xmm0, qword ptr [eax]
159 lea eax, [eax + 8]
160 punpcklbw xmm0, xmm0
161 movdqa xmm1, xmm0
162 punpcklwd xmm0, xmm0
163 punpckhwd xmm1, xmm1
164 por xmm0, xmm5
165 por xmm1, xmm5
166 movdqa [edx], xmm0
167 movdqa [edx + 16], xmm1
168 lea edx, [edx + 32]
169 sub ecx, 8
170 jg convertloop
171 ret
172 }
173 }
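// Illustrative scalar sketch (not part of the original file) of what the row
// above produces: each gray sample is replicated into B, G and R, and alpha
// is forced opaque, yielding one 32-bit ARGB pixel per input byte.
#if 0  // Reference sketch only; kept out of the build.
static __inline uint32 I400ToARGBReference(uint8 y) {
  return 0xff000000u | ((uint32)(y) << 16) | ((uint32)(y) << 8) | (uint32)(y);
}
#endif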
174
175 __declspec(naked) __declspec(align(16))
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
178 __asm {
179 mov eax, [esp + 4] // src_y
180 mov edx, [esp + 8] // dst_argb
181 mov ecx, [esp + 12] // pix
182 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
183 pslld xmm5, 24
184
185 align 4
186 convertloop:
187 movq xmm0, qword ptr [eax]
188 lea eax, [eax + 8]
189 punpcklbw xmm0, xmm0
190 movdqa xmm1, xmm0
191 punpcklwd xmm0, xmm0
192 punpckhwd xmm1, xmm1
193 por xmm0, xmm5
194 por xmm1, xmm5
195 movdqu [edx], xmm0
196 movdqu [edx + 16], xmm1
197 lea edx, [edx + 32]
198 sub ecx, 8
199 jg convertloop
200 ret
201 }
202 }
203
204 __declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
206 __asm {
207 mov eax, [esp + 4] // src_rgb24
208 mov edx, [esp + 8] // dst_argb
209 mov ecx, [esp + 12] // pix
210 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
211 pslld xmm5, 24
212 movdqa xmm4, kShuffleMaskRGB24ToARGB
213
214 align 4
215 convertloop:
216 movdqu xmm0, [eax]
217 movdqu xmm1, [eax + 16]
218 movdqu xmm3, [eax + 32]
219 lea eax, [eax + 48]
220 movdqa xmm2, xmm3
221 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
222 pshufb xmm2, xmm4
223 por xmm2, xmm5
palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]}
225 pshufb xmm0, xmm4
226 movdqa [edx + 32], xmm2
227 por xmm0, xmm5
228 pshufb xmm1, xmm4
229 movdqa [edx], xmm0
230 por xmm1, xmm5
231 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
232 pshufb xmm3, xmm4
233 movdqa [edx + 16], xmm1
234 por xmm3, xmm5
235 sub ecx, 16
236 movdqa [edx + 48], xmm3
237 lea edx, [edx + 64]
238 jg convertloop
239 ret
240 }
241 }
242
243 __declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
246 __asm {
247 mov eax, [esp + 4] // src_raw
248 mov edx, [esp + 8] // dst_argb
249 mov ecx, [esp + 12] // pix
250 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
251 pslld xmm5, 24
252 movdqa xmm4, kShuffleMaskRAWToARGB
253
254 align 4
255 convertloop:
256 movdqu xmm0, [eax]
257 movdqu xmm1, [eax + 16]
258 movdqu xmm3, [eax + 32]
259 lea eax, [eax + 48]
260 movdqa xmm2, xmm3
261 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
262 pshufb xmm2, xmm4
263 por xmm2, xmm5
palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]}
265 pshufb xmm0, xmm4
266 movdqa [edx + 32], xmm2
267 por xmm0, xmm5
268 pshufb xmm1, xmm4
269 movdqa [edx], xmm0
270 por xmm1, xmm5
271 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
272 pshufb xmm3, xmm4
273 movdqa [edx + 16], xmm1
274 por xmm3, xmm5
275 sub ecx, 16
276 movdqa [edx + 48], xmm3
277 lea edx, [edx + 64]
278 jg convertloop
279 ret
280 }
281 }
282
283 // pmul method to replicate bits.
284 // Math to replicate bits:
285 // (v << 8) | (v << 3)
286 // v * 256 + v * 8
287 // v * (256 + 8)
288 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
289 // 20 instructions.
290 __declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
293 __asm {
294 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
295 movd xmm5, eax
296 pshufd xmm5, xmm5, 0
297 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
298 movd xmm6, eax
299 pshufd xmm6, xmm6, 0
300 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
301 psllw xmm3, 11
302 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
303 psllw xmm4, 10
304 psrlw xmm4, 5
305 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
306 psllw xmm7, 8
307
308 mov eax, [esp + 4] // src_rgb565
309 mov edx, [esp + 8] // dst_argb
310 mov ecx, [esp + 12] // pix
311 sub edx, eax
312 sub edx, eax
313
314 align 4
315 convertloop:
316 movdqu xmm0, [eax] // fetch 8 pixels of bgr565
317 movdqa xmm1, xmm0
318 movdqa xmm2, xmm0
319 pand xmm1, xmm3 // R in upper 5 bits
320 psllw xmm2, 11 // B in upper 5 bits
321 pmulhuw xmm1, xmm5 // * (256 + 8)
322 pmulhuw xmm2, xmm5 // * (256 + 8)
323 psllw xmm1, 8
324 por xmm1, xmm2 // RB
325 pand xmm0, xmm4 // G in middle 6 bits
326 pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
327 por xmm0, xmm7 // AG
328 movdqa xmm2, xmm1
329 punpcklbw xmm1, xmm0
330 punpckhbw xmm2, xmm0
331 movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
332 movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
333 lea eax, [eax + 16]
334 sub ecx, 8
335 jg convertloop
336 ret
337 }
338 }
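// Worked example of the bit-replication math above (illustrative only, not
// part of the original file): with the 5-bit field shifted to the top of a
// 16-bit lane, pmulhuw by 0x0108 (256 + 8) keeps the high 16 bits of the
// product, which equals the field replicated to 8 bits, (v << 3) | (v >> 2).
// e.g. v = 31: (31 << 3) | (31 >> 2) = 248 | 7 = 255. The 6-bit green field
// uses 0x2080 ((256 + 4) << 5) the same way, giving (v << 2) | (v >> 4).
#if 0  // Reference sketch only; kept out of the build.
static __inline uint8 Replicate5To8(uint8 v5) {  // v5 in [0, 31]
  return (uint8)((v5 << 3) | (v5 >> 2));
}
static __inline uint8 Replicate6To8(uint8 v6) {  // v6 in [0, 63]
  return (uint8)((v6 << 2) | (v6 >> 4));
}
#endif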
339
340 // 24 instructions
341 __declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
344 __asm {
345 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
346 movd xmm5, eax
347 pshufd xmm5, xmm5, 0
348 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
349 movd xmm6, eax
350 pshufd xmm6, xmm6, 0
351 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
352 psllw xmm3, 11
353 movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
354 psrlw xmm4, 6
355 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
356 psllw xmm7, 8
357
358 mov eax, [esp + 4] // src_argb1555
359 mov edx, [esp + 8] // dst_argb
360 mov ecx, [esp + 12] // pix
361 sub edx, eax
362 sub edx, eax
363
364 align 4
365 convertloop:
366 movdqu xmm0, [eax] // fetch 8 pixels of 1555
367 movdqa xmm1, xmm0
368 movdqa xmm2, xmm0
369 psllw xmm1, 1 // R in upper 5 bits
370 psllw xmm2, 11 // B in upper 5 bits
371 pand xmm1, xmm3
372 pmulhuw xmm2, xmm5 // * (256 + 8)
373 pmulhuw xmm1, xmm5 // * (256 + 8)
374 psllw xmm1, 8
375 por xmm1, xmm2 // RB
376 movdqa xmm2, xmm0
377 pand xmm0, xmm4 // G in middle 5 bits
378 psraw xmm2, 8 // A
379 pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
380 pand xmm2, xmm7
381 por xmm0, xmm2 // AG
382 movdqa xmm2, xmm1
383 punpcklbw xmm1, xmm0
384 punpckhbw xmm2, xmm0
385 movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
386 movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
387 lea eax, [eax + 16]
388 sub ecx, 8
389 jg convertloop
390 ret
391 }
392 }
393
394 // 18 instructions.
395 __declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
398 __asm {
399 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
400 movd xmm4, eax
401 pshufd xmm4, xmm4, 0
402 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
403 pslld xmm5, 4
404 mov eax, [esp + 4] // src_argb4444
405 mov edx, [esp + 8] // dst_argb
406 mov ecx, [esp + 12] // pix
407 sub edx, eax
408 sub edx, eax
409
410 align 4
411 convertloop:
412 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
413 movdqa xmm2, xmm0
414 pand xmm0, xmm4 // mask low nibbles
415 pand xmm2, xmm5 // mask high nibbles
416 movdqa xmm1, xmm0
417 movdqa xmm3, xmm2
418 psllw xmm1, 4
419 psrlw xmm3, 4
420 por xmm0, xmm1
421 por xmm2, xmm3
422 movdqa xmm1, xmm0
423 punpcklbw xmm0, xmm2
424 punpckhbw xmm1, xmm2
425 movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
426 movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
427 lea eax, [eax + 16]
428 sub ecx, 8
429 jg convertloop
430 ret
431 }
432 }
433
434 __declspec(naked) __declspec(align(16))
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
436 __asm {
437 mov eax, [esp + 4] // src_argb
438 mov edx, [esp + 8] // dst_rgb
439 mov ecx, [esp + 12] // pix
440 movdqa xmm6, kShuffleMaskARGBToRGB24
441
442 align 4
443 convertloop:
444 movdqu xmm0, [eax] // fetch 16 pixels of argb
445 movdqu xmm1, [eax + 16]
446 movdqu xmm2, [eax + 32]
447 movdqu xmm3, [eax + 48]
448 lea eax, [eax + 64]
449 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
450 pshufb xmm1, xmm6
451 pshufb xmm2, xmm6
452 pshufb xmm3, xmm6
453 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
454 psrldq xmm1, 4 // 8 bytes from 1
455 pslldq xmm4, 12 // 4 bytes from 1 for 0
456 movdqa xmm5, xmm2 // 8 bytes from 2 for 1
457 por xmm0, xmm4 // 4 bytes from 1 for 0
458 pslldq xmm5, 8 // 8 bytes from 2 for 1
459 movdqu [edx], xmm0 // store 0
460 por xmm1, xmm5 // 8 bytes from 2 for 1
461 psrldq xmm2, 8 // 4 bytes from 2
462 pslldq xmm3, 4 // 12 bytes from 3 for 2
463 por xmm2, xmm3 // 12 bytes from 3 for 2
464 movdqu [edx + 16], xmm1 // store 1
465 movdqu [edx + 32], xmm2 // store 2
466 lea edx, [edx + 48]
467 sub ecx, 16
468 jg convertloop
469 ret
470 }
471 }
472
473 __declspec(naked) __declspec(align(16))
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
475 __asm {
476 mov eax, [esp + 4] // src_argb
477 mov edx, [esp + 8] // dst_rgb
478 mov ecx, [esp + 12] // pix
479 movdqa xmm6, kShuffleMaskARGBToRAW
480
481 align 4
482 convertloop:
483 movdqu xmm0, [eax] // fetch 16 pixels of argb
484 movdqu xmm1, [eax + 16]
485 movdqu xmm2, [eax + 32]
486 movdqu xmm3, [eax + 48]
487 lea eax, [eax + 64]
488 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
489 pshufb xmm1, xmm6
490 pshufb xmm2, xmm6
491 pshufb xmm3, xmm6
492 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
493 psrldq xmm1, 4 // 8 bytes from 1
494 pslldq xmm4, 12 // 4 bytes from 1 for 0
495 movdqa xmm5, xmm2 // 8 bytes from 2 for 1
496 por xmm0, xmm4 // 4 bytes from 1 for 0
497 pslldq xmm5, 8 // 8 bytes from 2 for 1
498 movdqu [edx], xmm0 // store 0
499 por xmm1, xmm5 // 8 bytes from 2 for 1
500 psrldq xmm2, 8 // 4 bytes from 2
501 pslldq xmm3, 4 // 12 bytes from 3 for 2
502 por xmm2, xmm3 // 12 bytes from 3 for 2
503 movdqu [edx + 16], xmm1 // store 1
504 movdqu [edx + 32], xmm2 // store 2
505 lea edx, [edx + 48]
506 sub ecx, 16
507 jg convertloop
508 ret
509 }
510 }
511
512 __declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
514 __asm {
515 mov eax, [esp + 4] // src_argb
516 mov edx, [esp + 8] // dst_rgb
517 mov ecx, [esp + 12] // pix
518 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
519 psrld xmm3, 27
520 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
521 psrld xmm4, 26
522 pslld xmm4, 5
523 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
524 pslld xmm5, 11
525
526 align 4
527 convertloop:
528 movdqa xmm0, [eax] // fetch 4 pixels of argb
529 movdqa xmm1, xmm0 // B
530 movdqa xmm2, xmm0 // G
531 pslld xmm0, 8 // R
532 psrld xmm1, 3 // B
533 psrld xmm2, 5 // G
534 psrad xmm0, 16 // R
535 pand xmm1, xmm3 // B
536 pand xmm2, xmm4 // G
537 pand xmm0, xmm5 // R
538 por xmm1, xmm2 // BG
539 por xmm0, xmm1 // BGR
540 packssdw xmm0, xmm0
541 lea eax, [eax + 16]
542 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
543 lea edx, [edx + 8]
544 sub ecx, 4
545 jg convertloop
546 ret
547 }
548 }
549
550 // TODO(fbarchard): Improve sign extension/packing.
551 __declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
553 __asm {
554 mov eax, [esp + 4] // src_argb
555 mov edx, [esp + 8] // dst_rgb
556 mov ecx, [esp + 12] // pix
557 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
558 psrld xmm4, 27
559 movdqa xmm5, xmm4 // generate mask 0x000003e0
560 pslld xmm5, 5
561 movdqa xmm6, xmm4 // generate mask 0x00007c00
562 pslld xmm6, 10
563 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
564 pslld xmm7, 15
565
566 align 4
567 convertloop:
568 movdqa xmm0, [eax] // fetch 4 pixels of argb
569 movdqa xmm1, xmm0 // B
570 movdqa xmm2, xmm0 // G
571 movdqa xmm3, xmm0 // R
572 psrad xmm0, 16 // A
573 psrld xmm1, 3 // B
574 psrld xmm2, 6 // G
575 psrld xmm3, 9 // R
576 pand xmm0, xmm7 // A
577 pand xmm1, xmm4 // B
578 pand xmm2, xmm5 // G
579 pand xmm3, xmm6 // R
580 por xmm0, xmm1 // BA
581 por xmm2, xmm3 // GR
582 por xmm0, xmm2 // BGRA
583 packssdw xmm0, xmm0
584 lea eax, [eax + 16]
585 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
586 lea edx, [edx + 8]
587 sub ecx, 4
588 jg convertloop
589 ret
590 }
591 }
592
593 __declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
595 __asm {
596 mov eax, [esp + 4] // src_argb
597 mov edx, [esp + 8] // dst_rgb
598 mov ecx, [esp + 12] // pix
599 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
600 psllw xmm4, 12
601 movdqa xmm3, xmm4 // generate mask 0x00f000f0
602 psrlw xmm3, 8
603
604 align 4
605 convertloop:
606 movdqa xmm0, [eax] // fetch 4 pixels of argb
607 movdqa xmm1, xmm0
608 pand xmm0, xmm3 // low nibble
609 pand xmm1, xmm4 // high nibble
psrld xmm0, 4
psrld xmm1, 8
612 por xmm0, xmm1
613 packuswb xmm0, xmm0
614 lea eax, [eax + 16]
615 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
616 lea edx, [edx + 8]
617 sub ecx, 4
618 jg convertloop
619 ret
620 }
621 }
622
623 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
624 __declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
626 __asm {
627 mov eax, [esp + 4] /* src_argb */
628 mov edx, [esp + 8] /* dst_y */
629 mov ecx, [esp + 12] /* pix */
630 movdqa xmm5, kAddY16
631 movdqa xmm4, kARGBToY
632
633 align 4
634 convertloop:
635 movdqa xmm0, [eax]
636 movdqa xmm1, [eax + 16]
637 movdqa xmm2, [eax + 32]
638 movdqa xmm3, [eax + 48]
639 pmaddubsw xmm0, xmm4
640 pmaddubsw xmm1, xmm4
641 pmaddubsw xmm2, xmm4
642 pmaddubsw xmm3, xmm4
643 lea eax, [eax + 64]
644 phaddw xmm0, xmm1
645 phaddw xmm2, xmm3
646 psrlw xmm0, 7
647 psrlw xmm2, 7
648 packuswb xmm0, xmm2
649 paddb xmm0, xmm5
650 sub ecx, 16
651 movdqa [edx], xmm0
652 lea edx, [edx + 16]
653 jg convertloop
654 ret
655 }
656 }
657
658 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
659 __declspec(naked) __declspec(align(16))
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
661 __asm {
662 mov eax, [esp + 4] /* src_argb */
663 mov edx, [esp + 8] /* dst_y */
664 mov ecx, [esp + 12] /* pix */
665 movdqa xmm4, kARGBToYJ
666 movdqa xmm5, kAddYJ64
667
668 align 4
669 convertloop:
670 movdqa xmm0, [eax]
671 movdqa xmm1, [eax + 16]
672 movdqa xmm2, [eax + 32]
673 movdqa xmm3, [eax + 48]
674 pmaddubsw xmm0, xmm4
675 pmaddubsw xmm1, xmm4
676 pmaddubsw xmm2, xmm4
677 pmaddubsw xmm3, xmm4
678 lea eax, [eax + 64]
679 phaddw xmm0, xmm1
680 phaddw xmm2, xmm3
681 paddw xmm0, xmm5 // Add .5 for rounding.
682 paddw xmm2, xmm5
683 psrlw xmm0, 7
684 psrlw xmm2, 7
685 packuswb xmm0, xmm2
686 sub ecx, 16
687 movdqa [edx], xmm0
688 lea edx, [edx + 16]
689 jg convertloop
690 ret
691 }
692 }
693
694 #ifdef HAS_ARGBTOYROW_AVX2
695 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
696 __declspec(naked) __declspec(align(32))
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
698 __asm {
699 mov eax, [esp + 4] /* src_argb */
700 mov edx, [esp + 8] /* dst_y */
701 mov ecx, [esp + 12] /* pix */
702 vbroadcastf128 ymm4, kARGBToY
703 vbroadcastf128 ymm5, kAddY16
704 vmovdqa ymm6, kPermdARGBToY_AVX
705
706 align 4
707 convertloop:
708 vmovdqu ymm0, [eax]
709 vmovdqu ymm1, [eax + 32]
710 vmovdqu ymm2, [eax + 64]
711 vmovdqu ymm3, [eax + 96]
712 vpmaddubsw ymm0, ymm0, ymm4
713 vpmaddubsw ymm1, ymm1, ymm4
714 vpmaddubsw ymm2, ymm2, ymm4
715 vpmaddubsw ymm3, ymm3, ymm4
716 lea eax, [eax + 128]
717 vphaddw ymm0, ymm0, ymm1 // mutates.
718 vphaddw ymm2, ymm2, ymm3
719 vpsrlw ymm0, ymm0, 7
720 vpsrlw ymm2, ymm2, 7
721 vpackuswb ymm0, ymm0, ymm2 // mutates.
722 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
723 vpaddb ymm0, ymm0, ymm5
724 sub ecx, 32
725 vmovdqu [edx], ymm0
726 lea edx, [edx + 32]
727 jg convertloop
728 vzeroupper
729 ret
730 }
731 }
732 #endif // HAS_ARGBTOYROW_AVX2
733
734 #ifdef HAS_ARGBTOYROW_AVX2
735 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
736 __declspec(naked) __declspec(align(32))
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
738 __asm {
739 mov eax, [esp + 4] /* src_argb */
740 mov edx, [esp + 8] /* dst_y */
741 mov ecx, [esp + 12] /* pix */
742 vbroadcastf128 ymm4, kARGBToYJ
743 vbroadcastf128 ymm5, kAddYJ64
744 vmovdqa ymm6, kPermdARGBToY_AVX
745
746 align 4
747 convertloop:
748 vmovdqu ymm0, [eax]
749 vmovdqu ymm1, [eax + 32]
750 vmovdqu ymm2, [eax + 64]
751 vmovdqu ymm3, [eax + 96]
752 vpmaddubsw ymm0, ymm0, ymm4
753 vpmaddubsw ymm1, ymm1, ymm4
754 vpmaddubsw ymm2, ymm2, ymm4
755 vpmaddubsw ymm3, ymm3, ymm4
756 lea eax, [eax + 128]
757 vphaddw ymm0, ymm0, ymm1 // mutates.
758 vphaddw ymm2, ymm2, ymm3
759 vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
760 vpaddw ymm2, ymm2, ymm5
761 vpsrlw ymm0, ymm0, 7
762 vpsrlw ymm2, ymm2, 7
763 vpackuswb ymm0, ymm0, ymm2 // mutates.
764 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
765 sub ecx, 32
766 vmovdqu [edx], ymm0
767 lea edx, [edx + 32]
768 jg convertloop
769
770 vzeroupper
771 ret
772 }
773 }
#endif // HAS_ARGBTOYROW_AVX2
775
776 __declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
778 __asm {
779 mov eax, [esp + 4] /* src_argb */
780 mov edx, [esp + 8] /* dst_y */
781 mov ecx, [esp + 12] /* pix */
782 movdqa xmm5, kAddY16
783 movdqa xmm4, kARGBToY
784
785 align 4
786 convertloop:
787 movdqu xmm0, [eax]
788 movdqu xmm1, [eax + 16]
789 movdqu xmm2, [eax + 32]
790 movdqu xmm3, [eax + 48]
791 pmaddubsw xmm0, xmm4
792 pmaddubsw xmm1, xmm4
793 pmaddubsw xmm2, xmm4
794 pmaddubsw xmm3, xmm4
795 lea eax, [eax + 64]
796 phaddw xmm0, xmm1
797 phaddw xmm2, xmm3
798 psrlw xmm0, 7
799 psrlw xmm2, 7
800 packuswb xmm0, xmm2
801 paddb xmm0, xmm5
802 sub ecx, 16
803 movdqu [edx], xmm0
804 lea edx, [edx + 16]
805 jg convertloop
806 ret
807 }
808 }
809
810 __declspec(naked) __declspec(align(16))
void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
812 __asm {
813 mov eax, [esp + 4] /* src_argb */
814 mov edx, [esp + 8] /* dst_y */
815 mov ecx, [esp + 12] /* pix */
816 movdqa xmm4, kARGBToYJ
817 movdqa xmm5, kAddYJ64
818
819 align 4
820 convertloop:
821 movdqu xmm0, [eax]
822 movdqu xmm1, [eax + 16]
823 movdqu xmm2, [eax + 32]
824 movdqu xmm3, [eax + 48]
825 pmaddubsw xmm0, xmm4
826 pmaddubsw xmm1, xmm4
827 pmaddubsw xmm2, xmm4
828 pmaddubsw xmm3, xmm4
829 lea eax, [eax + 64]
830 phaddw xmm0, xmm1
831 phaddw xmm2, xmm3
832 paddw xmm0, xmm5
833 paddw xmm2, xmm5
834 psrlw xmm0, 7
835 psrlw xmm2, 7
836 packuswb xmm0, xmm2
837 sub ecx, 16
838 movdqu [edx], xmm0
839 lea edx, [edx + 16]
840 jg convertloop
841 ret
842 }
843 }
844
845 __declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
847 __asm {
848 mov eax, [esp + 4] /* src_argb */
849 mov edx, [esp + 8] /* dst_y */
850 mov ecx, [esp + 12] /* pix */
851 movdqa xmm5, kAddY16
852 movdqa xmm4, kBGRAToY
853
854 align 4
855 convertloop:
856 movdqa xmm0, [eax]
857 movdqa xmm1, [eax + 16]
858 movdqa xmm2, [eax + 32]
859 movdqa xmm3, [eax + 48]
860 pmaddubsw xmm0, xmm4
861 pmaddubsw xmm1, xmm4
862 pmaddubsw xmm2, xmm4
863 pmaddubsw xmm3, xmm4
864 lea eax, [eax + 64]
865 phaddw xmm0, xmm1
866 phaddw xmm2, xmm3
867 psrlw xmm0, 7
868 psrlw xmm2, 7
869 packuswb xmm0, xmm2
870 paddb xmm0, xmm5
871 sub ecx, 16
872 movdqa [edx], xmm0
873 lea edx, [edx + 16]
874 jg convertloop
875 ret
876 }
877 }
878
879 __declspec(naked) __declspec(align(16))
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
881 __asm {
882 mov eax, [esp + 4] /* src_argb */
883 mov edx, [esp + 8] /* dst_y */
884 mov ecx, [esp + 12] /* pix */
885 movdqa xmm5, kAddY16
886 movdqa xmm4, kBGRAToY
887
888 align 4
889 convertloop:
890 movdqu xmm0, [eax]
891 movdqu xmm1, [eax + 16]
892 movdqu xmm2, [eax + 32]
893 movdqu xmm3, [eax + 48]
894 pmaddubsw xmm0, xmm4
895 pmaddubsw xmm1, xmm4
896 pmaddubsw xmm2, xmm4
897 pmaddubsw xmm3, xmm4
898 lea eax, [eax + 64]
899 phaddw xmm0, xmm1
900 phaddw xmm2, xmm3
901 psrlw xmm0, 7
902 psrlw xmm2, 7
903 packuswb xmm0, xmm2
904 paddb xmm0, xmm5
905 sub ecx, 16
906 movdqu [edx], xmm0
907 lea edx, [edx + 16]
908 jg convertloop
909 ret
910 }
911 }
912
913 __declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
915 __asm {
916 mov eax, [esp + 4] /* src_argb */
917 mov edx, [esp + 8] /* dst_y */
918 mov ecx, [esp + 12] /* pix */
919 movdqa xmm5, kAddY16
920 movdqa xmm4, kABGRToY
921
922 align 4
923 convertloop:
924 movdqa xmm0, [eax]
925 movdqa xmm1, [eax + 16]
926 movdqa xmm2, [eax + 32]
927 movdqa xmm3, [eax + 48]
928 pmaddubsw xmm0, xmm4
929 pmaddubsw xmm1, xmm4
930 pmaddubsw xmm2, xmm4
931 pmaddubsw xmm3, xmm4
932 lea eax, [eax + 64]
933 phaddw xmm0, xmm1
934 phaddw xmm2, xmm3
935 psrlw xmm0, 7
936 psrlw xmm2, 7
937 packuswb xmm0, xmm2
938 paddb xmm0, xmm5
939 sub ecx, 16
940 movdqa [edx], xmm0
941 lea edx, [edx + 16]
942 jg convertloop
943 ret
944 }
945 }
946
947 __declspec(naked) __declspec(align(16))
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
949 __asm {
950 mov eax, [esp + 4] /* src_argb */
951 mov edx, [esp + 8] /* dst_y */
952 mov ecx, [esp + 12] /* pix */
953 movdqa xmm5, kAddY16
954 movdqa xmm4, kABGRToY
955
956 align 4
957 convertloop:
958 movdqu xmm0, [eax]
959 movdqu xmm1, [eax + 16]
960 movdqu xmm2, [eax + 32]
961 movdqu xmm3, [eax + 48]
962 pmaddubsw xmm0, xmm4
963 pmaddubsw xmm1, xmm4
964 pmaddubsw xmm2, xmm4
965 pmaddubsw xmm3, xmm4
966 lea eax, [eax + 64]
967 phaddw xmm0, xmm1
968 phaddw xmm2, xmm3
969 psrlw xmm0, 7
970 psrlw xmm2, 7
971 packuswb xmm0, xmm2
972 paddb xmm0, xmm5
973 sub ecx, 16
974 movdqu [edx], xmm0
975 lea edx, [edx + 16]
976 jg convertloop
977 ret
978 }
979 }
980
981 __declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
983 __asm {
984 mov eax, [esp + 4] /* src_argb */
985 mov edx, [esp + 8] /* dst_y */
986 mov ecx, [esp + 12] /* pix */
987 movdqa xmm5, kAddY16
988 movdqa xmm4, kRGBAToY
989
990 align 4
991 convertloop:
992 movdqa xmm0, [eax]
993 movdqa xmm1, [eax + 16]
994 movdqa xmm2, [eax + 32]
995 movdqa xmm3, [eax + 48]
996 pmaddubsw xmm0, xmm4
997 pmaddubsw xmm1, xmm4
998 pmaddubsw xmm2, xmm4
999 pmaddubsw xmm3, xmm4
1000 lea eax, [eax + 64]
1001 phaddw xmm0, xmm1
1002 phaddw xmm2, xmm3
1003 psrlw xmm0, 7
1004 psrlw xmm2, 7
1005 packuswb xmm0, xmm2
1006 paddb xmm0, xmm5
1007 sub ecx, 16
1008 movdqa [edx], xmm0
1009 lea edx, [edx + 16]
1010 jg convertloop
1011 ret
1012 }
1013 }
1014
1015 __declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1017 __asm {
1018 mov eax, [esp + 4] /* src_argb */
1019 mov edx, [esp + 8] /* dst_y */
1020 mov ecx, [esp + 12] /* pix */
1021 movdqa xmm5, kAddY16
1022 movdqa xmm4, kRGBAToY
1023
1024 align 4
1025 convertloop:
1026 movdqu xmm0, [eax]
1027 movdqu xmm1, [eax + 16]
1028 movdqu xmm2, [eax + 32]
1029 movdqu xmm3, [eax + 48]
1030 pmaddubsw xmm0, xmm4
1031 pmaddubsw xmm1, xmm4
1032 pmaddubsw xmm2, xmm4
1033 pmaddubsw xmm3, xmm4
1034 lea eax, [eax + 64]
1035 phaddw xmm0, xmm1
1036 phaddw xmm2, xmm3
1037 psrlw xmm0, 7
1038 psrlw xmm2, 7
1039 packuswb xmm0, xmm2
1040 paddb xmm0, xmm5
1041 sub ecx, 16
1042 movdqu [edx], xmm0
1043 lea edx, [edx + 16]
1044 jg convertloop
1045 ret
1046 }
1047 }
1048
1049 __declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
1052 __asm {
1053 push esi
1054 push edi
1055 mov eax, [esp + 8 + 4] // src_argb
1056 mov esi, [esp + 8 + 8] // src_stride_argb
1057 mov edx, [esp + 8 + 12] // dst_u
1058 mov edi, [esp + 8 + 16] // dst_v
1059 mov ecx, [esp + 8 + 20] // pix
1060 movdqa xmm7, kARGBToU
1061 movdqa xmm6, kARGBToV
1062 movdqa xmm5, kAddUV128
1063 sub edi, edx // stride from u to v
1064
1065 align 4
1066 convertloop:
1067 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1068 movdqa xmm0, [eax]
1069 movdqa xmm1, [eax + 16]
1070 movdqa xmm2, [eax + 32]
1071 movdqa xmm3, [eax + 48]
1072 pavgb xmm0, [eax + esi]
1073 pavgb xmm1, [eax + esi + 16]
1074 pavgb xmm2, [eax + esi + 32]
1075 pavgb xmm3, [eax + esi + 48]
1076 lea eax, [eax + 64]
1077 movdqa xmm4, xmm0
1078 shufps xmm0, xmm1, 0x88
1079 shufps xmm4, xmm1, 0xdd
1080 pavgb xmm0, xmm4
1081 movdqa xmm4, xmm2
1082 shufps xmm2, xmm3, 0x88
1083 shufps xmm4, xmm3, 0xdd
1084 pavgb xmm2, xmm4
1085
1086 // step 2 - convert to U and V
1087 // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
1089 movdqa xmm1, xmm0
1090 movdqa xmm3, xmm2
1091 pmaddubsw xmm0, xmm7 // U
1092 pmaddubsw xmm2, xmm7
1093 pmaddubsw xmm1, xmm6 // V
1094 pmaddubsw xmm3, xmm6
1095 phaddw xmm0, xmm2
1096 phaddw xmm1, xmm3
1097 psraw xmm0, 8
1098 psraw xmm1, 8
1099 packsswb xmm0, xmm1
1100 paddb xmm0, xmm5 // -> unsigned
1101
1102 // step 3 - store 8 U and 8 V values
1103 sub ecx, 16
1104 movlps qword ptr [edx], xmm0 // U
1105 movhps qword ptr [edx + edi], xmm0 // V
1106 lea edx, [edx + 8]
1107 jg convertloop
1108
1109 pop edi
1110 pop esi
1111 ret
1112 }
1113 }
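// Scalar sketch of "step 1" above (illustrative only, not part of the
// original file): each 2x2 block of pixels is box-averaged to one pixel per
// channel before the U/V weights are applied. pavgb rounds up, so the
// average is (a + b + 1) >> 1, applied once vertically and once horizontally.
#if 0  // Reference sketch only; kept out of the build.
static __inline uint8 AverageRound(uint8 a, uint8 b) {
  return (uint8)((a + b + 1) >> 1);
}
static __inline uint8 Subsample2x2(uint8 top_left, uint8 top_right,
                                   uint8 bot_left, uint8 bot_right) {
  return AverageRound(AverageRound(top_left, bot_left),     // vertical pass
                      AverageRound(top_right, bot_right));  // then horizontal
}
#endif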
1114
1115 __declspec(naked) __declspec(align(16))
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
1118 __asm {
1119 push esi
1120 push edi
1121 mov eax, [esp + 8 + 4] // src_argb
1122 mov esi, [esp + 8 + 8] // src_stride_argb
1123 mov edx, [esp + 8 + 12] // dst_u
1124 mov edi, [esp + 8 + 16] // dst_v
1125 mov ecx, [esp + 8 + 20] // pix
1126 movdqa xmm7, kARGBToUJ
1127 movdqa xmm6, kARGBToVJ
1128 movdqa xmm5, kAddUVJ128
1129 sub edi, edx // stride from u to v
1130
1131 align 4
1132 convertloop:
1133 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1134 movdqa xmm0, [eax]
1135 movdqa xmm1, [eax + 16]
1136 movdqa xmm2, [eax + 32]
1137 movdqa xmm3, [eax + 48]
1138 pavgb xmm0, [eax + esi]
1139 pavgb xmm1, [eax + esi + 16]
1140 pavgb xmm2, [eax + esi + 32]
1141 pavgb xmm3, [eax + esi + 48]
1142 lea eax, [eax + 64]
1143 movdqa xmm4, xmm0
1144 shufps xmm0, xmm1, 0x88
1145 shufps xmm4, xmm1, 0xdd
1146 pavgb xmm0, xmm4
1147 movdqa xmm4, xmm2
1148 shufps xmm2, xmm3, 0x88
1149 shufps xmm4, xmm3, 0xdd
1150 pavgb xmm2, xmm4
1151
1152 // step 2 - convert to U and V
1153 // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
1155 movdqa xmm1, xmm0
1156 movdqa xmm3, xmm2
1157 pmaddubsw xmm0, xmm7 // U
1158 pmaddubsw xmm2, xmm7
1159 pmaddubsw xmm1, xmm6 // V
1160 pmaddubsw xmm3, xmm6
1161 phaddw xmm0, xmm2
1162 phaddw xmm1, xmm3
1163 paddw xmm0, xmm5 // +.5 rounding -> unsigned
1164 paddw xmm1, xmm5
1165 psraw xmm0, 8
1166 psraw xmm1, 8
1167 packsswb xmm0, xmm1
1168
1169 // step 3 - store 8 U and 8 V values
1170 sub ecx, 16
1171 movlps qword ptr [edx], xmm0 // U
1172 movhps qword ptr [edx + edi], xmm0 // V
1173 lea edx, [edx + 8]
1174 jg convertloop
1175
1176 pop edi
1177 pop esi
1178 ret
1179 }
1180 }
1181
1182 #ifdef HAS_ARGBTOUVROW_AVX2
1183 __declspec(naked) __declspec(align(32))
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
1186 __asm {
1187 push esi
1188 push edi
1189 mov eax, [esp + 8 + 4] // src_argb
1190 mov esi, [esp + 8 + 8] // src_stride_argb
1191 mov edx, [esp + 8 + 12] // dst_u
1192 mov edi, [esp + 8 + 16] // dst_v
1193 mov ecx, [esp + 8 + 20] // pix
1194 vbroadcastf128 ymm5, kAddUV128
1195 vbroadcastf128 ymm6, kARGBToV
1196 vbroadcastf128 ymm7, kARGBToU
1197 sub edi, edx // stride from u to v
1198
1199 align 4
1200 convertloop:
1201 /* step 1 - subsample 32x2 argb pixels to 16x1 */
1202 vmovdqu ymm0, [eax]
1203 vmovdqu ymm1, [eax + 32]
1204 vmovdqu ymm2, [eax + 64]
1205 vmovdqu ymm3, [eax + 96]
1206 vpavgb ymm0, ymm0, [eax + esi]
1207 vpavgb ymm1, ymm1, [eax + esi + 32]
1208 vpavgb ymm2, ymm2, [eax + esi + 64]
1209 vpavgb ymm3, ymm3, [eax + esi + 96]
1210 lea eax, [eax + 128]
1211 vshufps ymm4, ymm0, ymm1, 0x88
1212 vshufps ymm0, ymm0, ymm1, 0xdd
1213 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
1214 vshufps ymm4, ymm2, ymm3, 0x88
1215 vshufps ymm2, ymm2, ymm3, 0xdd
1216 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
1217
1218 // step 2 - convert to U and V
1219 // from here down is very similar to Y code except
// instead of 32 different pixels, it's 16 pixels of U and 16 of V
1221 vpmaddubsw ymm1, ymm0, ymm7 // U
1222 vpmaddubsw ymm3, ymm2, ymm7
1223 vpmaddubsw ymm0, ymm0, ymm6 // V
1224 vpmaddubsw ymm2, ymm2, ymm6
1225 vphaddw ymm1, ymm1, ymm3 // mutates
1226 vphaddw ymm0, ymm0, ymm2
1227 vpsraw ymm1, ymm1, 8
1228 vpsraw ymm0, ymm0, 8
1229 vpacksswb ymm0, ymm1, ymm0 // mutates
1230 vpermq ymm0, ymm0, 0xd8 // For vpacksswb
1231 vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw
1232 vpaddb ymm0, ymm0, ymm5 // -> unsigned
1233
1234 // step 3 - store 16 U and 16 V values
1235 sub ecx, 32
1236 vextractf128 [edx], ymm0, 0 // U
1237 vextractf128 [edx + edi], ymm0, 1 // V
1238 lea edx, [edx + 16]
1239 jg convertloop
1240
1241 pop edi
1242 pop esi
1243 vzeroupper
1244 ret
1245 }
1246 }
1247 #endif // HAS_ARGBTOUVROW_AVX2
1248
1249 __declspec(naked) __declspec(align(16))
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
1252 __asm {
1253 push esi
1254 push edi
1255 mov eax, [esp + 8 + 4] // src_argb
1256 mov esi, [esp + 8 + 8] // src_stride_argb
1257 mov edx, [esp + 8 + 12] // dst_u
1258 mov edi, [esp + 8 + 16] // dst_v
1259 mov ecx, [esp + 8 + 20] // pix
1260 movdqa xmm7, kARGBToU
1261 movdqa xmm6, kARGBToV
1262 movdqa xmm5, kAddUV128
1263 sub edi, edx // stride from u to v
1264
1265 align 4
1266 convertloop:
1267 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1268 movdqu xmm0, [eax]
1269 movdqu xmm1, [eax + 16]
1270 movdqu xmm2, [eax + 32]
1271 movdqu xmm3, [eax + 48]
1272 movdqu xmm4, [eax + esi]
1273 pavgb xmm0, xmm4
1274 movdqu xmm4, [eax + esi + 16]
1275 pavgb xmm1, xmm4
1276 movdqu xmm4, [eax + esi + 32]
1277 pavgb xmm2, xmm4
1278 movdqu xmm4, [eax + esi + 48]
1279 pavgb xmm3, xmm4
1280 lea eax, [eax + 64]
1281 movdqa xmm4, xmm0
1282 shufps xmm0, xmm1, 0x88
1283 shufps xmm4, xmm1, 0xdd
1284 pavgb xmm0, xmm4
1285 movdqa xmm4, xmm2
1286 shufps xmm2, xmm3, 0x88
1287 shufps xmm4, xmm3, 0xdd
1288 pavgb xmm2, xmm4
1289
1290 // step 2 - convert to U and V
1291 // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
1293 movdqa xmm1, xmm0
1294 movdqa xmm3, xmm2
1295 pmaddubsw xmm0, xmm7 // U
1296 pmaddubsw xmm2, xmm7
1297 pmaddubsw xmm1, xmm6 // V
1298 pmaddubsw xmm3, xmm6
1299 phaddw xmm0, xmm2
1300 phaddw xmm1, xmm3
1301 psraw xmm0, 8
1302 psraw xmm1, 8
1303 packsswb xmm0, xmm1
1304 paddb xmm0, xmm5 // -> unsigned
1305
1306 // step 3 - store 8 U and 8 V values
1307 sub ecx, 16
1308 movlps qword ptr [edx], xmm0 // U
1309 movhps qword ptr [edx + edi], xmm0 // V
1310 lea edx, [edx + 8]
1311 jg convertloop
1312
1313 pop edi
1314 pop esi
1315 ret
1316 }
1317 }
1318
1319 __declspec(naked) __declspec(align(16))
void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
1322 __asm {
1323 push esi
1324 push edi
1325 mov eax, [esp + 8 + 4] // src_argb
1326 mov esi, [esp + 8 + 8] // src_stride_argb
1327 mov edx, [esp + 8 + 12] // dst_u
1328 mov edi, [esp + 8 + 16] // dst_v
1329 mov ecx, [esp + 8 + 20] // pix
1330 movdqa xmm7, kARGBToUJ
1331 movdqa xmm6, kARGBToVJ
1332 movdqa xmm5, kAddUVJ128
1333 sub edi, edx // stride from u to v
1334
1335 align 4
1336 convertloop:
1337 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1338 movdqu xmm0, [eax]
1339 movdqu xmm1, [eax + 16]
1340 movdqu xmm2, [eax + 32]
1341 movdqu xmm3, [eax + 48]
1342 movdqu xmm4, [eax + esi]
1343 pavgb xmm0, xmm4
1344 movdqu xmm4, [eax + esi + 16]
1345 pavgb xmm1, xmm4
1346 movdqu xmm4, [eax + esi + 32]
1347 pavgb xmm2, xmm4
1348 movdqu xmm4, [eax + esi + 48]
1349 pavgb xmm3, xmm4
1350 lea eax, [eax + 64]
1351 movdqa xmm4, xmm0
1352 shufps xmm0, xmm1, 0x88
1353 shufps xmm4, xmm1, 0xdd
1354 pavgb xmm0, xmm4
1355 movdqa xmm4, xmm2
1356 shufps xmm2, xmm3, 0x88
1357 shufps xmm4, xmm3, 0xdd
1358 pavgb xmm2, xmm4
1359
1360 // step 2 - convert to U and V
1361 // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
1363 movdqa xmm1, xmm0
1364 movdqa xmm3, xmm2
1365 pmaddubsw xmm0, xmm7 // U
1366 pmaddubsw xmm2, xmm7
1367 pmaddubsw xmm1, xmm6 // V
1368 pmaddubsw xmm3, xmm6
1369 phaddw xmm0, xmm2
1370 phaddw xmm1, xmm3
1371 paddw xmm0, xmm5 // +.5 rounding -> unsigned
1372 paddw xmm1, xmm5
1373 psraw xmm0, 8
1374 psraw xmm1, 8
1375 packsswb xmm0, xmm1
1376
1377 // step 3 - store 8 U and 8 V values
1378 sub ecx, 16
1379 movlps qword ptr [edx], xmm0 // U
1380 movhps qword ptr [edx + edi], xmm0 // V
1381 lea edx, [edx + 8]
1382 jg convertloop
1383
1384 pop edi
1385 pop esi
1386 ret
1387 }
1388 }
1389
1390 __declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
1393 __asm {
1394 push edi
1395 mov eax, [esp + 4 + 4] // src_argb
1396 mov edx, [esp + 4 + 8] // dst_u
1397 mov edi, [esp + 4 + 12] // dst_v
1398 mov ecx, [esp + 4 + 16] // pix
1399 movdqa xmm7, kARGBToU
1400 movdqa xmm6, kARGBToV
1401 movdqa xmm5, kAddUV128
1402 sub edi, edx // stride from u to v
1403
1404 align 4
1405 convertloop:
1406 /* convert to U and V */
1407 movdqa xmm0, [eax] // U
1408 movdqa xmm1, [eax + 16]
1409 movdqa xmm2, [eax + 32]
1410 movdqa xmm3, [eax + 48]
1411 pmaddubsw xmm0, xmm7
1412 pmaddubsw xmm1, xmm7
1413 pmaddubsw xmm2, xmm7
1414 pmaddubsw xmm3, xmm7
1415 phaddw xmm0, xmm1
1416 phaddw xmm2, xmm3
1417 psraw xmm0, 8
1418 psraw xmm2, 8
1419 packsswb xmm0, xmm2
1420 paddb xmm0, xmm5
1421 sub ecx, 16
1422 movdqa [edx], xmm0
1423
1424 movdqa xmm0, [eax] // V
1425 movdqa xmm1, [eax + 16]
1426 movdqa xmm2, [eax + 32]
1427 movdqa xmm3, [eax + 48]
1428 pmaddubsw xmm0, xmm6
1429 pmaddubsw xmm1, xmm6
1430 pmaddubsw xmm2, xmm6
1431 pmaddubsw xmm3, xmm6
1432 phaddw xmm0, xmm1
1433 phaddw xmm2, xmm3
1434 psraw xmm0, 8
1435 psraw xmm2, 8
1436 packsswb xmm0, xmm2
1437 paddb xmm0, xmm5
1438 lea eax, [eax + 64]
1439 movdqa [edx + edi], xmm0
1440 lea edx, [edx + 16]
1441 jg convertloop
1442
1443 pop edi
1444 ret
1445 }
1446 }
1447
1448 __declspec(naked) __declspec(align(16))
void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
1451 __asm {
1452 push edi
1453 mov eax, [esp + 4 + 4] // src_argb
1454 mov edx, [esp + 4 + 8] // dst_u
1455 mov edi, [esp + 4 + 12] // dst_v
1456 mov ecx, [esp + 4 + 16] // pix
1457 movdqa xmm7, kARGBToU
1458 movdqa xmm6, kARGBToV
1459 movdqa xmm5, kAddUV128
1460 sub edi, edx // stride from u to v
1461
1462 align 4
1463 convertloop:
1464 /* convert to U and V */
1465 movdqu xmm0, [eax] // U
1466 movdqu xmm1, [eax + 16]
1467 movdqu xmm2, [eax + 32]
1468 movdqu xmm3, [eax + 48]
1469 pmaddubsw xmm0, xmm7
1470 pmaddubsw xmm1, xmm7
1471 pmaddubsw xmm2, xmm7
1472 pmaddubsw xmm3, xmm7
1473 phaddw xmm0, xmm1
1474 phaddw xmm2, xmm3
1475 psraw xmm0, 8
1476 psraw xmm2, 8
1477 packsswb xmm0, xmm2
1478 paddb xmm0, xmm5
1479 sub ecx, 16
1480 movdqu [edx], xmm0
1481
1482 movdqu xmm0, [eax] // V
1483 movdqu xmm1, [eax + 16]
1484 movdqu xmm2, [eax + 32]
1485 movdqu xmm3, [eax + 48]
1486 pmaddubsw xmm0, xmm6
1487 pmaddubsw xmm1, xmm6
1488 pmaddubsw xmm2, xmm6
1489 pmaddubsw xmm3, xmm6
1490 phaddw xmm0, xmm1
1491 phaddw xmm2, xmm3
1492 psraw xmm0, 8
1493 psraw xmm2, 8
1494 packsswb xmm0, xmm2
1495 paddb xmm0, xmm5
1496 lea eax, [eax + 64]
1497 movdqu [edx + edi], xmm0
1498 lea edx, [edx + 16]
1499 jg convertloop
1500
1501 pop edi
1502 ret
1503 }
1504 }
1505
1506 __declspec(naked) __declspec(align(16))
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
1509 __asm {
1510 push edi
1511 mov eax, [esp + 4 + 4] // src_argb
1512 mov edx, [esp + 4 + 8] // dst_u
1513 mov edi, [esp + 4 + 12] // dst_v
1514 mov ecx, [esp + 4 + 16] // pix
1515 movdqa xmm7, kARGBToU
1516 movdqa xmm6, kARGBToV
1517 movdqa xmm5, kAddUV128
1518 sub edi, edx // stride from u to v
1519
1520 align 4
1521 convertloop:
1522 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1523 movdqa xmm0, [eax]
1524 movdqa xmm1, [eax + 16]
1525 movdqa xmm2, [eax + 32]
1526 movdqa xmm3, [eax + 48]
1527 lea eax, [eax + 64]
1528 movdqa xmm4, xmm0
1529 shufps xmm0, xmm1, 0x88
1530 shufps xmm4, xmm1, 0xdd
1531 pavgb xmm0, xmm4
1532 movdqa xmm4, xmm2
1533 shufps xmm2, xmm3, 0x88
1534 shufps xmm4, xmm3, 0xdd
1535 pavgb xmm2, xmm4
1536
1537 // step 2 - convert to U and V
1538 // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
1540 movdqa xmm1, xmm0
1541 movdqa xmm3, xmm2
1542 pmaddubsw xmm0, xmm7 // U
1543 pmaddubsw xmm2, xmm7
1544 pmaddubsw xmm1, xmm6 // V
1545 pmaddubsw xmm3, xmm6
1546 phaddw xmm0, xmm2
1547 phaddw xmm1, xmm3
1548 psraw xmm0, 8
1549 psraw xmm1, 8
1550 packsswb xmm0, xmm1
1551 paddb xmm0, xmm5 // -> unsigned
1552
1553 // step 3 - store 8 U and 8 V values
1554 sub ecx, 16
1555 movlps qword ptr [edx], xmm0 // U
1556 movhps qword ptr [edx + edi], xmm0 // V
1557 lea edx, [edx + 8]
1558 jg convertloop
1559
1560 pop edi
1561 ret
1562 }
1563 }
1564
1565 __declspec(naked) __declspec(align(16))
void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
1568 __asm {
1569 push edi
1570 mov eax, [esp + 4 + 4] // src_argb
1571 mov edx, [esp + 4 + 8] // dst_u
1572 mov edi, [esp + 4 + 12] // dst_v
1573 mov ecx, [esp + 4 + 16] // pix
1574 movdqa xmm7, kARGBToU
1575 movdqa xmm6, kARGBToV
1576 movdqa xmm5, kAddUV128
1577 sub edi, edx // stride from u to v
1578
1579 align 4
1580 convertloop:
1581 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1582 movdqu xmm0, [eax]
1583 movdqu xmm1, [eax + 16]
1584 movdqu xmm2, [eax + 32]
1585 movdqu xmm3, [eax + 48]
1586 lea eax, [eax + 64]
1587 movdqa xmm4, xmm0
1588 shufps xmm0, xmm1, 0x88
1589 shufps xmm4, xmm1, 0xdd
1590 pavgb xmm0, xmm4
1591 movdqa xmm4, xmm2
1592 shufps xmm2, xmm3, 0x88
1593 shufps xmm4, xmm3, 0xdd
1594 pavgb xmm2, xmm4
1595
1596 // step 2 - convert to U and V
1597 // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
1599 movdqa xmm1, xmm0
1600 movdqa xmm3, xmm2
1601 pmaddubsw xmm0, xmm7 // U
1602 pmaddubsw xmm2, xmm7
1603 pmaddubsw xmm1, xmm6 // V
1604 pmaddubsw xmm3, xmm6
1605 phaddw xmm0, xmm2
1606 phaddw xmm1, xmm3
1607 psraw xmm0, 8
1608 psraw xmm1, 8
1609 packsswb xmm0, xmm1
1610 paddb xmm0, xmm5 // -> unsigned
1611
1612 // step 3 - store 8 U and 8 V values
1613 sub ecx, 16
1614 movlps qword ptr [edx], xmm0 // U
1615 movhps qword ptr [edx + edi], xmm0 // V
1616 lea edx, [edx + 8]
1617 jg convertloop
1618
1619 pop edi
1620 ret
1621 }
1622 }
1623
1624 __declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
1627 __asm {
1628 push esi
1629 push edi
1630 mov eax, [esp + 8 + 4] // src_argb
1631 mov esi, [esp + 8 + 8] // src_stride_argb
1632 mov edx, [esp + 8 + 12] // dst_u
1633 mov edi, [esp + 8 + 16] // dst_v
1634 mov ecx, [esp + 8 + 20] // pix
1635 movdqa xmm7, kBGRAToU
1636 movdqa xmm6, kBGRAToV
1637 movdqa xmm5, kAddUV128
1638 sub edi, edx // stride from u to v
1639
1640 align 4
1641 convertloop:
1642 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1643 movdqa xmm0, [eax]
1644 movdqa xmm1, [eax + 16]
1645 movdqa xmm2, [eax + 32]
1646 movdqa xmm3, [eax + 48]
1647 pavgb xmm0, [eax + esi]
1648 pavgb xmm1, [eax + esi + 16]
1649 pavgb xmm2, [eax + esi + 32]
1650 pavgb xmm3, [eax + esi + 48]
1651 lea eax, [eax + 64]
1652 movdqa xmm4, xmm0
1653 shufps xmm0, xmm1, 0x88
1654 shufps xmm4, xmm1, 0xdd
1655 pavgb xmm0, xmm4
1656 movdqa xmm4, xmm2
1657 shufps xmm2, xmm3, 0x88
1658 shufps xmm4, xmm3, 0xdd
1659 pavgb xmm2, xmm4
1660
1661 // step 2 - convert to U and V
1662 // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
1664 movdqa xmm1, xmm0
1665 movdqa xmm3, xmm2
1666 pmaddubsw xmm0, xmm7 // U
1667 pmaddubsw xmm2, xmm7
1668 pmaddubsw xmm1, xmm6 // V
1669 pmaddubsw xmm3, xmm6
1670 phaddw xmm0, xmm2
1671 phaddw xmm1, xmm3
1672 psraw xmm0, 8
1673 psraw xmm1, 8
1674 packsswb xmm0, xmm1
1675 paddb xmm0, xmm5 // -> unsigned
1676
1677 // step 3 - store 8 U and 8 V values
1678 sub ecx, 16
1679 movlps qword ptr [edx], xmm0 // U
1680 movhps qword ptr [edx + edi], xmm0 // V
1681 lea edx, [edx + 8]
1682 jg convertloop
1683
1684 pop edi
1685 pop esi
1686 ret
1687 }
1688 }
1689
1690 __declspec(naked) __declspec(align(16))
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
1693 __asm {
1694 push esi
1695 push edi
1696 mov eax, [esp + 8 + 4] // src_argb
1697 mov esi, [esp + 8 + 8] // src_stride_argb
1698 mov edx, [esp + 8 + 12] // dst_u
1699 mov edi, [esp + 8 + 16] // dst_v
1700 mov ecx, [esp + 8 + 20] // pix
1701 movdqa xmm7, kBGRAToU
1702 movdqa xmm6, kBGRAToV
1703 movdqa xmm5, kAddUV128
1704 sub edi, edx // stride from u to v
1705
1706 align 4
1707 convertloop:
1708 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1709 movdqu xmm0, [eax]
1710 movdqu xmm1, [eax + 16]
1711 movdqu xmm2, [eax + 32]
1712 movdqu xmm3, [eax + 48]
1713 movdqu xmm4, [eax + esi]
1714 pavgb xmm0, xmm4
1715 movdqu xmm4, [eax + esi + 16]
1716 pavgb xmm1, xmm4
1717 movdqu xmm4, [eax + esi + 32]
1718 pavgb xmm2, xmm4
1719 movdqu xmm4, [eax + esi + 48]
1720 pavgb xmm3, xmm4
1721 lea eax, [eax + 64]
1722 movdqa xmm4, xmm0
1723 shufps xmm0, xmm1, 0x88
1724 shufps xmm4, xmm1, 0xdd
1725 pavgb xmm0, xmm4
1726 movdqa xmm4, xmm2
1727 shufps xmm2, xmm3, 0x88
1728 shufps xmm4, xmm3, 0xdd
1729 pavgb xmm2, xmm4
1730
1731 // step 2 - convert to U and V
1732 // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
1734 movdqa xmm1, xmm0
1735 movdqa xmm3, xmm2
1736 pmaddubsw xmm0, xmm7 // U
1737 pmaddubsw xmm2, xmm7
1738 pmaddubsw xmm1, xmm6 // V
1739 pmaddubsw xmm3, xmm6
1740 phaddw xmm0, xmm2
1741 phaddw xmm1, xmm3
1742 psraw xmm0, 8
1743 psraw xmm1, 8
1744 packsswb xmm0, xmm1
1745 paddb xmm0, xmm5 // -> unsigned
1746
1747 // step 3 - store 8 U and 8 V values
1748 sub ecx, 16
1749 movlps qword ptr [edx], xmm0 // U
1750 movhps qword ptr [edx + edi], xmm0 // V
1751 lea edx, [edx + 8]
1752 jg convertloop
1753
1754 pop edi
1755 pop esi
1756 ret
1757 }
1758 }
1759
1760 __declspec(naked) __declspec(align(16))
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
1763 __asm {
1764 push esi
1765 push edi
1766 mov eax, [esp + 8 + 4] // src_argb
1767 mov esi, [esp + 8 + 8] // src_stride_argb
1768 mov edx, [esp + 8 + 12] // dst_u
1769 mov edi, [esp + 8 + 16] // dst_v
1770 mov ecx, [esp + 8 + 20] // pix
1771 movdqa xmm7, kABGRToU
1772 movdqa xmm6, kABGRToV
1773 movdqa xmm5, kAddUV128
1774 sub edi, edx // stride from u to v
1775
1776 align 4
1777 convertloop:
1778 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1779 movdqa xmm0, [eax]
1780 movdqa xmm1, [eax + 16]
1781 movdqa xmm2, [eax + 32]
1782 movdqa xmm3, [eax + 48]
1783 pavgb xmm0, [eax + esi]
1784 pavgb xmm1, [eax + esi + 16]
1785 pavgb xmm2, [eax + esi + 32]
1786 pavgb xmm3, [eax + esi + 48]
1787 lea eax, [eax + 64]
1788 movdqa xmm4, xmm0
1789 shufps xmm0, xmm1, 0x88
1790 shufps xmm4, xmm1, 0xdd
1791 pavgb xmm0, xmm4
1792 movdqa xmm4, xmm2
1793 shufps xmm2, xmm3, 0x88
1794 shufps xmm4, xmm3, 0xdd
1795 pavgb xmm2, xmm4
1796
1797 // step 2 - convert to U and V
1798 // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
1800 movdqa xmm1, xmm0
1801 movdqa xmm3, xmm2
1802 pmaddubsw xmm0, xmm7 // U
1803 pmaddubsw xmm2, xmm7
1804 pmaddubsw xmm1, xmm6 // V
1805 pmaddubsw xmm3, xmm6
1806 phaddw xmm0, xmm2
1807 phaddw xmm1, xmm3
1808 psraw xmm0, 8
1809 psraw xmm1, 8
1810 packsswb xmm0, xmm1
1811 paddb xmm0, xmm5 // -> unsigned
1812
1813 // step 3 - store 8 U and 8 V values
1814 sub ecx, 16
1815 movlps qword ptr [edx], xmm0 // U
1816 movhps qword ptr [edx + edi], xmm0 // V
1817 lea edx, [edx + 8]
1818 jg convertloop
1819
1820 pop edi
1821 pop esi
1822 ret
1823 }
1824 }
1825
1826 __declspec(naked) __declspec(align(16))
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
1829 __asm {
1830 push esi
1831 push edi
1832 mov eax, [esp + 8 + 4] // src_argb
1833 mov esi, [esp + 8 + 8] // src_stride_argb
1834 mov edx, [esp + 8 + 12] // dst_u
1835 mov edi, [esp + 8 + 16] // dst_v
1836 mov ecx, [esp + 8 + 20] // pix
1837 movdqa xmm7, kABGRToU
1838 movdqa xmm6, kABGRToV
1839 movdqa xmm5, kAddUV128
1840 sub edi, edx // stride from u to v
1841
1842 align 4
1843 convertloop:
1844 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1845 movdqu xmm0, [eax]
1846 movdqu xmm1, [eax + 16]
1847 movdqu xmm2, [eax + 32]
1848 movdqu xmm3, [eax + 48]
1849 movdqu xmm4, [eax + esi]
1850 pavgb xmm0, xmm4
1851 movdqu xmm4, [eax + esi + 16]
1852 pavgb xmm1, xmm4
1853 movdqu xmm4, [eax + esi + 32]
1854 pavgb xmm2, xmm4
1855 movdqu xmm4, [eax + esi + 48]
1856 pavgb xmm3, xmm4
1857 lea eax, [eax + 64]
1858 movdqa xmm4, xmm0
1859 shufps xmm0, xmm1, 0x88
1860 shufps xmm4, xmm1, 0xdd
1861 pavgb xmm0, xmm4
1862 movdqa xmm4, xmm2
1863 shufps xmm2, xmm3, 0x88
1864 shufps xmm4, xmm3, 0xdd
1865 pavgb xmm2, xmm4
1866
1867 // step 2 - convert to U and V
1868 // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
1870 movdqa xmm1, xmm0
1871 movdqa xmm3, xmm2
1872 pmaddubsw xmm0, xmm7 // U
1873 pmaddubsw xmm2, xmm7
1874 pmaddubsw xmm1, xmm6 // V
1875 pmaddubsw xmm3, xmm6
1876 phaddw xmm0, xmm2
1877 phaddw xmm1, xmm3
1878 psraw xmm0, 8
1879 psraw xmm1, 8
1880 packsswb xmm0, xmm1
1881 paddb xmm0, xmm5 // -> unsigned
1882
1883 // step 3 - store 8 U and 8 V values
1884 sub ecx, 16
1885 movlps qword ptr [edx], xmm0 // U
1886 movhps qword ptr [edx + edi], xmm0 // V
1887 lea edx, [edx + 8]
1888 jg convertloop
1889
1890 pop edi
1891 pop esi
1892 ret
1893 }
1894 }
1895
1896 __declspec(naked) __declspec(align(16))
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
1899 __asm {
1900 push esi
1901 push edi
1902 mov eax, [esp + 8 + 4] // src_argb
1903 mov esi, [esp + 8 + 8] // src_stride_argb
1904 mov edx, [esp + 8 + 12] // dst_u
1905 mov edi, [esp + 8 + 16] // dst_v
1906 mov ecx, [esp + 8 + 20] // pix
1907 movdqa xmm7, kRGBAToU
1908 movdqa xmm6, kRGBAToV
1909 movdqa xmm5, kAddUV128
1910 sub edi, edx // stride from u to v
1911
1912 align 4
1913 convertloop:
1914 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1915 movdqa xmm0, [eax]
1916 movdqa xmm1, [eax + 16]
1917 movdqa xmm2, [eax + 32]
1918 movdqa xmm3, [eax + 48]
1919 pavgb xmm0, [eax + esi]
1920 pavgb xmm1, [eax + esi + 16]
1921 pavgb xmm2, [eax + esi + 32]
1922 pavgb xmm3, [eax + esi + 48]
1923 lea eax, [eax + 64]
1924 movdqa xmm4, xmm0
1925 shufps xmm0, xmm1, 0x88
1926 shufps xmm4, xmm1, 0xdd
1927 pavgb xmm0, xmm4
1928 movdqa xmm4, xmm2
1929 shufps xmm2, xmm3, 0x88
1930 shufps xmm4, xmm3, 0xdd
1931 pavgb xmm2, xmm4
1932
1933 // step 2 - convert to U and V
1934 // from here down is very similar to Y code except
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
1936 movdqa xmm1, xmm0
1937 movdqa xmm3, xmm2
1938 pmaddubsw xmm0, xmm7 // U
1939 pmaddubsw xmm2, xmm7
1940 pmaddubsw xmm1, xmm6 // V
1941 pmaddubsw xmm3, xmm6
1942 phaddw xmm0, xmm2
1943 phaddw xmm1, xmm3
1944 psraw xmm0, 8
1945 psraw xmm1, 8
1946 packsswb xmm0, xmm1
1947 paddb xmm0, xmm5 // -> unsigned
1948
1949 // step 3 - store 8 U and 8 V values
1950 sub ecx, 16
1951 movlps qword ptr [edx], xmm0 // U
1952 movhps qword ptr [edx + edi], xmm0 // V
1953 lea edx, [edx + 8]
1954 jg convertloop
1955
1956 pop edi
1957 pop esi
1958 ret
1959 }
1960 }
1961
1962 __declspec(naked) __declspec(align(16))
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
1965 __asm {
1966 push esi
1967 push edi
1968 mov eax, [esp + 8 + 4] // src_argb
1969 mov esi, [esp + 8 + 8] // src_stride_argb
1970 mov edx, [esp + 8 + 12] // dst_u
1971 mov edi, [esp + 8 + 16] // dst_v
1972 mov ecx, [esp + 8 + 20] // pix
1973 movdqa xmm7, kRGBAToU
1974 movdqa xmm6, kRGBAToV
1975 movdqa xmm5, kAddUV128
1976 sub edi, edx // stride from u to v
1977
1978 align 4
1979 convertloop:
1980 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1981 movdqu xmm0, [eax]
1982 movdqu xmm1, [eax + 16]
1983 movdqu xmm2, [eax + 32]
1984 movdqu xmm3, [eax + 48]
1985 movdqu xmm4, [eax + esi]
1986 pavgb xmm0, xmm4
1987 movdqu xmm4, [eax + esi + 16]
1988 pavgb xmm1, xmm4
1989 movdqu xmm4, [eax + esi + 32]
1990 pavgb xmm2, xmm4
1991 movdqu xmm4, [eax + esi + 48]
1992 pavgb xmm3, xmm4
1993 lea eax, [eax + 64]
1994 movdqa xmm4, xmm0
1995 shufps xmm0, xmm1, 0x88
1996 shufps xmm4, xmm1, 0xdd
1997 pavgb xmm0, xmm4
1998 movdqa xmm4, xmm2
1999 shufps xmm2, xmm3, 0x88
2000 shufps xmm4, xmm3, 0xdd
2001 pavgb xmm2, xmm4
2002
2003 // step 2 - convert to U and V
2004 // from here down is very similar to Y code except
2005     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
2006 movdqa xmm1, xmm0
2007 movdqa xmm3, xmm2
2008 pmaddubsw xmm0, xmm7 // U
2009 pmaddubsw xmm2, xmm7
2010 pmaddubsw xmm1, xmm6 // V
2011 pmaddubsw xmm3, xmm6
2012 phaddw xmm0, xmm2
2013 phaddw xmm1, xmm3
2014 psraw xmm0, 8
2015 psraw xmm1, 8
2016 packsswb xmm0, xmm1
2017 paddb xmm0, xmm5 // -> unsigned
2018
2019 // step 3 - store 8 U and 8 V values
2020 sub ecx, 16
2021 movlps qword ptr [edx], xmm0 // U
2022 movhps qword ptr [edx + edi], xmm0 // V
2023 lea edx, [edx + 8]
2024 jg convertloop
2025
2026 pop edi
2027 pop esi
2028 ret
2029 }
2030 }
2031 #endif // HAS_ARGBTOYROW_SSSE3
2032
2033 #define YG 74 /* (int8)(1.164 * 64 + 0.5) */
2034
2035 #define UB 127 /* 2.018 * 64 = 129; saturated to int8 max of 127 */
2036 #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
2037 #define UR 0
2038
2039 #define VB 0
2040 #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
2041 #define VR 102 /* (int8)(1.596 * 64 + 0.5) */
2042
2043 // Bias
2044 #define BB UB * 128 + VB * 128
2045 #define BG UG * 128 + VG * 128
2046 #define BR UR * 128 + VR * 128
2047
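// The SIMD code below works in 6 bit fixed point: pmaddubsw multiplies the
// unsigned U/V bytes by the signed coefficients above, and the kUVBias*
// constants fold the "- 128" centering of U and V into a single subtract.
// A minimal scalar sketch of the same math follows (illustrative only; the
// names YuvPixelReference and Clamp255 are not part of libyuv).
#if 0  // Reference sketch; not compiled.
static int Clamp255(int v) {
  return v < 0 ? 0 : (v > 255 ? 255 : v);
}
static void YuvPixelReference(uint8 y, uint8 u, uint8 v,
                              uint8* b, uint8* g, uint8* r) {
  int y1 = (y - 16) * YG;  // 1.164 in 6 bit fixed point.
  *b = Clamp255(((u - 128) * UB + (v - 128) * VB + y1) >> 6);
  *g = Clamp255(((u - 128) * UG + (v - 128) * VG + y1) >> 6);
  *r = Clamp255(((u - 128) * UR + (v - 128) * VR + y1) >> 6);
}
#endif
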
2048 #ifdef HAS_I422TOARGBROW_AVX2
2049
2050 static const lvec8 kUVToB_AVX = {
2051 UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
2052 UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
2053 };
2054 static const lvec8 kUVToR_AVX = {
2055 UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
2056 UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
2057 };
2058 static const lvec8 kUVToG_AVX = {
2059 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
2060 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
2061 };
2062 static const lvec16 kYToRgb_AVX = {
2063 YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
2064 };
2065 static const lvec16 kYSub16_AVX = {
2066 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
2067 };
2068 static const lvec16 kUVBiasB_AVX = {
2069 BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
2070 };
2071 static const lvec16 kUVBiasG_AVX = {
2072 BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
2073 };
2074 static const lvec16 kUVBiasR_AVX = {
2075 BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
2076 };
2077
2078 // 16 pixels
2079 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2080 __declspec(naked) __declspec(align(16))
2081 void I422ToARGBRow_AVX2(const uint8* y_buf,
2082 const uint8* u_buf,
2083 const uint8* v_buf,
2084 uint8* dst_argb,
2085 int width) {
2086 __asm {
2087 push esi
2088 push edi
2089 mov eax, [esp + 8 + 4] // Y
2090 mov esi, [esp + 8 + 8] // U
2091 mov edi, [esp + 8 + 12] // V
2092 mov edx, [esp + 8 + 16] // argb
2093 mov ecx, [esp + 8 + 20] // width
2094 sub edi, esi
2095 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2096 vpxor ymm4, ymm4, ymm4
2097
2098 align 4
2099 convertloop:
2100 vmovq xmm0, qword ptr [esi] // U
2101 vmovq xmm1, qword ptr [esi + edi] // V
2102 lea esi, [esi + 8]
2103 vpunpcklbw ymm0, ymm0, ymm1 // UV
2104 vpermq ymm0, ymm0, 0xd8
2105 vpunpcklwd ymm0, ymm0, ymm0 // UVUV
2106 vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV
2107 vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV
2108 vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV
2109 vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed
2110 vpsubw ymm1, ymm1, kUVBiasG_AVX
2111 vpsubw ymm0, ymm0, kUVBiasR_AVX
2112
2113 // Step 2: Find Y contribution to 16 R,G,B values
2114 vmovdqu xmm3, [eax] // NOLINT
2115 lea eax, [eax + 16]
2116 vpermq ymm3, ymm3, 0xd8
2117 vpunpcklbw ymm3, ymm3, ymm4
2118 vpsubsw ymm3, ymm3, kYSub16_AVX
2119 vpmullw ymm3, ymm3, kYToRgb_AVX
2120 vpaddsw ymm2, ymm2, ymm3 // B += Y
2121 vpaddsw ymm1, ymm1, ymm3 // G += Y
2122 vpaddsw ymm0, ymm0, ymm3 // R += Y
2123 vpsraw ymm2, ymm2, 6
2124 vpsraw ymm1, ymm1, 6
2125 vpsraw ymm0, ymm0, 6
2126 vpackuswb ymm2, ymm2, ymm2 // B
2127 vpackuswb ymm1, ymm1, ymm1 // G
2128 vpackuswb ymm0, ymm0, ymm0 // R
2129
2130 // Step 3: Weave into ARGB
2131 vpunpcklbw ymm2, ymm2, ymm1 // BG
2132 vpermq ymm2, ymm2, 0xd8
2133 vpunpcklbw ymm0, ymm0, ymm5 // RA
2134 vpermq ymm0, ymm0, 0xd8
2135 vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels
2136 vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels
2137 vmovdqu [edx], ymm1
2138 vmovdqu [edx + 32], ymm2
2139 lea edx, [edx + 64]
2140 sub ecx, 16
2141 jg convertloop
2142 vzeroupper
2143
2144 pop edi
2145 pop esi
2146 ret
2147 }
2148 }
2149 #endif // HAS_I422TOARGBROW_AVX2
2150
2151 #ifdef HAS_I422TOARGBROW_SSSE3
2152
2153 static const vec8 kUVToB = {
2154 UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
2155 };
2156
2157 static const vec8 kUVToR = {
2158 UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
2159 };
2160
2161 static const vec8 kUVToG = {
2162 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
2163 };
2164
2165 static const vec8 kVUToB = {
2166 VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
2167 };
2168
2169 static const vec8 kVUToR = {
2170 VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
2171 };
2172
2173 static const vec8 kVUToG = {
2174 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
2175 };
2176
2177 static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
2178 static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
2179 static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
2180 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
2181 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
2182
2183 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2184
2185 // Read 8 UV from 444.
2186 #define READYUV444 __asm { \
2187 __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
2188 __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
2189 __asm lea esi, [esi + 8] \
2190 __asm punpcklbw xmm0, xmm1 /* UV */ \
2191 }
2192
2193 // Read 4 UV from 422, upsample to 8 UV.
2194 #define READYUV422 __asm { \
2195 __asm movd xmm0, [esi] /* U */ \
2196 __asm movd xmm1, [esi + edi] /* V */ \
2197 __asm lea esi, [esi + 4] \
2198 __asm punpcklbw xmm0, xmm1 /* UV */ \
2199 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2200 }
2201
2202 // Read 2 UV from 411, upsample to 8 UV.
2203 #define READYUV411 __asm { \
2204 __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \
2205 __asm movd xmm0, ebx \
2206 __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \
2207 __asm movd xmm1, ebx \
2208 __asm lea esi, [esi + 2] \
2209 __asm punpcklbw xmm0, xmm1 /* UV */ \
2210 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2211 __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
2212 }
2213
2214 // Read 4 UV from NV12, upsample to 8 UV.
2215 #define READNV12 __asm { \
2216 __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
2217 __asm lea esi, [esi + 8] \
2218 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2219 }
2220
2221 // Convert 8 pixels: 8 UV and 8 Y.
2222 #define YUVTORGB __asm { \
2223 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
2224 __asm movdqa xmm1, xmm0 \
2225 __asm movdqa xmm2, xmm0 \
2226 __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
2227 __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
2228 __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
2229 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
2230 __asm psubw xmm1, kUVBiasG \
2231 __asm psubw xmm2, kUVBiasR \
2232 /* Step 2: Find Y contribution to 8 R,G,B values */ \
2233 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
2234 __asm lea eax, [eax + 8] \
2235 __asm punpcklbw xmm3, xmm4 \
2236 __asm psubsw xmm3, kYSub16 \
2237 __asm pmullw xmm3, kYToRgb \
2238 __asm paddsw xmm0, xmm3 /* B += Y */ \
2239 __asm paddsw xmm1, xmm3 /* G += Y */ \
2240 __asm paddsw xmm2, xmm3 /* R += Y */ \
2241 __asm psraw xmm0, 6 \
2242 __asm psraw xmm1, 6 \
2243 __asm psraw xmm2, 6 \
2244 __asm packuswb xmm0, xmm0 /* B */ \
2245 __asm packuswb xmm1, xmm1 /* G */ \
2246 __asm packuswb xmm2, xmm2 /* R */ \
2247 }
2248
2249 // Convert 8 pixels: 8 VU and 8 Y.
2250 #define YVUTORGB __asm { \
2251 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
2252 __asm movdqa xmm1, xmm0 \
2253 __asm movdqa xmm2, xmm0 \
2254 __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
2255 __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
2256 __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
2257 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
2258 __asm psubw xmm1, kUVBiasG \
2259 __asm psubw xmm2, kUVBiasR \
2260 /* Step 2: Find Y contribution to 8 R,G,B values */ \
2261 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
2262 __asm lea eax, [eax + 8] \
2263 __asm punpcklbw xmm3, xmm4 \
2264 __asm psubsw xmm3, kYSub16 \
2265 __asm pmullw xmm3, kYToRgb \
2266 __asm paddsw xmm0, xmm3 /* B += Y */ \
2267 __asm paddsw xmm1, xmm3 /* G += Y */ \
2268 __asm paddsw xmm2, xmm3 /* R += Y */ \
2269 __asm psraw xmm0, 6 \
2270 __asm psraw xmm1, 6 \
2271 __asm psraw xmm2, 6 \
2272 __asm packuswb xmm0, xmm0 /* B */ \
2273 __asm packuswb xmm1, xmm1 /* G */ \
2274 __asm packuswb xmm2, xmm2 /* R */ \
2275 }
2276
2277 // 8 pixels, dest aligned 16.
2278 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2279 __declspec(naked) __declspec(align(16))
2280 void I444ToARGBRow_SSSE3(const uint8* y_buf,
2281 const uint8* u_buf,
2282 const uint8* v_buf,
2283 uint8* dst_argb,
2284 int width) {
2285 __asm {
2286 push esi
2287 push edi
2288 mov eax, [esp + 8 + 4] // Y
2289 mov esi, [esp + 8 + 8] // U
2290 mov edi, [esp + 8 + 12] // V
2291 mov edx, [esp + 8 + 16] // argb
2292 mov ecx, [esp + 8 + 20] // width
2293 sub edi, esi
2294 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2295 pxor xmm4, xmm4
2296
2297 align 4
2298 convertloop:
2299 READYUV444
2300 YUVTORGB
2301
2302 // Step 3: Weave into ARGB
2303 punpcklbw xmm0, xmm1 // BG
2304 punpcklbw xmm2, xmm5 // RA
2305 movdqa xmm1, xmm0
2306 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2307 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2308 movdqa [edx], xmm0
2309 movdqa [edx + 16], xmm1
2310 lea edx, [edx + 32]
2311 sub ecx, 8
2312 jg convertloop
2313
2314 pop edi
2315 pop esi
2316 ret
2317 }
2318 }
2319
2320 // 8 pixels, dest aligned 16.
2321 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2322 __declspec(naked) __declspec(align(16))
2323 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
2324 const uint8* u_buf,
2325 const uint8* v_buf,
2326 uint8* dst_rgb24,
2327 int width) {
2328 __asm {
2329 push esi
2330 push edi
2331 mov eax, [esp + 8 + 4] // Y
2332 mov esi, [esp + 8 + 8] // U
2333 mov edi, [esp + 8 + 12] // V
2334 mov edx, [esp + 8 + 16] // rgb24
2335 mov ecx, [esp + 8 + 20] // width
2336 sub edi, esi
2337 pxor xmm4, xmm4
2338 movdqa xmm5, kShuffleMaskARGBToRGB24_0
2339 movdqa xmm6, kShuffleMaskARGBToRGB24
2340
2341 align 4
2342 convertloop:
2343 READYUV422
2344 YUVTORGB
2345
2346 // Step 3: Weave into RRGB
2347 punpcklbw xmm0, xmm1 // BG
2348 punpcklbw xmm2, xmm2 // RR
2349 movdqa xmm1, xmm0
2350 punpcklwd xmm0, xmm2 // BGRR first 4 pixels
2351 punpckhwd xmm1, xmm2 // BGRR next 4 pixels
2352 pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
2353 pshufb xmm1, xmm6 // Pack into first 12 bytes.
2354 palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
2355 movq qword ptr [edx], xmm0 // First 8 bytes
2356 movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
2357 lea edx, [edx + 24]
2358 sub ecx, 8
2359 jg convertloop
2360
2361 pop edi
2362 pop esi
2363 ret
2364 }
2365 }
2366
2367 // 8 pixels, dest aligned 16.
2368 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2369 __declspec(naked) __declspec(align(16))
2370 void I422ToRAWRow_SSSE3(const uint8* y_buf,
2371 const uint8* u_buf,
2372 const uint8* v_buf,
2373 uint8* dst_raw,
2374 int width) {
2375 __asm {
2376 push esi
2377 push edi
2378 mov eax, [esp + 8 + 4] // Y
2379 mov esi, [esp + 8 + 8] // U
2380 mov edi, [esp + 8 + 12] // V
2381 mov edx, [esp + 8 + 16] // raw
2382 mov ecx, [esp + 8 + 20] // width
2383 sub edi, esi
2384 pxor xmm4, xmm4
2385 movdqa xmm5, kShuffleMaskARGBToRAW_0
2386 movdqa xmm6, kShuffleMaskARGBToRAW
2387
2388 align 4
2389 convertloop:
2390 READYUV422
2391 YUVTORGB
2392
2393 // Step 3: Weave into RRGB
2394 punpcklbw xmm0, xmm1 // BG
2395 punpcklbw xmm2, xmm2 // RR
2396 movdqa xmm1, xmm0
2397 punpcklwd xmm0, xmm2 // BGRR first 4 pixels
2398 punpckhwd xmm1, xmm2 // BGRR next 4 pixels
2399 pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
2400 pshufb xmm1, xmm6 // Pack into first 12 bytes.
2401 palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
2402 movq qword ptr [edx], xmm0 // First 8 bytes
2403 movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
2404 lea edx, [edx + 24]
2405 sub ecx, 8
2406 jg convertloop
2407
2408 pop edi
2409 pop esi
2410 ret
2411 }
2412 }
2413
2414 // 8 pixels, dest unaligned.
2415 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2416 __declspec(naked) __declspec(align(16))
2417 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
2418 const uint8* u_buf,
2419 const uint8* v_buf,
2420 uint8* rgb565_buf,
2421 int width) {
2422 __asm {
2423 push esi
2424 push edi
2425 mov eax, [esp + 8 + 4] // Y
2426 mov esi, [esp + 8 + 8] // U
2427 mov edi, [esp + 8 + 12] // V
2428 mov edx, [esp + 8 + 16] // rgb565
2429 mov ecx, [esp + 8 + 20] // width
2430 sub edi, esi
2431 pxor xmm4, xmm4
2432 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
2433 psrld xmm5, 27
2434 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
2435 psrld xmm6, 26
2436 pslld xmm6, 5
2437 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
2438 pslld xmm7, 11
2439
2440 align 4
2441 convertloop:
2442 READYUV422
2443 YUVTORGB
2444
2445 // Step 3: Weave into RRGB
2446 punpcklbw xmm0, xmm1 // BG
2447 punpcklbw xmm2, xmm2 // RR
2448 movdqa xmm1, xmm0
2449 punpcklwd xmm0, xmm2 // BGRR first 4 pixels
2450 punpckhwd xmm1, xmm2 // BGRR next 4 pixels
2451
2452 // Step 3b: RRGB -> RGB565
2453 movdqa xmm3, xmm0 // B first 4 pixels of argb
2454 movdqa xmm2, xmm0 // G
2455 pslld xmm0, 8 // R
2456 psrld xmm3, 3 // B
2457 psrld xmm2, 5 // G
2458 psrad xmm0, 16 // R
2459 pand xmm3, xmm5 // B
2460 pand xmm2, xmm6 // G
2461 pand xmm0, xmm7 // R
2462 por xmm3, xmm2 // BG
2463 por xmm0, xmm3 // BGR
2464 movdqa xmm3, xmm1 // B next 4 pixels of argb
2465 movdqa xmm2, xmm1 // G
2466 pslld xmm1, 8 // R
2467 psrld xmm3, 3 // B
2468 psrld xmm2, 5 // G
2469 psrad xmm1, 16 // R
2470 pand xmm3, xmm5 // B
2471 pand xmm2, xmm6 // G
2472 pand xmm1, xmm7 // R
2473 por xmm3, xmm2 // BG
2474 por xmm1, xmm3 // BGR
2475 packssdw xmm0, xmm1
2476 sub ecx, 8
2477 movdqu [edx], xmm0 // store 8 pixels of RGB565
2478 lea edx, [edx + 16]
2479 jg convertloop
2480
2481 pop edi
2482 pop esi
2483 ret
2484 }
2485 }
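
// The RGB565 packing in step 3b above is equivalent to the following
// per-pixel scalar expression (illustrative only; PackRGB565Reference is not
// part of libyuv), where b, g and r are the 8 bit channels from YUVTORGB.
#if 0  // Reference sketch; not compiled.
static uint16 PackRGB565Reference(uint8 b, uint8 g, uint8 r) {
  // 5 bits of blue at bit 0, 6 bits of green at bit 5, 5 bits of red at bit 11.
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}
#endif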
2486
2487 // 8 pixels, dest aligned 16.
2488 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2489 __declspec(naked) __declspec(align(16))
2490 void I422ToARGBRow_SSSE3(const uint8* y_buf,
2491 const uint8* u_buf,
2492 const uint8* v_buf,
2493 uint8* dst_argb,
2494 int width) {
2495 __asm {
2496 push esi
2497 push edi
2498 mov eax, [esp + 8 + 4] // Y
2499 mov esi, [esp + 8 + 8] // U
2500 mov edi, [esp + 8 + 12] // V
2501 mov edx, [esp + 8 + 16] // argb
2502 mov ecx, [esp + 8 + 20] // width
2503 sub edi, esi
2504 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2505 pxor xmm4, xmm4
2506
2507 align 4
2508 convertloop:
2509 READYUV422
2510 YUVTORGB
2511
2512 // Step 3: Weave into ARGB
2513 punpcklbw xmm0, xmm1 // BG
2514 punpcklbw xmm2, xmm5 // RA
2515 movdqa xmm1, xmm0
2516 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2517 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2518 movdqa [edx], xmm0
2519 movdqa [edx + 16], xmm1
2520 lea edx, [edx + 32]
2521 sub ecx, 8
2522 jg convertloop
2523
2524 pop edi
2525 pop esi
2526 ret
2527 }
2528 }
2529
2530 // 8 pixels, dest aligned 16.
2531 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2532 // Similar to I420 but duplicate UV once more.
2533 __declspec(naked) __declspec(align(16))
2534 void I411ToARGBRow_SSSE3(const uint8* y_buf,
2535 const uint8* u_buf,
2536 const uint8* v_buf,
2537 uint8* dst_argb,
2538 int width) {
2539 __asm {
2540 push ebx
2541 push esi
2542 push edi
2543 mov eax, [esp + 12 + 4] // Y
2544 mov esi, [esp + 12 + 8] // U
2545 mov edi, [esp + 12 + 12] // V
2546 mov edx, [esp + 12 + 16] // argb
2547 mov ecx, [esp + 12 + 20] // width
2548 sub edi, esi
2549 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2550 pxor xmm4, xmm4
2551
2552 align 4
2553 convertloop:
2554 READYUV411 // modifies EBX
2555 YUVTORGB
2556
2557 // Step 3: Weave into ARGB
2558 punpcklbw xmm0, xmm1 // BG
2559 punpcklbw xmm2, xmm5 // RA
2560 movdqa xmm1, xmm0
2561 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2562 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2563 movdqa [edx], xmm0
2564 movdqa [edx + 16], xmm1
2565 lea edx, [edx + 32]
2566 sub ecx, 8
2567 jg convertloop
2568
2569 pop edi
2570 pop esi
2571 pop ebx
2572 ret
2573 }
2574 }
2575
2576 // 8 pixels, dest aligned 16.
2577 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2578 __declspec(naked) __declspec(align(16))
2579 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
2580 const uint8* uv_buf,
2581 uint8* dst_argb,
2582 int width) {
2583 __asm {
2584 push esi
2585 mov eax, [esp + 4 + 4] // Y
2586 mov esi, [esp + 4 + 8] // UV
2587 mov edx, [esp + 4 + 12] // argb
2588 mov ecx, [esp + 4 + 16] // width
2589 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2590 pxor xmm4, xmm4
2591
2592 align 4
2593 convertloop:
2594 READNV12
2595 YUVTORGB
2596
2597 // Step 3: Weave into ARGB
2598 punpcklbw xmm0, xmm1 // BG
2599 punpcklbw xmm2, xmm5 // RA
2600 movdqa xmm1, xmm0
2601 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2602 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2603 movdqa [edx], xmm0
2604 movdqa [edx + 16], xmm1
2605 lea edx, [edx + 32]
2606 sub ecx, 8
2607 jg convertloop
2608
2609 pop esi
2610 ret
2611 }
2612 }
2613
2614 // 8 pixels, dest aligned 16.
2615 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2616 __declspec(naked) __declspec(align(16))
2617 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
2618 const uint8* uv_buf,
2619 uint8* dst_argb,
2620 int width) {
2621 __asm {
2622 push esi
2623 mov eax, [esp + 4 + 4] // Y
2624 mov esi, [esp + 4 + 8] // VU
2625 mov edx, [esp + 4 + 12] // argb
2626 mov ecx, [esp + 4 + 16] // width
2627 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2628 pxor xmm4, xmm4
2629
2630 align 4
2631 convertloop:
2632 READNV12
2633 YVUTORGB
2634
2635 // Step 3: Weave into ARGB
2636 punpcklbw xmm0, xmm1 // BG
2637 punpcklbw xmm2, xmm5 // RA
2638 movdqa xmm1, xmm0
2639 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2640 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2641 movdqa [edx], xmm0
2642 movdqa [edx + 16], xmm1
2643 lea edx, [edx + 32]
2644 sub ecx, 8
2645 jg convertloop
2646
2647 pop esi
2648 ret
2649 }
2650 }
2651
2652 // 8 pixels, unaligned.
2653 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2654 __declspec(naked) __declspec(align(16))
2655 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2656 const uint8* u_buf,
2657 const uint8* v_buf,
2658 uint8* dst_argb,
2659 int width) {
2660 __asm {
2661 push esi
2662 push edi
2663 mov eax, [esp + 8 + 4] // Y
2664 mov esi, [esp + 8 + 8] // U
2665 mov edi, [esp + 8 + 12] // V
2666 mov edx, [esp + 8 + 16] // argb
2667 mov ecx, [esp + 8 + 20] // width
2668 sub edi, esi
2669 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2670 pxor xmm4, xmm4
2671
2672 align 4
2673 convertloop:
2674 READYUV444
2675 YUVTORGB
2676
2677 // Step 3: Weave into ARGB
2678 punpcklbw xmm0, xmm1 // BG
2679 punpcklbw xmm2, xmm5 // RA
2680 movdqa xmm1, xmm0
2681 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2682 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2683 movdqu [edx], xmm0
2684 movdqu [edx + 16], xmm1
2685 lea edx, [edx + 32]
2686 sub ecx, 8
2687 jg convertloop
2688
2689 pop edi
2690 pop esi
2691 ret
2692 }
2693 }
2694
2695 // 8 pixels, unaligned.
2696 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2697 __declspec(naked) __declspec(align(16))
2698 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2699 const uint8* u_buf,
2700 const uint8* v_buf,
2701 uint8* dst_argb,
2702 int width) {
2703 __asm {
2704 push esi
2705 push edi
2706 mov eax, [esp + 8 + 4] // Y
2707 mov esi, [esp + 8 + 8] // U
2708 mov edi, [esp + 8 + 12] // V
2709 mov edx, [esp + 8 + 16] // argb
2710 mov ecx, [esp + 8 + 20] // width
2711 sub edi, esi
2712 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2713 pxor xmm4, xmm4
2714
2715 align 4
2716 convertloop:
2717 READYUV422
2718 YUVTORGB
2719
2720 // Step 3: Weave into ARGB
2721 punpcklbw xmm0, xmm1 // BG
2722 punpcklbw xmm2, xmm5 // RA
2723 movdqa xmm1, xmm0
2724 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2725 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2726 movdqu [edx], xmm0
2727 movdqu [edx + 16], xmm1
2728 lea edx, [edx + 32]
2729 sub ecx, 8
2730 jg convertloop
2731
2732 pop edi
2733 pop esi
2734 ret
2735 }
2736 }
2737
2738 // 8 pixels, unaligned.
2739 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2740 // Similar to I420 but duplicate UV once more.
2741 __declspec(naked) __declspec(align(16))
2742 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2743 const uint8* u_buf,
2744 const uint8* v_buf,
2745 uint8* dst_argb,
2746 int width) {
2747 __asm {
2748 push ebx
2749 push esi
2750 push edi
2751 mov eax, [esp + 12 + 4] // Y
2752 mov esi, [esp + 12 + 8] // U
2753 mov edi, [esp + 12 + 12] // V
2754 mov edx, [esp + 12 + 16] // argb
2755 mov ecx, [esp + 12 + 20] // width
2756 sub edi, esi
2757 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2758 pxor xmm4, xmm4
2759
2760 align 4
2761 convertloop:
2762 READYUV411 // modifies EBX
2763 YUVTORGB
2764
2765 // Step 3: Weave into ARGB
2766 punpcklbw xmm0, xmm1 // BG
2767 punpcklbw xmm2, xmm5 // RA
2768 movdqa xmm1, xmm0
2769 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2770 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2771 movdqu [edx], xmm0
2772 movdqu [edx + 16], xmm1
2773 lea edx, [edx + 32]
2774 sub ecx, 8
2775 jg convertloop
2776
2777 pop edi
2778 pop esi
2779 pop ebx
2780 ret
2781 }
2782 }
2783
2784 // 8 pixels, unaligned.
2785 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2786 __declspec(naked) __declspec(align(16))
2787 void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2788 const uint8* uv_buf,
2789 uint8* dst_argb,
2790 int width) {
2791 __asm {
2792 push esi
2793 mov eax, [esp + 4 + 4] // Y
2794 mov esi, [esp + 4 + 8] // UV
2795 mov edx, [esp + 4 + 12] // argb
2796 mov ecx, [esp + 4 + 16] // width
2797 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2798 pxor xmm4, xmm4
2799
2800 align 4
2801 convertloop:
2802 READNV12
2803 YUVTORGB
2804
2805 // Step 3: Weave into ARGB
2806 punpcklbw xmm0, xmm1 // BG
2807 punpcklbw xmm2, xmm5 // RA
2808 movdqa xmm1, xmm0
2809 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2810 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2811 movdqu [edx], xmm0
2812 movdqu [edx + 16], xmm1
2813 lea edx, [edx + 32]
2814 sub ecx, 8
2815 jg convertloop
2816
2817 pop esi
2818 ret
2819 }
2820 }
2821
2822 // 8 pixels, unaligned.
2823 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2824 __declspec(naked) __declspec(align(16))
2825 void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2826 const uint8* uv_buf,
2827 uint8* dst_argb,
2828 int width) {
2829 __asm {
2830 push esi
2831 mov eax, [esp + 4 + 4] // Y
2832 mov esi, [esp + 4 + 8] // VU
2833 mov edx, [esp + 4 + 12] // argb
2834 mov ecx, [esp + 4 + 16] // width
2835 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2836 pxor xmm4, xmm4
2837
2838 align 4
2839 convertloop:
2840 READNV12
2841 YVUTORGB
2842
2843 // Step 3: Weave into ARGB
2844 punpcklbw xmm0, xmm1 // BG
2845 punpcklbw xmm2, xmm5 // RA
2846 movdqa xmm1, xmm0
2847 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2848 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2849 movdqu [edx], xmm0
2850 movdqu [edx + 16], xmm1
2851 lea edx, [edx + 32]
2852 sub ecx, 8
2853 jg convertloop
2854
2855 pop esi
2856 ret
2857 }
2858 }
2859
2860 __declspec(naked) __declspec(align(16))
2861 void I422ToBGRARow_SSSE3(const uint8* y_buf,
2862 const uint8* u_buf,
2863 const uint8* v_buf,
2864 uint8* dst_bgra,
2865 int width) {
2866 __asm {
2867 push esi
2868 push edi
2869 mov eax, [esp + 8 + 4] // Y
2870 mov esi, [esp + 8 + 8] // U
2871 mov edi, [esp + 8 + 12] // V
2872 mov edx, [esp + 8 + 16] // bgra
2873 mov ecx, [esp + 8 + 20] // width
2874 sub edi, esi
2875 pxor xmm4, xmm4
2876
2877 align 4
2878 convertloop:
2879 READYUV422
2880 YUVTORGB
2881
2882 // Step 3: Weave into BGRA
2883 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2884 punpcklbw xmm1, xmm0 // GB
2885 punpcklbw xmm5, xmm2 // AR
2886 movdqa xmm0, xmm5
2887 punpcklwd xmm5, xmm1 // BGRA first 4 pixels
2888 punpckhwd xmm0, xmm1 // BGRA next 4 pixels
2889 movdqa [edx], xmm5
2890 movdqa [edx + 16], xmm0
2891 lea edx, [edx + 32]
2892 sub ecx, 8
2893 jg convertloop
2894
2895 pop edi
2896 pop esi
2897 ret
2898 }
2899 }
2900
2901 __declspec(naked) __declspec(align(16))
2902 void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
2903 const uint8* u_buf,
2904 const uint8* v_buf,
2905 uint8* dst_bgra,
2906 int width) {
2907 __asm {
2908 push esi
2909 push edi
2910 mov eax, [esp + 8 + 4] // Y
2911 mov esi, [esp + 8 + 8] // U
2912 mov edi, [esp + 8 + 12] // V
2913 mov edx, [esp + 8 + 16] // bgra
2914 mov ecx, [esp + 8 + 20] // width
2915 sub edi, esi
2916 pxor xmm4, xmm4
2917
2918 align 4
2919 convertloop:
2920 READYUV422
2921 YUVTORGB
2922
2923 // Step 3: Weave into BGRA
2924 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2925 punpcklbw xmm1, xmm0 // GB
2926 punpcklbw xmm5, xmm2 // AR
2927 movdqa xmm0, xmm5
2928 punpcklwd xmm5, xmm1 // BGRA first 4 pixels
2929 punpckhwd xmm0, xmm1 // BGRA next 4 pixels
2930 movdqu [edx], xmm5
2931 movdqu [edx + 16], xmm0
2932 lea edx, [edx + 32]
2933 sub ecx, 8
2934 jg convertloop
2935
2936 pop edi
2937 pop esi
2938 ret
2939 }
2940 }
2941
2942 __declspec(naked) __declspec(align(16))
2943 void I422ToABGRRow_SSSE3(const uint8* y_buf,
2944 const uint8* u_buf,
2945 const uint8* v_buf,
2946 uint8* dst_abgr,
2947 int width) {
2948 __asm {
2949 push esi
2950 push edi
2951 mov eax, [esp + 8 + 4] // Y
2952 mov esi, [esp + 8 + 8] // U
2953 mov edi, [esp + 8 + 12] // V
2954 mov edx, [esp + 8 + 16] // abgr
2955 mov ecx, [esp + 8 + 20] // width
2956 sub edi, esi
2957 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2958 pxor xmm4, xmm4
2959
2960 align 4
2961 convertloop:
2962 READYUV422
2963 YUVTORGB
2964
2965 // Step 3: Weave into ARGB
2966 punpcklbw xmm2, xmm1 // RG
2967 punpcklbw xmm0, xmm5 // BA
2968 movdqa xmm1, xmm2
2969 punpcklwd xmm2, xmm0 // RGBA first 4 pixels
2970 punpckhwd xmm1, xmm0 // RGBA next 4 pixels
2971 movdqa [edx], xmm2
2972 movdqa [edx + 16], xmm1
2973 lea edx, [edx + 32]
2974 sub ecx, 8
2975 jg convertloop
2976
2977 pop edi
2978 pop esi
2979 ret
2980 }
2981 }
2982
2983 __declspec(naked) __declspec(align(16))
2984 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
2985 const uint8* u_buf,
2986 const uint8* v_buf,
2987 uint8* dst_abgr,
2988 int width) {
2989 __asm {
2990 push esi
2991 push edi
2992 mov eax, [esp + 8 + 4] // Y
2993 mov esi, [esp + 8 + 8] // U
2994 mov edi, [esp + 8 + 12] // V
2995 mov edx, [esp + 8 + 16] // abgr
2996 mov ecx, [esp + 8 + 20] // width
2997 sub edi, esi
2998 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2999 pxor xmm4, xmm4
3000
3001 align 4
3002 convertloop:
3003 READYUV422
3004 YUVTORGB
3005
3006 // Step 3: Weave into ARGB
3007 punpcklbw xmm2, xmm1 // RG
3008 punpcklbw xmm0, xmm5 // BA
3009 movdqa xmm1, xmm2
3010 punpcklwd xmm2, xmm0 // RGBA first 4 pixels
3011 punpckhwd xmm1, xmm0 // RGBA next 4 pixels
3012 movdqu [edx], xmm2
3013 movdqu [edx + 16], xmm1
3014 lea edx, [edx + 32]
3015 sub ecx, 8
3016 jg convertloop
3017
3018 pop edi
3019 pop esi
3020 ret
3021 }
3022 }
3023
3024 __declspec(naked) __declspec(align(16))
3025 void I422ToRGBARow_SSSE3(const uint8* y_buf,
3026 const uint8* u_buf,
3027 const uint8* v_buf,
3028 uint8* dst_rgba,
3029 int width) {
3030 __asm {
3031 push esi
3032 push edi
3033 mov eax, [esp + 8 + 4] // Y
3034 mov esi, [esp + 8 + 8] // U
3035 mov edi, [esp + 8 + 12] // V
3036 mov edx, [esp + 8 + 16] // rgba
3037 mov ecx, [esp + 8 + 20] // width
3038 sub edi, esi
3039 pxor xmm4, xmm4
3040
3041 align 4
3042 convertloop:
3043 READYUV422
3044 YUVTORGB
3045
3046 // Step 3: Weave into RGBA
3047 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3048 punpcklbw xmm1, xmm2 // GR
3049 punpcklbw xmm5, xmm0 // AB
3050 movdqa xmm0, xmm5
3051 punpcklwd xmm5, xmm1 // RGBA first 4 pixels
3052 punpckhwd xmm0, xmm1 // RGBA next 4 pixels
3053 movdqa [edx], xmm5
3054 movdqa [edx + 16], xmm0
3055 lea edx, [edx + 32]
3056 sub ecx, 8
3057 jg convertloop
3058
3059 pop edi
3060 pop esi
3061 ret
3062 }
3063 }
3064
3065 __declspec(naked) __declspec(align(16))
3066 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
3067 const uint8* u_buf,
3068 const uint8* v_buf,
3069 uint8* dst_rgba,
3070 int width) {
3071 __asm {
3072 push esi
3073 push edi
3074 mov eax, [esp + 8 + 4] // Y
3075 mov esi, [esp + 8 + 8] // U
3076 mov edi, [esp + 8 + 12] // V
3077 mov edx, [esp + 8 + 16] // rgba
3078 mov ecx, [esp + 8 + 20] // width
3079 sub edi, esi
3080 pxor xmm4, xmm4
3081
3082 align 4
3083 convertloop:
3084 READYUV422
3085 YUVTORGB
3086
3087 // Step 3: Weave into RGBA
3088 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3089 punpcklbw xmm1, xmm2 // GR
3090 punpcklbw xmm5, xmm0 // AB
3091 movdqa xmm0, xmm5
3092 punpcklwd xmm5, xmm1 // RGBA first 4 pixels
3093 punpckhwd xmm0, xmm1 // RGBA next 4 pixels
3094 movdqu [edx], xmm5
3095 movdqu [edx + 16], xmm0
3096 lea edx, [edx + 32]
3097 sub ecx, 8
3098 jg convertloop
3099
3100 pop edi
3101 pop esi
3102 ret
3103 }
3104 }
3105
3106 #endif // HAS_I422TOARGBROW_SSSE3
3107
3108 #ifdef HAS_YTOARGBROW_SSE2
3109 __declspec(naked) __declspec(align(16))
3110 void YToARGBRow_SSE2(const uint8* y_buf,
3111 uint8* rgb_buf,
3112 int width) {
3113 __asm {
3114 pxor xmm5, xmm5
3115 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
3116 pslld xmm4, 24
3117 mov eax, 0x00100010
3118 movd xmm3, eax
3119 pshufd xmm3, xmm3, 0
3120 mov eax, 0x004a004a // 74
3121 movd xmm2, eax
3122     pshufd     xmm2, xmm2, 0
3123 mov eax, [esp + 4] // Y
3124 mov edx, [esp + 8] // rgb
3125 mov ecx, [esp + 12] // width
3126
3127 align 4
3128 convertloop:
3129 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
3130 movq xmm0, qword ptr [eax]
3131 lea eax, [eax + 8]
3132 punpcklbw xmm0, xmm5 // 0.Y
3133 psubusw xmm0, xmm3
3134 pmullw xmm0, xmm2
3135 psrlw xmm0, 6
3136 packuswb xmm0, xmm0 // G
3137
3138 // Step 2: Weave into ARGB
3139 punpcklbw xmm0, xmm0 // GG
3140 movdqa xmm1, xmm0
3141 punpcklwd xmm0, xmm0 // BGRA first 4 pixels
3142 punpckhwd xmm1, xmm1 // BGRA next 4 pixels
3143 por xmm0, xmm4
3144 por xmm1, xmm4
3145 movdqa [edx], xmm0
3146 movdqa [edx + 16], xmm1
3147 lea edx, [edx + 32]
3148 sub ecx, 8
3149 jg convertloop
3150
3151 ret
3152 }
3153 }
3154 #endif // HAS_YTOARGBROW_SSE2
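
// YToARGBRow_SSE2 above expands luma only: each Y is scaled by 1.164 in 6 bit
// fixed point, clamped, and replicated into B, G and R with opaque alpha.
// A scalar sketch of the same behaviour (illustrative only; the name
// YToARGBRowReference is not part of libyuv):
#if 0  // Reference sketch; not compiled.
static void YToARGBRowReference(const uint8* y_buf, uint8* rgb_buf, int width) {
  for (int x = 0; x < width; ++x) {
    int gray = y_buf[x] - 16;
    if (gray < 0) gray = 0;        // psubusw saturates at zero.
    gray = (gray * 74) >> 6;       // * 1.164 in 6 bit fixed point.
    if (gray > 255) gray = 255;    // packuswb clamps to 255.
    rgb_buf[0] = rgb_buf[1] = rgb_buf[2] = (uint8)gray;
    rgb_buf[3] = 255;              // alpha
    rgb_buf += 4;
  }
}
#endif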
3155
3156 #ifdef HAS_MIRRORROW_SSSE3
3157 // Shuffle table for reversing the bytes.
3158 static const uvec8 kShuffleMirror = {
3159 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3160 };
3161
3162 __declspec(naked) __declspec(align(16))
3163 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3164 __asm {
3165 mov eax, [esp + 4] // src
3166 mov edx, [esp + 8] // dst
3167 mov ecx, [esp + 12] // width
3168 movdqa xmm5, kShuffleMirror
3169 lea eax, [eax - 16]
3170
3171 align 4
3172 convertloop:
3173 movdqa xmm0, [eax + ecx]
3174 pshufb xmm0, xmm5
3175 sub ecx, 16
3176 movdqa [edx], xmm0
3177 lea edx, [edx + 16]
3178 jg convertloop
3179 ret
3180 }
3181 }
3182 #endif // HAS_MIRRORROW_SSSE3
3183
3184 #ifdef HAS_MIRRORROW_AVX2
3185 // Shuffle table for reversing the bytes.
3186 static const ulvec8 kShuffleMirror_AVX2 = {
3187 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
3188 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3189 };
3190
3191 __declspec(naked) __declspec(align(16))
3192 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3193 __asm {
3194 mov eax, [esp + 4] // src
3195 mov edx, [esp + 8] // dst
3196 mov ecx, [esp + 12] // width
3197 vmovdqa ymm5, kShuffleMirror_AVX2
3198 lea eax, [eax - 32]
3199
3200 align 4
3201 convertloop:
3202 vmovdqu ymm0, [eax + ecx]
3203 vpshufb ymm0, ymm0, ymm5
3204     vpermq     ymm0, ymm0, 0x4e  // swap high and low halves
3205 sub ecx, 32
3206 vmovdqu [edx], ymm0
3207 lea edx, [edx + 32]
3208 jg convertloop
3209 vzeroupper
3210 ret
3211 }
3212 }
3213 #endif // HAS_MIRRORROW_AVX2
3214
3215 #ifdef HAS_MIRRORROW_SSE2
3216 // The SSE2 version uses movdqu, so it can be used on unaligned buffers when the
3217 // SSSE3 version cannot.
3218 __declspec(naked) __declspec(align(16))
3219 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3220 __asm {
3221 mov eax, [esp + 4] // src
3222 mov edx, [esp + 8] // dst
3223 mov ecx, [esp + 12] // width
3224 lea eax, [eax - 16]
3225
3226 align 4
3227 convertloop:
3228 movdqu xmm0, [eax + ecx]
3229 movdqa xmm1, xmm0 // swap bytes
3230 psllw xmm0, 8
3231 psrlw xmm1, 8
3232 por xmm0, xmm1
3233 pshuflw xmm0, xmm0, 0x1b // swap words
3234 pshufhw xmm0, xmm0, 0x1b
3235 pshufd xmm0, xmm0, 0x4e // swap qwords
3236 sub ecx, 16
3237 movdqu [edx], xmm0
3238 lea edx, [edx + 16]
3239 jg convertloop
3240 ret
3241 }
3242 }
3243 #endif // HAS_MIRRORROW_SSE2
3244
3245 #ifdef HAS_MIRRORROW_UV_SSSE3
3246 // Shuffle table for reversing the bytes of UV channels.
3247 static const uvec8 kShuffleMirrorUV = {
3248 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
3249 };
3250
3251 __declspec(naked) __declspec(align(16))
3252 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
3253 int width) {
3254 __asm {
3255 push edi
3256 mov eax, [esp + 4 + 4] // src
3257 mov edx, [esp + 4 + 8] // dst_u
3258 mov edi, [esp + 4 + 12] // dst_v
3259 mov ecx, [esp + 4 + 16] // width
3260 movdqa xmm1, kShuffleMirrorUV
3261 lea eax, [eax + ecx * 2 - 16]
3262 sub edi, edx
3263
3264 align 4
3265 convertloop:
3266 movdqa xmm0, [eax]
3267 lea eax, [eax - 16]
3268 pshufb xmm0, xmm1
3269 sub ecx, 8
3270 movlpd qword ptr [edx], xmm0
3271 movhpd qword ptr [edx + edi], xmm0
3272 lea edx, [edx + 8]
3273 jg convertloop
3274
3275 pop edi
3276 ret
3277 }
3278 }
3279 #endif // HAS_MIRRORROW_UV_SSSE3
3280
3281 #ifdef HAS_ARGBMIRRORROW_SSSE3
3282 // Shuffle table for reversing the bytes.
3283 static const uvec8 kARGBShuffleMirror = {
3284 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
3285 };
3286
3287 __declspec(naked) __declspec(align(16))
3288 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3289 __asm {
3290 mov eax, [esp + 4] // src
3291 mov edx, [esp + 8] // dst
3292 mov ecx, [esp + 12] // width
3293 lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
3294 movdqa xmm5, kARGBShuffleMirror
3295
3296 align 4
3297 convertloop:
3298 movdqa xmm0, [eax]
3299 lea eax, [eax - 16]
3300 pshufb xmm0, xmm5
3301 sub ecx, 4
3302 movdqa [edx], xmm0
3303 lea edx, [edx + 16]
3304 jg convertloop
3305 ret
3306 }
3307 }
3308 #endif // HAS_ARGBMIRRORROW_SSSE3
3309
3310 #ifdef HAS_ARGBMIRRORROW_AVX2
3311 // Shuffle table for reversing the bytes.
3312 static const ulvec32 kARGBShuffleMirror_AVX2 = {
3313 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3314 };
3315
3316 __declspec(naked) __declspec(align(16))
3317 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3318 __asm {
3319 mov eax, [esp + 4] // src
3320 mov edx, [esp + 8] // dst
3321 mov ecx, [esp + 12] // width
3322 lea eax, [eax - 32]
3323 vmovdqa ymm5, kARGBShuffleMirror_AVX2
3324
3325 align 4
3326 convertloop:
3327 vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order
3328 sub ecx, 8
3329 vmovdqu [edx], ymm0
3330 lea edx, [edx + 32]
3331 jg convertloop
3332 vzeroupper
3333 ret
3334 }
3335 }
3336 #endif // HAS_ARGBMIRRORROW_AVX2
3337
3338 #ifdef HAS_SPLITUVROW_SSE2
3339 __declspec(naked) __declspec(align(16))
3340 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3341 __asm {
3342 push edi
3343 mov eax, [esp + 4 + 4] // src_uv
3344 mov edx, [esp + 4 + 8] // dst_u
3345 mov edi, [esp + 4 + 12] // dst_v
3346 mov ecx, [esp + 4 + 16] // pix
3347 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3348 psrlw xmm5, 8
3349 sub edi, edx
3350
3351 align 4
3352 convertloop:
3353 movdqa xmm0, [eax]
3354 movdqa xmm1, [eax + 16]
3355 lea eax, [eax + 32]
3356 movdqa xmm2, xmm0
3357 movdqa xmm3, xmm1
3358 pand xmm0, xmm5 // even bytes
3359 pand xmm1, xmm5
3360 packuswb xmm0, xmm1
3361 psrlw xmm2, 8 // odd bytes
3362 psrlw xmm3, 8
3363 packuswb xmm2, xmm3
3364 movdqa [edx], xmm0
3365 movdqa [edx + edi], xmm2
3366 lea edx, [edx + 16]
3367 sub ecx, 16
3368 jg convertloop
3369
3370 pop edi
3371 ret
3372 }
3373 }
3374
3375 __declspec(naked) __declspec(align(16))
3376 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
3377 int pix) {
3378 __asm {
3379 push edi
3380 mov eax, [esp + 4 + 4] // src_uv
3381 mov edx, [esp + 4 + 8] // dst_u
3382 mov edi, [esp + 4 + 12] // dst_v
3383 mov ecx, [esp + 4 + 16] // pix
3384 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3385 psrlw xmm5, 8
3386 sub edi, edx
3387
3388 align 4
3389 convertloop:
3390 movdqu xmm0, [eax]
3391 movdqu xmm1, [eax + 16]
3392 lea eax, [eax + 32]
3393 movdqa xmm2, xmm0
3394 movdqa xmm3, xmm1
3395 pand xmm0, xmm5 // even bytes
3396 pand xmm1, xmm5
3397 packuswb xmm0, xmm1
3398 psrlw xmm2, 8 // odd bytes
3399 psrlw xmm3, 8
3400 packuswb xmm2, xmm3
3401 movdqu [edx], xmm0
3402 movdqu [edx + edi], xmm2
3403 lea edx, [edx + 16]
3404 sub ecx, 16
3405 jg convertloop
3406
3407 pop edi
3408 ret
3409 }
3410 }
3411 #endif // HAS_SPLITUVROW_SSE2
3412
3413 #ifdef HAS_SPLITUVROW_AVX2
3414 __declspec(naked) __declspec(align(16))
3415 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3416 __asm {
3417 push edi
3418 mov eax, [esp + 4 + 4] // src_uv
3419 mov edx, [esp + 4 + 8] // dst_u
3420 mov edi, [esp + 4 + 12] // dst_v
3421 mov ecx, [esp + 4 + 16] // pix
3422 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3423 vpsrlw ymm5, ymm5, 8
3424 sub edi, edx
3425
3426 align 4
3427 convertloop:
3428 vmovdqu ymm0, [eax]
3429 vmovdqu ymm1, [eax + 32]
3430 lea eax, [eax + 64]
3431 vpsrlw ymm2, ymm0, 8 // odd bytes
3432 vpsrlw ymm3, ymm1, 8
3433 vpand ymm0, ymm0, ymm5 // even bytes
3434 vpand ymm1, ymm1, ymm5
3435 vpackuswb ymm0, ymm0, ymm1
3436 vpackuswb ymm2, ymm2, ymm3
3437 vpermq ymm0, ymm0, 0xd8
3438 vpermq ymm2, ymm2, 0xd8
3439 vmovdqu [edx], ymm0
3440 vmovdqu [edx + edi], ymm2
3441 lea edx, [edx + 32]
3442 sub ecx, 32
3443 jg convertloop
3444
3445 pop edi
3446 vzeroupper
3447 ret
3448 }
3449 }
3450 #endif // HAS_SPLITUVROW_AVX2
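
// SplitUVRow deinterleaves an NV12/NV21 style UV plane: even bytes go to the
// U plane and odd bytes to the V plane (MergeUVRow below is the inverse).
// A scalar sketch of the same behaviour (illustrative only; the name
// SplitUVRowReference is not part of libyuv):
#if 0  // Reference sketch; not compiled.
static void SplitUVRowReference(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                                int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[2 * x + 0];
    dst_v[x] = src_uv[2 * x + 1];
  }
}
#endif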
3451
3452 #ifdef HAS_MERGEUVROW_SSE2
3453 __declspec(naked) __declspec(align(16))
3454 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3455 int width) {
3456 __asm {
3457 push edi
3458 mov eax, [esp + 4 + 4] // src_u
3459 mov edx, [esp + 4 + 8] // src_v
3460 mov edi, [esp + 4 + 12] // dst_uv
3461 mov ecx, [esp + 4 + 16] // width
3462 sub edx, eax
3463
3464 align 4
3465 convertloop:
3466 movdqa xmm0, [eax] // read 16 U's
3467 movdqa xmm1, [eax + edx] // and 16 V's
3468 lea eax, [eax + 16]
3469 movdqa xmm2, xmm0
3470 punpcklbw xmm0, xmm1 // first 8 UV pairs
3471 punpckhbw xmm2, xmm1 // next 8 UV pairs
3472 movdqa [edi], xmm0
3473 movdqa [edi + 16], xmm2
3474 lea edi, [edi + 32]
3475 sub ecx, 16
3476 jg convertloop
3477
3478 pop edi
3479 ret
3480 }
3481 }
3482
3483 __declspec(naked) __declspec(align(16))
3484 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
3485 uint8* dst_uv, int width) {
3486 __asm {
3487 push edi
3488 mov eax, [esp + 4 + 4] // src_u
3489 mov edx, [esp + 4 + 8] // src_v
3490 mov edi, [esp + 4 + 12] // dst_uv
3491 mov ecx, [esp + 4 + 16] // width
3492 sub edx, eax
3493
3494 align 4
3495 convertloop:
3496 movdqu xmm0, [eax] // read 16 U's
3497 movdqu xmm1, [eax + edx] // and 16 V's
3498 lea eax, [eax + 16]
3499 movdqa xmm2, xmm0
3500 punpcklbw xmm0, xmm1 // first 8 UV pairs
3501 punpckhbw xmm2, xmm1 // next 8 UV pairs
3502 movdqu [edi], xmm0
3503 movdqu [edi + 16], xmm2
3504 lea edi, [edi + 32]
3505 sub ecx, 16
3506 jg convertloop
3507
3508 pop edi
3509 ret
3510 }
3511 }
3512 #endif // HAS_MERGEUVROW_SSE2
3513
3514 #ifdef HAS_MERGEUVROW_AVX2
3515 __declspec(naked) __declspec(align(16))
3516 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3517 int width) {
3518 __asm {
3519 push edi
3520 mov eax, [esp + 4 + 4] // src_u
3521 mov edx, [esp + 4 + 8] // src_v
3522 mov edi, [esp + 4 + 12] // dst_uv
3523 mov ecx, [esp + 4 + 16] // width
3524 sub edx, eax
3525
3526 align 4
3527 convertloop:
3528 vmovdqu ymm0, [eax] // read 32 U's
3529 vmovdqu ymm1, [eax + edx] // and 32 V's
3530 lea eax, [eax + 32]
3531 vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
3532 vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
3533 vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
3534 vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
3535 vmovdqu [edi], ymm1
3536 vmovdqu [edi + 32], ymm2
3537 lea edi, [edi + 64]
3538 sub ecx, 32
3539 jg convertloop
3540
3541 pop edi
3542 vzeroupper
3543 ret
3544 }
3545 }
3546 #endif // HAS_MERGEUVROW_AVX2
3547
3548 #ifdef HAS_COPYROW_SSE2
3549 // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
3550 __declspec(naked) __declspec(align(16))
3551 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
3552 __asm {
3553 mov eax, [esp + 4] // src
3554 mov edx, [esp + 8] // dst
3555 mov ecx, [esp + 12] // count
3556
3557 align 4
3558 convertloop:
3559 movdqa xmm0, [eax]
3560 movdqa xmm1, [eax + 16]
3561 lea eax, [eax + 32]
3562 movdqa [edx], xmm0
3563 movdqa [edx + 16], xmm1
3564 lea edx, [edx + 32]
3565 sub ecx, 32
3566 jg convertloop
3567 ret
3568 }
3569 }
3570 #endif // HAS_COPYROW_SSE2
3571
3572 // Unaligned Multiple of 1.
3573 __declspec(naked) __declspec(align(16))
3574 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
3575 __asm {
3576 mov eax, esi
3577 mov edx, edi
3578 mov esi, [esp + 4] // src
3579 mov edi, [esp + 8] // dst
3580 mov ecx, [esp + 12] // count
3581 rep movsb
3582 mov edi, edx
3583 mov esi, eax
3584 ret
3585 }
3586 }
3587
3588 #ifdef HAS_COPYROW_X86
3589 __declspec(naked) __declspec(align(16))
3590 void CopyRow_X86(const uint8* src, uint8* dst, int count) {
3591 __asm {
3592 mov eax, esi
3593 mov edx, edi
3594 mov esi, [esp + 4] // src
3595 mov edi, [esp + 8] // dst
3596 mov ecx, [esp + 12] // count
3597 shr ecx, 2
3598 rep movsd
3599 mov edi, edx
3600 mov esi, eax
3601 ret
3602 }
3603 }
3604 #endif // HAS_COPYROW_X86
3605
3606 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3607 // width in pixels
3608 __declspec(naked) __declspec(align(16))
3609 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3610 __asm {
3611 mov eax, [esp + 4] // src
3612 mov edx, [esp + 8] // dst
3613 mov ecx, [esp + 12] // count
3614 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3615 pslld xmm0, 24
3616 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3617 psrld xmm1, 8
3618
3619 align 4
3620 convertloop:
3621 movdqa xmm2, [eax]
3622 movdqa xmm3, [eax + 16]
3623 lea eax, [eax + 32]
3624 movdqa xmm4, [edx]
3625 movdqa xmm5, [edx + 16]
3626 pand xmm2, xmm0
3627 pand xmm3, xmm0
3628 pand xmm4, xmm1
3629 pand xmm5, xmm1
3630 por xmm2, xmm4
3631 por xmm3, xmm5
3632 movdqa [edx], xmm2
3633 movdqa [edx + 16], xmm3
3634 lea edx, [edx + 32]
3635 sub ecx, 8
3636 jg convertloop
3637
3638 ret
3639 }
3640 }
3641 #endif // HAS_ARGBCOPYALPHAROW_SSE2
3642
3643 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3644 // width in pixels
3645 __declspec(naked) __declspec(align(16))
3646 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3647 __asm {
3648 mov eax, [esp + 4] // src
3649 mov edx, [esp + 8] // dst
3650 mov ecx, [esp + 12] // count
3651 vpcmpeqb ymm0, ymm0, ymm0
3652 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3653
3654 align 4
3655 convertloop:
3656 vmovdqu ymm1, [eax]
3657 vmovdqu ymm2, [eax + 32]
3658 lea eax, [eax + 64]
3659 vpblendvb ymm1, ymm1, [edx], ymm0
3660 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3661 vmovdqu [edx], ymm1
3662 vmovdqu [edx + 32], ymm2
3663 lea edx, [edx + 64]
3664 sub ecx, 16
3665 jg convertloop
3666
3667 vzeroupper
3668 ret
3669 }
3670 }
3671 #endif // HAS_ARGBCOPYALPHAROW_AVX2
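
// ARGBCopyAlphaRow copies only the alpha channel from src to dst, leaving the
// destination's B, G and R bytes untouched. Per pixel this is a masked merge
// (illustrative scalar sketch only; ARGBCopyAlphaRowReference is not part of
// libyuv):
#if 0  // Reference sketch; not compiled.
static void ARGBCopyAlphaRowReference(const uint8* src, uint8* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x * 4 + 3] = src[x * 4 + 3];  // alpha byte only; B, G, R preserved.
  }
}
#endif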
3672
3673 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3674 // width in pixels
3675 __declspec(naked) __declspec(align(16))
3676 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3677 __asm {
3678 mov eax, [esp + 4] // src
3679 mov edx, [esp + 8] // dst
3680 mov ecx, [esp + 12] // count
3681 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3682 pslld xmm0, 24
3683 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3684 psrld xmm1, 8
3685
3686 align 4
3687 convertloop:
3688 movq xmm2, qword ptr [eax] // 8 Y's
3689 lea eax, [eax + 8]
3690 punpcklbw xmm2, xmm2
3691 punpckhwd xmm3, xmm2
3692 punpcklwd xmm2, xmm2
3693 movdqa xmm4, [edx]
3694 movdqa xmm5, [edx + 16]
3695 pand xmm2, xmm0
3696 pand xmm3, xmm0
3697 pand xmm4, xmm1
3698 pand xmm5, xmm1
3699 por xmm2, xmm4
3700 por xmm3, xmm5
3701 movdqa [edx], xmm2
3702 movdqa [edx + 16], xmm3
3703 lea edx, [edx + 32]
3704 sub ecx, 8
3705 jg convertloop
3706
3707 ret
3708 }
3709 }
3710 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3711
3712 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3713 // width in pixels
3714 __declspec(naked) __declspec(align(16))
3715 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3716 __asm {
3717 mov eax, [esp + 4] // src
3718 mov edx, [esp + 8] // dst
3719 mov ecx, [esp + 12] // count
3720 vpcmpeqb ymm0, ymm0, ymm0
3721 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3722
3723 align 4
3724 convertloop:
3725 vpmovzxbd ymm1, qword ptr [eax]
3726 vpmovzxbd ymm2, qword ptr [eax + 8]
3727 lea eax, [eax + 16]
3728 vpslld ymm1, ymm1, 24
3729 vpslld ymm2, ymm2, 24
3730 vpblendvb ymm1, ymm1, [edx], ymm0
3731 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3732 vmovdqu [edx], ymm1
3733 vmovdqu [edx + 32], ymm2
3734 lea edx, [edx + 64]
3735 sub ecx, 16
3736 jg convertloop
3737
3738 vzeroupper
3739 ret
3740 }
3741 }
3742 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3743
3744 #ifdef HAS_SETROW_X86
3745 // SetRow8 writes 'count' bytes using a 32 bit value repeated.
3746 __declspec(naked) __declspec(align(16))
3747 void SetRow_X86(uint8* dst, uint32 v32, int count) {
3748 __asm {
3749 mov edx, edi
3750 mov edi, [esp + 4] // dst
3751 mov eax, [esp + 8] // v32
3752 mov ecx, [esp + 12] // count
3753 shr ecx, 2
3754 rep stosd
3755 mov edi, edx
3756 ret
3757 }
3758 }
3759
3760 // ARGBSetRows writes 'width' pixels of a repeated 32 bit value for 'height' rows.
3761 __declspec(naked) __declspec(align(16))
3762 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
3763 int dst_stride, int height) {
3764 __asm {
3765 push esi
3766 push edi
3767 push ebp
3768 mov edi, [esp + 12 + 4] // dst
3769 mov eax, [esp + 12 + 8] // v32
3770 mov ebp, [esp + 12 + 12] // width
3771 mov edx, [esp + 12 + 16] // dst_stride
3772 mov esi, [esp + 12 + 20] // height
3773 lea ecx, [ebp * 4]
3774 sub edx, ecx // stride - width * 4
3775
3776 align 4
3777 convertloop:
3778 mov ecx, ebp
3779 rep stosd
3780 add edi, edx
3781 sub esi, 1
3782 jg convertloop
3783
3784 pop ebp
3785 pop edi
3786 pop esi
3787 ret
3788 }
3789 }
3790 #endif // HAS_SETROW_X86
3791
3792 #ifdef HAS_YUY2TOYROW_AVX2
3793 __declspec(naked) __declspec(align(16))
3794 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
3795 uint8* dst_y, int pix) {
3796 __asm {
3797 mov eax, [esp + 4] // src_yuy2
3798 mov edx, [esp + 8] // dst_y
3799 mov ecx, [esp + 12] // pix
3800 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3801 vpsrlw ymm5, ymm5, 8
3802
3803 align 4
3804 convertloop:
3805 vmovdqu ymm0, [eax]
3806 vmovdqu ymm1, [eax + 32]
3807 lea eax, [eax + 64]
3808 vpand ymm0, ymm0, ymm5 // even bytes are Y
3809 vpand ymm1, ymm1, ymm5
3810 vpackuswb ymm0, ymm0, ymm1 // mutates.
3811 vpermq ymm0, ymm0, 0xd8
3812 sub ecx, 32
3813 vmovdqu [edx], ymm0
3814 lea edx, [edx + 32]
3815 jg convertloop
3816 vzeroupper
3817 ret
3818 }
3819 }
3820
3821 __declspec(naked) __declspec(align(16))
3822 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3823 uint8* dst_u, uint8* dst_v, int pix) {
3824 __asm {
3825 push esi
3826 push edi
3827 mov eax, [esp + 8 + 4] // src_yuy2
3828 mov esi, [esp + 8 + 8] // stride_yuy2
3829 mov edx, [esp + 8 + 12] // dst_u
3830 mov edi, [esp + 8 + 16] // dst_v
3831 mov ecx, [esp + 8 + 20] // pix
3832 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3833 vpsrlw ymm5, ymm5, 8
3834 sub edi, edx
3835
3836 align 4
3837 convertloop:
3838 vmovdqu ymm0, [eax]
3839 vmovdqu ymm1, [eax + 32]
3840 vpavgb ymm0, ymm0, [eax + esi]
3841 vpavgb ymm1, ymm1, [eax + esi + 32]
3842 lea eax, [eax + 64]
3843 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3844 vpsrlw ymm1, ymm1, 8
3845 vpackuswb ymm0, ymm0, ymm1 // mutates.
3846 vpermq ymm0, ymm0, 0xd8
3847 vpand ymm1, ymm0, ymm5 // U
3848 vpsrlw ymm0, ymm0, 8 // V
3849 vpackuswb ymm1, ymm1, ymm1 // mutates.
3850 vpackuswb ymm0, ymm0, ymm0 // mutates.
3851 vpermq ymm1, ymm1, 0xd8
3852 vpermq ymm0, ymm0, 0xd8
3853 vextractf128 [edx], ymm1, 0 // U
3854 vextractf128 [edx + edi], ymm0, 0 // V
3855 lea edx, [edx + 16]
3856 sub ecx, 32
3857 jg convertloop
3858
3859 pop edi
3860 pop esi
3861 vzeroupper
3862 ret
3863 }
3864 }
3865
3866 __declspec(naked) __declspec(align(16))
3867 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3868 uint8* dst_u, uint8* dst_v, int pix) {
3869 __asm {
3870 push edi
3871 mov eax, [esp + 4 + 4] // src_yuy2
3872 mov edx, [esp + 4 + 8] // dst_u
3873 mov edi, [esp + 4 + 12] // dst_v
3874 mov ecx, [esp + 4 + 16] // pix
3875 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3876 vpsrlw ymm5, ymm5, 8
3877 sub edi, edx
3878
3879 align 4
3880 convertloop:
3881 vmovdqu ymm0, [eax]
3882 vmovdqu ymm1, [eax + 32]
3883 lea eax, [eax + 64]
3884 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3885 vpsrlw ymm1, ymm1, 8
3886 vpackuswb ymm0, ymm0, ymm1 // mutates.
3887 vpermq ymm0, ymm0, 0xd8
3888 vpand ymm1, ymm0, ymm5 // U
3889 vpsrlw ymm0, ymm0, 8 // V
3890 vpackuswb ymm1, ymm1, ymm1 // mutates.
3891 vpackuswb ymm0, ymm0, ymm0 // mutates.
3892 vpermq ymm1, ymm1, 0xd8
3893 vpermq ymm0, ymm0, 0xd8
3894 vextractf128 [edx], ymm1, 0 // U
3895 vextractf128 [edx + edi], ymm0, 0 // V
3896 lea edx, [edx + 16]
3897 sub ecx, 32
3898 jg convertloop
3899
3900 pop edi
3901 vzeroupper
3902 ret
3903 }
3904 }
3905
3906 __declspec(naked) __declspec(align(16))
3907 void UYVYToYRow_AVX2(const uint8* src_uyvy,
3908 uint8* dst_y, int pix) {
3909 __asm {
3910 mov eax, [esp + 4] // src_uyvy
3911 mov edx, [esp + 8] // dst_y
3912 mov ecx, [esp + 12] // pix
3913
3914 align 4
3915 convertloop:
3916 vmovdqu ymm0, [eax]
3917 vmovdqu ymm1, [eax + 32]
3918 lea eax, [eax + 64]
3919 vpsrlw ymm0, ymm0, 8 // odd bytes are Y
3920 vpsrlw ymm1, ymm1, 8
3921 vpackuswb ymm0, ymm0, ymm1 // mutates.
3922 vpermq ymm0, ymm0, 0xd8
3923 sub ecx, 32
3924 vmovdqu [edx], ymm0
3925 lea edx, [edx + 32]
3926 jg convertloop
3927     vzeroupper
3928     ret
3929 }
3930 }
3931
3932 __declspec(naked) __declspec(align(16))
3933 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3934 uint8* dst_u, uint8* dst_v, int pix) {
3935 __asm {
3936 push esi
3937 push edi
3938     mov        eax, [esp + 8 + 4]    // src_uyvy
3939     mov        esi, [esp + 8 + 8]    // stride_uyvy
3940 mov edx, [esp + 8 + 12] // dst_u
3941 mov edi, [esp + 8 + 16] // dst_v
3942 mov ecx, [esp + 8 + 20] // pix
3943 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3944 vpsrlw ymm5, ymm5, 8
3945 sub edi, edx
3946
3947 align 4
3948 convertloop:
3949 vmovdqu ymm0, [eax]
3950 vmovdqu ymm1, [eax + 32]
3951 vpavgb ymm0, ymm0, [eax + esi]
3952 vpavgb ymm1, ymm1, [eax + esi + 32]
3953 lea eax, [eax + 64]
3954 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3955 vpand ymm1, ymm1, ymm5
3956 vpackuswb ymm0, ymm0, ymm1 // mutates.
3957 vpermq ymm0, ymm0, 0xd8
3958 vpand ymm1, ymm0, ymm5 // U
3959 vpsrlw ymm0, ymm0, 8 // V
3960 vpackuswb ymm1, ymm1, ymm1 // mutates.
3961 vpackuswb ymm0, ymm0, ymm0 // mutates.
3962 vpermq ymm1, ymm1, 0xd8
3963 vpermq ymm0, ymm0, 0xd8
3964 vextractf128 [edx], ymm1, 0 // U
3965 vextractf128 [edx + edi], ymm0, 0 // V
3966 lea edx, [edx + 16]
3967 sub ecx, 32
3968 jg convertloop
3969
3970 pop edi
3971 pop esi
3972 vzeroupper
3973 ret
3974 }
3975 }
3976
3977 __declspec(naked) __declspec(align(16))
3978 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3979 uint8* dst_u, uint8* dst_v, int pix) {
3980 __asm {
3981 push edi
3982 mov eax, [esp + 4 + 4] // src_uyvy
3983 mov edx, [esp + 4 + 8] // dst_u
3984 mov edi, [esp + 4 + 12] // dst_v
3985 mov ecx, [esp + 4 + 16] // pix
3986 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3987 vpsrlw ymm5, ymm5, 8
3988 sub edi, edx
3989
3990 align 4
3991 convertloop:
3992 vmovdqu ymm0, [eax]
3993 vmovdqu ymm1, [eax + 32]
3994 lea eax, [eax + 64]
3995 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3996 vpand ymm1, ymm1, ymm5
3997 vpackuswb ymm0, ymm0, ymm1 // mutates.
3998 vpermq ymm0, ymm0, 0xd8
3999 vpand ymm1, ymm0, ymm5 // U
4000 vpsrlw ymm0, ymm0, 8 // V
4001 vpackuswb ymm1, ymm1, ymm1 // mutates.
4002 vpackuswb ymm0, ymm0, ymm0 // mutates.
4003 vpermq ymm1, ymm1, 0xd8
4004 vpermq ymm0, ymm0, 0xd8
4005 vextractf128 [edx], ymm1, 0 // U
4006 vextractf128 [edx + edi], ymm0, 0 // V
4007 lea edx, [edx + 16]
4008 sub ecx, 32
4009 jg convertloop
4010
4011 pop edi
4012 vzeroupper
4013 ret
4014 }
4015 }
4016 #endif // HAS_YUY2TOYROW_AVX2
4017
4018 #ifdef HAS_YUY2TOYROW_SSE2
4019 __declspec(naked) __declspec(align(16))
4020 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
4021 uint8* dst_y, int pix) {
4022 __asm {
4023 mov eax, [esp + 4] // src_yuy2
4024 mov edx, [esp + 8] // dst_y
4025 mov ecx, [esp + 12] // pix
4026 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4027 psrlw xmm5, 8
4028
4029 align 4
4030 convertloop:
4031 movdqa xmm0, [eax]
4032 movdqa xmm1, [eax + 16]
4033 lea eax, [eax + 32]
4034 pand xmm0, xmm5 // even bytes are Y
4035 pand xmm1, xmm5
4036 packuswb xmm0, xmm1
4037 sub ecx, 16
4038 movdqa [edx], xmm0
4039 lea edx, [edx + 16]
4040 jg convertloop
4041 ret
4042 }
4043 }
4044
4045 __declspec(naked) __declspec(align(16))
4046 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
4047 uint8* dst_u, uint8* dst_v, int pix) {
4048 __asm {
4049 push esi
4050 push edi
4051 mov eax, [esp + 8 + 4] // src_yuy2
4052 mov esi, [esp + 8 + 8] // stride_yuy2
4053 mov edx, [esp + 8 + 12] // dst_u
4054 mov edi, [esp + 8 + 16] // dst_v
4055 mov ecx, [esp + 8 + 20] // pix
4056 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4057 psrlw xmm5, 8
4058 sub edi, edx
4059
4060 align 4
4061 convertloop:
4062 movdqa xmm0, [eax]
4063 movdqa xmm1, [eax + 16]
4064 movdqa xmm2, [eax + esi]
4065 movdqa xmm3, [eax + esi + 16]
4066 lea eax, [eax + 32]
4067 pavgb xmm0, xmm2
4068 pavgb xmm1, xmm3
4069 psrlw xmm0, 8 // YUYV -> UVUV
4070 psrlw xmm1, 8
4071 packuswb xmm0, xmm1
4072 movdqa xmm1, xmm0
4073 pand xmm0, xmm5 // U
4074 packuswb xmm0, xmm0
4075 psrlw xmm1, 8 // V
4076 packuswb xmm1, xmm1
4077 movq qword ptr [edx], xmm0
4078 movq qword ptr [edx + edi], xmm1
4079 lea edx, [edx + 8]
4080 sub ecx, 16
4081 jg convertloop
4082
4083 pop edi
4084 pop esi
4085 ret
4086 }
4087 }
4088
4089 __declspec(naked) __declspec(align(16))
4090 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
4091 uint8* dst_u, uint8* dst_v, int pix) {
4092 __asm {
4093 push edi
4094 mov eax, [esp + 4 + 4] // src_yuy2
4095 mov edx, [esp + 4 + 8] // dst_u
4096 mov edi, [esp + 4 + 12] // dst_v
4097 mov ecx, [esp + 4 + 16] // pix
4098 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4099 psrlw xmm5, 8
4100 sub edi, edx
4101
4102 align 4
4103 convertloop:
4104 movdqa xmm0, [eax]
4105 movdqa xmm1, [eax + 16]
4106 lea eax, [eax + 32]
4107 psrlw xmm0, 8 // YUYV -> UVUV
4108 psrlw xmm1, 8
4109 packuswb xmm0, xmm1
4110 movdqa xmm1, xmm0
4111 pand xmm0, xmm5 // U
4112 packuswb xmm0, xmm0
4113 psrlw xmm1, 8 // V
4114 packuswb xmm1, xmm1
4115 movq qword ptr [edx], xmm0
4116 movq qword ptr [edx + edi], xmm1
4117 lea edx, [edx + 8]
4118 sub ecx, 16
4119 jg convertloop
4120
4121 pop edi
4122 ret
4123 }
4124 }
4125
4126 __declspec(naked) __declspec(align(16))
4127 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
4128 uint8* dst_y, int pix) {
4129 __asm {
4130 mov eax, [esp + 4] // src_yuy2
4131 mov edx, [esp + 8] // dst_y
4132 mov ecx, [esp + 12] // pix
4133 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4134 psrlw xmm5, 8
4135
4136 align 4
4137 convertloop:
4138 movdqu xmm0, [eax]
4139 movdqu xmm1, [eax + 16]
4140 lea eax, [eax + 32]
4141 pand xmm0, xmm5 // even bytes are Y
4142 pand xmm1, xmm5
4143 packuswb xmm0, xmm1
4144 sub ecx, 16
4145 movdqu [edx], xmm0
4146 lea edx, [edx + 16]
4147 jg convertloop
4148 ret
4149 }
4150 }
4151
4152 __declspec(naked) __declspec(align(16))
4153 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
4154 uint8* dst_u, uint8* dst_v, int pix) {
4155 __asm {
4156 push esi
4157 push edi
4158 mov eax, [esp + 8 + 4] // src_yuy2
4159 mov esi, [esp + 8 + 8] // stride_yuy2
4160 mov edx, [esp + 8 + 12] // dst_u
4161 mov edi, [esp + 8 + 16] // dst_v
4162 mov ecx, [esp + 8 + 20] // pix
4163 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4164 psrlw xmm5, 8
4165 sub edi, edx
4166
4167 align 4
4168 convertloop:
4169 movdqu xmm0, [eax]
4170 movdqu xmm1, [eax + 16]
4171 movdqu xmm2, [eax + esi]
4172 movdqu xmm3, [eax + esi + 16]
4173 lea eax, [eax + 32]
4174 pavgb xmm0, xmm2
4175 pavgb xmm1, xmm3
4176 psrlw xmm0, 8 // YUYV -> UVUV
4177 psrlw xmm1, 8
4178 packuswb xmm0, xmm1
4179 movdqa xmm1, xmm0
4180 pand xmm0, xmm5 // U
4181 packuswb xmm0, xmm0
4182 psrlw xmm1, 8 // V
4183 packuswb xmm1, xmm1
4184 movq qword ptr [edx], xmm0
4185 movq qword ptr [edx + edi], xmm1
4186 lea edx, [edx + 8]
4187 sub ecx, 16
4188 jg convertloop
4189
4190 pop edi
4191 pop esi
4192 ret
4193 }
4194 }
4195
4196 __declspec(naked) __declspec(align(16))
4197 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
4198 uint8* dst_u, uint8* dst_v, int pix) {
4199 __asm {
4200 push edi
4201 mov eax, [esp + 4 + 4] // src_yuy2
4202 mov edx, [esp + 4 + 8] // dst_u
4203 mov edi, [esp + 4 + 12] // dst_v
4204 mov ecx, [esp + 4 + 16] // pix
4205 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4206 psrlw xmm5, 8
4207 sub edi, edx
4208
4209 align 4
4210 convertloop:
4211 movdqu xmm0, [eax]
4212 movdqu xmm1, [eax + 16]
4213 lea eax, [eax + 32]
4214 psrlw xmm0, 8 // YUYV -> UVUV
4215 psrlw xmm1, 8
4216 packuswb xmm0, xmm1
4217 movdqa xmm1, xmm0
4218 pand xmm0, xmm5 // U
4219 packuswb xmm0, xmm0
4220 psrlw xmm1, 8 // V
4221 packuswb xmm1, xmm1
4222 movq qword ptr [edx], xmm0
4223 movq qword ptr [edx + edi], xmm1
4224 lea edx, [edx + 8]
4225 sub ecx, 16
4226 jg convertloop
4227
4228 pop edi
4229 ret
4230 }
4231 }
4232
4233 __declspec(naked) __declspec(align(16))
4234 void UYVYToYRow_SSE2(const uint8* src_uyvy,
4235 uint8* dst_y, int pix) {
4236 __asm {
4237 mov eax, [esp + 4] // src_uyvy
4238 mov edx, [esp + 8] // dst_y
4239 mov ecx, [esp + 12] // pix
4240
4241 align 4
4242 convertloop:
4243 movdqa xmm0, [eax]
4244 movdqa xmm1, [eax + 16]
4245 lea eax, [eax + 32]
4246 psrlw xmm0, 8 // odd bytes are Y
4247 psrlw xmm1, 8
4248 packuswb xmm0, xmm1
4249 sub ecx, 16
4250 movdqa [edx], xmm0
4251 lea edx, [edx + 16]
4252 jg convertloop
4253 ret
4254 }
4255 }
4256
4257 __declspec(naked) __declspec(align(16))
4258 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
4259 uint8* dst_u, uint8* dst_v, int pix) {
4260 __asm {
4261 push esi
4262 push edi
4263 mov eax, [esp + 8 + 4] // src_uyvy
4264 mov esi, [esp + 8 + 8] // stride_uyvy
4265 mov edx, [esp + 8 + 12] // dst_u
4266 mov edi, [esp + 8 + 16] // dst_v
4267 mov ecx, [esp + 8 + 20] // pix
4268 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4269 psrlw xmm5, 8
4270 sub edi, edx
4271
4272 align 4
4273 convertloop:
4274 movdqa xmm0, [eax]
4275 movdqa xmm1, [eax + 16]
4276 movdqa xmm2, [eax + esi]
4277 movdqa xmm3, [eax + esi + 16]
4278 lea eax, [eax + 32]
4279 pavgb xmm0, xmm2
4280 pavgb xmm1, xmm3
4281 pand xmm0, xmm5 // UYVY -> UVUV
4282 pand xmm1, xmm5
4283 packuswb xmm0, xmm1
4284 movdqa xmm1, xmm0
4285 pand xmm0, xmm5 // U
4286 packuswb xmm0, xmm0
4287 psrlw xmm1, 8 // V
4288 packuswb xmm1, xmm1
4289 movq qword ptr [edx], xmm0
4290 movq qword ptr [edx + edi], xmm1
4291 lea edx, [edx + 8]
4292 sub ecx, 16
4293 jg convertloop
4294
4295 pop edi
4296 pop esi
4297 ret
4298 }
4299 }
4300
4301 __declspec(naked) __declspec(align(16))
4302 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
4303 uint8* dst_u, uint8* dst_v, int pix) {
4304 __asm {
4305 push edi
4306 mov eax, [esp + 4 + 4] // src_uyvy
4307 mov edx, [esp + 4 + 8] // dst_u
4308 mov edi, [esp + 4 + 12] // dst_v
4309 mov ecx, [esp + 4 + 16] // pix
4310 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4311 psrlw xmm5, 8
4312 sub edi, edx
4313
4314 align 4
4315 convertloop:
4316 movdqa xmm0, [eax]
4317 movdqa xmm1, [eax + 16]
4318 lea eax, [eax + 32]
4319 pand xmm0, xmm5 // UYVY -> UVUV
4320 pand xmm1, xmm5
4321 packuswb xmm0, xmm1
4322 movdqa xmm1, xmm0
4323 pand xmm0, xmm5 // U
4324 packuswb xmm0, xmm0
4325 psrlw xmm1, 8 // V
4326 packuswb xmm1, xmm1
4327 movq qword ptr [edx], xmm0
4328 movq qword ptr [edx + edi], xmm1
4329 lea edx, [edx + 8]
4330 sub ecx, 16
4331 jg convertloop
4332
4333 pop edi
4334 ret
4335 }
4336 }
4337
4338 __declspec(naked) __declspec(align(16))
4339 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
4340 uint8* dst_y, int pix) {
4341 __asm {
4342 mov eax, [esp + 4] // src_uyvy
4343 mov edx, [esp + 8] // dst_y
4344 mov ecx, [esp + 12] // pix
4345
4346 align 4
4347 convertloop:
4348 movdqu xmm0, [eax]
4349 movdqu xmm1, [eax + 16]
4350 lea eax, [eax + 32]
4351 psrlw xmm0, 8 // odd bytes are Y
4352 psrlw xmm1, 8
4353 packuswb xmm0, xmm1
4354 sub ecx, 16
4355 movdqu [edx], xmm0
4356 lea edx, [edx + 16]
4357 jg convertloop
4358 ret
4359 }
4360 }
4361
4362 __declspec(naked) __declspec(align(16))
4363 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
4364 uint8* dst_u, uint8* dst_v, int pix) {
4365 __asm {
4366 push esi
4367 push edi
4368 mov eax, [esp + 8 + 4] // src_uyvy
4369 mov esi, [esp + 8 + 8] // stride_uyvy
4370 mov edx, [esp + 8 + 12] // dst_u
4371 mov edi, [esp + 8 + 16] // dst_v
4372 mov ecx, [esp + 8 + 20] // pix
4373 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4374 psrlw xmm5, 8
4375 sub edi, edx
4376
4377 align 4
4378 convertloop:
4379 movdqu xmm0, [eax]
4380 movdqu xmm1, [eax + 16]
4381 movdqu xmm2, [eax + esi]
4382 movdqu xmm3, [eax + esi + 16]
4383 lea eax, [eax + 32]
4384 pavgb xmm0, xmm2
4385 pavgb xmm1, xmm3
4386 pand xmm0, xmm5 // UYVY -> UVUV
4387 pand xmm1, xmm5
4388 packuswb xmm0, xmm1
4389 movdqa xmm1, xmm0
4390 pand xmm0, xmm5 // U
4391 packuswb xmm0, xmm0
4392 psrlw xmm1, 8 // V
4393 packuswb xmm1, xmm1
4394 movq qword ptr [edx], xmm0
4395 movq qword ptr [edx + edi], xmm1
4396 lea edx, [edx + 8]
4397 sub ecx, 16
4398 jg convertloop
4399
4400 pop edi
4401 pop esi
4402 ret
4403 }
4404 }
4405
4406 __declspec(naked) __declspec(align(16))
4407 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
4408 uint8* dst_u, uint8* dst_v, int pix) {
4409 __asm {
4410 push edi
4411 mov eax, [esp + 4 + 4] // src_uyvy
4412 mov edx, [esp + 4 + 8] // dst_u
4413 mov edi, [esp + 4 + 12] // dst_v
4414 mov ecx, [esp + 4 + 16] // pix
4415 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4416 psrlw xmm5, 8
4417 sub edi, edx
4418
4419 align 4
4420 convertloop:
4421 movdqu xmm0, [eax]
4422 movdqu xmm1, [eax + 16]
4423 lea eax, [eax + 32]
4424 pand xmm0, xmm5 // UYVY -> UVUV
4425 pand xmm1, xmm5
4426 packuswb xmm0, xmm1
4427 movdqa xmm1, xmm0
4428 pand xmm0, xmm5 // U
4429 packuswb xmm0, xmm0
4430 psrlw xmm1, 8 // V
4431 packuswb xmm1, xmm1
4432 movq qword ptr [edx], xmm0
4433 movq qword ptr [edx + edi], xmm1
4434 lea edx, [edx + 8]
4435 sub ecx, 16
4436 jg convertloop
4437
4438 pop edi
4439 ret
4440 }
4441 }
4442 #endif // HAS_YUY2TOYROW_SSE2
4443
4444 #ifdef HAS_ARGBBLENDROW_SSE2
4445 // Blend 4 pixels at a time.
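// For reference, a hedged scalar sketch of the per-pixel blend performed by
// the loop below (hypothetical helper, kept under #if 0 so it never compiles):
#if 0
static void ARGBBlendPixel_C(const uint8* src0, const uint8* src1, uint8* dst) {
  // src0 is the foreground, src1 the background; same fixed point rounding as
  // the SIMD code: background scaled by (256 - alpha) with a truncating shift.
  uint32 a = src0[3];
  uint32 b = src0[0] + ((src1[0] * (256 - a)) >> 8);
  uint32 g = src0[1] + ((src1[1] * (256 - a)) >> 8);
  uint32 r = src0[2] + ((src1[2] * (256 - a)) >> 8);
  dst[0] = (uint8)(b > 255 ? 255 : b);  // adds saturate like paddusb.
  dst[1] = (uint8)(g > 255 ? 255 : g);
  dst[2] = (uint8)(r > 255 ? 255 : r);
  dst[3] = 255;  // destination alpha is forced opaque.
}
#endif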
4446 __declspec(naked) __declspec(align(16))
4447 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4448 uint8* dst_argb, int width) {
4449 __asm {
4450 push esi
4451 mov eax, [esp + 4 + 4] // src_argb0
4452 mov esi, [esp + 4 + 8] // src_argb1
4453 mov edx, [esp + 4 + 12] // dst_argb
4454 mov ecx, [esp + 4 + 16] // width
4455 pcmpeqb xmm7, xmm7 // generate constant 1
4456 psrlw xmm7, 15
4457 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4458 psrlw xmm6, 8
4459 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4460 psllw xmm5, 8
4461 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4462 pslld xmm4, 24
4463
4464 sub ecx, 1
4465 je convertloop1 // only 1 pixel?
4466 jl convertloop1b
4467
4468 // 1 pixel loop until destination pointer is aligned.
4469 alignloop1:
4470 test edx, 15 // aligned?
4471 je alignloop1b
4472 movd xmm3, [eax]
4473 lea eax, [eax + 4]
4474 movdqa xmm0, xmm3 // src argb
4475 pxor xmm3, xmm4 // ~alpha
4476 movd xmm2, [esi] // _r_b
4477 psrlw xmm3, 8 // alpha
4478 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4479 pshuflw xmm3, xmm3, 0F5h
4480 pand xmm2, xmm6 // _r_b
4481 paddw xmm3, xmm7 // 256 - alpha
4482 pmullw xmm2, xmm3 // _r_b * alpha
4483 movd xmm1, [esi] // _a_g
4484 lea esi, [esi + 4]
4485 psrlw xmm1, 8 // _a_g
4486 por xmm0, xmm4 // set alpha to 255
4487 pmullw xmm1, xmm3 // _a_g * alpha
4488 psrlw xmm2, 8 // _r_b convert to 8 bits again
4489 paddusb xmm0, xmm2 // + src argb
4490 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4491 paddusb xmm0, xmm1 // + src argb
4492 sub ecx, 1
4493 movd [edx], xmm0
4494 lea edx, [edx + 4]
4495 jge alignloop1
4496
4497 alignloop1b:
4498 add ecx, 1 - 4
4499 jl convertloop4b
4500
4501 // 4 pixel loop.
4502 convertloop4:
4503 movdqu xmm3, [eax] // src argb
4504 lea eax, [eax + 16]
4505 movdqa xmm0, xmm3 // src argb
4506 pxor xmm3, xmm4 // ~alpha
4507 movdqu xmm2, [esi] // _r_b
4508 psrlw xmm3, 8 // alpha
4509 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4510 pshuflw xmm3, xmm3, 0F5h
4511 pand xmm2, xmm6 // _r_b
4512 paddw xmm3, xmm7 // 256 - alpha
4513 pmullw xmm2, xmm3 // _r_b * alpha
4514 movdqu xmm1, [esi] // _a_g
4515 lea esi, [esi + 16]
4516 psrlw xmm1, 8 // _a_g
4517 por xmm0, xmm4 // set alpha to 255
4518 pmullw xmm1, xmm3 // _a_g * alpha
4519 psrlw xmm2, 8 // _r_b convert to 8 bits again
4520 paddusb xmm0, xmm2 // + src argb
4521 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4522 paddusb xmm0, xmm1 // + src argb
4523 sub ecx, 4
4524 movdqa [edx], xmm0
4525 lea edx, [edx + 16]
4526 jge convertloop4
4527
4528 convertloop4b:
4529 add ecx, 4 - 1
4530 jl convertloop1b
4531
4532 // 1 pixel loop.
4533 convertloop1:
4534 movd xmm3, [eax] // src argb
4535 lea eax, [eax + 4]
4536 movdqa xmm0, xmm3 // src argb
4537 pxor xmm3, xmm4 // ~alpha
4538 movd xmm2, [esi] // _r_b
4539 psrlw xmm3, 8 // alpha
4540 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4541 pshuflw xmm3, xmm3, 0F5h
4542 pand xmm2, xmm6 // _r_b
4543 paddw xmm3, xmm7 // 256 - alpha
4544 pmullw xmm2, xmm3 // _r_b * alpha
4545 movd xmm1, [esi] // _a_g
4546 lea esi, [esi + 4]
4547 psrlw xmm1, 8 // _a_g
4548 por xmm0, xmm4 // set alpha to 255
4549 pmullw xmm1, xmm3 // _a_g * alpha
4550 psrlw xmm2, 8 // _r_b convert to 8 bits again
4551 paddusb xmm0, xmm2 // + src argb
4552 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4553 paddusb xmm0, xmm1 // + src argb
4554 sub ecx, 1
4555 movd [edx], xmm0
4556 lea edx, [edx + 4]
4557 jge convertloop1
4558
4559 convertloop1b:
4560 pop esi
4561 ret
4562 }
4563 }
4564 #endif // HAS_ARGBBLENDROW_SSE2
4565
4566 #ifdef HAS_ARGBBLENDROW_SSSE3
4567 // Shuffle table for isolating alpha.
4568 static const uvec8 kShuffleAlpha = {
4569 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4570 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
4571 };
4572 // Same as SSE2, but replaces:
4573 // psrlw xmm3, 8 // alpha
4574 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4575 // pshuflw xmm3, xmm3, 0F5h
4576 // with:
4577 // pshufb xmm3, kShuffleAlpha // alpha
4578 // Blend 4 pixels at a time.
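// A hedged sketch of what pshufb with kShuffleAlpha produces, assuming the
// usual BGRA byte order (illustrative only; not compiled):
#if 0
static void ShuffleAlpha_C(const uint8 argb[16], uint16 alpha_words[8]) {
  // Indices 3/7/11/15 pick each pixel's alpha byte; the 0x80 entries write
  // zero, so every 16-bit lane ends up holding that pixel's alpha value.
  for (int i = 0; i < 8; ++i) {
    alpha_words[i] = argb[(i / 2) * 4 + 3];
  }
}
#endif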
4579
4580 __declspec(naked) __declspec(align(16))
4581 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
4582 uint8* dst_argb, int width) {
4583 __asm {
4584 push esi
4585 mov eax, [esp + 4 + 4] // src_argb0
4586 mov esi, [esp + 4 + 8] // src_argb1
4587 mov edx, [esp + 4 + 12] // dst_argb
4588 mov ecx, [esp + 4 + 16] // width
4589 pcmpeqb xmm7, xmm7 // generate constant 0x0001
4590 psrlw xmm7, 15
4591 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4592 psrlw xmm6, 8
4593 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4594 psllw xmm5, 8
4595 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4596 pslld xmm4, 24
4597
4598 sub ecx, 1
4599 je convertloop1 // only 1 pixel?
4600 jl convertloop1b
4601
4602 // 1 pixel loop until destination pointer is aligned.
4603 alignloop1:
4604 test edx, 15 // aligned?
4605 je alignloop1b
4606 movd xmm3, [eax]
4607 lea eax, [eax + 4]
4608 movdqa xmm0, xmm3 // src argb
4609 pxor xmm3, xmm4 // ~alpha
4610 movd xmm2, [esi] // _r_b
4611 pshufb xmm3, kShuffleAlpha // alpha
4612 pand xmm2, xmm6 // _r_b
4613 paddw xmm3, xmm7 // 256 - alpha
4614 pmullw xmm2, xmm3 // _r_b * alpha
4615 movd xmm1, [esi] // _a_g
4616 lea esi, [esi + 4]
4617 psrlw xmm1, 8 // _a_g
4618 por xmm0, xmm4 // set alpha to 255
4619 pmullw xmm1, xmm3 // _a_g * alpha
4620 psrlw xmm2, 8 // _r_b convert to 8 bits again
4621 paddusb xmm0, xmm2 // + src argb
4622 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4623 paddusb xmm0, xmm1 // + src argb
4624 sub ecx, 1
4625 movd [edx], xmm0
4626 lea edx, [edx + 4]
4627 jge alignloop1
4628
4629 alignloop1b:
4630 add ecx, 1 - 4
4631 jl convertloop4b
4632
4633 test eax, 15 // unaligned?
4634 jne convertuloop4
4635 test esi, 15 // unaligned?
4636 jne convertuloop4
4637
4638 // 4 pixel loop.
4639 convertloop4:
4640 movdqa xmm3, [eax] // src argb
4641 lea eax, [eax + 16]
4642 movdqa xmm0, xmm3 // src argb
4643 pxor xmm3, xmm4 // ~alpha
4644 movdqa xmm2, [esi] // _r_b
4645 pshufb xmm3, kShuffleAlpha // alpha
4646 pand xmm2, xmm6 // _r_b
4647 paddw xmm3, xmm7 // 256 - alpha
4648 pmullw xmm2, xmm3 // _r_b * alpha
4649 movdqa xmm1, [esi] // _a_g
4650 lea esi, [esi + 16]
4651 psrlw xmm1, 8 // _a_g
4652 por xmm0, xmm4 // set alpha to 255
4653 pmullw xmm1, xmm3 // _a_g * alpha
4654 psrlw xmm2, 8 // _r_b convert to 8 bits again
4655 paddusb xmm0, xmm2 // + src argb
4656 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4657 paddusb xmm0, xmm1 // + src argb
4658 sub ecx, 4
4659 movdqa [edx], xmm0
4660 lea edx, [edx + 16]
4661 jge convertloop4
4662 jmp convertloop4b
4663
4664 // 4 pixel unaligned loop.
4665 convertuloop4:
4666 movdqu xmm3, [eax] // src argb
4667 lea eax, [eax + 16]
4668 movdqa xmm0, xmm3 // src argb
4669 pxor xmm3, xmm4 // ~alpha
4670 movdqu xmm2, [esi] // _r_b
4671 pshufb xmm3, kShuffleAlpha // alpha
4672 pand xmm2, xmm6 // _r_b
4673 paddw xmm3, xmm7 // 256 - alpha
4674 pmullw xmm2, xmm3 // _r_b * alpha
4675 movdqu xmm1, [esi] // _a_g
4676 lea esi, [esi + 16]
4677 psrlw xmm1, 8 // _a_g
4678 por xmm0, xmm4 // set alpha to 255
4679 pmullw xmm1, xmm3 // _a_g * alpha
4680 psrlw xmm2, 8 // _r_b convert to 8 bits again
4681 paddusb xmm0, xmm2 // + src argb
4682 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4683 paddusb xmm0, xmm1 // + src argb
4684 sub ecx, 4
4685 movdqa [edx], xmm0
4686 lea edx, [edx + 16]
4687 jge convertuloop4
4688
4689 convertloop4b:
4690 add ecx, 4 - 1
4691 jl convertloop1b
4692
4693 // 1 pixel loop.
4694 convertloop1:
4695 movd xmm3, [eax] // src argb
4696 lea eax, [eax + 4]
4697 movdqa xmm0, xmm3 // src argb
4698 pxor xmm3, xmm4 // ~alpha
4699 movd xmm2, [esi] // _r_b
4700 pshufb xmm3, kShuffleAlpha // alpha
4701 pand xmm2, xmm6 // _r_b
4702 paddw xmm3, xmm7 // 256 - alpha
4703 pmullw xmm2, xmm3 // _r_b * alpha
4704 movd xmm1, [esi] // _a_g
4705 lea esi, [esi + 4]
4706 psrlw xmm1, 8 // _a_g
4707 por xmm0, xmm4 // set alpha to 255
4708 pmullw xmm1, xmm3 // _a_g * alpha
4709 psrlw xmm2, 8 // _r_b convert to 8 bits again
4710 paddusb xmm0, xmm2 // + src argb
4711 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4712 paddusb xmm0, xmm1 // + src argb
4713 sub ecx, 1
4714 movd [edx], xmm0
4715 lea edx, [edx + 4]
4716 jge convertloop1
4717
4718 convertloop1b:
4719 pop esi
4720 ret
4721 }
4722 }
4723 #endif // HAS_ARGBBLENDROW_SSSE3
4724
4725 #ifdef HAS_ARGBATTENUATEROW_SSE2
4726 // Attenuate 4 pixels at a time.
4727 // Aligned to 16 bytes.
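// For reference, a hedged scalar sketch of the attenuation below
// (hypothetical helper; not compiled):
#if 0
static void ARGBAttenuatePixel_C(const uint8* src, uint8* dst) {
  // The SIMD computes (v * 257) * (a * 257) >> 24, a close approximation of
  // v * a / 255; alpha is copied through unchanged.
  uint32 a = src[3];
  dst[0] = (uint8)((src[0] * 257u * (a * 257u)) >> 24);
  dst[1] = (uint8)((src[1] * 257u * (a * 257u)) >> 24);
  dst[2] = (uint8)((src[2] * 257u * (a * 257u)) >> 24);
  dst[3] = (uint8)a;
}
#endif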
4728 __declspec(naked) __declspec(align(16))
4729 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
4730 __asm {
4731 mov eax, [esp + 4] // src_argb0
4732 mov edx, [esp + 8] // dst_argb
4733 mov ecx, [esp + 12] // width
4734 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4735 pslld xmm4, 24
4736 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
4737 psrld xmm5, 8
4738
4739 align 4
4740 convertloop:
4741 movdqa xmm0, [eax] // read 4 pixels
4742 punpcklbw xmm0, xmm0 // first 2
4743 pshufhw xmm2, xmm0, 0FFh // 8 alpha words
4744 pshuflw xmm2, xmm2, 0FFh
4745 pmulhuw xmm0, xmm2 // rgb * a
4746 movdqa xmm1, [eax] // read 4 pixels
4747 punpckhbw xmm1, xmm1 // next 2 pixels
4748 pshufhw xmm2, xmm1, 0FFh // 8 alpha words
4749 pshuflw xmm2, xmm2, 0FFh
4750 pmulhuw xmm1, xmm2 // rgb * a
4751 movdqa xmm2, [eax] // alphas
4752 lea eax, [eax + 16]
4753 psrlw xmm0, 8
4754 pand xmm2, xmm4
4755 psrlw xmm1, 8
4756 packuswb xmm0, xmm1
4757 pand xmm0, xmm5 // keep original alphas
4758 por xmm0, xmm2
4759 sub ecx, 4
4760 movdqa [edx], xmm0
4761 lea edx, [edx + 16]
4762 jg convertloop
4763
4764 ret
4765 }
4766 }
4767 #endif // HAS_ARGBATTENUATEROW_SSE2
4768
4769 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4770 // Shuffle table duplicating alpha.
4771 static const uvec8 kShuffleAlpha0 = {
4772 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4773 };
4774 static const uvec8 kShuffleAlpha1 = {
4775 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4776 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4777 };
4778 __declspec(naked) __declspec(align(16))
4779 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4780 __asm {
4781 mov eax, [esp + 4] // src_argb0
4782 mov edx, [esp + 8] // dst_argb
4783 mov ecx, [esp + 12] // width
4784 pcmpeqb xmm3, xmm3 // generate mask 0xff000000
4785 pslld xmm3, 24
4786 movdqa xmm4, kShuffleAlpha0
4787 movdqa xmm5, kShuffleAlpha1
4788
4789 align 4
4790 convertloop:
4791 movdqu xmm0, [eax] // read 4 pixels
4792 pshufb xmm0, xmm4 // isolate first 2 alphas
4793 movdqu xmm1, [eax] // read 4 pixels
4794 punpcklbw xmm1, xmm1 // first 2 pixel rgbs
4795 pmulhuw xmm0, xmm1 // rgb * a
4796 movdqu xmm1, [eax] // read 4 pixels
4797 pshufb xmm1, xmm5 // isolate next 2 alphas
4798 movdqu xmm2, [eax] // read 4 pixels
4799 punpckhbw xmm2, xmm2 // next 2 pixel rgbs
4800 pmulhuw xmm1, xmm2 // rgb * a
4801 movdqu xmm2, [eax] // mask original alpha
4802 lea eax, [eax + 16]
4803 pand xmm2, xmm3
4804 psrlw xmm0, 8
4805 psrlw xmm1, 8
4806 packuswb xmm0, xmm1
4807 por xmm0, xmm2 // copy original alpha
4808 sub ecx, 4
4809 movdqu [edx], xmm0
4810 lea edx, [edx + 16]
4811 jg convertloop
4812
4813 ret
4814 }
4815 }
4816 #endif // HAS_ARGBATTENUATEROW_SSSE3
4817
4818 #ifdef HAS_ARGBATTENUATEROW_AVX2
4819 // Shuffle table duplicating alpha.
4820 static const ulvec8 kShuffleAlpha_AVX2 = {
4821 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
4822 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
4823 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
4824 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
4825 };
4826 __declspec(naked) __declspec(align(16))
4827 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
4828 __asm {
4829 mov eax, [esp + 4] // src_argb0
4830 mov edx, [esp + 8] // dst_argb
4831 mov ecx, [esp + 12] // width
4832 sub edx, eax
4833 vmovdqa ymm4, kShuffleAlpha_AVX2
4834 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
4835 vpslld ymm5, ymm5, 24
4836
4837 align 4
4838 convertloop:
4839 vmovdqu ymm6, [eax] // read 8 pixels.
4840 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4841 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4842 vpshufb ymm2, ymm0, ymm4 // low 4 alphas
4843 vpshufb ymm3, ymm1, ymm4 // high 4 alphas
4844 vpmulhuw ymm0, ymm0, ymm2 // rgb * a
4845 vpmulhuw ymm1, ymm1, ymm3 // rgb * a
4846 vpand ymm6, ymm6, ymm5 // isolate alpha
4847 vpsrlw ymm0, ymm0, 8
4848 vpsrlw ymm1, ymm1, 8
4849 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4850 vpor ymm0, ymm0, ymm6 // copy original alpha
4851 sub ecx, 8
4852 vmovdqu [eax + edx], ymm0
4853 lea eax, [eax + 32]
4854 jg convertloop
4855
4856 vzeroupper
4857 ret
4858 }
4859 }
4860 #endif // HAS_ARGBATTENUATEROW_AVX2
4861
4862 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4863 // Unattenuate 4 pixels at a time.
4864 // Aligned to 16 bytes.
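// Conceptually the inverse of attenuation. A hedged scalar sketch
// (hypothetical helper; the real code uses the fixed point reciprocals in
// fixed_invtbl8 instead of a divide; not compiled):
#if 0
static void ARGBUnattenuatePixel_C(const uint8* src, uint8* dst) {
  uint32 a = src[3];
  if (a == 0) {
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2];  // simplification.
  } else {
    uint32 b = src[0] * 255u / a;
    uint32 g = src[1] * 255u / a;
    uint32 r = src[2] * 255u / a;
    dst[0] = (uint8)(b > 255 ? 255 : b);
    dst[1] = (uint8)(g > 255 ? 255 : g);
    dst[2] = (uint8)(r > 255 ? 255 : r);
  }
  dst[3] = (uint8)a;
}
#endif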
4865 __declspec(naked) __declspec(align(16))
4866 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4867 int width) {
4868 __asm {
4869 push esi
4870 push edi
4871 mov eax, [esp + 8 + 4] // src_argb0
4872 mov edx, [esp + 8 + 8] // dst_argb
4873 mov ecx, [esp + 8 + 12] // width
4874
4875 align 4
4876 convertloop:
4877 movdqu xmm0, [eax] // read 4 pixels
4878 movzx esi, byte ptr [eax + 3] // first alpha
4879 movzx edi, byte ptr [eax + 7] // second alpha
4880 punpcklbw xmm0, xmm0 // first 2
4881 movd xmm2, dword ptr fixed_invtbl8[esi * 4]
4882 movd xmm3, dword ptr fixed_invtbl8[edi * 4]
4883 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
4884 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4885 movlhps xmm2, xmm3
4886 pmulhuw xmm0, xmm2 // rgb * a
4887
4888 movdqu xmm1, [eax] // read 4 pixels
4889 movzx esi, byte ptr [eax + 11] // third alpha
4890 movzx edi, byte ptr [eax + 15] // fourth alpha
4891 punpckhbw xmm1, xmm1 // next 2
4892 movd xmm2, dword ptr fixed_invtbl8[esi * 4]
4893 movd xmm3, dword ptr fixed_invtbl8[edi * 4]
4894 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
4895 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4896 movlhps xmm2, xmm3
4897 pmulhuw xmm1, xmm2 // rgb * a
4898 lea eax, [eax + 16]
4899
4900 packuswb xmm0, xmm1
4901 sub ecx, 4
4902 movdqu [edx], xmm0
4903 lea edx, [edx + 16]
4904 jg convertloop
4905 pop edi
4906 pop esi
4907 ret
4908 }
4909 }
4910 #endif // HAS_ARGBUNATTENUATEROW_SSE2
4911
4912 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
4913 // Shuffle table duplicating alpha.
4914 static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
4915 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
4916 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
4917 };
4918 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4919 // USE_GATHER is not on by default, due to being a slow instruction.
4920 #ifdef USE_GATHER
4921 __declspec(naked) __declspec(align(16))
4922 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4923 int width) {
4924 __asm {
4925 mov eax, [esp + 4] // src_argb0
4926 mov edx, [esp + 8] // dst_argb
4927 mov ecx, [esp + 12] // width
4928 sub edx, eax
4929 vmovdqa ymm4, kUnattenShuffleAlpha_AVX2
4930
4931 align 4
4932 convertloop:
4933 vmovdqu ymm6, [eax] // read 8 pixels.
4934 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
4935 vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
4936 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4937 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4938 vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
4939 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
4940 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
4941 vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
4942 vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
4943 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
4944 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
4945 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4946 sub ecx, 8
4947 vmovdqu [eax + edx], ymm0
4948 lea eax, [eax + 32]
4949 jg convertloop
4950
4951 vzeroupper
4952 ret
4953 }
4954 }
4955 #else // USE_GATHER
4956 __declspec(naked) __declspec(align(16))
4957 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4958 int width) {
4959 __asm {
4960
4961 mov eax, [esp + 4] // src_argb0
4962 mov edx, [esp + 8] // dst_argb
4963 mov ecx, [esp + 12] // width
4964 sub edx, eax
4965 vmovdqa ymm5, kUnattenShuffleAlpha_AVX2
4966
4967 push esi
4968 push edi
4969
4970 align 4
4971 convertloop:
4972 // replace VPGATHER
4973 movzx esi, byte ptr [eax + 3] // alpha0
4974 movzx edi, byte ptr [eax + 7] // alpha1
4975 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
4976 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
4977 movzx esi, byte ptr [eax + 11] // alpha2
4978 movzx edi, byte ptr [eax + 15] // alpha3
4979 vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
4980 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
4981 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
4982 movzx esi, byte ptr [eax + 19] // alpha4
4983 movzx edi, byte ptr [eax + 23] // alpha5
4984 vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
4985 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
4986 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
4987 movzx esi, byte ptr [eax + 27] // alpha6
4988 movzx edi, byte ptr [eax + 31] // alpha7
4989 vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
4990 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
4991 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
4992 vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
4993 vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
4994 vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
4995 vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4996 // end of VPGATHER
4997
4998 vmovdqu ymm6, [eax] // read 8 pixels.
4999 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
5000 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
5001 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
5002 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
5003 vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
5004 vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
5005 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
5006 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
5007 vpackuswb ymm0, ymm0, ymm1 // unmutated.
5008 sub ecx, 8
5009 vmovdqu [eax + edx], ymm0
5010 lea eax, [eax + 32]
5011 jg convertloop
5012
5013 pop edi
5014 pop esi
5015 vzeroupper
5016 ret
5017 }
5018 }
5019 #endif // USE_GATHER
5020 #endif // HAS_ARGBUNATTENUATEROW_AVX2
5021
5022 #ifdef HAS_ARGBGRAYROW_SSSE3
5023 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
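// For reference, a hedged scalar sketch of the gray conversion below, using
// the kARGBToYJ weights and kAddYJ64 rounding (hypothetical helper; not
// compiled):
#if 0
static void ARGBGrayPixel_C(const uint8* src, uint8* dst) {
  uint32 y = (15 * src[0] + 75 * src[1] + 38 * src[2] + 64) >> 7;  // B, G, R
  dst[0] = (uint8)y;
  dst[1] = (uint8)y;
  dst[2] = (uint8)y;
  dst[3] = src[3];  // alpha is preserved.
}
#endif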
5024 __declspec(naked) __declspec(align(16))
5025 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
5026 __asm {
5027 mov eax, [esp + 4] /* src_argb */
5028 mov edx, [esp + 8] /* dst_argb */
5029 mov ecx, [esp + 12] /* width */
5030 movdqa xmm4, kARGBToYJ
5031 movdqa xmm5, kAddYJ64
5032
5033 align 4
5034 convertloop:
5035 movdqa xmm0, [eax] // G
5036 movdqa xmm1, [eax + 16]
5037 pmaddubsw xmm0, xmm4
5038 pmaddubsw xmm1, xmm4
5039 phaddw xmm0, xmm1
5040 paddw xmm0, xmm5 // Add .5 for rounding.
5041 psrlw xmm0, 7
5042 packuswb xmm0, xmm0 // 8 G bytes
5043 movdqa xmm2, [eax] // A
5044 movdqa xmm3, [eax + 16]
5045 lea eax, [eax + 32]
5046 psrld xmm2, 24
5047 psrld xmm3, 24
5048 packuswb xmm2, xmm3
5049 packuswb xmm2, xmm2 // 8 A bytes
5050 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
5051 punpcklbw xmm0, xmm0 // 8 GG words
5052 punpcklbw xmm3, xmm2 // 8 GA words
5053 movdqa xmm1, xmm0
5054 punpcklwd xmm0, xmm3 // GGGA first 4
5055 punpckhwd xmm1, xmm3 // GGGA next 4
5056 sub ecx, 8
5057 movdqa [edx], xmm0
5058 movdqa [edx + 16], xmm1
5059 lea edx, [edx + 32]
5060 jg convertloop
5061 ret
5062 }
5063 }
5064 #endif // HAS_ARGBGRAYROW_SSSE3
5065
5066 #ifdef HAS_ARGBSEPIAROW_SSSE3
5067 // b = (r * 35 + g * 68 + b * 17) >> 7
5068 // g = (r * 45 + g * 88 + b * 22) >> 7
5069 // r = (r * 50 + g * 98 + b * 24) >> 7
5070 // Constant for ARGB color to sepia tone.
5071 static const vec8 kARGBToSepiaB = {
5072 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
5073 };
5074
5075 static const vec8 kARGBToSepiaG = {
5076 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
5077 };
5078
5079 static const vec8 kARGBToSepiaR = {
5080 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
5081 };
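// For reference, a hedged scalar sketch of the sepia transform encoded by the
// tables above (hypothetical helper; not compiled):
#if 0
static void ARGBSepiaPixel_C(uint8* p) {
  uint32 b = (17 * p[0] + 68 * p[1] + 35 * p[2]) >> 7;
  uint32 g = (22 * p[0] + 88 * p[1] + 45 * p[2]) >> 7;
  uint32 r = (24 * p[0] + 98 * p[1] + 50 * p[2]) >> 7;
  p[0] = (uint8)(b > 255 ? 255 : b);  // results saturate like packuswb.
  p[1] = (uint8)(g > 255 ? 255 : g);
  p[2] = (uint8)(r > 255 ? 255 : r);
  // p[3] (alpha) is left as is.
}
#endif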
5082
5083 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
5084 __declspec(naked) __declspec(align(16))
5085 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
5086 __asm {
5087 mov eax, [esp + 4] /* dst_argb */
5088 mov ecx, [esp + 8] /* width */
5089 movdqa xmm2, kARGBToSepiaB
5090 movdqa xmm3, kARGBToSepiaG
5091 movdqa xmm4, kARGBToSepiaR
5092
5093 align 4
5094 convertloop:
5095 movdqa xmm0, [eax] // B
5096 movdqa xmm6, [eax + 16]
5097 pmaddubsw xmm0, xmm2
5098 pmaddubsw xmm6, xmm2
5099 phaddw xmm0, xmm6
5100 psrlw xmm0, 7
5101 packuswb xmm0, xmm0 // 8 B values
5102 movdqa xmm5, [eax] // G
5103 movdqa xmm1, [eax + 16]
5104 pmaddubsw xmm5, xmm3
5105 pmaddubsw xmm1, xmm3
5106 phaddw xmm5, xmm1
5107 psrlw xmm5, 7
5108 packuswb xmm5, xmm5 // 8 G values
5109 punpcklbw xmm0, xmm5 // 8 BG values
5110 movdqa xmm5, [eax] // R
5111 movdqa xmm1, [eax + 16]
5112 pmaddubsw xmm5, xmm4
5113 pmaddubsw xmm1, xmm4
5114 phaddw xmm5, xmm1
5115 psrlw xmm5, 7
5116 packuswb xmm5, xmm5 // 8 R values
5117 movdqa xmm6, [eax] // A
5118 movdqa xmm1, [eax + 16]
5119 psrld xmm6, 24
5120 psrld xmm1, 24
5121 packuswb xmm6, xmm1
5122 packuswb xmm6, xmm6 // 8 A values
5123 punpcklbw xmm5, xmm6 // 8 RA values
5124 movdqa xmm1, xmm0 // Weave BG, RA together
5125 punpcklwd xmm0, xmm5 // BGRA first 4
5126 punpckhwd xmm1, xmm5 // BGRA next 4
5127 sub ecx, 8
5128 movdqa [eax], xmm0
5129 movdqa [eax + 16], xmm1
5130 lea eax, [eax + 32]
5131 jg convertloop
5132 ret
5133 }
5134 }
5135 #endif // HAS_ARGBSEPIAROW_SSSE3
5136
5137 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
5138 // Transform 8 ARGB pixels (32 bytes) with color matrix.
5139 // Same as Sepia except matrix is provided.
5140 // TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
5141 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
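// For reference, a hedged scalar sketch of the matrix transform below
// (hypothetical helper; not compiled):
#if 0
static void ARGBColorMatrixPixel_C(const uint8* src, uint8* dst,
                                   const int8* m) {
  // Each output channel is a signed dot product of the BGRA input with one
  // row of the 4x4 matrix, arithmetically shifted right by 6 and saturated.
  for (int c = 0; c < 4; ++c) {
    int v = (src[0] * m[c * 4 + 0] + src[1] * m[c * 4 + 1] +
             src[2] * m[c * 4 + 2] + src[3] * m[c * 4 + 3]) >> 6;
    dst[c] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}
#endif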
5142 __declspec(naked) __declspec(align(16))
5143 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5144 const int8* matrix_argb, int width) {
5145 __asm {
5146 mov eax, [esp + 4] /* src_argb */
5147 mov edx, [esp + 8] /* dst_argb */
5148 mov ecx, [esp + 12] /* matrix_argb */
5149 movdqu xmm5, [ecx]
5150 pshufd xmm2, xmm5, 0x00
5151 pshufd xmm3, xmm5, 0x55
5152 pshufd xmm4, xmm5, 0xaa
5153 pshufd xmm5, xmm5, 0xff
5154 mov ecx, [esp + 16] /* width */
5155
5156 align 4
5157 convertloop:
5158 movdqa xmm0, [eax] // B
5159 movdqa xmm7, [eax + 16]
5160 pmaddubsw xmm0, xmm2
5161 pmaddubsw xmm7, xmm2
5162 movdqa xmm6, [eax] // G
5163 movdqa xmm1, [eax + 16]
5164 pmaddubsw xmm6, xmm3
5165 pmaddubsw xmm1, xmm3
5166 phaddsw xmm0, xmm7 // B
5167 phaddsw xmm6, xmm1 // G
5168 psraw xmm0, 6 // B
5169 psraw xmm6, 6 // G
5170 packuswb xmm0, xmm0 // 8 B values
5171 packuswb xmm6, xmm6 // 8 G values
5172 punpcklbw xmm0, xmm6 // 8 BG values
5173 movdqa xmm1, [eax] // R
5174 movdqa xmm7, [eax + 16]
5175 pmaddubsw xmm1, xmm4
5176 pmaddubsw xmm7, xmm4
5177 phaddsw xmm1, xmm7 // R
5178 movdqa xmm6, [eax] // A
5179 movdqa xmm7, [eax + 16]
5180 pmaddubsw xmm6, xmm5
5181 pmaddubsw xmm7, xmm5
5182 phaddsw xmm6, xmm7 // A
5183 psraw xmm1, 6 // R
5184 psraw xmm6, 6 // A
5185 packuswb xmm1, xmm1 // 8 R values
5186 packuswb xmm6, xmm6 // 8 A values
5187 punpcklbw xmm1, xmm6 // 8 RA values
5188 movdqa xmm6, xmm0 // Weave BG, RA together
5189 punpcklwd xmm0, xmm1 // BGRA first 4
5190 punpckhwd xmm6, xmm1 // BGRA next 4
5191 sub ecx, 8
5192 movdqa [edx], xmm0
5193 movdqa [edx + 16], xmm6
5194 lea eax, [eax + 32]
5195 lea edx, [edx + 32]
5196 jg convertloop
5197 ret
5198 }
5199 }
5200 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
5201
5202 #ifdef HAS_ARGBQUANTIZEROW_SSE2
5203 // Quantize 4 ARGB pixels (16 bytes).
5204 // Aligned to 16 bytes.
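// For reference, a hedged scalar sketch of the quantization below
// (hypothetical helper; not compiled):
#if 0
static void ARGBQuantizePixel_C(uint8* p, int scale, int interval_size,
                                int interval_offset) {
  // Posterize B, G and R: map each value into a bucket and replace it with
  // that bucket's representative value; alpha is untouched.
  for (int c = 0; c < 3; ++c) {
    int v = ((p[c] * scale) >> 16) * interval_size + interval_offset;
    p[c] = (uint8)(v > 255 ? 255 : v);
  }
}
#endif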
5205 __declspec(naked) __declspec(align(16))
5206 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
5207 int interval_offset, int width) {
5208 __asm {
5209 mov eax, [esp + 4] /* dst_argb */
5210 movd xmm2, [esp + 8] /* scale */
5211 movd xmm3, [esp + 12] /* interval_size */
5212 movd xmm4, [esp + 16] /* interval_offset */
5213 mov ecx, [esp + 20] /* width */
5214 pshuflw xmm2, xmm2, 040h
5215 pshufd xmm2, xmm2, 044h
5216 pshuflw xmm3, xmm3, 040h
5217 pshufd xmm3, xmm3, 044h
5218 pshuflw xmm4, xmm4, 040h
5219 pshufd xmm4, xmm4, 044h
5220 pxor xmm5, xmm5 // constant 0
5221 pcmpeqb xmm6, xmm6 // generate mask 0xff000000
5222 pslld xmm6, 24
5223
5224 align 4
5225 convertloop:
5226 movdqa xmm0, [eax] // read 4 pixels
5227 punpcklbw xmm0, xmm5 // first 2 pixels
5228 pmulhuw xmm0, xmm2 // pixel * scale >> 16
5229 movdqa xmm1, [eax] // read 4 pixels
5230 punpckhbw xmm1, xmm5 // next 2 pixels
5231 pmulhuw xmm1, xmm2
5232 pmullw xmm0, xmm3 // * interval_size
5233 movdqa xmm7, [eax] // read 4 pixels
5234 pmullw xmm1, xmm3
5235 pand xmm7, xmm6 // mask alpha
5236 paddw xmm0, xmm4 // + interval_size / 2
5237 paddw xmm1, xmm4
5238 packuswb xmm0, xmm1
5239 por xmm0, xmm7
5240 sub ecx, 4
5241 movdqa [eax], xmm0
5242 lea eax, [eax + 16]
5243 jg convertloop
5244 ret
5245 }
5246 }
5247 #endif // HAS_ARGBQUANTIZEROW_SSE2
5248
5249 #ifdef HAS_ARGBSHADEROW_SSE2
5250 // Shade 4 pixels at a time by specified value.
5251 // Aligned to 16 bytes.
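// For reference, a hedged scalar sketch of the shade operation below
// (hypothetical helper; not compiled):
#if 0
static void ARGBShadePixel_C(const uint8* src, uint8* dst, uint32 value) {
  // Multiplies every channel (including alpha) by the matching byte of
  // 'value', using the same (x * 257) * (v * 257) >> 24 approximation of
  // x * v / 255 as the SIMD loop.
  for (int c = 0; c < 4; ++c) {
    uint32 v = (value >> (c * 8)) & 0xff;
    dst[c] = (uint8)((src[c] * 257u * (v * 257u)) >> 24);
  }
}
#endif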
5252 __declspec(naked) __declspec(align(16))
5253 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
5254 uint32 value) {
5255 __asm {
5256 mov eax, [esp + 4] // src_argb
5257 mov edx, [esp + 8] // dst_argb
5258 mov ecx, [esp + 12] // width
5259 movd xmm2, [esp + 16] // value
5260 punpcklbw xmm2, xmm2
5261 punpcklqdq xmm2, xmm2
5262
5263 align 4
5264 convertloop:
5265 movdqa xmm0, [eax] // read 4 pixels
5266 lea eax, [eax + 16]
5267 movdqa xmm1, xmm0
5268 punpcklbw xmm0, xmm0 // first 2
5269 punpckhbw xmm1, xmm1 // next 2
5270 pmulhuw xmm0, xmm2 // argb * value
5271 pmulhuw xmm1, xmm2 // argb * value
5272 psrlw xmm0, 8
5273 psrlw xmm1, 8
5274 packuswb xmm0, xmm1
5275 sub ecx, 4
5276 movdqa [edx], xmm0
5277 lea edx, [edx + 16]
5278 jg convertloop
5279
5280 ret
5281 }
5282 }
5283 #endif // HAS_ARGBSHADEROW_SSE2
5284
5285 #ifdef HAS_ARGBMULTIPLYROW_SSE2
5286 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
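// For reference, a hedged scalar sketch of the per-channel multiply below
// (hypothetical helper; not compiled):
#if 0
static void ARGBMultiplyPixel_C(const uint8* src0, const uint8* src1,
                                uint8* dst) {
  // The SIMD computes (s0 * 257 * s1) >> 16, a close approximation of
  // s0 * s1 / 255, for all four channels.
  for (int c = 0; c < 4; ++c) {
    dst[c] = (uint8)((src0[c] * 257u * src1[c]) >> 16);
  }
}
#endif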
5287 __declspec(naked) __declspec(align(16))
5288 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
5289 uint8* dst_argb, int width) {
5290 __asm {
5291 push esi
5292 mov eax, [esp + 4 + 4] // src_argb0
5293 mov esi, [esp + 4 + 8] // src_argb1
5294 mov edx, [esp + 4 + 12] // dst_argb
5295 mov ecx, [esp + 4 + 16] // width
5296 pxor xmm5, xmm5 // constant 0
5297
5298 align 4
5299 convertloop:
5300 movdqu xmm0, [eax] // read 4 pixels from src_argb0
5301 movdqu xmm2, [esi] // read 4 pixels from src_argb1
5302 movdqa xmm1, xmm0
5303 movdqa xmm3, xmm2
5304 punpcklbw xmm0, xmm0 // first 2
5305 punpckhbw xmm1, xmm1 // next 2
5306 punpcklbw xmm2, xmm5 // first 2
5307 punpckhbw xmm3, xmm5 // next 2
5308 pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
5309 pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
5310 lea eax, [eax + 16]
5311 lea esi, [esi + 16]
5312 packuswb xmm0, xmm1
5313 sub ecx, 4
5314 movdqu [edx], xmm0
5315 lea edx, [edx + 16]
5316 jg convertloop
5317
5318 pop esi
5319 ret
5320 }
5321 }
5322 #endif // HAS_ARGBMULTIPLYROW_SSE2
5323
5324 #ifdef HAS_ARGBADDROW_SSE2
5325 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
5326 // TODO(fbarchard): Port this to posix, neon and other math functions.
5327 __declspec(naked) __declspec(align(16))
5328 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
5329 uint8* dst_argb, int width) {
5330 __asm {
5331 push esi
5332 mov eax, [esp + 4 + 4] // src_argb0
5333 mov esi, [esp + 4 + 8] // src_argb1
5334 mov edx, [esp + 4 + 12] // dst_argb
5335 mov ecx, [esp + 4 + 16] // width
5336
5337 sub ecx, 4
5338 jl convertloop49
5339
5340 align 4
5341 convertloop4:
5342 movdqu xmm0, [eax] // read 4 pixels from src_argb0
5343 lea eax, [eax + 16]
5344 movdqu xmm1, [esi] // read 4 pixels from src_argb1
5345 lea esi, [esi + 16]
5346 paddusb xmm0, xmm1 // src_argb0 + src_argb1
5347 sub ecx, 4
5348 movdqu [edx], xmm0
5349 lea edx, [edx + 16]
5350 jge convertloop4
5351
5352 convertloop49:
5353 add ecx, 4 - 1
5354 jl convertloop19
5355
5356 convertloop1:
5357 movd xmm0, [eax] // read 1 pixel from src_argb0
5358 lea eax, [eax + 4]
5359 movd xmm1, [esi] // read 1 pixel from src_argb1
5360 lea esi, [esi + 4]
5361 paddusb xmm0, xmm1 // src_argb0 + src_argb1
5362 sub ecx, 1
5363 movd [edx], xmm0
5364 lea edx, [edx + 4]
5365 jge convertloop1
5366
5367 convertloop19:
5368 pop esi
5369 ret
5370 }
5371 }
5372 #endif // HAS_ARGBADDROW_SSE2
5373
5374 #ifdef HAS_ARGBSUBTRACTROW_SSE2
5375 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
5376 __declspec(naked) __declspec(align(16))
5377 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
5378 uint8* dst_argb, int width) {
5379 __asm {
5380 push esi
5381 mov eax, [esp + 4 + 4] // src_argb0
5382 mov esi, [esp + 4 + 8] // src_argb1
5383 mov edx, [esp + 4 + 12] // dst_argb
5384 mov ecx, [esp + 4 + 16] // width
5385
5386 align 4
5387 convertloop:
5388 movdqu xmm0, [eax] // read 4 pixels from src_argb0
5389 lea eax, [eax + 16]
5390 movdqu xmm1, [esi] // read 4 pixels from src_argb1
5391 lea esi, [esi + 16]
5392 psubusb xmm0, xmm1 // src_argb0 - src_argb1
5393 sub ecx, 4
5394 movdqu [edx], xmm0
5395 lea edx, [edx + 16]
5396 jg convertloop
5397
5398 pop esi
5399 ret
5400 }
5401 }
5402 #endif // HAS_ARGBSUBTRACTROW_SSE2
5403
5404 #ifdef HAS_ARGBMULTIPLYROW_AVX2
5405 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
5406 __declspec(naked) __declspec(align(16))
5407 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
5408 uint8* dst_argb, int width) {
5409 __asm {
5410 push esi
5411 mov eax, [esp + 4 + 4] // src_argb0
5412 mov esi, [esp + 4 + 8] // src_argb1
5413 mov edx, [esp + 4 + 12] // dst_argb
5414 mov ecx, [esp + 4 + 16] // width
5415 vpxor ymm5, ymm5, ymm5 // constant 0
5416
5417 align 4
5418 convertloop:
5419 vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
5420 lea eax, [eax + 32]
5421 vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
5422 lea esi, [esi + 32]
5423 vpunpcklbw ymm0, ymm1, ymm1 // low 4
5424 vpunpckhbw ymm1, ymm1, ymm1 // high 4
5425 vpunpcklbw ymm2, ymm3, ymm5 // low 4
5426 vpunpckhbw ymm3, ymm3, ymm5 // high 4
5427 vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
5428 vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
5429 vpackuswb ymm0, ymm0, ymm1
5430 vmovdqu [edx], ymm0
5431 lea edx, [edx + 32]
5432 sub ecx, 8
5433 jg convertloop
5434
5435 pop esi
5436 vzeroupper
5437 ret
5438 }
5439 }
5440 #endif // HAS_ARGBMULTIPLYROW_AVX2
5441
5442 #ifdef HAS_ARGBADDROW_AVX2
5443 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
5444 __declspec(naked) __declspec(align(16))
5445 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
5446 uint8* dst_argb, int width) {
5447 __asm {
5448 push esi
5449 mov eax, [esp + 4 + 4] // src_argb0
5450 mov esi, [esp + 4 + 8] // src_argb1
5451 mov edx, [esp + 4 + 12] // dst_argb
5452 mov ecx, [esp + 4 + 16] // width
5453
5454 align 4
5455 convertloop:
5456 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
5457 lea eax, [eax + 32]
5458 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
5459 lea esi, [esi + 32]
5460 vmovdqu [edx], ymm0
5461 lea edx, [edx + 32]
5462 sub ecx, 8
5463 jg convertloop
5464
5465 pop esi
5466 vzeroupper
5467 ret
5468 }
5469 }
5470 #endif // HAS_ARGBADDROW_AVX2
5471
5472 #ifdef HAS_ARGBSUBTRACTROW_AVX2
5473 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
5474 __declspec(naked) __declspec(align(16))
5475 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
5476 uint8* dst_argb, int width) {
5477 __asm {
5478 push esi
5479 mov eax, [esp + 4 + 4] // src_argb0
5480 mov esi, [esp + 4 + 8] // src_argb1
5481 mov edx, [esp + 4 + 12] // dst_argb
5482 mov ecx, [esp + 4 + 16] // width
5483
5484 align 4
5485 convertloop:
5486 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
5487 lea eax, [eax + 32]
5488 vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
5489 lea esi, [esi + 32]
5490 vmovdqu [edx], ymm0
5491 lea edx, [edx + 32]
5492 sub ecx, 8
5493 jg convertloop
5494
5495 pop esi
5496 vzeroupper
5497 ret
5498 }
5499 }
5500 #endif // HAS_ARGBSUBTRACTROW_AVX2
5501
5502 #ifdef HAS_SOBELXROW_SSE2
5503 // SobelX as a matrix is
5504 // -1 0 1
5505 // -2 0 2
5506 // -1 0 1
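// For reference, a hedged scalar sketch of one output value (hypothetical
// helper; not compiled). The kernel sign does not matter because the
// absolute value is taken before saturating.
#if 0
static uint8 SobelXPixel_C(const uint8* y0, const uint8* y1, const uint8* y2,
                           int i) {
  int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) + (y2[i] - y2[i + 2]);
  if (s < 0) s = -s;
  return (uint8)(s > 255 ? 255 : s);
}
#endif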
5507 __declspec(naked) __declspec(align(16))
5508 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5509 const uint8* src_y2, uint8* dst_sobelx, int width) {
5510 __asm {
5511 push esi
5512 push edi
5513 mov eax, [esp + 8 + 4] // src_y0
5514 mov esi, [esp + 8 + 8] // src_y1
5515 mov edi, [esp + 8 + 12] // src_y2
5516 mov edx, [esp + 8 + 16] // dst_sobelx
5517 mov ecx, [esp + 8 + 20] // width
5518 sub esi, eax
5519 sub edi, eax
5520 sub edx, eax
5521 pxor xmm5, xmm5 // constant 0
5522
5523 align 4
5524 convertloop:
5525 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
5526 movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
5527 punpcklbw xmm0, xmm5
5528 punpcklbw xmm1, xmm5
5529 psubw xmm0, xmm1
5530 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5531 movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5532 punpcklbw xmm1, xmm5
5533 punpcklbw xmm2, xmm5
5534 psubw xmm1, xmm2
5535 movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
5536 movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
5537 punpcklbw xmm2, xmm5
5538 punpcklbw xmm3, xmm5
5539 psubw xmm2, xmm3
5540 paddw xmm0, xmm2
5541 paddw xmm0, xmm1
5542 paddw xmm0, xmm1
5543 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5544 psubw xmm1, xmm0
5545 pmaxsw xmm0, xmm1
5546 packuswb xmm0, xmm0
5547 sub ecx, 8
5548 movq qword ptr [eax + edx], xmm0
5549 lea eax, [eax + 8]
5550 jg convertloop
5551
5552 pop edi
5553 pop esi
5554 ret
5555 }
5556 }
5557 #endif // HAS_SOBELXROW_SSE2
5558
5559 #ifdef HAS_SOBELYROW_SSE2
5560 // SobelY as a matrix is
5561 // -1 -2 -1
5562 // 0 0 0
5563 // 1 2 1
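// For reference, a hedged scalar sketch of one output value (hypothetical
// helper; not compiled), matching the kernel above up to sign:
#if 0
static uint8 SobelYPixel_C(const uint8* y0, const uint8* y1, int i) {
  int s = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) +
          (y0[i + 2] - y1[i + 2]);
  if (s < 0) s = -s;
  return (uint8)(s > 255 ? 255 : s);
}
#endif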
5564 __declspec(naked) __declspec(align(16))
5565 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5566 uint8* dst_sobely, int width) {
5567 __asm {
5568 push esi
5569 mov eax, [esp + 4 + 4] // src_y0
5570 mov esi, [esp + 4 + 8] // src_y1
5571 mov edx, [esp + 4 + 12] // dst_sobely
5572 mov ecx, [esp + 4 + 16] // width
5573 sub esi, eax
5574 sub edx, eax
5575 pxor xmm5, xmm5 // constant 0
5576
5577 align 4
5578 convertloop:
5579 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
5580 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5581 punpcklbw xmm0, xmm5
5582 punpcklbw xmm1, xmm5
5583 psubw xmm0, xmm1
5584 movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
5585 movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
5586 punpcklbw xmm1, xmm5
5587 punpcklbw xmm2, xmm5
5588 psubw xmm1, xmm2
5589 movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
5590 movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5591 punpcklbw xmm2, xmm5
5592 punpcklbw xmm3, xmm5
5593 psubw xmm2, xmm3
5594 paddw xmm0, xmm2
5595 paddw xmm0, xmm1
5596 paddw xmm0, xmm1
5597 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5598 psubw xmm1, xmm0
5599 pmaxsw xmm0, xmm1
5600 packuswb xmm0, xmm0
5601 sub ecx, 8
5602 movq qword ptr [eax + edx], xmm0
5603 lea eax, [eax + 8]
5604 jg convertloop
5605
5606 pop esi
5607 ret
5608 }
5609 }
5610 #endif // HAS_SOBELYROW_SSE2
5611
5612 #ifdef HAS_SOBELROW_SSE2
5613 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5614 // A = 255
5615 // R = Sobel
5616 // G = Sobel
5617 // B = Sobel
5618 __declspec(naked) __declspec(align(16))
5619 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5620 uint8* dst_argb, int width) {
5621 __asm {
5622 push esi
5623 mov eax, [esp + 4 + 4] // src_sobelx
5624 mov esi, [esp + 4 + 8] // src_sobely
5625 mov edx, [esp + 4 + 12] // dst_argb
5626 mov ecx, [esp + 4 + 16] // width
5627 sub esi, eax
5628 pcmpeqb xmm5, xmm5 // alpha 255
5629 pslld xmm5, 24 // 0xff000000
5630
5631 align 4
5632 convertloop:
5633 movdqa xmm0, [eax] // read 16 pixels src_sobelx
5634 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
5635 lea eax, [eax + 16]
5636 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5637 movdqa xmm2, xmm0 // GG
5638 punpcklbw xmm2, xmm0 // First 8
5639 punpckhbw xmm0, xmm0 // Next 8
5640 movdqa xmm1, xmm2 // GGGG
5641 punpcklwd xmm1, xmm2 // First 4
5642 punpckhwd xmm2, xmm2 // Next 4
5643 por xmm1, xmm5 // GGGA
5644 por xmm2, xmm5
5645 movdqa xmm3, xmm0 // GGGG
5646 punpcklwd xmm3, xmm0 // Next 4
5647 punpckhwd xmm0, xmm0 // Last 4
5648 por xmm3, xmm5 // GGGA
5649 por xmm0, xmm5
5650 sub ecx, 16
5651 movdqa [edx], xmm1
5652 movdqa [edx + 16], xmm2
5653 movdqa [edx + 32], xmm3
5654 movdqa [edx + 48], xmm0
5655 lea edx, [edx + 64]
5656 jg convertloop
5657
5658 pop esi
5659 ret
5660 }
5661 }
5662 #endif // HAS_SOBELROW_SSE2
5663
5664 #ifdef HAS_SOBELTOPLANEROW_SSE2
5665 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
5666 __declspec(naked) __declspec(align(16))
5667 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5668 uint8* dst_y, int width) {
5669 __asm {
5670 push esi
5671 mov eax, [esp + 4 + 4] // src_sobelx
5672 mov esi, [esp + 4 + 8] // src_sobely
5673 mov edx, [esp + 4 + 12] // dst_y
5674 mov ecx, [esp + 4 + 16] // width
5675 sub esi, eax
5676
5677 align 4
5678 convertloop:
5679 movdqa xmm0, [eax] // read 16 pixels src_sobelx
5680 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
5681 lea eax, [eax + 16]
5682 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5683 sub ecx, 16
5684 movdqa [edx], xmm0
5685 lea edx, [edx + 16]
5686 jg convertloop
5687
5688 pop esi
5689 ret
5690 }
5691 }
5692 #endif // HAS_SOBELTOPLANEROW_SSE2
5693
5694 #ifdef HAS_SOBELXYROW_SSE2
5695 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5696 // A = 255
5697 // R = Sobel X
5698 // G = Sobel
5699 // B = Sobel Y
5700 __declspec(naked) __declspec(align(16))
5701 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5702 uint8* dst_argb, int width) {
5703 __asm {
5704 push esi
5705 mov eax, [esp + 4 + 4] // src_sobelx
5706 mov esi, [esp + 4 + 8] // src_sobely
5707 mov edx, [esp + 4 + 12] // dst_argb
5708 mov ecx, [esp + 4 + 16] // width
5709 sub esi, eax
5710 pcmpeqb xmm5, xmm5 // alpha 255
5711
5712 align 4
5713 convertloop:
5714 movdqa xmm0, [eax] // read 16 pixels src_sobelx
5715 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
5716 lea eax, [eax + 16]
5717 movdqa xmm2, xmm0
5718 paddusb xmm2, xmm1 // sobel = sobelx + sobely
5719 movdqa xmm3, xmm0 // XA
5720 punpcklbw xmm3, xmm5
5721 punpckhbw xmm0, xmm5
5722 movdqa xmm4, xmm1 // YS
5723 punpcklbw xmm4, xmm2
5724 punpckhbw xmm1, xmm2
5725 movdqa xmm6, xmm4 // YSXA
5726 punpcklwd xmm6, xmm3 // First 4
5727 punpckhwd xmm4, xmm3 // Next 4
5728 movdqa xmm7, xmm1 // YSXA
5729 punpcklwd xmm7, xmm0 // Next 4
5730 punpckhwd xmm1, xmm0 // Last 4
5731 sub ecx, 16
5732 movdqa [edx], xmm6
5733 movdqa [edx + 16], xmm4
5734 movdqa [edx + 32], xmm7
5735 movdqa [edx + 48], xmm1
5736 lea edx, [edx + 64]
5737 jg convertloop
5738
5739 pop esi
5740 ret
5741 }
5742 }
5743 #endif // HAS_SOBELXYROW_SSE2
5744
5745 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5746 // Consider float CumulativeSum.
5747 // Consider calling CumulativeSum one row at time as needed.
5748 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5749 // Convert cumulative sum for an area to an average for 1 pixel.
5750 // topleft is pointer to top left of CumulativeSum buffer for area.
5751 // botleft is pointer to bottom left of CumulativeSum buffer.
5752 // width is offset from left to right of area in CumulativeSum buffer measured
5753 // in number of ints.
5754 // area is the number of pixels in the area being averaged.
5755 // dst points to pixel to store result to.
5756 // count is number of averaged pixels to produce.
5757 // Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
5758 // aligned.
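// For reference, a hedged scalar sketch of the box-filter average computed
// below (hypothetical helper; not compiled). The SIMD uses a float
// reciprocal multiply instead of the integer divide shown here.
#if 0
static void CumulativeSumToAverage_C(const int32* topleft,
                                     const int32* botleft,
                                     int width, int area, uint8* dst,
                                     int count) {
  // Integral image identity: the sum of a rectangle is tl - tr - bl + br.
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      int32 sum = topleft[c] - topleft[width + c] -
                  botleft[c] + botleft[width + c];
      dst[c] = (uint8)(sum / area);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}
#endif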
5759 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
5760 int width, int area, uint8* dst,
5761 int count) {
5762 __asm {
5763 mov eax, topleft // eax topleft
5764 mov esi, botleft // esi botleft
5765 mov edx, width
5766 movd xmm5, area
5767 mov edi, dst
5768 mov ecx, count
5769 cvtdq2ps xmm5, xmm5
5770 rcpss xmm4, xmm5 // 1.0f / area
5771 pshufd xmm4, xmm4, 0
5772 sub ecx, 4
5773 jl l4b
5774
5775 cmp area, 128 // 128 pixels will not overflow 15 bits.
5776 ja l4
5777
5778 pshufd xmm5, xmm5, 0 // area
5779 pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
5780 psrld xmm6, 16
5781 cvtdq2ps xmm6, xmm6
5782 addps xmm5, xmm6 // (65536.0 + area - 1)
5783 mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
5784 cvtps2dq xmm5, xmm5 // 0.16 fixed point
5785 packssdw xmm5, xmm5 // 16 bit shorts
5786
5787 // 4 pixel loop small blocks.
5788 align 4
5789 s4:
5790 // top left
5791 movdqa xmm0, [eax]
5792 movdqa xmm1, [eax + 16]
5793 movdqa xmm2, [eax + 32]
5794 movdqa xmm3, [eax + 48]
5795
5796 // - top right
5797 psubd xmm0, [eax + edx * 4]
5798 psubd xmm1, [eax + edx * 4 + 16]
5799 psubd xmm2, [eax + edx * 4 + 32]
5800 psubd xmm3, [eax + edx * 4 + 48]
5801 lea eax, [eax + 64]
5802
5803 // - bottom left
5804 psubd xmm0, [esi]
5805 psubd xmm1, [esi + 16]
5806 psubd xmm2, [esi + 32]
5807 psubd xmm3, [esi + 48]
5808
5809 // + bottom right
5810 paddd xmm0, [esi + edx * 4]
5811 paddd xmm1, [esi + edx * 4 + 16]
5812 paddd xmm2, [esi + edx * 4 + 32]
5813 paddd xmm3, [esi + edx * 4 + 48]
5814 lea esi, [esi + 64]
5815
5816 packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
5817 packssdw xmm2, xmm3
5818
5819 pmulhuw xmm0, xmm5
5820 pmulhuw xmm2, xmm5
5821
5822 packuswb xmm0, xmm2
5823 movdqu [edi], xmm0
5824 lea edi, [edi + 16]
5825 sub ecx, 4
5826 jge s4
5827
5828 jmp l4b
5829
5830 // 4 pixel loop
5831 align 4
5832 l4:
5833 // top left
5834 movdqa xmm0, [eax]
5835 movdqa xmm1, [eax + 16]
5836 movdqa xmm2, [eax + 32]
5837 movdqa xmm3, [eax + 48]
5838
5839 // - top right
5840 psubd xmm0, [eax + edx * 4]
5841 psubd xmm1, [eax + edx * 4 + 16]
5842 psubd xmm2, [eax + edx * 4 + 32]
5843 psubd xmm3, [eax + edx * 4 + 48]
5844 lea eax, [eax + 64]
5845
5846 // - bottom left
5847 psubd xmm0, [esi]
5848 psubd xmm1, [esi + 16]
5849 psubd xmm2, [esi + 32]
5850 psubd xmm3, [esi + 48]
5851
5852 // + bottom right
5853 paddd xmm0, [esi + edx * 4]
5854 paddd xmm1, [esi + edx * 4 + 16]
5855 paddd xmm2, [esi + edx * 4 + 32]
5856 paddd xmm3, [esi + edx * 4 + 48]
5857 lea esi, [esi + 64]
5858
5859 cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
5860 cvtdq2ps xmm1, xmm1
5861 mulps xmm0, xmm4
5862 mulps xmm1, xmm4
5863 cvtdq2ps xmm2, xmm2
5864 cvtdq2ps xmm3, xmm3
5865 mulps xmm2, xmm4
5866 mulps xmm3, xmm4
5867 cvtps2dq xmm0, xmm0
5868 cvtps2dq xmm1, xmm1
5869 cvtps2dq xmm2, xmm2
5870 cvtps2dq xmm3, xmm3
5871 packssdw xmm0, xmm1
5872 packssdw xmm2, xmm3
5873 packuswb xmm0, xmm2
5874 movdqu [edi], xmm0
5875 lea edi, [edi + 16]
5876 sub ecx, 4
5877 jge l4
5878
5879 l4b:
5880 add ecx, 4 - 1
5881 jl l1b
5882
5883 // 1 pixel loop
5884 align 4
5885 l1:
5886 movdqa xmm0, [eax]
5887 psubd xmm0, [eax + edx * 4]
5888 lea eax, [eax + 16]
5889 psubd xmm0, [esi]
5890 paddd xmm0, [esi + edx * 4]
5891 lea esi, [esi + 16]
5892 cvtdq2ps xmm0, xmm0
5893 mulps xmm0, xmm4
5894 cvtps2dq xmm0, xmm0
5895 packssdw xmm0, xmm0
5896 packuswb xmm0, xmm0
5897 movd dword ptr [edi], xmm0
5898 lea edi, [edi + 4]
5899 sub ecx, 1
5900 jge l1
5901 l1b:
5902 }
5903 }
5904 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5905
5906 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5907 // Creates a table of cumulative sums where each value is a sum of all values
5908 // above and to the left of the value.
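// Scalar sketch of the integral-image row update below (our naming; not part
// of the library build): a running sum across the row is added to the row
// above, so each entry holds the sum of everything above and to the left.
static void ComputeCumulativeSumRow_SketchC(const uint8* row, int32* cumsum,
                                            const int32* previous_cumsum,
                                            int width) {
  int32 row_sum[4] = {0, 0, 0, 0};
  int x, j;
  for (x = 0; x < width; ++x) {
    for (j = 0; j < 4; ++j) {
      row_sum[j] += row[x * 4 + j];
      cumsum[x * 4 + j] = row_sum[j] + previous_cumsum[x * 4 + j];
    }
  }
}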
5909 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
5910 const int32* previous_cumsum, int width) {
5911 __asm {
5912 mov eax, row
5913 mov edx, cumsum
5914 mov esi, previous_cumsum
5915 mov ecx, width
5916 pxor xmm0, xmm0
5917 pxor xmm1, xmm1
5918
5919 sub ecx, 4
5920 jl l4b
5921 test edx, 15
5922 jne l4b
5923
5924 // 4 pixel loop
5925 align 4
5926 l4:
5927 movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
5928 lea eax, [eax + 16]
5929 movdqa xmm4, xmm2
5930
5931 punpcklbw xmm2, xmm1
5932 movdqa xmm3, xmm2
5933 punpcklwd xmm2, xmm1
5934 punpckhwd xmm3, xmm1
5935
5936 punpckhbw xmm4, xmm1
5937 movdqa xmm5, xmm4
5938 punpcklwd xmm4, xmm1
5939 punpckhwd xmm5, xmm1
5940
5941 paddd xmm0, xmm2
5942 movdqa xmm2, [esi] // previous row above.
5943 paddd xmm2, xmm0
5944
5945 paddd xmm0, xmm3
5946 movdqa xmm3, [esi + 16]
5947 paddd xmm3, xmm0
5948
5949 paddd xmm0, xmm4
5950 movdqa xmm4, [esi + 32]
5951 paddd xmm4, xmm0
5952
5953 paddd xmm0, xmm5
5954 movdqa xmm5, [esi + 48]
5955 lea esi, [esi + 64]
5956 paddd xmm5, xmm0
5957
5958 movdqa [edx], xmm2
5959 movdqa [edx + 16], xmm3
5960 movdqa [edx + 32], xmm4
5961 movdqa [edx + 48], xmm5
5962
5963 lea edx, [edx + 64]
5964 sub ecx, 4
5965 jge l4
5966
5967 l4b:
5968 add ecx, 4 - 1
5969 jl l1b
5970
5971 // 1 pixel loop
5972 align 4
5973 l1:
5974 movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
5975 lea eax, [eax + 4]
5976 punpcklbw xmm2, xmm1
5977 punpcklwd xmm2, xmm1
5978 paddd xmm0, xmm2
5979 movdqu xmm2, [esi]
5980 lea esi, [esi + 16]
5981 paddd xmm2, xmm0
5982 movdqu [edx], xmm2
5983 lea edx, [edx + 16]
5984 sub ecx, 1
5985 jge l1
5986
5987 l1b:
5988 }
5989 }
5990 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
5991
5992 #ifdef HAS_ARGBAFFINEROW_SSE2
5993 // Copy ARGB pixels from source image with slope to a row of destination.
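// Scalar sketch of the affine fetch below (our naming; not part of the
// library build). uv_dudv holds the starting (u, v) source coordinate and the
// per-pixel step (du, dv); each destination pixel copies the source pixel at
// the truncated coordinate.
static void ARGBAffineRow_SketchC(const uint8* src_argb, int src_argb_stride,
                                  uint8* dst_argb, const float* uv_dudv,
                                  int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)u;  // truncate toward zero, like cvttps2dq.
    int y = (int)v;
    const uint8* p = src_argb + y * src_argb_stride + x * 4;
    dst_argb[4 * i + 0] = p[0];
    dst_argb[4 * i + 1] = p[1];
    dst_argb[4 * i + 2] = p[2];
    dst_argb[4 * i + 3] = p[3];
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}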
5994 __declspec(naked) __declspec(align(16))
5995 LIBYUV_API
5996 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
5997 uint8* dst_argb, const float* uv_dudv, int width) {
5998 __asm {
5999 push esi
6000 push edi
6001 mov eax, [esp + 12] // src_argb
6002 mov esi, [esp + 16] // stride
6003 mov edx, [esp + 20] // dst_argb
6004 mov ecx, [esp + 24] // pointer to uv_dudv
6005 movq xmm2, qword ptr [ecx] // uv
6006 movq xmm7, qword ptr [ecx + 8] // dudv
6007 mov ecx, [esp + 28] // width
6008 shl esi, 16 // 4, stride
6009 add esi, 4
6010 movd xmm5, esi
6011 sub ecx, 4
6012 jl l4b
6013
6014 // setup for 4 pixel loop
6015 pshufd xmm7, xmm7, 0x44 // dup dudv
6016 pshufd xmm5, xmm5, 0 // dup 4, stride
6017 movdqa xmm0, xmm2 // x0, y0, x1, y1
6018 addps xmm0, xmm7
6019 movlhps xmm2, xmm0
6020 movdqa xmm4, xmm7
6021 addps xmm4, xmm4 // dudv *= 2
6022 movdqa xmm3, xmm2 // x2, y2, x3, y3
6023 addps xmm3, xmm4
6024 addps xmm4, xmm4 // dudv *= 4
6025
6026 // 4 pixel loop
6027 align 4
6028 l4:
6029 cvttps2dq xmm0, xmm2 // x, y float to int first 2
6030 cvttps2dq xmm1, xmm3 // x, y float to int next 2
6031 packssdw xmm0, xmm1 // x, y as 8 shorts
6032 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
6033 movd esi, xmm0
6034 pshufd xmm0, xmm0, 0x39 // shift right
6035 movd edi, xmm0
6036 pshufd xmm0, xmm0, 0x39 // shift right
6037 movd xmm1, [eax + esi] // read pixel 0
6038 movd xmm6, [eax + edi] // read pixel 1
6039 punpckldq xmm1, xmm6 // combine pixel 0 and 1
6040 addps xmm2, xmm4 // x, y += dx, dy first 2
6041 movq qword ptr [edx], xmm1
6042 movd esi, xmm0
6043 pshufd xmm0, xmm0, 0x39 // shift right
6044 movd edi, xmm0
6045 movd xmm6, [eax + esi] // read pixel 2
6046 movd xmm0, [eax + edi] // read pixel 3
6047 punpckldq xmm6, xmm0 // combine pixel 2 and 3
6048 addps xmm3, xmm4 // x, y += dx, dy next 2
6049 sub ecx, 4
6050 movq qword ptr 8[edx], xmm6
6051 lea edx, [edx + 16]
6052 jge l4
6053
6054 l4b:
6055 add ecx, 4 - 1
6056 jl l1b
6057
6058 // 1 pixel loop
6059 align 4
6060 l1:
6061 cvttps2dq xmm0, xmm2 // x, y float to int
6062 packssdw xmm0, xmm0 // x, y as shorts
6063 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
6064 addps xmm2, xmm7 // x, y += dx, dy
6065 movd esi, xmm0
6066 movd xmm0, [eax + esi] // copy a pixel
6067 sub ecx, 1
6068 movd [edx], xmm0
6069 lea edx, [edx + 4]
6070 jge l1
6071 l1b:
6072 pop edi
6073 pop esi
6074 ret
6075 }
6076 }
6077 #endif // HAS_ARGBAFFINEROW_SSE2
6078
6079 #ifdef HAS_INTERPOLATEROW_AVX2
6080 // Bilinear filter 16x2 -> 16x1
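// Scalar sketch of the vertical blend done by the InterpolateRow_* variants
// in this file (our naming; not part of the library build). source_y_fraction
// is 0..255, where 0 keeps row 0 and 255 is almost all of row 1; the SIMD
// paths scale and round slightly differently but implement the same blend.
static void InterpolateRow_SketchC(uint8* dst_ptr, const uint8* src_ptr,
                                   ptrdiff_t src_stride, int width,
                                   int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0_fraction +
                          src_ptr1[x] * y1_fraction) >> 8);
  }
}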
6081 __declspec(naked) __declspec(align(16))
6082 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
6083 ptrdiff_t src_stride, int dst_width,
6084 int source_y_fraction) {
6085 __asm {
6086 push esi
6087 push edi
6088 mov edi, [esp + 8 + 4] // dst_ptr
6089 mov esi, [esp + 8 + 8] // src_ptr
6090 mov edx, [esp + 8 + 12] // src_stride
6091 mov ecx, [esp + 8 + 16] // dst_width
6092 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6093 shr eax, 1
6094 // Dispatch to specialized filters if applicable.
6095 cmp eax, 0
6096 je xloop100 // 0 / 128. Blend 100 / 0.
6097 sub edi, esi
6098 cmp eax, 32
6099 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
6100 cmp eax, 64
6101 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
6102 cmp eax, 96
6103 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
6104
6105 vmovd xmm0, eax // high fraction 0..127
6106 neg eax
6107 add eax, 128
6108 vmovd xmm5, eax // low fraction 128..1
6109 vpunpcklbw xmm5, xmm5, xmm0
6110 vpunpcklwd xmm5, xmm5, xmm5
6111 vpxor ymm0, ymm0, ymm0
6112 vpermd ymm5, ymm0, ymm5
6113
6114 align 4
6115 xloop:
6116 vmovdqu ymm0, [esi]
6117 vmovdqu ymm2, [esi + edx]
6118 vpunpckhbw ymm1, ymm0, ymm2 // mutates
6119 vpunpcklbw ymm0, ymm0, ymm2 // mutates
6120 vpmaddubsw ymm0, ymm0, ymm5
6121 vpmaddubsw ymm1, ymm1, ymm5
6122 vpsrlw ymm0, ymm0, 7
6123 vpsrlw ymm1, ymm1, 7
6124 vpackuswb ymm0, ymm0, ymm1 // unmutates
6125 sub ecx, 32
6126 vmovdqu [esi + edi], ymm0
6127 lea esi, [esi + 32]
6128 jg xloop
6129 jmp xloop99
6130
6131 // Blend 25 / 75.
6132 align 4
6133 xloop25:
6134 vmovdqu ymm0, [esi]
6135 vpavgb ymm0, ymm0, [esi + edx]
6136 vpavgb ymm0, ymm0, [esi + edx]
6137 sub ecx, 32
6138 vmovdqu [esi + edi], ymm0
6139 lea esi, [esi + 32]
6140 jg xloop25
6141 jmp xloop99
6142
6143 // Blend 50 / 50.
6144 align 4
6145 xloop50:
6146 vmovdqu ymm0, [esi]
6147 vpavgb ymm0, ymm0, [esi + edx]
6148 sub ecx, 32
6149 vmovdqu [esi + edi], ymm0
6150 lea esi, [esi + 32]
6151 jg xloop50
6152 jmp xloop99
6153
6154 // Blend 75 / 25.
6155 align 4
6156 xloop75:
6157 vmovdqu ymm0, [esi + edx]
6158 vpavgb ymm0, ymm0, [esi]
6159 vpavgb ymm0, ymm0, [esi]
6160 sub ecx, 32
6161 vmovdqu [esi + edi], ymm0
6162 lea esi, [esi + 32]
6163 jg xloop75
6164 jmp xloop99
6165
6166 // Blend 100 / 0 - Copy row unchanged.
6167 align 4
6168 xloop100:
6169 rep movsb
6170
6171 xloop99:
6172 pop edi
6173 pop esi
6174 vzeroupper
6175 ret
6176 }
6177 }
6178 #endif // HAS_INTERPOLATEROW_AVX2
6179
6180 #ifdef HAS_INTERPOLATEROW_SSSE3
6181 // Bilinear filter 16x2 -> 16x1
6182 __declspec(naked) __declspec(align(16))
6183 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
6184 ptrdiff_t src_stride, int dst_width,
6185 int source_y_fraction) {
6186 __asm {
6187 push esi
6188 push edi
6189 mov edi, [esp + 8 + 4] // dst_ptr
6190 mov esi, [esp + 8 + 8] // src_ptr
6191 mov edx, [esp + 8 + 12] // src_stride
6192 mov ecx, [esp + 8 + 16] // dst_width
6193 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6194 sub edi, esi
6195 shr eax, 1
6196 // Dispatch to specialized filters if applicable.
6197 cmp eax, 0
6198 je xloop100 // 0 / 128. Blend 100 / 0.
6199 cmp eax, 32
6200 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
6201 cmp eax, 64
6202 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
6203 cmp eax, 96
6204 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
6205
6206 movd xmm0, eax // high fraction 0..127
6207 neg eax
6208 add eax, 128
6209 movd xmm5, eax // low fraction 128..1
6210 punpcklbw xmm5, xmm0
6211 punpcklwd xmm5, xmm5
6212 pshufd xmm5, xmm5, 0
6213
6214 align 4
6215 xloop:
6216 movdqa xmm0, [esi]
6217 movdqa xmm2, [esi + edx]
6218 movdqa xmm1, xmm0
6219 punpcklbw xmm0, xmm2
6220 punpckhbw xmm1, xmm2
6221 pmaddubsw xmm0, xmm5
6222 pmaddubsw xmm1, xmm5
6223 psrlw xmm0, 7
6224 psrlw xmm1, 7
6225 packuswb xmm0, xmm1
6226 sub ecx, 16
6227 movdqa [esi + edi], xmm0
6228 lea esi, [esi + 16]
6229 jg xloop
6230 jmp xloop99
6231
6232 // Blend 25 / 75.
6233 align 4
6234 xloop25:
6235 movdqa xmm0, [esi]
6236 movdqa xmm1, [esi + edx]
6237 pavgb xmm0, xmm1
6238 pavgb xmm0, xmm1
6239 sub ecx, 16
6240 movdqa [esi + edi], xmm0
6241 lea esi, [esi + 16]
6242 jg xloop25
6243 jmp xloop99
6244
6245 // Blend 50 / 50.
6246 align 4
6247 xloop50:
6248 movdqa xmm0, [esi]
6249 movdqa xmm1, [esi + edx]
6250 pavgb xmm0, xmm1
6251 sub ecx, 16
6252 movdqa [esi + edi], xmm0
6253 lea esi, [esi + 16]
6254 jg xloop50
6255 jmp xloop99
6256
6257 // Blend 75 / 25.
6258 align 4
6259 xloop75:
6260 movdqa xmm1, [esi]
6261 movdqa xmm0, [esi + edx]
6262 pavgb xmm0, xmm1
6263 pavgb xmm0, xmm1
6264 sub ecx, 16
6265 movdqa [esi + edi], xmm0
6266 lea esi, [esi + 16]
6267 jg xloop75
6268 jmp xloop99
6269
6270 // Blend 100 / 0 - Copy row unchanged.
6271 align 4
6272 xloop100:
6273 movdqa xmm0, [esi]
6274 sub ecx, 16
6275 movdqa [esi + edi], xmm0
6276 lea esi, [esi + 16]
6277 jg xloop100
6278
6279 xloop99:
6280 pop edi
6281 pop esi
6282 ret
6283 }
6284 }
6285 #endif // HAS_INTERPOLATEROW_SSSE3
6286
6287 #ifdef HAS_INTERPOLATEROW_SSE2
6288 // Bilinear filter 16x2 -> 16x1
6289 __declspec(naked) __declspec(align(16))
6290 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
6291 ptrdiff_t src_stride, int dst_width,
6292 int source_y_fraction) {
6293 __asm {
6294 push esi
6295 push edi
6296 mov edi, [esp + 8 + 4] // dst_ptr
6297 mov esi, [esp + 8 + 8] // src_ptr
6298 mov edx, [esp + 8 + 12] // src_stride
6299 mov ecx, [esp + 8 + 16] // dst_width
6300 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6301 sub edi, esi
6302 // Dispatch to specialized filters if applicable.
6303 cmp eax, 0
6304 je xloop100 // 0 / 256. Blend 100 / 0.
6305 cmp eax, 64
6306 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
6307 cmp eax, 128
6308 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
6309 cmp eax, 192
6310 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
6311
6312 movd xmm5, eax // xmm5 = y fraction
6313 punpcklbw xmm5, xmm5
6314 psrlw xmm5, 1
6315 punpcklwd xmm5, xmm5
6316 punpckldq xmm5, xmm5
6317 punpcklqdq xmm5, xmm5
6318 pxor xmm4, xmm4
6319
6320 align 4
6321 xloop:
6322 movdqa xmm0, [esi] // row0
6323 movdqa xmm2, [esi + edx] // row1
6324 movdqa xmm1, xmm0
6325 movdqa xmm3, xmm2
6326 punpcklbw xmm2, xmm4
6327 punpckhbw xmm3, xmm4
6328 punpcklbw xmm0, xmm4
6329 punpckhbw xmm1, xmm4
6330 psubw xmm2, xmm0 // row1 - row0
6331 psubw xmm3, xmm1
6332 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
6333 paddw xmm3, xmm3
6334 pmulhw xmm2, xmm5 // scale diff
6335 pmulhw xmm3, xmm5
6336 paddw xmm0, xmm2 // sum rows
6337 paddw xmm1, xmm3
6338 packuswb xmm0, xmm1
6339 sub ecx, 16
6340 movdqa [esi + edi], xmm0
6341 lea esi, [esi + 16]
6342 jg xloop
6343 jmp xloop99
6344
6345 // Blend 25 / 75.
6346 align 4
6347 xloop25:
6348 movdqa xmm0, [esi]
6349 movdqa xmm1, [esi + edx]
6350 pavgb xmm0, xmm1
6351 pavgb xmm0, xmm1
6352 sub ecx, 16
6353 movdqa [esi + edi], xmm0
6354 lea esi, [esi + 16]
6355 jg xloop25
6356 jmp xloop99
6357
6358 // Blend 50 / 50.
6359 align 4
6360 xloop50:
6361 movdqa xmm0, [esi]
6362 movdqa xmm1, [esi + edx]
6363 pavgb xmm0, xmm1
6364 sub ecx, 16
6365 movdqa [esi + edi], xmm0
6366 lea esi, [esi + 16]
6367 jg xloop50
6368 jmp xloop99
6369
6370 // Blend 75 / 25.
6371 align 4
6372 xloop75:
6373 movdqa xmm1, [esi]
6374 movdqa xmm0, [esi + edx]
6375 pavgb xmm0, xmm1
6376 pavgb xmm0, xmm1
6377 sub ecx, 16
6378 movdqa [esi + edi], xmm0
6379 lea esi, [esi + 16]
6380 jg xloop75
6381 jmp xloop99
6382
6383 // Blend 100 / 0 - Copy row unchanged.
6384 align 4
6385 xloop100:
6386 movdqa xmm0, [esi]
6387 sub ecx, 16
6388 movdqa [esi + edi], xmm0
6389 lea esi, [esi + 16]
6390 jg xloop100
6391
6392 xloop99:
6393 pop edi
6394 pop esi
6395 ret
6396 }
6397 }
6398 #endif // HAS_INTERPOLATEROW_SSE2
6399
6400 // Bilinear filter 16x2 -> 16x1
6401 __declspec(naked) __declspec(align(16))
6402 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
6403 ptrdiff_t src_stride, int dst_width,
6404 int source_y_fraction) {
6405 __asm {
6406 push esi
6407 push edi
6408 mov edi, [esp + 8 + 4] // dst_ptr
6409 mov esi, [esp + 8 + 8] // src_ptr
6410 mov edx, [esp + 8 + 12] // src_stride
6411 mov ecx, [esp + 8 + 16] // dst_width
6412 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6413 sub edi, esi
6414 shr eax, 1
6415 // Dispatch to specialized filters if applicable.
6416 cmp eax, 0
6417 je xloop100 // 0 / 128. Blend 100 / 0.
6418 cmp eax, 32
6419 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
6420 cmp eax, 64
6421 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
6422 cmp eax, 96
6423 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
6424
6425 movd xmm0, eax // high fraction 0..127
6426 neg eax
6427 add eax, 128
6428 movd xmm5, eax // low fraction 128..1
6429 punpcklbw xmm5, xmm0
6430 punpcklwd xmm5, xmm5
6431 pshufd xmm5, xmm5, 0
6432
6433 align 4
6434 xloop:
6435 movdqu xmm0, [esi]
6436 movdqu xmm2, [esi + edx]
6437 movdqu xmm1, xmm0
6438 punpcklbw xmm0, xmm2
6439 punpckhbw xmm1, xmm2
6440 pmaddubsw xmm0, xmm5
6441 pmaddubsw xmm1, xmm5
6442 psrlw xmm0, 7
6443 psrlw xmm1, 7
6444 packuswb xmm0, xmm1
6445 sub ecx, 16
6446 movdqu [esi + edi], xmm0
6447 lea esi, [esi + 16]
6448 jg xloop
6449 jmp xloop99
6450
6451 // Blend 25 / 75.
6452 align 4
6453 xloop25:
6454 movdqu xmm0, [esi]
6455 movdqu xmm1, [esi + edx]
6456 pavgb xmm0, xmm1
6457 pavgb xmm0, xmm1
6458 sub ecx, 16
6459 movdqu [esi + edi], xmm0
6460 lea esi, [esi + 16]
6461 jg xloop25
6462 jmp xloop99
6463
6464 // Blend 50 / 50.
6465 align 4
6466 xloop50:
6467 movdqu xmm0, [esi]
6468 movdqu xmm1, [esi + edx]
6469 pavgb xmm0, xmm1
6470 sub ecx, 16
6471 movdqu [esi + edi], xmm0
6472 lea esi, [esi + 16]
6473 jg xloop50
6474 jmp xloop99
6475
6476 // Blend 75 / 25.
6477 align 4
6478 xloop75:
6479 movdqu xmm1, [esi]
6480 movdqu xmm0, [esi + edx]
6481 pavgb xmm0, xmm1
6482 pavgb xmm0, xmm1
6483 sub ecx, 16
6484 movdqu [esi + edi], xmm0
6485 lea esi, [esi + 16]
6486 jg xloop75
6487 jmp xloop99
6488
6489 // Blend 100 / 0 - Copy row unchanged.
6490 align 4
6491 xloop100:
6492 movdqu xmm0, [esi]
6493 sub ecx, 16
6494 movdqu [esi + edi], xmm0
6495 lea esi, [esi + 16]
6496 jg xloop100
6497
6498 xloop99:
6499 pop edi
6500 pop esi
6501 ret
6502 }
6503 }
6504
6505 #ifdef HAS_INTERPOLATEROW_SSE2
6506 // Bilinear filter 16x2 -> 16x1
6507 __declspec(naked) __declspec(align(16))
6508 void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
6509 ptrdiff_t src_stride, int dst_width,
6510 int source_y_fraction) {
6511 __asm {
6512 push esi
6513 push edi
6514 mov edi, [esp + 8 + 4] // dst_ptr
6515 mov esi, [esp + 8 + 8] // src_ptr
6516 mov edx, [esp + 8 + 12] // src_stride
6517 mov ecx, [esp + 8 + 16] // dst_width
6518 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6519 sub edi, esi
6520 // Dispatch to specialized filters if applicable.
6521 cmp eax, 0
6522 je xloop100 // 0 / 256. Blend 100 / 0.
6523 cmp eax, 64
6524 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
6525 cmp eax, 128
6526 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
6527 cmp eax, 192
6528 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
6529
6530 movd xmm5, eax // xmm5 = y fraction
6531 punpcklbw xmm5, xmm5
6532 psrlw xmm5, 1
6533 punpcklwd xmm5, xmm5
6534 punpckldq xmm5, xmm5
6535 punpcklqdq xmm5, xmm5
6536 pxor xmm4, xmm4
6537
6538 align 4
6539 xloop:
6540 movdqu xmm0, [esi] // row0
6541 movdqu xmm2, [esi + edx] // row1
6542 movdqu xmm1, xmm0
6543 movdqu xmm3, xmm2
6544 punpcklbw xmm2, xmm4
6545 punpckhbw xmm3, xmm4
6546 punpcklbw xmm0, xmm4
6547 punpckhbw xmm1, xmm4
6548 psubw xmm2, xmm0 // row1 - row0
6549 psubw xmm3, xmm1
6550 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
6551 paddw xmm3, xmm3
6552 pmulhw xmm2, xmm5 // scale diff
6553 pmulhw xmm3, xmm5
6554 paddw xmm0, xmm2 // sum rows
6555 paddw xmm1, xmm3
6556 packuswb xmm0, xmm1
6557 sub ecx, 16
6558 movdqu [esi + edi], xmm0
6559 lea esi, [esi + 16]
6560 jg xloop
6561 jmp xloop99
6562
6563 // Blend 25 / 75.
6564 align 4
6565 xloop25:
6566 movdqu xmm0, [esi]
6567 movdqu xmm1, [esi + edx]
6568 pavgb xmm0, xmm1
6569 pavgb xmm0, xmm1
6570 sub ecx, 16
6571 movdqu [esi + edi], xmm0
6572 lea esi, [esi + 16]
6573 jg xloop25
6574 jmp xloop99
6575
6576 // Blend 50 / 50.
6577 align 4
6578 xloop50:
6579 movdqu xmm0, [esi]
6580 movdqu xmm1, [esi + edx]
6581 pavgb xmm0, xmm1
6582 sub ecx, 16
6583 movdqu [esi + edi], xmm0
6584 lea esi, [esi + 16]
6585 jg xloop50
6586 jmp xloop99
6587
6588 // Blend 75 / 25.
6589 align 4
6590 xloop75:
6591 movdqu xmm1, [esi]
6592 movdqu xmm0, [esi + edx]
6593 pavgb xmm0, xmm1
6594 pavgb xmm0, xmm1
6595 sub ecx, 16
6596 movdqu [esi + edi], xmm0
6597 lea esi, [esi + 16]
6598 jg xloop75
6599 jmp xloop99
6600
6601 // Blend 100 / 0 - Copy row unchanged.
6602 align 4
6603 xloop100:
6604 movdqu xmm0, [esi]
6605 sub ecx, 16
6606 movdqu [esi + edi], xmm0
6607 lea esi, [esi + 16]
6608 jg xloop100
6609
6610 xloop99:
6611 pop edi
6612 pop esi
6613 ret
6614 }
6615 }
6616 #endif // HAS_INTERPOLATEROW_SSE2
6617
6618 __declspec(naked) __declspec(align(16))
6619 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
6620 uint8* dst_uv, int pix) {
6621 __asm {
6622 push edi
6623 mov eax, [esp + 4 + 4] // src_uv
6624 mov edx, [esp + 4 + 8] // src_uv_stride
6625 mov edi, [esp + 4 + 12] // dst_v
6626 mov ecx, [esp + 4 + 16] // pix
6627 sub edi, eax
6628
6629 align 4
6630 convertloop:
6631 movdqa xmm0, [eax]
6632 pavgb xmm0, [eax + edx]
6633 sub ecx, 16
6634 movdqa [eax + edi], xmm0
6635 lea eax, [eax + 16]
6636 jg convertloop
6637 pop edi
6638 ret
6639 }
6640 }
6641
6642 #ifdef HAS_HALFROW_AVX2
6643 __declspec(naked) __declspec(align(16))
6644 void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
6645 uint8* dst_uv, int pix) {
6646 __asm {
6647 push edi
6648 mov eax, [esp + 4 + 4] // src_uv
6649 mov edx, [esp + 4 + 8] // src_uv_stride
6650 mov edi, [esp + 4 + 12] // dst_v
6651 mov ecx, [esp + 4 + 16] // pix
6652 sub edi, eax
6653
6654 align 4
6655 convertloop:
6656 vmovdqu ymm0, [eax]
6657 vpavgb ymm0, ymm0, [eax + edx]
6658 sub ecx, 32
6659 vmovdqu [eax + edi], ymm0
6660 lea eax, [eax + 32]
6661 jg convertloop
6662
6663 pop edi
6664 vzeroupper
6665 ret
6666 }
6667 }
6668 #endif // HAS_HALFROW_AVX2
6669
6670 __declspec(naked) __declspec(align(16))
6671 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
6672 uint32 selector, int pix) {
6673 __asm {
6674 mov eax, [esp + 4] // src_argb
6675 mov edx, [esp + 8] // dst_bayer
6676 movd xmm5, [esp + 12] // selector
6677 mov ecx, [esp + 16] // pix
6678 pshufd xmm5, xmm5, 0
6679
6680 align 4
6681 wloop:
6682 movdqa xmm0, [eax]
6683 movdqa xmm1, [eax + 16]
6684 lea eax, [eax + 32]
6685 pshufb xmm0, xmm5
6686 pshufb xmm1, xmm5
6687 punpckldq xmm0, xmm1
6688 sub ecx, 8
6689 movq qword ptr [edx], xmm0
6690 lea edx, [edx + 8]
6691 jg wloop
6692 ret
6693 }
6694 }
6695
6696 // Specialized ARGB to Bayer that just isolates G channel.
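// In scalar terms the row below reduces to taking the G byte of each pixel
// (our sketch and naming; not part of the library build):
static void ARGBToBayerGGRow_SketchC(const uint8* src_argb, uint8* dst_bayer,
                                     uint32 selector, int pix) {
  int i;
  (void)selector;  // Ignored; this specialization always takes G.
  for (i = 0; i < pix; ++i) {
    dst_bayer[i] = src_argb[i * 4 + 1];  // G byte of the BGRA-ordered pixel.
  }
}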
6697 __declspec(naked) __declspec(align(16))
6698 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
6699 uint32 selector, int pix) {
6700 __asm {
6701 mov eax, [esp + 4] // src_argb
6702 mov edx, [esp + 8] // dst_bayer
6703 // selector
6704 mov ecx, [esp + 16] // pix
6705 pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
6706 psrld xmm5, 24
6707
6708 align 4
6709 wloop:
6710 movdqa xmm0, [eax]
6711 movdqa xmm1, [eax + 16]
6712 lea eax, [eax + 32]
6713 psrld xmm0, 8 // Move green to bottom.
6714 psrld xmm1, 8
6715 pand xmm0, xmm5
6716 pand xmm1, xmm5
6717 packssdw xmm0, xmm1
6718 packuswb xmm0, xmm1
6719 sub ecx, 8
6720 movq qword ptr [edx], xmm0
6721 lea edx, [edx + 8]
6722 jg wloop
6723 ret
6724 }
6725 }
6726
6727 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
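// Scalar sketch of the channel shuffle the rows below perform (our naming;
// not part of the library build). The first 4 bytes of shuffler give, for
// each output byte 0..3, the index of the source byte within a 4-byte pixel.
static void ARGBShuffleRow_SketchC(const uint8* src_argb, uint8* dst_argb,
                                   const uint8* shuffler, int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    dst_argb[4 * i + 0] = src_argb[4 * i + shuffler[0]];
    dst_argb[4 * i + 1] = src_argb[4 * i + shuffler[1]];
    dst_argb[4 * i + 2] = src_argb[4 * i + shuffler[2]];
    dst_argb[4 * i + 3] = src_argb[4 * i + shuffler[3]];
  }
}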
6728 __declspec(naked) __declspec(align(16))
6729 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
6730 const uint8* shuffler, int pix) {
6731 __asm {
6732 mov eax, [esp + 4] // src_argb
6733 mov edx, [esp + 8] // dst_argb
6734 mov ecx, [esp + 12] // shuffler
6735 movdqa xmm5, [ecx]
6736 mov ecx, [esp + 16] // pix
6737
6738 align 4
6739 wloop:
6740 movdqa xmm0, [eax]
6741 movdqa xmm1, [eax + 16]
6742 lea eax, [eax + 32]
6743 pshufb xmm0, xmm5
6744 pshufb xmm1, xmm5
6745 sub ecx, 8
6746 movdqa [edx], xmm0
6747 movdqa [edx + 16], xmm1
6748 lea edx, [edx + 32]
6749 jg wloop
6750 ret
6751 }
6752 }
6753
6754 __declspec(naked) __declspec(align(16))
6755 void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
6756 const uint8* shuffler, int pix) {
6757 __asm {
6758 mov eax, [esp + 4] // src_argb
6759 mov edx, [esp + 8] // dst_argb
6760 mov ecx, [esp + 12] // shuffler
6761 movdqa xmm5, [ecx]
6762 mov ecx, [esp + 16] // pix
6763
6764 align 4
6765 wloop:
6766 movdqu xmm0, [eax]
6767 movdqu xmm1, [eax + 16]
6768 lea eax, [eax + 32]
6769 pshufb xmm0, xmm5
6770 pshufb xmm1, xmm5
6771 sub ecx, 8
6772 movdqu [edx], xmm0
6773 movdqu [edx + 16], xmm1
6774 lea edx, [edx + 32]
6775 jg wloop
6776 ret
6777 }
6778 }
6779
6780 #ifdef HAS_ARGBSHUFFLEROW_AVX2
6781 __declspec(naked) __declspec(align(16))
6782 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
6783 const uint8* shuffler, int pix) {
6784 __asm {
6785 mov eax, [esp + 4] // src_argb
6786 mov edx, [esp + 8] // dst_argb
6787 mov ecx, [esp + 12] // shuffler
6788 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
6789 mov ecx, [esp + 16] // pix
6790
6791 align 4
6792 wloop:
6793 vmovdqu ymm0, [eax]
6794 vmovdqu ymm1, [eax + 32]
6795 lea eax, [eax + 64]
6796 vpshufb ymm0, ymm0, ymm5
6797 vpshufb ymm1, ymm1, ymm5
6798 sub ecx, 16
6799 vmovdqu [edx], ymm0
6800 vmovdqu [edx + 32], ymm1
6801 lea edx, [edx + 64]
6802 jg wloop
6803
6804 vzeroupper
6805 ret
6806 }
6807 }
6808 #endif // HAS_ARGBSHUFFLEROW_AVX2
6809
6810 __declspec(naked) __declspec(align(16))
6811 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
6812 const uint8* shuffler, int pix) {
6813 __asm {
6814 push ebx
6815 push esi
6816 mov eax, [esp + 8 + 4] // src_argb
6817 mov edx, [esp + 8 + 8] // dst_argb
6818 mov esi, [esp + 8 + 12] // shuffler
6819 mov ecx, [esp + 8 + 16] // pix
6820 pxor xmm5, xmm5
6821
6822 mov ebx, [esi] // shuffler
6823 cmp ebx, 0x03000102
6824 je shuf_3012
6825 cmp ebx, 0x00010203
6826 je shuf_0123
6827 cmp ebx, 0x00030201
6828 je shuf_0321
6829 cmp ebx, 0x02010003
6830 je shuf_2103
6831
6832 // TODO(fbarchard): Use one source pointer and 3 offsets.
6833 shuf_any1:
6834 movzx ebx, byte ptr [esi]
6835 movzx ebx, byte ptr [eax + ebx]
6836 mov [edx], bl
6837 movzx ebx, byte ptr [esi + 1]
6838 movzx ebx, byte ptr [eax + ebx]
6839 mov [edx + 1], bl
6840 movzx ebx, byte ptr [esi + 2]
6841 movzx ebx, byte ptr [eax + ebx]
6842 mov [edx + 2], bl
6843 movzx ebx, byte ptr [esi + 3]
6844 movzx ebx, byte ptr [eax + ebx]
6845 mov [edx + 3], bl
6846 lea eax, [eax + 4]
6847 lea edx, [edx + 4]
6848 sub ecx, 1
6849 jg shuf_any1
6850 jmp shuf99
6851
6852 align 4
6853 shuf_0123:
6854 movdqu xmm0, [eax]
6855 lea eax, [eax + 16]
6856 movdqa xmm1, xmm0
6857 punpcklbw xmm0, xmm5
6858 punpckhbw xmm1, xmm5
6859 pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
6860 pshuflw xmm0, xmm0, 01Bh
6861 pshufhw xmm1, xmm1, 01Bh
6862 pshuflw xmm1, xmm1, 01Bh
6863 packuswb xmm0, xmm1
6864 sub ecx, 4
6865 movdqu [edx], xmm0
6866 lea edx, [edx + 16]
6867 jg shuf_0123
6868 jmp shuf99
6869
6870 align 4
6871 shuf_0321:
6872 movdqu xmm0, [eax]
6873 lea eax, [eax + 16]
6874 movdqa xmm1, xmm0
6875 punpcklbw xmm0, xmm5
6876 punpckhbw xmm1, xmm5
6877 pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
6878 pshuflw xmm0, xmm0, 039h
6879 pshufhw xmm1, xmm1, 039h
6880 pshuflw xmm1, xmm1, 039h
6881 packuswb xmm0, xmm1
6882 sub ecx, 4
6883 movdqu [edx], xmm0
6884 lea edx, [edx + 16]
6885 jg shuf_0321
6886 jmp shuf99
6887
6888 align 4
6889 shuf_2103:
6890 movdqu xmm0, [eax]
6891 lea eax, [eax + 16]
6892 movdqa xmm1, xmm0
6893 punpcklbw xmm0, xmm5
6894 punpckhbw xmm1, xmm5
6895 pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
6896 pshuflw xmm0, xmm0, 093h
6897 pshufhw xmm1, xmm1, 093h
6898 pshuflw xmm1, xmm1, 093h
6899 packuswb xmm0, xmm1
6900 sub ecx, 4
6901 movdqu [edx], xmm0
6902 lea edx, [edx + 16]
6903 jg shuf_2103
6904 jmp shuf99
6905
6906 align 4
6907 shuf_3012:
6908 movdqu xmm0, [eax]
6909 lea eax, [eax + 16]
6910 movdqa xmm1, xmm0
6911 punpcklbw xmm0, xmm5
6912 punpckhbw xmm1, xmm5
6913 pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
6914 pshuflw xmm0, xmm0, 0C6h
6915 pshufhw xmm1, xmm1, 0C6h
6916 pshuflw xmm1, xmm1, 0C6h
6917 packuswb xmm0, xmm1
6918 sub ecx, 4
6919 movdqu [edx], xmm0
6920 lea edx, [edx + 16]
6921 jg shuf_3012
6922
6923 shuf99:
6924 pop esi
6925 pop ebx
6926 ret
6927 }
6928 }
6929
6930 // YUY2 - Macro-pixel = 2 image pixels
6931 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
6932
6933 // UYVY - Macro-pixel = 2 image pixels
6934 // U0Y0V0Y1
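// Scalar sketch of the YUY2 packing below (our naming; not part of the
// library build): each pair of Y samples shares one U and one V sample.
// The UYVY variant swaps the byte order to U Y V Y. Odd-width tail handling
// is omitted here.
static void I422ToYUY2Row_SketchC(const uint8* src_y, const uint8* src_u,
                                  const uint8* src_v, uint8* dst_frame,
                                  int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_frame[0] = src_y[0];  // Y0
    dst_frame[1] = src_u[0];  // U
    dst_frame[2] = src_y[1];  // Y1
    dst_frame[3] = src_v[0];  // V
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}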
6935
6936 __declspec(naked) __declspec(align(16))
6937 void I422ToYUY2Row_SSE2(const uint8* src_y,
6938 const uint8* src_u,
6939 const uint8* src_v,
6940 uint8* dst_frame, int width) {
6941 __asm {
6942 push esi
6943 push edi
6944 mov eax, [esp + 8 + 4] // src_y
6945 mov esi, [esp + 8 + 8] // src_u
6946 mov edx, [esp + 8 + 12] // src_v
6947 mov edi, [esp + 8 + 16] // dst_frame
6948 mov ecx, [esp + 8 + 20] // width
6949 sub edx, esi
6950
6951 align 4
6952 convertloop:
6953 movq xmm2, qword ptr [esi] // U
6954 movq xmm3, qword ptr [esi + edx] // V
6955 lea esi, [esi + 8]
6956 punpcklbw xmm2, xmm3 // UV
6957 movdqu xmm0, [eax] // Y
6958 lea eax, [eax + 16]
6959 movdqa xmm1, xmm0
6960 punpcklbw xmm0, xmm2 // YUYV
6961 punpckhbw xmm1, xmm2
6962 movdqu [edi], xmm0
6963 movdqu [edi + 16], xmm1
6964 lea edi, [edi + 32]
6965 sub ecx, 16
6966 jg convertloop
6967
6968 pop edi
6969 pop esi
6970 ret
6971 }
6972 }
6973
6974 __declspec(naked) __declspec(align(16))
6975 void I422ToUYVYRow_SSE2(const uint8* src_y,
6976 const uint8* src_u,
6977 const uint8* src_v,
6978 uint8* dst_frame, int width) {
6979 __asm {
6980 push esi
6981 push edi
6982 mov eax, [esp + 8 + 4] // src_y
6983 mov esi, [esp + 8 + 8] // src_u
6984 mov edx, [esp + 8 + 12] // src_v
6985 mov edi, [esp + 8 + 16] // dst_frame
6986 mov ecx, [esp + 8 + 20] // width
6987 sub edx, esi
6988
6989 align 4
6990 convertloop:
6991 movq xmm2, qword ptr [esi] // U
6992 movq xmm3, qword ptr [esi + edx] // V
6993 lea esi, [esi + 8]
6994 punpcklbw xmm2, xmm3 // UV
6995 movdqu xmm0, [eax] // Y
6996 movdqa xmm1, xmm2
6997 lea eax, [eax + 16]
6998 punpcklbw xmm1, xmm0 // UYVY
6999 punpckhbw xmm2, xmm0
7000 movdqu [edi], xmm1
7001 movdqu [edi + 16], xmm2
7002 lea edi, [edi + 32]
7003 sub ecx, 16
7004 jg convertloop
7005
7006 pop edi
7007 pop esi
7008 ret
7009 }
7010 }
7011
7012 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
7013 __declspec(naked) __declspec(align(16))
7014 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
7015 uint8* dst_argb, const float* poly,
7016 int width) {
7017 __asm {
7018 push esi
7019 mov eax, [esp + 4 + 4] /* src_argb */
7020 mov edx, [esp + 4 + 8] /* dst_argb */
7021 mov esi, [esp + 4 + 12] /* poly */
7022 mov ecx, [esp + 4 + 16] /* width */
7023 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
7024
7025 // 2 pixel loop.
7026 align 4
7027 convertloop:
7028 // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
7029 // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
7030 movq xmm0, qword ptr [eax] // BGRABGRA
7031 lea eax, [eax + 8]
7032 punpcklbw xmm0, xmm3
7033 movdqa xmm4, xmm0
7034 punpcklwd xmm0, xmm3 // pixel 0
7035 punpckhwd xmm4, xmm3 // pixel 1
7036 cvtdq2ps xmm0, xmm0 // 4 floats
7037 cvtdq2ps xmm4, xmm4
7038 movdqa xmm1, xmm0 // X
7039 movdqa xmm5, xmm4
7040 mulps xmm0, [esi + 16] // C1 * X
7041 mulps xmm4, [esi + 16]
7042 addps xmm0, [esi] // result = C0 + C1 * X
7043 addps xmm4, [esi]
7044 movdqa xmm2, xmm1
7045 movdqa xmm6, xmm5
7046 mulps xmm2, xmm1 // X * X
7047 mulps xmm6, xmm5
7048 mulps xmm1, xmm2 // X * X * X
7049 mulps xmm5, xmm6
7050 mulps xmm2, [esi + 32] // C2 * X * X
7051 mulps xmm6, [esi + 32]
7052 mulps xmm1, [esi + 48] // C3 * X * X * X
7053 mulps xmm5, [esi + 48]
7054 addps xmm0, xmm2 // result += C2 * X * X
7055 addps xmm4, xmm6
7056 addps xmm0, xmm1 // result += C3 * X * X * X
7057 addps xmm4, xmm5
7058 cvttps2dq xmm0, xmm0
7059 cvttps2dq xmm4, xmm4
7060 packuswb xmm0, xmm4
7061 packuswb xmm0, xmm0
7062 sub ecx, 2
7063 movq qword ptr [edx], xmm0
7064 lea edx, [edx + 8]
7065 jg convertloop
7066 pop esi
7067 ret
7068 }
7069 }
7070 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
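// Scalar sketch of the per-channel cubic applied by the SSE2 row above and
// the AVX2 row below (our naming; not part of the library build). poly holds
// four coefficients per channel, grouped as C0[4], C1[4], C2[4], C3[4]; each
// channel x is remapped to C0 + C1*x + C2*x^2 + C3*x^3. Exact truncation and
// saturation details of the SIMD paths may differ slightly.
static void ARGBPolynomialRow_SketchC(const uint8* src_argb, uint8* dst_argb,
                                      const float* poly, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      float x = (float)src_argb[i * 4 + j];
      float v = poly[j] + poly[j + 4] * x + poly[j + 8] * x * x +
                poly[j + 12] * x * x * x;
      if (v < 0.f) v = 0.f;
      if (v > 255.f) v = 255.f;
      dst_argb[i * 4 + j] = (uint8)v;
    }
  }
}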
7071
7072 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
7073 __declspec(naked) __declspec(align(16))
7074 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
7075 uint8* dst_argb, const float* poly,
7076 int width) {
7077 __asm {
7078 mov eax, [esp + 4] /* src_argb */
7079 mov edx, [esp + 8] /* dst_argb */
7080 mov ecx, [esp + 12] /* poly */
7081 vbroadcastf128 ymm4, [ecx] // C0
7082 vbroadcastf128 ymm5, [ecx + 16] // C1
7083 vbroadcastf128 ymm6, [ecx + 32] // C2
7084 vbroadcastf128 ymm7, [ecx + 48] // C3
7085 mov ecx, [esp + 16] /* width */
7086
7087 // 2 pixel loop.
7088 align 4
7089 convertloop:
7090 vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
7091 lea eax, [eax + 8]
7092 vcvtdq2ps ymm0, ymm0 // X 8 floats
7093 vmulps ymm2, ymm0, ymm0 // X * X
7094 vmulps ymm3, ymm0, ymm7 // C3 * X
7095 vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
7096 vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
7097 vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
7098 vcvttps2dq ymm0, ymm0
7099 vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
7100 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
7101 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
7102 sub ecx, 2
7103 vmovq qword ptr [edx], xmm0
7104 lea edx, [edx + 8]
7105 jg convertloop
7106 vzeroupper
7107 ret
7108 }
7109 }
7110 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
7111
7112 #ifdef HAS_ARGBCOLORTABLEROW_X86
7113 // Transform ARGB pixels with color table.
7114 __declspec(naked) __declspec(align(16))
7115 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
7116 int width) {
7117 __asm {
7118 push esi
7119 mov eax, [esp + 4 + 4] /* dst_argb */
7120 mov esi, [esp + 4 + 8] /* table_argb */
7121 mov ecx, [esp + 4 + 12] /* width */
7122
7123 // 1 pixel loop.
7124 align 4
7125 convertloop:
7126 movzx edx, byte ptr [eax]
7127 lea eax, [eax + 4]
7128 movzx edx, byte ptr [esi + edx * 4]
7129 mov byte ptr [eax - 4], dl
7130 movzx edx, byte ptr [eax - 4 + 1]
7131 movzx edx, byte ptr [esi + edx * 4 + 1]
7132 mov byte ptr [eax - 4 + 1], dl
7133 movzx edx, byte ptr [eax - 4 + 2]
7134 movzx edx, byte ptr [esi + edx * 4 + 2]
7135 mov byte ptr [eax - 4 + 2], dl
7136 movzx edx, byte ptr [eax - 4 + 3]
7137 movzx edx, byte ptr [esi + edx * 4 + 3]
7138 mov byte ptr [eax - 4 + 3], dl
7139 dec ecx
7140 jg convertloop
7141 pop esi
7142 ret
7143 }
7144 }
7145 #endif // HAS_ARGBCOLORTABLEROW_X86
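// Scalar sketch of the in-place lookup done by ARGBColorTableRow_X86 above
// (our naming; not part of the library build): each channel indexes its own
// plane of the interleaved 256-entry BGRA table. The RGB variant below is
// the same except that the alpha byte is left untouched.
static void ARGBColorTableRow_SketchC(uint8* dst_argb, const uint8* table_argb,
                                      int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_argb[4 * i + 0] = table_argb[dst_argb[4 * i + 0] * 4 + 0];
    dst_argb[4 * i + 1] = table_argb[dst_argb[4 * i + 1] * 4 + 1];
    dst_argb[4 * i + 2] = table_argb[dst_argb[4 * i + 2] * 4 + 2];
    dst_argb[4 * i + 3] = table_argb[dst_argb[4 * i + 3] * 4 + 3];
  }
}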
7146
7147 #ifdef HAS_RGBCOLORTABLEROW_X86
7148 // Transform RGB pixels with color table.
7149 __declspec(naked) __declspec(align(16))
7150 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
7151 __asm {
7152 push esi
7153 mov eax, [esp + 4 + 4] /* dst_argb */
7154 mov esi, [esp + 4 + 8] /* table_argb */
7155 mov ecx, [esp + 4 + 12] /* width */
7156
7157 // 1 pixel loop.
7158 align 4
7159 convertloop:
7160 movzx edx, byte ptr [eax]
7161 lea eax, [eax + 4]
7162 movzx edx, byte ptr [esi + edx * 4]
7163 mov byte ptr [eax - 4], dl
7164 movzx edx, byte ptr [eax - 4 + 1]
7165 movzx edx, byte ptr [esi + edx * 4 + 1]
7166 mov byte ptr [eax - 4 + 1], dl
7167 movzx edx, byte ptr [eax - 4 + 2]
7168 movzx edx, byte ptr [esi + edx * 4 + 2]
7169 mov byte ptr [eax - 4 + 2], dl
7170 dec ecx
7171 jg convertloop
7172
7173 pop esi
7174 ret
7175 }
7176 }
7177 #endif // HAS_RGBCOLORTABLEROW_X86
7178
7179 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
7180 // Transform RGB pixels with luma table.
7181 __declspec(naked) __declspec(align(16))
7182 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
7183 int width,
7184 const uint8* luma, uint32 lumacoeff) {
7185 __asm {
7186 push esi
7187 push edi
7188 mov eax, [esp + 8 + 4] /* src_argb */
7189 mov edi, [esp + 8 + 8] /* dst_argb */
7190 mov ecx, [esp + 8 + 12] /* width */
7191 movd xmm2, dword ptr [esp + 8 + 16] // luma table
7192 movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
7193 pshufd xmm2, xmm2, 0
7194 pshufd xmm3, xmm3, 0
7195 pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
7196 psllw xmm4, 8
7197 pxor xmm5, xmm5
7198
7199 // 4 pixel loop.
7200 align 4
7201 convertloop:
7202 movdqu xmm0, qword ptr [eax] // generate luma ptr
7203 pmaddubsw xmm0, xmm3
7204 phaddw xmm0, xmm0
7205 pand xmm0, xmm4 // mask out low bits
7206 punpcklwd xmm0, xmm5
7207 paddd xmm0, xmm2 // add table base
7208 movd esi, xmm0
7209 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
7210
7211 movzx edx, byte ptr [eax]
7212 movzx edx, byte ptr [esi + edx]
7213 mov byte ptr [edi], dl
7214 movzx edx, byte ptr [eax + 1]
7215 movzx edx, byte ptr [esi + edx]
7216 mov byte ptr [edi + 1], dl
7217 movzx edx, byte ptr [eax + 2]
7218 movzx edx, byte ptr [esi + edx]
7219 mov byte ptr [edi + 2], dl
7220 movzx edx, byte ptr [eax + 3] // copy alpha.
7221 mov byte ptr [edi + 3], dl
7222
7223 movd esi, xmm0
7224 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
7225
7226 movzx edx, byte ptr [eax + 4]
7227 movzx edx, byte ptr [esi + edx]
7228 mov byte ptr [edi + 4], dl
7229 movzx edx, byte ptr [eax + 5]
7230 movzx edx, byte ptr [esi + edx]
7231 mov byte ptr [edi + 5], dl
7232 movzx edx, byte ptr [eax + 6]
7233 movzx edx, byte ptr [esi + edx]
7234 mov byte ptr [edi + 6], dl
7235 movzx edx, byte ptr [eax + 7] // copy alpha.
7236 mov byte ptr [edi + 7], dl
7237
7238 movd esi, xmm0
7239 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
7240
7241 movzx edx, byte ptr [eax + 8]
7242 movzx edx, byte ptr [esi + edx]
7243 mov byte ptr [edi + 8], dl
7244 movzx edx, byte ptr [eax + 9]
7245 movzx edx, byte ptr [esi + edx]
7246 mov byte ptr [edi + 9], dl
7247 movzx edx, byte ptr [eax + 10]
7248 movzx edx, byte ptr [esi + edx]
7249 mov byte ptr [edi + 10], dl
7250 movzx edx, byte ptr [eax + 11] // copy alpha.
7251 mov byte ptr [edi + 11], dl
7252
7253 movd esi, xmm0
7254
7255 movzx edx, byte ptr [eax + 12]
7256 movzx edx, byte ptr [esi + edx]
7257 mov byte ptr [edi + 12], dl
7258 movzx edx, byte ptr [eax + 13]
7259 movzx edx, byte ptr [esi + edx]
7260 mov byte ptr [edi + 13], dl
7261 movzx edx, byte ptr [eax + 14]
7262 movzx edx, byte ptr [esi + edx]
7263 mov byte ptr [edi + 14], dl
7264 movzx edx, byte ptr [eax + 15] // copy alpha.
7265 mov byte ptr [edi + 15], dl
7266
7267 sub ecx, 4
7268 lea eax, [eax + 16]
7269 lea edi, [edi + 16]
7270 jg convertloop
7271
7272 pop edi
7273 pop esi
7274 ret
7275 }
7276 }
7277 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
7278
7279 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
7280
7281 #ifdef __cplusplus
7282 } // extern "C"
7283 } // namespace libyuv
7284 #endif
7285