/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)

// TODO(fbarchard): I420ToRGB24, I420ToRAW
#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
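
// The Y coefficients above are approximately the BT.601 luma weights scaled
// by 128 (0.098 B, 0.504 G, 0.257 R), applied in the byte order each format
// stores in memory (B, G, R, A for little-endian ARGB). A minimal scalar
// sketch of the row math (helper name is illustrative, not libyuv API):
static __inline uint8 RGBToYScalar(uint8 r, uint8 g, uint8 b) {
  // pmaddubsw + phaddw compute 13*B + 65*G + 33*R; psrlw 7 and paddb 16
  // finish the fixed-point sum.
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}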

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting BGRA to ARGB.
static const uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting ABGR to ARGB.
static const uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting RGBA to ARGB.
static const uvec8 kShuffleMaskRGBAToARGB = {
  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
};

// Shuffle table for converting ARGB to RGBA.
static const uvec8 kShuffleMaskARGBToRGBA = {
  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
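
// How these tables drive pshufb, as a scalar sketch: output byte i takes
// source byte mask[i]; a mask byte with its high bit set (the 128u entries
// in the RGB24/RAW tables) produces zero instead. Illustrative helper only:
static __inline void ShuffleBytesScalar(const uint8* src, const uint8* mask,
                                        uint8* dst) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}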

__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_y
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0xff000000
    pslld xmm5, 24

    align 16
  convertloop:
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0
    punpckhwd xmm1, xmm1
    por xmm0, xmm5
    por xmm1, xmm5
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
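
// Scalar sketch of the row above: the punpck sequence replicates each Y
// byte into B, G and R, and xmm5 supplies opaque alpha. Illustrative only:
static __inline void I400ToARGBScalar(const uint8* src_y, uint8* dst_argb,
                                      int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    dst_argb[0] = dst_argb[1] = dst_argb[2] = src_y[i];  // B = G = R = Y
    dst_argb[3] = 255;                                   // A
    dst_argb += 4;
  }
}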

__declspec(naked) __declspec(align(16))
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_bgra
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // pix
    movdqa xmm5, kShuffleMaskBGRAToARGB
    sub edx, eax

    align 16
  convertloop:
    movdqa xmm0, [eax]
    pshufb xmm0, xmm5
    sub ecx, 4
    movdqa [eax + edx], xmm0
    lea eax, [eax + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_abgr
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // pix
    movdqa xmm5, kShuffleMaskABGRToARGB
    sub edx, eax

    align 16
  convertloop:
    movdqa xmm0, [eax]
    pshufb xmm0, xmm5
    sub ecx, 4
    movdqa [eax + edx], xmm0
    lea eax, [eax + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_rgba
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // pix
    movdqa xmm5, kShuffleMaskRGBAToARGB
    sub edx, eax

    align 16
  convertloop:
    movdqa xmm0, [eax]
    pshufb xmm0, xmm5
    sub ecx, 4
    movdqa [eax + edx], xmm0
    lea eax, [eax + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgba
    mov ecx, [esp + 12]  // pix
    movdqa xmm5, kShuffleMaskARGBToRGBA
    sub edx, eax

    align 16
  convertloop:
    movdqa xmm0, [eax]
    pshufb xmm0, xmm5
    sub ecx, 4
    movdqa [eax + edx], xmm0
    lea eax, [eax + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_rgb24
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, kShuffleMaskRGB24ToARGB

    align 16
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8  // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12  // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb xmm0, xmm4
    movdqa [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqa [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4  // xmm3 = { xmm3[4:15] }
    pshufb xmm3, xmm4
    movdqa [edx + 16], xmm1
    por xmm3, xmm5
    sub ecx, 16
    movdqa [edx + 48], xmm3
    lea edx, [edx + 64]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_raw
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, kShuffleMaskRAWToARGB

    align 16
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8  // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12  // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb xmm0, xmm4
    movdqa [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqa [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4  // xmm3 = { xmm3[4:15] }
    pshufb xmm3, xmm4
    movdqa [edx + 16], xmm1
    por xmm3, xmm5
    sub ecx, 16
    movdqa [edx + 48], xmm3
    lea edx, [edx + 64]
    jg convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    pcmpeqb xmm4, xmm4  // generate mask 0x07e007e0 for Green
    psllw xmm4, 10
    psrlw xmm4, 5
    pcmpeqb xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4]  // src_rgb565
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // pix
    sub edx, eax
    sub edx, eax

    align 16
  convertloop:
    movdqu xmm0, [eax]  // fetch 8 pixels of rgb565
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    pand xmm1, xmm3  // R in upper 5 bits
    psllw xmm2, 11  // B in upper 5 bits
    pmulhuw xmm1, xmm5  // * (256 + 8)
    pmulhuw xmm2, xmm5  // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2  // RB
    pand xmm0, xmm4  // G in middle 6 bits
    pmulhuw xmm0, xmm6  // << 5 * (256 + 4)
    por xmm0, xmm7  // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqa [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
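
// The multiplies above replicate the top bits of each field to fill 8 bits.
// Worked scalar forms (illustrative helpers, assuming the field has been
// shifted to the top of a 16-bit word as the code does):
static __inline uint8 Expand5To8(uint8 v5) {  // v5 in [0, 31]
  // (v5 << 11) * (256 + 8) >> 16 == (v5 << 3) | (v5 >> 2)
  return (uint8)(((uint32)(v5 << 11) * 0x0108u) >> 16);
}
static __inline uint8 Expand6To8(uint8 v6) {  // v6 in [0, 63]
  // (v6 << 10) * (256 + 4) >> 16 == (v6 << 2) | (v6 >> 4)
  return (uint8)(((uint32)(v6 << 10) * 0x0104u) >> 16);
}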

// 24 instructions
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    movdqa xmm4, xmm3  // generate mask 0x03e003e0 for Green
    psrlw xmm4, 6
    pcmpeqb xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4]  // src_argb1555
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // pix
    sub edx, eax
    sub edx, eax

    align 16
  convertloop:
    movdqu xmm0, [eax]  // fetch 8 pixels of 1555
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    psllw xmm1, 1  // R in upper 5 bits
    psllw xmm2, 11  // B in upper 5 bits
    pand xmm1, xmm3
    pmulhuw xmm2, xmm5  // * (256 + 8)
    pmulhuw xmm1, xmm5  // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2  // RB
    movdqa xmm2, xmm0
    pand xmm0, xmm4  // G in middle 5 bits
    psraw xmm2, 8  // A
    pmulhuw xmm0, xmm6  // << 6 * (256 + 8)
    pand xmm2, xmm7
    por xmm0, xmm2  // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqa [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd xmm4, eax
    pshufd xmm4, xmm4, 0
    movdqa xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
    pslld xmm5, 4
    mov eax, [esp + 4]  // src_argb4444
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // pix
    sub edx, eax
    sub edx, eax

    align 16
  convertloop:
    movdqu xmm0, [eax]  // fetch 8 pixels of bgra4444
    movdqa xmm2, xmm0
    pand xmm0, xmm4  // mask low nibbles
    pand xmm2, xmm5  // mask high nibbles
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    psllw xmm1, 4
    psrlw xmm3, 4
    por xmm0, xmm1
    por xmm2, xmm3
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqa [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqa [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
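
// The nibble expansion above is, per channel, just v * 17: shifting a nibble
// left 4 and or-ing in the original copies it into both halves of the byte.
// Illustrative scalar form:
static __inline uint8 Expand4To8(uint8 v4) {  // v4 in [0, 15]
  return (uint8)(v4 * 0x11);  // == (v4 << 4) | v4
}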

__declspec(naked) __declspec(align(16))
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // pix
    movdqa xmm6, kShuffleMaskARGBToRGB24

    align 16
  convertloop:
    movdqa xmm0, [eax]  // fetch 16 pixels of argb
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq xmm1, 4  // 8 bytes from 1
    pslldq xmm4, 12  // 4 bytes from 1 for 0
    movdqa xmm5, xmm2  // 8 bytes from 2 for 1
    por xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq xmm5, 8  // 8 bytes from 2 for 1
    movdqa [edx], xmm0  // store 0
    por xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq xmm2, 8  // 4 bytes from 2
    pslldq xmm3, 4  // 12 bytes from 3 for 2
    por xmm2, xmm3  // 12 bytes from 3 for 2
    movdqa [edx + 16], xmm1  // store 1
    movdqa [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // pix
    movdqa xmm6, kShuffleMaskARGBToRAW

    align 16
  convertloop:
    movdqa xmm0, [eax]  // fetch 16 pixels of argb
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq xmm1, 4  // 8 bytes from 1
    pslldq xmm4, 12  // 4 bytes from 1 for 0
    movdqa xmm5, xmm2  // 8 bytes from 2 for 1
    por xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq xmm5, 8  // 8 bytes from 2 for 1
    movdqa [edx], xmm0  // store 0
    por xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq xmm2, 8  // 4 bytes from 2
    pslldq xmm3, 4  // 12 bytes from 3 for 2
    por xmm2, xmm3  // 12 bytes from 3 for 2
    movdqa [edx + 16], xmm1  // store 1
    movdqa [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm3, xmm3  // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4  // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5  // generate mask 0xfffff800
    pslld xmm5, 11

    align 16
  convertloop:
    movdqa xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    pslld xmm0, 8  // R
    psrld xmm1, 3  // B
    psrld xmm2, 5  // G
    psrad xmm0, 16  // R
    pand xmm1, xmm3  // B
    pand xmm2, xmm4  // G
    pand xmm0, xmm5  // R
    por xmm1, xmm2  // BG
    por xmm0, xmm1  // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
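
// Scalar sketch of the RGB565 packing above: truncate each channel to its
// field width and place B, G, R at bits 0, 5 and 11 of a 16-bit pixel
// (bytes are B, G, R, A in memory). Illustrative helper only:
static __inline uint16 ARGBToRGB565Scalar(const uint8* argb) {
  return (uint16)((argb[0] >> 3) |          // B -> bits 0-4
                  ((argb[1] >> 2) << 5) |   // G -> bits 5-10
                  ((argb[2] >> 3) << 11));  // R -> bits 11-15
}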

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm4, xmm4  // generate mask 0x0000001f
    psrld xmm4, 27
    movdqa xmm5, xmm4  // generate mask 0x000003e0
    pslld xmm5, 5
    movdqa xmm6, xmm4  // generate mask 0x00007c00
    pslld xmm6, 10
    pcmpeqb xmm7, xmm7  // generate mask 0xffff8000
    pslld xmm7, 15

    align 16
  convertloop:
    movdqa xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    movdqa xmm3, xmm0  // R
    psrad xmm0, 16  // A
    psrld xmm1, 3  // B
    psrld xmm2, 6  // G
    psrld xmm3, 9  // R
    pand xmm0, xmm7  // A
    pand xmm1, xmm4  // B
    pand xmm2, xmm5  // G
    pand xmm3, xmm6  // R
    por xmm0, xmm1  // BA
    por xmm2, xmm3  // GR
    por xmm0, xmm2  // BGRA
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm4, xmm4  // generate mask 0xf000f000
    psllw xmm4, 12
    movdqa xmm3, xmm4  // generate mask 0x00f000f0
    psrlw xmm3, 8

    align 16
  convertloop:
    movdqa xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0
    pand xmm0, xmm3  // low nibble
    pand xmm1, xmm4  // high nibble
    psrlw xmm0, 4
    psrlw xmm1, 8
    por xmm0, xmm1
    packuswb xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]  /* src_argb */
    mov edx, [esp + 8]  /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kARGBToY

    align 16
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]  /* src_argb */
    mov edx, [esp + 8]  /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kARGBToY

    align 16
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]  /* src_argb */
    mov edx, [esp + 8]  /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kBGRAToY

    align 16
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]  /* src_argb */
    mov edx, [esp + 8]  /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kBGRAToY

    align 16
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]  /* src_argb */
    mov edx, [esp + 8]  /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kABGRToY

    align 16
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]  /* src_argb */
    mov edx, [esp + 8]  /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kABGRToY

    align 16
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]  /* src_argb */
    mov edx, [esp + 8]  /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kRGBAToY

    align 16
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]  /* src_argb */
    mov edx, [esp + 8]  /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kRGBAToY

    align 16
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kARGBToU
    movdqa xmm6, kARGBToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    align 16
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pavgb xmm0, [eax + esi]
    pavgb xmm1, [eax + esi + 16]
    pavgb xmm2, [eax + esi + 32]
    pavgb xmm3, [eax + esi + 48]
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
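
// Scalar sketch of the UV row above: pixels are first averaged 2x2 (pavgb
// plus the shufps/pavgb pair), then weighted by kARGBToU/kARGBToV (roughly
// the BT.601 chroma weights scaled by 256), shifted right 8 and biased by
// 128. Illustrative helpers, not libyuv API:
static __inline uint8 RGBToUScalar(int r, int g, int b) {
  return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static __inline uint8 RGBToVScalar(int r, int g, int b) {
  return (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}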

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kARGBToU
    movdqa xmm6, kARGBToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    align 16
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kBGRAToU
    movdqa xmm6, kBGRAToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    align 16
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pavgb xmm0, [eax + esi]
    pavgb xmm1, [eax + esi + 16]
    pavgb xmm2, [eax + esi + 32]
    pavgb xmm3, [eax + esi + 48]
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kBGRAToU
    movdqa xmm6, kBGRAToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    align 16
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kABGRToU
    movdqa xmm6, kABGRToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    align 16
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pavgb xmm0, [eax + esi]
    pavgb xmm1, [eax + esi + 16]
    pavgb xmm2, [eax + esi + 32]
    pavgb xmm3, [eax + esi + 48]
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kABGRToU
    movdqa xmm6, kABGRToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    align 16
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kRGBAToU
    movdqa xmm6, kRGBAToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    align 16
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pavgb xmm0, [eax + esi]
    pavgb xmm1, [eax + esi + 16]
    pavgb xmm2, [eax + esi + 32]
    pavgb xmm3, [eax + esi + 48]
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kRGBAToU
    movdqa xmm6, kRGBAToV
    movdqa xmm5, kAddUV128
    sub edi, edx  // stride from u to v

    align 16
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_I422TOARGBROW_SSSE3

#define YG 74  /* static_cast<int8>(1.164 * 64 + 0.5) */

#define UB 127  /* 2.018 * 64 = 129; saturated to the int8 maximum of 127 */
#define UG -25  /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52  /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102  /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

static const vec8 kUVToB = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};

static const vec8 kUVToR = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};

static const vec8 kUVToG = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};

static const vec8 kVUToB = {
  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB
};

static const vec8 kVUToR = {
  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR
};

static const vec8 kVUToG = {
  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG
};

static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
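
// Worked scalar form of the fixed-point math above, using the B channel:
// the bias constants fold the -128 centering of U and V into the pmaddubsw
// result, and YUVTORGB below shifts the 6-bit-precision sum back down. A
// minimal sketch (illustrative helper; the SIMD path additionally saturates
// at 16 bits via paddsw/packuswb):
static __inline uint8 YUVToBScalar(uint8 y, uint8 u, uint8 v) {
  // == UB * (u - 128) + VB * (v - 128) + 74 * (y - 16)
  int b = (UB * u + VB * v - BB) + (y - 16) * YG;
  return (uint8)(b <= 0 ? 0 : (b >= 255 * 64 ? 255 : (b >> 6)));
}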
1520
1521 // TODO(fbarchard): NV12/NV21 fetch UV and use directly.
1522 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
1523
1524 // Read 8 UV from 411.
1525 #define READYUV444 __asm { \
1526 __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
1527 __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
1528 __asm lea esi, [esi + 8] \
1529 __asm punpcklbw xmm0, xmm1 /* UV */ \
1530 }
1531
1532 // Read 4 UV from 422, upsample to 8 UV.
1533 #define READYUV422 __asm { \
1534 __asm movd xmm0, [esi] /* U */ \
1535 __asm movd xmm1, [esi + edi] /* V */ \
1536 __asm lea esi, [esi + 4] \
1537 __asm punpcklbw xmm0, xmm1 /* UV */ \
1538 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
1539 }
1540
1541 // Read 2 UV from 411, upsample to 8 UV.
1542 #define READYUV411 __asm { \
1543 __asm movd xmm0, [esi] /* U */ \
1544 __asm movd xmm1, [esi + edi] /* V */ \
1545 __asm lea esi, [esi + 2] \
1546 __asm punpcklbw xmm0, xmm1 /* UV */ \
1547 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
1548 __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
1549 }
1550
1551 // Read 4 UV from NV12, upsample to 8 UV.
1552 #define READNV12 __asm { \
1553 __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
1554 __asm lea esi, [esi + 8] \
1555 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
1556 }
1557
1558 // Convert 8 pixels: 8 UV and 8 Y.
1559 #define YUVTORGB __asm { \
1560 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
1561 __asm movdqa xmm1, xmm0 \
1562 __asm movdqa xmm2, xmm0 \
1563 __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
1564 __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
1565 __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
1566 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
1567 __asm psubw xmm1, kUVBiasG \
1568 __asm psubw xmm2, kUVBiasR \
1569 /* Step 2: Find Y contribution to 8 R,G,B values */ \
1570 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
1571 __asm lea eax, [eax + 8] \
1572 __asm punpcklbw xmm3, xmm4 \
1573 __asm psubsw xmm3, kYSub16 \
1574 __asm pmullw xmm3, kYToRgb \
1575 __asm paddsw xmm0, xmm3 /* B += Y */ \
1576 __asm paddsw xmm1, xmm3 /* G += Y */ \
1577 __asm paddsw xmm2, xmm3 /* R += Y */ \
1578 __asm psraw xmm0, 6 \
1579 __asm psraw xmm1, 6 \
1580 __asm psraw xmm2, 6 \
1581 __asm packuswb xmm0, xmm0 /* B */ \
1582 __asm packuswb xmm1, xmm1 /* G */ \
1583 __asm packuswb xmm2, xmm2 /* R */ \
1584 }
1585
1586 // Convert 8 pixels: 8 VU and 8 Y.
1587 #define YVUTORGB __asm { \
1588 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
1589 __asm movdqa xmm1, xmm0 \
1590 __asm movdqa xmm2, xmm0 \
1591 __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
1592 __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
1593 __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
1594 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
1595 __asm psubw xmm1, kUVBiasG \
1596 __asm psubw xmm2, kUVBiasR \
1597 /* Step 2: Find Y contribution to 8 R,G,B values */ \
1598 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
1599 __asm lea eax, [eax + 8] \
1600 __asm punpcklbw xmm3, xmm4 \
1601 __asm psubsw xmm3, kYSub16 \
1602 __asm pmullw xmm3, kYToRgb \
1603 __asm paddsw xmm0, xmm3 /* B += Y */ \
1604 __asm paddsw xmm1, xmm3 /* G += Y */ \
1605 __asm paddsw xmm2, xmm3 /* R += Y */ \
1606 __asm psraw xmm0, 6 \
1607 __asm psraw xmm1, 6 \
1608 __asm psraw xmm2, 6 \
1609 __asm packuswb xmm0, xmm0 /* B */ \
1610 __asm packuswb xmm1, xmm1 /* G */ \
1611 __asm packuswb xmm2, xmm2 /* R */ \
1612 }
1613
1614 // 8 pixels, dest aligned 16.
1615 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
1616 __declspec(naked) __declspec(align(16))
I444ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * argb_buf,int width)1617 void I444ToARGBRow_SSSE3(const uint8* y_buf,
1618 const uint8* u_buf,
1619 const uint8* v_buf,
1620 uint8* argb_buf,
1621 int width) {
1622 __asm {
1623 push esi
1624 push edi
1625 mov eax, [esp + 8 + 4] // Y
1626 mov esi, [esp + 8 + 8] // U
1627 mov edi, [esp + 8 + 12] // V
1628 mov edx, [esp + 8 + 16] // argb
1629 mov ecx, [esp + 8 + 20] // width
1630 sub edi, esi
1631 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
1632 pxor xmm4, xmm4
1633
1634 align 16
1635 convertloop:
1636 READYUV444
1637 YUVTORGB
1638
1639 // Step 3: Weave into ARGB
1640 punpcklbw xmm0, xmm1 // BG
1641 punpcklbw xmm2, xmm5 // RA
1642 movdqa xmm1, xmm0
1643 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
1644 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
1645 movdqa [edx], xmm0
1646 movdqa [edx + 16], xmm1
1647 lea edx, [edx + 32]
1648 sub ecx, 8
1649 jg convertloop
1650
1651 pop edi
1652 pop esi
1653 ret
1654 }
1655 }
1656
1657 // 8 pixels, dest aligned 16.
1658 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1659 __declspec(naked) __declspec(align(16))
I422ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * argb_buf,int width)1660 void I422ToARGBRow_SSSE3(const uint8* y_buf,
1661 const uint8* u_buf,
1662 const uint8* v_buf,
1663 uint8* argb_buf,
1664 int width) {
1665 __asm {
1666 push esi
1667 push edi
1668 mov eax, [esp + 8 + 4] // Y
1669 mov esi, [esp + 8 + 8] // U
1670 mov edi, [esp + 8 + 12] // V
1671 mov edx, [esp + 8 + 16] // argb
1672 mov ecx, [esp + 8 + 20] // width
1673 sub edi, esi
1674 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
1675 pxor xmm4, xmm4
1676
1677 align 16
1678 convertloop:
1679 READYUV422
1680 YUVTORGB
1681
1682 // Step 3: Weave into ARGB
1683 punpcklbw xmm0, xmm1 // BG
1684 punpcklbw xmm2, xmm5 // RA
1685 movdqa xmm1, xmm0
1686 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
1687 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
1688 movdqa [edx], xmm0
1689 movdqa [edx + 16], xmm1
1690 lea edx, [edx + 32]
1691 sub ecx, 8
1692 jg convertloop
1693
1694 pop edi
1695 pop esi
1696 ret
1697 }
1698 }
1699
1700 // 8 pixels, dest aligned 16.
1701 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1702 // Similar to I420 but duplicate UV once more.
1703 __declspec(naked) __declspec(align(16))
I411ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * argb_buf,int width)1704 void I411ToARGBRow_SSSE3(const uint8* y_buf,
1705 const uint8* u_buf,
1706 const uint8* v_buf,
1707 uint8* argb_buf,
1708 int width) {
1709 __asm {
1710 push esi
1711 push edi
1712 mov eax, [esp + 8 + 4] // Y
1713 mov esi, [esp + 8 + 8] // U
1714 mov edi, [esp + 8 + 12] // V
1715 mov edx, [esp + 8 + 16] // argb
1716 mov ecx, [esp + 8 + 20] // width
1717 sub edi, esi
1718 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
1719 pxor xmm4, xmm4
1720
1721 align 16
1722 convertloop:
1723 READYUV411
1724 YUVTORGB
1725
1726 // Step 3: Weave into ARGB
1727 punpcklbw xmm0, xmm1 // BG
1728 punpcklbw xmm2, xmm5 // RA
1729 movdqa xmm1, xmm0
1730 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
1731 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
1732 movdqa [edx], xmm0
1733 movdqa [edx + 16], xmm1
1734 lea edx, [edx + 32]
1735 sub ecx, 8
1736 jg convertloop
1737
1738 pop edi
1739 pop esi
1740 ret
1741 }
1742 }
1743
1744 // 8 pixels, dest aligned 16.
1745 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1746 __declspec(naked) __declspec(align(16))
NV12ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * uv_buf,uint8 * argb_buf,int width)1747 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
1748 const uint8* uv_buf,
1749 uint8* argb_buf,
1750 int width) {
1751 __asm {
1752 push esi
1753 mov eax, [esp + 4 + 4] // Y
1754 mov esi, [esp + 4 + 8] // UV
1755 mov edx, [esp + 4 + 12] // argb
1756 mov ecx, [esp + 4 + 16] // width
1757 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
1758 pxor xmm4, xmm4
1759
1760 align 16
1761 convertloop:
1762 READNV12
1763 YUVTORGB
1764
1765 // Step 3: Weave into ARGB
1766 punpcklbw xmm0, xmm1 // BG
1767 punpcklbw xmm2, xmm5 // RA
1768 movdqa xmm1, xmm0
1769 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
1770 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
1771 movdqa [edx], xmm0
1772 movdqa [edx + 16], xmm1
1773 lea edx, [edx + 32]
1774 sub ecx, 8
1775 jg convertloop
1776
1777 pop esi
1778 ret
1779 }
1780 }
1781
1782 // 8 pixels, dest aligned 16.
1783 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1784 __declspec(naked) __declspec(align(16))
NV21ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * uv_buf,uint8 * argb_buf,int width)1785 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
1786 const uint8* uv_buf,
1787 uint8* argb_buf,
1788 int width) {
1789 __asm {
1790 push esi
1791 mov eax, [esp + 4 + 4] // Y
1792 mov esi, [esp + 4 + 8] // VU
1793 mov edx, [esp + 4 + 12] // argb
1794 mov ecx, [esp + 4 + 16] // width
1795 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
1796 pxor xmm4, xmm4
1797
1798 align 16
1799 convertloop:
1800 READNV12
1801 YVUTORGB
1802
1803 // Step 3: Weave into ARGB
1804 punpcklbw xmm0, xmm1 // BG
1805 punpcklbw xmm2, xmm5 // RA
1806 movdqa xmm1, xmm0
1807 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
1808 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
1809 movdqa [edx], xmm0
1810 movdqa [edx + 16], xmm1
1811 lea edx, [edx + 32]
1812 sub ecx, 8
1813 jg convertloop
1814
1815 pop esi
1816 ret
1817 }
1818 }
1819
1820 // 8 pixels, unaligned.
1821 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
1822 __declspec(naked) __declspec(align(16))
I444ToARGBRow_Unaligned_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * argb_buf,int width)1823 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1824 const uint8* u_buf,
1825 const uint8* v_buf,
1826 uint8* argb_buf,
1827 int width) {
1828 __asm {
1829 push esi
1830 push edi
1831 mov eax, [esp + 8 + 4] // Y
1832 mov esi, [esp + 8 + 8] // U
1833 mov edi, [esp + 8 + 12] // V
1834 mov edx, [esp + 8 + 16] // argb
1835 mov ecx, [esp + 8 + 20] // width
1836 sub edi, esi
1837 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
1838 pxor xmm4, xmm4
1839
1840 align 16
1841 convertloop:
1842 READYUV444
1843 YUVTORGB
1844
1845 // Step 3: Weave into ARGB
1846 punpcklbw xmm0, xmm1 // BG
1847 punpcklbw xmm2, xmm5 // RA
1848 movdqa xmm1, xmm0
1849 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
1850 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
1851 movdqu [edx], xmm0
1852 movdqu [edx + 16], xmm1
1853 lea edx, [edx + 32]
1854 sub ecx, 8
1855 jg convertloop
1856
1857 pop edi
1858 pop esi
1859 ret
1860 }
1861 }
1862
1863 // 8 pixels, unaligned.
1864 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1865 __declspec(naked) __declspec(align(16))
I422ToARGBRow_Unaligned_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * argb_buf,int width)1866 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1867 const uint8* u_buf,
1868 const uint8* v_buf,
1869 uint8* argb_buf,
1870 int width) {
1871 __asm {
1872 push esi
1873 push edi
1874 mov eax, [esp + 8 + 4] // Y
1875 mov esi, [esp + 8 + 8] // U
1876 mov edi, [esp + 8 + 12] // V
1877 mov edx, [esp + 8 + 16] // argb
1878 mov ecx, [esp + 8 + 20] // width
1879 sub edi, esi
1880 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
1881 pxor xmm4, xmm4
1882
1883 align 16
1884 convertloop:
1885 READYUV422
1886 YUVTORGB
1887
1888 // Step 3: Weave into ARGB
1889 punpcklbw xmm0, xmm1 // BG
1890 punpcklbw xmm2, xmm5 // RA
1891 movdqa xmm1, xmm0
1892 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
1893 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
1894 movdqu [edx], xmm0
1895 movdqu [edx + 16], xmm1
1896 lea edx, [edx + 32]
1897 sub ecx, 8
1898 jg convertloop
1899
1900 pop edi
1901 pop esi
1902 ret
1903 }
1904 }
1905
1906 // 8 pixels, unaligned.
1907 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1908 // Similar to I420 but duplicate UV once more.
1909 __declspec(naked) __declspec(align(16))
I411ToARGBRow_Unaligned_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * argb_buf,int width)1910 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1911 const uint8* u_buf,
1912 const uint8* v_buf,
1913 uint8* argb_buf,
1914 int width) {
1915 __asm {
1916 push esi
1917 push edi
1918 mov eax, [esp + 8 + 4] // Y
1919 mov esi, [esp + 8 + 8] // U
1920 mov edi, [esp + 8 + 12] // V
1921 mov edx, [esp + 8 + 16] // argb
1922 mov ecx, [esp + 8 + 20] // width
1923 sub edi, esi
1924 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
1925 pxor xmm4, xmm4
1926
1927 align 16
1928 convertloop:
1929 READYUV411
1930 YUVTORGB
1931
1932 // Step 3: Weave into ARGB
1933 punpcklbw xmm0, xmm1 // BG
1934 punpcklbw xmm2, xmm5 // RA
1935 movdqa xmm1, xmm0
1936 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
1937 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
1938 movdqu [edx], xmm0
1939 movdqu [edx + 16], xmm1
1940 lea edx, [edx + 32]
1941 sub ecx, 8
1942 jg convertloop
1943
1944 pop edi
1945 pop esi
1946 ret
1947 }
1948 }

// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
                                   uint8* argb_buf,
                                   int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // Y
    mov esi, [esp + 4 + 8]  // UV
    mov edx, [esp + 4 + 12]  // argb
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 16
  convertloop:
    READNV12
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    ret
  }
}

// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
                                   uint8* argb_buf,
                                   int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // Y
    mov esi, [esp + 4 + 8]  // VU
    mov edx, [esp + 4 + 12]  // argb
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 16
  convertloop:
    READNV12
    YVUTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1  // BG
    punpcklbw xmm2, xmm5  // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2  // BGRA first 4 pixels
    punpckhwd xmm1, xmm2  // BGRA next 4 pixels
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToBGRARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* bgra_buf,
                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // bgra
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pxor xmm4, xmm4

    align 16
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into BGRA
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm0  // GB
    punpcklbw xmm5, xmm2  // AR
    movdqa xmm0, xmm5
    punpcklwd xmm5, xmm1  // BGRA first 4 pixels
    punpckhwd xmm0, xmm1  // BGRA next 4 pixels
    movdqa [edx], xmm5
    movdqa [edx + 16], xmm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* bgra_buf,
                                   int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // bgra
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pxor xmm4, xmm4

    align 16
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into BGRA
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm0  // GB
    punpcklbw xmm5, xmm2  // AR
    movdqa xmm0, xmm5
    punpcklwd xmm5, xmm1  // BGRA first 4 pixels
    punpckhwd xmm0, xmm1  // BGRA next 4 pixels
    movdqu [edx], xmm5
    movdqu [edx + 16], xmm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToABGRRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* abgr_buf,
                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // abgr
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 16
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ABGR
    punpcklbw xmm2, xmm1  // RG
    punpcklbw xmm0, xmm5  // BA
    movdqa xmm1, xmm2
    punpcklwd xmm2, xmm0  // RGBA first 4 pixels
    punpckhwd xmm1, xmm0  // RGBA next 4 pixels
    movdqa [edx], xmm2
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* abgr_buf,
                                   int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // abgr
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    pxor xmm4, xmm4

    align 16
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ABGR
    punpcklbw xmm2, xmm1  // RG
    punpcklbw xmm0, xmm5  // BA
    movdqa xmm1, xmm2
    punpcklwd xmm2, xmm0  // RGBA first 4 pixels
    punpckhwd xmm1, xmm0  // RGBA next 4 pixels
    movdqu [edx], xmm2
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToRGBARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* rgba_buf,
                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // rgba
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pxor xmm4, xmm4

    align 16
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RGBA
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm2  // GR
    punpcklbw xmm5, xmm0  // AB
    movdqa xmm0, xmm5
    punpcklwd xmm5, xmm1  // RGBA first 4 pixels
    punpckhwd xmm0, xmm1  // RGBA next 4 pixels
    movdqa [edx], xmm5
    movdqa [edx + 16], xmm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* rgba_buf,
                                   int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // U
    mov edi, [esp + 8 + 12]  // V
    mov edx, [esp + 8 + 16]  // rgba
    mov ecx, [esp + 8 + 20]  // width
    sub edi, esi
    pxor xmm4, xmm4

    align 16
  convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RGBA
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm2  // GR
    punpcklbw xmm5, xmm0  // AB
    movdqa xmm0, xmm5
    punpcklwd xmm5, xmm1  // RGBA first 4 pixels
    punpckhwd xmm0, xmm1  // RGBA next 4 pixels
    movdqu [edx], xmm5
    movdqu [edx + 16], xmm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

#endif  // HAS_I422TOARGBROW_SSSE3

#ifdef HAS_YTOARGBROW_SSE2
__declspec(naked) __declspec(align(16))
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  __asm {
    pcmpeqb xmm4, xmm4  // generate mask 0xff000000
    pslld xmm4, 24
    mov eax, 0x10001000
    movd xmm3, eax
    pshufd xmm3, xmm3, 0
    mov eax, 0x012a012a
    movd xmm2, eax
    pshufd xmm2, xmm2, 0
    mov eax, [esp + 4]  // Y
    mov edx, [esp + 8]  // rgb
    mov ecx, [esp + 12]  // width

    align 16
  convertloop:
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0  // Y.Y
    psubusw xmm0, xmm3
    pmulhuw xmm0, xmm2
    packuswb xmm0, xmm0  // G

    // Step 2: Weave into ARGB
    punpcklbw xmm0, xmm0  // GG
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0  // BGRA first 4 pixels
    punpckhwd xmm1, xmm1  // BGRA next 4 pixels
    por xmm0, xmm4
    por xmm1, xmm4
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    ret
  }
}
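
// A scalar sketch of the math above, for reference only (this helper is
// illustrative and is not used by the assembly; the name is an assumption).
// Duplicating a Y byte into both halves of a word is a multiply by 257, so
// the psubusw/pmulhuw pair computes ((y * 257 - 4096) * 298) >> 16, i.e.
// (y - 16) * 1.164 with the subtraction saturating at zero:
static void YToARGBRow_C_Sketch(const uint8* y_buf, uint8* rgb_buf, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int g = ((y_buf[x] * 257 - 4096) * 298) >> 16;
    if (g < 0) g = 0;  // psubusw saturates before the multiply
    if (g > 255) g = 255;  // packuswb saturation
    rgb_buf[0] = (uint8)g;  // B
    rgb_buf[1] = (uint8)g;  // G
    rgb_buf[2] = (uint8)g;  // R
    rgb_buf[3] = 255u;  // A
    rgb_buf += 4;
  }
}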
#endif  // HAS_YTOARGBROW_SSE2

#ifdef HAS_MIRRORROW_SSSE3

// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

__declspec(naked) __declspec(align(16))
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    movdqa xmm5, kShuffleMirror
    lea eax, [eax - 16]

    align 16
  convertloop:
    movdqa xmm0, [eax + ecx]
    pshufb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}
#endif  // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_SSE2
// The SSE2 version uses movdqu, so it can be used on unaligned buffers when
// the SSSE3 version cannot.
__declspec(naked) __declspec(align(16))
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    lea eax, [eax - 16]

    align 16
  convertloop:
    movdqu xmm0, [eax + ecx]
    movdqa xmm1, xmm0  // swap bytes
    psllw xmm0, 8
    psrlw xmm1, 8
    por xmm0, xmm1
    pshuflw xmm0, xmm0, 0x1b  // swap words
    pshufhw xmm0, xmm0, 0x1b
    pshufd xmm0, xmm0, 0x4e  // swap qwords
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}
#endif  // HAS_MIRRORROW_SSE2

#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};

__declspec(naked) __declspec(align(16))
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    movdqa xmm1, kShuffleMirrorUV
    lea eax, [eax + ecx * 2 - 16]
    sub edi, edx

    align 16
  convertloop:
    movdqa xmm0, [eax]
    lea eax, [eax - 16]
    pshufb xmm0, xmm1
    sub ecx, 8
    movlpd qword ptr [edx], xmm0
    movhpd qword ptr [edx + edi], xmm0
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_MIRRORROW_UV_SSSE3

#ifdef HAS_ARGBMIRRORROW_SSSE3

// Shuffle table for reversing the ARGB pixels.
static const uvec8 kARGBShuffleMirror = {
  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};

__declspec(naked) __declspec(align(16))
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    movdqa xmm5, kARGBShuffleMirror
    lea eax, [eax - 16]

    align 16
  convertloop:
    movdqa xmm0, [eax + ecx * 4]
    pshufb xmm0, xmm5
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}
#endif  // HAS_ARGBMIRRORROW_SSSE3

#ifdef HAS_SPLITUV_SSE2
__declspec(naked) __declspec(align(16))
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_uv
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 16
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    movdqa xmm3, xmm1
    pand xmm0, xmm5  // even bytes
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm2, 8  // odd bytes
    psrlw xmm3, 8
    packuswb xmm2, xmm3
    movdqa [edx], xmm0
    movdqa [edx + edi], xmm2
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_SPLITUV_SSE2

#ifdef HAS_COPYROW_SSE2
// CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
__declspec(naked) __declspec(align(16))
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // count
    sub edx, eax

    align 16
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa [eax + edx], xmm0
    movdqa [eax + edx + 16], xmm1
    lea eax, [eax + 32]
    sub ecx, 32
    jg convertloop
    ret
  }
}
#endif  // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_X86
__declspec(naked) __declspec(align(16))
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
  __asm {
    mov eax, esi
    mov edx, edi
    mov esi, [esp + 4]  // src
    mov edi, [esp + 8]  // dst
    mov ecx, [esp + 12]  // count
    shr ecx, 2
    rep movsd
    mov edi, edx
    mov esi, eax
    ret
  }
}
#endif  // HAS_COPYROW_X86

#ifdef HAS_SETROW_X86
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
void SetRow8_X86(uint8* dst, uint32 v32, int count) {
  __asm {
    mov edx, edi
    mov edi, [esp + 4]  // dst
    mov eax, [esp + 8]  // v32
    mov ecx, [esp + 12]  // count
    shr ecx, 2
    rep stosd
    mov edi, edx
    ret
  }
}

// SetRows32 writes a 32 bit value to 'width' pixels of each of 'height' rows.
__declspec(naked) __declspec(align(16))
void SetRows32_X86(uint8* dst, uint32 v32, int width,
                   int dst_stride, int height) {
  __asm {
    push esi
    push edi
    push ebp
    mov edi, [esp + 12 + 4]  // dst
    mov eax, [esp + 12 + 8]  // v32
    mov ebp, [esp + 12 + 12]  // width
    mov edx, [esp + 12 + 16]  // dst_stride
    mov esi, [esp + 12 + 20]  // height
    lea ecx, [ebp * 4]
    sub edx, ecx  // stride - width * 4

    align 16
  convertloop:
    mov ecx, ebp
    rep stosd
    add edi, edx
    sub esi, 1
    jg convertloop

    pop ebp
    pop edi
    pop esi
    ret
  }
}
#endif  // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_SSE2
__declspec(naked) __declspec(align(16))
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
                     uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_yuy2
    mov edx, [esp + 8]  // dst_y
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 16
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5  // even bytes are Y
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_yuy2
    mov esi, [esp + 8 + 8]  // stride_yuy2
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 16
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3
    psrlw xmm0, 8  // YUYV -> UVUV
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5  // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8  // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
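
// A scalar sketch of the extraction above, for reference only (the helper
// name is an assumption). YUY2 packs Y0 U Y1 V for each pair of pixels; the
// U and V samples are averaged with the row below, matching pavgb's
// round-to-nearest (a + b + 1) >> 1:
static void YUY2ToUVRow_C_Sketch(const uint8* src_yuy2, int stride_yuy2,
                                 uint8* dst_u, uint8* dst_v, int pix) {
  int x;
  for (x = 0; x < pix; x += 2) {
    dst_u[0] = (uint8)((src_yuy2[1] + src_yuy2[stride_yuy2 + 1] + 1) >> 1);
    dst_v[0] = (uint8)((src_yuy2[3] + src_yuy2[stride_yuy2 + 3] + 1) >> 1);
    src_yuy2 += 4;
    dst_u += 1;
    dst_v += 1;
  }
}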

__declspec(naked) __declspec(align(16))
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_yuy2
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 16
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // YUYV -> UVUV
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5  // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8  // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_yuy2
    mov edx, [esp + 8]  // dst_y
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 16
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5  // even bytes are Y
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
                                uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_yuy2
    mov esi, [esp + 8 + 8]  // stride_yuy2
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 16
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3
    psrlw xmm0, 8  // YUYV -> UVUV
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5  // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8  // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_yuy2
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 16
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // YUYV -> UVUV
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5  // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8  // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToYRow_SSE2(const uint8* src_uyvy,
                     uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_uyvy
    mov edx, [esp + 8]  // dst_y
    mov ecx, [esp + 12]  // pix

    align 16
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // odd bytes are Y
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_uyvy
    mov esi, [esp + 8 + 8]  // stride_uyvy
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 16
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3
    pand xmm0, xmm5  // UYVY -> UVUV
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5  // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8  // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_uyvy
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 16
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5  // UYVY -> UVUV
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5  // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8  // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]  // src_uyvy
    mov edx, [esp + 8]  // dst_y
    mov ecx, [esp + 12]  // pix

    align 16
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // odd bytes are Y
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_uyvy
    mov esi, [esp + 8 + 8]  // stride_uyvy
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 16
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3
    pand xmm0, xmm5  // UYVY -> UVUV
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5  // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8  // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_uyvy
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 16
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5  // UYVY -> UVUV
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5  // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8  // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_YUY2TOYROW_SSE2

#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 4 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb0
    mov esi, [esp + 4 + 8]  // src_argb1
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm7, xmm7  // generate constant 1
    psrlw xmm7, 15
    pcmpeqb xmm6, xmm6  // generate mask 0x00ff00ff
    psrlw xmm6, 8
    pcmpeqb xmm5, xmm5  // generate mask 0xff00ff00
    psllw xmm5, 8
    pcmpeqb xmm4, xmm4  // generate mask 0xff000000
    pslld xmm4, 24

    sub ecx, 1
    je convertloop1  // only 1 pixel?
    jl convertloop1b

    // 1 pixel loop until destination pointer is aligned.
  alignloop1:
    test edx, 15  // aligned?
    je alignloop1b
    movd xmm3, [eax]
    lea eax, [eax + 4]
    movdqa xmm0, xmm3  // src argb
    pxor xmm3, xmm4  // ~alpha
    movd xmm2, [esi]  // _r_b
    psrlw xmm3, 8  // alpha
    pshufhw xmm3, xmm3, 0F5h  // 8 alpha words
    pshuflw xmm3, xmm3, 0F5h
    pand xmm2, xmm6  // _r_b
    paddw xmm3, xmm7  // 256 - alpha
    pmullw xmm2, xmm3  // _r_b * alpha
    movd xmm1, [esi]  // _a_g
    lea esi, [esi + 4]
    psrlw xmm1, 8  // _a_g
    por xmm0, xmm4  // set alpha to 255
    pmullw xmm1, xmm3  // _a_g * alpha
    psrlw xmm2, 8  // _r_b convert to 8 bits again
    paddusb xmm0, xmm2  // + src argb
    pand xmm1, xmm5  // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1  // + src argb
    sub ecx, 1
    movd [edx], xmm0
    lea edx, [edx + 4]
    jge alignloop1

  alignloop1b:
    add ecx, 1 - 4
    jl convertloop4b

    // 4 pixel loop.
  convertloop4:
    movdqu xmm3, [eax]  // src argb
    lea eax, [eax + 16]
    movdqa xmm0, xmm3  // src argb
    pxor xmm3, xmm4  // ~alpha
    movdqu xmm2, [esi]  // _r_b
    psrlw xmm3, 8  // alpha
    pshufhw xmm3, xmm3, 0F5h  // 8 alpha words
    pshuflw xmm3, xmm3, 0F5h
    pand xmm2, xmm6  // _r_b
    paddw xmm3, xmm7  // 256 - alpha
    pmullw xmm2, xmm3  // _r_b * alpha
    movdqu xmm1, [esi]  // _a_g
    lea esi, [esi + 16]
    psrlw xmm1, 8  // _a_g
    por xmm0, xmm4  // set alpha to 255
    pmullw xmm1, xmm3  // _a_g * alpha
    psrlw xmm2, 8  // _r_b convert to 8 bits again
    paddusb xmm0, xmm2  // + src argb
    pand xmm1, xmm5  // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1  // + src argb
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jge convertloop4

  convertloop4b:
    add ecx, 4 - 1
    jl convertloop1b

    // 1 pixel loop.
  convertloop1:
    movd xmm3, [eax]  // src argb
    lea eax, [eax + 4]
    movdqa xmm0, xmm3  // src argb
    pxor xmm3, xmm4  // ~alpha
    movd xmm2, [esi]  // _r_b
    psrlw xmm3, 8  // alpha
    pshufhw xmm3, xmm3, 0F5h  // 8 alpha words
    pshuflw xmm3, xmm3, 0F5h
    pand xmm2, xmm6  // _r_b
    paddw xmm3, xmm7  // 256 - alpha
    pmullw xmm2, xmm3  // _r_b * alpha
    movd xmm1, [esi]  // _a_g
    lea esi, [esi + 4]
    psrlw xmm1, 8  // _a_g
    por xmm0, xmm4  // set alpha to 255
    pmullw xmm1, xmm3  // _a_g * alpha
    psrlw xmm2, 8  // _r_b convert to 8 bits again
    paddusb xmm0, xmm2  // + src argb
    pand xmm1, xmm5  // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1  // + src argb
    sub ecx, 1
    movd [edx], xmm0
    lea edx, [edx + 4]
    jge convertloop1

  convertloop1b:
    pop esi
    ret
  }
}
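
// A scalar sketch of one blended pixel, for reference only (the helper name
// is an assumption). The assembly computes dst = src + (dst * (256 - src_a)
// >> 8) per channel with unsigned saturation, and forces the destination
// alpha to 255:
static uint32 ARGBBlendPixel_C_Sketch(uint32 src, uint32 dst) {
  uint32 f = 256 - (src >> 24);  // ~alpha + 1, as in the assembly
  uint32 b = (src & 0xff) + (((dst & 0xff) * f) >> 8);
  uint32 g = ((src >> 8) & 0xff) + ((((dst >> 8) & 0xff) * f) >> 8);
  uint32 r = ((src >> 16) & 0xff) + ((((dst >> 16) & 0xff) * f) >> 8);
  if (b > 255) b = 255;  // paddusb saturation
  if (g > 255) g = 255;
  if (r > 255) r = 255;
  return 0xff000000u | (r << 16) | (g << 8) | b;
}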
#endif  // HAS_ARGBBLENDROW_SSE2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
// Same as SSE2, but replaces:
//    psrlw xmm3, 8  // alpha
//    pshufhw xmm3, xmm3, 0F5h  // 8 alpha words
//    pshuflw xmm3, xmm3, 0F5h
// with:
//    pshufb xmm3, kShuffleAlpha  // alpha
// Blend 4 pixels at a time.

__declspec(naked) __declspec(align(16))
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb0
    mov esi, [esp + 4 + 8]  // src_argb1
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm7, xmm7  // generate constant 1
    psrlw xmm7, 15
    pcmpeqb xmm6, xmm6  // generate mask 0x00ff00ff
    psrlw xmm6, 8
    pcmpeqb xmm5, xmm5  // generate mask 0xff00ff00
    psllw xmm5, 8
    pcmpeqb xmm4, xmm4  // generate mask 0xff000000
    pslld xmm4, 24

    sub ecx, 1
    je convertloop1  // only 1 pixel?
    jl convertloop1b

    // 1 pixel loop until destination pointer is aligned.
  alignloop1:
    test edx, 15  // aligned?
    je alignloop1b
    movd xmm3, [eax]
    lea eax, [eax + 4]
    movdqa xmm0, xmm3  // src argb
    pxor xmm3, xmm4  // ~alpha
    movd xmm2, [esi]  // _r_b
    pshufb xmm3, kShuffleAlpha  // alpha
    pand xmm2, xmm6  // _r_b
    paddw xmm3, xmm7  // 256 - alpha
    pmullw xmm2, xmm3  // _r_b * alpha
    movd xmm1, [esi]  // _a_g
    lea esi, [esi + 4]
    psrlw xmm1, 8  // _a_g
    por xmm0, xmm4  // set alpha to 255
    pmullw xmm1, xmm3  // _a_g * alpha
    psrlw xmm2, 8  // _r_b convert to 8 bits again
    paddusb xmm0, xmm2  // + src argb
    pand xmm1, xmm5  // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1  // + src argb
    sub ecx, 1
    movd [edx], xmm0
    lea edx, [edx + 4]
    jge alignloop1

  alignloop1b:
    add ecx, 1 - 4
    jl convertloop4b

    test eax, 15  // unaligned?
    jne convertuloop4
    test esi, 15  // unaligned?
    jne convertuloop4

    // 4 pixel loop.
  convertloop4:
    movdqa xmm3, [eax]  // src argb
    lea eax, [eax + 16]
    movdqa xmm0, xmm3  // src argb
    pxor xmm3, xmm4  // ~alpha
    movdqa xmm2, [esi]  // _r_b
    pshufb xmm3, kShuffleAlpha  // alpha
    pand xmm2, xmm6  // _r_b
    paddw xmm3, xmm7  // 256 - alpha
    pmullw xmm2, xmm3  // _r_b * alpha
    movdqa xmm1, [esi]  // _a_g
    lea esi, [esi + 16]
    psrlw xmm1, 8  // _a_g
    por xmm0, xmm4  // set alpha to 255
    pmullw xmm1, xmm3  // _a_g * alpha
    psrlw xmm2, 8  // _r_b convert to 8 bits again
    paddusb xmm0, xmm2  // + src argb
    pand xmm1, xmm5  // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1  // + src argb
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jge convertloop4
    jmp convertloop4b

    // 4 pixel unaligned loop.
  convertuloop4:
    movdqu xmm3, [eax]  // src argb
    lea eax, [eax + 16]
    movdqa xmm0, xmm3  // src argb
    pxor xmm3, xmm4  // ~alpha
    movdqu xmm2, [esi]  // _r_b
    pshufb xmm3, kShuffleAlpha  // alpha
    pand xmm2, xmm6  // _r_b
    paddw xmm3, xmm7  // 256 - alpha
    pmullw xmm2, xmm3  // _r_b * alpha
    movdqu xmm1, [esi]  // _a_g
    lea esi, [esi + 16]
    psrlw xmm1, 8  // _a_g
    por xmm0, xmm4  // set alpha to 255
    pmullw xmm1, xmm3  // _a_g * alpha
    psrlw xmm2, 8  // _r_b convert to 8 bits again
    paddusb xmm0, xmm2  // + src argb
    pand xmm1, xmm5  // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1  // + src argb
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jge convertuloop4

  convertloop4b:
    add ecx, 4 - 1
    jl convertloop1b

    // 1 pixel loop.
  convertloop1:
    movd xmm3, [eax]  // src argb
    lea eax, [eax + 4]
    movdqa xmm0, xmm3  // src argb
    pxor xmm3, xmm4  // ~alpha
    movd xmm2, [esi]  // _r_b
    pshufb xmm3, kShuffleAlpha  // alpha
    pand xmm2, xmm6  // _r_b
    paddw xmm3, xmm7  // 256 - alpha
    pmullw xmm2, xmm3  // _r_b * alpha
    movd xmm1, [esi]  // _a_g
    lea esi, [esi + 4]
    psrlw xmm1, 8  // _a_g
    por xmm0, xmm4  // set alpha to 255
    pmullw xmm1, xmm3  // _a_g * alpha
    psrlw xmm2, 8  // _r_b convert to 8 bits again
    paddusb xmm0, xmm2  // + src argb
    pand xmm1, xmm5  // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1  // + src argb
    sub ecx, 1
    movd [edx], xmm0
    lea edx, [edx + 4]
    jge convertloop1

  convertloop1b:
    pop esi
    ret
  }
}
#endif  // HAS_ARGBBLENDROW_SSSE3

#ifdef HAS_ARGBATTENUATE_SSE2
// Attenuate 4 pixels at a time.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    pcmpeqb xmm4, xmm4  // generate mask 0xff000000
    pslld xmm4, 24
    pcmpeqb xmm5, xmm5  // generate mask 0x00ffffff
    psrld xmm5, 8

    align 16
  convertloop:
    movdqa xmm0, [eax]  // read 4 pixels
    punpcklbw xmm0, xmm0  // first 2
    pshufhw xmm2, xmm0, 0FFh  // 8 alpha words
    pshuflw xmm2, xmm2, 0FFh
    pmulhuw xmm0, xmm2  // rgb * a
    movdqa xmm1, [eax]  // read 4 pixels
    punpckhbw xmm1, xmm1  // next 2 pixels
    pshufhw xmm2, xmm1, 0FFh  // 8 alpha words
    pshuflw xmm2, xmm2, 0FFh
    pmulhuw xmm1, xmm2  // rgb * a
    movdqa xmm2, [eax]  // alphas
    psrlw xmm0, 8
    pand xmm2, xmm4
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    pand xmm0, xmm5  // keep original alphas
    por xmm0, xmm2
    sub ecx, 4
    movdqa [eax + edx], xmm0
    lea eax, [eax + 16]
    jg convertloop

    ret
  }
}
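
// A scalar sketch of the attenuation above, for reference only (the helper
// name is an assumption). Unpacking a byte against itself is a multiply by
// 257, so pmulhuw followed by the shifts computes
// ((v * 257) * (a * 257)) >> 24, roughly v * a / 255 per channel:
static void ARGBAttenuateRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32 a = src_argb[3];
    dst_argb[0] = (uint8)(((uint32)(src_argb[0] * 257) * (a * 257)) >> 24);
    dst_argb[1] = (uint8)(((uint32)(src_argb[1] * 257) * (a * 257)) >> 24);
    dst_argb[2] = (uint8)(((uint32)(src_argb[2] * 257) * (a * 257)) >> 24);
    dst_argb[3] = (uint8)a;  // alpha is copied through
    src_argb += 4;
    dst_argb += 4;
  }
}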
#endif  // HAS_ARGBATTENUATE_SSE2

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static const uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    pcmpeqb xmm3, xmm3  // generate mask 0xff000000
    pslld xmm3, 24
    movdqa xmm4, kShuffleAlpha0
    movdqa xmm5, kShuffleAlpha1

    align 16
  convertloop:
    movdqa xmm0, [eax]  // read 4 pixels
    pshufb xmm0, xmm4  // isolate first 2 alphas
    movdqa xmm1, [eax]  // read 4 pixels
    punpcklbw xmm1, xmm1  // first 2 pixel rgbs
    pmulhuw xmm0, xmm1  // rgb * a
    movdqa xmm1, [eax]  // read 4 pixels
    pshufb xmm1, xmm5  // isolate next 2 alphas
    movdqa xmm2, [eax]  // read 4 pixels
    punpckhbw xmm2, xmm2  // next 2 pixel rgbs
    pmulhuw xmm1, xmm2  // rgb * a
    movdqa xmm2, [eax]  // mask original alpha
    pand xmm2, xmm3
    psrlw xmm0, 8
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    por xmm0, xmm2  // copy original alpha
    sub ecx, 4
    movdqa [eax + edx], xmm0
    lea eax, [eax + 16]
    jg convertloop

    ret
  }
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov edx, [esp + 8 + 8]  // dst_argb
    mov ecx, [esp + 8 + 12]  // width
    sub edx, eax
    pcmpeqb xmm4, xmm4  // generate mask 0xff000000
    pslld xmm4, 24

    align 16
  convertloop:
    movdqa xmm0, [eax]  // read 4 pixels
    movzx esi, byte ptr [eax + 3]  // first alpha
    movzx edi, byte ptr [eax + 7]  // second alpha
    punpcklbw xmm0, xmm0  // first 2
    movd xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd xmm3, dword ptr fixed_invtbl8[edi * 4]
    pshuflw xmm2, xmm2, 0C0h  // first 4 inv_alpha words
    pshuflw xmm3, xmm3, 0C0h  // next 4 inv_alpha words
    movlhps xmm2, xmm3
    pmulhuw xmm0, xmm2  // rgb * inv_alpha

    movdqa xmm1, [eax]  // read 4 pixels
    movzx esi, byte ptr [eax + 11]  // third alpha
    movzx edi, byte ptr [eax + 15]  // fourth alpha
    punpckhbw xmm1, xmm1  // next 2
    movd xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd xmm3, dword ptr fixed_invtbl8[edi * 4]
    pshuflw xmm2, xmm2, 0C0h  // first 4 inv_alpha words
    pshuflw xmm3, xmm3, 0C0h  // next 4 inv_alpha words
    movlhps xmm2, xmm3
    pmulhuw xmm1, xmm2  // rgb * inv_alpha

    movdqa xmm2, [eax]  // alphas
    pand xmm2, xmm4
    packuswb xmm0, xmm1
    por xmm0, xmm2
    sub ecx, 4
    movdqa [eax + edx], xmm0
    lea eax, [eax + 16]
    jg convertloop
    pop edi
    pop esi
    ret
  }
}
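
// A scalar sketch of the unattenuation above, for reference only (the helper
// name is an assumption). The real code looks up fixed_invtbl8, defined
// elsewhere in the library; the fixed point reciprocal below is a stand-in
// for that table, not its exact contents:
static void ARGBUnattenuateRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                        int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    uint32 a = src_argb[3];
    uint32 ia = a ? 65535u / a : 0u;  // approximates fixed_invtbl8[a]
    for (c = 0; c < 3; ++c) {  // B, G, R
      uint32 v = ((uint32)src_argb[c] * ia) >> 8;  // roughly v * 256 / a
      dst_argb[c] = (uint8)(v > 255 ? 255 : v);  // packuswb saturation
    }
    dst_argb[3] = (uint8)a;
    src_argb += 4;
    dst_argb += 4;
  }
}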
#endif  // HAS_ARGBUNATTENUATEROW_SSE2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R
static const vec8 kARGBToGray = {
  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
__declspec(naked) __declspec(align(16))
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4]  /* src_argb */
    mov edx, [esp + 8]  /* dst_argb */
    mov ecx, [esp + 12]  /* width */
    movdqa xmm4, kARGBToGray
    sub edx, eax

    align 16
  convertloop:
    movdqa xmm0, [eax]  // G
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    phaddw xmm0, xmm1
    psrlw xmm0, 7
    packuswb xmm0, xmm0  // 8 G bytes
    movdqa xmm2, [eax]  // A
    movdqa xmm3, [eax + 16]
    psrld xmm2, 24
    psrld xmm3, 24
    packuswb xmm2, xmm3
    packuswb xmm2, xmm2  // 8 A bytes
    movdqa xmm3, xmm0  // Weave into GG, GA, then GGGA
    punpcklbw xmm0, xmm0  // 8 GG words
    punpcklbw xmm3, xmm2  // 8 GA words
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm3  // GGGA first 4
    punpckhwd xmm1, xmm3  // GGGA next 4
    sub ecx, 8
    movdqa [eax + edx], xmm0
    movdqa [eax + edx + 16], xmm1
    lea eax, [eax + 32]
    jg convertloop
    ret
  }
}
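
// A scalar sketch of the grayscale conversion above, for reference only (the
// helper name is an assumption). The weights come straight from kARGBToGray
// and sum to 128, so the result never exceeds 255:
static void ARGBGrayRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8 g = (uint8)((src_argb[0] * 14 + src_argb[1] * 76 +
                       src_argb[2] * 38) >> 7);
    dst_argb[0] = g;
    dst_argb[1] = g;
    dst_argb[2] = g;
    dst_argb[3] = src_argb[3];  // alpha is copied through
    src_argb += 4;
    dst_argb += 4;
  }
}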
#endif  // HAS_ARGBGRAYROW_SSSE3

#ifdef HAS_ARGBSEPIAROW_SSSE3
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
static const vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static const vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static const vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
__declspec(naked) __declspec(align(16))
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4]  /* dst_argb */
    mov ecx, [esp + 8]  /* width */
    movdqa xmm2, kARGBToSepiaB
    movdqa xmm3, kARGBToSepiaG
    movdqa xmm4, kARGBToSepiaR

    align 16
  convertloop:
    movdqa xmm0, [eax]  // B
    movdqa xmm6, [eax + 16]
    pmaddubsw xmm0, xmm2
    pmaddubsw xmm6, xmm2
    phaddw xmm0, xmm6
    psrlw xmm0, 7
    packuswb xmm0, xmm0  // 8 B values
    movdqa xmm5, [eax]  // G
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm5, xmm3
    pmaddubsw xmm1, xmm3
    phaddw xmm5, xmm1
    psrlw xmm5, 7
    packuswb xmm5, xmm5  // 8 G values
    punpcklbw xmm0, xmm5  // 8 BG values
    movdqa xmm5, [eax]  // R
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm5, xmm4
    pmaddubsw xmm1, xmm4
    phaddw xmm5, xmm1
    psrlw xmm5, 7
    packuswb xmm5, xmm5  // 8 R values
    movdqa xmm6, [eax]  // A
    movdqa xmm1, [eax + 16]
    psrld xmm6, 24
    psrld xmm1, 24
    packuswb xmm6, xmm1
    packuswb xmm6, xmm6  // 8 A values
    punpcklbw xmm5, xmm6  // 8 RA values
    movdqa xmm1, xmm0  // Weave BG, RA together
    punpcklwd xmm0, xmm5  // BGRA first 4
    punpckhwd xmm1, xmm5  // BGRA next 4
    sub ecx, 8
    movdqa [eax], xmm0
    movdqa [eax + 16], xmm1
    lea eax, [eax + 32]
    jg convertloop
    ret
  }
}
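
// A scalar sketch of the sepia formulas above, for reference only (the helper
// name is an assumption). The G and R weight rows sum to more than 128, so
// those channels need the clamp that packuswb provides in the assembly:
static void ARGBSepiaRow_C_Sketch(uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int b = dst_argb[0];
    int g = dst_argb[1];
    int r = dst_argb[2];
    int nb = (b * 17 + g * 68 + r * 35) >> 7;  // weights sum to 120; <= 255
    int ng = (b * 22 + g * 88 + r * 45) >> 7;
    int nr = (b * 24 + g * 98 + r * 50) >> 7;
    dst_argb[0] = (uint8)nb;
    dst_argb[1] = (uint8)(ng > 255 ? 255 : ng);
    dst_argb[2] = (uint8)(nr > 255 ? 255 : nr);
    dst_argb += 4;  // alpha (byte 3) is left intact
  }
}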
#endif  // HAS_ARGBSEPIAROW_SSSE3

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
__declspec(naked) __declspec(align(16))
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
                              int width) {
  __asm {
    mov eax, [esp + 4]  /* dst_argb */
    mov edx, [esp + 8]  /* matrix_argb */
    mov ecx, [esp + 12]  /* width */
    movd xmm2, [edx]
    movd xmm3, [edx + 4]
    movd xmm4, [edx + 8]
    pshufd xmm2, xmm2, 0
    pshufd xmm3, xmm3, 0
    pshufd xmm4, xmm4, 0

    align 16
  convertloop:
    movdqa xmm0, [eax]  // B
    movdqa xmm6, [eax + 16]
    pmaddubsw xmm0, xmm2
    pmaddubsw xmm6, xmm2
    movdqa xmm5, [eax]  // G
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm5, xmm3
    pmaddubsw xmm1, xmm3
    phaddsw xmm0, xmm6  // B
    phaddsw xmm5, xmm1  // G
    psraw xmm0, 7  // B
    psraw xmm5, 7  // G
    packuswb xmm0, xmm0  // 8 B values
    packuswb xmm5, xmm5  // 8 G values
    punpcklbw xmm0, xmm5  // 8 BG values
    movdqa xmm5, [eax]  // R
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm5, xmm4
    pmaddubsw xmm1, xmm4
    phaddsw xmm5, xmm1
    psraw xmm5, 7
    packuswb xmm5, xmm5  // 8 R values
    movdqa xmm6, [eax]  // A
    movdqa xmm1, [eax + 16]
    psrld xmm6, 24
    psrld xmm1, 24
    packuswb xmm6, xmm1
    packuswb xmm6, xmm6  // 8 A values
    movdqa xmm1, xmm0  // Weave BG, RA together
    punpcklbw xmm5, xmm6  // 8 RA values
    punpcklwd xmm0, xmm5  // BGRA first 4
    punpckhwd xmm1, xmm5  // BGRA next 4
    sub ecx, 8
    movdqa [eax], xmm0
    movdqa [eax + 16], xmm1
    lea eax, [eax + 32]
    jg convertloop
    ret
  }
}
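
// A scalar sketch of the matrix transform above, for reference only (the
// helper name is an assumption, and pmaddubsw's intermediate signed
// saturation is ignored). matrix_argb holds 4 signed bytes of B, G, R, A
// weights for each of the three computed output channels; alpha passes
// through:
static void ARGBColorMatrixRow_C_Sketch(uint8* dst_argb, const int8* m,
                                        int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2], a = dst_argb[3];
    for (c = 0; c < 3; ++c) {  // B, G, R
      int v = (b * m[c * 4 + 0] + g * m[c * 4 + 1] +
               r * m[c * 4 + 2] + a * m[c * 4 + 3]) >> 7;
      if (v < 0) v = 0;  // packuswb saturation
      if (v > 255) v = 255;
      dst_argb[c] = (uint8)v;
    }
    dst_argb += 4;
  }
}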
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
__declspec(naked) __declspec(align(16))
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  __asm {
    push ebx
    push esi
    push edi
    push ebp
    mov eax, [esp + 16 + 4]  /* dst_argb */
    mov edi, [esp + 16 + 8]  /* table_argb */
    mov ecx, [esp + 16 + 12]  /* width */
    xor ebx, ebx
    xor edx, edx

    align 16
  convertloop:
    mov ebp, dword ptr [eax]  // BGRA
    mov esi, ebp
    and ebp, 255
    shr esi, 8
    and esi, 255
    mov bl, [edi + ebp * 4 + 0]  // B
    mov dl, [edi + esi * 4 + 1]  // G
    mov ebp, dword ptr [eax]  // BGRA
    mov esi, ebp
    shr ebp, 16
    shr esi, 24
    and ebp, 255
    mov [eax], bl
    mov [eax + 1], dl
    mov bl, [edi + ebp * 4 + 2]  // R
    mov dl, [edi + esi * 4 + 3]  // A
    mov [eax + 2], bl
    mov [eax + 3], dl
    lea eax, [eax + 4]
    sub ecx, 1
    jg convertloop
    pop ebp
    pop edi
    pop esi
    pop ebx
    ret
  }
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  __asm {
    mov eax, [esp + 4]  /* dst_argb */
    movd xmm2, [esp + 8]  /* scale */
    movd xmm3, [esp + 12]  /* interval_size */
    movd xmm4, [esp + 16]  /* interval_offset */
    mov ecx, [esp + 20]  /* width */
    pshuflw xmm2, xmm2, 040h
    pshufd xmm2, xmm2, 044h
    pshuflw xmm3, xmm3, 040h
    pshufd xmm3, xmm3, 044h
    pshuflw xmm4, xmm4, 040h
    pshufd xmm4, xmm4, 044h
    pxor xmm5, xmm5  // constant 0
    pcmpeqb xmm6, xmm6  // generate mask 0xff000000
    pslld xmm6, 24

    align 16
  convertloop:
    movdqa xmm0, [eax]  // read 4 pixels
    punpcklbw xmm0, xmm5  // first 2 pixels
    pmulhuw xmm0, xmm2  // pixel * scale >> 16
    movdqa xmm1, [eax]  // read 4 pixels
    punpckhbw xmm1, xmm5  // next 2 pixels
    pmulhuw xmm1, xmm2
    pmullw xmm0, xmm3  // * interval_size
    movdqa xmm7, [eax]  // read 4 pixels
    pmullw xmm1, xmm3
    pand xmm7, xmm6  // mask alpha
    paddw xmm0, xmm4  // + interval_offset
    paddw xmm1, xmm4
    packuswb xmm0, xmm1
    por xmm0, xmm7
    sub ecx, 4
    movdqa [eax], xmm0
    lea eax, [eax + 16]
    jg convertloop
    ret
  }
}
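
// A scalar sketch of the quantization above, for reference only (the helper
// name is an assumption). Each color channel is snapped to a bucket of
// interval_size, biased by interval_offset; alpha is left intact:
static void ARGBQuantizeRow_C_Sketch(uint8* dst_argb, int scale,
                                     int interval_size, int interval_offset,
                                     int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 3; ++c) {  // B, G, R
      dst_argb[c] = (uint8)((dst_argb[c] * scale >> 16) * interval_size +
                            interval_offset);
    }
    dst_argb += 4;
  }
}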
#endif  // HAS_ARGBQUANTIZEROW_SSE2

#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
// Convert cumulative sum for an area to an average for 1 pixel.
// topleft is pointer to top left of CumulativeSum buffer for area.
// botleft is pointer to bottom left of CumulativeSum buffer.
// width is offset from left to right of area in CumulativeSum buffer measured
// in number of ints.
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
// aligned.
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
                                 int width, int area, uint8* dst, int count) {
  __asm {
    mov eax, topleft  // eax topleft
    mov esi, botleft  // esi botleft
    mov edx, width
    movd xmm4, area
    mov edi, dst
    mov ecx, count
    cvtdq2ps xmm4, xmm4
    rcpss xmm4, xmm4  // 1.0f / area
    pshufd xmm4, xmm4, 0
    sub ecx, 4
    jl l4b

    // 4 pixel loop
    align 4
  l4:
    // top left
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]

    // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]

    // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]

    // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]

    cvtdq2ps xmm0, xmm0  // Average = Sum * 1 / Area
    cvtdq2ps xmm1, xmm1
    mulps xmm0, xmm4
    mulps xmm1, xmm4
    cvtdq2ps xmm2, xmm2
    cvtdq2ps xmm3, xmm3
    mulps xmm2, xmm4
    mulps xmm3, xmm4
    cvtps2dq xmm0, xmm0
    cvtps2dq xmm1, xmm1
    cvtps2dq xmm2, xmm2
    cvtps2dq xmm3, xmm3
    packssdw xmm0, xmm1
    packssdw xmm2, xmm3
    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge l4

  l4b:
    add ecx, 4 - 1
    jl l1b

    // 1 pixel loop
    align 4
  l1:
    movdqa xmm0, [eax]
    psubd xmm0, [eax + edx * 4]
    lea eax, [eax + 16]
    psubd xmm0, [esi]
    paddd xmm0, [esi + edx * 4]
    lea esi, [esi + 16]
    cvtdq2ps xmm0, xmm0
    mulps xmm0, xmm4
    cvtps2dq xmm0, xmm0
    packssdw xmm0, xmm0
    packuswb xmm0, xmm0
    movd dword ptr [edi], xmm0
    lea edi, [edi + 4]
    sub ecx, 1
    jge l1
  l1b:
  }
}
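
// A scalar sketch of the box average above, for reference only (the helper
// name is an assumption). With an integral image, the sum over a rectangle
// is topleft - topright - botleft + botright; the SIMD path multiplies by a
// reciprocal estimate of the area instead of dividing:
static void CumulativeSumToAverage_C_Sketch(const int32* topleft,
                                            const int32* botleft, int width,
                                            int area, uint8* dst, int count) {
  int i, c;
  for (i = 0; i < count; ++i) {
    for (c = 0; c < 4; ++c) {  // B, G, R, A sums are interleaved
      int32 sum = topleft[c] - topleft[width + c] -
                  botleft[c] + botleft[width + c];
      dst[c] = (uint8)(sum / area);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}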
#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  __asm {
    mov eax, row
    mov edx, cumsum
    mov esi, previous_cumsum
    mov ecx, width
    sub esi, edx
    pxor xmm0, xmm0
    pxor xmm1, xmm1

    sub ecx, 4
    jl l4b
    test edx, 15
    jne l4b

    // 4 pixel loop
    align 4
  l4:
    movdqu xmm2, [eax]  // 4 argb pixels 16 bytes.
    lea eax, [eax + 16]
    movdqa xmm4, xmm2

    punpcklbw xmm2, xmm1
    movdqa xmm3, xmm2
    punpcklwd xmm2, xmm1
    punpckhwd xmm3, xmm1

    punpckhbw xmm4, xmm1
    movdqa xmm5, xmm4
    punpcklwd xmm4, xmm1
    punpckhwd xmm5, xmm1

    paddd xmm0, xmm2
    movdqa xmm2, [edx + esi]  // previous row above.
    paddd xmm2, xmm0

    paddd xmm0, xmm3
    movdqa xmm3, [edx + esi + 16]
    paddd xmm3, xmm0

    paddd xmm0, xmm4
    movdqa xmm4, [edx + esi + 32]
    paddd xmm4, xmm0

    paddd xmm0, xmm5
    movdqa xmm5, [edx + esi + 48]
    paddd xmm5, xmm0

    movdqa [edx], xmm2
    movdqa [edx + 16], xmm3
    movdqa [edx + 32], xmm4
    movdqa [edx + 48], xmm5

    lea edx, [edx + 64]
    sub ecx, 4
    jge l4

  l4b:
    add ecx, 4 - 1
    jl l1b

    // 1 pixel loop
    align 4
  l1:
    movd xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
    lea eax, [eax + 4]
    punpcklbw xmm2, xmm1
    punpcklwd xmm2, xmm1
    paddd xmm0, xmm2
    movdqu xmm2, [edx + esi]
    paddd xmm2, xmm0
    movdqu [edx], xmm2
    lea edx, [edx + 16]
    sub ecx, 1
    jge l1

  l1b:
  }
}
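
// A scalar sketch of the row pass above, for reference only (the helper name
// is an assumption). Each output is the running sum along this row plus the
// cumulative sum of the row above, with one int32 per channel:
static void ComputeCumulativeSumRow_C_Sketch(const uint8* row, int32* cumsum,
                                             const int32* previous_cumsum,
                                             int width) {
  int32 sum[4] = {0, 0, 0, 0};
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}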
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2

#ifdef HAS_ARGBSHADE_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    movd xmm2, [esp + 16]  // value
    sub edx, eax
    punpcklbw xmm2, xmm2
    punpcklqdq xmm2, xmm2

    align 16
  convertloop:
    movdqa xmm0, [eax]  // read 4 pixels
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0  // first 2
    punpckhbw xmm1, xmm1  // next 2
    pmulhuw xmm0, xmm2  // argb * value
    pmulhuw xmm1, xmm2  // argb * value
    psrlw xmm0, 8
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 4
    movdqa [eax + edx], xmm0
    lea eax, [eax + 16]
    jg convertloop

    ret
  }
}
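
// A scalar sketch of the shade above, for reference only (the helper name is
// an assumption). Each channel is scaled by the matching byte of 'value'
// with the same ((v * 257) * (m * 257)) >> 24 rounding as the SIMD path:
static void ARGBShadeRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  int width, uint32 value) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      uint32 v = src_argb[c];
      uint32 m = (value >> (c * 8)) & 0xff;
      dst_argb[c] = (uint8)(((v * 257) * (m * 257)) >> 24);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}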
#endif  // HAS_ARGBSHADE_SSE2

#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
__declspec(naked) __declspec(align(16))
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 12]  // src_argb
    mov esi, [esp + 16]  // stride
    mov edx, [esp + 20]  // dst_argb
    mov ecx, [esp + 24]  // pointer to uv_dudv
    movq xmm2, qword ptr [ecx]  // uv
    movq xmm7, qword ptr [ecx + 8]  // dudv
    mov ecx, [esp + 28]  // width
    shl esi, 16  // 4, stride
    add esi, 4
    movd xmm5, esi
    sub ecx, 4
    jl l4b

    // setup for 4 pixel loop
    pshufd xmm7, xmm7, 0x44  // dup dudv
    pshufd xmm5, xmm5, 0  // dup 4, stride
    movdqa xmm0, xmm2  // x0, y0, x1, y1
    addps xmm0, xmm7
    movlhps xmm2, xmm0
    movdqa xmm4, xmm7
    addps xmm4, xmm4  // dudv *= 2
    movdqa xmm3, xmm2  // x2, y2, x3, y3
    addps xmm3, xmm4
    addps xmm4, xmm4  // dudv *= 4

    // 4 pixel loop
    align 4
  l4:
    cvttps2dq xmm0, xmm2  // x, y float to int first 2
    cvttps2dq xmm1, xmm3  // x, y float to int next 2
    packssdw xmm0, xmm1  // x, y as 8 shorts
    pmaddwd xmm0, xmm5  // offsets = x * 4 + y * stride.
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39  // shift right
    movd edi, xmm0
    pshufd xmm0, xmm0, 0x39  // shift right
    movd xmm1, [eax + esi]  // read pixel 0
    movd xmm6, [eax + edi]  // read pixel 1
    punpckldq xmm1, xmm6  // combine pixel 0 and 1
    addps xmm2, xmm4  // x, y += dx, dy first 2
    movq qword ptr [edx], xmm1
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39  // shift right
    movd edi, xmm0
    movd xmm6, [eax + esi]  // read pixel 2
    movd xmm0, [eax + edi]  // read pixel 3
    punpckldq xmm6, xmm0  // combine pixel 2 and 3
    addps xmm3, xmm4  // x, y += dx, dy next 2
    sub ecx, 4
    movq qword ptr [edx + 8], xmm6
    lea edx, [edx + 16]
    jge l4

  l4b:
    add ecx, 4 - 1
    jl l1b

    // 1 pixel loop
    align 4
  l1:
    cvttps2dq xmm0, xmm2  // x, y float to int
    packssdw xmm0, xmm0  // x, y as shorts
    pmaddwd xmm0, xmm5  // offset = x * 4 + y * stride
    addps xmm2, xmm7  // x, y += dx, dy
    movd esi, xmm0
    movd xmm0, [eax + esi]  // copy a pixel
    sub ecx, 1
    movd [edx], xmm0
    lea edx, [edx + 4]
    jge l1
  l1b:
    pop edi
    pop esi
    ret
  }
}
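
// A scalar sketch of the affine walk above, for reference only (the helper
// name is an assumption). Each destination pixel is fetched from the source
// at (u, v), truncated to integers, and (u, v) then steps by (du, dv):
static void ARGBAffineRow_C_Sketch(const uint8* src_argb, int src_argb_stride,
                                   uint8* dst_argb, const float* uv_dudv,
                                   int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int offset = (int)u * 4 + (int)v * src_argb_stride;
    *(uint32*)dst_argb = *(const uint32*)(src_argb + offset);  // 1 ARGB pixel
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}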
#endif  // HAS_ARGBAFFINEROW_SSE2

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
__declspec(naked) __declspec(align(16))
void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]  // dst_ptr
    mov esi, [esp + 8 + 8]  // src_ptr
    mov edx, [esp + 8 + 12]  // src_stride
    mov ecx, [esp + 8 + 16]  // dst_width
    mov eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub edi, esi
    shr eax, 1
    cmp eax, 0
    je xloop1
    cmp eax, 64
    je xloop2
    movd xmm0, eax  // high fraction 0..127
    neg eax
    add eax, 128
    movd xmm5, eax  // low fraction 128..1
    punpcklbw xmm5, xmm0
    punpcklwd xmm5, xmm5
    pshufd xmm5, xmm5, 0

    align 16
  xloop:
    movdqa xmm0, [esi]
    movdqa xmm2, [esi + edx]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    pmaddubsw xmm0, xmm5
    pmaddubsw xmm1, xmm5
    psrlw xmm0, 7
    psrlw xmm1, 7
    packuswb xmm0, xmm1
    sub ecx, 4
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop

    pop edi
    pop esi
    ret

    align 16
  xloop1:
    movdqa xmm0, [esi]
    sub ecx, 4
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop1

    pop edi
    pop esi
    ret

    align 16
  xloop2:
    movdqa xmm0, [esi]
    pavgb xmm0, [esi + edx]
    sub ecx, 4
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop2

    pop edi
    pop esi
    ret
  }
}
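
// A scalar sketch of the bilinear row filter above, for reference only (the
// helper name is an assumption). The fraction is halved to 0..127 and the
// product shifted by 7, matching the pmaddubsw path; fractions of 0 and 128
// reduce to the copy and pavgb fast paths above:
static void ARGBInterpolateRow_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                        ptrdiff_t src_stride, int dst_width,
                                        int source_y_fraction) {
  int f = source_y_fraction >> 1;  // 0..127
  int i;
  for (i = 0; i < dst_width * 4; ++i) {
    dst_ptr[i] = (uint8)((src_ptr[i] * (128 - f) +
                          src_ptr[i + src_stride] * f) >> 7);
  }
}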

#endif  // _M_IX86

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif