/*
 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "row.h"

extern "C" {

#ifdef HAS_ARGBTOYROW_SSSE3
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var

// Constant multiplication table for converting ARGB to I400.
extern "C" TALIGN16(const int8, kARGBToY[16]) = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

extern "C" TALIGN16(const int8, kARGBToU[16]) = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

extern "C" TALIGN16(const int8, kARGBToV[16]) = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};
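
// For reference, a scalar sketch of the Y conversion the SSSE3 rows below
// implement. The weights appear to be BT.601 coefficients scaled to fit
// int8, compensated by the >> 7 in the asm; RGBToYReference is illustrative
// only and not part of the row API.
static inline uint8 RGBToYReference(uint8 r, uint8 g, uint8 b) {
  // 33/65/13 mirror kARGBToY's R/G/B weights; the +16 matches kAddY16.
  return static_cast<uint8>(((33 * r + 65 * g + 13 * b) >> 7) + 16);
}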

// Constants for BGRA
extern "C" TALIGN16(const int8, kBGRAToY[16]) = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

extern "C" TALIGN16(const int8, kBGRAToU[16]) = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

extern "C" TALIGN16(const int8, kBGRAToV[16]) = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
extern "C" TALIGN16(const int8, kABGRToY[16]) = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

extern "C" TALIGN16(const int8, kABGRToU[16]) = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

extern "C" TALIGN16(const int8, kABGRToV[16]) = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

extern "C" TALIGN16(const uint8, kAddY16[16]) = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting BG24 to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
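
// Note on the shuffle masks above: each group of four output bytes takes
// three packed source bytes plus one placeholder (index 12-15) whose value
// is overwritten with 0xff by the alpha-mask por in the row functions below.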

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm7, _kARGBToY
    movdqa     xmm6, _kAddY16

  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm6
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         convertloop
    ret
  }
}
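
// BGRAToYRow_SSSE3 and ABGRToYRow_SSSE3 below are identical to
// ARGBToYRow_SSSE3 except for the coefficient table, which reorders the
// weights to match each format's byte layout.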

__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm7, _kBGRAToY
    movdqa     xmm6, _kAddY16

  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm6
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         convertloop
    ret
  }
}

__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm7, _kABGRToY
    movdqa     xmm6, _kAddY16

  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm6
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         convertloop
    ret
  }
}

__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, _kARGBToU
    movdqa     xmm6, _kARGBToV
    movdqa     xmm5, _kAddUV128
    sub        edi, edx             // offset from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // From here down this is very similar to the Y code except that
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V.
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    ja         convertloop
    pop        edi
    pop        esi
    ret
  }
}
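
// Scalar sketch of the U/V math above, using kARGBToU/kARGBToV's weights on
// the 2x2-averaged B, G, R values. Illustrative only: it assumes an
// arithmetic right shift and omits the packsswb saturation done in the asm.
static inline uint8 RGBToUReference(uint8 r, uint8 g, uint8 b) {
  return static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static inline uint8 RGBToVReference(uint8 r, uint8 g, uint8 b) {
  return static_cast<uint8>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}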

__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, _kBGRAToU
    movdqa     xmm6, _kBGRAToV
    movdqa     xmm5, _kAddUV128
    sub        edi, edx             // offset from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // From here down this is very similar to the Y code except that
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V.
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    ja         convertloop
    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, _kABGRToU
    movdqa     xmm6, _kABGRToV
    movdqa     xmm5, _kAddUV128
    sub        edi, edx             // offset from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // From here down this is very similar to the Y code except that
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V.
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    ja         convertloop
    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked)
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_bg24
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm7, xmm7       // generate mask 0xff000000
    pslld      xmm7, 24
    movdqa     xmm6, _kShuffleMaskBG24ToARGB

  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm3, [eax + 32]
    lea        eax, [eax + 48]
    movdqa     xmm2, xmm3
    palignr    xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb     xmm2, xmm6
    por        xmm2, xmm7
    palignr    xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb     xmm0, xmm6
    movdqa     [edx + 32], xmm2
    por        xmm0, xmm7
    pshufb     xmm1, xmm6
    movdqa     [edx], xmm0
    por        xmm1, xmm7
    palignr    xmm3, xmm3, 4    // xmm3 = { xmm3[4:15] }
    pshufb     xmm3, xmm6
    movdqa     [edx + 16], xmm1
    por        xmm3, xmm7
    movdqa     [edx + 48], xmm3
    lea        edx, [edx + 64]
    sub        ecx, 16
    ja         convertloop
    ret
  }
}
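
// RAWToARGBRow_SSSE3 below is identical to BG24ToARGBRow_SSSE3 except for
// the shuffle mask, which additionally swaps R and B.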

__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_raw
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm7, xmm7       // generate mask 0xff000000
    pslld      xmm7, 24
    movdqa     xmm6, _kShuffleMaskRAWToARGB

  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm3, [eax + 32]
    lea        eax, [eax + 48]
    movdqa     xmm2, xmm3
    palignr    xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb     xmm2, xmm6
    por        xmm2, xmm7
    palignr    xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb     xmm0, xmm6
    movdqa     [edx + 32], xmm2
    por        xmm0, xmm7
    pshufb     xmm1, xmm6
    movdqa     [edx], xmm0
    por        xmm1, xmm7
    palignr    xmm3, xmm3, 4    // xmm3 = { xmm3[4:15] }
    pshufb     xmm3, xmm6
    movdqa     [edx + 16], xmm1
    por        xmm3, xmm7
    movdqa     [edx + 48], xmm3
    lea        edx, [edx + 64]
    sub        ecx, 16
    ja         convertloop
    ret
  }
}

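// The _kCoefficientsRgbY/Bgra/Abgr tables are defined elsewhere; judging by
// the offsets used below, each is laid out as 256 8-byte entries for Y,
// followed by the U table at byte offset 2048 and the V table at 4096.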
__declspec(naked)
void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width) {
  __asm {
    pushad
    mov        edx, [esp + 32 + 4]   // Y
    mov        edi, [esp + 32 + 8]   // U
    mov        esi, [esp + 32 + 12]  // V
    mov        ebp, [esp + 32 + 16]  // rgb
    mov        ecx, [esp + 32 + 20]  // width

  convertloop:
    movzx      eax, byte ptr [edi]
    lea        edi, [edi + 1]
    movzx      ebx, byte ptr [esi]
    lea        esi, [esi + 1]
    movq       mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
    movzx      eax, byte ptr [edx]
    paddsw     mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
    movzx      ebx, byte ptr [edx + 1]
    movq       mm1, [_kCoefficientsRgbY + 8 * eax]
    lea        edx, [edx + 2]
    movq       mm2, [_kCoefficientsRgbY + 8 * ebx]
    paddsw     mm1, mm0
    paddsw     mm2, mm0
    psraw      mm1, 6
    psraw      mm2, 6
    packuswb   mm1, mm2
    movntq     [ebp], mm1
    lea        ebp, [ebp + 8]
    sub        ecx, 2
    ja         convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYUVToBGRARow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
  __asm {
    pushad
    mov        edx, [esp + 32 + 4]   // Y
    mov        edi, [esp + 32 + 8]   // U
    mov        esi, [esp + 32 + 12]  // V
    mov        ebp, [esp + 32 + 16]  // rgb
    mov        ecx, [esp + 32 + 20]  // width

  convertloop:
    movzx      eax, byte ptr [edi]
    lea        edi, [edi + 1]
    movzx      ebx, byte ptr [esi]
    lea        esi, [esi + 1]
    movq       mm0, [_kCoefficientsBgraY + 2048 + 8 * eax]
    movzx      eax, byte ptr [edx]
    paddsw     mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx]
    movzx      ebx, byte ptr [edx + 1]
    movq       mm1, [_kCoefficientsBgraY + 8 * eax]
    lea        edx, [edx + 2]
    movq       mm2, [_kCoefficientsBgraY + 8 * ebx]
    paddsw     mm1, mm0
    paddsw     mm2, mm0
    psraw      mm1, 6
    psraw      mm2, 6
    packuswb   mm1, mm2
    movntq     [ebp], mm1
    lea        ebp, [ebp + 8]
    sub        ecx, 2
    ja         convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYUVToABGRRow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
  __asm {
    pushad
    mov        edx, [esp + 32 + 4]   // Y
    mov        edi, [esp + 32 + 8]   // U
    mov        esi, [esp + 32 + 12]  // V
    mov        ebp, [esp + 32 + 16]  // rgb
    mov        ecx, [esp + 32 + 20]  // width

  convertloop:
    movzx      eax, byte ptr [edi]
    lea        edi, [edi + 1]
    movzx      ebx, byte ptr [esi]
    lea        esi, [esi + 1]
    movq       mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax]
    movzx      eax, byte ptr [edx]
    paddsw     mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx]
    movzx      ebx, byte ptr [edx + 1]
    movq       mm1, [_kCoefficientsAbgrY + 8 * eax]
    lea        edx, [edx + 2]
    movq       mm2, [_kCoefficientsAbgrY + 8 * ebx]
    paddsw     mm1, mm0
    paddsw     mm2, mm0
    psraw      mm1, 6
    psraw      mm2, 6
    packuswb   mm1, mm2
    movntq     [ebp], mm1
    lea        ebp, [ebp + 8]
    sub        ecx, 2
    ja         convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width) {
  __asm {
    pushad
    mov        edx, [esp + 32 + 4]   // Y
    mov        edi, [esp + 32 + 8]   // U
    mov        esi, [esp + 32 + 12]  // V
    mov        ebp, [esp + 32 + 16]  // rgb
    mov        ecx, [esp + 32 + 20]  // width

  convertloop:
    movzx      eax, byte ptr [edi]
    lea        edi, [edi + 1]
    movzx      ebx, byte ptr [esi]
    lea        esi, [esi + 1]
    movq       mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
    movzx      eax, byte ptr [edx]
    paddsw     mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
    lea        edx, [edx + 1]
    paddsw     mm0, [_kCoefficientsRgbY + 8 * eax]
    psraw      mm0, 6
    packuswb   mm0, mm0
    movd       [ebp], mm0
    lea        ebp, [ebp + 4]
    sub        ecx, 1
    ja         convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYToRGB32Row(const uint8* y_buf,
                            uint8* rgb_buf,
                            int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]   // Y
    mov        edx, [esp + 4 + 8]   // rgb
    mov        ecx, [esp + 4 + 12]  // width

  convertloop:
    movzx      ebx, byte ptr [eax]
    movq       mm0, [_kCoefficientsRgbY + 8 * ebx]
    psraw      mm0, 6
    movzx      ebx, byte ptr [eax + 1]
    movq       mm1, [_kCoefficientsRgbY + 8 * ebx]
    psraw      mm1, 6
    packuswb   mm0, mm1
    lea        eax, [eax + 2]
    movq       [edx], mm0
    lea        edx, [edx + 8]
    sub        ecx, 2
    ja         convertloop

    pop        ebx
    ret
  }
}
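
// Assumptions shared by the routines above, as implied by the code: the
// SSSE3 rows load and store with movdqa, so buffers must be 16-byte aligned
// and widths a multiple of 16; the MMX rows process 2 pixels per iteration
// (1 for the YUV444 variant) and do not execute emms, so callers are
// presumably expected to issue it after the last row.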

#endif

}  // extern "C"