1 /*
2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "row.h"
12
13 extern "C" {
14
15 #ifdef HAS_ARGBTOYROW_SSSE3
16
17 // Constant multiplication table for converting ARGB to I400.
18 extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
19 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
20 };
21
22 extern "C" TALIGN16(const uint8, kAdd16[16]) = {
23 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
24 };
25
26 // Shuffle table for converting BG24 to ARGB.
27 extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
28 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
29 };
30
31 // Shuffle table for converting RAW to ARGB.
32 extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
33 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
34 };
35
ARGBToYRow_SSSE3(const uint8 * src_argb,uint8 * dst_y,int pix)36 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
37 asm volatile(
38 "movdqa (%3),%%xmm7\n"
39 "movdqa (%4),%%xmm6\n"
40 "movdqa %%xmm6,%%xmm5\n"
41 "psllw $0x4,%%xmm5\n" // Generate a mask of 0x10 on each byte.
42 "1:"
43 "movdqa (%0),%%xmm0\n"
44 "pmaddubsw %%xmm7,%%xmm0\n"
45 "movdqa 0x10(%0),%%xmm1\n"
46 "psrlw $0x7,%%xmm0\n"
47 "pmaddubsw %%xmm7,%%xmm1\n"
48 "lea 0x20(%0),%0\n"
49 "psrlw $0x7,%%xmm1\n"
50 "packuswb %%xmm1,%%xmm0\n"
51 "pmaddubsw %%xmm6,%%xmm0\n"
52 "packuswb %%xmm0,%%xmm0\n"
53 "paddb %%xmm5,%%xmm0\n"
54 "movq %%xmm0,(%1)\n"
55 "lea 0x8(%1),%1\n"
56 "sub $0x8,%2\n"
57 "ja 1b\n"
58 : "+r"(src_argb), // %0
59 "+r"(dst_y), // %1
60 "+r"(pix) // %2
61 : "r"(kMultiplyMaskARGBToI400), // %3
62 "r"(kAdd16) // %4
63 : "memory"
64 );
65 }
66 #endif
67
68 #ifdef HAS_BG24TOARGBROW_SSSE3
BG24ToARGBRow_SSSE3(const uint8 * src_bg24,uint8 * dst_argb,int pix)69 void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
70 asm volatile(
71 "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
72 "pslld $0x18,%%xmm7\n"
73 "movdqa (%3),%%xmm6\n"
74 "1:"
75 "movdqa (%0),%%xmm0\n"
76 "movdqa 0x10(%0),%%xmm1\n"
77 "movdqa 0x20(%0),%%xmm3\n"
78 "lea 0x30(%0),%0\n"
79 "movdqa %%xmm3,%%xmm2\n"
80 "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
81 "pshufb %%xmm6,%%xmm2\n"
82 "por %%xmm7,%%xmm2\n"
83 "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
84 "pshufb %%xmm6,%%xmm0\n"
85 "movdqa %%xmm2,0x20(%1)\n"
86 "por %%xmm7,%%xmm0\n"
87 "pshufb %%xmm6,%%xmm1\n"
88 "movdqa %%xmm0,(%1)\n"
89 "por %%xmm7,%%xmm1\n"
90 "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
91 "pshufb %%xmm6,%%xmm3\n"
92 "movdqa %%xmm1,0x10(%1)\n"
93 "por %%xmm7,%%xmm3\n"
94 "movdqa %%xmm3,0x30(%1)\n"
95 "lea 0x40(%1),%1\n"
96 "sub $0x10,%2\n"
97 "ja 1b\n"
98 : "+r"(src_bg24), // %0
99 "+r"(dst_argb), // %1
100 "+r"(pix) // %2
101 : "r"(kShuffleMaskBG24ToARGB) // %3
102 : "memory"
103 );
104 }
105
RAWToARGBRow_SSSE3(const uint8 * src_raw,uint8 * dst_argb,int pix)106 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
107 asm volatile(
108 "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
109 "pslld $0x18,%%xmm7\n"
110 "movdqa (%3),%%xmm6\n"
111 "1:"
112 "movdqa (%0),%%xmm0\n"
113 "movdqa 0x10(%0),%%xmm1\n"
114 "movdqa 0x20(%0),%%xmm3\n"
115 "lea 0x30(%0),%0\n"
116 "movdqa %%xmm3,%%xmm2\n"
117 "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
118 "pshufb %%xmm6,%%xmm2\n"
119 "por %%xmm7,%%xmm2\n"
120 "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
121 "pshufb %%xmm6,%%xmm0\n"
122 "movdqa %%xmm2,0x20(%1)\n"
123 "por %%xmm7,%%xmm0\n"
124 "pshufb %%xmm6,%%xmm1\n"
125 "movdqa %%xmm0,(%1)\n"
126 "por %%xmm7,%%xmm1\n"
127 "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
128 "pshufb %%xmm6,%%xmm3\n"
129 "movdqa %%xmm1,0x10(%1)\n"
130 "por %%xmm7,%%xmm3\n"
131 "movdqa %%xmm3,0x30(%1)\n"
132 "lea 0x40(%1),%1\n"
133 "sub $0x10,%2\n"
134 "ja 1b\n"
135 : "+r"(src_raw), // %0
136 "+r"(dst_argb), // %1
137 "+r"(pix) // %2
138 : "r"(kShuffleMaskRAWToARGB) // %3
139 : "memory"
140 );
141 }
142 #endif
143
144 #if defined(__x86_64__)
145
146 // 64 bit linux gcc version
147
FastConvertYUVToRGB32Row(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)148 void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
149 const uint8* u_buf, // rsi
150 const uint8* v_buf, // rdx
151 uint8* rgb_buf, // rcx
152 int width) { // r8
153 asm volatile(
154 "1:"
155 "movzb (%1),%%r10\n"
156 "lea 1(%1),%1\n"
157 "movzb (%2),%%r11\n"
158 "lea 1(%2),%2\n"
159 "movq 2048(%5,%%r10,8),%%xmm0\n"
160 "movzb (%0),%%r10\n"
161 "movq 4096(%5,%%r11,8),%%xmm1\n"
162 "movzb 0x1(%0),%%r11\n"
163 "paddsw %%xmm1,%%xmm0\n"
164 "movq (%5,%%r10,8),%%xmm2\n"
165 "lea 2(%0),%0\n"
166 "movq (%5,%%r11,8),%%xmm3\n"
167 "paddsw %%xmm0,%%xmm2\n"
168 "paddsw %%xmm0,%%xmm3\n"
169 "shufps $0x44,%%xmm3,%%xmm2\n"
170 "psraw $0x6,%%xmm2\n"
171 "packuswb %%xmm2,%%xmm2\n"
172 "movq %%xmm2,0x0(%3)\n"
173 "lea 8(%3),%3\n"
174 "sub $0x2,%4\n"
175 "ja 1b\n"
176 : "+r"(y_buf), // %0
177 "+r"(u_buf), // %1
178 "+r"(v_buf), // %2
179 "+r"(rgb_buf), // %3
180 "+r"(width) // %4
181 : "r" (_kCoefficientsRgbY) // %5
182 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
183 );
184 }
185
FastConvertYUVToBGRARow(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)186 void FastConvertYUVToBGRARow(const uint8* y_buf, // rdi
187 const uint8* u_buf, // rsi
188 const uint8* v_buf, // rdx
189 uint8* rgb_buf, // rcx
190 int width) { // r8
191 asm volatile(
192 "1:"
193 "movzb (%1),%%r10\n"
194 "lea 1(%1),%1\n"
195 "movzb (%2),%%r11\n"
196 "lea 1(%2),%2\n"
197 "movq 2048(%5,%%r10,8),%%xmm0\n"
198 "movzb (%0),%%r10\n"
199 "movq 4096(%5,%%r11,8),%%xmm1\n"
200 "movzb 0x1(%0),%%r11\n"
201 "paddsw %%xmm1,%%xmm0\n"
202 "movq (%5,%%r10,8),%%xmm2\n"
203 "lea 2(%0),%0\n"
204 "movq (%5,%%r11,8),%%xmm3\n"
205 "paddsw %%xmm0,%%xmm2\n"
206 "paddsw %%xmm0,%%xmm3\n"
207 "shufps $0x44,%%xmm3,%%xmm2\n"
208 "psraw $0x6,%%xmm2\n"
209 "packuswb %%xmm2,%%xmm2\n"
210 "movq %%xmm2,0x0(%3)\n"
211 "lea 8(%3),%3\n"
212 "sub $0x2,%4\n"
213 "ja 1b\n"
214 : "+r"(y_buf), // %0
215 "+r"(u_buf), // %1
216 "+r"(v_buf), // %2
217 "+r"(rgb_buf), // %3
218 "+r"(width) // %4
219 : "r" (_kCoefficientsBgraY) // %5
220 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
221 );
222 }
223
FastConvertYUVToABGRRow(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)224 void FastConvertYUVToABGRRow(const uint8* y_buf, // rdi
225 const uint8* u_buf, // rsi
226 const uint8* v_buf, // rdx
227 uint8* rgb_buf, // rcx
228 int width) { // r8
229 asm volatile(
230 "1:"
231 "movzb (%1),%%r10\n"
232 "lea 1(%1),%1\n"
233 "movzb (%2),%%r11\n"
234 "lea 1(%2),%2\n"
235 "movq 2048(%5,%%r10,8),%%xmm0\n"
236 "movzb (%0),%%r10\n"
237 "movq 4096(%5,%%r11,8),%%xmm1\n"
238 "movzb 0x1(%0),%%r11\n"
239 "paddsw %%xmm1,%%xmm0\n"
240 "movq (%5,%%r10,8),%%xmm2\n"
241 "lea 2(%0),%0\n"
242 "movq (%5,%%r11,8),%%xmm3\n"
243 "paddsw %%xmm0,%%xmm2\n"
244 "paddsw %%xmm0,%%xmm3\n"
245 "shufps $0x44,%%xmm3,%%xmm2\n"
246 "psraw $0x6,%%xmm2\n"
247 "packuswb %%xmm2,%%xmm2\n"
248 "movq %%xmm2,0x0(%3)\n"
249 "lea 8(%3),%3\n"
250 "sub $0x2,%4\n"
251 "ja 1b\n"
252 : "+r"(y_buf), // %0
253 "+r"(u_buf), // %1
254 "+r"(v_buf), // %2
255 "+r"(rgb_buf), // %3
256 "+r"(width) // %4
257 : "r" (_kCoefficientsAbgrY) // %5
258 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
259 );
260 }
261
FastConvertYUV444ToRGB32Row(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)262 void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi
263 const uint8* u_buf, // rsi
264 const uint8* v_buf, // rdx
265 uint8* rgb_buf, // rcx
266 int width) { // r8
267 asm volatile(
268 "1:"
269 "movzb (%1),%%r10\n"
270 "lea 1(%1),%1\n"
271 "movzb (%2),%%r11\n"
272 "lea 1(%2),%2\n"
273 "movq 2048(%5,%%r10,8),%%xmm0\n"
274 "movzb (%0),%%r10\n"
275 "movq 4096(%5,%%r11,8),%%xmm1\n"
276 "paddsw %%xmm1,%%xmm0\n"
277 "movq (%5,%%r10,8),%%xmm2\n"
278 "lea 1(%0),%0\n"
279 "paddsw %%xmm0,%%xmm2\n"
280 "shufps $0x44,%%xmm2,%%xmm2\n"
281 "psraw $0x6,%%xmm2\n"
282 "packuswb %%xmm2,%%xmm2\n"
283 "movd %%xmm2,0x0(%3)\n"
284 "lea 4(%3),%3\n"
285 "sub $0x1,%4\n"
286 "ja 1b\n"
287 : "+r"(y_buf), // %0
288 "+r"(u_buf), // %1
289 "+r"(v_buf), // %2
290 "+r"(rgb_buf), // %3
291 "+r"(width) // %4
292 : "r" (_kCoefficientsRgbY) // %5
293 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2"
294 );
295 }
296
FastConvertYToRGB32Row(const uint8 * y_buf,uint8 * rgb_buf,int width)297 void FastConvertYToRGB32Row(const uint8* y_buf, // rdi
298 uint8* rgb_buf, // rcx
299 int width) { // r8
300 asm volatile(
301 "1:"
302 "movzb (%0),%%r10\n"
303 "movzb 0x1(%0),%%r11\n"
304 "movq (%3,%%r10,8),%%xmm2\n"
305 "lea 2(%0),%0\n"
306 "movq (%3,%%r11,8),%%xmm3\n"
307 "shufps $0x44,%%xmm3,%%xmm2\n"
308 "psraw $0x6,%%xmm2\n"
309 "packuswb %%xmm2,%%xmm2\n"
310 "movq %%xmm2,0x0(%1)\n"
311 "lea 8(%1),%1\n"
312 "sub $0x2,%2\n"
313 "ja 1b\n"
314 : "+r"(y_buf), // %0
315 "+r"(rgb_buf), // %1
316 "+r"(width) // %2
317 : "r" (_kCoefficientsRgbY) // %3
318 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
319 );
320 }
321
322 #elif defined(__i386__)
323 // 32 bit gcc version
324
325 void FastConvertYUVToRGB32Row(const uint8* y_buf,
326 const uint8* u_buf,
327 const uint8* v_buf,
328 uint8* rgb_buf,
329 int width);
330 asm(
331 ".text\n"
332 #if defined(OSX) || defined(IOS)
333 ".globl _FastConvertYUVToRGB32Row\n"
334 "_FastConvertYUVToRGB32Row:\n"
335 #else
336 ".global FastConvertYUVToRGB32Row\n"
337 "FastConvertYUVToRGB32Row:\n"
338 #endif
339 "pusha\n"
340 "mov 0x24(%esp),%edx\n"
341 "mov 0x28(%esp),%edi\n"
342 "mov 0x2c(%esp),%esi\n"
343 "mov 0x30(%esp),%ebp\n"
344 "mov 0x34(%esp),%ecx\n"
345
346 "1:"
347 "movzbl (%edi),%eax\n"
348 "lea 1(%edi),%edi\n"
349 "movzbl (%esi),%ebx\n"
350 "lea 1(%esi),%esi\n"
351 "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
352 "movzbl (%edx),%eax\n"
353 "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
354 "movzbl 0x1(%edx),%ebx\n"
355 "movq _kCoefficientsRgbY(,%eax,8),%mm1\n"
356 "lea 2(%edx),%edx\n"
357 "movq _kCoefficientsRgbY(,%ebx,8),%mm2\n"
358 "paddsw %mm0,%mm1\n"
359 "paddsw %mm0,%mm2\n"
360 "psraw $0x6,%mm1\n"
361 "psraw $0x6,%mm2\n"
362 "packuswb %mm2,%mm1\n"
363 "movntq %mm1,0x0(%ebp)\n"
364 "lea 8(%ebp),%ebp\n"
365 "sub $0x2,%ecx\n"
366 "ja 1b\n"
367 "popa\n"
368 "ret\n"
369 );
370
371 void FastConvertYUVToBGRARow(const uint8* y_buf,
372 const uint8* u_buf,
373 const uint8* v_buf,
374 uint8* rgb_buf,
375 int width);
376 asm(
377 ".text\n"
378 #if defined(OSX) || defined(IOS)
379 ".globl _FastConvertYUVToBGRARow\n"
380 "_FastConvertYUVToBGRARow:\n"
381 #else
382 ".global FastConvertYUVToBGRARow\n"
383 "FastConvertYUVToBGRARow:\n"
384 #endif
385 "pusha\n"
386 "mov 0x24(%esp),%edx\n"
387 "mov 0x28(%esp),%edi\n"
388 "mov 0x2c(%esp),%esi\n"
389 "mov 0x30(%esp),%ebp\n"
390 "mov 0x34(%esp),%ecx\n"
391
392 "1:"
393 "movzbl (%edi),%eax\n"
394 "lea 1(%edi),%edi\n"
395 "movzbl (%esi),%ebx\n"
396 "lea 1(%esi),%esi\n"
397 "movq _kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
398 "movzbl (%edx),%eax\n"
399 "paddsw _kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
400 "movzbl 0x1(%edx),%ebx\n"
401 "movq _kCoefficientsBgraY(,%eax,8),%mm1\n"
402 "lea 2(%edx),%edx\n"
403 "movq _kCoefficientsBgraY(,%ebx,8),%mm2\n"
404 "paddsw %mm0,%mm1\n"
405 "paddsw %mm0,%mm2\n"
406 "psraw $0x6,%mm1\n"
407 "psraw $0x6,%mm2\n"
408 "packuswb %mm2,%mm1\n"
409 "movntq %mm1,0x0(%ebp)\n"
410 "lea 8(%ebp),%ebp\n"
411 "sub $0x2,%ecx\n"
412 "ja 1b\n"
413 "popa\n"
414 "ret\n"
415 );
416
417 void FastConvertYUVToABGRRow(const uint8* y_buf,
418 const uint8* u_buf,
419 const uint8* v_buf,
420 uint8* rgb_buf,
421 int width);
422 asm(
423 ".text\n"
424 #if defined(OSX) || defined(IOS)
425 ".globl _FastConvertYUVToABGRRow\n"
426 "_FastConvertYUVToABGRRow:\n"
427 #else
428 ".global FastConvertYUVToABGRRow\n"
429 "FastConvertYUVToABGRRow:\n"
430 #endif
431 "pusha\n"
432 "mov 0x24(%esp),%edx\n"
433 "mov 0x28(%esp),%edi\n"
434 "mov 0x2c(%esp),%esi\n"
435 "mov 0x30(%esp),%ebp\n"
436 "mov 0x34(%esp),%ecx\n"
437
438 "1:"
439 "movzbl (%edi),%eax\n"
440 "lea 1(%edi),%edi\n"
441 "movzbl (%esi),%ebx\n"
442 "lea 1(%esi),%esi\n"
443 "movq _kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
444 "movzbl (%edx),%eax\n"
445 "paddsw _kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
446 "movzbl 0x1(%edx),%ebx\n"
447 "movq _kCoefficientsAbgrY(,%eax,8),%mm1\n"
448 "lea 2(%edx),%edx\n"
449 "movq _kCoefficientsAbgrY(,%ebx,8),%mm2\n"
450 "paddsw %mm0,%mm1\n"
451 "paddsw %mm0,%mm2\n"
452 "psraw $0x6,%mm1\n"
453 "psraw $0x6,%mm2\n"
454 "packuswb %mm2,%mm1\n"
455 "movntq %mm1,0x0(%ebp)\n"
456 "lea 8(%ebp),%ebp\n"
457 "sub $0x2,%ecx\n"
458 "ja 1b\n"
459 "popa\n"
460 "ret\n"
461 );
462
463 void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
464 const uint8* u_buf,
465 const uint8* v_buf,
466 uint8* rgb_buf,
467 int width);
468 asm(
469 ".text\n"
470 #if defined(OSX) || defined(IOS)
471 ".globl _FastConvertYUV444ToRGB32Row\n"
472 "_FastConvertYUV444ToRGB32Row:\n"
473 #else
474 ".global FastConvertYUV444ToRGB32Row\n"
475 "FastConvertYUV444ToRGB32Row:\n"
476 #endif
477 "pusha\n"
478 "mov 0x24(%esp),%edx\n"
479 "mov 0x28(%esp),%edi\n"
480 "mov 0x2c(%esp),%esi\n"
481 "mov 0x30(%esp),%ebp\n"
482 "mov 0x34(%esp),%ecx\n"
483
484 "1:"
485 "movzbl (%edi),%eax\n"
486 "lea 1(%edi),%edi\n"
487 "movzbl (%esi),%ebx\n"
488 "lea 1(%esi),%esi\n"
489 "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
490 "movzbl (%edx),%eax\n"
491 "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
492 "lea 1(%edx),%edx\n"
493 "paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n"
494 "psraw $0x6,%mm0\n"
495 "packuswb %mm0,%mm0\n"
496 "movd %mm0,0x0(%ebp)\n"
497 "lea 4(%ebp),%ebp\n"
498 "sub $0x1,%ecx\n"
499 "ja 1b\n"
500 "popa\n"
501 "ret\n"
502 );
503
504 void FastConvertYToRGB32Row(const uint8* y_buf,
505 uint8* rgb_buf,
506 int width);
507 asm(
508 ".text\n"
509 #if defined(OSX) || defined(IOS)
510 ".globl _FastConvertYToRGB32Row\n"
511 "_FastConvertYToRGB32Row:\n"
512 #else
513 ".global FastConvertYToRGB32Row\n"
514 "FastConvertYToRGB32Row:\n"
515 #endif
516 "push %ebx\n"
517 "mov 0x8(%esp),%eax\n"
518 "mov 0xc(%esp),%edx\n"
519 "mov 0x10(%esp),%ecx\n"
520
521 "1:"
522 "movzbl (%eax),%ebx\n"
523 "movq _kCoefficientsRgbY(,%ebx,8),%mm0\n"
524 "psraw $0x6,%mm0\n"
525 "movzbl 0x1(%eax),%ebx\n"
526 "movq _kCoefficientsRgbY(,%ebx,8),%mm1\n"
527 "psraw $0x6,%mm1\n"
528 "packuswb %mm1,%mm0\n"
529 "lea 0x2(%eax),%eax\n"
530 "movq %mm0,(%edx)\n"
531 "lea 0x8(%edx),%edx\n"
532 "sub $0x2,%ecx\n"
533 "ja 1b\n"
534 "pop %ebx\n"
535 "ret\n"
536 );
537
538 #else
// C reference code that mimics the YUV assembly.
// Converted from macros to static inline functions: the paddsw macro
// evaluated (x) + (y) up to three times, a multiple-evaluation hazard.
// Call sites are unchanged.

// Saturate an int to the unsigned 8-bit range, like the packuswb insn.
static inline int packuswb(int x) {
  return x < 0 ? 0 : (x > 255 ? 255 : x);
}

// Saturating signed 16-bit addition, like the paddsw instruction.
static inline int paddsw(int x, int y) {
  int sum = x + y;
  return sum < -32768 ? -32768 : (sum > 32767 ? 32767 : sum);
}
543
YuvPixel(uint8 y,uint8 u,uint8 v,uint8 * rgb_buf,int ashift,int rshift,int gshift,int bshift)544 static inline void YuvPixel(uint8 y,
545 uint8 u,
546 uint8 v,
547 uint8* rgb_buf,
548 int ashift,
549 int rshift,
550 int gshift,
551 int bshift) {
552
553 int b = _kCoefficientsRgbY[256+u][0];
554 int g = _kCoefficientsRgbY[256+u][1];
555 int r = _kCoefficientsRgbY[256+u][2];
556 int a = _kCoefficientsRgbY[256+u][3];
557
558 b = paddsw(b, _kCoefficientsRgbY[512+v][0]);
559 g = paddsw(g, _kCoefficientsRgbY[512+v][1]);
560 r = paddsw(r, _kCoefficientsRgbY[512+v][2]);
561 a = paddsw(a, _kCoefficientsRgbY[512+v][3]);
562
563 b = paddsw(b, _kCoefficientsRgbY[y][0]);
564 g = paddsw(g, _kCoefficientsRgbY[y][1]);
565 r = paddsw(r, _kCoefficientsRgbY[y][2]);
566 a = paddsw(a, _kCoefficientsRgbY[y][3]);
567
568 b >>= 6;
569 g >>= 6;
570 r >>= 6;
571 a >>= 6;
572
573 *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
574 (packuswb(g) << gshift) |
575 (packuswb(r) << rshift) |
576 (packuswb(a) << ashift);
577 }
578
FastConvertYUVToRGB32Row(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)579 void FastConvertYUVToRGB32Row(const uint8* y_buf,
580 const uint8* u_buf,
581 const uint8* v_buf,
582 uint8* rgb_buf,
583 int width) {
584 for (int x = 0; x < width; x += 2) {
585 uint8 u = u_buf[x >> 1];
586 uint8 v = v_buf[x >> 1];
587 uint8 y0 = y_buf[x];
588 YuvPixel(y0, u, v, rgb_buf, 24, 16, 8, 0);
589 if ((x + 1) < width) {
590 uint8 y1 = y_buf[x + 1];
591 YuvPixel(y1, u, v, rgb_buf + 4, 24, 16, 8, 0);
592 }
593 rgb_buf += 8; // Advance 2 pixels.
594 }
595 }
596
FastConvertYUVToBGRARow(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)597 void FastConvertYUVToBGRARow(const uint8* y_buf,
598 const uint8* u_buf,
599 const uint8* v_buf,
600 uint8* rgb_buf,
601 int width) {
602 for (int x = 0; x < width; x += 2) {
603 uint8 u = u_buf[x >> 1];
604 uint8 v = v_buf[x >> 1];
605 uint8 y0 = y_buf[x];
606 YuvPixel(y0, u, v, rgb_buf, 0, 8, 16, 24);
607 if ((x + 1) < width) {
608 uint8 y1 = y_buf[x + 1];
609 YuvPixel(y1, u, v, rgb_buf + 4, 0, 8, 16, 24);
610 }
611 rgb_buf += 8; // Advance 2 pixels.
612 }
613 }
614
FastConvertYUVToABGRRow(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)615 void FastConvertYUVToABGRRow(const uint8* y_buf,
616 const uint8* u_buf,
617 const uint8* v_buf,
618 uint8* rgb_buf,
619 int width) {
620 for (int x = 0; x < width; x += 2) {
621 uint8 u = u_buf[x >> 1];
622 uint8 v = v_buf[x >> 1];
623 uint8 y0 = y_buf[x];
624 YuvPixel(y0, u, v, rgb_buf, 24, 0, 8, 16);
625 if ((x + 1) < width) {
626 uint8 y1 = y_buf[x + 1];
627 YuvPixel(y1, u, v, rgb_buf + 4, 24, 0, 8, 16);
628 }
629 rgb_buf += 8; // Advance 2 pixels.
630 }
631 }
632
FastConvertYUV444ToRGB32Row(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)633 void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
634 const uint8* u_buf,
635 const uint8* v_buf,
636 uint8* rgb_buf,
637 int width) {
638 for (int x = 0; x < width; ++x) {
639 uint8 u = u_buf[x];
640 uint8 v = v_buf[x];
641 uint8 y = y_buf[x];
642 YuvPixel(y, u, v, rgb_buf, 24, 16, 8, 0);
643 rgb_buf += 4; // Advance 1 pixel.
644 }
645 }
646
FastConvertYToRGB32Row(const uint8 * y_buf,uint8 * rgb_buf,int width)647 void FastConvertYToRGB32Row(const uint8* y_buf,
648 uint8* rgb_buf,
649 int width) {
650 for (int x = 0; x < width; ++x) {
651 uint8 y = y_buf[x];
652 YuvPixel(y, 128, 128, rgb_buf, 24, 16, 8, 0);
653 rgb_buf += 4; // Advance 1 pixel.
654 }
655 }
656
657 #endif
658
659 } // extern "C"
660