• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "row.h"
12 
13 extern "C" {
14 
15 #ifdef HAS_ARGBTOYROW_SSSE3
16 
// Constant multiplication table for converting ARGB to I400.
// Per-byte luma weights in memory (B,G,R,A) order: 13*B + 64*G + 33*R,
// alpha weighted 0. Consumed by pmaddubsw in ARGBToYRow_SSSE3 and scaled
// there by >> 7, so the weights are in units of 1/128.
extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
  13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
};
21 
// All-ones byte vector. ARGBToYRow_SSSE3 uses it two ways: directly as a
// pmaddubsw operand (summing adjacent 16-bit partials with weight 1), and
// shifted left by 4 to build the 0x10-per-byte constant added to every
// output Y byte (hence the name "Add16").
extern "C" TALIGN16(const uint8, kAdd16[16]) = {
  1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
};
25 
// Shuffle table for converting BG24 to ARGB.
// Expands packed 3-byte pixels into 4-byte slots; indices 12-15 pick
// filler bytes whose lane is then overwritten with 0xff alpha by the
// por in BG24ToARGBRow_SSSE3.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
30 
// Shuffle table for converting RAW to ARGB.
// Same expansion as the BG24 table but with the R and B bytes of each
// 3-byte pixel swapped (2,1,0 / 5,4,3 / ...); the alpha lane is filled
// with 0xff by the por in RAWToARGBRow_SSSE3.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
35 
// Converts ARGB pixels to luma (Y) bytes with SSSE3, 8 pixels per loop
// iteration (reads 32 bytes, writes 8). src_argb must be 16-byte aligned
// (movdqa loads) and pix a positive multiple of 8.
// NOTE(review): xmm0/xmm1/xmm5-xmm7 are modified but absent from the
// clobber list — confirm this is safe for the targeted compilers.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile(
  "movdqa     (%3),%%xmm7\n"     // xmm7 = B,G,R,A luma weights (/128).
  "movdqa     (%4),%%xmm6\n"     // xmm6 = 16 x 0x01 (pair-sum weights).
  "movdqa     %%xmm6,%%xmm5\n"
  "psllw      $0x4,%%xmm5\n"  // Generate a mask of 0x10 on each byte.
"1:"
  "movdqa     (%0),%%xmm0\n"     // Load 4 ARGB pixels.
  "pmaddubsw  %%xmm7,%%xmm0\n"   // Weighted adjacent-byte pair sums.
  "movdqa     0x10(%0),%%xmm1\n" // Load the next 4 ARGB pixels.
  "psrlw      $0x7,%%xmm0\n"     // Divide partials by 128.
  "pmaddubsw  %%xmm7,%%xmm1\n"
  "lea        0x20(%0),%0\n"     // Advance src by 32 bytes.
  "psrlw      $0x7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"   // 16 byte-sized partials.
  "pmaddubsw  %%xmm6,%%xmm0\n"   // Sum partial pairs to finish 8 Y values.
  "packuswb   %%xmm0,%%xmm0\n"
  "paddb      %%xmm5,%%xmm0\n"   // Add 0x10 to each Y byte (presumably the
                                 // video-range luma offset — confirm).
  "movq       %%xmm0,(%1)\n"     // Store 8 Y bytes.
  "lea        0x8(%1),%1\n"
  "sub        $0x8,%2\n"
  "ja         1b\n"              // Loop while pix remains (unsigned >).
  : "+r"(src_argb),   // %0
    "+r"(dst_y),      // %1
    "+r"(pix)         // %2
  : "r"(kMultiplyMaskARGBToI400),    // %3
    "r"(kAdd16)   // %4
  : "memory"
);
}
66 #endif
67 
68 #ifdef  HAS_BG24TOARGBROW_SSSE3
// Converts 16 BG24 (packed 24-bit) pixels to 16 ARGB pixels per loop
// iteration using SSSE3: reads 48 aligned bytes, writes 64 aligned bytes,
// and forces alpha to 0xff. pix must be a positive multiple of 16 and both
// buffers 16-byte aligned (movdqa).
// NOTE(review): xmm0-xmm3/xmm6/xmm7 are modified but not in the clobber
// list — confirm this is safe for the targeted compilers.
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"  // generate mask 0xff000000
  "pslld      $0x18,%%xmm7\n"
  "movdqa     (%3),%%xmm6\n"    // xmm6 = 3-to-4 byte expansion shuffle.
"1:"
  "movdqa     (%0),%%xmm0\n"    // Load 48 bytes = 16 BG24 pixels.
  "movdqa     0x10(%0),%%xmm1\n"
  "movdqa     0x20(%0),%%xmm3\n"
  "lea        0x30(%0),%0\n"
  "movdqa     %%xmm3,%%xmm2\n"
  "palignr    $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
  "pshufb     %%xmm6,%%xmm2\n"  // Expand 4 pixels into ARGB slots.
  "por        %%xmm7,%%xmm2\n"  // Set alpha = 0xff.
  "palignr    $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
  "pshufb     %%xmm6,%%xmm0\n"
  "movdqa     %%xmm2,0x20(%1)\n"
  "por        %%xmm7,%%xmm0\n"
  "pshufb     %%xmm6,%%xmm1\n"
  "movdqa     %%xmm0,(%1)\n"
  "por        %%xmm7,%%xmm1\n"
  "palignr    $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
  "pshufb     %%xmm6,%%xmm3\n"
  "movdqa     %%xmm1,0x10(%1)\n"
  "por        %%xmm7,%%xmm3\n"
  "movdqa     %%xmm3,0x30(%1)\n"
  "lea        0x40(%1),%1\n"    // Advance dst by 64 bytes (16 pixels).
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src_bg24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(kShuffleMaskBG24ToARGB)  // %3
  : "memory"
);
}
105 
// Converts 16 RAW (packed 24-bit, R/B swapped relative to BG24) pixels to
// 16 ARGB pixels per loop iteration. Identical structure to
// BG24ToARGBRow_SSSE3; only the shuffle table (%3) differs. Reads 48
// aligned bytes, writes 64 aligned bytes, alpha forced to 0xff; pix must
// be a positive multiple of 16.
// NOTE(review): xmm0-xmm3/xmm6/xmm7 are modified but not in the clobber
// list — confirm this is safe for the targeted compilers.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"  // generate mask 0xff000000
  "pslld      $0x18,%%xmm7\n"
  "movdqa     (%3),%%xmm6\n"    // xmm6 = swapping 3-to-4 byte shuffle.
"1:"
  "movdqa     (%0),%%xmm0\n"    // Load 48 bytes = 16 RAW pixels.
  "movdqa     0x10(%0),%%xmm1\n"
  "movdqa     0x20(%0),%%xmm3\n"
  "lea        0x30(%0),%0\n"
  "movdqa     %%xmm3,%%xmm2\n"
  "palignr    $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
  "pshufb     %%xmm6,%%xmm2\n"
  "por        %%xmm7,%%xmm2\n"  // Set alpha = 0xff.
  "palignr    $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
  "pshufb     %%xmm6,%%xmm0\n"
  "movdqa     %%xmm2,0x20(%1)\n"
  "por        %%xmm7,%%xmm0\n"
  "pshufb     %%xmm6,%%xmm1\n"
  "movdqa     %%xmm0,(%1)\n"
  "por        %%xmm7,%%xmm1\n"
  "palignr    $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
  "pshufb     %%xmm6,%%xmm3\n"
  "movdqa     %%xmm1,0x10(%1)\n"
  "por        %%xmm7,%%xmm3\n"
  "movdqa     %%xmm3,0x30(%1)\n"
  "lea        0x40(%1),%1\n"    // Advance dst by 64 bytes (16 pixels).
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(kShuffleMaskRAWToARGB)  // %3
  : "memory"
);
}
142 #endif
143 
144 #if defined(__x86_64__)
145 
146 // 64 bit linux gcc version
147 
// 64-bit: converts 2x1-subsampled YUV to 32-bit RGB pixels, two per loop
// iteration. width must be even and > 0. %5 is _kCoefficientsRgbY, a table
// of 8-byte rows (4 x int16 channel terms): rows 0-255 hold Y terms, byte
// offset 2048 (rows 256-511) U terms, offset 4096 (rows 512-767) V terms —
// the same layout the C reference path indexes as [y], [256+u], [512+v].
void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
                              const uint8* u_buf,  // rsi
                              const uint8* v_buf,  // rdx
                              uint8* rgb_buf,      // rcx
                              int width) {         // r8
  asm volatile(
"1:"
  "movzb  (%1),%%r10\n"              // r10 = U sample.
  "lea    1(%1),%1\n"
  "movzb  (%2),%%r11\n"              // r11 = V sample.
  "lea    1(%2),%2\n"
  "movq   2048(%5,%%r10,8),%%xmm0\n" // U contribution (4 x int16).
  "movzb  (%0),%%r10\n"              // r10 = Y of pixel 0.
  "movq   4096(%5,%%r11,8),%%xmm1\n" // V contribution.
  "movzb  0x1(%0),%%r11\n"           // r11 = Y of pixel 1.
  "paddsw %%xmm1,%%xmm0\n"           // Combined chroma terms.
  "movq   (%5,%%r10,8),%%xmm2\n"     // Y terms, pixel 0.
  "lea    2(%0),%0\n"
  "movq   (%5,%%r11,8),%%xmm3\n"     // Y terms, pixel 1.
  "paddsw %%xmm0,%%xmm2\n"
  "paddsw %%xmm0,%%xmm3\n"
  "shufps $0x44,%%xmm3,%%xmm2\n"     // Both pixels into xmm2.
  "psraw  $0x6,%%xmm2\n"             // Scale terms down by 64.
  "packuswb %%xmm2,%%xmm2\n"         // Saturate to bytes.
  "movq   %%xmm2,0x0(%3)\n"          // Store 2 output pixels.
  "lea    8(%3),%3\n"
  "sub    $0x2,%4\n"
  "ja     1b\n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+r"(width)     // %4
  : "r" (_kCoefficientsRgbY)  // %5
  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
);
}
185 
// 64-bit: converts 2x1-subsampled YUV to 32-bit BGRA pixels, two per loop
// iteration. Same code shape and table layout as FastConvertYUVToRGB32Row;
// only the coefficient table (_kCoefficientsBgraY, %5) differs, which
// determines the output byte order. width must be even and > 0.
void FastConvertYUVToBGRARow(const uint8* y_buf,  // rdi
                             const uint8* u_buf,  // rsi
                             const uint8* v_buf,  // rdx
                             uint8* rgb_buf,      // rcx
                             int width) {         // r8
  asm volatile(
"1:"
  "movzb  (%1),%%r10\n"              // r10 = U sample.
  "lea    1(%1),%1\n"
  "movzb  (%2),%%r11\n"              // r11 = V sample.
  "lea    1(%2),%2\n"
  "movq   2048(%5,%%r10,8),%%xmm0\n" // U contribution.
  "movzb  (%0),%%r10\n"              // r10 = Y of pixel 0.
  "movq   4096(%5,%%r11,8),%%xmm1\n" // V contribution.
  "movzb  0x1(%0),%%r11\n"           // r11 = Y of pixel 1.
  "paddsw %%xmm1,%%xmm0\n"
  "movq   (%5,%%r10,8),%%xmm2\n"     // Y terms, pixel 0.
  "lea    2(%0),%0\n"
  "movq   (%5,%%r11,8),%%xmm3\n"     // Y terms, pixel 1.
  "paddsw %%xmm0,%%xmm2\n"
  "paddsw %%xmm0,%%xmm3\n"
  "shufps $0x44,%%xmm3,%%xmm2\n"     // Both pixels into xmm2.
  "psraw  $0x6,%%xmm2\n"
  "packuswb %%xmm2,%%xmm2\n"
  "movq   %%xmm2,0x0(%3)\n"          // Store 2 output pixels.
  "lea    8(%3),%3\n"
  "sub    $0x2,%4\n"
  "ja     1b\n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+r"(width)     // %4
  : "r" (_kCoefficientsBgraY)  // %5
  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
);
}
223 
// 64-bit: converts 2x1-subsampled YUV to 32-bit ABGR pixels, two per loop
// iteration. Same code shape and table layout as FastConvertYUVToRGB32Row;
// only the coefficient table (_kCoefficientsAbgrY, %5) differs, which
// determines the output byte order. width must be even and > 0.
void FastConvertYUVToABGRRow(const uint8* y_buf,  // rdi
                             const uint8* u_buf,  // rsi
                             const uint8* v_buf,  // rdx
                             uint8* rgb_buf,      // rcx
                             int width) {         // r8
  asm volatile(
"1:"
  "movzb  (%1),%%r10\n"              // r10 = U sample.
  "lea    1(%1),%1\n"
  "movzb  (%2),%%r11\n"              // r11 = V sample.
  "lea    1(%2),%2\n"
  "movq   2048(%5,%%r10,8),%%xmm0\n" // U contribution.
  "movzb  (%0),%%r10\n"              // r10 = Y of pixel 0.
  "movq   4096(%5,%%r11,8),%%xmm1\n" // V contribution.
  "movzb  0x1(%0),%%r11\n"           // r11 = Y of pixel 1.
  "paddsw %%xmm1,%%xmm0\n"
  "movq   (%5,%%r10,8),%%xmm2\n"     // Y terms, pixel 0.
  "lea    2(%0),%0\n"
  "movq   (%5,%%r11,8),%%xmm3\n"     // Y terms, pixel 1.
  "paddsw %%xmm0,%%xmm2\n"
  "paddsw %%xmm0,%%xmm3\n"
  "shufps $0x44,%%xmm3,%%xmm2\n"     // Both pixels into xmm2.
  "psraw  $0x6,%%xmm2\n"
  "packuswb %%xmm2,%%xmm2\n"
  "movq   %%xmm2,0x0(%3)\n"          // Store 2 output pixels.
  "lea    8(%3),%3\n"
  "sub    $0x2,%4\n"
  "ja     1b\n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+r"(width)     // %4
  : "r" (_kCoefficientsAbgrY)  // %5
  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
);
}
261 
// 64-bit: converts non-subsampled YUV444 to 32-bit RGB, one pixel per
// loop iteration (each pixel has its own U and V sample). width must be
// > 0. Uses the same _kCoefficientsRgbY table layout as the 420 version.
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,  // rdi
                                 const uint8* u_buf,  // rsi
                                 const uint8* v_buf,  // rdx
                                 uint8* rgb_buf,      // rcx
                                 int width) {         // r8
  asm volatile(
"1:"
  "movzb  (%1),%%r10\n"              // r10 = U sample.
  "lea    1(%1),%1\n"
  "movzb  (%2),%%r11\n"              // r11 = V sample.
  "lea    1(%2),%2\n"
  "movq   2048(%5,%%r10,8),%%xmm0\n" // U contribution.
  "movzb  (%0),%%r10\n"              // r10 = Y sample.
  "movq   4096(%5,%%r11,8),%%xmm1\n" // V contribution.
  "paddsw %%xmm1,%%xmm0\n"
  "movq   (%5,%%r10,8),%%xmm2\n"     // Y terms.
  "lea    1(%0),%0\n"
  "paddsw %%xmm0,%%xmm2\n"
  "shufps $0x44,%%xmm2,%%xmm2\n"
  "psraw  $0x6,%%xmm2\n"             // Scale terms down by 64.
  "packuswb %%xmm2,%%xmm2\n"         // Saturate to bytes.
  "movd   %%xmm2,0x0(%3)\n"          // Store 1 output pixel.
  "lea    4(%3),%3\n"
  "sub    $0x1,%4\n"
  "ja     1b\n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+r"(width)     // %4
  : "r" (_kCoefficientsRgbY)  // %5
  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2"
);
}
296 
// 64-bit: converts luma-only input to grayscale 32-bit RGB, two pixels
// per loop iteration, using only the Y rows of _kCoefficientsRgbY (no
// chroma terms are added). width must be even and > 0.
// (xmm0/xmm1 appear in the clobber list but are not used here.)
void FastConvertYToRGB32Row(const uint8* y_buf,  // rdi
                            uint8* rgb_buf,      // rcx
                            int width) {         // r8
  asm volatile(
"1:"
  "movzb  (%0),%%r10\n"          // r10 = Y of pixel 0.
  "movzb  0x1(%0),%%r11\n"       // r11 = Y of pixel 1.
  "movq   (%3,%%r10,8),%%xmm2\n" // Y terms, pixel 0.
  "lea    2(%0),%0\n"
  "movq   (%3,%%r11,8),%%xmm3\n" // Y terms, pixel 1.
  "shufps $0x44,%%xmm3,%%xmm2\n" // Both pixels into xmm2.
  "psraw  $0x6,%%xmm2\n"         // Scale terms down by 64.
  "packuswb %%xmm2,%%xmm2\n"     // Saturate to bytes.
  "movq   %%xmm2,0x0(%1)\n"      // Store 2 output pixels.
  "lea    8(%1),%1\n"
  "sub    $0x2,%2\n"
  "ja     1b\n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+r"(width)     // %2
  : "r" (_kCoefficientsRgbY)  // %3
  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
);
}
321 
322 #elif defined(__i386__)
323 // 32 bit gcc version
324 
// 32-bit gcc: the C declaration is satisfied by the file-scope assembly
// below, which defines the symbol directly (underscore-prefixed on
// OSX/IOS). MMX version, two pixels per loop; width must be even and > 0.
void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width);
// NOTE(review): MMX registers are used but no emms is executed before
// ret, and movntq (non-temporal store) is not followed by sfence —
// confirm the callers handle both (e.g. a shared EMMS after row loops).
  asm(
  ".text\n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUVToRGB32Row\n"
"_FastConvertYUVToRGB32Row:\n"
#else
  ".global FastConvertYUVToRGB32Row\n"
"FastConvertYUVToRGB32Row:\n"
#endif
  "pusha\n"                     // 8 GPRs saved; args start at 0x24(%esp).
  "mov    0x24(%esp),%edx\n"    // edx = y_buf
  "mov    0x28(%esp),%edi\n"    // edi = u_buf
  "mov    0x2c(%esp),%esi\n"    // esi = v_buf
  "mov    0x30(%esp),%ebp\n"    // ebp = rgb_buf
  "mov    0x34(%esp),%ecx\n"    // ecx = width

"1:"
  "movzbl (%edi),%eax\n"        // eax = U sample.
  "lea    1(%edi),%edi\n"
  "movzbl (%esi),%ebx\n"        // ebx = V sample.
  "lea    1(%esi),%esi\n"
  "movq   _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"  // U terms.
  "movzbl (%edx),%eax\n"        // eax = Y of pixel 0.
  "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"  // + V terms.
  "movzbl 0x1(%edx),%ebx\n"     // ebx = Y of pixel 1.
  "movq   _kCoefficientsRgbY(,%eax,8),%mm1\n"       // Y terms, pixel 0.
  "lea    2(%edx),%edx\n"
  "movq   _kCoefficientsRgbY(,%ebx,8),%mm2\n"       // Y terms, pixel 1.
  "paddsw %mm0,%mm1\n"
  "paddsw %mm0,%mm2\n"
  "psraw  $0x6,%mm1\n"          // Scale terms down by 64.
  "psraw  $0x6,%mm2\n"
  "packuswb %mm2,%mm1\n"        // Saturate both pixels to bytes.
  "movntq %mm1,0x0(%ebp)\n"     // Non-temporal store of 2 pixels.
  "lea    8(%ebp),%ebp\n"
  "sub    $0x2,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
370 
// 32-bit gcc: declaration satisfied by the file-scope assembly below.
// Same MMX body as the RGB32 variant but indexing _kCoefficientsBgraY,
// which determines the BGRA output byte order. width must be even and > 0.
void FastConvertYUVToBGRARow(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width);
// NOTE(review): no emms before ret and no sfence after movntq — confirm
// callers compensate.
  asm(
  ".text\n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUVToBGRARow\n"
"_FastConvertYUVToBGRARow:\n"
#else
  ".global FastConvertYUVToBGRARow\n"
"FastConvertYUVToBGRARow:\n"
#endif
  "pusha\n"                     // 8 GPRs saved; args start at 0x24(%esp).
  "mov    0x24(%esp),%edx\n"    // edx = y_buf
  "mov    0x28(%esp),%edi\n"    // edi = u_buf
  "mov    0x2c(%esp),%esi\n"    // esi = v_buf
  "mov    0x30(%esp),%ebp\n"    // ebp = rgb_buf
  "mov    0x34(%esp),%ecx\n"    // ecx = width

"1:"
  "movzbl (%edi),%eax\n"        // eax = U sample.
  "lea    1(%edi),%edi\n"
  "movzbl (%esi),%ebx\n"        // ebx = V sample.
  "lea    1(%esi),%esi\n"
  "movq   _kCoefficientsBgraY+2048(,%eax,8),%mm0\n"  // U terms.
  "movzbl (%edx),%eax\n"        // eax = Y of pixel 0.
  "paddsw _kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"  // + V terms.
  "movzbl 0x1(%edx),%ebx\n"     // ebx = Y of pixel 1.
  "movq   _kCoefficientsBgraY(,%eax,8),%mm1\n"       // Y terms, pixel 0.
  "lea    2(%edx),%edx\n"
  "movq   _kCoefficientsBgraY(,%ebx,8),%mm2\n"       // Y terms, pixel 1.
  "paddsw %mm0,%mm1\n"
  "paddsw %mm0,%mm2\n"
  "psraw  $0x6,%mm1\n"
  "psraw  $0x6,%mm2\n"
  "packuswb %mm2,%mm1\n"
  "movntq %mm1,0x0(%ebp)\n"     // Non-temporal store of 2 pixels.
  "lea    8(%ebp),%ebp\n"
  "sub    $0x2,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
416 
// 32-bit gcc: declaration satisfied by the file-scope assembly below.
// Same MMX body as the RGB32 variant but indexing _kCoefficientsAbgrY,
// which determines the ABGR output byte order. width must be even and > 0.
void FastConvertYUVToABGRRow(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width);
// NOTE(review): no emms before ret and no sfence after movntq — confirm
// callers compensate.
  asm(
  ".text\n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUVToABGRRow\n"
"_FastConvertYUVToABGRRow:\n"
#else
  ".global FastConvertYUVToABGRRow\n"
"FastConvertYUVToABGRRow:\n"
#endif
  "pusha\n"                     // 8 GPRs saved; args start at 0x24(%esp).
  "mov    0x24(%esp),%edx\n"    // edx = y_buf
  "mov    0x28(%esp),%edi\n"    // edi = u_buf
  "mov    0x2c(%esp),%esi\n"    // esi = v_buf
  "mov    0x30(%esp),%ebp\n"    // ebp = rgb_buf
  "mov    0x34(%esp),%ecx\n"    // ecx = width

"1:"
  "movzbl (%edi),%eax\n"        // eax = U sample.
  "lea    1(%edi),%edi\n"
  "movzbl (%esi),%ebx\n"        // ebx = V sample.
  "lea    1(%esi),%esi\n"
  "movq   _kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"  // U terms.
  "movzbl (%edx),%eax\n"        // eax = Y of pixel 0.
  "paddsw _kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"  // + V terms.
  "movzbl 0x1(%edx),%ebx\n"     // ebx = Y of pixel 1.
  "movq   _kCoefficientsAbgrY(,%eax,8),%mm1\n"       // Y terms, pixel 0.
  "lea    2(%edx),%edx\n"
  "movq   _kCoefficientsAbgrY(,%ebx,8),%mm2\n"       // Y terms, pixel 1.
  "paddsw %mm0,%mm1\n"
  "paddsw %mm0,%mm2\n"
  "psraw  $0x6,%mm1\n"
  "psraw  $0x6,%mm2\n"
  "packuswb %mm2,%mm1\n"
  "movntq %mm1,0x0(%ebp)\n"     // Non-temporal store of 2 pixels.
  "lea    8(%ebp),%ebp\n"
  "sub    $0x2,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
462 
// 32-bit gcc: declaration satisfied by the file-scope assembly below.
// Non-subsampled YUV444 to 32-bit RGB, one pixel per loop iteration
// (each pixel has its own U and V). width must be > 0.
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width);
// NOTE(review): no emms before ret — confirm callers compensate.
  asm(
  ".text\n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUV444ToRGB32Row\n"
"_FastConvertYUV444ToRGB32Row:\n"
#else
  ".global FastConvertYUV444ToRGB32Row\n"
"FastConvertYUV444ToRGB32Row:\n"
#endif
  "pusha\n"                     // 8 GPRs saved; args start at 0x24(%esp).
  "mov    0x24(%esp),%edx\n"    // edx = y_buf
  "mov    0x28(%esp),%edi\n"    // edi = u_buf
  "mov    0x2c(%esp),%esi\n"    // esi = v_buf
  "mov    0x30(%esp),%ebp\n"    // ebp = rgb_buf
  "mov    0x34(%esp),%ecx\n"    // ecx = width

"1:"
  "movzbl (%edi),%eax\n"        // eax = U sample.
  "lea    1(%edi),%edi\n"
  "movzbl (%esi),%ebx\n"        // ebx = V sample.
  "lea    1(%esi),%esi\n"
  "movq   _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"  // U terms.
  "movzbl (%edx),%eax\n"        // eax = Y sample.
  "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"  // + V terms.
  "lea    1(%edx),%edx\n"
  "paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n"       // + Y terms.
  "psraw  $0x6,%mm0\n"          // Scale terms down by 64.
  "packuswb %mm0,%mm0\n"        // Saturate to bytes.
  "movd   %mm0,0x0(%ebp)\n"     // Store 1 output pixel.
  "lea    4(%ebp),%ebp\n"
  "sub    $0x1,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
503 
// 32-bit gcc: declaration satisfied by the file-scope assembly below.
// Luma-only to grayscale 32-bit RGB, two pixels per loop iteration, using
// only the Y rows of _kCoefficientsRgbY. width must be even and > 0.
// Only ebx is preserved here (args at 0x8(%esp) after the push).
void FastConvertYToRGB32Row(const uint8* y_buf,
                            uint8* rgb_buf,
                            int width);
// NOTE(review): no emms before ret — confirm callers compensate.
  asm(
  ".text\n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYToRGB32Row\n"
"_FastConvertYToRGB32Row:\n"
#else
  ".global FastConvertYToRGB32Row\n"
"FastConvertYToRGB32Row:\n"
#endif
  "push   %ebx\n"
  "mov    0x8(%esp),%eax\n"     // eax = y_buf
  "mov    0xc(%esp),%edx\n"     // edx = rgb_buf
  "mov    0x10(%esp),%ecx\n"    // ecx = width

"1:"
  "movzbl (%eax),%ebx\n"        // ebx = Y of pixel 0.
  "movq   _kCoefficientsRgbY(,%ebx,8),%mm0\n"
  "psraw  $0x6,%mm0\n"          // Scale terms down by 64.
  "movzbl 0x1(%eax),%ebx\n"     // ebx = Y of pixel 1.
  "movq   _kCoefficientsRgbY(,%ebx,8),%mm1\n"
  "psraw  $0x6,%mm1\n"
  "packuswb %mm1,%mm0\n"        // Saturate both pixels to bytes.
  "lea    0x2(%eax),%eax\n"
  "movq   %mm0,(%edx)\n"        // Store 2 output pixels.
  "lea    0x8(%edx),%edx\n"
  "sub    $0x2,%ecx\n"
  "ja     1b\n"
  "pop    %ebx\n"
  "ret\n"
);
537 
538 #else
539 // C reference code that mimic the YUV assembly.
// Saturate a signed intermediate to the unsigned byte range [0, 255] —
// the C analogue of the SSE/MMX packuswb step. Implemented as a function
// (rather than the former macro) so the argument is evaluated once, and
// returning unsigned so callers' shifts by up to 24 bits cannot overflow
// a signed int.
static inline unsigned int packuswb(int x) {
  if (x < 0) {
    return 0u;
  }
  if (x > 255) {
    return 255u;
  }
  return (unsigned int)x;
}

// Signed saturating add clamped to the int16 range [-32768, 32767] — the
// C analogue of the SSE/MMX paddsw step. Unlike the former macro, each
// argument is evaluated exactly once and the sum is computed once.
static inline int paddsw(int x, int y) {
  int sum = x + y;
  if (sum < -32768) {
    return -32768;
  }
  if (sum > 32767) {
    return 32767;
  }
  return sum;
}
543 
YuvPixel(uint8 y,uint8 u,uint8 v,uint8 * rgb_buf,int ashift,int rshift,int gshift,int bshift)544 static inline void YuvPixel(uint8 y,
545                             uint8 u,
546                             uint8 v,
547                             uint8* rgb_buf,
548                             int ashift,
549                             int rshift,
550                             int gshift,
551                             int bshift) {
552 
553   int b = _kCoefficientsRgbY[256+u][0];
554   int g = _kCoefficientsRgbY[256+u][1];
555   int r = _kCoefficientsRgbY[256+u][2];
556   int a = _kCoefficientsRgbY[256+u][3];
557 
558   b = paddsw(b, _kCoefficientsRgbY[512+v][0]);
559   g = paddsw(g, _kCoefficientsRgbY[512+v][1]);
560   r = paddsw(r, _kCoefficientsRgbY[512+v][2]);
561   a = paddsw(a, _kCoefficientsRgbY[512+v][3]);
562 
563   b = paddsw(b, _kCoefficientsRgbY[y][0]);
564   g = paddsw(g, _kCoefficientsRgbY[y][1]);
565   r = paddsw(r, _kCoefficientsRgbY[y][2]);
566   a = paddsw(a, _kCoefficientsRgbY[y][3]);
567 
568   b >>= 6;
569   g >>= 6;
570   r >>= 6;
571   a >>= 6;
572 
573   *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
574                                         (packuswb(g) << gshift) |
575                                         (packuswb(r) << rshift) |
576                                         (packuswb(a) << ashift);
577 }
578 
FastConvertYUVToRGB32Row(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)579 void FastConvertYUVToRGB32Row(const uint8* y_buf,
580                               const uint8* u_buf,
581                               const uint8* v_buf,
582                               uint8* rgb_buf,
583                               int width) {
584   for (int x = 0; x < width; x += 2) {
585     uint8 u = u_buf[x >> 1];
586     uint8 v = v_buf[x >> 1];
587     uint8 y0 = y_buf[x];
588     YuvPixel(y0, u, v, rgb_buf, 24, 16, 8, 0);
589     if ((x + 1) < width) {
590       uint8 y1 = y_buf[x + 1];
591       YuvPixel(y1, u, v, rgb_buf + 4, 24, 16, 8, 0);
592     }
593     rgb_buf += 8;  // Advance 2 pixels.
594   }
595 }
596 
FastConvertYUVToBGRARow(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)597 void FastConvertYUVToBGRARow(const uint8* y_buf,
598                              const uint8* u_buf,
599                              const uint8* v_buf,
600                              uint8* rgb_buf,
601                              int width) {
602   for (int x = 0; x < width; x += 2) {
603     uint8 u = u_buf[x >> 1];
604     uint8 v = v_buf[x >> 1];
605     uint8 y0 = y_buf[x];
606     YuvPixel(y0, u, v, rgb_buf, 0, 8, 16, 24);
607     if ((x + 1) < width) {
608       uint8 y1 = y_buf[x + 1];
609       YuvPixel(y1, u, v, rgb_buf + 4, 0, 8, 16, 24);
610     }
611     rgb_buf += 8;  // Advance 2 pixels.
612   }
613 }
614 
FastConvertYUVToABGRRow(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)615 void FastConvertYUVToABGRRow(const uint8* y_buf,
616                              const uint8* u_buf,
617                              const uint8* v_buf,
618                              uint8* rgb_buf,
619                              int width) {
620   for (int x = 0; x < width; x += 2) {
621     uint8 u = u_buf[x >> 1];
622     uint8 v = v_buf[x >> 1];
623     uint8 y0 = y_buf[x];
624     YuvPixel(y0, u, v, rgb_buf, 24, 0, 8, 16);
625     if ((x + 1) < width) {
626       uint8 y1 = y_buf[x + 1];
627       YuvPixel(y1, u, v, rgb_buf + 4, 24, 0, 8, 16);
628     }
629     rgb_buf += 8;  // Advance 2 pixels.
630   }
631 }
632 
FastConvertYUV444ToRGB32Row(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)633 void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
634                                  const uint8* u_buf,
635                                  const uint8* v_buf,
636                                  uint8* rgb_buf,
637                                  int width) {
638   for (int x = 0; x < width; ++x) {
639     uint8 u = u_buf[x];
640     uint8 v = v_buf[x];
641     uint8 y = y_buf[x];
642     YuvPixel(y, u, v, rgb_buf, 24, 16, 8, 0);
643     rgb_buf += 4;  // Advance 1 pixel.
644   }
645 }
646 
FastConvertYToRGB32Row(const uint8 * y_buf,uint8 * rgb_buf,int width)647 void FastConvertYToRGB32Row(const uint8* y_buf,
648                             uint8* rgb_buf,
649                             int width) {
650   for (int x = 0; x < width; ++x) {
651     uint8 y = y_buf[x];
652     YuvPixel(y, 128, 128, rgb_buf, 24, 16, 8, 0);
653     rgb_buf += 4;  // Advance 1 pixel.
654   }
655 }
656 
657 #endif
658 
659 }  // extern "C"
660