/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "row.h"

extern "C" {

#ifdef HAS_ARGBTOYROW_SSSE3
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var

// Constant multiplication table for converting ARGB to I400.
extern "C" TALIGN16(const int8, kARGBToY[16]) = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

extern "C" TALIGN16(const int8, kARGBToU[16]) = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

extern "C" TALIGN16(const int8, kARGBToV[16]) = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
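
// Note on the tables above: reading each ARGB pixel as bytes B,G,R,A, the
// coefficients appear to encode a fixed-point BT.601-style matrix, so the
// row functions below effectively compute
//   Y = ((13*B + 65*G + 33*R) >> 7) + 16     (coefficients scaled by 128)
//   U = ((112*B - 74*G - 38*R) >> 8) + 128   (coefficients scaled by 256)
//   V = ((-18*B - 94*G + 112*R) >> 8) + 128
// with the biases supplied by the kAddY16 / kAddUV128 tables further down.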

// Constants for BGRA
extern "C" TALIGN16(const int8, kBGRAToY[16]) = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

extern "C" TALIGN16(const int8, kBGRAToU[16]) = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

extern "C" TALIGN16(const int8, kBGRAToV[16]) = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
extern "C" TALIGN16(const int8, kABGRToY[16]) = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

extern "C" TALIGN16(const int8, kABGRToU[16]) = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

extern "C" TALIGN16(const int8, kABGRToV[16]) = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

extern "C" TALIGN16(const uint8, kAddY16[16]) = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
};

extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting BG24 to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Convert 16 ARGB pixels (64 bytes) to 16 Y values
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm7, _kARGBToY
    movdqa     xmm6, _kAddY16

 convertloop :
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm6
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         convertloop
    ret
  }
}
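
#if 0
// For reference: a scalar sketch of the per-pixel math done by
// ARGBToYRow_SSSE3 above, assuming the B,G,R,A byte order, the kARGBToY and
// kAddY16 constants, and the uint8 typedef from row.h.  The helper name is
// illustrative only.
static void ARGBToYRow_C_Sketch(const uint8* src_argb, uint8* dst_y, int pix) {
  for (int i = 0; i < pix; ++i) {
    int b = src_argb[0];
    int g = src_argb[1];
    int r = src_argb[2];
    // pmaddubsw + phaddw form 13*B + 65*G + 33*R; psrlw 7 and paddb 16 follow.
    dst_y[i] = static_cast<uint8>(((13 * b + 65 * g + 33 * r) >> 7) + 16);
    src_argb += 4;
  }
}
#endif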

__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm7, _kBGRAToY
    movdqa     xmm6, _kAddY16

 convertloop :
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm6
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         convertloop
    ret
  }
}

__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm7, _kABGRToY
    movdqa     xmm6, _kAddY16

 convertloop :
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm6
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         convertloop
    ret
  }
}

__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
__asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, _kARGBToU
    movdqa     xmm6, _kARGBToV
    movdqa     xmm5, _kAddUV128
    sub        edi, edx             // stride from u to v

 convertloop :
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    ja         convertloop
    pop        edi
    pop        esi
    ret
  }
}
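
#if 0
// For reference: a scalar sketch of ARGBToUVRow_SSSE3 above.  Each output
// U/V sample comes from a 2x2 block of source pixels averaged per channel
// (rounding differs slightly from the two-step pavgb averaging), then the
// kARGBToU / kARGBToV coefficients, a >> 8 and the kAddUV128 bias are
// applied.  Assumes B,G,R,A byte order and uint8 from row.h; the helper
// name is illustrative only.
static void ARGBToUVRow_C_Sketch(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_argb1 = src_argb0 + src_stride_argb;
  for (int i = 0; i < width; i += 2) {
    int b = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4] + 2) >> 2;
    int g = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5] + 2) >> 2;
    int r = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6] + 2) >> 2;
    *dst_u++ = static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    *dst_v++ = static_cast<uint8>(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
    src_argb0 += 8;
    src_argb1 += 8;
  }
}
#endif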

__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
__asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, _kBGRAToU
    movdqa     xmm6, _kBGRAToV
    movdqa     xmm5, _kAddUV128
    sub        edi, edx             // stride from u to v

 convertloop :
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    ja         convertloop
    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
__asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, _kABGRToU
    movdqa     xmm6, _kABGRToV
    movdqa     xmm5, _kAddUV128
    sub        edi, edx             // stride from u to v

 convertloop :
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    ja         convertloop
    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked)
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
__asm {
    mov       eax, [esp + 4]   // src_bg24
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm7, xmm7       // generate mask 0xff000000
    pslld     xmm7, 24
    movdqa    xmm6, _kShuffleMaskBG24ToARGB

 convertloop :
    movdqa    xmm0, [eax]
    movdqa    xmm1, [eax + 16]
    movdqa    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15]}
    pshufb    xmm2, xmm6
    por       xmm2, xmm7
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15]}
    pshufb    xmm0, xmm6
    movdqa    [edx + 32], xmm2
    por       xmm0, xmm7
    pshufb    xmm1, xmm6
    movdqa    [edx], xmm0
    por       xmm1, xmm7
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm6
    movdqa    [edx + 16], xmm1
    por       xmm3, xmm7
    movdqa    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    ja        convertloop
    ret
  }
}
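
#if 0
// For reference: a scalar sketch of BG24ToARGBRow_SSSE3 above.  Each 3-byte
// B,G,R source pixel becomes a 4-byte B,G,R,A pixel with alpha forced to
// 0xff (the pshufb mask does the byte rearrangement, the por with the
// 0xff000000 mask sets alpha).  Assumes uint8 from row.h; the helper name
// is illustrative only.
static void BG24ToARGBRow_C_Sketch(const uint8* src_bg24, uint8* dst_argb,
                                   int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[0] = src_bg24[0];  // B
    dst_argb[1] = src_bg24[1];  // G
    dst_argb[2] = src_bg24[2];  // R
    dst_argb[3] = 0xff;         // A
    src_bg24 += 3;
    dst_argb += 4;
  }
}
#endif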

__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
__asm {
    mov       eax, [esp + 4]   // src_raw
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm7, xmm7       // generate mask 0xff000000
    pslld     xmm7, 24
    movdqa    xmm6, _kShuffleMaskRAWToARGB

 convertloop :
    movdqa    xmm0, [eax]
    movdqa    xmm1, [eax + 16]
    movdqa    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15]}
    pshufb    xmm2, xmm6
    por       xmm2, xmm7
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15]}
    pshufb    xmm0, xmm6
    movdqa    [edx + 32], xmm2
    por       xmm0, xmm7
    pshufb    xmm1, xmm6
    movdqa    [edx], xmm0
    por       xmm1, xmm7
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm6
    movdqa    [edx + 16], xmm1
    por       xmm3, xmm7
    movdqa    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    ja        convertloop
    ret
  }
}

__declspec(naked)
void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]
    mov       edi, [esp + 32 + 8]
    mov       esi, [esp + 32 + 12]
    mov       ebp, [esp + 32 + 16]
    mov       ecx, [esp + 32 + 20]

 convertloop :
    movzx     eax, byte ptr [edi]
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
    movzx     eax, byte ptr [edx]
    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
    movzx     ebx, byte ptr [edx + 1]
    movq      mm1, [_kCoefficientsRgbY + 8 * eax]
    lea       edx, [edx + 2]
    movq      mm2, [_kCoefficientsRgbY + 8 * ebx]
    paddsw    mm1, mm0
    paddsw    mm2, mm0
    psraw     mm1, 6
    psraw     mm2, 6
    packuswb  mm1, mm2
    movntq    [ebp], mm1
    lea       ebp, [ebp + 8]
    sub       ecx, 2
    ja        convertloop

    popad
    ret
  }
}
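
#if 0
// For reference: a scalar sketch of the table-driven conversion used by the
// FastConvertYUVTo*Row functions.  The asm implies kCoefficientsRgbY is laid
// out as 8-byte entries (four int16 lanes, one per output byte B,G,R,A) with
// Y entries at byte offset 0, U entries at 2048 and V entries at 4096; the
// three looked-up entries are summed and shifted right by 6 before packing.
// The int16[768][4] shape below is an assumption, as is the helper name.
static void FastConvertYUVToRGB32Row_C_Sketch(const uint8* y_buf,
                                              const uint8* u_buf,
                                              const uint8* v_buf,
                                              uint8* rgb_buf,
                                              int width,
                                              const int16 table[768][4]) {
  for (int x = 0; x < width; ++x) {
    const int16* y_ent = table[y_buf[x]];
    const int16* u_ent = table[256 + u_buf[x >> 1]];
    const int16* v_ent = table[512 + v_buf[x >> 1]];
    for (int c = 0; c < 4; ++c) {  // B, G, R, A lanes
      int value = (y_ent[c] + u_ent[c] + v_ent[c]) >> 6;
      rgb_buf[4 * x + c] =
          static_cast<uint8>(value < 0 ? 0 : (value > 255 ? 255 : value));
    }
  }
}
#endif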

__declspec(naked)
void FastConvertYUVToBGRARow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]
    mov       edi, [esp + 32 + 8]
    mov       esi, [esp + 32 + 12]
    mov       ebp, [esp + 32 + 16]
    mov       ecx, [esp + 32 + 20]

 convertloop :
    movzx     eax, byte ptr [edi]
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsBgraY + 2048 + 8 * eax]
    movzx     eax, byte ptr [edx]
    paddsw    mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx]
    movzx     ebx, byte ptr [edx + 1]
    movq      mm1, [_kCoefficientsBgraY + 8 * eax]
    lea       edx, [edx + 2]
    movq      mm2, [_kCoefficientsBgraY + 8 * ebx]
    paddsw    mm1, mm0
    paddsw    mm2, mm0
    psraw     mm1, 6
    psraw     mm2, 6
    packuswb  mm1, mm2
    movntq    [ebp], mm1
    lea       ebp, [ebp + 8]
    sub       ecx, 2
    ja        convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYUVToABGRRow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]
    mov       edi, [esp + 32 + 8]
    mov       esi, [esp + 32 + 12]
    mov       ebp, [esp + 32 + 16]
    mov       ecx, [esp + 32 + 20]

 convertloop :
    movzx     eax, byte ptr [edi]
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax]
    movzx     eax, byte ptr [edx]
    paddsw    mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx]
    movzx     ebx, byte ptr [edx + 1]
    movq      mm1, [_kCoefficientsAbgrY + 8 * eax]
    lea       edx, [edx + 2]
    movq      mm2, [_kCoefficientsAbgrY + 8 * ebx]
    paddsw    mm1, mm0
    paddsw    mm2, mm0
    psraw     mm1, 6
    psraw     mm2, 6
    packuswb  mm1, mm2
    movntq    [ebp], mm1
    lea       ebp, [ebp + 8]
    sub       ecx, 2
    ja        convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]   // Y
    mov       edi, [esp + 32 + 8]   // U
    mov       esi, [esp + 32 + 12]  // V
    mov       ebp, [esp + 32 + 16]  // rgb
    mov       ecx, [esp + 32 + 20]  // width

 convertloop :
    movzx     eax, byte ptr [edi]
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
    movzx     eax, byte ptr [edx]
    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
    lea       edx, [edx + 1]
    paddsw    mm0, [_kCoefficientsRgbY + 8 * eax]
    psraw     mm0, 6
    packuswb  mm0, mm0
    movd      [ebp], mm0
    lea       ebp, [ebp + 4]
    sub       ecx, 1
    ja        convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYToRGB32Row(const uint8* y_buf,
                            uint8* rgb_buf,
                            int width) {
  __asm {
    push      ebx
    mov       eax, [esp + 4 + 4]   // Y
    mov       edx, [esp + 4 + 8]   // rgb
    mov       ecx, [esp + 4 + 12]  // width

 convertloop :
    movzx     ebx, byte ptr [eax]
    movq      mm0, [_kCoefficientsRgbY + 8 * ebx]
    psraw     mm0, 6
    movzx     ebx, byte ptr [eax + 1]
    movq      mm1, [_kCoefficientsRgbY + 8 * ebx]
    psraw     mm1, 6
    packuswb  mm0, mm1
    lea       eax, [eax + 2]
    movq      [edx], mm0
    lea       edx, [edx + 8]
    sub       ecx, 2
    ja        convertloop

    pop       ebx
    ret
  }
}

#endif

}  // extern "C"