/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)

// TODO(fbarchard): I420ToRGB24, I420ToRAW
#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
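
// Scalar reference for the fixed-point math the constants above implement
// (an illustrative sketch, not part of the original file). ARGB bytes are
// laid out B,G,R,A in memory; in the row functions below, pmaddubsw/phaddw
// sum the weighted channels, the shift drops the 64x coefficient scale, and
// kAddY16/kAddUV128 supply the +16/+128 biases.
static inline uint8 ScalarARGBToY(uint8 b, uint8 g, uint8 r) {
  return static_cast<uint8>(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}
static inline uint8 ScalarARGBToU(uint8 b, uint8 g, uint8 r) {
  return static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static inline uint8 ScalarARGBToV(uint8 b, uint8 g, uint8 r) {
  return static_cast<uint8>(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}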

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting BGRA to ARGB.
static const uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting ABGR to ARGB.
static const uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting RGBA to ARGB.
static const uvec8 kShuffleMaskRGBAToARGB = {
  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
};

// Shuffle table for converting ARGB to RGBA.
static const uvec8 kShuffleMaskARGBToRGBA = {
  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
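
// Scalar model of how pshufb applies the tables above (illustrative sketch,
// not part of the original file). Output byte i comes from src[mask[i] & 15];
// a mask byte with its high bit set (the 128u entries) zeroes that output
// byte, which is how the ARGBToRGB24/RAW tables blank the unused tail.
static void ScalarPshufb(const uint8* src, const uint8* mask, uint8* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0u : src[mask[i] & 15];
  }
}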

__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]        // src_y
    mov        edx, [esp + 8]        // dst_argb
    mov        ecx, [esp + 12]       // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
    pslld      xmm5, 24

    align      16
  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
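
// Scalar equivalent of I400ToARGBRow_SSE2 (illustrative sketch, not part of
// the original file): the punpcklbw/punpcklwd pairs replicate each luma byte
// into B, G and R, and the 0xff000000 mask forces alpha to 255.
static void ScalarI400ToARGB(const uint8* src_y, uint8* dst_argb, int pix) {
  for (int i = 0; i < pix; ++i) {
    uint8 y = src_y[i];
    dst_argb[4 * i + 0] = y;     // B
    dst_argb[4 * i + 1] = y;     // G
    dst_argb[4 * i + 2] = y;     // R
    dst_argb[4 * i + 3] = 255u;  // A
  }
}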

__declspec(naked) __declspec(align(16))
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_bgra
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskBGRAToARGB
    sub       edx, eax

    align      16
 convertloop:
    movdqa    xmm0, [eax]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [eax + edx], xmm0
    lea       eax, [eax + 16]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_abgr
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskABGRToARGB
    sub       edx, eax

    align      16
 convertloop:
    movdqa    xmm0, [eax]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [eax + edx], xmm0
    lea       eax, [eax + 16]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_rgba
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskRGBAToARGB
    sub       edx, eax

    align      16
 convertloop:
    movdqa    xmm0, [eax]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [eax + edx], xmm0
    lea       eax, [eax + 16]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgba
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskARGBToRGBA
    sub       edx, eax

    align      16
 convertloop:
    movdqa    xmm0, [eax]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [eax + edx], xmm0
    lea       eax, [eax + 16]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_rgb24
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, kShuffleMaskRGB24ToARGB

    align      16
 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb    xmm0, xmm4
    movdqa    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqa    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15] }
    pshufb    xmm3, xmm4
    movdqa    [edx + 16], xmm1
    por       xmm3, xmm5
    sub       ecx, 16
    movdqa    [edx + 48], xmm3
    lea       edx, [edx + 64]
    jg        convertloop
    ret
  }
}
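
// Scalar equivalent of RGB24ToARGBRow_SSSE3 (illustrative sketch, not part
// of the original file): the palignr/pshufb sequence above expands each
// 3-byte B,G,R triplet into a 4-byte B,G,R,A pixel with alpha forced to 255.
static void ScalarRGB24ToARGB(const uint8* src_rgb24, uint8* dst_argb,
                              int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[4 * i + 0] = src_rgb24[3 * i + 0];  // B
    dst_argb[4 * i + 1] = src_rgb24[3 * i + 1];  // G
    dst_argb[4 * i + 2] = src_rgb24[3 * i + 2];  // R
    dst_argb[4 * i + 3] = 255u;                  // A
  }
}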

__declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_raw
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, kShuffleMaskRAWToARGB

    align      16
 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb    xmm0, xmm4
    movdqa    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqa    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15] }
    pshufb    xmm3, xmm4
    movdqa    [edx + 16], xmm1
    por       xmm3, xmm5
    sub       ecx, 16
    movdqa    [edx + 48], xmm3
    lea       edx, [edx + 64]
    jg        convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
    psllw     xmm4, 10
    psrlw     xmm4, 5
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_rgb565
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

    align      16
 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    pand      xmm1, xmm3    // R in upper 5 bits
    psllw     xmm2, 11      // B in upper 5 bits
    pmulhuw   xmm1, xmm5    // * (256 + 8)
    pmulhuw   xmm2, xmm5    // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2    // RB
    pand      xmm0, xmm4    // G in middle 6 bits
    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
    por       xmm0, xmm7    // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}
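
// Scalar form of the multiply-based bit replication described above
// (illustrative sketch, not part of the original file). A 5-bit channel
// expands to 8 bits as (v << 3) | (v >> 2); the SIMD code reaches the same
// value by leaving v in the top 5 bits of a 16-bit lane and taking the high
// word of the product with (256 + 8), as pmulhuw does.
static inline uint8 Replicate5(uint16 v5) {          // v5 in [0, 31]
  uint32 v16 = static_cast<uint32>(v5) << 11;        // channel in top 5 bits
  return static_cast<uint8>((v16 * 0x0108u) >> 16);  // == (v5 << 3) | (v5 >> 2)
}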

// 24 instructions.
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
    psrlw     xmm4, 6
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_argb1555
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

    align      16
 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    psllw     xmm1, 1       // R in upper 5 bits
    psllw     xmm2, 11      // B in upper 5 bits
    pand      xmm1, xmm3
    pmulhuw   xmm2, xmm5    // * (256 + 8)
    pmulhuw   xmm1, xmm5    // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2    // RB
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4    // G in middle 5 bits
    psraw     xmm2, 8       // A
    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
    pand      xmm2, xmm7
    por       xmm0, xmm2    // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd      xmm4, eax
    pshufd    xmm4, xmm4, 0
    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
    pslld     xmm5, 4
    mov       eax, [esp + 4]   // src_argb4444
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

    align      16
 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4    // mask low nibbles
    pand      xmm2, xmm5    // mask high nibbles
    movdqa    xmm1, xmm0
    movdqa    xmm3, xmm2
    psllw     xmm1, 4
    psrlw     xmm3, 4
    por       xmm0, xmm1
    por       xmm2, xmm3
    movdqa    xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}
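
// Scalar nibble replication used above (illustrative sketch, not part of
// the original file): a 4-bit channel expands to 8 bits as (v << 4) | v,
// which the shift/or pairs in the loop compute for all channels at once.
static inline uint8 Replicate4(uint8 v4) {  // v4 in [0, 15]
  return static_cast<uint8>((v4 << 4) | v4);
}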

__declspec(naked) __declspec(align(16))
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm6, kShuffleMaskARGBToRGB24

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 16 pixels of argb
    movdqa    xmm1, [eax + 16]
    movdqa    xmm2, [eax + 32]
    movdqa    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
    psrldq    xmm1, 4      // 8 bytes from 1
    pslldq    xmm4, 12     // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
    por       xmm0, xmm4   // 4 bytes from 1 for 0
    pslldq    xmm5, 8      // 8 bytes from 2 for 1
    movdqa    [edx], xmm0  // store 0
    por       xmm1, xmm5   // 8 bytes from 2 for 1
    psrldq    xmm2, 8      // 4 bytes from 2
    pslldq    xmm3, 4      // 12 bytes from 3 for 2
    por       xmm2, xmm3   // 12 bytes from 3 for 2
    movdqa    [edx + 16], xmm1   // store 1
    movdqa    [edx + 32], xmm2   // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}
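
// Scalar equivalent of ARGBToRGB24Row_SSSE3 (illustrative sketch, not part
// of the original file): drop the alpha byte from each pixel. The SIMD
// version shuffles four 12-byte fragments and re-packs them into three
// 16-byte stores.
static void ScalarARGBToRGB24(const uint8* src_argb, uint8* dst_rgb,
                              int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_rgb[3 * i + 0] = src_argb[4 * i + 0];  // B
    dst_rgb[3 * i + 1] = src_argb[4 * i + 1];  // G
    dst_rgb[3 * i + 2] = src_argb[4 * i + 2];  // R
  }
}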

__declspec(naked) __declspec(align(16))
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm6, kShuffleMaskARGBToRAW

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 16 pixels of argb
    movdqa    xmm1, [eax + 16]
    movdqa    xmm2, [eax + 32]
    movdqa    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
    psrldq    xmm1, 4      // 8 bytes from 1
    pslldq    xmm4, 12     // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
    por       xmm0, xmm4   // 4 bytes from 1 for 0
    pslldq    xmm5, 8      // 8 bytes from 2 for 1
    movdqa    [edx], xmm0  // store 0
    por       xmm1, xmm5   // 8 bytes from 2 for 1
    psrldq    xmm2, 8      // 4 bytes from 2
    pslldq    xmm3, 4      // 12 bytes from 3 for 2
    por       xmm2, xmm3   // 12 bytes from 3 for 2
    movdqa    [edx + 16], xmm1   // store 1
    movdqa    [edx + 32], xmm2   // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
    pslld     xmm5, 11

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    pslld     xmm0, 8       // R
    psrld     xmm1, 3       // B
    psrld     xmm2, 5       // G
    psrad     xmm0, 16      // R
    pand      xmm1, xmm3    // B
    pand      xmm2, xmm4    // G
    pand      xmm0, xmm5    // R
    por       xmm1, xmm2    // BG
    por       xmm0, xmm1    // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}
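
// Scalar form of the RGB565 packing above (illustrative sketch, not part
// of the original file): keep the top 5/6/5 bits of B/G/R.
static inline uint16 PackRGB565(uint8 b, uint8 g, uint8 r) {
  return static_cast<uint16>((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}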

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
    psrld     xmm4, 27
    movdqa    xmm5, xmm4       // generate mask 0x000003e0
    pslld     xmm5, 5
    movdqa    xmm6, xmm4       // generate mask 0x00007c00
    pslld     xmm6, 10
    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
    pslld     xmm7, 15

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    movdqa    xmm3, xmm0    // R
    psrad     xmm0, 16      // A
    psrld     xmm1, 3       // B
    psrld     xmm2, 6       // G
    psrld     xmm3, 9       // R
    pand      xmm0, xmm7    // A
    pand      xmm1, xmm4    // B
    pand      xmm2, xmm5    // G
    pand      xmm3, xmm6    // R
    por       xmm0, xmm1    // BA
    por       xmm2, xmm3    // GR
    por       xmm0, xmm2    // BGRA
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}
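
// Scalar form of the ARGB1555 packing above (illustrative sketch, not part
// of the original file): 5 bits per color channel plus a 1-bit alpha.
static inline uint16 PackARGB1555(uint8 b, uint8 g, uint8 r, uint8 a) {
  return static_cast<uint16>((b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10) |
                             ((a >> 7) << 15));
}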

__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
    psllw     xmm4, 12
    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
    psrlw     xmm3, 8

    align      16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0
    pand      xmm0, xmm3    // low nibble
    pand      xmm1, xmm4    // high nibble
    psrld     xmm0, 4
    psrld     xmm1, 8
    por       xmm0, xmm1
    packuswb  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}
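
// Scalar form of the ARGB4444 packing above (illustrative sketch, not part
// of the original file): keep the high nibble of each channel.
static inline uint16 PackARGB4444(uint8 b, uint8 g, uint8 r, uint8 a) {
  return static_cast<uint16>((b >> 4) | ((g >> 4) << 4) | ((r >> 4) << 8) |
                             ((a >> 4) << 12));
}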

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      16
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      16
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      16
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      16
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      16
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      16
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      16
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      16
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
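
// Scalar outline of the UV row functions above and below (illustrative
// sketch, not part of the original file): average each 2x2 block of pixels,
// then apply the fixed-point U/V coefficients. Rounding differs slightly
// from the SIMD code, which chains pavgb (round-half-up) averages.
static void ScalarARGBToUV(const uint8* src_argb0, int src_stride_argb,
                           uint8* dst_u, uint8* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    const uint8* p0 = src_argb0 + x * 4;     // two pixels in row 0
    const uint8* p1 = p0 + src_stride_argb;  // two pixels in row 1
    int b = (p0[0] + p0[4] + p1[0] + p1[4] + 2) >> 2;
    int g = (p0[1] + p0[5] + p1[1] + p1[5] + 2) >> 2;
    int r = (p0[2] + p0[6] + p1[2] + p1[6] + 2) >> 2;
    dst_u[x / 2] = static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    dst_v[x / 2] = static_cast<uint8>(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
  }
}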

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kABGRToU
    movdqa     xmm6, kABGRToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kABGRToU
    movdqa     xmm6, kABGRToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kRGBAToU
    movdqa     xmm6, kRGBAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb0
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kRGBAToU
    movdqa     xmm6, kRGBAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      16
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_I422TOARGBROW_SSSE3

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

#define UB 127 /* saturated static_cast<int8>(2.018 * 64) */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

static const vec8 kUVToB = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};

static const vec8 kUVToR = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};

static const vec8 kUVToG = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};

static const vec8 kVUToB = {
  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
};

static const vec8 kVUToR = {
  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
};

static const vec8 kVUToG = {
  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
};

static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
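
// Scalar reference for the YUVTORGB macro below (illustrative sketch, not
// part of the original file). The coefficients carry a 64x fixed-point
// scale (hence the >> 6), and the kUVBias* constants fold the -128
// centering of U and V into the pmaddubsw results.
static inline uint8 ScalarClamp8(int v) {
  return static_cast<uint8>(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void ScalarYuvToRgb(uint8 y, uint8 u, uint8 v,
                           uint8* b, uint8* g, uint8* r) {
  int y1 = (y - 16) * YG;
  *b = ScalarClamp8((y1 + UB * (u - 128) + VB * (v - 128)) >> 6);
  *g = ScalarClamp8((y1 + UG * (u - 128) + VG * (v - 128)) >> 6);
  *r = ScalarClamp8((y1 + UR * (u - 128) + VR * (v - 128)) >> 6);
}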
1520  
1521  // TODO(fbarchard): NV12/NV21 fetch UV and use directly.
1522  // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
1523  
1524  // Read 8 UV from 411.
1525  #define READYUV444 __asm {                                                     \
1526      __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
1527      __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
1528      __asm lea        esi,  [esi + 8]                                           \
1529      __asm punpcklbw  xmm0, xmm1           /* UV */                             \
1530    }
1531  
1532  // Read 4 UV from 422, upsample to 8 UV.
1533  #define READYUV422 __asm {                                                     \
1534      __asm movd       xmm0, [esi]          /* U */                              \
1535      __asm movd       xmm1, [esi + edi]    /* V */                              \
1536      __asm lea        esi,  [esi + 4]                                           \
1537      __asm punpcklbw  xmm0, xmm1           /* UV */                             \
1538      __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
1539    }
1540  
1541  // Read 2 UV from 411, upsample to 8 UV.
1542  #define READYUV411 __asm {                                                     \
1543      __asm movd       xmm0, [esi]          /* U */                              \
1544      __asm movd       xmm1, [esi + edi]    /* V */                              \
1545      __asm lea        esi,  [esi + 2]                                           \
1546      __asm punpcklbw  xmm0, xmm1           /* UV */                             \
1547      __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
1548      __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
1549    }
1550  
1551  // Read 4 UV from NV12, upsample to 8 UV.
1552  #define READNV12 __asm {                                                       \
1553      __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
1554      __asm lea        esi,  [esi + 8]                                           \
1555      __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
1556    }
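
// NV12 stores chroma pre-interleaved as UVUV..., so unlike the planar
// readers no punpcklbw is needed; the single punpcklwd duplicates each
// 16 bit UV pair so 4 chroma samples cover 8 pixels.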
1557  
1558  // Convert 8 pixels: 8 UV and 8 Y.
1559  #define YUVTORGB __asm {                                                       \
1560      /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
1561      __asm movdqa     xmm1, xmm0                                                \
1562      __asm movdqa     xmm2, xmm0                                                \
1563      __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
1564      __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
1565      __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
1566      __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
1567      __asm psubw      xmm1, kUVBiasG                                            \
1568      __asm psubw      xmm2, kUVBiasR                                            \
1569      /* Step 2: Find Y contribution to 8 R,G,B values */                        \
1570      __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
1571      __asm lea        eax, [eax + 8]                                            \
1572      __asm punpcklbw  xmm3, xmm4                                                \
1573      __asm psubsw     xmm3, kYSub16                                             \
1574      __asm pmullw     xmm3, kYToRgb                                             \
1575      __asm paddsw     xmm0, xmm3           /* B += Y */                         \
1576      __asm paddsw     xmm1, xmm3           /* G += Y */                         \
1577      __asm paddsw     xmm2, xmm3           /* R += Y */                         \
1578      __asm psraw      xmm0, 6                                                   \
1579      __asm psraw      xmm1, 6                                                   \
1580      __asm psraw      xmm2, 6                                                   \
1581      __asm packuswb   xmm0, xmm0           /* B */                              \
1582      __asm packuswb   xmm1, xmm1           /* G */                              \
1583      __asm packuswb   xmm2, xmm2           /* R */                              \
1584    }
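
// Register contract for YUVTORGB (and YVUTORGB below), relied on by every
// row function that uses them: on entry xmm0 holds 8 interleaved UV (or VU)
// bytes, eax points at 8 Y bytes, and xmm4 must be zero (callers clear it
// with pxor before the loop). On exit the low 8 bytes of xmm0, xmm1 and
// xmm2 hold the 8 B, G and R values respectively, and eax is advanced by 8.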
1585  
1586  // Convert 8 pixels: 8 VU and 8 Y.
1587  #define YVUTORGB __asm {                                                       \
1588      /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
1589      __asm movdqa     xmm1, xmm0                                                \
1590      __asm movdqa     xmm2, xmm0                                                \
1591      __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
1592      __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
1593      __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
1594      __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
1595      __asm psubw      xmm1, kUVBiasG                                            \
1596      __asm psubw      xmm2, kUVBiasR                                            \
1597      /* Step 2: Find Y contribution to 8 R,G,B values */                        \
1598      __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
1599      __asm lea        eax, [eax + 8]                                            \
1600      __asm punpcklbw  xmm3, xmm4                                                \
1601      __asm psubsw     xmm3, kYSub16                                             \
1602      __asm pmullw     xmm3, kYToRgb                                             \
1603      __asm paddsw     xmm0, xmm3           /* B += Y */                         \
1604      __asm paddsw     xmm1, xmm3           /* G += Y */                         \
1605      __asm paddsw     xmm2, xmm3           /* R += Y */                         \
1606      __asm psraw      xmm0, 6                                                   \
1607      __asm psraw      xmm1, 6                                                   \
1608      __asm psraw      xmm2, 6                                                   \
1609      __asm packuswb   xmm0, xmm0           /* B */                              \
1610      __asm packuswb   xmm1, xmm1           /* G */                              \
1611      __asm packuswb   xmm2, xmm2           /* R */                              \
1612    }
1613  
1614  // 8 pixels, dest aligned 16.
1615  // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
1616  __declspec(naked) __declspec(align(16))
1617  void I444ToARGBRow_SSSE3(const uint8* y_buf,
1618                           const uint8* u_buf,
1619                           const uint8* v_buf,
1620                           uint8* argb_buf,
1621                           int width) {
1622    __asm {
1623      push       esi
1624      push       edi
1625      mov        eax, [esp + 8 + 4]   // Y
1626      mov        esi, [esp + 8 + 8]   // U
1627      mov        edi, [esp + 8 + 12]  // V
1628      mov        edx, [esp + 8 + 16]  // argb
1629      mov        ecx, [esp + 8 + 20]  // width
1630      sub        edi, esi
1631      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1632      pxor       xmm4, xmm4
1633  
1634      align      16
1635   convertloop:
1636      READYUV444
1637      YUVTORGB
1638  
1639      // Step 3: Weave into ARGB
1640      punpcklbw  xmm0, xmm1           // BG
1641      punpcklbw  xmm2, xmm5           // RA
1642      movdqa     xmm1, xmm0
1643      punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1644      punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1645      movdqa     [edx], xmm0
1646      movdqa     [edx + 16], xmm1
1647      lea        edx,  [edx + 32]
1648      sub        ecx, 8
1649      jg         convertloop
1650  
1651      pop        edi
1652      pop        esi
1653      ret
1654    }
1655  }
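
// A hypothetical caller converting a whole I444 image with the row function
// above; assumes width is a multiple of 8 and argb rows are 16 byte aligned,
// as required by the movdqa stores:
//
//   for (int i = 0; i < height; ++i) {
//     I444ToARGBRow_SSSE3(y, u, v, argb, width);
//     y += y_stride;
//     u += u_stride;
//     v += v_stride;
//     argb += argb_stride;
//   }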
1656  
1657  // 8 pixels, dest aligned 16.
1658  // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1659  __declspec(naked) __declspec(align(16))
1660  void I422ToARGBRow_SSSE3(const uint8* y_buf,
1661                           const uint8* u_buf,
1662                           const uint8* v_buf,
1663                           uint8* argb_buf,
1664                           int width) {
1665    __asm {
1666      push       esi
1667      push       edi
1668      mov        eax, [esp + 8 + 4]   // Y
1669      mov        esi, [esp + 8 + 8]   // U
1670      mov        edi, [esp + 8 + 12]  // V
1671      mov        edx, [esp + 8 + 16]  // argb
1672      mov        ecx, [esp + 8 + 20]  // width
1673      sub        edi, esi
1674      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1675      pxor       xmm4, xmm4
1676  
1677      align      16
1678   convertloop:
1679      READYUV422
1680      YUVTORGB
1681  
1682      // Step 3: Weave into ARGB
1683      punpcklbw  xmm0, xmm1           // BG
1684      punpcklbw  xmm2, xmm5           // RA
1685      movdqa     xmm1, xmm0
1686      punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1687      punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1688      movdqa     [edx], xmm0
1689      movdqa     [edx + 16], xmm1
1690      lea        edx,  [edx + 32]
1691      sub        ecx, 8
1692      jg         convertloop
1693  
1694      pop        edi
1695      pop        esi
1696      ret
1697    }
1698  }
1699  
1700  // 8 pixels, dest aligned 16.
1701  // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1702  // Similar to I422 but duplicates UV once more.
1703  __declspec(naked) __declspec(align(16))
1704  void I411ToARGBRow_SSSE3(const uint8* y_buf,
1705                           const uint8* u_buf,
1706                           const uint8* v_buf,
1707                           uint8* argb_buf,
1708                           int width) {
1709    __asm {
1710      push       esi
1711      push       edi
1712      mov        eax, [esp + 8 + 4]   // Y
1713      mov        esi, [esp + 8 + 8]   // U
1714      mov        edi, [esp + 8 + 12]  // V
1715      mov        edx, [esp + 8 + 16]  // argb
1716      mov        ecx, [esp + 8 + 20]  // width
1717      sub        edi, esi
1718      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1719      pxor       xmm4, xmm4
1720  
1721      align      16
1722   convertloop:
1723      READYUV411
1724      YUVTORGB
1725  
1726      // Step 3: Weave into ARGB
1727      punpcklbw  xmm0, xmm1           // BG
1728      punpcklbw  xmm2, xmm5           // RA
1729      movdqa     xmm1, xmm0
1730      punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1731      punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1732      movdqa     [edx], xmm0
1733      movdqa     [edx + 16], xmm1
1734      lea        edx,  [edx + 32]
1735      sub        ecx, 8
1736      jg         convertloop
1737  
1738      pop        edi
1739      pop        esi
1740      ret
1741    }
1742  }
1743  
1744  // 8 pixels, dest aligned 16.
1745  // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1746  __declspec(naked) __declspec(align(16))
1747  void NV12ToARGBRow_SSSE3(const uint8* y_buf,
1748                           const uint8* uv_buf,
1749                           uint8* argb_buf,
1750                           int width) {
1751    __asm {
1752      push       esi
1753      mov        eax, [esp + 4 + 4]   // Y
1754      mov        esi, [esp + 4 + 8]   // UV
1755      mov        edx, [esp + 4 + 12]  // argb
1756      mov        ecx, [esp + 4 + 16]  // width
1757      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1758      pxor       xmm4, xmm4
1759  
1760      align      16
1761   convertloop:
1762      READNV12
1763      YUVTORGB
1764  
1765      // Step 3: Weave into ARGB
1766      punpcklbw  xmm0, xmm1           // BG
1767      punpcklbw  xmm2, xmm5           // RA
1768      movdqa     xmm1, xmm0
1769      punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1770      punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1771      movdqa     [edx], xmm0
1772      movdqa     [edx + 16], xmm1
1773      lea        edx,  [edx + 32]
1774      sub        ecx, 8
1775      jg         convertloop
1776  
1777      pop        esi
1778      ret
1779    }
1780  }
1781  
1782  // 8 pixels, dest aligned 16.
1783  // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1784  __declspec(naked) __declspec(align(16))
1785  void NV21ToARGBRow_SSSE3(const uint8* y_buf,
1786                           const uint8* uv_buf,
1787                           uint8* argb_buf,
1788                           int width) {
1789    __asm {
1790      push       esi
1791      mov        eax, [esp + 4 + 4]   // Y
1792      mov        esi, [esp + 4 + 8]   // VU
1793      mov        edx, [esp + 4 + 12]  // argb
1794      mov        ecx, [esp + 4 + 16]  // width
1795      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1796      pxor       xmm4, xmm4
1797  
1798      align      16
1799   convertloop:
1800      READNV12
1801      YVUTORGB
1802  
1803      // Step 3: Weave into ARGB
1804      punpcklbw  xmm0, xmm1           // BG
1805      punpcklbw  xmm2, xmm5           // RA
1806      movdqa     xmm1, xmm0
1807      punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1808      punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1809      movdqa     [edx], xmm0
1810      movdqa     [edx + 16], xmm1
1811      lea        edx,  [edx + 32]
1812      sub        ecx, 8
1813      jg         convertloop
1814  
1815      pop        esi
1816      ret
1817    }
1818  }
1819  
1820  // 8 pixels, unaligned.
1821  // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
1822  __declspec(naked) __declspec(align(16))
1823  void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1824                                     const uint8* u_buf,
1825                                     const uint8* v_buf,
1826                                     uint8* argb_buf,
1827                                     int width) {
1828    __asm {
1829      push       esi
1830      push       edi
1831      mov        eax, [esp + 8 + 4]   // Y
1832      mov        esi, [esp + 8 + 8]   // U
1833      mov        edi, [esp + 8 + 12]  // V
1834      mov        edx, [esp + 8 + 16]  // argb
1835      mov        ecx, [esp + 8 + 20]  // width
1836      sub        edi, esi
1837      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1838      pxor       xmm4, xmm4
1839  
1840      align      16
1841   convertloop:
1842      READYUV444
1843      YUVTORGB
1844  
1845      // Step 3: Weave into ARGB
1846      punpcklbw  xmm0, xmm1           // BG
1847      punpcklbw  xmm2, xmm5           // RA
1848      movdqa     xmm1, xmm0
1849      punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1850      punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1851      movdqu     [edx], xmm0
1852      movdqu     [edx + 16], xmm1
1853      lea        edx,  [edx + 32]
1854      sub        ecx, 8
1855      jg         convertloop
1856  
1857      pop        edi
1858      pop        esi
1859      ret
1860    }
1861  }
1862  
1863  // 8 pixels, unaligned.
1864  // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1865  __declspec(naked) __declspec(align(16))
1866  void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1867                                     const uint8* u_buf,
1868                                     const uint8* v_buf,
1869                                     uint8* argb_buf,
1870                                     int width) {
1871    __asm {
1872      push       esi
1873      push       edi
1874      mov        eax, [esp + 8 + 4]   // Y
1875      mov        esi, [esp + 8 + 8]   // U
1876      mov        edi, [esp + 8 + 12]  // V
1877      mov        edx, [esp + 8 + 16]  // argb
1878      mov        ecx, [esp + 8 + 20]  // width
1879      sub        edi, esi
1880      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1881      pxor       xmm4, xmm4
1882  
1883      align      16
1884   convertloop:
1885      READYUV422
1886      YUVTORGB
1887  
1888      // Step 3: Weave into ARGB
1889      punpcklbw  xmm0, xmm1           // BG
1890      punpcklbw  xmm2, xmm5           // RA
1891      movdqa     xmm1, xmm0
1892      punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1893      punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1894      movdqu     [edx], xmm0
1895      movdqu     [edx + 16], xmm1
1896      lea        edx,  [edx + 32]
1897      sub        ecx, 8
1898      jg         convertloop
1899  
1900      pop        edi
1901      pop        esi
1902      ret
1903    }
1904  }
1905  
1906  // 8 pixels, unaligned.
1907  // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1908  // Similar to I422 but duplicates UV once more.
1909  __declspec(naked) __declspec(align(16))
1910  void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1911                                     const uint8* u_buf,
1912                                     const uint8* v_buf,
1913                                     uint8* argb_buf,
1914                                     int width) {
1915    __asm {
1916      push       esi
1917      push       edi
1918      mov        eax, [esp + 8 + 4]   // Y
1919      mov        esi, [esp + 8 + 8]   // U
1920      mov        edi, [esp + 8 + 12]  // V
1921      mov        edx, [esp + 8 + 16]  // argb
1922      mov        ecx, [esp + 8 + 20]  // width
1923      sub        edi, esi
1924      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1925      pxor       xmm4, xmm4
1926  
1927      align      16
1928   convertloop:
1929      READYUV411
1930      YUVTORGB
1931  
1932      // Step 3: Weave into ARGB
1933      punpcklbw  xmm0, xmm1           // BG
1934      punpcklbw  xmm2, xmm5           // RA
1935      movdqa     xmm1, xmm0
1936      punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1937      punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1938      movdqu     [edx], xmm0
1939      movdqu     [edx + 16], xmm1
1940      lea        edx,  [edx + 32]
1941      sub        ecx, 8
1942      jg         convertloop
1943  
1944      pop        edi
1945      pop        esi
1946      ret
1947    }
1948  }
1949  
1950  
1951  // 8 pixels, unaligned.
1952  // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1953  __declspec(naked) __declspec(align(16))
1954  void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1955                                     const uint8* uv_buf,
1956                                     uint8* argb_buf,
1957                                     int width) {
1958    __asm {
1959      push       esi
1960      mov        eax, [esp + 4 + 4]   // Y
1961      mov        esi, [esp + 4 + 8]   // UV
1962      mov        edx, [esp + 4 + 12]  // argb
1963      mov        ecx, [esp + 4 + 16]  // width
1964      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1965      pxor       xmm4, xmm4
1966  
1967      align      16
1968   convertloop:
1969      READNV12
1970      YUVTORGB
1971  
1972      // Step 3: Weave into ARGB
1973      punpcklbw  xmm0, xmm1           // BG
1974      punpcklbw  xmm2, xmm5           // RA
1975      movdqa     xmm1, xmm0
1976      punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1977      punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1978      movdqu     [edx], xmm0
1979      movdqu     [edx + 16], xmm1
1980      lea        edx,  [edx + 32]
1981      sub        ecx, 8
1982      jg         convertloop
1983  
1984      pop        esi
1985      ret
1986    }
1987  }
1988  
1989  // 8 pixels, unaligned.
1990  // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1991  __declspec(naked) __declspec(align(16))
1992  void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1993                                     const uint8* uv_buf,
1994                                     uint8* argb_buf,
1995                                     int width) {
1996    __asm {
1997      push       esi
1998      mov        eax, [esp + 4 + 4]   // Y
1999      mov        esi, [esp + 4 + 8]   // VU
2000      mov        edx, [esp + 4 + 12]  // argb
2001      mov        ecx, [esp + 4 + 16]  // width
2002      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2003      pxor       xmm4, xmm4
2004  
2005      align      16
2006   convertloop:
2007      READNV12
2008      YVUTORGB
2009  
2010      // Step 3: Weave into ARGB
2011      punpcklbw  xmm0, xmm1           // BG
2012      punpcklbw  xmm2, xmm5           // RA
2013      movdqa     xmm1, xmm0
2014      punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
2015      punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
2016      movdqu     [edx], xmm0
2017      movdqu     [edx + 16], xmm1
2018      lea        edx,  [edx + 32]
2019      sub        ecx, 8
2020      jg         convertloop
2021  
2022      pop        esi
2023      ret
2024    }
2025  }
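
// Note NV21 reuses READNV12 unchanged: the only difference from NV12 is the
// VU byte order, which YVUTORGB absorbs by multiplying with the swapped
// kVUToB/kVUToG/kVUToR constants instead of the kUVTo* set.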
2026  
2027  __declspec(naked) __declspec(align(16))
2028  void I422ToBGRARow_SSSE3(const uint8* y_buf,
2029                           const uint8* u_buf,
2030                           const uint8* v_buf,
2031                           uint8* bgra_buf,
2032                           int width) {
2033    __asm {
2034      push       esi
2035      push       edi
2036      mov        eax, [esp + 8 + 4]   // Y
2037      mov        esi, [esp + 8 + 8]   // U
2038      mov        edi, [esp + 8 + 12]  // V
2039      mov        edx, [esp + 8 + 16]  // bgra
2040      mov        ecx, [esp + 8 + 20]  // width
2041      sub        edi, esi
2042      pxor       xmm4, xmm4
2043  
2044      align      16
2045   convertloop:
2046      READYUV422
2047      YUVTORGB
2048  
2049      // Step 3: Weave into BGRA
2050      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2051      punpcklbw  xmm1, xmm0           // GB
2052      punpcklbw  xmm5, xmm2           // AR
2053      movdqa     xmm0, xmm5
2054      punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
2055      punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
2056      movdqa     [edx], xmm5
2057      movdqa     [edx + 16], xmm0
2058      lea        edx,  [edx + 32]
2059      sub        ecx, 8
2060      jg         convertloop
2061  
2062      pop        edi
2063      pop        esi
2064      ret
2065    }
2066  }
2067  
2068  __declspec(naked) __declspec(align(16))
2069  void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
2070                                     const uint8* u_buf,
2071                                     const uint8* v_buf,
2072                                     uint8* bgra_buf,
2073                                     int width) {
2074    __asm {
2075      push       esi
2076      push       edi
2077      mov        eax, [esp + 8 + 4]   // Y
2078      mov        esi, [esp + 8 + 8]   // U
2079      mov        edi, [esp + 8 + 12]  // V
2080      mov        edx, [esp + 8 + 16]  // bgra
2081      mov        ecx, [esp + 8 + 20]  // width
2082      sub        edi, esi
2083      pxor       xmm4, xmm4
2084  
2085      align      16
2086   convertloop:
2087      READYUV422
2088      YUVTORGB
2089  
2090      // Step 3: Weave into BGRA
2091      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2092      punpcklbw  xmm1, xmm0           // GB
2093      punpcklbw  xmm5, xmm2           // AR
2094      movdqa     xmm0, xmm5
2095      punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
2096      punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
2097      movdqu     [edx], xmm5
2098      movdqu     [edx + 16], xmm0
2099      lea        edx,  [edx + 32]
2100      sub        ecx, 8
2101      jg         convertloop
2102  
2103      pop        edi
2104      pop        esi
2105      ret
2106    }
2107  }
2108  
2109  __declspec(naked) __declspec(align(16))
2110  void I422ToABGRRow_SSSE3(const uint8* y_buf,
2111                           const uint8* u_buf,
2112                           const uint8* v_buf,
2113                           uint8* abgr_buf,
2114                           int width) {
2115    __asm {
2116      push       esi
2117      push       edi
2118      mov        eax, [esp + 8 + 4]   // Y
2119      mov        esi, [esp + 8 + 8]   // U
2120      mov        edi, [esp + 8 + 12]  // V
2121      mov        edx, [esp + 8 + 16]  // abgr
2122      mov        ecx, [esp + 8 + 20]  // width
2123      sub        edi, esi
2124      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2125      pxor       xmm4, xmm4
2126  
2127      align      16
2128   convertloop:
2129      READYUV422
2130      YUVTORGB
2131  
2132      // Step 3: Weave into ARGB
2133      punpcklbw  xmm2, xmm1           // RG
2134      punpcklbw  xmm0, xmm5           // BA
2135      movdqa     xmm1, xmm2
2136      punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
2137      punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
2138      movdqa     [edx], xmm2
2139      movdqa     [edx + 16], xmm1
2140      lea        edx,  [edx + 32]
2141      sub        ecx, 8
2142      jg         convertloop
2143  
2144      pop        edi
2145      pop        esi
2146      ret
2147    }
2148  }
2149  
2150  __declspec(naked) __declspec(align(16))
2151  void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
2152                                     const uint8* u_buf,
2153                                     const uint8* v_buf,
2154                                     uint8* abgr_buf,
2155                                     int width) {
2156    __asm {
2157      push       esi
2158      push       edi
2159      mov        eax, [esp + 8 + 4]   // Y
2160      mov        esi, [esp + 8 + 8]   // U
2161      mov        edi, [esp + 8 + 12]  // V
2162      mov        edx, [esp + 8 + 16]  // abgr
2163      mov        ecx, [esp + 8 + 20]  // width
2164      sub        edi, esi
2165      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2166      pxor       xmm4, xmm4
2167  
2168      align      16
2169   convertloop:
2170      READYUV422
2171      YUVTORGB
2172  
2173      // Step 3: Weave into ARGB
2174      punpcklbw  xmm2, xmm1           // RG
2175      punpcklbw  xmm0, xmm5           // BA
2176      movdqa     xmm1, xmm2
2177      punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
2178      punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
2179      movdqu     [edx], xmm2
2180      movdqu     [edx + 16], xmm1
2181      lea        edx,  [edx + 32]
2182      sub        ecx, 8
2183      jg         convertloop
2184  
2185      pop        edi
2186      pop        esi
2187      ret
2188    }
2189  }
2190  
2191  __declspec(naked) __declspec(align(16))
2192  void I422ToRGBARow_SSSE3(const uint8* y_buf,
2193                           const uint8* u_buf,
2194                           const uint8* v_buf,
2195                           uint8* rgba_buf,
2196                           int width) {
2197    __asm {
2198      push       esi
2199      push       edi
2200      mov        eax, [esp + 8 + 4]   // Y
2201      mov        esi, [esp + 8 + 8]   // U
2202      mov        edi, [esp + 8 + 12]  // V
2203      mov        edx, [esp + 8 + 16]  // rgba
2204      mov        ecx, [esp + 8 + 20]  // width
2205      sub        edi, esi
2206      pxor       xmm4, xmm4
2207  
2208      align      16
2209   convertloop:
2210      READYUV422
2211      YUVTORGB
2212  
2213      // Step 3: Weave into RGBA
2214      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2215      punpcklbw  xmm1, xmm2           // GR
2216      punpcklbw  xmm5, xmm0           // AB
2217      movdqa     xmm0, xmm5
2218      punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
2219      punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
2220      movdqa     [edx], xmm5
2221      movdqa     [edx + 16], xmm0
2222      lea        edx,  [edx + 32]
2223      sub        ecx, 8
2224      jg         convertloop
2225  
2226      pop        edi
2227      pop        esi
2228      ret
2229    }
2230  }
2231  
2232  __declspec(naked) __declspec(align(16))
2233  void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2234                                     const uint8* u_buf,
2235                                     const uint8* v_buf,
2236                                     uint8* rgba_buf,
2237                                     int width) {
2238    __asm {
2239      push       esi
2240      push       edi
2241      mov        eax, [esp + 8 + 4]   // Y
2242      mov        esi, [esp + 8 + 8]   // U
2243      mov        edi, [esp + 8 + 12]  // V
2244      mov        edx, [esp + 8 + 16]  // rgba
2245      mov        ecx, [esp + 8 + 20]  // width
2246      sub        edi, esi
2247      pxor       xmm4, xmm4
2248  
2249      align      16
2250   convertloop:
2251      READYUV422
2252      YUVTORGB
2253  
2254      // Step 3: Weave into RGBA
2255      pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2256      punpcklbw  xmm1, xmm2           // GR
2257      punpcklbw  xmm5, xmm0           // AB
2258      movdqa     xmm0, xmm5
2259      punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
2260      punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
2261      movdqu     [edx], xmm5
2262      movdqu     [edx + 16], xmm0
2263      lea        edx,  [edx + 32]
2264      sub        ecx, 8
2265      jg         convertloop
2266  
2267      pop        edi
2268      pop        esi
2269      ret
2270    }
2271  }
2272  
2273  #endif  // HAS_I422TOARGBROW_SSSE3
2274  
2275  #ifdef HAS_YTOARGBROW_SSE2
2276  __declspec(naked) __declspec(align(16))
2277  void YToARGBRow_SSE2(const uint8* y_buf,
2278                       uint8* rgb_buf,
2279                       int width) {
2280    __asm {
2281      pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
2282      pslld      xmm4, 24
2283      mov        eax, 0x10001000
2284      movd       xmm3, eax
2285      pshufd     xmm3, xmm3, 0
2286      mov        eax, 0x012a012a
2287      movd       xmm2, eax
2288      pshufd     xmm2, xmm2, 0
2289      mov        eax, [esp + 4]       // Y
2290      mov        edx, [esp + 8]       // rgb
2291      mov        ecx, [esp + 12]      // width
2292  
2293      align      16
2294   convertloop:
2295      // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2296      movq       xmm0, qword ptr [eax]
2297      lea        eax, [eax + 8]
2298      punpcklbw  xmm0, xmm0           // Y.Y
2299      psubusw    xmm0, xmm3
2300      pmulhuw    xmm0, xmm2
2301      packuswb   xmm0, xmm0           // G
2302  
2303      // Step 2: Weave into ARGB
2304      punpcklbw  xmm0, xmm0           // GG
2305      movdqa     xmm1, xmm0
2306      punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
2307      punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
2308      por        xmm0, xmm4
2309      por        xmm1, xmm4
2310      movdqa     [edx], xmm0
2311      movdqa     [edx + 16], xmm1
2312      lea        edx,  [edx + 32]
2313      sub        ecx, 8
2314      jg         convertloop
2315  
2316      ret
2317    }
2318  }
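
// The constants loaded above implement (y - 16) * 1.164 in 8.8 fixed point:
// punpcklbw xmm0, xmm0 turns each Y byte into the word y * 257 (~ y << 8),
// psubusw subtracts 0x1000 (16 << 8) with unsigned saturation (clamping at
// zero for y < 16), and pmulhuw by 0x012a (298 ~ 1.164 * 256) keeps the high
// 16 bits, i.e. approximately ((y << 8) - 4096) * 298 >> 16 = (y - 16) * 1.164.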
2319  #endif  // HAS_YTOARGBROW_SSE2
2320  
2321  #ifdef HAS_MIRRORROW_SSSE3
2322  
2323  // Shuffle table for reversing the bytes.
2324  static const uvec8 kShuffleMirror = {
2325    15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2326  };
2327  
2328  __declspec(naked) __declspec(align(16))
2329  void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2330    __asm {
2331      mov       eax, [esp + 4]   // src
2332      mov       edx, [esp + 8]   // dst
2333      mov       ecx, [esp + 12]  // width
2334      movdqa    xmm5, kShuffleMirror
2335      lea       eax, [eax - 16]
2336  
2337      align      16
2338   convertloop:
2339      movdqa    xmm0, [eax + ecx]
2340      pshufb    xmm0, xmm5
2341      sub       ecx, 16
2342      movdqa    [edx], xmm0
2343      lea       edx, [edx + 16]
2344      jg        convertloop
2345      ret
2346    }
2347  }
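
// The loop above walks the source backwards without a second pointer: eax is
// biased to src - 16, so [eax + ecx] reads the last 16 bytes of the row
// first, and decrementing ecx by 16 slides the window toward the start.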
2348  #endif  // HAS_MIRRORROW_SSSE3
2349  
2350  #ifdef HAS_MIRRORROW_SSE2
2351  // The SSE2 version has movdqu so it can be used on unaligned buffers when the
2352  // SSSE3 version cannot.
2353  __declspec(naked) __declspec(align(16))
2354  void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2355    __asm {
2356      mov       eax, [esp + 4]   // src
2357      mov       edx, [esp + 8]   // dst
2358      mov       ecx, [esp + 12]  // width
2359      lea       eax, [eax - 16]
2360  
2361      align      16
2362   convertloop:
2363      movdqu    xmm0, [eax + ecx]
2364      movdqa    xmm1, xmm0        // swap bytes
2365      psllw     xmm0, 8
2366      psrlw     xmm1, 8
2367      por       xmm0, xmm1
2368      pshuflw   xmm0, xmm0, 0x1b  // swap words
2369      pshufhw   xmm0, xmm0, 0x1b
2370      pshufd    xmm0, xmm0, 0x4e  // swap qwords
2371      sub       ecx, 16
2372      movdqu    [edx], xmm0
2373      lea       edx, [edx + 16]
2374      jg        convertloop
2375      ret
2376    }
2377  }
2378  #endif  // HAS_MIRRORROW_SSE2
2379  
2380  #ifdef HAS_MIRRORROW_UV_SSSE3
2381  // Shuffle table for reversing the bytes of UV channels.
2382  static const uvec8 kShuffleMirrorUV = {
2383    14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2384  };
2385  
2386  __declspec(naked) __declspec(align(16))
2387  void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2388                         int width) {
2389    __asm {
2390      push      edi
2391      mov       eax, [esp + 4 + 4]   // src
2392      mov       edx, [esp + 4 + 8]   // dst_u
2393      mov       edi, [esp + 4 + 12]  // dst_v
2394      mov       ecx, [esp + 4 + 16]  // width
2395      movdqa    xmm1, kShuffleMirrorUV
2396      lea       eax, [eax + ecx * 2 - 16]
2397      sub       edi, edx
2398  
2399      align      16
2400   convertloop:
2401      movdqa    xmm0, [eax]
2402      lea       eax, [eax - 16]
2403      pshufb    xmm0, xmm1
2404      sub       ecx, 8
2405      movlpd    qword ptr [edx], xmm0
2406      movhpd    qword ptr [edx + edi], xmm0
2407      lea       edx, [edx + 8]
2408      jg        convertloop
2409  
2410      pop       edi
2411      ret
2412    }
2413  }
2414  #endif  // HAS_MIRRORROW_UV_SSSE3
2415  
2416  #ifdef HAS_ARGBMIRRORROW_SSSE3
2417  
2418  // Shuffle table for reversing the bytes.
2419  static const uvec8 kARGBShuffleMirror = {
2420    12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2421  };
2422  
2423  __declspec(naked) __declspec(align(16))
2424  void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2425    __asm {
2426      mov       eax, [esp + 4]   // src
2427      mov       edx, [esp + 8]   // dst
2428      mov       ecx, [esp + 12]  // width
2429      movdqa    xmm5, kARGBShuffleMirror
2430      lea       eax, [eax - 16]
2431  
2432      align      16
2433   convertloop:
2434      movdqa    xmm0, [eax + ecx * 4]
2435      pshufb    xmm0, xmm5
2436      sub       ecx, 4
2437      movdqa    [edx], xmm0
2438      lea       edx, [edx + 16]
2439      jg        convertloop
2440      ret
2441    }
2442  }
2443  #endif  // HAS_ARGBMIRRORROW_SSSE3
2444  
2445  #ifdef HAS_SPLITUV_SSE2
2446  __declspec(naked) __declspec(align(16))
2447  void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2448    __asm {
2449      push       edi
2450      mov        eax, [esp + 4 + 4]    // src_uv
2451      mov        edx, [esp + 4 + 8]    // dst_u
2452      mov        edi, [esp + 4 + 12]   // dst_v
2453      mov        ecx, [esp + 4 + 16]   // pix
2454      pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2455      psrlw      xmm5, 8
2456      sub        edi, edx
2457  
2458      align      16
2459    convertloop:
2460      movdqa     xmm0, [eax]
2461      movdqa     xmm1, [eax + 16]
2462      lea        eax,  [eax + 32]
2463      movdqa     xmm2, xmm0
2464      movdqa     xmm3, xmm1
2465      pand       xmm0, xmm5   // even bytes
2466      pand       xmm1, xmm5
2467      packuswb   xmm0, xmm1
2468      psrlw      xmm2, 8      // odd bytes
2469      psrlw      xmm3, 8
2470      packuswb   xmm2, xmm3
2471      movdqa     [edx], xmm0
2472      movdqa     [edx + edi], xmm2
2473      lea        edx, [edx + 16]
2474      sub        ecx, 16
2475      jg         convertloop
2476  
2477      pop        edi
2478      ret
2479    }
2480  }
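
// A scalar sketch (hypothetical, for clarity) of the same de-interleave; the
// SSE2 version above produces 16 U and 16 V bytes per iteration using the
// 0x00ff00ff mask for even bytes and a shift for odd bytes:
static inline void SplitUVSketch(const uint8* src_uv, uint8* dst_u,
                                 uint8* dst_v, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[0];  // even bytes are U
    dst_v[x] = src_uv[1];  // odd bytes are V
    src_uv += 2;
  }
}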
2481  #endif  // HAS_SPLITUV_SSE2
2482  
2483  #ifdef HAS_COPYROW_SSE2
2484  // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
2485  __declspec(naked) __declspec(align(16))
2486  void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2487    __asm {
2488      mov        eax, [esp + 4]   // src
2489      mov        edx, [esp + 8]   // dst
2490      mov        ecx, [esp + 12]  // count
2491      sub        edx, eax
2492  
2493      align      16
2494    convertloop:
2495      movdqa     xmm0, [eax]
2496      movdqa     xmm1, [eax + 16]
2497      movdqa     [eax + edx], xmm0
2498      movdqa     [eax + edx + 16], xmm1
2499      lea        eax, [eax + 32]
2500      sub        ecx, 32
2501      jg         convertloop
2502      ret
2503    }
2504  }
2505  #endif  // HAS_COPYROW_SSE2
2506  
2507  #ifdef HAS_COPYROW_X86
2508  __declspec(naked) __declspec(align(16))
2509  void CopyRow_X86(const uint8* src, uint8* dst, int count) {
2510    __asm {
2511      mov        eax, esi
2512      mov        edx, edi
2513      mov        esi, [esp + 4]   // src
2514      mov        edi, [esp + 8]   // dst
2515      mov        ecx, [esp + 12]  // count
2516      shr        ecx, 2
2517      rep movsd
2518      mov        edi, edx
2519      mov        esi, eax
2520      ret
2521    }
2522  }
2523  #endif  // HAS_COPYROW_X86
2524  
2525  #ifdef HAS_SETROW_X86
2526  // SetRow8 writes 'count' bytes using a 32 bit value repeated.
2527  __declspec(naked) __declspec(align(16))
2528  void SetRow8_X86(uint8* dst, uint32 v32, int count) {
2529    __asm {
2530      mov        edx, edi
2531      mov        edi, [esp + 4]   // dst
2532      mov        eax, [esp + 8]   // v32
2533      mov        ecx, [esp + 12]  // count
2534      shr        ecx, 2
2535      rep stosd
2536      mov        edi, edx
2537      ret
2538    }
2539  }
2540  
2541  // SetRows32 writes 'width' 32 bit values, repeated for 'height' rows.
2542  __declspec(naked) __declspec(align(16))
2543  void SetRows32_X86(uint8* dst, uint32 v32, int width,
2544                     int dst_stride, int height) {
2545    __asm {
2546      push       esi
2547      push       edi
2548      push       ebp
2549      mov        edi, [esp + 12 + 4]   // dst
2550      mov        eax, [esp + 12 + 8]   // v32
2551      mov        ebp, [esp + 12 + 12]  // width
2552      mov        edx, [esp + 12 + 16]  // dst_stride
2553      mov        esi, [esp + 12 + 20]  // height
2554      lea        ecx, [ebp * 4]
2555      sub        edx, ecx             // stride - width * 4
2556  
2557      align      16
2558    convertloop:
2559      mov        ecx, ebp
2560      rep stosd
2561      add        edi, edx
2562      sub        esi, 1
2563      jg         convertloop
2564  
2565      pop        ebp
2566      pop        edi
2567      pop        esi
2568      ret
2569    }
2570  }
2571  #endif  // HAS_SETROW_X86
2572  
2573  #ifdef HAS_YUY2TOYROW_SSE2
2574  __declspec(naked) __declspec(align(16))
2575  void YUY2ToYRow_SSE2(const uint8* src_yuy2,
2576                       uint8* dst_y, int pix) {
2577    __asm {
2578      mov        eax, [esp + 4]    // src_yuy2
2579      mov        edx, [esp + 8]    // dst_y
2580      mov        ecx, [esp + 12]   // pix
2581      pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
2582      psrlw      xmm5, 8
2583  
2584      align      16
2585    convertloop:
2586      movdqa     xmm0, [eax]
2587      movdqa     xmm1, [eax + 16]
2588      lea        eax,  [eax + 32]
2589      pand       xmm0, xmm5   // even bytes are Y
2590      pand       xmm1, xmm5
2591      packuswb   xmm0, xmm1
2592      sub        ecx, 16
2593      movdqa     [edx], xmm0
2594      lea        edx, [edx + 16]
2595      jg         convertloop
2596      ret
2597    }
2598  }
2599  
2600  __declspec(naked) __declspec(align(16))
2601  void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
2602                        uint8* dst_u, uint8* dst_v, int pix) {
2603    __asm {
2604      push       esi
2605      push       edi
2606      mov        eax, [esp + 8 + 4]    // src_yuy2
2607      mov        esi, [esp + 8 + 8]    // stride_yuy2
2608      mov        edx, [esp + 8 + 12]   // dst_u
2609      mov        edi, [esp + 8 + 16]   // dst_v
2610      mov        ecx, [esp + 8 + 20]   // pix
2611      pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2612      psrlw      xmm5, 8
2613      sub        edi, edx
2614  
2615      align      16
2616    convertloop:
2617      movdqa     xmm0, [eax]
2618      movdqa     xmm1, [eax + 16]
2619      movdqa     xmm2, [eax + esi]
2620      movdqa     xmm3, [eax + esi + 16]
2621      lea        eax,  [eax + 32]
2622      pavgb      xmm0, xmm2
2623      pavgb      xmm1, xmm3
2624      psrlw      xmm0, 8      // YUYV -> UVUV
2625      psrlw      xmm1, 8
2626      packuswb   xmm0, xmm1
2627      movdqa     xmm1, xmm0
2628      pand       xmm0, xmm5  // U
2629      packuswb   xmm0, xmm0
2630      psrlw      xmm1, 8     // V
2631      packuswb   xmm1, xmm1
2632      movq       qword ptr [edx], xmm0
2633      movq       qword ptr [edx + edi], xmm1
2634      lea        edx, [edx + 8]
2635      sub        ecx, 16
2636      jg         convertloop
2637  
2638      pop        edi
2639      pop        esi
2640      ret
2641    }
2642  }
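
// A scalar sketch (hypothetical) of the chroma path above: YUY2 packs each
// pixel pair as Y0 U Y1 V, and pavgb rounds while averaging the two rows:
static inline void Yuy2ToUVSketch(const uint8* src_yuy2, int stride_yuy2,
                                  uint8* dst_u, uint8* dst_v, int pix) {
  for (int x = 0; x < pix; x += 2) {
    dst_u[x / 2] = static_cast<uint8>(
        (src_yuy2[1] + src_yuy2[stride_yuy2 + 1] + 1) >> 1);
    dst_v[x / 2] = static_cast<uint8>(
        (src_yuy2[3] + src_yuy2[stride_yuy2 + 3] + 1) >> 1);
    src_yuy2 += 4;
  }
}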
2643  
2644  __declspec(naked) __declspec(align(16))
2645  void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2646                           uint8* dst_u, uint8* dst_v, int pix) {
2647    __asm {
2648      push       edi
2649      mov        eax, [esp + 4 + 4]    // src_yuy2
2650      mov        edx, [esp + 4 + 8]    // dst_u
2651      mov        edi, [esp + 4 + 12]   // dst_v
2652      mov        ecx, [esp + 4 + 16]   // pix
2653      pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2654      psrlw      xmm5, 8
2655      sub        edi, edx
2656  
2657      align      16
2658    convertloop:
2659      movdqa     xmm0, [eax]
2660      movdqa     xmm1, [eax + 16]
2661      lea        eax,  [eax + 32]
2662      psrlw      xmm0, 8      // YUYV -> UVUV
2663      psrlw      xmm1, 8
2664      packuswb   xmm0, xmm1
2665      movdqa     xmm1, xmm0
2666      pand       xmm0, xmm5  // U
2667      packuswb   xmm0, xmm0
2668      psrlw      xmm1, 8     // V
2669      packuswb   xmm1, xmm1
2670      movq       qword ptr [edx], xmm0
2671      movq       qword ptr [edx + edi], xmm1
2672      lea        edx, [edx + 8]
2673      sub        ecx, 16
2674      jg         convertloop
2675  
2676      pop        edi
2677      ret
2678    }
2679  }
2680  
2681  __declspec(naked) __declspec(align(16))
2682  void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2683                                 uint8* dst_y, int pix) {
2684    __asm {
2685      mov        eax, [esp + 4]    // src_yuy2
2686      mov        edx, [esp + 8]    // dst_y
2687      mov        ecx, [esp + 12]   // pix
2688      pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
2689      psrlw      xmm5, 8
2690  
2691      align      16
2692    convertloop:
2693      movdqu     xmm0, [eax]
2694      movdqu     xmm1, [eax + 16]
2695      lea        eax,  [eax + 32]
2696      pand       xmm0, xmm5   // even bytes are Y
2697      pand       xmm1, xmm5
2698      packuswb   xmm0, xmm1
2699      sub        ecx, 16
2700      movdqu     [edx], xmm0
2701      lea        edx, [edx + 16]
2702      jg         convertloop
2703      ret
2704    }
2705  }
2706  
2707  __declspec(naked) __declspec(align(16))
2708  void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
2709                                  uint8* dst_u, uint8* dst_v, int pix) {
2710    __asm {
2711      push       esi
2712      push       edi
2713      mov        eax, [esp + 8 + 4]    // src_yuy2
2714      mov        esi, [esp + 8 + 8]    // stride_yuy2
2715      mov        edx, [esp + 8 + 12]   // dst_u
2716      mov        edi, [esp + 8 + 16]   // dst_v
2717      mov        ecx, [esp + 8 + 20]   // pix
2718      pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2719      psrlw      xmm5, 8
2720      sub        edi, edx
2721  
2722      align      16
2723    convertloop:
2724      movdqu     xmm0, [eax]
2725      movdqu     xmm1, [eax + 16]
2726      movdqu     xmm2, [eax + esi]
2727      movdqu     xmm3, [eax + esi + 16]
2728      lea        eax,  [eax + 32]
2729      pavgb      xmm0, xmm2
2730      pavgb      xmm1, xmm3
2731      psrlw      xmm0, 8      // YUYV -> UVUV
2732      psrlw      xmm1, 8
2733      packuswb   xmm0, xmm1
2734      movdqa     xmm1, xmm0
2735      pand       xmm0, xmm5  // U
2736      packuswb   xmm0, xmm0
2737      psrlw      xmm1, 8     // V
2738      packuswb   xmm1, xmm1
2739      movq       qword ptr [edx], xmm0
2740      movq       qword ptr [edx + edi], xmm1
2741      lea        edx, [edx + 8]
2742      sub        ecx, 16
2743      jg         convertloop
2744  
2745      pop        edi
2746      pop        esi
2747      ret
2748    }
2749  }
2750  
2751  __declspec(naked) __declspec(align(16))
2752  void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2753                                     uint8* dst_u, uint8* dst_v, int pix) {
2754    __asm {
2755      push       edi
2756      mov        eax, [esp + 4 + 4]    // src_yuy2
2757      mov        edx, [esp + 4 + 8]    // dst_u
2758      mov        edi, [esp + 4 + 12]   // dst_v
2759      mov        ecx, [esp + 4 + 16]   // pix
2760      pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2761      psrlw      xmm5, 8
2762      sub        edi, edx
2763  
2764      align      16
2765    convertloop:
2766      movdqu     xmm0, [eax]
2767      movdqu     xmm1, [eax + 16]
2768      lea        eax,  [eax + 32]
2769      psrlw      xmm0, 8      // YUYV -> UVUV
2770      psrlw      xmm1, 8
2771      packuswb   xmm0, xmm1
2772      movdqa     xmm1, xmm0
2773      pand       xmm0, xmm5  // U
2774      packuswb   xmm0, xmm0
2775      psrlw      xmm1, 8     // V
2776      packuswb   xmm1, xmm1
2777      movq       qword ptr [edx], xmm0
2778      movq       qword ptr [edx + edi], xmm1
2779      lea        edx, [edx + 8]
2780      sub        ecx, 16
2781      jg         convertloop
2782  
2783      pop        edi
2784      ret
2785    }
2786  }
2787  
2788  __declspec(naked) __declspec(align(16))
2789  void UYVYToYRow_SSE2(const uint8* src_uyvy,
2790                       uint8* dst_y, int pix) {
2791    __asm {
2792      mov        eax, [esp + 4]    // src_uyvy
2793      mov        edx, [esp + 8]    // dst_y
2794      mov        ecx, [esp + 12]   // pix
2795  
2796      align      16
2797    convertloop:
2798      movdqa     xmm0, [eax]
2799      movdqa     xmm1, [eax + 16]
2800      lea        eax,  [eax + 32]
2801      psrlw      xmm0, 8    // odd bytes are Y
2802      psrlw      xmm1, 8
2803      packuswb   xmm0, xmm1
2804      sub        ecx, 16
2805      movdqa     [edx], xmm0
2806      lea        edx, [edx + 16]
2807      jg         convertloop
2808      ret
2809    }
2810  }
2811  
2812  __declspec(naked) __declspec(align(16))
2813  void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2814                        uint8* dst_u, uint8* dst_v, int pix) {
2815    __asm {
2816      push       esi
2817      push       edi
2818      mov        eax, [esp + 8 + 4]    // src_uyvy
2819      mov        esi, [esp + 8 + 8]    // stride_uyvy
2820      mov        edx, [esp + 8 + 12]   // dst_u
2821      mov        edi, [esp + 8 + 16]   // dst_v
2822      mov        ecx, [esp + 8 + 20]   // pix
2823      pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2824      psrlw      xmm5, 8
2825      sub        edi, edx
2826  
2827      align      16
2828    convertloop:
2829      movdqa     xmm0, [eax]
2830      movdqa     xmm1, [eax + 16]
2831      movdqa     xmm2, [eax + esi]
2832      movdqa     xmm3, [eax + esi + 16]
2833      lea        eax,  [eax + 32]
2834      pavgb      xmm0, xmm2
2835      pavgb      xmm1, xmm3
2836      pand       xmm0, xmm5   // UYVY -> UVUV
2837      pand       xmm1, xmm5
2838      packuswb   xmm0, xmm1
2839      movdqa     xmm1, xmm0
2840      pand       xmm0, xmm5  // U
2841      packuswb   xmm0, xmm0
2842      psrlw      xmm1, 8     // V
2843      packuswb   xmm1, xmm1
2844      movq       qword ptr [edx], xmm0
2845      movq       qword ptr [edx + edi], xmm1
2846      lea        edx, [edx + 8]
2847      sub        ecx, 16
2848      jg         convertloop
2849  
2850      pop        edi
2851      pop        esi
2852      ret
2853    }
2854  }
2855  
2856  __declspec(naked) __declspec(align(16))
2857  void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2858                           uint8* dst_u, uint8* dst_v, int pix) {
2859    __asm {
2860      push       edi
2861      mov        eax, [esp + 4 + 4]    // src_uyvy
2862      mov        edx, [esp + 4 + 8]    // dst_u
2863      mov        edi, [esp + 4 + 12]   // dst_v
2864      mov        ecx, [esp + 4 + 16]   // pix
2865      pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2866      psrlw      xmm5, 8
2867      sub        edi, edx
2868  
2869      align      16
2870    convertloop:
2871      movdqa     xmm0, [eax]
2872      movdqa     xmm1, [eax + 16]
2873      lea        eax,  [eax + 32]
2874      pand       xmm0, xmm5   // UYVY -> UVUV
2875      pand       xmm1, xmm5
2876      packuswb   xmm0, xmm1
2877      movdqa     xmm1, xmm0
2878      pand       xmm0, xmm5  // U
2879      packuswb   xmm0, xmm0
2880      psrlw      xmm1, 8     // V
2881      packuswb   xmm1, xmm1
2882      movq       qword ptr [edx], xmm0
2883      movq       qword ptr [edx + edi], xmm1
2884      lea        edx, [edx + 8]
2885      sub        ecx, 16
2886      jg         convertloop
2887  
2888      pop        edi
2889      ret
2890    }
2891  }
2892  
2893  __declspec(naked) __declspec(align(16))
2894  void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2895                                 uint8* dst_y, int pix) {
2896    __asm {
2897      mov        eax, [esp + 4]    // src_uyvy
2898      mov        edx, [esp + 8]    // dst_y
2899      mov        ecx, [esp + 12]   // pix
2900  
2901      align      16
2902    convertloop:
2903      movdqu     xmm0, [eax]
2904      movdqu     xmm1, [eax + 16]
2905      lea        eax,  [eax + 32]
2906      psrlw      xmm0, 8    // odd bytes are Y
2907      psrlw      xmm1, 8
2908      packuswb   xmm0, xmm1
2909      sub        ecx, 16
2910      movdqu     [edx], xmm0
2911      lea        edx, [edx + 16]
2912      jg         convertloop
2913      ret
2914    }
2915  }
2916  
2917  __declspec(naked) __declspec(align(16))
2918  void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
2919                                  uint8* dst_u, uint8* dst_v, int pix) {
2920    __asm {
2921      push       esi
2922      push       edi
2923      mov        eax, [esp + 8 + 4]    // src_uyvy
2924      mov        esi, [esp + 8 + 8]    // stride_uyvy
2925      mov        edx, [esp + 8 + 12]   // dst_u
2926      mov        edi, [esp + 8 + 16]   // dst_v
2927      mov        ecx, [esp + 8 + 20]   // pix
2928      pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2929      psrlw      xmm5, 8
2930      sub        edi, edx
2931  
2932      align      16
2933    convertloop:
2934      movdqu     xmm0, [eax]
2935      movdqu     xmm1, [eax + 16]
2936      movdqu     xmm2, [eax + esi]
2937      movdqu     xmm3, [eax + esi + 16]
2938      lea        eax,  [eax + 32]
2939      pavgb      xmm0, xmm2
2940      pavgb      xmm1, xmm3
2941      pand       xmm0, xmm5   // UYVY -> UVUV
2942      pand       xmm1, xmm5
2943      packuswb   xmm0, xmm1
2944      movdqa     xmm1, xmm0
2945      pand       xmm0, xmm5  // U
2946      packuswb   xmm0, xmm0
2947      psrlw      xmm1, 8     // V
2948      packuswb   xmm1, xmm1
2949      movq       qword ptr [edx], xmm0
2950      movq       qword ptr [edx + edi], xmm1
2951      lea        edx, [edx + 8]
2952      sub        ecx, 16
2953      jg         convertloop
2954  
2955      pop        edi
2956      pop        esi
2957      ret
2958    }
2959  }
2960  
2961  __declspec(naked) __declspec(align(16))
2962  void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
2963                                     uint8* dst_u, uint8* dst_v, int pix) {
2964    __asm {
2965      push       edi
2966      mov        eax, [esp + 4 + 4]    // src_uyvy
2967      mov        edx, [esp + 4 + 8]    // dst_u
2968      mov        edi, [esp + 4 + 12]   // dst_v
2969      mov        ecx, [esp + 4 + 16]   // pix
2970      pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2971      psrlw      xmm5, 8
2972      sub        edi, edx
2973  
2974      align      16
2975    convertloop:
2976      movdqu     xmm0, [eax]
2977      movdqu     xmm1, [eax + 16]
2978      lea        eax,  [eax + 32]
2979      pand       xmm0, xmm5   // UYVY -> UVUV
2980      pand       xmm1, xmm5
2981      packuswb   xmm0, xmm1
2982      movdqa     xmm1, xmm0
2983      pand       xmm0, xmm5  // U
2984      packuswb   xmm0, xmm0
2985      psrlw      xmm1, 8     // V
2986      packuswb   xmm1, xmm1
2987      movq       qword ptr [edx], xmm0
2988      movq       qword ptr [edx + edi], xmm1
2989      lea        edx, [edx + 8]
2990      sub        ecx, 16
2991      jg         convertloop
2992  
2993      pop        edi
2994      ret
2995    }
2996  }
2997  #endif  // HAS_YUY2TOYROW_SSE2
2998  
2999  #ifdef HAS_ARGBBLENDROW_SSE2
3000  // Blend 4 pixels at a time, with 1 pixel loops for alignment and remainders.
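// Per pixel, with a = src_argb0 alpha, the loops below compute
//   dst = src_argb0 + ((src_argb1 * (256 - a)) >> 8), unsigned saturated,
// and force the result alpha to 255. The pxor with the 0xff000000 mask
// yields 255 - a and the +1 in xmm7 makes it 256 - a, so a == 255 copies
// src_argb0 exactly and a == 0 adds src_argb1 at full weight.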
3001  __declspec(naked) __declspec(align(16))
3002  void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3003                         uint8* dst_argb, int width) {
3004    __asm {
3005      push       esi
3006      mov        eax, [esp + 4 + 4]   // src_argb0
3007      mov        esi, [esp + 4 + 8]   // src_argb1
3008      mov        edx, [esp + 4 + 12]  // dst_argb
3009      mov        ecx, [esp + 4 + 16]  // width
3010      pcmpeqb    xmm7, xmm7       // generate constant 1
3011      psrlw      xmm7, 15
3012      pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
3013      psrlw      xmm6, 8
3014      pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
3015      psllw      xmm5, 8
3016      pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
3017      pslld      xmm4, 24
3018  
3019      sub        ecx, 1
3020      je         convertloop1     // only 1 pixel?
3021      jl         convertloop1b
3022  
3023      // 1 pixel loop until destination pointer is aligned.
3024    alignloop1:
3025      test       edx, 15          // aligned?
3026      je         alignloop1b
3027      movd       xmm3, [eax]
3028      lea        eax, [eax + 4]
3029      movdqa     xmm0, xmm3       // src argb
3030      pxor       xmm3, xmm4       // ~alpha
3031      movd       xmm2, [esi]      // _r_b
3032      psrlw      xmm3, 8          // alpha
3033      pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
3034      pshuflw    xmm3, xmm3,0F5h
3035      pand       xmm2, xmm6       // _r_b
3036      paddw      xmm3, xmm7       // 256 - alpha
3037      pmullw     xmm2, xmm3       // _r_b * alpha
3038      movd       xmm1, [esi]      // _a_g
3039      lea        esi, [esi + 4]
3040      psrlw      xmm1, 8          // _a_g
3041      por        xmm0, xmm4       // set alpha to 255
3042      pmullw     xmm1, xmm3       // _a_g * alpha
3043      psrlw      xmm2, 8          // _r_b convert to 8 bits again
3044      paddusb    xmm0, xmm2       // + src argb
3045      pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3046      paddusb    xmm0, xmm1       // + src argb
3047      sub        ecx, 1
3048      movd       [edx], xmm0
3049      lea        edx, [edx + 4]
3050      jge        alignloop1
3051  
3052    alignloop1b:
3053      add        ecx, 1 - 4
3054      jl         convertloop4b
3055  
3056      // 4 pixel loop.
3057    convertloop4:
3058      movdqu     xmm3, [eax]      // src argb
3059      lea        eax, [eax + 16]
3060      movdqa     xmm0, xmm3       // src argb
3061      pxor       xmm3, xmm4       // ~alpha
3062      movdqu     xmm2, [esi]      // _r_b
3063      psrlw      xmm3, 8          // alpha
3064      pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
3065      pshuflw    xmm3, xmm3,0F5h
3066      pand       xmm2, xmm6       // _r_b
3067      paddw      xmm3, xmm7       // 256 - alpha
3068      pmullw     xmm2, xmm3       // _r_b * alpha
3069      movdqu     xmm1, [esi]      // _a_g
3070      lea        esi, [esi + 16]
3071      psrlw      xmm1, 8          // _a_g
3072      por        xmm0, xmm4       // set alpha to 255
3073      pmullw     xmm1, xmm3       // _a_g * alpha
3074      psrlw      xmm2, 8          // _r_b convert to 8 bits again
3075      paddusb    xmm0, xmm2       // + src argb
3076      pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3077      paddusb    xmm0, xmm1       // + src argb
3078      sub        ecx, 4
3079      movdqa     [edx], xmm0
3080      lea        edx, [edx + 16]
3081      jge        convertloop4
3082  
3083    convertloop4b:
3084      add        ecx, 4 - 1
3085      jl         convertloop1b
3086  
3087      // 1 pixel loop.
3088    convertloop1:
3089      movd       xmm3, [eax]      // src argb
3090      lea        eax, [eax + 4]
3091      movdqa     xmm0, xmm3       // src argb
3092      pxor       xmm3, xmm4       // ~alpha
3093      movd       xmm2, [esi]      // _r_b
3094      psrlw      xmm3, 8          // alpha
3095      pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
3096      pshuflw    xmm3, xmm3,0F5h
3097      pand       xmm2, xmm6       // _r_b
3098      paddw      xmm3, xmm7       // 256 - alpha
3099      pmullw     xmm2, xmm3       // _r_b * alpha
3100      movd       xmm1, [esi]      // _a_g
3101      lea        esi, [esi + 4]
3102      psrlw      xmm1, 8          // _a_g
3103      por        xmm0, xmm4       // set alpha to 255
3104      pmullw     xmm1, xmm3       // _a_g * alpha
3105      psrlw      xmm2, 8          // _r_b convert to 8 bits again
3106      paddusb    xmm0, xmm2       // + src argb
3107      pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3108      paddusb    xmm0, xmm1       // + src argb
3109      sub        ecx, 1
3110      movd       [edx], xmm0
3111      lea        edx, [edx + 4]
3112      jge        convertloop1
3113  
3114    convertloop1b:
3115      pop        esi
3116      ret
3117    }
3118  }
3119  #endif  // HAS_ARGBBLENDROW_SSE2
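
// For reference: a scalar sketch of the blend math above, illustrative only
// (the name is hypothetical). Per color channel the row computes
// dst = src + dst * (256 - src_alpha) >> 8, saturated like paddusb, with the
// result alpha forced to 255.
static uint32 ARGBBlendPixel_ScalarSketch(uint32 src, uint32 dst) {
  const uint32 f = 256 - (src >> 24);  // paddw xmm3, xmm7: 256 - alpha.
  uint32 out = 0xff000000;             // por xmm0, xmm4: alpha := 255.
  for (int shift = 0; shift < 24; shift += 8) {
    const uint32 s = (src >> shift) & 0xff;
    const uint32 d = (dst >> shift) & 0xff;
    uint32 c = s + ((d * f) >> 8);
    if (c > 255) c = 255;              // saturate like paddusb.
    out |= c << shift;
  }
  return out;
}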
3120  
3121  #ifdef HAS_ARGBBLENDROW_SSSE3
3122  // Shuffle table for isolating alpha.
3123  static const uvec8 kShuffleAlpha = {
3124    3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3125    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3126  };
3127  // Same as SSE2, but replaces:
3128  //    psrlw      xmm3, 8          // alpha
3129  //    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
3130  //    pshuflw    xmm3, xmm3,0F5h
3131  // with:
3132  //    pshufb     xmm3, kShuffleAlpha // alpha
3133  // Blend 4 pixels at a time.
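
// For reference, the pshufb semantics the comment above relies on, as an
// illustrative scalar sketch (not part of the optimized path): each output
// byte is gathered by index, and table entries with bit 7 set (0x80) zero
// the destination byte, which is why kShuffleAlpha mixes indices and 0x80.
static void Pshufb_ScalarSketch(const uint8 shuf[16], const uint8 in[16],
                                uint8 out[16]) {
  for (int i = 0; i < 16; ++i) {
    out[i] = (shuf[i] & 0x80) ? 0 : in[shuf[i] & 15];
  }
}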
3134  
3135  __declspec(naked) __declspec(align(16))
3136  void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3137                          uint8* dst_argb, int width) {
3138    __asm {
3139      push       esi
3140      mov        eax, [esp + 4 + 4]   // src_argb0
3141      mov        esi, [esp + 4 + 8]   // src_argb1
3142      mov        edx, [esp + 4 + 12]  // dst_argb
3143      mov        ecx, [esp + 4 + 16]  // width
3144      pcmpeqb    xmm7, xmm7       // generate constant 1
3145      psrlw      xmm7, 15
3146      pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
3147      psrlw      xmm6, 8
3148      pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
3149      psllw      xmm5, 8
3150      pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
3151      pslld      xmm4, 24
3152  
3153      sub        ecx, 1
3154      je         convertloop1     // only 1 pixel?
3155      jl         convertloop1b
3156  
3157      // 1 pixel loop until destination pointer is aligned.
3158    alignloop1:
3159      test       edx, 15          // aligned?
3160      je         alignloop1b
3161      movd       xmm3, [eax]
3162      lea        eax, [eax + 4]
3163      movdqa     xmm0, xmm3       // src argb
3164      pxor       xmm3, xmm4       // ~alpha
3165      movd       xmm2, [esi]      // _r_b
3166      pshufb     xmm3, kShuffleAlpha // alpha
3167      pand       xmm2, xmm6       // _r_b
3168      paddw      xmm3, xmm7       // 256 - alpha
3169      pmullw     xmm2, xmm3       // _r_b * alpha
3170      movd       xmm1, [esi]      // _a_g
3171      lea        esi, [esi + 4]
3172      psrlw      xmm1, 8          // _a_g
3173      por        xmm0, xmm4       // set alpha to 255
3174      pmullw     xmm1, xmm3       // _a_g * alpha
3175      psrlw      xmm2, 8          // _r_b convert to 8 bits again
3176      paddusb    xmm0, xmm2       // + src argb
3177      pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3178      paddusb    xmm0, xmm1       // + src argb
3179      sub        ecx, 1
3180      movd       [edx], xmm0
3181      lea        edx, [edx + 4]
3182      jge        alignloop1
3183  
3184    alignloop1b:
3185      add        ecx, 1 - 4
3186      jl         convertloop4b
3187  
3188      test       eax, 15          // unaligned?
3189      jne        convertuloop4
3190      test       esi, 15          // unaligned?
3191      jne        convertuloop4
3192  
3193      // 4 pixel loop.
3194    convertloop4:
3195      movdqa     xmm3, [eax]      // src argb
3196      lea        eax, [eax + 16]
3197      movdqa     xmm0, xmm3       // src argb
3198      pxor       xmm3, xmm4       // ~alpha
3199      movdqa     xmm2, [esi]      // _r_b
3200      pshufb     xmm3, kShuffleAlpha // alpha
3201      pand       xmm2, xmm6       // _r_b
3202      paddw      xmm3, xmm7       // 256 - alpha
3203      pmullw     xmm2, xmm3       // _r_b * alpha
3204      movdqa     xmm1, [esi]      // _a_g
3205      lea        esi, [esi + 16]
3206      psrlw      xmm1, 8          // _a_g
3207      por        xmm0, xmm4       // set alpha to 255
3208      pmullw     xmm1, xmm3       // _a_g * alpha
3209      psrlw      xmm2, 8          // _r_b convert to 8 bits again
3210      paddusb    xmm0, xmm2       // + src argb
3211      pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3212      paddusb    xmm0, xmm1       // + src argb
3213      sub        ecx, 4
3214      movdqa     [edx], xmm0
3215      lea        edx, [edx + 16]
3216      jge        convertloop4
3217      jmp        convertloop4b
3218  
3219      // 4 pixel unaligned loop.
3220    convertuloop4:
3221      movdqu     xmm3, [eax]      // src argb
3222      lea        eax, [eax + 16]
3223      movdqa     xmm0, xmm3       // src argb
3224      pxor       xmm3, xmm4       // ~alpha
3225      movdqu     xmm2, [esi]      // _r_b
3226      pshufb     xmm3, kShuffleAlpha // alpha
3227      pand       xmm2, xmm6       // _r_b
3228      paddw      xmm3, xmm7       // 256 - alpha
3229      pmullw     xmm2, xmm3       // _r_b * alpha
3230      movdqu     xmm1, [esi]      // _a_g
3231      lea        esi, [esi + 16]
3232      psrlw      xmm1, 8          // _a_g
3233      por        xmm0, xmm4       // set alpha to 255
3234      pmullw     xmm1, xmm3       // _a_g * alpha
3235      psrlw      xmm2, 8          // _r_b convert to 8 bits again
3236      paddusb    xmm0, xmm2       // + src argb
3237      pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3238      paddusb    xmm0, xmm1       // + src argb
3239      sub        ecx, 4
3240      movdqa     [edx], xmm0
3241      lea        edx, [edx + 16]
3242      jge        convertuloop4
3243  
3244    convertloop4b:
3245      add        ecx, 4 - 1
3246      jl         convertloop1b
3247  
3248      // 1 pixel loop.
3249    convertloop1:
3250      movd       xmm3, [eax]      // src argb
3251      lea        eax, [eax + 4]
3252      movdqa     xmm0, xmm3       // src argb
3253      pxor       xmm3, xmm4       // ~alpha
3254      movd       xmm2, [esi]      // _r_b
3255      pshufb     xmm3, kShuffleAlpha // alpha
3256      pand       xmm2, xmm6       // _r_b
3257      paddw      xmm3, xmm7       // 256 - alpha
3258      pmullw     xmm2, xmm3       // _r_b * alpha
3259      movd       xmm1, [esi]      // _a_g
3260      lea        esi, [esi + 4]
3261      psrlw      xmm1, 8          // _a_g
3262      por        xmm0, xmm4       // set alpha to 255
3263      pmullw     xmm1, xmm3       // _a_g * alpha
3264      psrlw      xmm2, 8          // _r_b convert to 8 bits again
3265      paddusb    xmm0, xmm2       // + src argb
3266      pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3267      paddusb    xmm0, xmm1       // + src argb
3268      sub        ecx, 1
3269      movd       [edx], xmm0
3270      lea        edx, [edx + 4]
3271      jge        convertloop1
3272  
3273    convertloop1b:
3274      pop        esi
3275      ret
3276    }
3277  }
3278  #endif  // HAS_ARGBBLENDROW_SSSE3
3279  
3280  #ifdef HAS_ARGBATTENUATE_SSE2
3281  // Attenuate 4 pixels at a time.
3282  // Aligned to 16 bytes.
3283  __declspec(naked) __declspec(align(16))
3284  void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3285    __asm {
3286      mov        eax, [esp + 4]   // src_argb0
3287      mov        edx, [esp + 8]   // dst_argb
3288      mov        ecx, [esp + 12]  // width
3289      sub        edx, eax
3290      pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
3291      pslld      xmm4, 24
3292      pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
3293      psrld      xmm5, 8
3294  
3295      align      16
3296   convertloop:
3297      movdqa     xmm0, [eax]      // read 4 pixels
3298      punpcklbw  xmm0, xmm0       // first 2
3299      pshufhw    xmm2, xmm0,0FFh  // 8 alpha words
3300      pshuflw    xmm2, xmm2,0FFh
3301      pmulhuw    xmm0, xmm2       // rgb * a
3302      movdqa     xmm1, [eax]      // read 4 pixels
3303      punpckhbw  xmm1, xmm1       // next 2 pixels
3304      pshufhw    xmm2, xmm1,0FFh  // 8 alpha words
3305      pshuflw    xmm2, xmm2,0FFh
3306      pmulhuw    xmm1, xmm2       // rgb * a
3307      movdqa     xmm2, [eax]      // alphas
3308      psrlw      xmm0, 8
3309      pand       xmm2, xmm4
3310      psrlw      xmm1, 8
3311      packuswb   xmm0, xmm1
3312      pand       xmm0, xmm5       // keep original alphas
3313      por        xmm0, xmm2
3314      sub        ecx, 4
3315      movdqa     [eax + edx], xmm0
3316      lea        eax, [eax + 16]
3317      jg         convertloop
3318  
3319      ret
3320    }
3321  }
3322  #endif  // HAS_ARGBATTENUATE_SSE2
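
// For reference: a scalar sketch of the attenuate math above, illustrative
// only. punpcklbw xmm0, xmm0 turns each byte c into the word c * 257, so
// pmulhuw followed by psrlw 8 computes (c * 257 * a * 257) >> 24, a close
// fixed-point approximation of c * a / 255.
static uint32 ARGBAttenuatePixel_ScalarSketch(uint32 argb) {
  const uint32 a = argb >> 24;
  uint32 out = argb & 0xff000000;  // original alpha is kept.
  for (int shift = 0; shift < 24; shift += 8) {
    const uint32 c = (argb >> shift) & 0xff;
    out |= (((c * 257) * (a * 257)) >> 24) << shift;
  }
  return out;
}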
3323  
3324  #ifdef HAS_ARGBATTENUATEROW_SSSE3
3325  // Shuffle table duplicating alpha.
3326  static const uvec8 kShuffleAlpha0 = {
3327    3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3328  };
3329  static const uvec8 kShuffleAlpha1 = {
3330    11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3331    15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
3332  };
3333  __declspec(naked) __declspec(align(16))
3334  void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3335    __asm {
3336      mov        eax, [esp + 4]   // src_argb0
3337      mov        edx, [esp + 8]   // dst_argb
3338      mov        ecx, [esp + 12]  // width
3339      sub        edx, eax
3340      pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
3341      pslld      xmm3, 24
3342      movdqa     xmm4, kShuffleAlpha0
3343      movdqa     xmm5, kShuffleAlpha1
3344  
3345      align      16
3346   convertloop:
3347      movdqa     xmm0, [eax]      // read 4 pixels
3348      pshufb     xmm0, xmm4       // isolate first 2 alphas
3349      movdqa     xmm1, [eax]      // read 4 pixels
3350      punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
3351      pmulhuw    xmm0, xmm1       // rgb * a
3352      movdqa     xmm1, [eax]      // read 4 pixels
3353      pshufb     xmm1, xmm5       // isolate next 2 alphas
3354      movdqa     xmm2, [eax]      // read 4 pixels
3355      punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
3356      pmulhuw    xmm1, xmm2       // rgb * a
3357      movdqa     xmm2, [eax]      // mask original alpha
3358      pand       xmm2, xmm3
3359      psrlw      xmm0, 8
3360      psrlw      xmm1, 8
3361      packuswb   xmm0, xmm1
3362      por        xmm0, xmm2       // copy original alpha
3363      sub        ecx, 4
3364      movdqa     [eax + edx], xmm0
3365      lea        eax, [eax + 16]
3366      jg         convertloop
3367  
3368      ret
3369    }
3370  }
3371  #endif  // HAS_ARGBATTENUATEROW_SSSE3
3372  
3373  #ifdef HAS_ARGBUNATTENUATEROW_SSE2
3374  // Unattenuate 4 pixels at a time.
3375  // Aligned to 16 bytes.
3376  __declspec(naked) __declspec(align(16))
3377  void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3378                               int width) {
3379    __asm {
3380      push       esi
3381      push       edi
3382      mov        eax, [esp + 8 + 4]   // src_argb0
3383      mov        edx, [esp + 8 + 8]   // dst_argb
3384      mov        ecx, [esp + 8 + 12]  // width
3385      sub        edx, eax
3386      pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
3387      pslld      xmm4, 24
3388  
3389      align      16
3390   convertloop:
3391      movdqa     xmm0, [eax]      // read 4 pixels
3392      movzx      esi, byte ptr [eax + 3]  // first alpha
3393      movzx      edi, byte ptr [eax + 7]  // second alpha
3394      punpcklbw  xmm0, xmm0       // first 2
3395      movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
3396      movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
3397      pshuflw    xmm2, xmm2,0C0h  // first 4 inv_alpha words
3398      pshuflw    xmm3, xmm3,0C0h  // next 4 inv_alpha words
3399      movlhps    xmm2, xmm3
3400      pmulhuw    xmm0, xmm2       // rgb * a
3401  
3402      movdqa     xmm1, [eax]      // read 4 pixels
3403      movzx      esi, byte ptr [eax + 11]  // third alpha
3404      movzx      edi, byte ptr [eax + 15]  // fourth alpha
3405      punpckhbw  xmm1, xmm1       // next 2
3406      movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
3407      movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
3408      pshuflw    xmm2, xmm2,0C0h  // first 4 inv_alpha words
3409      pshuflw    xmm3, xmm3,0C0h  // next 4 inv_alpha words
3410      movlhps    xmm2, xmm3
3411      pmulhuw    xmm1, xmm2       // rgb * a
3412  
3413      movdqa     xmm2, [eax]      // alphas
3414      pand       xmm2, xmm4
3415      packuswb   xmm0, xmm1
3416      por        xmm0, xmm2
3417      sub        ecx, 4
3418      movdqa     [eax + edx], xmm0
3419      lea        eax, [eax + 16]
3420      jg         convertloop
3421      pop        edi
3422      pop        esi
3423      ret
3424    }
3425  }
3426  #endif  // HAS_ARGBUNATTENUATEROW_SSE2
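
// For reference: a conceptual scalar sketch of unattenuation, illustrative
// only. The SSE2 row replaces the division with a fixed-point reciprocal
// looked up in fixed_invtbl8; that table's exact encoding is not reproduced
// here.
static uint32 ARGBUnattenuatePixel_ScalarSketch(uint32 argb) {
  const uint32 a = argb >> 24;
  uint32 out = argb & 0xff000000;
  for (int shift = 0; shift < 24; shift += 8) {
    uint32 c = (argb >> shift) & 0xff;
    if (a) {
      c = c * 255 / a;      // undo the attenuate step c * a / 255.
      if (c > 255) c = 255;
    }
    out |= c << shift;
  }
  return out;
}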
3427  
3428  #ifdef HAS_ARGBGRAYROW_SSSE3
3429  // Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R
3430  static const vec8 kARGBToGray = {
3431    14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
3432  };
3433  
3434  // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
3435  __declspec(naked) __declspec(align(16))
3436  void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3437    __asm {
3438      mov        eax, [esp + 4]   /* src_argb */
3439      mov        edx, [esp + 8]   /* dst_argb */
3440      mov        ecx, [esp + 12]  /* width */
3441      movdqa     xmm4, kARGBToGray
3442      sub        edx, eax
3443  
3444      align      16
3445   convertloop:
3446      movdqa     xmm0, [eax]  // G
3447      movdqa     xmm1, [eax + 16]
3448      pmaddubsw  xmm0, xmm4
3449      pmaddubsw  xmm1, xmm4
3450      phaddw     xmm0, xmm1
3451      psrlw      xmm0, 7
3452      packuswb   xmm0, xmm0   // 8 G bytes
3453      movdqa     xmm2, [eax]  // A
3454      movdqa     xmm3, [eax + 16]
3455      psrld      xmm2, 24
3456      psrld      xmm3, 24
3457      packuswb   xmm2, xmm3
3458      packuswb   xmm2, xmm2   // 8 A bytes
3459      movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
3460      punpcklbw  xmm0, xmm0   // 8 GG words
3461      punpcklbw  xmm3, xmm2   // 8 GA words
3462      movdqa     xmm1, xmm0
3463      punpcklwd  xmm0, xmm3   // GGGA first 4
3464      punpckhwd  xmm1, xmm3   // GGGA next 4
3465      sub        ecx, 8
3466      movdqa     [eax + edx], xmm0
3467      movdqa     [eax + edx + 16], xmm1
3468      lea        eax, [eax + 32]
3469      jg         convertloop
3470      ret
3471    }
3472  }
3473  #endif  // HAS_ARGBGRAYROW_SSSE3
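
// For reference: a scalar sketch of the gray conversion above, illustrative
// only: g = (14 * B + 76 * G + 38 * R) >> 7, matching kARGBToGray, then the
// gray value is replicated into B, G and R while alpha is preserved.
static uint32 ARGBGrayPixel_ScalarSketch(uint32 argb) {
  const uint32 b = argb & 0xff;
  const uint32 g = (argb >> 8) & 0xff;
  const uint32 r = (argb >> 16) & 0xff;
  const uint32 y = (14 * b + 76 * g + 38 * r) >> 7;  // at most 255.
  return (argb & 0xff000000) | (y << 16) | (y << 8) | y;
}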
3474  
3475  #ifdef HAS_ARGBSEPIAROW_SSSE3
3476  //    b = (r * 35 + g * 68 + b * 17) >> 7
3477  //    g = (r * 45 + g * 88 + b * 22) >> 7
3478  //    r = (r * 50 + g * 98 + b * 24) >> 7
3479  // Constant for ARGB color to sepia tone.
3480  static const vec8 kARGBToSepiaB = {
3481    17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3482  };
3483  
3484  static const vec8 kARGBToSepiaG = {
3485    22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3486  };
3487  
3488  static const vec8 kARGBToSepiaR = {
3489    24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3490  };
3491  
3492  // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
3493  __declspec(naked) __declspec(align(16))
3494  void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3495    __asm {
3496      mov        eax, [esp + 4]   /* dst_argb */
3497      mov        ecx, [esp + 8]   /* width */
3498      movdqa     xmm2, kARGBToSepiaB
3499      movdqa     xmm3, kARGBToSepiaG
3500      movdqa     xmm4, kARGBToSepiaR
3501  
3502      align      16
3503   convertloop:
3504      movdqa     xmm0, [eax]  // B
3505      movdqa     xmm6, [eax + 16]
3506      pmaddubsw  xmm0, xmm2
3507      pmaddubsw  xmm6, xmm2
3508      phaddw     xmm0, xmm6
3509      psrlw      xmm0, 7
3510      packuswb   xmm0, xmm0   // 8 B values
3511      movdqa     xmm5, [eax]  // G
3512      movdqa     xmm1, [eax + 16]
3513      pmaddubsw  xmm5, xmm3
3514      pmaddubsw  xmm1, xmm3
3515      phaddw     xmm5, xmm1
3516      psrlw      xmm5, 7
3517      packuswb   xmm5, xmm5   // 8 G values
3518      punpcklbw  xmm0, xmm5   // 8 BG values
3519      movdqa     xmm5, [eax]  // R
3520      movdqa     xmm1, [eax + 16]
3521      pmaddubsw  xmm5, xmm4
3522      pmaddubsw  xmm1, xmm4
3523      phaddw     xmm5, xmm1
3524      psrlw      xmm5, 7
3525      packuswb   xmm5, xmm5   // 8 R values
3526      movdqa     xmm6, [eax]  // A
3527      movdqa     xmm1, [eax + 16]
3528      psrld      xmm6, 24
3529      psrld      xmm1, 24
3530      packuswb   xmm6, xmm1
3531      packuswb   xmm6, xmm6   // 8 A values
3532      punpcklbw  xmm5, xmm6   // 8 RA values
3533      movdqa     xmm1, xmm0   // Weave BG, RA together
3534      punpcklwd  xmm0, xmm5   // BGRA first 4
3535      punpckhwd  xmm1, xmm5   // BGRA next 4
3536      sub        ecx, 8
3537      movdqa     [eax], xmm0
3538      movdqa     [eax + 16], xmm1
3539      lea        eax, [eax + 32]
3540      jg         convertloop
3541      ret
3542    }
3543  }
3544  #endif  // HAS_ARGBSEPIAROW_SSSE3
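
// For reference: a scalar sketch of the sepia math above, illustrative only,
// using the same coefficients as kARGBToSepia{B,G,R} with packuswb-style
// saturation; alpha passes through unchanged.
static uint32 ARGBSepiaPixel_ScalarSketch(uint32 argb) {
  const uint32 b = argb & 0xff;
  const uint32 g = (argb >> 8) & 0xff;
  const uint32 r = (argb >> 16) & 0xff;
  uint32 nb = (17 * b + 68 * g + 35 * r) >> 7;
  uint32 ng = (22 * b + 88 * g + 45 * r) >> 7;
  uint32 nr = (24 * b + 98 * g + 50 * r) >> 7;
  if (nb > 255) nb = 255;  // packuswb saturates words to bytes.
  if (ng > 255) ng = 255;
  if (nr > 255) nr = 255;
  return (argb & 0xff000000) | (nr << 16) | (ng << 8) | nb;
}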
3545  
3546  #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3547  // Transform 8 ARGB pixels (32 bytes) with color matrix.
3548  // Same as Sepia except matrix is provided.
3549  // TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
3550  // and B into a high and low, then G/A, punpckl/hbw and then punpckl/hwd.
3551  __declspec(naked) __declspec(align(16))
3552  void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
3553                                int width) {
3554    __asm {
3555      mov        eax, [esp + 4]   /* dst_argb */
3556      mov        edx, [esp + 8]   /* matrix_argb */
3557      mov        ecx, [esp + 12]  /* width */
3558      movd       xmm2, [edx]
3559      movd       xmm3, [edx + 4]
3560      movd       xmm4, [edx + 8]
3561      pshufd     xmm2, xmm2, 0
3562      pshufd     xmm3, xmm3, 0
3563      pshufd     xmm4, xmm4, 0
3564  
3565      align      16
3566   convertloop:
3567      movdqa     xmm0, [eax]  // B
3568      movdqa     xmm6, [eax + 16]
3569      pmaddubsw  xmm0, xmm2
3570      pmaddubsw  xmm6, xmm2
3571      movdqa     xmm5, [eax]  // G
3572      movdqa     xmm1, [eax + 16]
3573      pmaddubsw  xmm5, xmm3
3574      pmaddubsw  xmm1, xmm3
3575      phaddsw    xmm0, xmm6   // B
3576      phaddsw    xmm5, xmm1   // G
3577      psraw      xmm0, 7      // B
3578      psraw      xmm5, 7      // G
3579      packuswb   xmm0, xmm0   // 8 B values
3580      packuswb   xmm5, xmm5   // 8 G values
3581      punpcklbw  xmm0, xmm5   // 8 BG values
3582      movdqa     xmm5, [eax]  // R
3583      movdqa     xmm1, [eax + 16]
3584      pmaddubsw  xmm5, xmm4
3585      pmaddubsw  xmm1, xmm4
3586      phaddsw    xmm5, xmm1
3587      psraw      xmm5, 7
3588      packuswb   xmm5, xmm5   // 8 R values
3589      movdqa     xmm6, [eax]  // A
3590      movdqa     xmm1, [eax + 16]
3591      psrld      xmm6, 24
3592      psrld      xmm1, 24
3593      packuswb   xmm6, xmm1
3594      packuswb   xmm6, xmm6   // 8 A values
3595      movdqa     xmm1, xmm0   // Weave BG, RA together
3596      punpcklbw  xmm5, xmm6   // 8 RA values
3597      punpcklwd  xmm0, xmm5   // BGRA first 4
3598      punpckhwd  xmm1, xmm5   // BGRA next 4
3599      sub        ecx, 8
3600      movdqa     [eax], xmm0
3601      movdqa     [eax + 16], xmm1
3602      lea        eax, [eax + 32]
3603      jg         convertloop
3604      ret
3605    }
3606  }
3607  #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
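
// For reference: a scalar sketch of the color-matrix row above, illustrative
// only. Each of B, G, R is a signed dot product of the pixel's B, G, R, A
// bytes with one 4-entry int8 row of matrix_argb, shifted down by 7 and
// saturated; the original alpha is carried through.
static uint32 ARGBColorMatrixPixel_ScalarSketch(uint32 argb,
                                                const int8* matrix_argb) {
  const int p[4] = {
    static_cast<int>(argb & 0xff),          // B
    static_cast<int>((argb >> 8) & 0xff),   // G
    static_cast<int>((argb >> 16) & 0xff),  // R
    static_cast<int>(argb >> 24)            // A
  };
  uint32 out = argb & 0xff000000;
  for (int ch = 0; ch < 3; ++ch) {
    int v = 0;
    for (int i = 0; i < 4; ++i) {
      v += p[i] * matrix_argb[ch * 4 + i];
    }
    v >>= 7;                 // psraw 7 (arithmetic shift).
    if (v < 0) v = 0;        // packuswb clamps below at 0...
    if (v > 255) v = 255;    // ...and above at 255.
    out |= static_cast<uint32>(v) << (ch * 8);
  }
  return out;
}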
3608  
3609  #ifdef HAS_ARGBCOLORTABLEROW_X86
3610  // Transform ARGB pixels with color table.
3611  __declspec(naked) __declspec(align(16))
3612  void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
3613                             int width) {
3614    __asm {
3615      push       ebx
3616      push       esi
3617      push       edi
3618      push       ebp
3619      mov        eax, [esp + 16 + 4]   /* dst_argb */
3620      mov        edi, [esp + 16 + 8]   /* table_argb */
3621      mov        ecx, [esp + 16 + 12]  /* width */
3622      xor        ebx, ebx
3623      xor        edx, edx
3624  
3625      align      16
3626   convertloop:
3627      mov        ebp, dword ptr [eax]  // BGRA
3628      mov        esi, ebp
3629      and        ebp, 255
3630      shr        esi, 8
3631      and        esi, 255
3632      mov        bl, [edi + ebp * 4 + 0]  // B
3633      mov        dl, [edi + esi * 4 + 1]  // G
3634      mov        ebp, dword ptr [eax]  // BGRA
3635      mov        esi, ebp
3636      shr        ebp, 16
3637      shr        esi, 24
3638      and        ebp, 255
3639      mov        [eax], bl
3640      mov        [eax + 1], dl
3641      mov        bl, [edi + ebp * 4 + 2]  // R
3642      mov        dl, [edi + esi * 4 + 3]  // A
3643      mov        [eax + 2], bl
3644      mov        [eax + 3], dl
3645      lea        eax, [eax + 4]
3646      sub        ecx, 1
3647      jg         convertloop
3648      pop        ebp
3649      pop        edi
3650      pop        esi
3651      pop        ebx
3652      ret
3653    }
3654  }
3655  #endif  // HAS_ARGBCOLORTABLEROW_X86
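
// For reference: a scalar sketch of the table lookup above, illustrative
// only. Each channel indexes its own column of the 256-entry BGRA table.
static void ARGBColorTableRow_ScalarSketch(uint8* dst_argb,
                                           const uint8* table_argb,
                                           int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}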
3656  
3657  #ifdef HAS_ARGBQUANTIZEROW_SSE2
3658  // Quantize 4 ARGB pixels (16 bytes).
3659  // Aligned to 16 bytes.
3660  __declspec(naked) __declspec(align(16))
3661  void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3662                            int interval_offset, int width) {
3663    __asm {
3664      mov        eax, [esp + 4]    /* dst_argb */
3665      movd       xmm2, [esp + 8]   /* scale */
3666      movd       xmm3, [esp + 12]  /* interval_size */
3667      movd       xmm4, [esp + 16]  /* interval_offset */
3668      mov        ecx, [esp + 20]   /* width */
3669      pshuflw    xmm2, xmm2, 040h
3670      pshufd     xmm2, xmm2, 044h
3671      pshuflw    xmm3, xmm3, 040h
3672      pshufd     xmm3, xmm3, 044h
3673      pshuflw    xmm4, xmm4, 040h
3674      pshufd     xmm4, xmm4, 044h
3675      pxor       xmm5, xmm5  // constant 0
3676      pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
3677      pslld      xmm6, 24
3678  
3679      align      16
3680   convertloop:
3681      movdqa     xmm0, [eax]  // read 4 pixels
3682      punpcklbw  xmm0, xmm5   // first 2 pixels
3683      pmulhuw    xmm0, xmm2   // pixel * scale >> 16
3684      movdqa     xmm1, [eax]  // read 4 pixels
3685      punpckhbw  xmm1, xmm5   // next 2 pixels
3686      pmulhuw    xmm1, xmm2
3687      pmullw     xmm0, xmm3   // * interval_size
3688      movdqa     xmm7, [eax]  // read 4 pixels
3689      pmullw     xmm1, xmm3
3690      pand       xmm7, xmm6   // mask alpha
3691      paddw      xmm0, xmm4   // + interval_offset
3692      paddw      xmm1, xmm4
3693      packuswb   xmm0, xmm1
3694      por        xmm0, xmm7
3695      sub        ecx, 4
3696      movdqa     [eax], xmm0
3697      lea        eax, [eax + 16]
3698      jg         convertloop
3699      ret
3700    }
3701  }
3702  #endif  // HAS_ARGBQUANTIZEROW_SSE2
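
// For reference: a scalar sketch of the quantize math above, illustrative
// only. Per color channel: v = (v * scale >> 16) * interval_size +
// interval_offset, saturated like packuswb; alpha is preserved via the
// 0xff000000 mask. Only the low 16 bits of scale are used (pshuflw).
static void ARGBQuantizeRow_ScalarSketch(uint8* dst_argb, int scale,
                                         int interval_size,
                                         int interval_offset, int width) {
  for (int x = 0; x < width; ++x) {
    for (int ch = 0; ch < 3; ++ch) {  // B, G, R; alpha untouched.
      int v = (dst_argb[ch] * (scale & 0xffff)) >> 16;
      v = v * interval_size + interval_offset;
      if (v > 255) v = 255;           // packuswb saturates.
      dst_argb[ch] = static_cast<uint8>(v);
    }
    dst_argb += 4;
  }
}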
3703  
3704  #ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
3705  // Consider float CumulativeSum.
3706  // Consider calling CumulativeSum one row at a time as needed.
3707  // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
3708  // Convert cumulative sum for an area to an average for 1 pixel.
3709  // topleft is pointer to top left of CumulativeSum buffer for area.
3710  // botleft is pointer to bottom left of CumulativeSum buffer.
3711  // width is offset from left to right of area in CumulativeSum buffer measured
3712  //   in number of ints.
3713  // area is the number of pixels in the area being averaged.
3714  // dst points to pixel to store result to.
3715  // count is number of averaged pixels to produce.
3716  // Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
3717  // aligned.
3718  void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
3719                                   int width, int area, uint8* dst, int count) {
3720    __asm {
3721      mov        eax, topleft  // eax topleft
3722      mov        esi, botleft  // esi botleft
3723      mov        edx, width
3724      movd       xmm4, area
3725      mov        edi, dst
3726      mov        ecx, count
3727      cvtdq2ps   xmm4, xmm4
3728      rcpss      xmm4, xmm4  // 1.0f / area
3729      pshufd     xmm4, xmm4, 0
3730      sub        ecx, 4
3731      jl         l4b
3732  
3733      // 4 pixel loop
3734      align      4
3735    l4:
3736      // top left
3737      movdqa     xmm0, [eax]
3738      movdqa     xmm1, [eax + 16]
3739      movdqa     xmm2, [eax + 32]
3740      movdqa     xmm3, [eax + 48]
3741  
3742      // - top right
3743      psubd      xmm0, [eax + edx * 4]
3744      psubd      xmm1, [eax + edx * 4 + 16]
3745      psubd      xmm2, [eax + edx * 4 + 32]
3746      psubd      xmm3, [eax + edx * 4 + 48]
3747      lea        eax, [eax + 64]
3748  
3749      // - bottom left
3750      psubd      xmm0, [esi]
3751      psubd      xmm1, [esi + 16]
3752      psubd      xmm2, [esi + 32]
3753      psubd      xmm3, [esi + 48]
3754  
3755      // + bottom right
3756      paddd      xmm0, [esi + edx * 4]
3757      paddd      xmm1, [esi + edx * 4 + 16]
3758      paddd      xmm2, [esi + edx * 4 + 32]
3759      paddd      xmm3, [esi + edx * 4 + 48]
3760      lea        esi, [esi + 64]
3761  
3762      cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
3763      cvtdq2ps   xmm1, xmm1
3764      mulps      xmm0, xmm4
3765      mulps      xmm1, xmm4
3766      cvtdq2ps   xmm2, xmm2
3767      cvtdq2ps   xmm3, xmm3
3768      mulps      xmm2, xmm4
3769      mulps      xmm3, xmm4
3770      cvtps2dq   xmm0, xmm0
3771      cvtps2dq   xmm1, xmm1
3772      cvtps2dq   xmm2, xmm2
3773      cvtps2dq   xmm3, xmm3
3774      packssdw   xmm0, xmm1
3775      packssdw   xmm2, xmm3
3776      packuswb   xmm0, xmm2
3777      movdqu     [edi], xmm0
3778      lea        edi, [edi + 16]
3779      sub        ecx, 4
3780      jge        l4
3781  
3782    l4b:
3783      add        ecx, 4 - 1
3784      jl         l1b
3785  
3786      // 1 pixel loop
3787      align      4
3788    l1:
3789      movdqa     xmm0, [eax]
3790      psubd      xmm0, [eax + edx * 4]
3791      lea        eax, [eax + 16]
3792      psubd      xmm0, [esi]
3793      paddd      xmm0, [esi + edx * 4]
3794      lea        esi, [esi + 16]
3795      cvtdq2ps   xmm0, xmm0
3796      mulps      xmm0, xmm4
3797      cvtps2dq   xmm0, xmm0
3798      packssdw   xmm0, xmm0
3799      packuswb   xmm0, xmm0
3800      movd       dword ptr [edi], xmm0
3801      lea        edi, [edi + 4]
3802      sub        ecx, 1
3803      jge        l1
3804    l1b:
3805    }
3806  }
3807  #endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
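
// For reference: the integral-image identity the rows above implement, as an
// illustrative scalar sketch. For a box whose corners in the cumulative-sum
// buffer are TL, TR, BL, BR (per channel): sum = TL - TR - BL + BR, and the
// average is sum / area (the SSE2 code uses an rcpss reciprocal and rounds).
static void CumulativeSumToAverage_ScalarSketch(const int32* topleft,
                                                const int32* botleft,
                                                int width, int area,
                                                uint8* dst, int count) {
  const float ooa = 1.0f / area;
  for (int i = 0; i < count; ++i) {
    for (int ch = 0; ch < 4; ++ch) {
      const int32 sum = topleft[ch] - topleft[width + ch] -
                        botleft[ch] + botleft[width + ch];
      dst[ch] = static_cast<uint8>(sum * ooa);  // truncates; SIMD rounds.
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}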
3808  
3809  #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
3810  // Creates a table of cumulative sums where each value is a sum of all values
3811  // above and to the left of the value.
3812  void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
3813                                    const int32* previous_cumsum, int width) {
3814    __asm {
3815      mov        eax, row
3816      mov        edx, cumsum
3817      mov        esi, previous_cumsum
3818      mov        ecx, width
3819      sub        esi, edx
3820      pxor       xmm0, xmm0
3821      pxor       xmm1, xmm1
3822  
3823      sub        ecx, 4
3824      jl         l4b
3825      test       edx, 15
3826      jne        l4b
3827  
3828      // 4 pixel loop
3829      align      4
3830    l4:
3831      movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
3832      lea        eax, [eax + 16]
3833      movdqa     xmm4, xmm2
3834  
3835      punpcklbw  xmm2, xmm1
3836      movdqa     xmm3, xmm2
3837      punpcklwd  xmm2, xmm1
3838      punpckhwd  xmm3, xmm1
3839  
3840      punpckhbw  xmm4, xmm1
3841      movdqa     xmm5, xmm4
3842      punpcklwd  xmm4, xmm1
3843      punpckhwd  xmm5, xmm1
3844  
3845      paddd      xmm0, xmm2
3846      movdqa     xmm2, [edx + esi]  // previous row above.
3847      paddd      xmm2, xmm0
3848  
3849      paddd      xmm0, xmm3
3850      movdqa     xmm3, [edx + esi + 16]
3851      paddd      xmm3, xmm0
3852  
3853      paddd      xmm0, xmm4
3854      movdqa     xmm4, [edx + esi + 32]
3855      paddd      xmm4, xmm0
3856  
3857      paddd      xmm0, xmm5
3858      movdqa     xmm5, [edx + esi + 48]
3859      paddd      xmm5, xmm0
3860  
3861      movdqa     [edx], xmm2
3862      movdqa     [edx + 16], xmm3
3863      movdqa     [edx + 32], xmm4
3864      movdqa     [edx + 48], xmm5
3865  
3866      lea        edx, [edx + 64]
3867      sub        ecx, 4
3868      jge        l4
3869  
3870    l4b:
3871      add        ecx, 4 - 1
3872      jl         l1b
3873  
3874      // 1 pixel loop
3875      align      4
3876    l1:
3877      movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
3878      lea        eax, [eax + 4]
3879      punpcklbw  xmm2, xmm1
3880      punpcklwd  xmm2, xmm1
3881      paddd      xmm0, xmm2
3882      movdqu     xmm2, [edx + esi]
3883      paddd      xmm2, xmm0
3884      movdqu     [edx], xmm2
3885      lea        edx, [edx + 16]
3886      sub        ecx, 1
3887      jge        l1
3888  
3889   l1b:
3890    }
3891  }
3892  #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
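
// For reference: a scalar sketch of the row above, illustrative only. Each
// output int is this row's running per-channel byte sum plus the value from
// the previous row of the cumulative-sum table.
static void ComputeCumulativeSumRow_ScalarSketch(const uint8* row,
                                                 int32* cumsum,
                                                 const int32* previous_cumsum,
                                                 int width) {
  int32 running[4] = {0, 0, 0, 0};  // running sum of this row, per channel.
  for (int x = 0; x < width; ++x) {
    for (int ch = 0; ch < 4; ++ch) {
      running[ch] += row[x * 4 + ch];
      cumsum[x * 4 + ch] = running[ch] + previous_cumsum[x * 4 + ch];
    }
  }
}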
3893  
3894  #ifdef HAS_ARGBSHADE_SSE2
3895  // Shade 4 pixels at a time by specified value.
3896  // Aligned to 16 bytes.
3897  __declspec(naked) __declspec(align(16))
3898  void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3899                         uint32 value) {
3900    __asm {
3901      mov        eax, [esp + 4]   // src_argb
3902      mov        edx, [esp + 8]   // dst_argb
3903      mov        ecx, [esp + 12]  // width
3904      movd       xmm2, [esp + 16]  // value
3905      sub        edx, eax
3906      punpcklbw  xmm2, xmm2
3907      punpcklqdq xmm2, xmm2
3908  
3909      align      16
3910   convertloop:
3911      movdqa     xmm0, [eax]      // read 4 pixels
3912      movdqa     xmm1, xmm0
3913      punpcklbw  xmm0, xmm0       // first 2
3914      punpckhbw  xmm1, xmm1       // next 2
3915      pmulhuw    xmm0, xmm2       // argb * value
3916      pmulhuw    xmm1, xmm2       // argb * value
3917      psrlw      xmm0, 8
3918      psrlw      xmm1, 8
3919      packuswb   xmm0, xmm1
3920      sub        ecx, 4
3921      movdqa     [eax + edx], xmm0
3922      lea        eax, [eax + 16]
3923      jg         convertloop
3924  
3925      ret
3926    }
3927  }
3928  #endif  // HAS_ARGBSHADE_SSE2
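
// For reference: a scalar sketch of the shade math above, illustrative only.
// Shade uses the same (c * 257) * (v * 257) >> 24 fixed-point trick as
// attenuate, but multiplies every channel (alpha included) by the matching
// channel of the caller-supplied packed value.
static uint32 ARGBShadePixel_ScalarSketch(uint32 argb, uint32 value) {
  uint32 out = 0;
  for (int shift = 0; shift < 32; shift += 8) {
    const uint32 c = (argb >> shift) & 0xff;
    const uint32 v = (value >> shift) & 0xff;
    out |= (((c * 257) * (v * 257)) >> 24) << shift;
  }
  return out;
}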
3929  
3930  #ifdef HAS_ARGBAFFINEROW_SSE2
3931  // Copy ARGB pixels from a source image along an affine slope to a dst row.
3932  __declspec(naked) __declspec(align(16))
3933  LIBYUV_API
3934  void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
3935                          uint8* dst_argb, const float* uv_dudv, int width) {
3936    __asm {
3937      push       esi
3938      push       edi
3939      mov        eax, [esp + 12]   // src_argb
3940      mov        esi, [esp + 16]  // stride
3941      mov        edx, [esp + 20]  // dst_argb
3942      mov        ecx, [esp + 24]  // pointer to uv_dudv
3943      movq       xmm2, qword ptr [ecx]  // uv
3944      movq       xmm7, qword ptr [ecx + 8]  // dudv
3945      mov        ecx, [esp + 28]  // width
3946      shl        esi, 16          // stride to upper 16 bits
3947      add        esi, 4           // esi = (stride << 16) | 4
3948      movd       xmm5, esi        // pmaddwd multiplier: x * 4 + y * stride
3949      sub        ecx, 4
3950      jl         l4b
3951  
3952      // setup for 4 pixel loop
3953      pshufd     xmm7, xmm7, 0x44  // dup dudv
3954      pshufd     xmm5, xmm5, 0  // dup 4, stride
3955      movdqa     xmm0, xmm2    // x0, y0, x1, y1
3956      addps      xmm0, xmm7
3957      movlhps    xmm2, xmm0
3958      movdqa     xmm4, xmm7
3959      addps      xmm4, xmm4    // dudv *= 2
3960      movdqa     xmm3, xmm2    // x2, y2, x3, y3
3961      addps      xmm3, xmm4
3962      addps      xmm4, xmm4    // dudv *= 4
3963  
3964      // 4 pixel loop
3965      align      4
3966    l4:
3967      cvttps2dq  xmm0, xmm2    // x, y float to int first 2
3968      cvttps2dq  xmm1, xmm3    // x, y float to int next 2
3969      packssdw   xmm0, xmm1    // x, y as 8 shorts
3970      pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
3971      movd       esi, xmm0
3972      pshufd     xmm0, xmm0, 0x39  // shift right
3973      movd       edi, xmm0
3974      pshufd     xmm0, xmm0, 0x39  // shift right
3975      movd       xmm1, [eax + esi]  // read pixel 0
3976      movd       xmm6, [eax + edi]  // read pixel 1
3977      punpckldq  xmm1, xmm6     // combine pixel 0 and 1
3978      addps      xmm2, xmm4    // x, y += dx, dy first 2
3979      movq       qword ptr [edx], xmm1
3980      movd       esi, xmm0
3981      pshufd     xmm0, xmm0, 0x39  // shift right
3982      movd       edi, xmm0
3983      movd       xmm6, [eax + esi]  // read pixel 2
3984      movd       xmm0, [eax + edi]  // read pixel 3
3985      punpckldq  xmm6, xmm0     // combine pixel 2 and 3
3986      addps      xmm3, xmm4    // x, y += dx, dy next 2
3987      sub        ecx, 4
3988      movq       qword ptr [edx + 8], xmm6
3989      lea        edx, [edx + 16]
3990      jge        l4
3991  
3992    l4b:
3993      add        ecx, 4 - 1
3994      jl         l1b
3995  
3996      // 1 pixel loop
3997      align      4
3998    l1:
3999      cvttps2dq  xmm0, xmm2    // x, y float to int
4000      packssdw   xmm0, xmm0    // x, y as shorts
4001      pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
4002      addps      xmm2, xmm7    // x, y += dx, dy
4003      movd       esi, xmm0
4004      movd       xmm0, [eax + esi]  // copy a pixel
4005      sub        ecx, 1
4006      movd       [edx], xmm0
4007      lea        edx, [edx + 4]
4008      jge        l1
4009    l1b:
4010      pop        edi
4011      pop        esi
4012      ret
4013    }
4014  }
4015  #endif  // HAS_ARGBAFFINEROW_SSE2
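
// For reference: a scalar sketch of the affine row above, illustrative only.
// (u, v) starts at uv_dudv[0..1] and steps by uv_dudv[2..3] per destination
// pixel; each source pixel is fetched at offset x * 4 + y * stride, with
// cvttps2dq-style truncation toward zero.
static void ARGBAffineRow_ScalarSketch(const uint8* src_argb,
                                       int src_argb_stride, uint8* dst_argb,
                                       const float* uv_dudv, int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int x = 0; x < width; ++x) {
    const int ix = static_cast<int>(u);  // truncate like cvttps2dq.
    const int iy = static_cast<int>(v);
    const uint8* src = src_argb + iy * src_argb_stride + ix * 4;
    uint8* dst = dst_argb + x * 4;
    dst[0] = src[0];  // copy one ARGB pixel, byte by byte.
    dst[1] = src[1];
    dst[2] = src[2];
    dst[3] = src[3];
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}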
4016  
4017  // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
4018  __declspec(naked) __declspec(align(16))
4019  void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4020                                ptrdiff_t src_stride, int dst_width,
4021                                int source_y_fraction) {
4022    __asm {
4023      push       esi
4024      push       edi
4025      mov        edi, [esp + 8 + 4]   // dst_ptr
4026      mov        esi, [esp + 8 + 8]   // src_ptr
4027      mov        edx, [esp + 8 + 12]  // src_stride
4028      mov        ecx, [esp + 8 + 16]  // dst_width
4029      mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
4030      sub        edi, esi
4031      shr        eax, 1
4032      cmp        eax, 0
4033      je         xloop1
4034      cmp        eax, 64
4035      je         xloop2
4036      movd       xmm0, eax  // high fraction 0..127
4037      neg        eax
4038      add        eax, 128
4039      movd       xmm5, eax  // low fraction 128..1
4040      punpcklbw  xmm5, xmm0
4041      punpcklwd  xmm5, xmm5
4042      pshufd     xmm5, xmm5, 0
4043  
4044      align      16
4045    xloop:
4046      movdqa     xmm0, [esi]
4047      movdqa     xmm2, [esi + edx]
4048      movdqa     xmm1, xmm0
4049      punpcklbw  xmm0, xmm2
4050      punpckhbw  xmm1, xmm2
4051      pmaddubsw  xmm0, xmm5
4052      pmaddubsw  xmm1, xmm5
4053      psrlw      xmm0, 7
4054      psrlw      xmm1, 7
4055      packuswb   xmm0, xmm1
4056      sub        ecx, 4
4057      movdqa     [esi + edi], xmm0
4058      lea        esi, [esi + 16]
4059      jg         xloop
4060  
4061      pop        edi
4062      pop        esi
4063      ret
4064  
4065      align      16
4066    xloop1:
4067      movdqa     xmm0, [esi]
4068      sub        ecx, 4
4069      movdqa     [esi + edi], xmm0
4070      lea        esi, [esi + 16]
4071      jg         xloop1
4072  
4073      pop        edi
4074      pop        esi
4075      ret
4076  
4077      align      16
4078    xloop2:
4079      movdqa     xmm0, [esi]
4080      pavgb      xmm0, [esi + edx]
4081      sub        ecx, 4
4082      movdqa     [esi + edi], xmm0
4083      lea        esi, [esi + 16]
4084      jg         xloop2
4085  
4086      pop        edi
4087      pop        esi
4088      ret
4089    }
4090  }
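
// For reference: a scalar sketch of the bilinear row blend above,
// illustrative only. The fraction is halved to 0..128 so both weights fit
// pmaddubsw's signed-byte range; 0 and 128 take the copy and pavgb fast
// paths in the SIMD code (pavgb rounds up, this sketch truncates).
static void ARGBInterpolateRow_ScalarSketch(uint8* dst_ptr,
                                            const uint8* src_ptr,
                                            ptrdiff_t src_stride,
                                            int dst_width,
                                            int source_y_fraction) {
  const int f = source_y_fraction >> 1;  // 0..128.
  for (int x = 0; x < dst_width * 4; ++x) {
    dst_ptr[x] = static_cast<uint8>(
        (src_ptr[x] * (128 - f) + src_ptr[x + src_stride] * f) >> 7);
  }
}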
4091  
4092  #endif  // _M_IX86
4093  
4094  #ifdef __cplusplus
4095  }  // extern "C"
4096  }  // namespace libyuv
4097  #endif
4098