• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; Copyright (c) 2011 The Chromium Authors. All rights reserved.
2; Use of this source code is governed by a BSD-style license that can be
3; found in the LICENSE file.
4
5%include "media/base/simd/media_export.asm"
6%include "third_party/x86inc/x86inc.asm"
7
;
; This file uses SSE, SSE2, SSE3, and SSSE3, which are supported by all ATOM
; processors.
;
; The CPU feature list below must name every instruction set used in this
; file: movdqa/pshufd/pshuflw/psrldq are SSE2, phaddsw/pmaddubsw are SSSE3.
;
  SECTION_TEXT
  CPU       SSE, SSE2, SSE3, SSSE3
14
;
; Constant registers. The macros in this file only ever read them, so none of
; them may be named as a destination operand.
;
; The four coefficient registers repeat one 4-byte pattern over all 16 bytes.
; Byte 0 of each pattern multiplies B, byte 1 multiplies G, byte 2 multiplies
; R and byte 3 multiplies A:
;
;   register              b[i]  b[i+1]  b[i+2]  b[i+3]     (i = 0, 4, 8, 12)
;   XMM_CONST_Y0 (xmm7)     25       2      66       0
;   XMM_CONST_Y1 (xmm6)      0     127       0       0
;   XMM_CONST_U  (xmm5)    112     -74     -38       0
;   XMM_CONST_V  (xmm4)    -18     -94     112       0
;
; XMM_CONST_128 (xmm3) is used by CALC_Y and CALC_UV as the value 128 in each
; 16-bit word (rounding term and chroma bias).
; NOTE(review): the registers are presumably loaded by the included .inc file,
; which is not part of this file -- confirm there.
;
%define XMM_CONST_128   xmm3
%define XMM_CONST_V     xmm4
%define XMM_CONST_U     xmm5
%define XMM_CONST_Y1    xmm6
%define XMM_CONST_Y0    xmm7
30
;
; LOAD_XMM %1 (xmm), %2 (imm32)
; Broadcasts a 32-bit immediate to every doubleword of an XMM register:
;   %1.d[0] = %1.d[1] = %1.d[2] = %1.d[3] = %2;
; Clobbers: TEMP (the immediate is staged in TEMPd).
;
%macro LOAD_XMM 2
  ; There is no immediate-to-XMM move, so go through a general register.
  mov       TEMPd, %2
  movd      %1, TEMPd
  ; Shuffle selector 0 picks d[0] for all four destination doublewords.
  pshufd    %1, %1, 0
%endmacro
41
;
; UNPACKRGB %1 (xmm), %2 (imm8)
; Opens a one-byte gap at byte index %2 of %1 and fills it with zero:
;   bytes 0 .. %2 - 1 keep their position,
;   byte %2 becomes 0,
;   each byte that was at index i >= %2 moves to index i + 1
;   (the byte that was at index 15 is shifted out).
; READ_ARGB uses this to place a zero alpha byte behind a packed RGB pixel.
; Clobbers: xmm1, therefore %1 must not be xmm1.
;
%macro UNPACKRGB 2
  ; xmm1 = %1 with its low %2 bytes cleared (i.e. only bytes %2 .. 15).
  movdqa    xmm1, %1
  psrldq    xmm1, %2
  pslldq    xmm1, %2

  ; Remove those same bytes from %1, leaving only bytes 0 .. %2 - 1.
  pxor      %1, xmm1

  ; Shift the upper part up by one byte and merge it back in. Byte %2 is
  ; zero in both operands, which creates the gap.
  pslldq    xmm1, 1
  por       %1, xmm1
%endmacro
57
;
; READ_ARGB %1 (xmm), %2 (imm)
; Reads %2 pixels (1, 2 or 4) from the source row into the low bytes of %1.
; The result always consists of 4-byte ARGB pixels:
;   PIXELSIZE == 4: the source bytes are loaded unchanged.
;   PIXELSIZE == 3: packed RGB pixels are loaded and a zero alpha byte is
;                   inserted behind each of them.
; The first pixel is read from ARGBq + WIDTHq * PIXELSIZE * 2.
; Any other value of %2 or PIXELSIZE is rejected at assembly time.
;
; Clobbers (PIXELSIZE == 3 only): TEMP and xmm1, so %1 must not be xmm1.
; WIDTH is overwritten temporarily in the one-pixel RGB case and restored
; before the macro ends.
;
%macro READ_ARGB 2

%if PIXELSIZE == 4

  ; Read ARGB pixels from the source. (This macro assumes the input buffer may
  ; not be aligned to a 16-byte boundary, hence movdqu for the 16-byte load.)
%if %2 == 1
  movd      %1, DWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 2
  movq      %1, QWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 4
  movdqu    %1, DQWORD [ARGBq + WIDTHq * 4 * 2]
%else
%error unsupported number of pixels.
%endif

%elif PIXELSIZE == 3

  ; Read RGB pixels from the source and convert them to ARGB pixels.
%if %2 == 1
  ; Read one RGB pixel and convert it to one ARGB pixel.
  ; TEMP is the load destination, so WIDTH itself has to hold the scaled
  ; offset. Park the real WIDTH in xmm1 meanwhile.
  MOVq      xmm1, WIDTHq

  ; WIDTH = WIDTH * 3. Assemble the three source bytes in TEMPd: byte 2 goes
  ; to bits 16..23, bytes 0..1 to the low word. movzx leaves bits 24..31
  ; zero, which becomes the alpha value.
  lea       WIDTHq, [WIDTHq + WIDTHq * 2]
  movzx     TEMPd, BYTE [ARGBq + WIDTHq * 2 + 2]
  shl       TEMPd, 16
  mov       TEMPw, WORD [ARGBq + WIDTHq * 2]
  movd      %1, TEMPd

  ; Restore the WIDTH register.
  MOVq      WIDTHq, xmm1
%elif %2 == 2
  ; Read two RGB pixels and convert them to two ARGB pixels.
  ; TEMP = WIDTH * 3. Read six contiguous bytes: source bytes 0..3 with movd
  ; and source bytes 4..5 into word 2 (register bytes 4..5). The word index
  ; must be 2; index 3 would leave a two-byte hole at bytes 4..5 and
  ; UNPACKRGB would then drop G and R of the second pixel.
  mov       TEMPq, WIDTHq
  lea       TEMPq, [TEMPq + TEMPq * 2]
  movd      %1, DWORD [ARGBq + TEMPq * 2]
  pinsrw    %1, WORD [ARGBq + TEMPq * 2 + 4], 2

  ; Insert a zero alpha byte behind the first pixel. The second pixel already
  ; has zero in its alpha position once it has been shifted up.
  UNPACKRGB %1, 3
%elif %2 == 4
  ; Read four RGB pixels and convert them to four ARGB pixels.
  ; TEMP = WIDTH * 3. Read twelve contiguous bytes: source bytes 0..7 with
  ; movq, source bytes 8..11 with movd into xmm1. shufps then builds
  ; %1 = { %1.d[0], %1.d[1], xmm1.d[0], xmm1.d[1] }, where xmm1.d[1] is zero.
  mov       TEMPq, WIDTHq
  lea       TEMPq, [TEMPq + TEMPq * 2]
  movq      %1, QWORD [ARGBq + TEMPq * 2]
  movd      xmm1, DWORD [ARGBq + TEMPq * 2 + 8]
  shufps    %1, xmm1, 01000100B

  ; Insert a zero alpha byte behind each of the first three pixels (byte
  ; indices 3, 7 and 11). The fourth pixel's alpha is already zero.
  UNPACKRGB %1, 3
  UNPACKRGB %1, 4 + 3
  UNPACKRGB %1, 4 + 4 + 3
%else
%error unsupported number of pixels.
%endif

%else
%error unsupported PIXELSIZE value.
%endif

%endmacro
133
;
; CALC_Y %1 (xmm), %2 (xmm)
; Computes four Y values from the four ARGB pixels held in %2. For i = 0..3:
;   %1.b[i] = ToByte((25 * B(i) + 129 * G(i) + 66 * R(i) + 128) / 256 + 16);
; Clobbers: xmm2. %2 is left unchanged.
;
%macro CALC_Y 2
  ; The G coefficient 129 does not fit into a signed byte, and one single sum
  ; could exceed the signed 16-bit range. The formula is therefore evaluated
  ; as two partial sums (G coefficient split into 2 + 127). For i = 0..3:
  ;   %1.w[i]   = 25 * %2.b[4i] +   2 * %2.b[4i+1] + 66 * %2.b[4i+2]
  ;   xmm2.w[i] =                 127 * %2.b[4i+1]
  ; pmaddubsw multiplies unsigned pixel bytes by signed coefficient bytes and
  ; adds byte pairs; phaddsw adds the two resulting words of each pixel.
  movdqa    %1, %2
  pmaddubsw %1, XMM_CONST_Y0
  phaddsw   %1, %1
  movdqa    xmm2, %2
  pmaddubsw xmm2, XMM_CONST_Y1
  phaddsw   xmm2, xmm2

  ; Combine the partial sums and round:
  ;   %1.w[i] = (%1.w[i] + xmm2.w[i] + 128) / 256
  ; The total is treated as an unsigned word from here on (logical shift).
  paddw     %1, xmm2
  movdqa    xmm2, XMM_CONST_128
  paddw     %1, xmm2
  psrlw     %1, 8

  ; Add the offset 16, obtained as 128 >> 3 from the copy of XMM_CONST_128,
  ; then narrow the words to bytes with unsigned saturation.
  psrlw     xmm2, 3
  paddw     %1, xmm2
  packuswb  %1, %1
%endmacro
172
;
; INIT_UV %1 (r32), %2 (reg), %3 (imm)
; Only assembles code when SUBSAMPLING == 1 && LINE == 1; in every other
; configuration the macro expands to nothing (and %3 is not checked).
; In that configuration it loads the bytes already stored at [%2 + WIDTHq],
; zero-extended, into %1:
;   %3 == 1 or %3 == 2: one byte,
;   %3 == 4:            two bytes.
; NOTE(review): %1 is presumably passed on to CALC_UV as its fourth argument,
; which averages it into the new result in the same configuration -- confirm
; against the call sites in the included .inc file.
;
%macro INIT_UV 3

%if SUBSAMPLING == 1 && LINE == 1
%if %3 == 4
  movzx     %1, WORD [%2 + WIDTHq]
%elif %3 == 1 || %3 == 2
  movzx     %1, BYTE [%2 + WIDTHq]
%else
%error unsupported number of pixels.
%endif
%endif

%endmacro
189
;
; CALC_UV %1 (xmm), %2 (xmm), %3 (xmm), %4 (r32)
; Calculates two U (or V) values from four ARGB pixels stored in %2.
; %3 selects the plane (XMM_CONST_U or XMM_CONST_V). With S(i) defined as
;   %3 == XMM_CONST_U: S(i) = 112 * B(i) - 74 * G(i) -  38 * R(i)
;   %3 == XMM_CONST_V: S(i) = -18 * B(i) - 94 * G(i) + 112 * R(i)
; the results are
;   if (SUBSAMPLING) {
;     %1.b[0] = ToByte(((S(0) + S(1) + 1) / 2 + 128) / 256 + 128);
;     %1.b[1] = ToByte(((S(2) + S(3) + 1) / 2 + 128) / 256 + 128);
;   } else {
;     %1.b[0] = ToByte((S(0) + 128) / 256 + 128);
;     %1.b[1] = ToByte((S(2) + 128) / 256 + 128);
;   }
; where every division rounds towards negative infinity (arithmetic shift).
; When SUBSAMPLING == 1 && LINE == 1, each result byte is finally averaged
; with the corresponding byte of %4: %1.b[k] = (%1.b[k] + %4.b[k] + 1) / 2.
; Clobbers: xmm2 (only when SUBSAMPLING == 1).
;
%macro CALC_UV 4
  ; for (int i = 0; i < 4; ++i) {
  ;   %1.w[i] = 0;
  ;   for (int j = 0; j < 4; ++j)
  ;     %1.w[i] += %3.b[i * 4 + j] * %2.b[i * 4 + j];
  ; }
  ; (%2 bytes are unsigned, %3 bytes are signed; the sums are signed words.)
  movdqa    %1, %2
  pmaddubsw %1, %3
  phaddsw   %1, %1

%if SUBSAMPLING == 1
  ; %1.w[0] = (%1.w[0] + %1.w[1] + 1) / 2;
  ; %1.w[1] = (%1.w[1] + %1.w[0] + 1) / 2;
  ; %1.w[2] = (%1.w[2] + %1.w[3] + 1) / 2;
  ; %1.w[3] = (%1.w[3] + %1.w[2] + 1) / 2;
  ; The sums are signed but pavgw averages unsigned words, so a pair with
  ; opposite signs would wrap. Flip the sign bit (0x8000, built from
  ; XMM_CONST_128 << 8) before averaging and flip it back afterwards; this
  ; maps the signed range onto the unsigned range and back.
  movdqa    xmm2, XMM_CONST_128
  psllw     xmm2, 8
  pxor      %1, xmm2
  pshuflw   xmm2, %1, 10110001B
  pavgw     %1, xmm2
  movdqa    xmm2, XMM_CONST_128
  psllw     xmm2, 8
  pxor      %1, xmm2
%endif

  ; %1.b[0] = ToByte((%1.w[0] + 128) / 256 + 128);
  ; %1.b[1] = ToByte((%1.w[2] + 128) / 256 + 128);
  ; pshuflw moves words 0 and 2 next to each other; psraw keeps the sign.
  pshuflw   %1, %1, 10001000B
  paddw     %1, XMM_CONST_128
  psraw     %1, 8
  paddw     %1, XMM_CONST_128
  packuswb  %1, %1

%if SUBSAMPLING == 1 && LINE == 1
  ; %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2;
  ; %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2;
  ; These are unsigned bytes, so the unsigned pavgb is the right average here.
  movd      xmm2, %4
  pavgb     %1, xmm2
%endif
%endmacro
241
;
; Function instantiations.
;
; Each group below sets four preprocessor symbols and then includes
; convert_rgb_to_yuv_ssse3.inc once:
;   SYMBOL       name of the function to define
;   PIXELSIZE    bytes per source pixel: 4 (ARGB) or 3 (RGB)
;   SUBSAMPLING  1 to average horizontally adjacent chroma sums, else 0
;   LINE         with SUBSAMPLING == 1: 1 to also average the new chroma with
;                the chroma bytes already stored in u[] / v[], else 0
; The defines must precede their %include; the groups are order-independent.
;

; extern "C" void ConvertARGBToYUVRow_SSSE3(const uint8* argb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
%define SYMBOL          ConvertARGBToYUVRow_SSSE3
%define PIXELSIZE       4
%define SUBSAMPLING     0
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"

; extern "C" void ConvertRGBToYUVRow_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          ptrdiff_t width);
%define SYMBOL          ConvertRGBToYUVRow_SSSE3
%define PIXELSIZE       3
%define SUBSAMPLING     0
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"

; extern "C" void ConvertARGBToYUVEven_SSSE3(const uint8* argb,
;                                            uint8* y,
;                                            uint8* u,
;                                            uint8* v,
;                                            ptrdiff_t width);
%define SYMBOL          ConvertARGBToYUVEven_SSSE3
%define PIXELSIZE       4
%define SUBSAMPLING     1
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"

; extern "C" void ConvertARGBToYUVOdd_SSSE3(const uint8* argb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
%define SYMBOL          ConvertARGBToYUVOdd_SSSE3
%define PIXELSIZE       4
%define SUBSAMPLING     1
%define LINE            1
%include "convert_rgb_to_yuv_ssse3.inc"

; extern "C" void ConvertRGBToYUVEven_SSSE3(const uint8* rgb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
%define SYMBOL          ConvertRGBToYUVEven_SSSE3
%define PIXELSIZE       3
%define SUBSAMPLING     1
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"

; extern "C" void ConvertRGBToYUVOdd_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          ptrdiff_t width);
%define SYMBOL          ConvertRGBToYUVOdd_SSSE3
%define PIXELSIZE       3
%define SUBSAMPLING     1
%define LINE            1
%include "convert_rgb_to_yuv_ssse3.inc"
319