/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                        13, 65, 33, 0, 13, 65, 33, 0};

// JPEG full range.
static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                         15, 75, 38, 0, 15, 75, 38, 0};
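
// Illustrative scalar equivalents of the 7-bit fixed-point Y coefficients
// above (ARGB is stored little-endian as B,G,R,A bytes):
//   BT.601 studio range: Y  = ((13 * B + 65 * G + 33 * R) >> 7) + 16
//   JPEG full range:     YJ = (15 * B + 75 * G + 38 * R + 64) >> 7
// The row functions below compute the same dot product 16 (or 32) pixels at
// a time with pmaddubsw + phaddw.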
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                        112, -74, -38, 0, 112, -74, -38, 0};

static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                         127, -84, -43, 0, 127, -84, -43, 0};

static vec8 kARGBToV = {
    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                         -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                        0, 33, 65, 13, 0, 33, 65, 13};

static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                        0, -38, -74, 112, 0, -38, -74, 112};

static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                        0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                        33, 65, 13, 0, 33, 65, 13, 0};

static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                        -38, -74, 112, 0, -38, -74, 112, 0};

static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                        112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                        0, 13, 65, 33, 0, 13, 65, 33};

static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                        0, 112, -74, -38, 0, 112, -74, -38};

static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                        0, -18, -94, 112, 0, -18, -94, 112};

static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                        16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 0.5 in 7-bit fixed point, for rounding.
static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                          128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                            0x8080u, 0x8080u, 0x8080u, 0x8080u};
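
// Illustrative scalar equivalents of the 8-bit fixed-point UV constants
// (applied to 2x2-averaged B,G,R for 4:2:0):
//   U = ((112 * B - 74 * G - 38 * R) >> 8) + 128
//   V = ((-18 * B - 94 * G + 112 * R) >> 8) + 128
// kAddUVJ128 is word-sized 0x8080 because the J path adds it before the
// arithmetic shift: the 0x8000 part becomes the +128 bias after >>8, and
// the low 0x80 rounds to nearest.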
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u,
                                        6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
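
// pshufb picks one source byte per output byte; an index with the high bit
// set (128 here) writes zero instead. In the RGB24/RAW-to-ARGB tables the
// alpha slots use indices 12-15 as placeholders; the row functions then set
// alpha to 0xff by or-ing with an 0xff000000 mask.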

// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
                                      8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
                                    0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
                                     1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
                                    1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
                                     0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_J400TOARGBROW_SSE2
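// Replicates 8 grey pixels to ARGB per iteration. Rough scalar equivalent:
//   ((uint32*)dst_argb)[i] = 0xff000000u | (src_y[i] * 0x00010101u);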
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),    // %0
    "+r"(dst_argb), // %1
    "+r"(width)     // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
#endif // HAS_J400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_rgb24), // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "m"(kShuffleMaskRGB24ToARGB) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_raw),  // %0
    "+r"(dst_argb), // %1
    "+r"(width)     // %2
  : "m"(kShuffleMaskRAWToARGB) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
  asm volatile (
    "movdqa %3,%%xmm3 \n"
    "movdqa %4,%%xmm4 \n"
    "movdqa %5,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n"
    "lea " MEMLEA(0x18,0) ",%0 \n"
    "pshufb %%xmm3,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
    "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_rgb24), // %1
    "+r"(width)      // %2
  : "m"(kShuffleMaskRAWToRGB24_0), // %3
    "m"(kShuffleMaskRAWToRGB24_1), // %4
    "m"(kShuffleMaskRAWToRGB24_2)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

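// Expands RGB565 to ARGB. The pmulhuw constants widen the narrow fields to
// 8 bits without a table: for a 5-bit value v aligned at the top of a word,
// ((v << 11) * 0x0108) >> 16 == (v << 3) | (v >> 2), and 0x2080 likewise
// maps the 6-bit green field to (v << 2) | (v >> 4).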
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20802080,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

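// ARGB1555 reuses the 5-bit expansion trick; alpha comes from bit 15:
// psraw $0x8 smears the sign (alpha) bit across the high byte, and the
// 0xff00 mask leaves 0xff or 0x00 in the A position.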
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "m"(kShuffleMaskARGBToRGB24) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "m"(kShuffleMaskARGBToRAW) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

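// Packs ARGB down to RGB565, 4 pixels per iteration. Scalar equivalent:
//   rgb565 = ((R >> 3) << 11) | ((G >> 2) << 5) | (B >> 3)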
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

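// Same 565 packing, but adds an ordered-dither bias first: dither4 holds 4
// per-pixel dither bytes, which the setup replicates so each pixel's B, G, R
// and A get its byte added with unsigned saturation (paddusb) before the
// channels are truncated.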
void ARGBToRGB565DitherRow_SSE2(const uint8* src,
                                uint8* dst,
                                const uint32 dither4,
                                int width) {
  asm volatile (
    "movd %3,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm6 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "punpcklwd %%xmm6,%%xmm6 \n"
    "punpckhwd %%xmm7,%%xmm7 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "paddusb %%xmm6,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),      // %0
    "+r"(dst),      // %1
    "+r"(width)     // %2
  : "m"(dither4)    // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
    "xmm7");
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8* src,
                                uint8* dst,
                                const uint32 dither4,
                                int width) {
  asm volatile (
    "vbroadcastss %3,%%xmm6 \n"
    "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
    "vpermq $0xd8,%%ymm6,%%ymm6 \n"
    "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
    "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
    "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
    "vpslld $0x5,%%ymm4,%%ymm4 \n"
    "vpslld $0xb,%%ymm3,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu (%0),%%ymm0 \n"
    "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
    "vpsrld $0x5,%%ymm0,%%ymm2 \n"
    "vpsrld $0x3,%%ymm0,%%ymm1 \n"
    "vpsrld $0x8,%%ymm0,%%ymm0 \n"
    "vpand %%ymm4,%%ymm2,%%ymm2 \n"
    "vpand %%ymm3,%%ymm1,%%ymm1 \n"
    "vpand %%ymm5,%%ymm0,%%ymm0 \n"
    "vpor %%ymm2,%%ymm1,%%ymm1 \n"
    "vpor %%ymm1,%%ymm0,%%ymm0 \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "lea 0x20(%0),%0 \n"
    "vmovdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src),      // %0
    "+r"(dst),      // %1
    "+r"(width)     // %2
  : "m"(dither4)    // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
    "xmm7");
}
#endif // HAS_ARGBTORGB565DITHERROW_AVX2

void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :: "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
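// pmaddubsw multiplies 16 unsigned pixel bytes by the signed coefficients
// and sums adjacent products, yielding (13*B + 65*G) and (33*R + 0*A) words;
// phaddw adds those pairs into one word per pixel, psrlw $0x7 drops the
// fixed-point scale, packuswb saturates to bytes, and paddb adds the +16
// bias.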
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y),    // %1
    "+r"(width)     // %2
  : "m"(kARGBToY),  // %3
    "m"(kAddY16)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow, but with JPEG coefficients, no +16 bias, and rounding
// (+64) before the shift.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y),    // %1
    "+r"(width)     // %2
  : "m"(kARGBToYJ), // %3
    "m"(kAddYJ64)   // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_ARGBTOYJROW_SSSE3

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd index vector to undo the lane interleaving of vphaddw + vpackuswb.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
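
// vphaddw and vpackuswb operate within each 128-bit lane, so the 32 results
// come out lane-interleaved; kPermdARGBToY_AVX reorders the eight dwords
// back to memory order before the store.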

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vbroadcastf128 %4,%%ymm5 \n"
    "vmovdqu %5,%%ymm6 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"  // mutates.
    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n"  // mutates.
    "vpermd %%ymm0,%%ymm6,%%ymm0 \n"  // unmutate.
    "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"  // add 16 for Y
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y),    // %1
    "+r"(width)     // %2
  : "m"(kARGBToY),  // %3
    "m"(kAddY16),   // %4
    "m"(kPermdARGBToY_AVX) // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vbroadcastf128 %4,%%ymm5 \n"
    "vmovdqu %5,%%ymm6 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"  // mutates.
    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"  // Add .5 for rounding.
    "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n"  // mutates.
    "vpermd %%ymm0,%%ymm6,%%ymm0 \n"  // unmutate.
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y),    // %1
    "+r"(width)     // %2
  : "m"(kARGBToYJ), // %3
    "m"(kAddYJ64),  // %4
    "m"(kPermdARGBToY_AVX) // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif // HAS_ARGBTOYJROW_AVX2

#ifdef HAS_ARGBTOUVROW_SSSE3
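// 4:2:0 chroma for 16 pixels: pavgb against the next row (%4 = stride)
// averages vertically, the shufps $0x88/$0xdd pair splits even and odd
// pixels so a second pavgb averages horizontally, and pmaddubsw then applies
// the U and V coefficients to the 2x2-averaged pixels.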
void ARGBToUVRow_SSSE3(const uint8* src_argb0,
                       int src_stride_argb,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kARGBToV),   // %5
    "m"(kARGBToU),   // %6
    "m"(kAddUV128)   // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTOUVROW_SSSE3

#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb table to restore word order after vphaddw + vpacksswb.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
void ARGBToUVRow_AVX2(const uint8* src_argb0,
                      int src_stride_argb,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5 \n"
    "vbroadcastf128 %6,%%ymm6 \n"
    "vbroadcastf128 %7,%%ymm7 \n"
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
    "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
    "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
    "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
    "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
    "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
    "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpsraw $0x8,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm0,%%ymm0 \n"
    "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpshufb %8,%%ymm0,%%ymm0 \n"
    "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUV128),  // %5
    "m"(kARGBToV),   // %6
    "m"(kARGBToU),   // %7
    "m"(kShufARGBToUV_AVX) // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTOUVROW_AVX2

#ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8* src_argb0,
                       int src_stride_argb,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5 \n"
    "vbroadcastf128 %6,%%ymm6 \n"
    "vbroadcastf128 %7,%%ymm7 \n"
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
    "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
    "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
    "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
    "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
    "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
    "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm0,%%ymm0 \n"
    "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpshufb %8,%%ymm0,%%ymm0 \n"

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUVJ128), // %5
    "m"(kARGBToVJ),  // %6
    "m"(kARGBToUJ),  // %7
    "m"(kShufARGBToUV_AVX) // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTOUVJROW_AVX2

#ifdef HAS_ARGBTOUVJROW_SSSE3
void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
                        int src_stride_argb,
                        uint8* dst_u,
                        uint8* dst_v,
                        int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kARGBToVJ),  // %5
    "m"(kARGBToUJ),  // %6
    "m"(kAddUVJ128)  // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTOUVJROW_SSSE3

#ifdef HAS_ARGBTOUV444ROW_SSSE3
void ARGBToUV444Row_SSSE3(const uint8* src_argb,
                          uint8* dst_u,
                          uint8* dst_v,
                          int width) {
  asm volatile (
    "movdqa %4,%%xmm3 \n"
    "movdqa %5,%%xmm4 \n"
    "movdqa %6,%%xmm5 \n"
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm6 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm2 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "packsswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm3,%%xmm0 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm2 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "packsswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_u),    // %1
    "+r"(dst_v),    // %2
    "+rm"(width)    // %3
  : "m"(kARGBToV),  // %4
    "m"(kARGBToU),  // %5
    "m"(kAddUV128)  // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6"
  );
}
#endif // HAS_ARGBTOUV444ROW_SSSE3

void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_bgra), // %0
    "+r"(dst_y),    // %1
    "+r"(width)     // %2
  : "m"(kBGRAToY),  // %3
    "m"(kAddY16)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void BGRAToUVRow_SSSE3(const uint8* src_bgra0,
                       int src_stride_bgra,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_bgra0), // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "r"((intptr_t)(src_stride_bgra)), // %4
    "m"(kBGRAToV),   // %5
    "m"(kBGRAToU),   // %6
    "m"(kAddUV128)   // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_abgr), // %0
    "+r"(dst_y),    // %1
    "+r"(width)     // %2
  : "m"(kABGRToY),  // %3
    "m"(kAddY16)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_rgba), // %0
    "+r"(dst_y),    // %1
    "+r"(width)     // %2
  : "m"(kRGBAToY),  // %3
    "m"(kAddY16)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void ABGRToUVRow_SSSE3(const uint8* src_abgr0,
                       int src_stride_abgr,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_abgr0), // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "r"((intptr_t)(src_stride_abgr)), // %4
    "m"(kABGRToV),   // %5
    "m"(kABGRToU),   // %6
    "m"(kAddUV128)   // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
                       int src_stride_rgba,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_rgba0), // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "r"((intptr_t)(src_stride_rgba)), // %4
    "m"(kRGBAToV),   // %5
    "m"(kRGBAToU),   // %6
    "m"(kAddUV128)   // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

// Read 8 UV from 444
#define READYUV444 \
  "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm0 \n" \
  "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
  "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm0 \n" \
  "punpcklwd %%xmm0,%%xmm0 \n" \
  "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
  "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm0 \n" \
  "punpcklwd %%xmm0,%%xmm0 \n" \
  "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
  "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
  "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 \
  "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
  "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
  "punpcklwd %%xmm0,%%xmm0 \n" \
  "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 VU from NV21, upsample to 8 UV
#define READNV21 \
  "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
  "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
  "pshufb %[kShuffleNV21], %%xmm0 \n" \
  "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 8 YUY2 pixels, giving 8 Y, and upsample 4 UV pairs to 8 UV.
#define READYUY2 \
  "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
  "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
  "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
  "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
  "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"

// Read 8 UYVY pixels, giving 8 Y, and upsample 4 UV pairs to 8 UV.
#define READUYVY \
  "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
  "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
  "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
  "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
  "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"

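// YUVTORGB computes, per channel C in {B,G,R}, roughly the scalar form
//   C = clamp((bias_C - (U * u_C + V * v_C) + ((Y * 0x0101 * YG) >> 16)) >> 6)
// with the coefficients and biases taken from the YuvConstants struct. The
// x86_64 variant caches the constants in xmm8-xmm14; the 32-bit variant
// re-reads them from memory each iteration.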
#if defined(__x86_64__)
#define YUVTORGB_SETUP(yuvconstants) \
  "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
  "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
  "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
  "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
  "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \
  "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
  "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB(yuvconstants) \
  "movdqa %%xmm0,%%xmm1 \n" \
  "movdqa %%xmm0,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm3 \n" \
  "movdqa %%xmm11,%%xmm0 \n" \
  "pmaddubsw %%xmm8,%%xmm1 \n" \
  "psubw %%xmm1,%%xmm0 \n" \
  "movdqa %%xmm12,%%xmm1 \n" \
  "pmaddubsw %%xmm9,%%xmm2 \n" \
  "psubw %%xmm2,%%xmm1 \n" \
  "movdqa %%xmm13,%%xmm2 \n" \
  "pmaddubsw %%xmm10,%%xmm3 \n" \
  "psubw %%xmm3,%%xmm2 \n" \
  "pmulhuw %%xmm14,%%xmm4 \n" \
  "paddsw %%xmm4,%%xmm0 \n" \
  "paddsw %%xmm4,%%xmm1 \n" \
  "paddsw %%xmm4,%%xmm2 \n" \
  "psraw $0x6,%%xmm0 \n" \
  "psraw $0x6,%%xmm1 \n" \
  "psraw $0x6,%%xmm2 \n" \
  "packuswb %%xmm0,%%xmm0 \n" \
  "packuswb %%xmm1,%%xmm1 \n" \
  "packuswb %%xmm2,%%xmm2 \n"
#define YUVTORGB_REGS \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB(yuvconstants) \
  "movdqa %%xmm0,%%xmm1 \n" \
  "movdqa %%xmm0,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm3 \n" \
  "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
  "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \
  "psubw %%xmm1,%%xmm0 \n" \
  "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \
  "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \
  "psubw %%xmm2,%%xmm1 \n" \
  "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
  "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
  "psubw %%xmm3,%%xmm2 \n" \
  "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
  "paddsw %%xmm4,%%xmm0 \n" \
  "paddsw %%xmm4,%%xmm1 \n" \
  "paddsw %%xmm4,%%xmm2 \n" \
  "psraw $0x6,%%xmm0 \n" \
  "psraw $0x6,%%xmm1 \n" \
  "psraw $0x6,%%xmm2 \n" \
  "packuswb %%xmm0,%%xmm0 \n" \
  "packuswb %%xmm1,%%xmm1 \n" \
  "packuswb %%xmm2,%%xmm2 \n"
#define YUVTORGB_REGS
#endif

// Store 8 ARGB values.
#define STOREARGB \
  "punpcklbw %%xmm1,%%xmm0 \n" \
  "punpcklbw %%xmm5,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm1 \n" \
  "punpcklwd %%xmm2,%%xmm0 \n" \
  "punpckhwd %%xmm2,%%xmm1 \n" \
  "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
  "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
  "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"

// Store 8 RGBA values.
#define STORERGBA \
  "pcmpeqb %%xmm5,%%xmm5 \n" \
  "punpcklbw %%xmm2,%%xmm1 \n" \
  "punpcklbw %%xmm0,%%xmm5 \n" \
  "movdqa %%xmm5,%%xmm0 \n" \
  "punpcklwd %%xmm1,%%xmm5 \n" \
  "punpckhwd %%xmm1,%%xmm0 \n" \
  "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
  "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
  "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
1655
I444ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)1656 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1657 const uint8* u_buf,
1658 const uint8* v_buf,
1659 uint8* dst_argb,
1660 const struct YuvConstants* yuvconstants,
1661 int width) {
1662 asm volatile (
1663 YUVTORGB_SETUP(yuvconstants)
1664 "sub %[u_buf],%[v_buf] \n"
1665 "pcmpeqb %%xmm5,%%xmm5 \n"
1666
1667 LABELALIGN
1668 "1: \n"
1669 READYUV444
1670 YUVTORGB(yuvconstants)
1671 STOREARGB
1672 "sub $0x8,%[width] \n"
1673 "jg 1b \n"
1674 : [y_buf]"+r"(y_buf), // %[y_buf]
1675 [u_buf]"+r"(u_buf), // %[u_buf]
1676 [v_buf]"+r"(v_buf), // %[v_buf]
1677 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1678 [width]"+rm"(width) // %[width]
1679 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1680 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1681 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1682 );
1683 }
1684
I422ToRGB24Row_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_rgb24,const struct YuvConstants * yuvconstants,int width)1685 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1686 const uint8* u_buf,
1687 const uint8* v_buf,
1688 uint8* dst_rgb24,
1689 const struct YuvConstants* yuvconstants,
1690 int width) {
1691 asm volatile (
1692 YUVTORGB_SETUP(yuvconstants)
1693 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1694 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1695 "sub %[u_buf],%[v_buf] \n"
1696
1697 LABELALIGN
1698 "1: \n"
1699 READYUV422
1700 YUVTORGB(yuvconstants)
1701 "punpcklbw %%xmm1,%%xmm0 \n"
1702 "punpcklbw %%xmm2,%%xmm2 \n"
1703 "movdqa %%xmm0,%%xmm1 \n"
1704 "punpcklwd %%xmm2,%%xmm0 \n"
1705 "punpckhwd %%xmm2,%%xmm1 \n"
1706 "pshufb %%xmm5,%%xmm0 \n"
1707 "pshufb %%xmm6,%%xmm1 \n"
1708 "palignr $0xc,%%xmm0,%%xmm1 \n"
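  // The two pshufbs compact the 8 ARGB pixels' RGB bytes so that palignr can
  // splice xmm1 onto xmm0, letting the 8-byte + 16-byte stores below write
  // exactly 24 RGB24 bytes.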
1709 "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
1710 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
1711 "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
1712 "subl $0x8,%[width] \n"
1713 "jg 1b \n"
1714 : [y_buf]"+r"(y_buf), // %[y_buf]
1715 [u_buf]"+r"(u_buf), // %[u_buf]
1716 [v_buf]"+r"(v_buf), // %[v_buf]
1717 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
1718 #if defined(__i386__)
1719 [width]"+m"(width) // %[width]
1720 #else
1721 [width]"+rm"(width) // %[width]
1722 #endif
1723 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1724 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1725 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
1726 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1727 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1728 );
1729 }
1730
1731 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
1732 const uint8* u_buf,
1733 const uint8* v_buf,
1734 uint8* dst_argb,
1735 const struct YuvConstants* yuvconstants,
1736 int width) {
1737 asm volatile (
1738 YUVTORGB_SETUP(yuvconstants)
1739 "sub %[u_buf],%[v_buf] \n"
1740 "pcmpeqb %%xmm5,%%xmm5 \n"
1741
1742 LABELALIGN
1743 "1: \n"
1744 READYUV422
1745 YUVTORGB(yuvconstants)
1746 STOREARGB
1747 "sub $0x8,%[width] \n"
1748 "jg 1b \n"
1749 : [y_buf]"+r"(y_buf), // %[y_buf]
1750 [u_buf]"+r"(u_buf), // %[u_buf]
1751 [v_buf]"+r"(v_buf), // %[v_buf]
1752 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1753 [width]"+rm"(width) // %[width]
1754 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1755 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1756 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1757 );
1758 }
1759
1760 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
1761 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
1762 const uint8* u_buf,
1763 const uint8* v_buf,
1764 const uint8* a_buf,
1765 uint8* dst_argb,
1766 const struct YuvConstants* yuvconstants,
1767 int width) {
1768 // clang-format off
1769 asm volatile (
1770 YUVTORGB_SETUP(yuvconstants)
1771 "sub %[u_buf],%[v_buf] \n"
1772
1773 LABELALIGN
1774 "1: \n"
1775 READYUVA422
1776 YUVTORGB(yuvconstants)
1777 STOREARGB
1778 "subl $0x8,%[width] \n"
1779 "jg 1b \n"
1780 : [y_buf]"+r"(y_buf), // %[y_buf]
1781 [u_buf]"+r"(u_buf), // %[u_buf]
1782 [v_buf]"+r"(v_buf), // %[v_buf]
1783 [a_buf]"+r"(a_buf), // %[a_buf]
1784 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1785 #if defined(__i386__)
1786 [width]"+m"(width) // %[width]
1787 #else
1788 [width]"+rm"(width) // %[width]
1789 #endif
1790 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1791 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1792 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1793 );
1794 // clang-format on
1795 }
1796 #endif // HAS_I422ALPHATOARGBROW_SSSE3
1797
1798 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1799 const uint8* uv_buf,
1800 uint8* dst_argb,
1801 const struct YuvConstants* yuvconstants,
1802 int width) {
1803 // clang-format off
1804 asm volatile (
1805 YUVTORGB_SETUP(yuvconstants)
1806 "pcmpeqb %%xmm5,%%xmm5 \n"
1807
1808 LABELALIGN
1809 "1: \n"
1810 READNV12
1811 YUVTORGB(yuvconstants)
1812 STOREARGB
1813 "sub $0x8,%[width] \n"
1814 "jg 1b \n"
1815 : [y_buf]"+r"(y_buf), // %[y_buf]
1816 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1817 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1818 [width]"+rm"(width) // %[width]
1819 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1820 : "memory", "cc", YUVTORGB_REGS // Does not use r14.
1821 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1822 );
1823 // clang-format on
1824 }
1825
1826 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1827 const uint8* vu_buf,
1828 uint8* dst_argb,
1829 const struct YuvConstants* yuvconstants,
1830 int width) {
1831 // clang-format off
1832 asm volatile (
1833 YUVTORGB_SETUP(yuvconstants)
1834 "pcmpeqb %%xmm5,%%xmm5 \n"
1835
1836 LABELALIGN
1837 "1: \n"
1838 READNV21
1839 YUVTORGB(yuvconstants)
1840 STOREARGB
1841 "sub $0x8,%[width] \n"
1842 "jg 1b \n"
1843 : [y_buf]"+r"(y_buf), // %[y_buf]
1844 [vu_buf]"+r"(vu_buf), // %[vu_buf]
1845 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1846 [width]"+rm"(width) // %[width]
1847 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1848 [kShuffleNV21]"m"(kShuffleNV21)
1849 : "memory", "cc", YUVTORGB_REGS // Does not use r14.
1850 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1851 );
1852 // clang-format on
1853 }
1854
1855 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
1856 uint8* dst_argb,
1857 const struct YuvConstants* yuvconstants,
1858 int width) {
1859 // clang-format off
1860 asm volatile (
1861 YUVTORGB_SETUP(yuvconstants)
1862 "pcmpeqb %%xmm5,%%xmm5 \n"
1863
1864 LABELALIGN
1865 "1: \n"
1866 READYUY2
1867 YUVTORGB(yuvconstants)
1868 STOREARGB
1869 "sub $0x8,%[width] \n"
1870 "jg 1b \n"
1871 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
1872 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1873 [width]"+rm"(width) // %[width]
1874 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1875 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
1876 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
1877 : "memory", "cc", YUVTORGB_REGS // Does not use r14.
1878 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1879 );
1880 // clang-format on
1881 }
1882
1883 void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
1884 uint8* dst_argb,
1885 const struct YuvConstants* yuvconstants,
1886 int width) {
1887 // clang-format off
1888 asm volatile (
1889 YUVTORGB_SETUP(yuvconstants)
1890 "pcmpeqb %%xmm5,%%xmm5 \n"
1891
1892 LABELALIGN
1893 "1: \n"
1894 READUYVY
1895 YUVTORGB(yuvconstants)
1896 STOREARGB
1897 "sub $0x8,%[width] \n"
1898 "jg 1b \n"
1899 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
1900 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1901 [width]"+rm"(width) // %[width]
1902 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1903 [kShuffleUYVYY]"m"(kShuffleUYVYY),
1904 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
1905 : "memory", "cc", YUVTORGB_REGS // Does not use r14.
1906 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1907 );
1908 // clang-format on
1909 }
1910
1911 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
1912 const uint8* u_buf,
1913 const uint8* v_buf,
1914 uint8* dst_rgba,
1915 const struct YuvConstants* yuvconstants,
1916 int width) {
1917 asm volatile (
1918 YUVTORGB_SETUP(yuvconstants)
1919 "sub %[u_buf],%[v_buf] \n"
1920 "pcmpeqb %%xmm5,%%xmm5 \n"
1921
1922 LABELALIGN
1923 "1: \n"
1924 READYUV422
1925 YUVTORGB(yuvconstants)
1926 STORERGBA
1927 "sub $0x8,%[width] \n"
1928 "jg 1b \n"
1929 : [y_buf]"+r"(y_buf), // %[y_buf]
1930 [u_buf]"+r"(u_buf), // %[u_buf]
1931 [v_buf]"+r"(v_buf), // %[v_buf]
1932 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
1933 [width]"+rm"(width) // %[width]
1934 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1935 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1936 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1937 );
1938 }
1939
1940 #endif // HAS_I422TOARGBROW_SSSE3
1941
1942 // Read 16 UV from 444
1943 #define READYUV444_AVX2 \
1944 "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1945 MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
1946 "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
1947 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1948 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
1949 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1950 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1951 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
1952 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
1953 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
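// Note: vpermq $0xd8 reorders the four 64-bit lanes to 0,2,1,3, compensating
// for AVX2 unpack/pack instructions operating within 128-bit lanes.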
1954
1955 // Read 8 UV from 422, upsample to 16 UV.
1956 #define READYUV422_AVX2 \
1957 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1958 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1959 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1960 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1961 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1962 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
1963 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1964 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
1965 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
1966 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
1967
1968 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
1969 #define READYUVA422_AVX2 \
1970 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1971 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1972 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1973 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1974 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1975 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
1976 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1977 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
1978 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
1979 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
1980 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
1981 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
1982 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"
1983
1984 // Read 8 UV from NV12, upsample to 16 UV.
1985 #define READNV12_AVX2 \
1986 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
1987 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
1988 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1989 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
1990 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1991 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
1992 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
1993 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
1994
1995 // Read 8 VU from NV21, upsample to 16 UV.
1996 #define READNV21_AVX2 \
1997 "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
1998 "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
1999 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2000 "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
2001 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
2002 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2003 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2004 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
2005
2006 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
2007 #define READYUY2_AVX2 \
2008 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
2009 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
2010 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
2011 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
2012 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
2013
2014 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2015 #define READUYVY_AVX2 \
2016 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
2017 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
2018 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
2019 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
2020 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
2021
2022 #if defined(__x86_64__)
2023 #define YUVTORGB_SETUP_AVX2(yuvconstants) \
2024 "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
2025 "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
2026 "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
2027 "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
2028 "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
2029 "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
2030 "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"
2031
2032 #define YUVTORGB_AVX2(yuvconstants) \
2033 "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
2034 "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
2035 "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
2036 "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
2037 "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
2038 "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
2039 "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
2040 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
2041 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
2042 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
2043 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
2044 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
2045 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
2046 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
2047 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
2048 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
2049
2050 #define YUVTORGB_REGS_AVX2 \
2051 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
2052
2053 #else // Convert 16 pixels: 16 UV and 16 Y.
2054
2055 #define YUVTORGB_SETUP_AVX2(yuvconstants)
2056 #define YUVTORGB_AVX2(yuvconstants) \
2057 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
2058 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
2059 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
2060 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
2061 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
2062 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \
2063 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
2064 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \
2065 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
2066 "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \
2067 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
2068 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
2069 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
2070 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
2071 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
2072 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
2073 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
2074 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
2075 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
2076 #define YUVTORGB_REGS_AVX2
2077 #endif
2078
2079 // Store 16 ARGB values.
2080 #define STOREARGB_AVX2 \
2081 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
2082 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2083 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
2084 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
2085 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
2086 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
2087 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
2088 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \
2089 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
2090
2091 #ifdef HAS_I444TOARGBROW_AVX2
2092 // 16 pixels
2093 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2094 void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
2095 const uint8* u_buf,
2096 const uint8* v_buf,
2097 uint8* dst_argb,
2098 const struct YuvConstants* yuvconstants,
2099 int width) {
2100 asm volatile (
2101 YUVTORGB_SETUP_AVX2(yuvconstants)
2102 "sub %[u_buf],%[v_buf] \n"
2103 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2104
2105 LABELALIGN
2106 "1: \n"
2107 READYUV444_AVX2
2108 YUVTORGB_AVX2(yuvconstants)
2109 STOREARGB_AVX2
2110 "sub $0x10,%[width] \n"
2111 "jg 1b \n"
2112 "vzeroupper \n"
2113 : [y_buf]"+r"(y_buf), // %[y_buf]
2114 [u_buf]"+r"(u_buf), // %[u_buf]
2115 [v_buf]"+r"(v_buf), // %[v_buf]
2116 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2117 [width]"+rm"(width) // %[width]
2118 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2119 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2120 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2121 );
2122 }
2123 #endif // HAS_I444TOARGBROW_AVX2
2124
2125 #if defined(HAS_I422TOARGBROW_AVX2)
2126 // 16 pixels
2127 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2128 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
2129 const uint8* u_buf,
2130 const uint8* v_buf,
2131 uint8* dst_argb,
2132 const struct YuvConstants* yuvconstants,
2133 int width) {
2134 asm volatile (
2135 YUVTORGB_SETUP_AVX2(yuvconstants)
2136 "sub %[u_buf],%[v_buf] \n"
2137 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2138
2139 LABELALIGN
2140 "1: \n"
2141 READYUV422_AVX2
2142 YUVTORGB_AVX2(yuvconstants)
2143 STOREARGB_AVX2
2144 "sub $0x10,%[width] \n"
2145 "jg 1b \n"
2146
2147 "vzeroupper \n"
2148 : [y_buf]"+r"(y_buf), // %[y_buf]
2149 [u_buf]"+r"(u_buf), // %[u_buf]
2150 [v_buf]"+r"(v_buf), // %[v_buf]
2151 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2152 [width]"+rm"(width) // %[width]
2153 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2154 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2155 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2156 );
2157 }
2158 #endif // HAS_I422TOARGBROW_AVX2
2159
2160 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
2161 // 16 pixels
2162 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2163 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
2164 const uint8* u_buf,
2165 const uint8* v_buf,
2166 const uint8* a_buf,
2167 uint8* dst_argb,
2168 const struct YuvConstants* yuvconstants,
2169 int width) {
2170 // clang-format off
2171 asm volatile (
2172 YUVTORGB_SETUP_AVX2(yuvconstants)
2173 "sub %[u_buf],%[v_buf] \n"
2174
2175 LABELALIGN
2176 "1: \n"
2177 READYUVA422_AVX2
2178 YUVTORGB_AVX2(yuvconstants)
2179 STOREARGB_AVX2
2180 "subl $0x10,%[width] \n"
2181 "jg 1b \n"
2182 "vzeroupper \n"
2183 : [y_buf]"+r"(y_buf), // %[y_buf]
2184 [u_buf]"+r"(u_buf), // %[u_buf]
2185 [v_buf]"+r"(v_buf), // %[v_buf]
2186 [a_buf]"+r"(a_buf), // %[a_buf]
2187 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2188 #if defined(__i386__)
2189 [width]"+m"(width) // %[width]
2190 #else
2191 [width]"+rm"(width) // %[width]
2192 #endif
2193 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2194 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2195 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2196 );
2197 // clang-format on
2198 }
2199 #endif // HAS_I422ALPHATOARGBROW_AVX2
2200
2201 #if defined(HAS_I422TORGBAROW_AVX2)
2202 // 16 pixels
2203 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2204 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
2205 const uint8* u_buf,
2206 const uint8* v_buf,
2207 uint8* dst_argb,
2208 const struct YuvConstants* yuvconstants,
2209 int width) {
2210 asm volatile (
2211 YUVTORGB_SETUP_AVX2(yuvconstants)
2212 "sub %[u_buf],%[v_buf] \n"
2213 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2214
2215 LABELALIGN
2216 "1: \n"
2217 READYUV422_AVX2
2218 YUVTORGB_AVX2(yuvconstants)
2219
2220 // Step 3: Weave into RGBA
2221 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
2222 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2223 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
2224 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2225 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
2226 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
2227 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
2228 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2229 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2230 "sub $0x10,%[width] \n"
2231 "jg 1b \n"
2232 "vzeroupper \n"
2233 : [y_buf]"+r"(y_buf), // %[y_buf]
2234 [u_buf]"+r"(u_buf), // %[u_buf]
2235 [v_buf]"+r"(v_buf), // %[v_buf]
2236 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2237 [width]"+rm"(width) // %[width]
2238 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2239 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2240 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2241 );
2242 }
2243 #endif // HAS_I422TORGBAROW_AVX2
2244
2245 #if defined(HAS_NV12TOARGBROW_AVX2)
2246 // 16 pixels.
2247 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2248 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
2249 const uint8* uv_buf,
2250 uint8* dst_argb,
2251 const struct YuvConstants* yuvconstants,
2252 int width) {
2253 // clang-format off
2254 asm volatile (
2255 YUVTORGB_SETUP_AVX2(yuvconstants)
2256 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2257
2258 LABELALIGN
2259 "1: \n"
2260 READNV12_AVX2
2261 YUVTORGB_AVX2(yuvconstants)
2262 STOREARGB_AVX2
2263 "sub $0x10,%[width] \n"
2264 "jg 1b \n"
2265 "vzeroupper \n"
2266 : [y_buf]"+r"(y_buf), // %[y_buf]
2267 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2268 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2269 [width]"+rm"(width) // %[width]
2270 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2271 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2272     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2273 );
2274 // clang-format on
2275 }
2276 #endif // HAS_NV12TOARGBROW_AVX2
2277
2278 #if defined(HAS_NV21TOARGBROW_AVX2)
2279 // 16 pixels.
2280 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2281 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
2282 const uint8* vu_buf,
2283 uint8* dst_argb,
2284 const struct YuvConstants* yuvconstants,
2285 int width) {
2286 // clang-format off
2287 asm volatile (
2288 YUVTORGB_SETUP_AVX2(yuvconstants)
2289 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2290
2291 LABELALIGN
2292 "1: \n"
2293 READNV21_AVX2
2294 YUVTORGB_AVX2(yuvconstants)
2295 STOREARGB_AVX2
2296 "sub $0x10,%[width] \n"
2297 "jg 1b \n"
2298 "vzeroupper \n"
2299 : [y_buf]"+r"(y_buf), // %[y_buf]
2300 [vu_buf]"+r"(vu_buf), // %[vu_buf]
2301 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2302 [width]"+rm"(width) // %[width]
2303 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2304 [kShuffleNV21]"m"(kShuffleNV21)
2305 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2306     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2307 );
2308 // clang-format on
2309 }
2310 #endif // HAS_NV21TOARGBROW_AVX2
2311
2312 #if defined(HAS_YUY2TOARGBROW_AVX2)
2313 // 16 pixels.
2314 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2315 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
2316 uint8* dst_argb,
2317 const struct YuvConstants* yuvconstants,
2318 int width) {
2319 // clang-format off
2320 asm volatile (
2321 YUVTORGB_SETUP_AVX2(yuvconstants)
2322 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2323
2324 LABELALIGN
2325 "1: \n"
2326 READYUY2_AVX2
2327 YUVTORGB_AVX2(yuvconstants)
2328 STOREARGB_AVX2
2329 "sub $0x10,%[width] \n"
2330 "jg 1b \n"
2331 "vzeroupper \n"
2332 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
2333 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2334 [width]"+rm"(width) // %[width]
2335 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2336 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2337 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2338 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2339     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2340 );
2341 // clang-format on
2342 }
2343 #endif // HAS_YUY2TOARGBROW_AVX2
2344
2345 #if defined(HAS_UYVYTOARGBROW_AVX2)
2346 // 16 pixels.
2347 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2348 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
2349 uint8* dst_argb,
2350 const struct YuvConstants* yuvconstants,
2351 int width) {
2352 // clang-format off
2353 asm volatile (
2354 YUVTORGB_SETUP_AVX2(yuvconstants)
2355 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2356
2357 LABELALIGN
2358 "1: \n"
2359 READUYVY_AVX2
2360 YUVTORGB_AVX2(yuvconstants)
2361 STOREARGB_AVX2
2362 "sub $0x10,%[width] \n"
2363 "jg 1b \n"
2364 "vzeroupper \n"
2365 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
2366 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2367 [width]"+rm"(width) // %[width]
2368 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2369 [kShuffleUYVYY]"m"(kShuffleUYVYY),
2370 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
2371 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2372 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2373 );
2374 // clang-format on
2375 }
2376 #endif // HAS_UYVYTOARGBROW_AVX2
2377
2378 #ifdef HAS_I400TOARGBROW_SSE2
2379 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
2380 asm volatile (
2381 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
2382 "movd %%eax,%%xmm2 \n"
2383 "pshufd $0x0,%%xmm2,%%xmm2 \n"
2384 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
2385 "movd %%eax,%%xmm3 \n"
2386 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2387 "pcmpeqb %%xmm4,%%xmm4 \n"
2388 "pslld $0x18,%%xmm4 \n"
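  // xmm4 = 0xff000000 in each dword: the alpha channel OR'd into the result.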
2389
2390 LABELALIGN
2391 "1: \n"
2392 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2393 "movq " MEMACCESS(0) ",%%xmm0 \n"
2394 "lea " MEMLEA(0x8,0) ",%0 \n"
2395 "punpcklbw %%xmm0,%%xmm0 \n"
2396 "pmulhuw %%xmm2,%%xmm0 \n"
2397 "psubusw %%xmm3,%%xmm0 \n"
2398 "psrlw $6, %%xmm0 \n"
2399 "packuswb %%xmm0,%%xmm0 \n"
2400
2401 // Step 2: Weave into ARGB
2402 "punpcklbw %%xmm0,%%xmm0 \n"
2403 "movdqa %%xmm0,%%xmm1 \n"
2404 "punpcklwd %%xmm0,%%xmm0 \n"
2405 "punpckhwd %%xmm1,%%xmm1 \n"
2406 "por %%xmm4,%%xmm0 \n"
2407 "por %%xmm4,%%xmm1 \n"
2408 "movdqu %%xmm0," MEMACCESS(1) " \n"
2409 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
2410 "lea " MEMLEA(0x20,1) ",%1 \n"
2411
2412 "sub $0x8,%2 \n"
2413 "jg 1b \n"
2414 : "+r"(y_buf), // %0
2415 "+r"(dst_argb), // %1
2416 "+rm"(width) // %2
2417 :
2418 : "memory", "cc", "eax"
2419 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2420 );
2421 }
2422 #endif // HAS_I400TOARGBROW_SSE2
2423
2424 #ifdef HAS_I400TOARGBROW_AVX2
2425 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2426 // note: vpunpcklbw mutates and vpackuswb unmutates.
2427 void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
2428 asm volatile (
2429   "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
2430 "vmovd %%eax,%%xmm2 \n"
2431 "vbroadcastss %%xmm2,%%ymm2 \n"
2432   "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
2433 "vmovd %%eax,%%xmm3 \n"
2434 "vbroadcastss %%xmm3,%%ymm3 \n"
2435 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
2436 "vpslld $0x18,%%ymm4,%%ymm4 \n"
2437
2438 LABELALIGN
2439 "1: \n"
2440 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
2441 "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
2442 "lea " MEMLEA(0x10,0) ",%0 \n"
2443 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2444 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
2445 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
2446 "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
2447 "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
2448 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
2449 "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
2450 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2451 "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
2452 "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
2453 "vpor %%ymm4,%%ymm0,%%ymm0 \n"
2454 "vpor %%ymm4,%%ymm1,%%ymm1 \n"
2455 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2456 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
2457 "lea " MEMLEA(0x40,1) ",%1 \n"
2458 "sub $0x10,%2 \n"
2459 "jg 1b \n"
2460 "vzeroupper \n"
2461 : "+r"(y_buf), // %0
2462 "+r"(dst_argb), // %1
2463 "+rm"(width) // %2
2464 :
2465 : "memory", "cc", "eax"
2466 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2467 );
2468 }
2469 #endif // HAS_I400TOARGBROW_AVX2
2470
2471 #ifdef HAS_MIRRORROW_SSSE3
2472 // Shuffle table for reversing the bytes.
2473 static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
2474 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
2475
2476 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2477 intptr_t temp_width = (intptr_t)(width);
2478 asm volatile (
2479 "movdqa %3,%%xmm5 \n"
2480
2481 LABELALIGN
2482 "1: \n"
2483 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
2484 "pshufb %%xmm5,%%xmm0 \n"
2485 "movdqu %%xmm0," MEMACCESS(1) " \n"
2486 "lea " MEMLEA(0x10,1) ",%1 \n"
2487 "sub $0x10,%2 \n"
2488 "jg 1b \n"
2489 : "+r"(src), // %0
2490 "+r"(dst), // %1
2491 "+r"(temp_width) // %2
2492 : "m"(kShuffleMirror) // %3
2493 : "memory", "cc", NACL_R14
2494 "xmm0", "xmm5"
2495 );
2496 }
2497 #endif // HAS_MIRRORROW_SSSE3
2498
2499 #ifdef HAS_MIRRORROW_AVX2
2500 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2501 intptr_t temp_width = (intptr_t)(width);
2502 asm volatile (
2503 "vbroadcastf128 %3,%%ymm5 \n"
2504
2505 LABELALIGN
2506 "1: \n"
2507 MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
2508 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
2509 "vpermq $0x4e,%%ymm0,%%ymm0 \n"
2510 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2511 "lea " MEMLEA(0x20,1) ",%1 \n"
2512 "sub $0x20,%2 \n"
2513 "jg 1b \n"
2514 "vzeroupper \n"
2515 : "+r"(src), // %0
2516 "+r"(dst), // %1
2517 "+r"(temp_width) // %2
2518 : "m"(kShuffleMirror) // %3
2519 : "memory", "cc", NACL_R14
2520 "xmm0", "xmm5"
2521 );
2522 }
2523 #endif // HAS_MIRRORROW_AVX2
2524
2525 #ifdef HAS_MIRRORUVROW_SSSE3
2526 // Shuffle table for reversing the bytes of UV channels.
2527 static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
2528 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
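// The shuffle both reverses the 8 UV pairs and deinterleaves them: mirrored
// U bytes land in the low qword, mirrored V bytes in the high qword, so
// movlpd/movhpd can write the two planes directly.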
2529 void MirrorUVRow_SSSE3(const uint8* src,
2530 uint8* dst_u,
2531 uint8* dst_v,
2532 int width) {
2533 intptr_t temp_width = (intptr_t)(width);
2534 asm volatile (
2535 "movdqa %4,%%xmm1 \n"
2536 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
2537 "sub %1,%2 \n"
2538
2539 LABELALIGN
2540 "1: \n"
2541 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2542 "lea " MEMLEA(-0x10,0) ",%0 \n"
2543 "pshufb %%xmm1,%%xmm0 \n"
2544 "movlpd %%xmm0," MEMACCESS(1) " \n"
2545 MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
2546 "lea " MEMLEA(0x8,1) ",%1 \n"
2547 "sub $8,%3 \n"
2548 "jg 1b \n"
2549 : "+r"(src), // %0
2550 "+r"(dst_u), // %1
2551 "+r"(dst_v), // %2
2552 "+r"(temp_width) // %3
2553 : "m"(kShuffleMirrorUV) // %4
2554 : "memory", "cc", NACL_R14
2555 "xmm0", "xmm1"
2556 );
2557 }
2558 #endif // HAS_MIRRORUVROW_SSSE3
2559
2560 #ifdef HAS_ARGBMIRRORROW_SSE2
2561
2562 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2563 intptr_t temp_width = (intptr_t)(width);
2564 asm volatile (
2565 "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
2566
2567 LABELALIGN
2568 "1: \n"
2569 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2570 "pshufd $0x1b,%%xmm0,%%xmm0 \n"
2571 "lea " MEMLEA(-0x10,0) ",%0 \n"
2572 "movdqu %%xmm0," MEMACCESS(1) " \n"
2573 "lea " MEMLEA(0x10,1) ",%1 \n"
2574 "sub $0x4,%2 \n"
2575 "jg 1b \n"
2576 : "+r"(src), // %0
2577 "+r"(dst), // %1
2578 "+r"(temp_width) // %2
2579 :
2580 : "memory", "cc"
2581 , "xmm0"
2582 );
2583 }
2584 #endif // HAS_ARGBMIRRORROW_SSE2
2585
2586 #ifdef HAS_ARGBMIRRORROW_AVX2
2587 // Shuffle table for reversing the bytes.
2588 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
2589 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2590 intptr_t temp_width = (intptr_t)(width);
2591 asm volatile (
2592 "vmovdqu %3,%%ymm5 \n"
2593
2594 LABELALIGN
2595 "1: \n"
2596 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
2597 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2598 "lea " MEMLEA(0x20,1) ",%1 \n"
2599 "sub $0x8,%2 \n"
2600 "jg 1b \n"
2601 "vzeroupper \n"
2602 : "+r"(src), // %0
2603 "+r"(dst), // %1
2604 "+r"(temp_width) // %2
2605 : "m"(kARGBShuffleMirror_AVX2) // %3
2606 : "memory", "cc", NACL_R14
2607 "xmm0", "xmm5"
2608 );
2609 }
2610 #endif // HAS_ARGBMIRRORROW_AVX2
2611
2612 #ifdef HAS_SPLITUVROW_AVX2
2613 void SplitUVRow_AVX2(const uint8* src_uv,
2614 uint8* dst_u,
2615 uint8* dst_v,
2616 int width) {
2617 asm volatile (
2618 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2619 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
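  // ymm5 = 0x00ff in each word: vpand below keeps the even (U) bytes while
  // vpsrlw extracts the odd (V) bytes.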
2620 "sub %1,%2 \n"
2621
2622 LABELALIGN
2623 "1: \n"
2624 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2625 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2626 "lea " MEMLEA(0x40,0) ",%0 \n"
2627 "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
2628 "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
2629 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
2630 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
2631 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2632 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
2633 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2634 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2635 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2636 MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
2637 "lea " MEMLEA(0x20,1) ",%1 \n"
2638 "sub $0x20,%3 \n"
2639 "jg 1b \n"
2640 "vzeroupper \n"
2641 : "+r"(src_uv), // %0
2642 "+r"(dst_u), // %1
2643 "+r"(dst_v), // %2
2644 "+r"(width) // %3
2645 :
2646 : "memory", "cc", NACL_R14
2647 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2648 );
2649 }
2650 #endif // HAS_SPLITUVROW_AVX2
2651
2652 #ifdef HAS_SPLITUVROW_SSE2
2653 void SplitUVRow_SSE2(const uint8* src_uv,
2654 uint8* dst_u,
2655 uint8* dst_v,
2656 int width) {
2657 asm volatile (
2658 "pcmpeqb %%xmm5,%%xmm5 \n"
2659 "psrlw $0x8,%%xmm5 \n"
2660 "sub %1,%2 \n"
2661
2662 LABELALIGN
2663 "1: \n"
2664 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2665 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2666 "lea " MEMLEA(0x20,0) ",%0 \n"
2667 "movdqa %%xmm0,%%xmm2 \n"
2668 "movdqa %%xmm1,%%xmm3 \n"
2669 "pand %%xmm5,%%xmm0 \n"
2670 "pand %%xmm5,%%xmm1 \n"
2671 "packuswb %%xmm1,%%xmm0 \n"
2672 "psrlw $0x8,%%xmm2 \n"
2673 "psrlw $0x8,%%xmm3 \n"
2674 "packuswb %%xmm3,%%xmm2 \n"
2675 "movdqu %%xmm0," MEMACCESS(1) " \n"
2676 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
2677 "lea " MEMLEA(0x10,1) ",%1 \n"
2678 "sub $0x10,%3 \n"
2679 "jg 1b \n"
2680 : "+r"(src_uv), // %0
2681 "+r"(dst_u), // %1
2682 "+r"(dst_v), // %2
2683 "+r"(width) // %3
2684 :
2685 : "memory", "cc", NACL_R14
2686 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2687 );
2688 }
2689 #endif // HAS_SPLITUVROW_SSE2
2690
2691 #ifdef HAS_MERGEUVROW_AVX2
2692 void MergeUVRow_AVX2(const uint8* src_u,
2693 const uint8* src_v,
2694 uint8* dst_uv,
2695 int width) {
2696 asm volatile (
2697 "sub %0,%1 \n"
2698
2699 LABELALIGN
2700 "1: \n"
2701 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2702 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
2703 "lea " MEMLEA(0x20,0) ",%0 \n"
2704 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
2705 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
2706 "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
2707 "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
2708 "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
2709 "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
2710 "lea " MEMLEA(0x40,2) ",%2 \n"
2711 "sub $0x20,%3 \n"
2712 "jg 1b \n"
2713 "vzeroupper \n"
2714 : "+r"(src_u), // %0
2715 "+r"(src_v), // %1
2716 "+r"(dst_uv), // %2
2717 "+r"(width) // %3
2718 :
2719 : "memory", "cc", NACL_R14
2720 "xmm0", "xmm1", "xmm2"
2721 );
2722 }
2723 #endif // HAS_MERGEUVROW_AVX2
2724
2725 #ifdef HAS_MERGEUVROW_SSE2
2726 void MergeUVRow_SSE2(const uint8* src_u,
2727 const uint8* src_v,
2728 uint8* dst_uv,
2729 int width) {
2730 asm volatile (
2731 "sub %0,%1 \n"
2732
2733 LABELALIGN
2734 "1: \n"
2735 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2736 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
2737 "lea " MEMLEA(0x10,0) ",%0 \n"
2738 "movdqa %%xmm0,%%xmm2 \n"
2739 "punpcklbw %%xmm1,%%xmm0 \n"
2740 "punpckhbw %%xmm1,%%xmm2 \n"
2741 "movdqu %%xmm0," MEMACCESS(2) " \n"
2742 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
2743 "lea " MEMLEA(0x20,2) ",%2 \n"
2744 "sub $0x10,%3 \n"
2745 "jg 1b \n"
2746 : "+r"(src_u), // %0
2747 "+r"(src_v), // %1
2748 "+r"(dst_uv), // %2
2749 "+r"(width) // %3
2750 :
2751 : "memory", "cc", NACL_R14
2752 "xmm0", "xmm1", "xmm2"
2753 );
2754 }
2755 #endif // HAS_MERGEUVROW_SSE2
2756
2757 #ifdef HAS_COPYROW_SSE2
2758 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2759 asm volatile (
2760 "test $0xf,%0 \n"
2761 "jne 2f \n"
2762 "test $0xf,%1 \n"
2763 "jne 2f \n"
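  // If src or dst is not 16-byte aligned, use the movdqu loop at label 2;
  // otherwise use the aligned movdqa loop at label 1.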
2764
2765 LABELALIGN
2766 "1: \n"
2767 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
2768 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2769 "lea " MEMLEA(0x20,0) ",%0 \n"
2770 "movdqa %%xmm0," MEMACCESS(1) " \n"
2771 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
2772 "lea " MEMLEA(0x20,1) ",%1 \n"
2773 "sub $0x20,%2 \n"
2774 "jg 1b \n"
2775 "jmp 9f \n"
2776
2777 LABELALIGN
2778 "2: \n"
2779 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2780 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2781 "lea " MEMLEA(0x20,0) ",%0 \n"
2782 "movdqu %%xmm0," MEMACCESS(1) " \n"
2783 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
2784 "lea " MEMLEA(0x20,1) ",%1 \n"
2785 "sub $0x20,%2 \n"
2786 "jg 2b \n"
2787 "9: \n"
2788 : "+r"(src), // %0
2789 "+r"(dst), // %1
2790 "+r"(count) // %2
2791 :
2792 : "memory", "cc"
2793 , "xmm0", "xmm1"
2794 );
2795 }
2796 #endif // HAS_COPYROW_SSE2
2797
2798 #ifdef HAS_COPYROW_AVX
2799 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
2800 asm volatile (
2801 LABELALIGN
2802 "1: \n"
2803 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2804 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2805 "lea " MEMLEA(0x40,0) ",%0 \n"
2806 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2807 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
2808 "lea " MEMLEA(0x40,1) ",%1 \n"
2809 "sub $0x40,%2 \n"
2810 "jg 1b \n"
2811 : "+r"(src), // %0
2812 "+r"(dst), // %1
2813 "+r"(count) // %2
2814 :
2815 : "memory", "cc"
2816 , "xmm0", "xmm1"
2817 );
2818 }
2819 #endif // HAS_COPYROW_AVX
2820
2821 #ifdef HAS_COPYROW_ERMS
2822 // Copies any width; rep movsb moves one byte at a time, so there is no
     // multiple-of-16 requirement.
2823 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
2824 size_t width_tmp = (size_t)(width);
2825 asm volatile("rep movsb " MEMMOVESTRING(0, 1) " \n"
2826 : "+S"(src), // %0
2827 "+D"(dst), // %1
2828 "+c"(width_tmp) // %2
2829 :
2830 : "memory", "cc");
2831 }
2832 #endif // HAS_COPYROW_ERMS
2833
2834 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
2835 // width in pixels
2836 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2837 asm volatile (
2838 "pcmpeqb %%xmm0,%%xmm0 \n"
2839 "pslld $0x18,%%xmm0 \n"
2840 "pcmpeqb %%xmm1,%%xmm1 \n"
2841 "psrld $0x8,%%xmm1 \n"
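  // xmm0 = 0xff000000 (keeps source alpha), xmm1 = 0x00ffffff (keeps dest RGB).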
2842
2843 LABELALIGN
2844 "1: \n"
2845 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
2846 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
2847 "lea " MEMLEA(0x20,0) ",%0 \n"
2848 "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2849 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
2850 "pand %%xmm0,%%xmm2 \n"
2851 "pand %%xmm0,%%xmm3 \n"
2852 "pand %%xmm1,%%xmm4 \n"
2853 "pand %%xmm1,%%xmm5 \n"
2854 "por %%xmm4,%%xmm2 \n"
2855 "por %%xmm5,%%xmm3 \n"
2856 "movdqu %%xmm2," MEMACCESS(1) " \n"
2857 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2858 "lea " MEMLEA(0x20,1) ",%1 \n"
2859 "sub $0x8,%2 \n"
2860 "jg 1b \n"
2861 : "+r"(src), // %0
2862 "+r"(dst), // %1
2863 "+r"(width) // %2
2864 :
2865 : "memory", "cc"
2866 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2867 );
2868 }
2869 #endif // HAS_ARGBCOPYALPHAROW_SSE2
2870
2871 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
2872 // width in pixels
2873 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2874 asm volatile (
2875 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
2876 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
2877
2878 LABELALIGN
2879 "1: \n"
2880 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
2881 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
2882 "lea " MEMLEA(0x40,0) ",%0 \n"
2883 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
2884 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
2885 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
2886 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
2887 "lea " MEMLEA(0x40,1) ",%1 \n"
2888 "sub $0x10,%2 \n"
2889 "jg 1b \n"
2890 "vzeroupper \n"
2891 : "+r"(src), // %0
2892 "+r"(dst), // %1
2893 "+r"(width) // %2
2894 :
2895 : "memory", "cc"
2896 , "xmm0", "xmm1", "xmm2"
2897 );
2898 }
2899 #endif // HAS_ARGBCOPYALPHAROW_AVX2
2900
2901 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
2902 // width in pixels
2903 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
2904 asm volatile (
2905 LABELALIGN
2906 "1: \n"
2907 "movdqu " MEMACCESS(0) ", %%xmm0 \n"
2908 "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
2909 "lea " MEMLEA(0x20, 0) ", %0 \n"
2910 "psrld $0x18, %%xmm0 \n"
2911 "psrld $0x18, %%xmm1 \n"
2912 "packssdw %%xmm1, %%xmm0 \n"
2913 "packuswb %%xmm0, %%xmm0 \n"
2914 "movq %%xmm0," MEMACCESS(1) " \n"
2915 "lea " MEMLEA(0x8, 1) ", %1 \n"
2916 "sub $0x8, %2 \n"
2917 "jg 1b \n"
2918 : "+r"(src_argb), // %0
2919 "+r"(dst_a), // %1
2920 "+rm"(width) // %2
2921 :
2922 : "memory", "cc"
2923 , "xmm0", "xmm1"
2924 );
2925 }
2926 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
2927
2928 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
2929 static const uvec8 kShuffleAlphaShort_AVX2 = {
2930 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
2931 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
2932
2933 void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
2934 asm volatile (
2935 "vmovdqa %3,%%ymm4 \n"
2936 "vbroadcastf128 %4,%%ymm5 \n"
2937
2938 LABELALIGN
2939 "1: \n"
2940 "vmovdqu " MEMACCESS(0) ", %%ymm0 \n"
2941 "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
2942 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
2943 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
2944 "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
2945 "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
2946 "lea " MEMLEA(0x80, 0) ", %0 \n"
2947 "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
2948 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
2949 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
2950 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
2951 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
2952 "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
2953 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2954 "lea " MEMLEA(0x20,1) ",%1 \n"
2955 "sub $0x20, %2 \n"
2956 "jg 1b \n"
2957 "vzeroupper \n"
2958 : "+r"(src_argb), // %0
2959 "+r"(dst_a), // %1
2960 "+rm"(width) // %2
2961 : "m"(kPermdARGBToY_AVX), // %3
2962 "m"(kShuffleAlphaShort_AVX2) // %4
2963 : "memory", "cc"
2964 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2965 );
2966 }
2967 #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
2968
2969 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
2970 // width in pixels
2971 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2972 asm volatile (
2973 "pcmpeqb %%xmm0,%%xmm0 \n"
2974 "pslld $0x18,%%xmm0 \n"
2975 "pcmpeqb %%xmm1,%%xmm1 \n"
2976 "psrld $0x8,%%xmm1 \n"
2977
2978 LABELALIGN
2979 "1: \n"
2980 "movq " MEMACCESS(0) ",%%xmm2 \n"
2981 "lea " MEMLEA(0x8,0) ",%0 \n"
2982 "punpcklbw %%xmm2,%%xmm2 \n"
2983 "punpckhwd %%xmm2,%%xmm3 \n"
2984 "punpcklwd %%xmm2,%%xmm2 \n"
2985 "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2986 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
2987 "pand %%xmm0,%%xmm2 \n"
2988 "pand %%xmm0,%%xmm3 \n"
2989 "pand %%xmm1,%%xmm4 \n"
2990 "pand %%xmm1,%%xmm5 \n"
2991 "por %%xmm4,%%xmm2 \n"
2992 "por %%xmm5,%%xmm3 \n"
2993 "movdqu %%xmm2," MEMACCESS(1) " \n"
2994 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2995 "lea " MEMLEA(0x20,1) ",%1 \n"
2996 "sub $0x8,%2 \n"
2997 "jg 1b \n"
2998 : "+r"(src), // %0
2999 "+r"(dst), // %1
3000 "+r"(width) // %2
3001 :
3002 : "memory", "cc"
3003 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3004 );
3005 }
3006 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3007
3008 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3009 // width in pixels
3010 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3011 asm volatile (
3012 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
3013 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
3014
3015 LABELALIGN
3016 "1: \n"
3017 "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
3018 "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
3019 "lea " MEMLEA(0x10,0) ",%0 \n"
3020 "vpslld $0x18,%%ymm1,%%ymm1 \n"
3021 "vpslld $0x18,%%ymm2,%%ymm2 \n"
3022 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
3023 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
3024 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
3025 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
3026 "lea " MEMLEA(0x40,1) ",%1 \n"
3027 "sub $0x10,%2 \n"
3028 "jg 1b \n"
3029 "vzeroupper \n"
3030 : "+r"(src), // %0
3031 "+r"(dst), // %1
3032 "+r"(width) // %2
3033 :
3034 : "memory", "cc"
3035 , "xmm0", "xmm1", "xmm2"
3036 );
3037 }
3038 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3039
3040 #ifdef HAS_SETROW_X86
3041 void SetRow_X86(uint8* dst, uint8 v8, int width) {
3042 size_t width_tmp = (size_t)(width >> 2);
3043 const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
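  // rep stosl stores 4 bytes per count, so only (width & ~3) bytes are
  // written here; any width % 4 tail bytes are left untouched.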
3044 asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n"
3045 : "+D"(dst), // %0
3046 "+c"(width_tmp) // %1
3047 : "a"(v32) // %2
3048 : "memory", "cc");
3049 }
3050
3051 void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
3052 size_t width_tmp = (size_t)(width);
3053 asm volatile("rep stosb " MEMSTORESTRING(al, 0) " \n"
3054 : "+D"(dst), // %0
3055 "+c"(width_tmp) // %1
3056 : "a"(v8) // %2
3057 : "memory", "cc");
3058 }
3059
3060 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
3061 size_t width_tmp = (size_t)(width);
3062 asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n"
3063 : "+D"(dst_argb), // %0
3064 "+c"(width_tmp) // %1
3065 : "a"(v32) // %2
3066 : "memory", "cc");
3067 }
3068 #endif // HAS_SETROW_X86
3069
3070 #ifdef HAS_YUY2TOYROW_SSE2
3071 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
3072 asm volatile (
3073 "pcmpeqb %%xmm5,%%xmm5 \n"
3074 "psrlw $0x8,%%xmm5 \n"
3075
3076 LABELALIGN
3077 "1: \n"
3078 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3079 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3080 "lea " MEMLEA(0x20,0) ",%0 \n"
3081 "pand %%xmm5,%%xmm0 \n"
3082 "pand %%xmm5,%%xmm1 \n"
3083 "packuswb %%xmm1,%%xmm0 \n"
3084 "movdqu %%xmm0," MEMACCESS(1) " \n"
3085 "lea " MEMLEA(0x10,1) ",%1 \n"
3086 "sub $0x10,%2 \n"
3087 "jg 1b \n"
3088 : "+r"(src_yuy2), // %0
3089 "+r"(dst_y), // %1
3090 "+r"(width) // %2
3091 :
3092 : "memory", "cc"
3093 , "xmm0", "xmm1", "xmm5"
3094 );
3095 }
3096
3097 void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
3098 int stride_yuy2,
3099 uint8* dst_u,
3100 uint8* dst_v,
3101 int width) {
3102 asm volatile (
3103 "pcmpeqb %%xmm5,%%xmm5 \n"
3104 "psrlw $0x8,%%xmm5 \n"
3105 "sub %1,%2 \n"
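  // dst_v is addressed relative to dst_u, and chroma from this row is
  // averaged (pavgb) with the row below via stride %4 for 4:2:0 output.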
3106
3107 LABELALIGN
3108 "1: \n"
3109 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3110 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3111 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
3112 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
3113 "lea " MEMLEA(0x20,0) ",%0 \n"
3114 "pavgb %%xmm2,%%xmm0 \n"
3115 "pavgb %%xmm3,%%xmm1 \n"
3116 "psrlw $0x8,%%xmm0 \n"
3117 "psrlw $0x8,%%xmm1 \n"
3118 "packuswb %%xmm1,%%xmm0 \n"
3119 "movdqa %%xmm0,%%xmm1 \n"
3120 "pand %%xmm5,%%xmm0 \n"
3121 "packuswb %%xmm0,%%xmm0 \n"
3122 "psrlw $0x8,%%xmm1 \n"
3123 "packuswb %%xmm1,%%xmm1 \n"
3124 "movq %%xmm0," MEMACCESS(1) " \n"
3125 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3126 "lea " MEMLEA(0x8,1) ",%1 \n"
3127 "sub $0x10,%3 \n"
3128 "jg 1b \n"
3129 : "+r"(src_yuy2), // %0
3130 "+r"(dst_u), // %1
3131 "+r"(dst_v), // %2
3132 "+r"(width) // %3
3133 : "r"((intptr_t)(stride_yuy2)) // %4
3134 : "memory", "cc", NACL_R14
3135 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3136 );
3137 }
3138
3139 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3140 uint8* dst_u,
3141 uint8* dst_v,
3142 int width) {
3143 asm volatile (
3144 "pcmpeqb %%xmm5,%%xmm5 \n"
3145 "psrlw $0x8,%%xmm5 \n"
3146 "sub %1,%2 \n"
3147
3148 LABELALIGN
3149 "1: \n"
3150 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3151 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3152 "lea " MEMLEA(0x20,0) ",%0 \n"
3153 "psrlw $0x8,%%xmm0 \n"
3154 "psrlw $0x8,%%xmm1 \n"
3155 "packuswb %%xmm1,%%xmm0 \n"
3156 "movdqa %%xmm0,%%xmm1 \n"
3157 "pand %%xmm5,%%xmm0 \n"
3158 "packuswb %%xmm0,%%xmm0 \n"
3159 "psrlw $0x8,%%xmm1 \n"
3160 "packuswb %%xmm1,%%xmm1 \n"
3161 "movq %%xmm0," MEMACCESS(1) " \n"
3162 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3163 "lea " MEMLEA(0x8,1) ",%1 \n"
3164 "sub $0x10,%3 \n"
3165 "jg 1b \n"
3166 : "+r"(src_yuy2), // %0
3167 "+r"(dst_u), // %1
3168 "+r"(dst_v), // %2
3169 "+r"(width) // %3
3170 :
3171 : "memory", "cc", NACL_R14
3172 "xmm0", "xmm1", "xmm5"
3173 );
3174 }
3175
3176 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
3177 asm volatile (
3178 LABELALIGN
3179 "1: \n"
3180 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3181 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3182 "lea " MEMLEA(0x20,0) ",%0 \n"
3183 "psrlw $0x8,%%xmm0 \n"
3184 "psrlw $0x8,%%xmm1 \n"
3185 "packuswb %%xmm1,%%xmm0 \n"
3186 "movdqu %%xmm0," MEMACCESS(1) " \n"
3187 "lea " MEMLEA(0x10,1) ",%1 \n"
3188 "sub $0x10,%2 \n"
3189 "jg 1b \n"
3190 : "+r"(src_uyvy), // %0
3191 "+r"(dst_y), // %1
3192 "+r"(width) // %2
3193 :
3194 : "memory", "cc"
3195 , "xmm0", "xmm1"
3196 );
3197 }
3198
3199 void UYVYToUVRow_SSE2(const uint8* src_uyvy,
3200 int stride_uyvy,
3201 uint8* dst_u,
3202 uint8* dst_v,
3203 int width) {
3204 asm volatile (
3205 "pcmpeqb %%xmm5,%%xmm5 \n"
3206 "psrlw $0x8,%%xmm5 \n"
3207 "sub %1,%2 \n"
3208
3209 LABELALIGN
3210 "1: \n"
3211 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3212 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3213 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
3214 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
3215 "lea " MEMLEA(0x20,0) ",%0 \n"
3216 "pavgb %%xmm2,%%xmm0 \n"
3217 "pavgb %%xmm3,%%xmm1 \n"
3218 "pand %%xmm5,%%xmm0 \n"
3219 "pand %%xmm5,%%xmm1 \n"
3220 "packuswb %%xmm1,%%xmm0 \n"
3221 "movdqa %%xmm0,%%xmm1 \n"
3222 "pand %%xmm5,%%xmm0 \n"
3223 "packuswb %%xmm0,%%xmm0 \n"
3224 "psrlw $0x8,%%xmm1 \n"
3225 "packuswb %%xmm1,%%xmm1 \n"
3226 "movq %%xmm0," MEMACCESS(1) " \n"
3227 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3228 "lea " MEMLEA(0x8,1) ",%1 \n"
3229 "sub $0x10,%3 \n"
3230 "jg 1b \n"
3231 : "+r"(src_uyvy), // %0
3232 "+r"(dst_u), // %1
3233 "+r"(dst_v), // %2
3234 "+r"(width) // %3
3235 : "r"((intptr_t)(stride_uyvy)) // %4
3236 : "memory", "cc", NACL_R14
3237 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3238 );
3239 }
3240
3241 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3242 uint8* dst_u,
3243 uint8* dst_v,
3244 int width) {
3245 asm volatile (
3246 "pcmpeqb %%xmm5,%%xmm5 \n"
3247 "psrlw $0x8,%%xmm5 \n"
3248 "sub %1,%2 \n"
3249
3250 LABELALIGN
3251 "1: \n"
3252 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3253 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3254 "lea " MEMLEA(0x20,0) ",%0 \n"
3255 "pand %%xmm5,%%xmm0 \n"
3256 "pand %%xmm5,%%xmm1 \n"
3257 "packuswb %%xmm1,%%xmm0 \n"
3258 "movdqa %%xmm0,%%xmm1 \n"
3259 "pand %%xmm5,%%xmm0 \n"
3260 "packuswb %%xmm0,%%xmm0 \n"
3261 "psrlw $0x8,%%xmm1 \n"
3262 "packuswb %%xmm1,%%xmm1 \n"
3263 "movq %%xmm0," MEMACCESS(1) " \n"
3264 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3265 "lea " MEMLEA(0x8,1) ",%1 \n"
3266 "sub $0x10,%3 \n"
3267 "jg 1b \n"
3268 : "+r"(src_uyvy), // %0
3269 "+r"(dst_u), // %1
3270 "+r"(dst_v), // %2
3271 "+r"(width) // %3
3272 :
3273 : "memory", "cc", NACL_R14
3274 "xmm0", "xmm1", "xmm5"
3275 );
3276 }
3277 #endif // HAS_YUY2TOYROW_SSE2
3278
3279 #ifdef HAS_YUY2TOYROW_AVX2
3280 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
3281 asm volatile (
3282 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3283 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3284
3285 LABELALIGN
3286 "1: \n"
3287 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3288 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3289 "lea " MEMLEA(0x40,0) ",%0 \n"
3290 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3291 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3292 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3293 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3294 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
3295 "lea " MEMLEA(0x20,1) ",%1 \n"
3296 "sub $0x20,%2 \n"
3297 "jg 1b \n"
3298 "vzeroupper \n"
3299 : "+r"(src_yuy2), // %0
3300 "+r"(dst_y), // %1
3301 "+r"(width) // %2
3302 :
3303 : "memory", "cc"
3304 , "xmm0", "xmm1", "xmm5"
3305 );
3306 }
3307
3308 void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
3309 int stride_yuy2,
3310 uint8* dst_u,
3311 uint8* dst_v,
3312 int width) {
3313 asm volatile (
3314 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3315 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3316 "sub %1,%2 \n"
3317
3318 LABELALIGN
3319 "1: \n"
3320 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3321 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3322 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3323 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3324 "lea " MEMLEA(0x40,0) ",%0 \n"
3325 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3326 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3327 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3328 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3329 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3330 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3331 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3332 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3333 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3334 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3335 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3336 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3337 "lea " MEMLEA(0x10,1) ",%1 \n"
3338 "sub $0x20,%3 \n"
3339 "jg 1b \n"
3340 "vzeroupper \n"
3341 : "+r"(src_yuy2), // %0
3342 "+r"(dst_u), // %1
3343 "+r"(dst_v), // %2
3344 "+r"(width) // %3
3345 : "r"((intptr_t)(stride_yuy2)) // %4
3346 : "memory", "cc", NACL_R14
3347 "xmm0", "xmm1", "xmm5"
3348 );
3349 }
3350
3351 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3352 uint8* dst_u,
3353 uint8* dst_v,
3354 int width) {
3355 asm volatile (
3356 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3357 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3358 "sub %1,%2 \n"
3359
3360 LABELALIGN
3361 "1: \n"
3362 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3363 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3364 "lea " MEMLEA(0x40,0) ",%0 \n"
3365 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3366 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3367 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3368 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3369 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3370 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3371 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3372 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3373 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3374 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3375 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3376 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3377 "lea " MEMLEA(0x10,1) ",%1 \n"
3378 "sub $0x20,%3 \n"
3379 "jg 1b \n"
3380 "vzeroupper \n"
3381 : "+r"(src_yuy2), // %0
3382 "+r"(dst_u), // %1
3383 "+r"(dst_v), // %2
3384 "+r"(width) // %3
3385 :
3386 : "memory", "cc", NACL_R14
3387 "xmm0", "xmm1", "xmm5"
3388 );
3389 }
3390
3391 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
3392 asm volatile (
3393 LABELALIGN
3394 "1: \n"
3395 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3396 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3397 "lea " MEMLEA(0x40,0) ",%0 \n"
3398 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3399 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3400 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3401 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3402 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
3403 "lea " MEMLEA(0x20,1) ",%1 \n"
3404 "sub $0x20,%2 \n"
3405 "jg 1b \n"
3406 "vzeroupper \n"
3407 : "+r"(src_uyvy), // %0
3408 "+r"(dst_y), // %1
3409 "+r"(width) // %2
3410 :
3411 : "memory", "cc"
3412 , "xmm0", "xmm1", "xmm5"
3413 );
3414 }
3415 void UYVYToUVRow_AVX2(const uint8* src_uyvy,
3416 int stride_uyvy,
3417 uint8* dst_u,
3418 uint8* dst_v,
3419 int width) {
3420 asm volatile (
3421 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3422 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3423 "sub %1,%2 \n"
3424
3425 LABELALIGN
3426 "1: \n"
3427 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3428 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3429 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3430 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3431 "lea " MEMLEA(0x40,0) ",%0 \n"
3432 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3433 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3434 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3435 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3436 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3437 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3438 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3439 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3440 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3441 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3442 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3443 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3444 "lea " MEMLEA(0x10,1) ",%1 \n"
3445 "sub $0x20,%3 \n"
3446 "jg 1b \n"
3447 "vzeroupper \n"
3448 : "+r"(src_uyvy), // %0
3449 "+r"(dst_u), // %1
3450 "+r"(dst_v), // %2
3451 "+r"(width) // %3
3452 : "r"((intptr_t)(stride_uyvy)) // %4
3453 : "memory", "cc", NACL_R14
3454 "xmm0", "xmm1", "xmm5"
3455 );
3456 }
3457
UYVYToUV422Row_AVX2(const uint8 * src_uyvy,uint8 * dst_u,uint8 * dst_v,int width)3458 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3459 uint8* dst_u,
3460 uint8* dst_v,
3461 int width) {
3462 asm volatile (
3463 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3464 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3465 "sub %1,%2 \n"
3466
3467 LABELALIGN
3468 "1: \n"
3469 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3470 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3471 "lea " MEMLEA(0x40,0) ",%0 \n"
3472 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3473 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3474 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3475 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3476 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3477 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3478 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3479 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3480 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3481 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3482 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3483 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3484 "lea " MEMLEA(0x10,1) ",%1 \n"
3485 "sub $0x20,%3 \n"
3486 "jg 1b \n"
3487 "vzeroupper \n"
3488 : "+r"(src_uyvy), // %0
3489 "+r"(dst_u), // %1
3490 "+r"(dst_v), // %2
3491 "+r"(width) // %3
3492 :
3493 : "memory", "cc", NACL_R14
3494 "xmm0", "xmm1", "xmm5"
3495 );
3496 }
3497 #endif // HAS_YUY2TOYROW_AVX2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
                              11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};

// Blend 4 pixels at a time, with a 1 pixel tail loop.
void ARGBBlendRow_SSSE3(const uint8* src_argb0,
                        const uint8* src_argb1,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0xf,%%xmm7 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x8,%%xmm6 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psllw $0x8,%%xmm5 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"

    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "movdqu " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movdqu " MEMACCESS(1) ",%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

    "49: \n"
    "add $0x3,%3 \n"
    "jl 99f \n"

    // 1 pixel loop.
    "91: \n"
    "movd " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movd " MEMACCESS(1) ",%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movd " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x4,2) ",%2 \n"
    "sub $0x1,%3 \n"
    "jge 91b \n"
    "99: \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBBLENDROW_SSSE3
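
// Compiled-out sketch of the "over" blend above. The asm inverts the source
// alpha (pxor with 0xFF000000), adds 1 to get (256 - a), scales the
// background by it, and saturate-adds the foreground with alpha forced to
// 255. Hypothetical name; illustration only, not the library's C path.
#if 0
static void ARGBBlendRow_Sketch(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const uint32 a = src_argb0[3];
    for (int c = 0; c < 3; ++c) {
      const uint32 fg = src_argb0[c];
      const uint32 bg = (src_argb1[c] * (256 - a)) >> 8;
      dst_argb[c] = fg + bg > 255 ? 255 : (uint8)(fg + bg);  // paddusb
    }
    dst_argb[3] = 255;  // por with the 0xFF000000 mask.
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
#endif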

#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
void BlendPlaneRow_SSSE3(const uint8* src0,
                         const uint8* src1,
                         const uint8* alpha,
                         uint8* dst,
                         int width) {
  asm volatile(
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psllw $0x8,%%xmm5 \n"
    "mov $0x80808080,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "mov $0x807f807f,%%eax \n"
    "movd %%eax,%%xmm7 \n"
    "pshufd $0x0,%%xmm7,%%xmm7 \n"
    "sub %2,%0 \n"
    "sub %2,%1 \n"
    "sub %2,%3 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movq (%2),%%xmm0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "pxor %%xmm5,%%xmm0 \n"
    "movq (%0,%2,1),%%xmm1 \n"
    "movq (%1,%2,1),%%xmm2 \n"
    "punpcklbw %%xmm2,%%xmm1 \n"
    "psubb %%xmm6,%%xmm1 \n"
    "pmaddubsw %%xmm1,%%xmm0 \n"
    "paddw %%xmm7,%%xmm0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0,(%3,%2,1) \n"
    "lea 0x8(%2),%2 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(src0),   // %0
    "+r"(src1),   // %1
    "+r"(alpha),  // %2
    "+r"(dst),    // %3
    "+rm"(width)  // %4
  ::"memory",
    "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
}
#endif  // HAS_BLENDPLANEROW_SSSE3
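
// Compiled-out sketch of the unsigned formula documented above (the asm
// implements the equivalent signed pmaddubsw variant, hence the -128 bias
// and the 0x807f rounding constant). Hypothetical name, for illustration.
#if 0
static void BlendPlaneRow_Sketch(const uint8* src0, const uint8* src1,
                                 const uint8* alpha, uint8* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint8)((src0[i] * alpha[i] +
                      src1[i] * (255 - alpha[i]) + 255) >> 8);
  }
}
#endif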

#ifdef HAS_BLENDPLANEROW_AVX2
// Blend 32 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
void BlendPlaneRow_AVX2(const uint8* src0,
                        const uint8* src1,
                        const uint8* alpha,
                        uint8* dst,
                        int width) {
  asm volatile(
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpsllw $0x8,%%ymm5,%%ymm5 \n"
    "mov $0x80808080,%%eax \n"
    "vmovd %%eax,%%xmm6 \n"
    "vbroadcastss %%xmm6,%%ymm6 \n"
    "mov $0x807f807f,%%eax \n"
    "vmovd %%eax,%%xmm7 \n"
    "vbroadcastss %%xmm7,%%ymm7 \n"
    "sub %2,%0 \n"
    "sub %2,%1 \n"
    "sub %2,%3 \n"

    // 32 pixel loop.
    LABELALIGN
    "1: \n"
    "vmovdqu (%2),%%ymm0 \n"
    "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
    "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
    "vmovdqu (%0,%2,1),%%ymm1 \n"
    "vmovdqu (%1,%2,1),%%ymm2 \n"
    "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
    "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
    "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0,(%3,%2,1) \n"
    "lea 0x20(%2),%2 \n"
    "sub $0x20,%4 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src0),   // %0
    "+r"(src1),   // %1
    "+r"(alpha),  // %2
    "+r"(dst),    // %3
    "+rm"(width)  // %4
  ::"memory",
    "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
    "xmm7");
}
#endif  // HAS_BLENDPLANEROW_AVX2

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
                               7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
                               15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
// Attenuate 4 pixels at a time.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "pslld $0x18,%%xmm3 \n"
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "punpcklbw %%xmm1,%%xmm1 \n"
    "pmulhuw %%xmm1,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"
    "punpckhbw %%xmm2,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "pand %%xmm3,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)   // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3
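
// Compiled-out sketch of attenuation: each color channel is premultiplied by
// alpha, roughly v * a / 256 (the asm's duplicated-byte fixed point lands a
// touch closer to v * a / 255), and the alpha byte passes through unchanged.
// Hypothetical name; illustration only.
#if 0
static void ARGBAttenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                    int width) {
  for (int i = 0; i < width; ++i) {
    const uint32 a = src_argb[3];
    dst_argb[0] = (uint8)((src_argb[0] * a) >> 8);  // B
    dst_argb[1] = (uint8)((src_argb[1] * a) >> 8);  // G
    dst_argb[2] = (uint8)((src_argb[2] * a) >> 8);  // R
    dst_argb[3] = (uint8)a;                         // A preserved
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif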

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
                                         128u, 128u, 14u,  15u, 14u, 15u,
                                         14u,  15u,  128u, 128u};
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpslld $0x18,%%ymm5,%%ymm5 \n"
    "sub %0,%1 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
    "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
    "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
    "vpand %%ymm5,%%ymm6,%%ymm6 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpor %%ymm6,%%ymm0,%%ymm0 \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)  // vmovdqu %%ymm0,(%0,%1)
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "m"(kShuffleAlpha_AVX2)  // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
                             uint8* dst_argb,
                             int width) {
  uintptr_t alpha;
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movzb " MEMACCESS2(0x03,0) ",%3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm2)  // movd 0x0(%4,%3,4),%%xmm2
    "movzb " MEMACCESS2(0x07,0) ",%3 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3)  // movd 0x0(%4,%3,4),%%xmm3
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "movlhps %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm2)  // movd 0x0(%4,%3,4),%%xmm2
    "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3)  // movd 0x0(%4,%3,4),%%xmm3
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "movlhps %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width),     // %2
    "=&r"(alpha)     // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2
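
// Compiled-out sketch of unattenuation: the row above looks up a fixed-point
// reciprocal of alpha in fixed_invtbl8 and scales each color channel back up
// by roughly 255/a, saturating at 255. The scalar math below is an
// approximation of that table-based path, not a bit-exact match; the name is
// hypothetical.
#if 0
static void ARGBUnattenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  for (int i = 0; i < width; ++i) {
    const uint32 a = src_argb[3];
    for (int c = 0; c < 3; ++c) {
      const uint32 v = a ? (src_argb[c] * 255 + a / 2) / a : src_argb[c];
      dst_argb[c] = v > 255 ? 255 : (uint8)v;  // packuswb saturates.
    }
    dst_argb[3] = (uint8)a;
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif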

#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// Unattenuate 8 pixels at a time.
void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
                             uint8* dst_argb,
                             int width) {
  uintptr_t alpha;
  asm volatile (
    "sub %0,%1 \n"
    "vbroadcastf128 %5,%%ymm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    // replace VPGATHER
    "movzb " MEMACCESS2(0x03,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0)  // vmovd 0x0(%4,%3,4),%%xmm0
    "movzb " MEMACCESS2(0x07,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1)  // vmovd 0x0(%4,%3,4),%%xmm1
    "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2)  // vmovd 0x0(%4,%3,4),%%xmm2
    "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3)  // vmovd 0x0(%4,%3,4),%%xmm3
    "movzb " MEMACCESS2(0x13,0) ",%3 \n"
    "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0)  // vmovd 0x0(%4,%3,4),%%xmm0
    "movzb " MEMACCESS2(0x17,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1)  // vmovd 0x0(%4,%3,4),%%xmm1
    "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2)  // vmovd 0x0(%4,%3,4),%%xmm2
    "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3)  // vmovd 0x0(%4,%3,4),%%xmm3
    "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
    // end of VPGATHER

    "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
    "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
    "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)  // vmovdqu %%ymm0,(%0,%1)
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width),     // %2
    "=&r"(alpha)     // %3
  : "r"(fixed_invtbl8),  // %4
    "m"(kUnattenShuffleAlpha_AVX2)  // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBUNATTENUATEROW_AVX2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrld $0x18,%%xmm2 \n"
    "psrld $0x18,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm3 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"
    "punpckhwd %%xmm3,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBGRAYROW_SSSE3
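
// Compiled-out sketch of the gray conversion above, using the kARGBToYJ
// weights (B=15, G=75, R=38, summing to 128) with the kAddYJ64 rounding bias
// and a >>7 shift. Alpha passes through. Hypothetical name; illustration
// only.
#if 0
static void ARGBGrayRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                               int width) {
  for (int i = 0; i < width; ++i) {
    const uint8 y = (uint8)((src_argb[0] * 15 + src_argb[1] * 75 +
                             src_argb[2] * 38 + 64) >> 7);
    dst_argb[0] = y;
    dst_argb[1] = y;
    dst_argb[2] = y;
    dst_argb[3] = src_argb[3];  // alpha unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif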

#ifdef HAS_ARGBSEPIAROW_SSSE3
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone
static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                             17, 68, 35, 0, 17, 68, 35, 0};

static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                             22, 88, 45, 0, 22, 88, 45, 0};

static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                             24, 98, 50, 0, 24, 98, 50, 0};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa %2,%%xmm2 \n"
    "movdqa %3,%%xmm3 \n"
    "movdqa %4,%%xmm4 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "pmaddubsw %%xmm2,%%xmm6 \n"
    "phaddw %%xmm6,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm5 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm5 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm5 \n"
    "psrlw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm5 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm5 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm5 \n"
    "psrlw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "psrld $0x18,%%xmm6 \n"
    "psrld $0x18,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm5 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm5,%%xmm0 \n"
    "punpckhwd %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(0) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x8,%1 \n"
    "jg 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)      // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBSEPIAROW_SSSE3
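
// Compiled-out sketch of the in-place sepia transform, straight from the
// formulas documented above; packuswb saturates each channel at 255 and the
// alpha byte is left unchanged. Hypothetical name; illustration only.
#if 0
static void ARGBSepiaRow_Sketch(uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
    const int sb = (b * 17 + g * 68 + r * 35) >> 7;
    const int sg = (b * 22 + g * 88 + r * 45) >> 7;
    const int sr = (b * 24 + g * 98 + r * 50) >> 7;
    dst_argb[0] = sb > 255 ? 255 : (uint8)sb;
    dst_argb[1] = sg > 255 ? 255 : (uint8)sg;
    dst_argb[2] = sr > 255 ? 255 : (uint8)sr;
    // dst_argb[3] (alpha) is preserved.
    dst_argb += 4;
  }
}
#endif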

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
                              uint8* dst_argb,
                              const int8* matrix_argb,
                              int width) {
  asm volatile (
    "movdqu " MEMACCESS(3) ",%%xmm5 \n"
    "pshufd $0x00,%%xmm5,%%xmm2 \n"
    "pshufd $0x55,%%xmm5,%%xmm3 \n"
    "pshufd $0xaa,%%xmm5,%%xmm4 \n"
    "pshufd $0xff,%%xmm5,%%xmm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "pmaddubsw %%xmm2,%%xmm7 \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "phaddsw %%xmm7,%%xmm0 \n"
    "phaddsw %%xmm1,%%xmm6 \n"
    "psraw $0x6,%%xmm0 \n"
    "psraw $0x6,%%xmm6 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm7 \n"
    "phaddsw %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm7 \n"
    "phaddsw %%xmm7,%%xmm6 \n"
    "psraw $0x6,%%xmm1 \n"
    "psraw $0x6,%%xmm6 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "punpcklwd %%xmm1,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm6 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(matrix_argb)  // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
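
// Compiled-out sketch of the color matrix transform: matrix_argb is 16 signed
// bytes, 4 weights (for B, G, R, A inputs) per output channel, and results
// are shifted right by 6 (psraw $6) then clamped to [0, 255]. The SIMD's
// pmaddubsw/phaddsw saturate intermediates, which this sketch ignores.
// Hypothetical name; illustration only.
#if 0
static void ARGBColorMatrixRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      const int8* m, int width) {
  for (int i = 0; i < width; ++i) {
    const int b = src_argb[0], g = src_argb[1], r = src_argb[2],
              a = src_argb[3];
    for (int c = 0; c < 4; ++c) {
      const int v = (b * m[c * 4 + 0] + g * m[c * 4 + 1] +
                     r * m[c * 4 + 2] + a * m[c * 4 + 3]) >> 6;
      dst_argb[c] = v < 0 ? 0 : (v > 255 ? 255 : (uint8)v);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif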

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
void ARGBQuantizeRow_SSE2(uint8* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile (
    "movd %2,%%xmm2 \n"
    "movd %3,%%xmm3 \n"
    "movd %4,%%xmm4 \n"
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshufd $0x44,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "pshufd $0x44,%%xmm3,%%xmm3 \n"
    "pshuflw $0x40,%%xmm4,%%xmm4 \n"
    "pshufd $0x44,%%xmm4,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "pslld $0x18,%%xmm6 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "pmullw %%xmm3,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm7 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "pand %%xmm6,%%xmm7 \n"
    "paddw %%xmm4,%%xmm0 \n"
    "paddw %%xmm4,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x4,%1 \n"
    "jg 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)      // %1
  : "r"(scale),          // %2
    "r"(interval_size),  // %3
    "r"(interval_offset) // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
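
// Compiled-out sketch of the in-place quantization above: each color channel
// is bucketed by (v * scale) >> 16 (pmulhuw), rescaled by interval_size and
// offset by interval_offset; the original alpha is masked back in with
// 0xFF000000. Hypothetical name; illustration only.
#if 0
static void ARGBQuantizeRow_Sketch(uint8* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {  // alpha (index 3) is preserved.
      dst_argb[c] = (uint8)(((dst_argb[c] * scale) >> 16) * interval_size +
                            interval_offset);
    }
    dst_argb += 4;
  }
}
#endif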

#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
void ARGBShadeRow_SSE2(const uint8* src_argb,
                       uint8* dst_argb,
                       int width,
                       uint32 value) {
  asm volatile (
    "movd %3,%%xmm2 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "punpcklqdq %%xmm2,%%xmm2 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)  // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_ARGBSHADEROW_SSE2
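
// Compiled-out sketch of shading: each channel is multiplied by the matching
// byte of `value`, treating 256 as unity, so dst ~= (src * v) >> 8. The asm's
// duplicated-byte fixed point rounds slightly differently. Hypothetical name;
// illustration only.
#if 0
static void ARGBShadeRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                int width, uint32 value) {
  const uint32 b_scale = value & 0xff;
  const uint32 g_scale = (value >> 8) & 0xff;
  const uint32 r_scale = (value >> 16) & 0xff;
  const uint32 a_scale = value >> 24;
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = (uint8)((src_argb[0] * b_scale) >> 8);
    dst_argb[1] = (uint8)((src_argb[1] * g_scale) >> 8);
    dst_argb[2] = (uint8)((src_argb[2] * r_scale) >> 8);
    dst_argb[3] = (uint8)((src_argb[3] * a_scale) >> 8);
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif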

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqu " MEMACCESS(1) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqu %%xmm0,%%xmm1 \n"
    "movdqu %%xmm2,%%xmm3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpckhbw %%xmm5,%%xmm3 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2
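
// Compiled-out sketch of the multiply: one operand is widened by duplicating
// each byte into a word (v * 0x101), the other is zero-extended, and pmulhuw
// keeps the high 16 bits, i.e. dst = (a * 0x101 * b) >> 16, which closely
// approximates a * b / 255. Hypothetical name; illustration only.
#if 0
static void ARGBMultiplyRow_Sketch(const uint8* src0, const uint8* src1,
                                   uint8* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    dst[i] = (uint8)((src0[i] * 0x101 * src1[i]) >> 16);  // ~a*b/255
  }
}
#endif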

#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x20,2) ",%2 \n"
    "sub $0x8,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__AVX2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBMULTIPLYROW_AVX2

#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBAddRow_SSE2(const uint8* src_argb0,
                     const uint8* src_argb1,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqu " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
#endif  // HAS_ARGBADDROW_SSE2
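
// Compiled-out sketch of the add: paddusb is a saturating byte add across all
// channels; the subtract rows below mirror this with psubusb (clamping at 0).
// Hypothetical name; illustration only.
#if 0
static void ARGBAddRow_Sketch(const uint8* src0, const uint8* src1,
                              uint8* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    const int v = src0[i] + src1[i];
    dst[i] = v > 255 ? 255 : (uint8)v;  // saturating add, like paddusb.
  }
}
#endif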

#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_AVX2(const uint8* src_argb0,
                     const uint8* src_argb1,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "vmovdqu %%ymm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x20,2) ",%2 \n"
    "sub $0x8,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0"
  );
}
#endif  // HAS_ARGBADDROW_AVX2

#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
void ARGBSubtractRow_SSE2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqu " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "psubusb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2

#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_AVX2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "vmovdqu %%ymm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x20,2) ",%2 \n"
    "sub $0x8,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0"
  );
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
void SobelXRow_SSE2(const uint8* src_y0,
                    const uint8* src_y1,
                    const uint8* src_y2,
                    uint8* dst_sobelx,
                    int width) {
  asm volatile (
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "sub %0,%3 \n"
    "pxor %%xmm5,%%xmm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1)  // movq (%0,%1,1),%%xmm1
    MEMOPREG(movq,0x02,0,1,1,xmm2)  // movq 0x2(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    MEMOPREG(movq,0x00,0,2,1,xmm2)  // movq (%0,%2,1),%%xmm2
    MEMOPREG(movq,0x02,0,2,1,xmm3)  // movq 0x2(%0,%2,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "psubw %%xmm0,%%xmm1 \n"
    "pmaxsw %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    MEMOPMEM(movq,xmm0,0x00,0,3,1)  // movq %%xmm0,(%0,%3,1)
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SOBELXROW_SSE2
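
// Compiled-out sketch of the SobelX row above: each output is the absolute
// value of (row0[i] - row0[i+2]) + 2*(row1[i] - row1[i+2]) +
// (row2[i] - row2[i+2]), saturated to 255 (packuswb). SobelYRow below is the
// transpose of the same stencil. Hypothetical name; illustration only.
#if 0
static void SobelXRow_Sketch(const uint8* src_y0, const uint8* src_y1,
                             const uint8* src_y2, uint8* dst_sobelx,
                             int width) {
  for (int i = 0; i < width; ++i) {
    int s = (src_y0[i] - src_y0[i + 2]) +
            2 * (src_y1[i] - src_y1[i + 2]) +
            (src_y2[i] - src_y2[i + 2]);
    if (s < 0) s = -s;  // pmaxsw against the negation.
    dst_sobelx[i] = s > 255 ? 255 : (uint8)s;
  }
}
#endif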

#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
void SobelYRow_SSE2(const uint8* src_y0,
                    const uint8* src_y1,
                    uint8* dst_sobely,
                    int width) {
  asm volatile (
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "pxor %%xmm5,%%xmm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1)  // movq (%0,%1,1),%%xmm1
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
    MEMOPREG(movq,0x01,0,1,1,xmm2)  // movq 0x1(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
    MEMOPREG(movq,0x02,0,1,1,xmm3)  // movq 0x2(%0,%1,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "psubw %%xmm0,%%xmm1 \n"
    "pmaxsw %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    MEMOPMEM(movq,xmm0,0x00,0,2,1)  // movq %%xmm0,(%0,%2,1)
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x8,%3 \n"
    "jg 1b \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SOBELYROW_SSE2

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_SSE2(const uint8* src_sobelx,
                   const uint8* src_sobely,
                   uint8* dst_argb,
                   int width) {
  asm volatile (
    "sub %0,%1 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"

    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)  // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm2 \n"
    "punpckhbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm1 \n"
    "punpckhwd %%xmm2,%%xmm2 \n"
    "por %%xmm5,%%xmm1 \n"
    "por %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklwd %%xmm0,%%xmm3 \n"
    "punpckhwd %%xmm0,%%xmm0 \n"
    "por %%xmm5,%%xmm3 \n"
    "por %%xmm5,%%xmm0 \n"
    "movdqu %%xmm1," MEMACCESS(2) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
    "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
    "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SOBELROW_SSE2
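
// Compiled-out sketch of the packing above: s = min(255, sx + sy), then each
// output pixel is (B, G, R) = s with A = 255 (the 0xFF000000 OR mask).
// Hypothetical name; illustration only.
#if 0
static void SobelRow_Sketch(const uint8* src_sobelx, const uint8* src_sobely,
                            uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const int t = src_sobelx[i] + src_sobely[i];
    const uint8 s = t > 255 ? 255 : (uint8)t;  // paddusb
    dst_argb[i * 4 + 0] = s;
    dst_argb[i * 4 + 1] = s;
    dst_argb[i * 4 + 2] = s;
    dst_argb[i * 4 + 3] = 255;
  }
}
#endif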

#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
                          const uint8* src_sobely,
                          uint8* dst_y,
                          int width) {
  asm volatile (
    "sub %0,%1 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"

    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)  // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_y),       // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
#endif  // HAS_SOBELTOPLANEROW_SSE2

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_SSE2(const uint8* src_sobelx,
                     const uint8* src_sobely,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    "sub %0,%1 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"

    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)  // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "paddusb %%xmm1,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "punpckhbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "punpcklbw %%xmm2,%%xmm4 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "punpcklwd %%xmm3,%%xmm6 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "punpcklwd %%xmm0,%%xmm7 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqu %%xmm6," MEMACCESS(2) " \n"
    "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
    "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_SOBELXYROW_SSE2

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row,
                                  int32* cumsum,
                                  const int32* previous_cumsum,
                                  int width) {
  asm volatile (
    "pxor %%xmm0,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"
    "test $0xf,%1 \n"
    "jne 49f \n"

    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm2,%%xmm4 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "punpckhwd %%xmm1,%%xmm3 \n"
    "punpckhbw %%xmm1,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "punpcklwd %%xmm1,%%xmm4 \n"
    "punpckhwd %%xmm1,%%xmm5 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(2) ",%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
    "paddd %%xmm0,%%xmm3 \n"
    "paddd %%xmm4,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
    "paddd %%xmm0,%%xmm4 \n"
    "paddd %%xmm5,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "paddd %%xmm0,%%xmm5 \n"
    "movdqu %%xmm2," MEMACCESS(1) " \n"
    "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
    "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

    "49: \n"
    "add $0x3,%3 \n"
    "jl 19f \n"

    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "movd " MEMACCESS(0) ",%%xmm2 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(2) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "movdqu %%xmm2," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"

    "19: \n"
  : "+r"(row),              // %0
    "+r"(cumsum),           // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)             // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
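
// Compiled-out sketch of the integral-image row above: a per-channel running
// sum along this row is added to the previous row's cumulative sums, giving
// each entry the sum of everything above and to the left, inclusive.
// Hypothetical name; illustration only.
#if 0
static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
                                           const int32* previous_cumsum,
                                           int width) {
  int32 running[4] = {0, 0, 0, 0};  // B, G, R, A sums along this row.
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      running[c] += row[i * 4 + c];
      cumsum[i * 4 + c] = running[c] + previous_cumsum[i * 4 + c];
    }
  }
}
#endif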

#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
void CumulativeSumToAverageRow_SSE2(const int32* topleft,
                                    const int32* botleft,
                                    int width,
                                    int area,
                                    uint8* dst,
                                    int count) {
  asm volatile (
    "movd %5,%%xmm5 \n"
    "cvtdq2ps %%xmm5,%%xmm5 \n"
    "rcpss %%xmm5,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"
    "cmpl $0x80,%5 \n"
    "ja 40f \n"

    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrld $0x10,%%xmm6 \n"
    "cvtdq2ps %%xmm6,%%xmm6 \n"
    "addps %%xmm6,%%xmm5 \n"
    "mulps %%xmm4,%%xmm5 \n"
    "cvtps2dq %%xmm5,%%xmm5 \n"
    "packssdw %%xmm5,%%xmm5 \n"

    // 4 pixel small loop.
    LABELALIGN
    "4: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)  // psubd 0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)  // psubd 0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)  // psubd 0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)  // psubd 0x30(%0,%4,4),%%xmm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
    "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
    "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)  // paddd 0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)  // paddd 0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)  // paddd 0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)  // paddd 0x30(%1,%4,4),%%xmm3
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm0 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jge 4b \n"
    "jmp 49f \n"

    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)  // psubd 0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)  // psubd 0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)  // psubd 0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)  // psubd 0x30(%0,%4,4),%%xmm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
    "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
    "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)  // paddd 0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)  // paddd 0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)  // paddd 0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)  // paddd 0x30(%1,%4,4),%%xmm3
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm1,%%xmm1 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "mulps %%xmm4,%%xmm1 \n"
    "cvtdq2ps %%xmm2,%%xmm2 \n"
    "cvtdq2ps %%xmm3,%%xmm3 \n"
    "mulps %%xmm4,%%xmm2 \n"
    "mulps %%xmm4,%%xmm3 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "cvtps2dq %%xmm1,%%xmm1 \n"
    "cvtps2dq %%xmm2,%%xmm2 \n"
    "cvtps2dq %%xmm3,%%xmm3 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

    "49: \n"
    "add $0x3,%3 \n"
    "jl 19f \n"

    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)  // psubd 0x00(%0,%4,4),%%xmm0
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)  // paddd 0x00(%1,%4,4),%%xmm0
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x4,2) ",%2 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"
    "19: \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"((intptr_t)(width)),  // %4
    "rm"(area)               // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
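
// Compiled-out sketch of the box-filter average above. Per the asm
// addressing ((%0,%4,4) with scale 4), `width` is an int32 element offset to
// the box's right edge, and the box sum is the usual integral-image corner
// combination, scaled by 1/area (the asm uses an rcpss reciprocal
// approximation rather than an exact divide). Hypothetical name;
// illustration only.
#if 0
static void CumulativeSumToAverageRow_Sketch(const int32* topleft,
                                             const int32* botleft, int width,
                                             int area, uint8* dst, int count) {
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      const int32 sum = topleft[i * 4 + c] - topleft[i * 4 + c + width] -
                        botleft[i * 4 + c] + botleft[i * 4 + c + width];
      dst[i * 4 + c] = (uint8)(sum / area);
    }
  }
}
#endif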

#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb,
                        int src_argb_stride,
                        uint8* dst_argb,
                        const float* src_dudv,
                        int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile (
    "movq " MEMACCESS(3) ",%%xmm2 \n"
    "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
    "shl $0x10,%1 \n"
    "add $0x4,%1 \n"
    "movd %1,%%xmm5 \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"

    "pshufd $0x44,%%xmm7,%%xmm7 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "addps %%xmm7,%%xmm0 \n"
    "movlhps %%xmm0,%%xmm2 \n"
    "movdqa %%xmm7,%%xmm4 \n"
    "addps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "addps %%xmm4,%%xmm3 \n"
    "addps %%xmm4,%%xmm4 \n"

    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"  // x, y float to int first 2
    "cvttps2dq %%xmm3,%%xmm1 \n"  // x, y float to int next 2
    "packssdw %%xmm1,%%xmm0 \n"   // x, y as 8 shorts
    "pmaddwd %%xmm5,%%xmm0 \n"    // off = x * 4 + y * stride
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)  // movd (%0,%1,1),%%xmm1
    MEMOPREG(movd,0x00,0,5,1,xmm6)  // movd (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm1 \n"
    "addps %%xmm4,%%xmm2 \n"
    "movq %%xmm1," MEMACCESS(2) " \n"
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)  // movd (%0,%1,1),%%xmm0
    MEMOPREG(movd,0x00,0,5,1,xmm6)  // movd (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm0 \n"
    "addps %%xmm4,%%xmm3 \n"
    "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%4 \n"
    "jge 40b \n"

    "49: \n"
    "add $0x3,%4 \n"
    "jl 19f \n"

    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"
    "addps %%xmm7,%%xmm2 \n"
    "movd %%xmm0,%k1 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)  // movd (%0,%1,1),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x04,2) ",%2 \n"
    "sub $0x1,%4 \n"
    "jge 10b \n"
    "19: \n"
  : "+r"(src_argb),              // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),              // %2
    "+r"(src_dudv),              // %3
    "+rm"(width),                // %4
    "=&r"(temp)                  // %5
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2
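
// Compiled-out sketch of the affine row: src_dudv holds {x, y, dx, dy}, the
// starting source coordinate and the per-destination-pixel step; each output
// pixel is fetched at the truncated (cvttps2dq) coordinate via
// offset = x * 4 + y * stride. Hypothetical name; illustration only.
#if 0
static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
                                 uint8* dst_argb, const float* src_dudv,
                                 int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  const float du = src_dudv[2];
  const float dv = src_dudv[3];
  for (int i = 0; i < width; ++i) {
    // Truncate toward zero, matching cvttps2dq.
    const uint8* p = src_argb + (int)v * src_argb_stride + (int)u * 4;
    dst_argb[i * 4 + 0] = p[0];
    dst_argb[i * 4 + 1] = p[1];
    dst_argb[i * 4 + 2] = p[2];
    dst_argb[i * 4 + 3] = p[3];
    u += du;
    v += dv;
  }
}
#endif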

#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSSE3(uint8* dst_ptr,
                          const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile (
    "sub %1,%0 \n"
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "cmp $0x80,%3 \n"
    "je 50f \n"

    "movd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x100,%3 \n"
    "movd %3,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x80808080,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"

    // General purpose row blend.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "psubb %%xmm4,%%xmm0 \n"
    "psubb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm5,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm3 \n"
    "pmaddubsw %%xmm0,%%xmm2 \n"
    "pmaddubsw %%xmm1,%%xmm3 \n"
    "paddw %%xmm4,%%xmm2 \n"
    "paddw %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb %%xmm1,%%xmm0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 50b \n"
    "jmp 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 100b \n"

    "99: \n"
  : "+r"(dst_ptr),            // %0
    "+r"(src_ptr),            // %1
    "+rm"(dst_width),         // %2
    "+r"(source_y_fraction)   // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_INTERPOLATEROW_SSSE3
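
// Compiled-out sketch of the bilinear row blend: fractions 0 and 128 take the
// copy and pavgb fast paths; the general path is the weighted sum below with
// an 8-bit fraction. The asm's 0x80808080 bias is a signed-pmaddubsw
// reformulation of this unsigned math and may round slightly differently.
// Hypothetical name; illustration only.
#if 0
static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                  ptrdiff_t src_stride, int dst_width,
                                  int source_y_fraction) {
  const int y1 = source_y_fraction;  // 0..256, weight of the second row.
  const int y0 = 256 - y1;
  for (int i = 0; i < dst_width; ++i) {
    dst_ptr[i] = (uint8)((src_ptr[i] * y0 +
                          src_ptr[i + src_stride] * y1 + 128) >> 8);
  }
}
#endif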

#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
void InterpolateRow_AVX2(uint8* dst_ptr,
                         const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  asm volatile (
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "sub %1,%0 \n"
    "cmp $0x80,%3 \n"
    "je 50f \n"

    "vmovd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x100,%3 \n"
    "vmovd %3,%%xmm5 \n"
    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
    "vbroadcastss %%xmm5,%%ymm5 \n"
    "mov $0x80808080,%%eax \n"
    "vmovd %%eax,%%xmm4 \n"
    "vbroadcastss %%xmm4,%%ymm4 \n"

    // General purpose row blend.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
    "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
    "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)  // vpavgb (%1,%4,1),%%ymm0,%%ymm0
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 50b \n"
    "jmp 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "rep movsb " MEMMOVESTRING(1,0) " \n"
    "jmp 999f \n"

    "99: \n"
    "vzeroupper \n"
    "999: \n"
  : "+D"(dst_ptr),           // %0
    "+S"(src_ptr),           // %1
    "+cm"(dst_width),        // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
  );
}
#endif  // HAS_INTERPOLATEROW_AVX2

#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb,
                          uint8* dst_argb,
                          const uint8* shuffler,
                          int width) {
  asm volatile (
    "movdqu " MEMACCESS(3) ",%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(shuffler)  // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSSE3
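
// Compiled-out sketch of the channel shuffle: shuffler[0..3] gives, for each
// output byte of a pixel, which input byte of that pixel to copy, e.g.
// {3,2,1,0} reverses the channel order. Hypothetical name; illustration only.
#if 0
static void ARGBShuffleRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  const uint8* shuffler, int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[i * 4 + 0] = src_argb[i * 4 + shuffler[0]];
    dst_argb[i * 4 + 1] = src_argb[i * 4 + shuffler[1]];
    dst_argb[i * 4 + 2] = src_argb[i * 4 + shuffler[2]];
    dst_argb[i * 4 + 3] = src_argb[i * 4 + shuffler[3]];
  }
}
#endif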

#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_AVX2(const uint8* src_argb,
                         uint8* dst_argb,
                         const uint8* shuffler,
                         int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
    "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(shuffler)  // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSE2(const uint8* src_argb,
                         uint8* dst_argb,
                         const uint8* shuffler,
                         int width) {
  uintptr_t pixel_temp;
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"
    "mov " MEMACCESS(4) ",%k2 \n"
    "cmp $0x3000102,%k2 \n"
    "je 3012f \n"
    "cmp $0x10203,%k2 \n"
    "je 123f \n"
    "cmp $0x30201,%k2 \n"
    "je 321f \n"
    "cmp $0x2010003,%k2 \n"
    "je 2103f \n"

    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS(1) " \n"
    "movzb " MEMACCESS2(0x1,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x1,1) " \n"
    "movzb " MEMACCESS2(0x2,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x2,1) " \n"
    "movzb " MEMACCESS2(0x3,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x3,1) " \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jg 1b \n"
    "jmp 99f \n"

    LABELALIGN
    "123: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
    "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 123b \n"
    "jmp 99f \n"

    LABELALIGN
    "321: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x39,%%xmm0,%%xmm0 \n"
    "pshuflw $0x39,%%xmm0,%%xmm0 \n"
    "pshufhw $0x39,%%xmm1,%%xmm1 \n"
    "pshuflw $0x39,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 321b \n"
    "jmp 99f \n"

    LABELALIGN
    "2103: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x93,%%xmm0,%%xmm0 \n"
    "pshuflw $0x93,%%xmm0,%%xmm0 \n"
    "pshufhw $0x93,%%xmm1,%%xmm1 \n"
    "pshuflw $0x93,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 2103b \n"
    "jmp 99f \n"

    LABELALIGN
    "3012: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
    "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
    "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
    "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 3012b \n"

    "99: \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "=&d"(pixel_temp),  // %2
    "+r"(width)         // %3
  : "r"(shuffler)       // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSE2
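
// Note on the dispatch above: the first four shuffler bytes are loaded as
// one little-endian dword and compared against the four channel orders that
// can be expressed as a word shuffle (pshuflw/pshufhw on the zero-extended
// pixels); any other mask falls back to the byte-gather loop at label 1.
// For example, a shuffler beginning {3,0,1,2} reads as the dword 0x2010003
// and takes the 2103 path, whose immediate 0x93 encodes the word order
// 3,0,1,2 in 2-bit fields.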

#ifdef HAS_I422TOYUY2ROW_SSE2
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame,
                        int width) {
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)  // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(3) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_y),      // %0
    "+r"(src_u),      // %1
    "+r"(src_v),      // %2
    "+r"(dst_frame),  // %3
    "+rm"(width)      // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOYUY2ROW_SSE2
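
// A scalar sketch of the YUY2 packing (hypothetical reference, excluded
// from the build). Each output macropixel is {Y0, U, Y1, V}: two luma
// samples share one chroma pair. The "sub %1,%2" above pre-computes
// src_v - src_u so a single register indexes both chroma planes.
#if 0
static void I422ToYUY2Sketch(const uint8* src_y, const uint8* src_u,
                             const uint8* src_v, uint8* dst_frame,
                             int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_frame += 4;
  }
}
#endif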

#ifdef HAS_I422TOUYVYROW_SSE2
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame,
                        int width) {
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)  // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS(3) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_y),      // %0
    "+r"(src_u),      // %1
    "+r"(src_v),      // %2
    "+r"(dst_frame),  // %3
    "+rm"(width)      // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOUYVYROW_SSE2
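
// UYVY is the same packing with chroma leading: {U, Y0, V, Y1}. That is why
// the punpcklbw operand roles above are swapped relative to the YUY2
// version: the interleaved chroma bytes land in the low byte of each pair.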

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile (
    "pxor %%xmm3,%%xmm3 \n"

    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm3,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
    "addps " MEMACCESS(3) ",%%xmm0 \n"
    "addps " MEMACCESS(3) ",%%xmm4 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm6 \n"
    "mulps %%xmm1,%%xmm2 \n"
    "mulps %%xmm5,%%xmm6 \n"
    "mulps %%xmm2,%%xmm1 \n"
    "mulps %%xmm6,%%xmm5 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
    "addps %%xmm2,%%xmm0 \n"
    "addps %%xmm6,%%xmm4 \n"
    "addps %%xmm1,%%xmm0 \n"
    "addps %%xmm5,%%xmm4 \n"
    "cvttps2dq %%xmm0,%%xmm0 \n"
    "cvttps2dq %%xmm4,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x2,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
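
// A scalar sketch of the cubic polynomial applied per channel (hypothetical
// reference, excluded from the build). poly points at four vectors of four
// floats, C0..C3, each holding one coefficient per ARGB channel.
#if 0
static void ARGBPolynomialSketch(const uint8* src_argb, uint8* dst_argb,
                                 const float* poly, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {  // B, G, R, A channel of pixel i.
      float x = (float)src_argb[i * 4 + c];
      float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                poly[c + 12] * x * x * x;
      // cvttps2dq truncates toward zero; packuswb clamps to 0..255.
      int t = (int)v;
      dst_argb[i * 4 + c] = (uint8)(t < 0 ? 0 : (t > 255 ? 255 : t));
    }
  }
}
#endif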

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"

    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n"  // 2 ARGB pixels
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "vcvtdq2ps %%ymm0,%%ymm0 \n"  // X 8 floats
    "vmulps %%ymm0,%%ymm0,%%ymm2 \n"  // X * X
    "vmulps %%ymm7,%%ymm0,%%ymm3 \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n"  // result += C3 * X * X * X
    "vcvttps2dq %%ymm0,%%ymm0 \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
    "vmovq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x2,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
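
// Note on the FMA forms above (AT&T operand order "vfmaddXXXps a,b,d"):
// the 132 form computes d = d * a + b and the 231 form computes
// d = d + b * a, so the three instructions accumulate
// C0 + C1*X + C2*(X*X) + (C3*X)*(X*X) with no separate add chain.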

#ifdef HAS_HALFFLOATROW_SSE2
static float kScaleBias = 1.9259299444e-34f;
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "pshufd $0x0,%3,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
    "sub %0,%1 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"  // 8 shorts
    "add $0x10,%0 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm5,%%xmm2 \n"  // 8 ints in xmm2/xmm3
    "cvtdq2ps %%xmm2,%%xmm2 \n"  // 8 floats
    "punpckhwd %%xmm5,%%xmm3 \n"
    "cvtdq2ps %%xmm3,%%xmm3 \n"
    "mulps %%xmm4,%%xmm2 \n"
    "mulps %%xmm4,%%xmm3 \n"
    "psrld $0xd,%%xmm2 \n"
    "psrld $0xd,%%xmm3 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "x"(scale * kScaleBias)  // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_HALFFLOATROW_SSE2
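
// Note: kScaleBias is 2^-112. Multiplying by scale * 2^-112 rebiases the
// float exponent (bias 127) toward the half-float exponent (bias 15), so
// after the multiply the upper float bits are exactly the half-float bits
// and psrld $0xd extracts them. A one-sample sketch (hypothetical, excluded
// from the build):
#if 0
static uint16 UInt16ToHalfSketch(uint16 src, float scale) {
  union { float f; uint32 u; } v;
  v.f = (float)src * scale * 1.9259299444e-34f;  // scale * 2^-112.
  return (uint16)(v.u >> 13);  // top bits are now the half-float bits.
}
#endif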

#ifdef HAS_HALFFLOATROW_AVX2
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "vbroadcastss %3, %%ymm4 \n"
    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
    "sub %0,%1 \n"

    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm2 \n"  // 16 shorts
    "add $0x20,%0 \n"
    "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n"  // mutates
    "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
    "vcvtdq2ps %%ymm3,%%ymm3 \n"
    "vcvtdq2ps %%ymm2,%%ymm2 \n"
    "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
    "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
    "vpsrld $0xd,%%ymm3,%%ymm3 \n"
    "vpsrld $0xd,%%ymm2,%%ymm2 \n"
    "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n"  // unmutates
    MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
    "sub $0x10,%2 \n"
    "jg 1b \n"

    "vzeroupper \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "x"(scale * kScaleBias)  // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_HALFFLOATROW_AVX2

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "vbroadcastss %3, %%ymm4 \n"
    "sub %0,%1 \n"

    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n"  // 16 shorts -> 16 ints
    "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
    "vcvtdq2ps %%ymm2,%%ymm2 \n"
    "vcvtdq2ps %%ymm3,%%ymm3 \n"
    "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
    "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
    "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
    "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
    "add $0x20,%0 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "x"(scale)   // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_HALFFLOATROW_F16C
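
// Note: with F16C the hardware performs the float-to-half conversion, so
// the scale is applied directly (no exponent-bias trick) and vcvtps2ph
// uses rounding immediate 3 (truncate) to match the shift-based paths above.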

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
  asm volatile (
    "sub %0,%1 \n"
    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n"  // 16 shorts -> 16 ints
    "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
    "vcvtdq2ps %%ymm2,%%ymm2 \n"
    "vcvtdq2ps %%ymm3,%%ymm3 \n"
    "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
    "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
    "add $0x20,%0 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc",
    "xmm2", "xmm3"
  );
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8* dst_argb,
                           const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n"  // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n"  // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n"  // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
    MEMOPARG(movzb,0x03,3,1,4,1) " \n"  // movzb 0x3(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x1,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86
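
// A scalar sketch of the in-place lookup (hypothetical reference, excluded
// from the build): each channel indexes its own stripe of the interleaved
// 256-entry table.
#if 0
static void ARGBColorTableSketch(uint8* dst_argb, const uint8* table_argb,
                                 int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}
#endif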

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n"  // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n"  // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n"  // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86
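
// The RGB variant above is identical to ARGBColorTableRow_X86 except that
// it stops after the third channel, leaving alpha untouched.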

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
                                 uint8* dst_argb,
                                 int width,
                                 const uint8* luma,
                                 uint32 lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile (
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0x8,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(2) ",%%xmm0 \n"
    "pmaddubsw %%xmm3,%%xmm0 \n"
    "phaddw %%xmm0,%%xmm0 \n"
    "pand %%xmm4,%%xmm0 \n"
    "punpcklwd %%xmm5,%%xmm0 \n"
    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS(2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS(3) " \n"
    "movzb " MEMACCESS2(0x1,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x1,3) " \n"
    "movzb " MEMACCESS2(0x2,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x2,3) " \n"
    "movzb " MEMACCESS2(0x3,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x3,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS2(0x4,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x4,3) " \n"
    "movzb " MEMACCESS2(0x5,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x5,3) " \n"
    "movzb " MEMACCESS2(0x6,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x6,3) " \n"
    "movzb " MEMACCESS2(0x7,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x7,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS2(0x8,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x8,3) " \n"
    "movzb " MEMACCESS2(0x9,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x9,3) " \n"
    "movzb " MEMACCESS2(0xa,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xa,3) " \n"
    "movzb " MEMACCESS2(0xb,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xb,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"

    "movzb " MEMACCESS2(0xc,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xc,3) " \n"
    "movzb " MEMACCESS2(0xd,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xd,3) " \n"
    "movzb " MEMACCESS2(0xe,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xe,3) " \n"
    "movzb " MEMACCESS2(0xf,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xf,3) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "lea " MEMLEA(0x10,3) ",%3 \n"
    "sub $0x4,%4 \n"
    "jg 1b \n"
  : "=&d"(pixel_temp),  // %0
    "=&a"(table_temp),  // %1
    "+r"(src_argb),     // %2
    "+r"(dst_argb),     // %3
    "+rm"(width)        // %4
  : "r"(luma),          // %5
    "rm"(lumacoeff)     // %6
  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
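
// A scalar sketch of the luma-table remap (hypothetical reference, excluded
// from the build). lumacoeff packs four byte weights; pmaddubsw + phaddw
// form a per-pixel weighted sum whose high byte (pand with 0xff00 words)
// selects one 256-byte stripe of the luma table. B, G and R are remapped
// through that stripe and alpha is copied through.
#if 0
static void ARGBLumaColorTableSketch(const uint8* src_argb, uint8* dst_argb,
                                     int width, const uint8* luma,
                                     uint32 lumacoeff) {
  const uint8* w = (const uint8*)&lumacoeff;  // per-channel weights.
  for (int i = 0; i < width; ++i) {
    uint32 sum = src_argb[0] * w[0] + src_argb[1] * w[1] +
                 src_argb[2] * w[2] + src_argb[3] * w[3];
    const uint8* stripe = luma + (sum & 0xff00);  // 256-byte table row.
    dst_argb[0] = stripe[src_argb[0]];
    dst_argb[1] = stripe[src_argb[1]];
    dst_argb[2] = stripe[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif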

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif