// VERSION 2
/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
static vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
static vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
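
// Illustrative scalar equivalents of the two luma conversions above (a
// sketch for reference only; these hypothetical helpers are not used by
// the build). ARGB is stored little-endian, so byte 0 is blue, which is
// why the 13 (blue) coefficient comes first in kARGBToY.
static inline uint8 RGBToY_Sketch(uint8 r, uint8 g, uint8 b) {
  // BT.601 studio range: truncate the 7 bit fixed point sum, then add 16.
  return (uint8)(((33 * r + 65 * g + 13 * b) >> 7) + 16);
}
static inline uint8 RGBToYJ_Sketch(uint8 r, uint8 g, uint8 b) {
  // JPEG full range: no +16 bias; +64 rounds the 7 bit fixed point sum
  // (see kAddYJ64 below).
  return (uint8)((38 * r + 75 * g + 15 * b + 64) >> 7);
}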

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

static vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// Constants for BGRA
static vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
static vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
static vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGBToRAW for I422ToRAW. First 8 + next 4
static uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
#endif // HAS_RGB24TOARGBROW_SSSE3
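
// Scalar model of how the pshufb shuffle tables above are consumed (an
// illustrative sketch, not part of the build): each output byte selects an
// input byte by index, and an index with the high bit set (the 128u
// entries) produces zero, which clears the pad/alpha bytes.
static void Pshufb_Sketch(const uint8* src16, const uint8* shuf16,
                          uint8* dst16) {
  for (int i = 0; i < 16; ++i) {
    dst16[i] = (shuf16[i] & 0x80) ? 0 : src16[shuf16[i] & 15];
  }
}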

#if defined(TESTING) && defined(__x86_64__)
void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    ".p2align 5 \n"
    "mov %%eax,%%eax \n"
    "mov %%ebx,%%ebx \n"
    "mov %%ecx,%%ecx \n"
    "mov %%edx,%%edx \n"
    "mov %%esi,%%esi \n"
    "mov %%edi,%%edi \n"
    "mov %%ebp,%%ebp \n"
    "mov %%esp,%%esp \n"
    ".p2align 5 \n"
    "mov %%r8d,%%r8d \n"
    "mov %%r9d,%%r9d \n"
    "mov %%r10d,%%r10d \n"
    "mov %%r11d,%%r11d \n"
    "mov %%r12d,%%r12d \n"
    "mov %%r13d,%%r13d \n"
    "mov %%r14d,%%r14d \n"
    "mov %%r15d,%%r15d \n"
    ".p2align 5 \n"
    "lea (%%rax),%%eax \n"
    "lea (%%rbx),%%ebx \n"
    "lea (%%rcx),%%ecx \n"
    "lea (%%rdx),%%edx \n"
    "lea (%%rsi),%%esi \n"
    "lea (%%rdi),%%edi \n"
    "lea (%%rbp),%%ebp \n"
    "lea (%%rsp),%%esp \n"
    ".p2align 5 \n"
    "lea (%%r8),%%r8d \n"
    "lea (%%r9),%%r9d \n"
    "lea (%%r10),%%r10d \n"
    "lea (%%r11),%%r11d \n"
    "lea (%%r12),%%r12d \n"
    "lea (%%r13),%%r13d \n"
    "lea (%%r14),%%r14d \n"
    "lea (%%r15),%%r15d \n"

    ".p2align 5 \n"
    "lea 0x10(%%rax),%%eax \n"
    "lea 0x10(%%rbx),%%ebx \n"
    "lea 0x10(%%rcx),%%ecx \n"
    "lea 0x10(%%rdx),%%edx \n"
    "lea 0x10(%%rsi),%%esi \n"
    "lea 0x10(%%rdi),%%edi \n"
    "lea 0x10(%%rbp),%%ebp \n"
    "lea 0x10(%%rsp),%%esp \n"
    ".p2align 5 \n"
    "lea 0x10(%%r8),%%r8d \n"
    "lea 0x10(%%r9),%%r9d \n"
    "lea 0x10(%%r10),%%r10d \n"
    "lea 0x10(%%r11),%%r11d \n"
    "lea 0x10(%%r12),%%r12d \n"
    "lea 0x10(%%r13),%%r13d \n"
    "lea 0x10(%%r14),%%r14d \n"
    "lea 0x10(%%r15),%%r15d \n"

    ".p2align 5 \n"
    "add 0x10,%%eax \n"
    "add 0x10,%%ebx \n"
    "add 0x10,%%ecx \n"
    "add 0x10,%%edx \n"
    "add 0x10,%%esi \n"
    "add 0x10,%%edi \n"
    "add 0x10,%%ebp \n"
    "add 0x10,%%esp \n"
    ".p2align 5 \n"
    "add 0x10,%%r8d \n"
    "add 0x10,%%r9d \n"
    "add 0x10,%%r10d \n"
    "add 0x10,%%r11d \n"
    "add 0x10,%%r12d \n"
    "add 0x10,%%r13d \n"
    "add 0x10,%%r14d \n"
    "add 0x10,%%r15d \n"

    ".p2align 2 \n"
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
#endif // TESTING

#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
#endif // HAS_J400TOARGBROW_SSE2
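
// Scalar model of the row above (a sketch only; hypothetical helper): each
// grey sample is replicated into B, G and R, and the 0xff000000 mask built
// with pcmpeqb/pslld provides opaque alpha.
static void J400ToARGB_Sketch(const uint8* src_y, uint8* dst_argb, int pix) {
  for (int x = 0; x < pix; ++x) {
    uint8 y = src_y[x];
    dst_argb[x * 4 + 0] = y;     // B
    dst_argb[x * 4 + 1] = y;     // G
    dst_argb[x * 4 + 2] = y;     // R
    dst_argb[x * 4 + 3] = 255u;  // A
  }
}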

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_rgb24), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  : "m"(kShuffleMaskRGB24ToARGB) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_raw), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  : "m"(kShuffleMaskRAWToARGB) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20802080,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
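
// Scalar model of the RGB565 expansion above (a sketch; hypothetical
// helper). The pmulhuw constants implement the usual bit-replication:
// multiplying a 5 bit field placed at the top of a 16 bit lane by 0x108
// (or a 6 bit green field by 0x2080) leaves (v << 3) | (v >> 2) in the
// high byte.
static inline void RGB565ToARGB_Sketch(uint16 rgb565, uint8* b, uint8* g,
                                       uint8* r) {
  int b5 = rgb565 & 31;
  int g6 = (rgb565 >> 5) & 63;
  int r5 = (rgb565 >> 11) & 31;
  *b = (uint8)((b5 << 3) | (b5 >> 2));
  *g = (uint8)((g6 << 2) | (g6 >> 4));
  *r = (uint8)((r5 << 3) | (r5 >> 2));
}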

void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  : "m"(kShuffleMaskARGBToRGB24) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  : "m"(kShuffleMaskARGBToRAW) // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
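
// Scalar model of the RGB565 pack above (a sketch; hypothetical helper):
// keep the top 5/6/5 bits of each channel, which is what the shifted masks
// in xmm3/xmm4/xmm5 select before the dwords are packed down to words.
static inline uint16 ARGBToRGB565_Sketch(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}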

void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  :: "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  : "m"(kARGBToY), // %3
    "m"(kAddY16) // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but with different coefficients; no +16 bias, but
// rounded before the shift.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  : "m"(kARGBToYJ), // %3
    "m"(kAddYJ64) // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_ARGBTOYJROW_SSSE3

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd pattern to undo the within-lane mutation of vphaddw + vpackuswb.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};
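
// Scalar model of the vpermd "unmutate" step (a sketch; hypothetical
// helper): vphaddw and vpackuswb operate within 128 bit lanes, so the 32
// packed Y bytes come out with their dwords lane-interleaved; vpermd
// computes dst[i] = src[perm[i]], and the pattern above restores linear
// order.
static void Vpermd_Sketch(const uint32 src8[8], const int32 perm8[8],
                          uint32 dst8[8]) {
  for (int i = 0; i < 8; ++i) {
    dst8[i] = src8[perm8[i]];
  }
}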

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vbroadcastf128 %4,%%ymm5 \n"
    "vmovdqu %5,%%ymm6 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
    "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
    "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  : "m"(kARGBToY), // %3
    "m"(kAddY16), // %4
    "m"(kPermdARGBToY_AVX) // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vbroadcastf128 %4,%%ymm5 \n"
    "vmovdqu %5,%%ymm6 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
    "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
    "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  : "m"(kARGBToYJ), // %3
    "m"(kAddYJ64), // %4
    "m"(kPermdARGBToY_AVX) // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif // HAS_ARGBTOYJROW_AVX2

#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kARGBToV), // %5
    "m"(kARGBToU), // %6
    "m"(kAddUV128) // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTOUVROW_SSSE3
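
// Scalar model of the 2x2 subsampled U/V math above (a sketch;
// hypothetical helper). The asm averages the two rows with pavgb, which
// rounds at each stage, and averages column pairs via the shufps/pavgb
// pair, so its rounding differs very slightly from this model.
static void ARGBToUV_Sketch(const uint8* row0, const uint8* row1,
                            uint8* dst_u, uint8* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    int b = (row0[x * 4 + 0] + row0[x * 4 + 4] +
             row1[x * 4 + 0] + row1[x * 4 + 4] + 2) >> 2;
    int g = (row0[x * 4 + 1] + row0[x * 4 + 5] +
             row1[x * 4 + 1] + row1[x * 4 + 5] + 2) >> 2;
    int r = (row0[x * 4 + 2] + row0[x * 4 + 6] +
             row1[x * 4 + 2] + row1[x * 4 + 6] + 2) >> 2;
    *dst_u++ = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    *dst_v++ = (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
  }
}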

#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5 \n"
    "vbroadcastf128 %6,%%ymm6 \n"
    "vbroadcastf128 %7,%%ymm7 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
    "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
    "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
    "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
    "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
    "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
    "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpsraw $0x8,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm0,%%ymm0 \n"
    "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpshufb %8,%%ymm0,%%ymm0 \n"
    "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUV128), // %5
    "m"(kARGBToV), // %6
    "m"(kARGBToU), // %7
    "m"(kShufARGBToUV_AVX) // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTOUVROW_AVX2

#ifdef HAS_ARGBTOUVJROW_SSSE3
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kARGBToVJ), // %5
    "m"(kARGBToUJ), // %6
    "m"(kAddUVJ128) // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTOUVJROW_SSSE3

#ifdef HAS_ARGBTOUV444ROW_SSSE3
void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int width) {
  asm volatile (
    "movdqa %4,%%xmm3 \n"
    "movdqa %5,%%xmm4 \n"
    "movdqa %6,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm6 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm2 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "packsswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm3,%%xmm0 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm2 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "packsswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "m"(kARGBToV), // %4
    "m"(kARGBToU), // %5
    "m"(kAddUV128) // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6"
  );
}
#endif // HAS_ARGBTOUV444ROW_SSSE3

#ifdef HAS_ARGBTOUV422ROW_SSSE3
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %4,%%xmm3 \n"
    "movdqa %5,%%xmm4 \n"
    "movdqa %6,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "m"(kARGBToV), // %4
    "m"(kARGBToU), // %5
    "m"(kAddUV128) // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTOUV422ROW_SSSE3

void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_bgra), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  : "m"(kBGRAToY), // %3
    "m"(kAddY16) // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_bgra0), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "r"((intptr_t)(src_stride_bgra)), // %4
    "m"(kBGRAToV), // %5
    "m"(kBGRAToU), // %6
    "m"(kAddUV128) // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_abgr), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  : "m"(kABGRToY), // %3
    "m"(kAddY16) // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_rgba), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  : "m"(kRGBAToY), // %3
    "m"(kAddY16) // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_abgr0), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "r"((intptr_t)(src_stride_abgr)), // %4
    "m"(kABGRToV), // %5
    "m"(kABGRToU), // %6
    "m"(kAddUV128) // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_rgba0), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "r"((intptr_t)(src_stride_rgba)), // %4
    "m"(kRGBAToV), // %5
    "m"(kRGBAToU), // %6
    "m"(kAddUV128) // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

struct YuvConstants {
  lvec8 kUVToB; // 0
  lvec8 kUVToG; // 32
  lvec8 kUVToR; // 64
  lvec16 kUVBiasB; // 96
  lvec16 kUVBiasG; // 128
  lvec16 kUVBiasR; // 160
  lvec16 kYToRgb; // 192
};

// BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
// B = (Y - 16) * 1.164 - U * -2.018

// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
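
// Per-pixel fixed point model of the YUVTORGB macro further below (an
// illustrative sketch using the defines above before they are #undef'ed;
// these hypothetical helpers are not part of the build). punpcklbw y,y
// forms y * 0x101, and pmulhuw by YG then yields roughly
// round(1.164 * 64) * y; results are 6 bit fixed point, hence the final
// saturating >> 6 (psraw + packuswb in the asm).
static inline uint8 Clamp6_Sketch(int v) {
  v >>= 6;
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static inline void YuvPixel_Sketch(uint8 y, uint8 u, uint8 v,
                                   uint8* b, uint8* g, uint8* r) {
  int y1 = (y * 0x0101 * YG) >> 16;
  *b = Clamp6_Sketch(y1 + BB - UB * u);
  *g = Clamp6_Sketch(y1 + BG - (UG * u + VG * v));
  *r = Clamp6_Sketch(y1 + BR - VR * v);
}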

// BT601 constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};

// BT601 constants for NV21 where chroma plane is VU instead of UV.
static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};

#undef YG
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR

// JPEG YUV to RGB reference
// * R = Y - V * -1.40200
// * G = Y - U * 0.34414 - V * 0.71414
// * B = Y - U * -1.77200

// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32 /* 64 / 2 */

// U and V contributions to R,G,B.
#define UBJ -113 /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414 * 64) */
#define VRJ -90 /* round(-1.40200 * 64) */

// Bias values to round, and subtract 128 from U and V.
#define BBJ (UBJ * 128 + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ (VRJ * 128 + YGBJ)

// JPEG constants for YUV to RGB.
YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
};

#undef YGJ
#undef YGBJ
#undef UBJ
#undef UGJ
#undef VGJ
#undef VRJ
#undef BBJ
#undef BGJ
#undef BRJ

// Read 8 UV from 444
#define READYUV444 \
    "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"

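// Scalar model of READYUV422's chroma upsample (a sketch; hypothetical
// helper): 4 U and 4 V bytes are interleaved (punpcklbw) and each UV pair
// is then duplicated (punpcklwd) so two adjacent pixels share one chroma
// sample.
static void ReadYuv422_Sketch(const uint8* u4, const uint8* v4,
                              uint8* uv16) {
  for (int i = 0; i < 4; ++i) {
    uv16[i * 4 + 0] = u4[i];
    uv16[i * 4 + 1] = v4[i];
    uv16[i * 4 + 2] = u4[i];  // second pixel of the pair reuses the
    uv16[i * 4 + 3] = v4[i];  // same chroma sample
  }
}
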
1551 // Read 2 UV from 411, upsample to 8 UV
1552 #define READYUV411 \
1553 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1554 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
1555 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
1556 "punpcklbw %%xmm1,%%xmm0 \n" \
1557 "punpcklwd %%xmm0,%%xmm0 \n" \
1558 "punpckldq %%xmm0,%%xmm0 \n"
1559
1560 // Read 4 UV from NV12, upsample to 8 UV
1561 #define READNV12 \
1562 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
1563 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
1564 "punpcklwd %%xmm0,%%xmm0 \n"
1565
1566 // Convert 8 pixels: 8 UV and 8 Y
1567 #define YUVTORGB(YuvConstants) \
1568 "movdqa %%xmm0,%%xmm1 \n" \
1569 "movdqa %%xmm0,%%xmm2 \n" \
1570 "movdqa %%xmm0,%%xmm3 \n" \
1571 "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \
1572 "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \
1573 "psubw %%xmm1,%%xmm0 \n" \
1574 "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \
1575 "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \
1576 "psubw %%xmm2,%%xmm1 \n" \
1577 "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \
1578 "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \
1579 "psubw %%xmm3,%%xmm2 \n" \
1580 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
1581 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
1582 "punpcklbw %%xmm3,%%xmm3 \n" \
1583 "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \
1584 "paddsw %%xmm3,%%xmm0 \n" \
1585 "paddsw %%xmm3,%%xmm1 \n" \
1586 "paddsw %%xmm3,%%xmm2 \n" \
1587 "psraw $0x6,%%xmm0 \n" \
1588 "psraw $0x6,%%xmm1 \n" \
1589 "psraw $0x6,%%xmm2 \n" \
1590 "packuswb %%xmm0,%%xmm0 \n" \
1591 "packuswb %%xmm1,%%xmm1 \n" \
1592 "packuswb %%xmm2,%%xmm2 \n"
1593
1594 // Store 8 ARGB values. Assumes XMM5 is zero.
1595 #define STOREARGB \
1596 "punpcklbw %%xmm1,%%xmm0 \n" \
1597 "punpcklbw %%xmm5,%%xmm2 \n" \
1598 "movdqa %%xmm0,%%xmm1 \n" \
1599 "punpcklwd %%xmm2,%%xmm0 \n" \
1600 "punpckhwd %%xmm2,%%xmm1 \n" \
1601 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
1602 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
1603 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
1604
1605 // Store 8 BGRA values. Assumes XMM5 is zero.
1606 #define STOREBGRA \
1607 "pcmpeqb %%xmm5,%%xmm5 \n" \
1608 "punpcklbw %%xmm0,%%xmm1 \n" \
1609 "punpcklbw %%xmm2,%%xmm5 \n" \
1610 "movdqa %%xmm5,%%xmm0 \n" \
1611 "punpcklwd %%xmm1,%%xmm5 \n" \
1612 "punpckhwd %%xmm1,%%xmm0 \n" \
1613 "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
1614 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \
1615 "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n"
1616
1617 // Store 8 ABGR values. Assumes XMM5 is zero.
1618 #define STOREABGR \
1619 "punpcklbw %%xmm1,%%xmm2 \n" \
1620 "punpcklbw %%xmm5,%%xmm0 \n" \
1621 "movdqa %%xmm2,%%xmm1 \n" \
1622 "punpcklwd %%xmm0,%%xmm2 \n" \
1623 "punpckhwd %%xmm0,%%xmm1 \n" \
1624 "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
1625 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \
1626 "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n"
1627
1628 // Store 8 RGBA values. Assumes XMM5 is zero.
1629 #define STORERGBA \
1630 "pcmpeqb %%xmm5,%%xmm5 \n" \
1631 "punpcklbw %%xmm2,%%xmm1 \n" \
1632 "punpcklbw %%xmm0,%%xmm5 \n" \
1633 "movdqa %%xmm5,%%xmm0 \n" \
1634 "punpcklwd %%xmm1,%%xmm5 \n" \
1635 "punpckhwd %%xmm1,%%xmm0 \n" \
1636 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
1637 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
1638 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
1639
I444ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_argb,int width)1640 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1641 const uint8* u_buf,
1642 const uint8* v_buf,
1643 uint8* dst_argb,
1644 int width) {
1645 asm volatile (
1646 "sub %[u_buf],%[v_buf] \n"
1647 "pcmpeqb %%xmm5,%%xmm5 \n"
1648 LABELALIGN
1649 "1: \n"
1650 READYUV444
1651 YUVTORGB(kYuvConstants)
1652 STOREARGB
1653 "sub $0x8,%[width] \n"
1654 "jg 1b \n"
1655 : [y_buf]"+r"(y_buf), // %[y_buf]
1656 [u_buf]"+r"(u_buf), // %[u_buf]
1657 [v_buf]"+r"(v_buf), // %[v_buf]
1658 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1659 [width]"+rm"(width) // %[width]
1660 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1661 : "memory", "cc", NACL_R14
1662 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1663 );
1664 }
1665
1666 // TODO(fbarchard): Consider putting masks into constants.
I422ToRGB24Row_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * dst_rgb24,int width)1667 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1668 const uint8* u_buf,
1669 const uint8* v_buf,
1670 uint8* dst_rgb24,
1671 int width) {
1672 asm volatile (
1673 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1674 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1675 "sub %[u_buf],%[v_buf] \n"
1676 LABELALIGN
1677 "1: \n"
1678 READYUV422
1679 YUVTORGB(kYuvConstants)
1680 "punpcklbw %%xmm1,%%xmm0 \n"
1681 "punpcklbw %%xmm2,%%xmm2 \n"
1682 "movdqa %%xmm0,%%xmm1 \n"
1683 "punpcklwd %%xmm2,%%xmm0 \n"
1684 "punpckhwd %%xmm2,%%xmm1 \n"
1685 "pshufb %%xmm5,%%xmm0 \n"
1686 "pshufb %%xmm6,%%xmm1 \n"
1687 "palignr $0xc,%%xmm0,%%xmm1 \n"
1688 "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
1689 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
1690 "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
1691 "subl $0x8,%[width] \n"
1692 "jg 1b \n"
1693 : [y_buf]"+r"(y_buf), // %[y_buf]
1694 [u_buf]"+r"(u_buf), // %[u_buf]
1695 [v_buf]"+r"(v_buf), // %[v_buf]
1696 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
1697 // TODO(fbarchard): Make width a register for 32 bit.
1698 #if defined(__i386__) && defined(__pic__)
1699 [width]"+m"(width) // %[width]
1700 #else
1701 [width]"+rm"(width) // %[width]
1702 #endif
1703 : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
1704 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1705 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
1706 : "memory", "cc", NACL_R14
1707 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
1708 );
1709 }
1710
1711 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
1712 const uint8* u_buf,
1713 const uint8* v_buf,
1714 uint8* dst_raw,
1715 int width) {
1716 asm volatile (
1717 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
1718 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
1719 "sub %[u_buf],%[v_buf] \n"
1720 LABELALIGN
1721 "1: \n"
1722 READYUV422
1723 YUVTORGB(kYuvConstants)
1724 "punpcklbw %%xmm1,%%xmm0 \n"
1725 "punpcklbw %%xmm2,%%xmm2 \n"
1726 "movdqa %%xmm0,%%xmm1 \n"
1727 "punpcklwd %%xmm2,%%xmm0 \n"
1728 "punpckhwd %%xmm2,%%xmm1 \n"
1729 "pshufb %%xmm5,%%xmm0 \n"
1730 "pshufb %%xmm6,%%xmm1 \n"
1731 "palignr $0xc,%%xmm0,%%xmm1 \n"
1732 "movq %%xmm0," MEMACCESS([dst_raw]) " \n"
1733 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
1734 "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
1735 "subl $0x8,%[width] \n"
1736 "jg 1b \n"
1737 : [y_buf]"+r"(y_buf), // %[y_buf]
1738 [u_buf]"+r"(u_buf), // %[u_buf]
1739 [v_buf]"+r"(v_buf), // %[v_buf]
1740 [dst_raw]"+r"(dst_raw), // %[dst_raw]
1741 // TODO(fbarchard): Make width a register for 32 bit.
1742 #if defined(__i386__) && defined(__pic__)
1743 [width]"+m"(width) // %[width]
1744 #else
1745 [width]"+rm"(width) // %[width]
1746 #endif
1747 : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
1748 [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
1749 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
1750 : "memory", "cc", NACL_R14
1751 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
1752 );
1753 }
1754
1755 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
1756 const uint8* u_buf,
1757 const uint8* v_buf,
1758 uint8* dst_argb,
1759 int width) {
1760 asm volatile (
1761 "sub %[u_buf],%[v_buf] \n"
1762 "pcmpeqb %%xmm5,%%xmm5 \n"
1763 LABELALIGN
1764 "1: \n"
1765 READYUV422
1766 YUVTORGB(kYuvConstants)
1767 STOREARGB
1768 "sub $0x8,%[width] \n"
1769 "jg 1b \n"
1770 : [y_buf]"+r"(y_buf), // %[y_buf]
1771 [u_buf]"+r"(u_buf), // %[u_buf]
1772 [v_buf]"+r"(v_buf), // %[v_buf]
1773 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1774 [width]"+rm"(width) // %[width]
1775 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1776 : "memory", "cc", NACL_R14
1777 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1778 );
1779 }
1780
1781 void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
1782 const uint8* u_buf,
1783 const uint8* v_buf,
1784 uint8* dst_argb,
1785 int width) {
1786 asm volatile (
1787 "sub %[u_buf],%[v_buf] \n"
1788 "pcmpeqb %%xmm5,%%xmm5 \n"
1789 LABELALIGN
1790 "1: \n"
1791 READYUV422
1792 YUVTORGB(kYuvConstants)
1793 STOREARGB
1794 "sub $0x8,%[width] \n"
1795 "jg 1b \n"
1796 : [y_buf]"+r"(y_buf), // %[y_buf]
1797 [u_buf]"+r"(u_buf), // %[u_buf]
1798 [v_buf]"+r"(v_buf), // %[v_buf]
1799 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1800 [width]"+rm"(width) // %[width]
1801 : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
1802 : "memory", "cc", NACL_R14
1803 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1804 );
1805 }
1806
1807 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1808 const uint8* u_buf,
1809 const uint8* v_buf,
1810 uint8* dst_argb,
1811 int width) {
1812 asm volatile (
1813 "sub %[u_buf],%[v_buf] \n"
1814 "pcmpeqb %%xmm5,%%xmm5 \n"
1815 LABELALIGN
1816 "1: \n"
1817 READYUV411
1818 YUVTORGB(kYuvConstants)
1819 STOREARGB
1820 "sub $0x8,%[width] \n"
1821 "jg 1b \n"
1822 : [y_buf]"+r"(y_buf), // %[y_buf]
1823 [u_buf]"+r"(u_buf), // %[u_buf]
1824 [v_buf]"+r"(v_buf), // %[v_buf]
1825 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1826 [width]"+rm"(width) // %[width]
1827 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1828 : "memory", "cc", NACL_R14
1829 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1830 );
1831 }
1832
1833 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1834 const uint8* uv_buf,
1835 uint8* dst_argb,
1836 int width) {
1837 asm volatile (
1838 "pcmpeqb %%xmm5,%%xmm5 \n"
1839 LABELALIGN
1840 "1: \n"
1841 READNV12
1842 YUVTORGB(kYuvConstants)
1843 STOREARGB
1844 "sub $0x8,%[width] \n"
1845 "jg 1b \n"
1846 : [y_buf]"+r"(y_buf), // %[y_buf]
1847 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1848 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1849 [width]"+rm"(width) // %[width]
1850 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1851 // Does not use r14.
1852 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1853 );
1854 }
1855
1856 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1857 const uint8* uv_buf,
1858 uint8* dst_argb,
1859 int width) {
1860 asm volatile (
1861 "pcmpeqb %%xmm5,%%xmm5 \n"
1862 LABELALIGN
1863 "1: \n"
1864 READNV12
1865 YUVTORGB(kYuvConstants)
1866 STOREARGB
1867 "sub $0x8,%[width] \n"
1868 "jg 1b \n"
1869 : [y_buf]"+r"(y_buf), // %[y_buf]
1870 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1871 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1872 [width]"+rm"(width) // %[width]
1873 : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]
1874 // Does not use r14.
1875 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1876 );
1877 }
1878
1879 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1880 const uint8* u_buf,
1881 const uint8* v_buf,
1882 uint8* dst_bgra,
1883 int width) {
1884 asm volatile (
1885 "sub %[u_buf],%[v_buf] \n"
1886 "pcmpeqb %%xmm5,%%xmm5 \n"
1887 LABELALIGN
1888 "1: \n"
1889 READYUV422
1890 YUVTORGB(kYuvConstants)
1891 STOREBGRA
1892 "sub $0x8,%[width] \n"
1893 "jg 1b \n"
1894 : [y_buf]"+r"(y_buf), // %[y_buf]
1895 [u_buf]"+r"(u_buf), // %[u_buf]
1896 [v_buf]"+r"(v_buf), // %[v_buf]
1897 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
1898 [width]"+rm"(width) // %[width]
1899 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1900 : "memory", "cc", NACL_R14
1901 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1902 );
1903 }
1904
1905 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
1906 const uint8* u_buf,
1907 const uint8* v_buf,
1908 uint8* dst_abgr,
1909 int width) {
1910 asm volatile (
1911 "sub %[u_buf],%[v_buf] \n"
1912 "pcmpeqb %%xmm5,%%xmm5 \n"
1913 LABELALIGN
1914 "1: \n"
1915 READYUV422
1916 YUVTORGB(kYuvConstants)
1917 STOREABGR
1918 "sub $0x8,%[width] \n"
1919 "jg 1b \n"
1920 : [y_buf]"+r"(y_buf), // %[y_buf]
1921 [u_buf]"+r"(u_buf), // %[u_buf]
1922 [v_buf]"+r"(v_buf), // %[v_buf]
1923 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
1924 [width]"+rm"(width) // %[width]
1925 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1926 : "memory", "cc", NACL_R14
1927 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1928 );
1929 }
1930
1931 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
1932 const uint8* u_buf,
1933 const uint8* v_buf,
1934 uint8* dst_rgba,
1935 int width) {
1936 asm volatile (
1937 "sub %[u_buf],%[v_buf] \n"
1938 "pcmpeqb %%xmm5,%%xmm5 \n"
1939 LABELALIGN
1940 "1: \n"
1941 READYUV422
1942 YUVTORGB(kYuvConstants)
1943 STORERGBA
1944 "sub $0x8,%[width] \n"
1945 "jg 1b \n"
1946 : [y_buf]"+r"(y_buf), // %[y_buf]
1947 [u_buf]"+r"(u_buf), // %[u_buf]
1948 [v_buf]"+r"(v_buf), // %[v_buf]
1949 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
1950 [width]"+rm"(width) // %[width]
1951 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1952 : "memory", "cc", NACL_R14
1953 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1954 );
1955 }
1956
1957 #endif // HAS_I422TOARGBROW_SSSE3
1958
1959 // Read 8 UV from 422, upsample to 16 UV.
1960 #define READYUV422_AVX2 \
1961 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1962 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1963 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1964 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1965 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1966 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
1967
1968 // Convert 16 pixels: 16 UV and 16 Y.
1969 #define YUVTORGB_AVX2(YuvConstants) \
1970 "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \
1971 "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \
1972 "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
1973 "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
1974 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
1975 "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \
1976 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
1977 "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \
1978 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
1979 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
1980 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
1981 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
1982 "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \
1983 "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \
1984 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \
1985 "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \
1986 "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \
1987 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
1988 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
1989 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
1990 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
1991 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
1992 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
1993
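 // For reference, YUVTORGB_AVX2 evaluates the same fixed-point math as the
 // scalar conversion, 16 pixels at a time. A per-pixel sketch, assuming
 // UB/UG/VG/VR are the vpmaddubsw coefficients, BB/BG/BR the bias words,
 // and YG the Y gain, all laid out in the YuvConstants block at the
 // offsets (0/32/64, 96/128/160, 192) used above:
 //
 //   int y1 = ((y * 0x0101) * YG) >> 16;        // vpunpcklbw + vpmulhuw
 //   int b  = Clamp((y1 + BB - u * UB) >> 6);   // vpsubw, vpaddsw, vpsraw
 //   int g  = Clamp((y1 + BG - (u * UG + v * VG)) >> 6);
 //   int r  = Clamp((y1 + BR - v * VR) >> 6);
 //
 // vpackuswb then saturates the words back to unsigned bytes.
 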
1994 #if defined(HAS_I422TOBGRAROW_AVX2)
1995 // 16 pixels
1996 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
1997 void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
1998 const uint8* u_buf,
1999 const uint8* v_buf,
2000 uint8* dst_bgra,
2001 int width) {
2002 asm volatile (
2003 "sub %[u_buf],%[v_buf] \n"
2004 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2005 LABELALIGN
2006 "1: \n"
2007 READYUV422_AVX2
2008 YUVTORGB_AVX2(kYuvConstants)
2009
2010 // Step 3: Weave into BGRA
2011 "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB
2012 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2013 "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR
2014 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2015 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels
2016 "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels
2017
2018 "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n"
2019 "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
2020 "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
2021 "sub $0x10,%[width] \n"
2022 "jg 1b \n"
2023 "vzeroupper \n"
2024 : [y_buf]"+r"(y_buf), // %[y_buf]
2025 [u_buf]"+r"(u_buf), // %[u_buf]
2026 [v_buf]"+r"(v_buf), // %[v_buf]
2027 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
2028 [width]"+rm"(width) // %[width]
2029 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2030 : "memory", "cc", NACL_R14
2031 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2032 );
2033 }
2034 #endif // HAS_I422TOBGRAROW_AVX2
2035
2036 #if defined(HAS_I422TOARGBROW_AVX2)
2037 // 16 pixels
2038 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2039 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
2040 const uint8* u_buf,
2041 const uint8* v_buf,
2042 uint8* dst_argb,
2043 int width) {
2044 asm volatile (
2045 "sub %[u_buf],%[v_buf] \n"
2046 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2047 LABELALIGN
2048 "1: \n"
2049 READYUV422_AVX2
2050 YUVTORGB_AVX2(kYuvConstants)
2051
2052 // Step 3: Weave into ARGB
2053 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
2054 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2055 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
2056 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2057 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
2058 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
2059
2060 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
2061 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
2062 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2063 "sub $0x10,%[width] \n"
2064 "jg 1b \n"
2065 "vzeroupper \n"
2066 : [y_buf]"+r"(y_buf), // %[y_buf]
2067 [u_buf]"+r"(u_buf), // %[u_buf]
2068 [v_buf]"+r"(v_buf), // %[v_buf]
2069 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2070 [width]"+rm"(width) // %[width]
2071 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2072 : "memory", "cc", NACL_R14
2073 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2074 );
2075 }
2076 #endif // HAS_I422TOARGBROW_AVX2
2077
2078 #if defined(HAS_J422TOARGBROW_AVX2)
2079 // 16 pixels
2080 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2081 void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
2082 const uint8* u_buf,
2083 const uint8* v_buf,
2084 uint8* dst_argb,
2085 int width) {
2086 asm volatile (
2087 "sub %[u_buf],%[v_buf] \n"
2088 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2089 LABELALIGN
2090 "1: \n"
2091 READYUV422_AVX2
2092 YUVTORGB_AVX2(kYuvConstants)
2093
2094 // Step 3: Weave into ARGB
2095 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
2096 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2097 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
2098 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2099 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
2100 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
2101
2102 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
2103 "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
2104 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2105 "sub $0x10,%[width] \n"
2106 "jg 1b \n"
2107 "vzeroupper \n"
2108 : [y_buf]"+r"(y_buf), // %[y_buf]
2109 [u_buf]"+r"(u_buf), // %[u_buf]
2110 [v_buf]"+r"(v_buf), // %[v_buf]
2111 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2112 [width]"+rm"(width) // %[width]
2113 : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
2114 : "memory", "cc", NACL_R14
2115 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2116 );
2117 }
2118 #endif // HAS_J422TOARGBROW_AVX2
2119
2120 #if defined(HAS_I422TOABGRROW_AVX2)
2121 // 16 pixels
2122 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
2123 void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
2124 const uint8* u_buf,
2125 const uint8* v_buf,
2126 uint8* dst_argb,
2127 int width) {
2128 asm volatile (
2129 "sub %[u_buf],%[v_buf] \n"
2130 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2131 LABELALIGN
2132 "1: \n"
2133 READYUV422_AVX2
2134 YUVTORGB_AVX2(kYuvConstants)
2135
2136 // Step 3: Weave into ABGR
2137 "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG
2138 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2139 "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA
2140 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2141 "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels
2142 "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels
2143 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
2144 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2145 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2146 "sub $0x10,%[width] \n"
2147 "jg 1b \n"
2148 "vzeroupper \n"
2149 : [y_buf]"+r"(y_buf), // %[y_buf]
2150 [u_buf]"+r"(u_buf), // %[u_buf]
2151 [v_buf]"+r"(v_buf), // %[v_buf]
2152 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2153 [width]"+rm"(width) // %[width]
2154 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2155 : "memory", "cc", NACL_R14
2156 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2157 );
2158 }
2159 #endif // HAS_I422TOABGRROW_AVX2
2160
2161 #if defined(HAS_I422TORGBAROW_AVX2)
2162 // 16 pixels
2163 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2164 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
2165 const uint8* u_buf,
2166 const uint8* v_buf,
2167 uint8* dst_argb,
2168 int width) {
2169 asm volatile (
2170 "sub %[u_buf],%[v_buf] \n"
2171 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2172 LABELALIGN
2173 "1: \n"
2174 READYUV422_AVX2
2175 YUVTORGB_AVX2(kYuvConstants)
2176
2177 // Step 3: Weave into RGBA
2178 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
2179 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2180 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
2181 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2182 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
2183 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
2184 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
2185 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2186 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2187 "sub $0x10,%[width] \n"
2188 "jg 1b \n"
2189 "vzeroupper \n"
2190 : [y_buf]"+r"(y_buf), // %[y_buf]
2191 [u_buf]"+r"(u_buf), // %[u_buf]
2192 [v_buf]"+r"(v_buf), // %[v_buf]
2193 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2194 [width]"+rm"(width) // %[width]
2195 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
2196 : "memory", "cc", NACL_R14
2197 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2198 );
2199 }
2200 #endif // HAS_I422TORGBAROW_AVX2
2201
2202 #ifdef HAS_I400TOARGBROW_SSE2
2203 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
2204 asm volatile (
2205 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
2206 "movd %%eax,%%xmm2 \n"
2207 "pshufd $0x0,%%xmm2,%%xmm2 \n"
2208 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
2209 "movd %%eax,%%xmm3 \n"
2210 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2211 "pcmpeqb %%xmm4,%%xmm4 \n"
2212 "pslld $0x18,%%xmm4 \n"
2213 LABELALIGN
2214 "1: \n"
2215 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2216 "movq " MEMACCESS(0) ",%%xmm0 \n"
2217 "lea " MEMLEA(0x8,0) ",%0 \n"
2218 "punpcklbw %%xmm0,%%xmm0 \n"
2219 "pmulhuw %%xmm2,%%xmm0 \n"
2220 "psubusw %%xmm3,%%xmm0 \n"
2221 "psrlw $6, %%xmm0 \n"
2222 "packuswb %%xmm0,%%xmm0 \n"
2223
2224 // Step 2: Weave into ARGB
2225 "punpcklbw %%xmm0,%%xmm0 \n"
2226 "movdqa %%xmm0,%%xmm1 \n"
2227 "punpcklwd %%xmm0,%%xmm0 \n"
2228 "punpckhwd %%xmm1,%%xmm1 \n"
2229 "por %%xmm4,%%xmm0 \n"
2230 "por %%xmm4,%%xmm1 \n"
2231 "movdqu %%xmm0," MEMACCESS(1) " \n"
2232 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
2233 "lea " MEMLEA(0x20,1) ",%1 \n"
2234
2235 "sub $0x8,%2 \n"
2236 "jg 1b \n"
2237 : "+r"(y_buf), // %0
2238 "+r"(dst_argb), // %1
2239 "+rm"(width) // %2
2240 :
2241 : "memory", "cc", "eax"
2242 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2243 );
2244 }
2245 #endif // HAS_I400TOARGBROW_SSE2
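 
 // For reference, the SSE2 kernel above computes (y - 16) * 1.164 in fixed
 // point: punpcklbw y,y widens each byte to y * 0x0101, pmulhuw by 0x4a35
 // gives roughly y * 1.164 scaled by 64, psubusw 0x0488 subtracts the
 // 16 * 1.164 offset with unsigned saturation, and psrlw $6 drops the
 // scale. A scalar sketch (approximate):
 //
 //   int g = ((y * 0x0101 * 0x4a35) >> 16) - 0x0488;  // clamps at 0
 //   if (g < 0) g = 0;
 //   g >>= 6;
 //   // dst pixel = 0xff000000 | (g << 16) | (g << 8) | g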
2246
2247 #ifdef HAS_I400TOARGBROW_AVX2
2248 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2249 // Note: vpunpcklbw mutates the 128-bit lane order and vpackuswb unmutates it.
2250 void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
2251 asm volatile (
2252 "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
2253 "vmovd %%eax,%%xmm2 \n"
2254 "vbroadcastss %%xmm2,%%ymm2 \n"
2255 "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
2256 "vmovd %%eax,%%xmm3 \n"
2257 "vbroadcastss %%xmm3,%%ymm3 \n"
2258 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
2259 "vpslld $0x18,%%ymm4,%%ymm4 \n"
2260
2261 LABELALIGN
2262 "1: \n"
2263 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
2264 "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
2265 "lea " MEMLEA(0x10,0) ",%0 \n"
2266 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2267 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
2268 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
2269 "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
2270 "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
2271 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
2272 "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
2273 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2274 "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
2275 "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
2276 "vpor %%ymm4,%%ymm0,%%ymm0 \n"
2277 "vpor %%ymm4,%%ymm1,%%ymm1 \n"
2278 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2279 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
2280 "lea " MEMLEA(0x40,1) ",%1 \n"
2281 "sub $0x10,%2 \n"
2282 "jg 1b \n"
2283 "vzeroupper \n"
2284 : "+r"(y_buf), // %0
2285 "+r"(dst_argb), // %1
2286 "+rm"(width) // %2
2287 :
2288 : "memory", "cc", "eax"
2289 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2290 );
2291 }
2292 #endif // HAS_I400TOARGBROW_AVX2
2293
2294 #ifdef HAS_MIRRORROW_SSSE3
2295 // Shuffle table for reversing the bytes.
2296 static uvec8 kShuffleMirror = {
2297 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2298 };
2299
2300 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2301 intptr_t temp_width = (intptr_t)(width);
2302 asm volatile (
2303 "movdqa %3,%%xmm5 \n"
2304 LABELALIGN
2305 "1: \n"
2306 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
2307 "pshufb %%xmm5,%%xmm0 \n"
2308 "movdqu %%xmm0," MEMACCESS(1) " \n"
2309 "lea " MEMLEA(0x10,1) ",%1 \n"
2310 "sub $0x10,%2 \n"
2311 "jg 1b \n"
2312 : "+r"(src), // %0
2313 "+r"(dst), // %1
2314 "+r"(temp_width) // %2
2315 : "m"(kShuffleMirror) // %3
2316 : "memory", "cc", NACL_R14
2317 "xmm0", "xmm5"
2318 );
2319 }
2320 #endif // HAS_MIRRORROW_SSSE3
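 
 // For reference, the pshufb with kShuffleMirror reverses each 16-byte
 // load; combined with reading backward from src + width, the net effect
 // is the scalar loop:
 //
 //   for (int i = 0; i < width; ++i)
 //     dst[i] = src[width - 1 - i];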
2321
2322 #ifdef HAS_MIRRORROW_AVX2
2323 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2324 intptr_t temp_width = (intptr_t)(width);
2325 asm volatile (
2326 "vbroadcastf128 %3,%%ymm5 \n"
2327 LABELALIGN
2328 "1: \n"
2329 MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
2330 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
2331 "vpermq $0x4e,%%ymm0,%%ymm0 \n"
2332 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2333 "lea " MEMLEA(0x20,1) ",%1 \n"
2334 "sub $0x20,%2 \n"
2335 "jg 1b \n"
2336 "vzeroupper \n"
2337 : "+r"(src), // %0
2338 "+r"(dst), // %1
2339 "+r"(temp_width) // %2
2340 : "m"(kShuffleMirror) // %3
2341 : "memory", "cc", NACL_R14
2342 "xmm0", "xmm5"
2343 );
2344 }
2345 #endif // HAS_MIRRORROW_AVX2
2346
2347 #ifdef HAS_MIRRORROW_SSE2
2348 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2349 intptr_t temp_width = (intptr_t)(width);
2350 asm volatile (
2351 LABELALIGN
2352 "1: \n"
2353 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
2354 "movdqa %%xmm0,%%xmm1 \n"
2355 "psllw $0x8,%%xmm0 \n"
2356 "psrlw $0x8,%%xmm1 \n"
2357 "por %%xmm1,%%xmm0 \n"
2358 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
2359 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
2360 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
2361 "movdqu %%xmm0," MEMACCESS(1) " \n"
2362 "lea " MEMLEA(0x10,1)",%1 \n"
2363 "sub $0x10,%2 \n"
2364 "jg 1b \n"
2365 : "+r"(src), // %0
2366 "+r"(dst), // %1
2367 "+r"(temp_width) // %2
2368 :
2369 : "memory", "cc", NACL_R14
2370 "xmm0", "xmm1"
2371 );
2372 }
2373 #endif // HAS_MIRRORROW_SSE2
2374
2375 #ifdef HAS_MIRRORROW_UV_SSSE3
2376 // Shuffle table for reversing the bytes of UV channels.
2377 static uvec8 kShuffleMirrorUV = {
2378 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2379 };
2380 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2381 int width) {
2382 intptr_t temp_width = (intptr_t)(width);
2383 asm volatile (
2384 "movdqa %4,%%xmm1 \n"
2385 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
2386 "sub %1,%2 \n"
2387 LABELALIGN
2388 "1: \n"
2389 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2390 "lea " MEMLEA(-0x10,0) ",%0 \n"
2391 "pshufb %%xmm1,%%xmm0 \n"
2392 "movlpd %%xmm0," MEMACCESS(1) " \n"
2393 MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
2394 "lea " MEMLEA(0x8,1) ",%1 \n"
2395 "sub $8,%3 \n"
2396 "jg 1b \n"
2397 : "+r"(src), // %0
2398 "+r"(dst_u), // %1
2399 "+r"(dst_v), // %2
2400 "+r"(temp_width) // %3
2401 : "m"(kShuffleMirrorUV) // %4
2402 : "memory", "cc", NACL_R14
2403 "xmm0", "xmm1"
2404 );
2405 }
2406 #endif // HAS_MIRRORROW_UV_SSSE3
2407
2408 #ifdef HAS_ARGBMIRRORROW_SSE2
2409
2410 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2411 intptr_t temp_width = (intptr_t)(width);
2412 asm volatile (
2413 "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
2414 LABELALIGN
2415 "1: \n"
2416 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2417 "pshufd $0x1b,%%xmm0,%%xmm0 \n"
2418 "lea " MEMLEA(-0x10,0) ",%0 \n"
2419 "movdqu %%xmm0," MEMACCESS(1) " \n"
2420 "lea " MEMLEA(0x10,1) ",%1 \n"
2421 "sub $0x4,%2 \n"
2422 "jg 1b \n"
2423 : "+r"(src), // %0
2424 "+r"(dst), // %1
2425 "+r"(temp_width) // %2
2426 :
2427 : "memory", "cc"
2428 , "xmm0"
2429 );
2430 }
2431 #endif // HAS_ARGBMIRRORROW_SSE2
2432
2433 #ifdef HAS_ARGBMIRRORROW_AVX2
2434 // Permute table for reversing the dwords (whole ARGB pixels).
2435 static const ulvec32 kARGBShuffleMirror_AVX2 = {
2436 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2437 };
2438 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2439 intptr_t temp_width = (intptr_t)(width);
2440 asm volatile (
2441 "vmovdqu %3,%%ymm5 \n"
2442 LABELALIGN
2443 "1: \n"
2444 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
2445 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2446 "lea " MEMLEA(0x20,1) ",%1 \n"
2447 "sub $0x8,%2 \n"
2448 "jg 1b \n"
2449 "vzeroupper \n"
2450 : "+r"(src), // %0
2451 "+r"(dst), // %1
2452 "+r"(temp_width) // %2
2453 : "m"(kARGBShuffleMirror_AVX2) // %3
2454 : "memory", "cc", NACL_R14
2455 "xmm0", "xmm5"
2456 );
2457 }
2458 #endif // HAS_ARGBMIRRORROW_AVX2
2459
2460 #ifdef HAS_SPLITUVROW_AVX2
2461 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2462 asm volatile (
2463 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2464 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
2465 "sub %1,%2 \n"
2466 LABELALIGN
2467 "1: \n"
2468 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2469 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2470 "lea " MEMLEA(0x40,0) ",%0 \n"
2471 "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
2472 "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
2473 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
2474 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
2475 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2476 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
2477 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2478 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2479 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2480 MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
2481 "lea " MEMLEA(0x20,1) ",%1 \n"
2482 "sub $0x20,%3 \n"
2483 "jg 1b \n"
2484 "vzeroupper \n"
2485 : "+r"(src_uv), // %0
2486 "+r"(dst_u), // %1
2487 "+r"(dst_v), // %2
2488 "+r"(pix) // %3
2489 :
2490 : "memory", "cc", NACL_R14
2491 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2492 );
2493 }
2494 #endif // HAS_SPLITUVROW_AVX2
2495
2496 #ifdef HAS_SPLITUVROW_SSE2
2497 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2498 asm volatile (
2499 "pcmpeqb %%xmm5,%%xmm5 \n"
2500 "psrlw $0x8,%%xmm5 \n"
2501 "sub %1,%2 \n"
2502 LABELALIGN
2503 "1: \n"
2504 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2505 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2506 "lea " MEMLEA(0x20,0) ",%0 \n"
2507 "movdqa %%xmm0,%%xmm2 \n"
2508 "movdqa %%xmm1,%%xmm3 \n"
2509 "pand %%xmm5,%%xmm0 \n"
2510 "pand %%xmm5,%%xmm1 \n"
2511 "packuswb %%xmm1,%%xmm0 \n"
2512 "psrlw $0x8,%%xmm2 \n"
2513 "psrlw $0x8,%%xmm3 \n"
2514 "packuswb %%xmm3,%%xmm2 \n"
2515 "movdqu %%xmm0," MEMACCESS(1) " \n"
2516 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
2517 "lea " MEMLEA(0x10,1) ",%1 \n"
2518 "sub $0x10,%3 \n"
2519 "jg 1b \n"
2520 : "+r"(src_uv), // %0
2521 "+r"(dst_u), // %1
2522 "+r"(dst_v), // %2
2523 "+r"(pix) // %3
2524 :
2525 : "memory", "cc", NACL_R14
2526 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2527 );
2528 }
2529 #endif // HAS_SPLITUVROW_SSE2
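 
 // For reference, both SplitUVRow kernels reduce to this scalar loop; pand
 // with the 0x00ff mask keeps the U bytes and psrlw $8 extracts the V
 // bytes before packuswb repacks them:
 //
 //   for (int i = 0; i < pix; ++i) {
 //     dst_u[i] = src_uv[2 * i + 0];
 //     dst_v[i] = src_uv[2 * i + 1];
 //   }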
2530
2531 #ifdef HAS_MERGEUVROW_AVX2
2532 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2533 int width) {
2534 asm volatile (
2535 "sub %0,%1 \n"
2536 LABELALIGN
2537 "1: \n"
2538 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2539 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
2540 "lea " MEMLEA(0x20,0) ",%0 \n"
2541 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
2542 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
2543 "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
2544 "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
2545 "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
2546 "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
2547 "lea " MEMLEA(0x40,2) ",%2 \n"
2548 "sub $0x20,%3 \n"
2549 "jg 1b \n"
2550 "vzeroupper \n"
2551 : "+r"(src_u), // %0
2552 "+r"(src_v), // %1
2553 "+r"(dst_uv), // %2
2554 "+r"(width) // %3
2555 :
2556 : "memory", "cc", NACL_R14
2557 "xmm0", "xmm1", "xmm2"
2558 );
2559 }
2560 #endif // HAS_MERGEUVROW_AVX2
2561
2562 #ifdef HAS_MERGEUVROW_SSE2
2563 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2564 int width) {
2565 asm volatile (
2566 "sub %0,%1 \n"
2567 LABELALIGN
2568 "1: \n"
2569 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2570 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
2571 "lea " MEMLEA(0x10,0) ",%0 \n"
2572 "movdqa %%xmm0,%%xmm2 \n"
2573 "punpcklbw %%xmm1,%%xmm0 \n"
2574 "punpckhbw %%xmm1,%%xmm2 \n"
2575 "movdqu %%xmm0," MEMACCESS(2) " \n"
2576 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
2577 "lea " MEMLEA(0x20,2) ",%2 \n"
2578 "sub $0x10,%3 \n"
2579 "jg 1b \n"
2580 : "+r"(src_u), // %0
2581 "+r"(src_v), // %1
2582 "+r"(dst_uv), // %2
2583 "+r"(width) // %3
2584 :
2585 : "memory", "cc", NACL_R14
2586 "xmm0", "xmm1", "xmm2"
2587 );
2588 }
2589 #endif // HAS_MERGEUVROW_SSE2
2590
2591 #ifdef HAS_COPYROW_SSE2
2592 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2593 asm volatile (
2594 LABELALIGN
2595 "1: \n"
2596 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2597 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2598 "lea " MEMLEA(0x20,0) ",%0 \n"
2599 "movdqu %%xmm0," MEMACCESS(1) " \n"
2600 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
2601 "lea " MEMLEA(0x20,1) ",%1 \n"
2602 "sub $0x20,%2 \n"
2603 "jg 1b \n"
2604 : "+r"(src), // %0
2605 "+r"(dst), // %1
2606 "+r"(count) // %2
2607 :
2608 : "memory", "cc"
2609 , "xmm0", "xmm1"
2610 );
2611 }
2612 #endif // HAS_COPYROW_SSE2
2613
2614 #ifdef HAS_COPYROW_AVX
2615 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
2616 asm volatile (
2617 LABELALIGN
2618 "1: \n"
2619 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2620 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2621 "lea " MEMLEA(0x40,0) ",%0 \n"
2622 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2623 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
2624 "lea " MEMLEA(0x40,1) ",%1 \n"
2625 "sub $0x40,%2 \n"
2626 "jg 1b \n"
2627 : "+r"(src), // %0
2628 "+r"(dst), // %1
2629 "+r"(count) // %2
2630 :
2631 : "memory", "cc"
2632 , "xmm0", "xmm1"
2633 );
2634 }
2635 #endif // HAS_COPYROW_AVX
2636
2637 #ifdef HAS_COPYROW_ERMS
2638 // Handles any width (multiple of 1 byte) via enhanced rep movsb (ERMS).
2639 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
2640 size_t width_tmp = (size_t)(width);
2641 asm volatile (
2642 "rep movsb " MEMMOVESTRING(0,1) " \n"
2643 : "+S"(src), // %0
2644 "+D"(dst), // %1
2645 "+c"(width_tmp) // %2
2646 :
2647 : "memory", "cc"
2648 );
2649 }
2650 #endif // HAS_COPYROW_ERMS
2651
2652 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
2653 // width in pixels
2654 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2655 asm volatile (
2656 "pcmpeqb %%xmm0,%%xmm0 \n"
2657 "pslld $0x18,%%xmm0 \n"
2658 "pcmpeqb %%xmm1,%%xmm1 \n"
2659 "psrld $0x8,%%xmm1 \n"
2660 LABELALIGN
2661 "1: \n"
2662 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
2663 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
2664 "lea " MEMLEA(0x20,0) ",%0 \n"
2665 "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2666 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
2667 "pand %%xmm0,%%xmm2 \n"
2668 "pand %%xmm0,%%xmm3 \n"
2669 "pand %%xmm1,%%xmm4 \n"
2670 "pand %%xmm1,%%xmm5 \n"
2671 "por %%xmm4,%%xmm2 \n"
2672 "por %%xmm5,%%xmm3 \n"
2673 "movdqu %%xmm2," MEMACCESS(1) " \n"
2674 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2675 "lea " MEMLEA(0x20,1) ",%1 \n"
2676 "sub $0x8,%2 \n"
2677 "jg 1b \n"
2678 : "+r"(src), // %0
2679 "+r"(dst), // %1
2680 "+r"(width) // %2
2681 :
2682 : "memory", "cc"
2683 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2684 );
2685 }
2686 #endif // HAS_ARGBCOPYALPHAROW_SSE2
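 
 // For reference, the masking above keeps the source alpha byte and the
 // destination color bytes. Scalar sketch per pixel:
 //
 //   uint32 s = ((const uint32*)(src))[i];
 //   uint32 d = ((uint32*)(dst))[i];
 //   ((uint32*)(dst))[i] = (s & 0xff000000u) | (d & 0x00ffffffu);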
2687
2688 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
2689 // width in pixels
2690 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2691 asm volatile (
2692 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
2693 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
2694 LABELALIGN
2695 "1: \n"
2696 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
2697 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
2698 "lea " MEMLEA(0x40,0) ",%0 \n"
2699 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
2700 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
2701 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
2702 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
2703 "lea " MEMLEA(0x40,1) ",%1 \n"
2704 "sub $0x10,%2 \n"
2705 "jg 1b \n"
2706 "vzeroupper \n"
2707 : "+r"(src), // %0
2708 "+r"(dst), // %1
2709 "+r"(width) // %2
2710 :
2711 : "memory", "cc"
2712 , "xmm0", "xmm1", "xmm2"
2713 );
2714 }
2715 #endif // HAS_ARGBCOPYALPHAROW_AVX2
2716
2717 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
2718 // width in pixels
2719 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2720 asm volatile (
2721 "pcmpeqb %%xmm0,%%xmm0 \n"
2722 "pslld $0x18,%%xmm0 \n"
2723 "pcmpeqb %%xmm1,%%xmm1 \n"
2724 "psrld $0x8,%%xmm1 \n"
2725 LABELALIGN
2726 "1: \n"
2727 "movq " MEMACCESS(0) ",%%xmm2 \n"
2728 "lea " MEMLEA(0x8,0) ",%0 \n"
2729 "punpcklbw %%xmm2,%%xmm2 \n"
2730 "punpckhwd %%xmm2,%%xmm3 \n"
2731 "punpcklwd %%xmm2,%%xmm2 \n"
2732 "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2733 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
2734 "pand %%xmm0,%%xmm2 \n"
2735 "pand %%xmm0,%%xmm3 \n"
2736 "pand %%xmm1,%%xmm4 \n"
2737 "pand %%xmm1,%%xmm5 \n"
2738 "por %%xmm4,%%xmm2 \n"
2739 "por %%xmm5,%%xmm3 \n"
2740 "movdqu %%xmm2," MEMACCESS(1) " \n"
2741 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2742 "lea " MEMLEA(0x20,1) ",%1 \n"
2743 "sub $0x8,%2 \n"
2744 "jg 1b \n"
2745 : "+r"(src), // %0
2746 "+r"(dst), // %1
2747 "+r"(width) // %2
2748 :
2749 : "memory", "cc"
2750 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2751 );
2752 }
2753 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
2754
2755 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
2756 // width in pixels
2757 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2758 asm volatile (
2759 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
2760 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
2761 LABELALIGN
2762 "1: \n"
2763 "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
2764 "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
2765 "lea " MEMLEA(0x10,0) ",%0 \n"
2766 "vpslld $0x18,%%ymm1,%%ymm1 \n"
2767 "vpslld $0x18,%%ymm2,%%ymm2 \n"
2768 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
2769 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
2770 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
2771 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
2772 "lea " MEMLEA(0x40,1) ",%1 \n"
2773 "sub $0x10,%2 \n"
2774 "jg 1b \n"
2775 "vzeroupper \n"
2776 : "+r"(src), // %0
2777 "+r"(dst), // %1
2778 "+r"(width) // %2
2779 :
2780 : "memory", "cc"
2781 , "xmm0", "xmm1", "xmm2"
2782 );
2783 }
2784 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
2785
2786 #ifdef HAS_SETROW_X86
2787 void SetRow_X86(uint8* dst, uint8 v8, int width) {
2788 size_t width_tmp = (size_t)(width >> 2);
2789 const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes.
2790 asm volatile (
2791 "rep stosl " MEMSTORESTRING(eax,0) " \n"
2792 : "+D"(dst), // %0
2793 "+c"(width_tmp) // %1
2794 : "a"(v32) // %2
2795 : "memory", "cc");
2796 }
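 
 // For reference, multiplying by 0x01010101 splats the byte to all four
 // lanes of a 32-bit word, e.g. v8 = 0x5a gives v32 = 0x5a5a5a5a, so rep
 // stosl stores 4 bytes per iteration. width >> 2 converts the byte count
 // to a dword count; any 1-3 remaining bytes are presumably handled by
 // the caller.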
2797
2798 void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
2799 size_t width_tmp = (size_t)(width);
2800 asm volatile (
2801 "rep stosb " MEMSTORESTRING(al,0) " \n"
2802 : "+D"(dst), // %0
2803 "+c"(width_tmp) // %1
2804 : "a"(v8) // %2
2805 : "memory", "cc");
2806 }
2807
2808 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
2809 size_t width_tmp = (size_t)(width);
2810 asm volatile (
2811 "rep stosl " MEMSTORESTRING(eax,0) " \n"
2812 : "+D"(dst_argb), // %0
2813 "+c"(width_tmp) // %1
2814 : "a"(v32) // %2
2815 : "memory", "cc");
2816 }
2817 #endif // HAS_SETROW_X86
2818
2819 #ifdef HAS_YUY2TOYROW_SSE2
2820 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
2821 asm volatile (
2822 "pcmpeqb %%xmm5,%%xmm5 \n"
2823 "psrlw $0x8,%%xmm5 \n"
2824 LABELALIGN
2825 "1: \n"
2826 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2827 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2828 "lea " MEMLEA(0x20,0) ",%0 \n"
2829 "pand %%xmm5,%%xmm0 \n"
2830 "pand %%xmm5,%%xmm1 \n"
2831 "packuswb %%xmm1,%%xmm0 \n"
2832 "movdqu %%xmm0," MEMACCESS(1) " \n"
2833 "lea " MEMLEA(0x10,1) ",%1 \n"
2834 "sub $0x10,%2 \n"
2835 "jg 1b \n"
2836 : "+r"(src_yuy2), // %0
2837 "+r"(dst_y), // %1
2838 "+r"(pix) // %2
2839 :
2840 : "memory", "cc"
2841 , "xmm0", "xmm1", "xmm5"
2842 );
2843 }
2844
2845 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
2846 uint8* dst_u, uint8* dst_v, int pix) {
2847 asm volatile (
2848 "pcmpeqb %%xmm5,%%xmm5 \n"
2849 "psrlw $0x8,%%xmm5 \n"
2850 "sub %1,%2 \n"
2851 LABELALIGN
2852 "1: \n"
2853 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2854 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2855 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
2856 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
2857 "lea " MEMLEA(0x20,0) ",%0 \n"
2858 "pavgb %%xmm2,%%xmm0 \n"
2859 "pavgb %%xmm3,%%xmm1 \n"
2860 "psrlw $0x8,%%xmm0 \n"
2861 "psrlw $0x8,%%xmm1 \n"
2862 "packuswb %%xmm1,%%xmm0 \n"
2863 "movdqa %%xmm0,%%xmm1 \n"
2864 "pand %%xmm5,%%xmm0 \n"
2865 "packuswb %%xmm0,%%xmm0 \n"
2866 "psrlw $0x8,%%xmm1 \n"
2867 "packuswb %%xmm1,%%xmm1 \n"
2868 "movq %%xmm0," MEMACCESS(1) " \n"
2869 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2870 "lea " MEMLEA(0x8,1) ",%1 \n"
2871 "sub $0x10,%3 \n"
2872 "jg 1b \n"
2873 : "+r"(src_yuy2), // %0
2874 "+r"(dst_u), // %1
2875 "+r"(dst_v), // %2
2876 "+r"(pix) // %3
2877 : "r"((intptr_t)(stride_yuy2)) // %4
2878 : "memory", "cc", NACL_R14
2879 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2880 );
2881 }
2882
2883 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2884 uint8* dst_u, uint8* dst_v, int pix) {
2885 asm volatile (
2886 "pcmpeqb %%xmm5,%%xmm5 \n"
2887 "psrlw $0x8,%%xmm5 \n"
2888 "sub %1,%2 \n"
2889 LABELALIGN
2890 "1: \n"
2891 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2892 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2893 "lea " MEMLEA(0x20,0) ",%0 \n"
2894 "psrlw $0x8,%%xmm0 \n"
2895 "psrlw $0x8,%%xmm1 \n"
2896 "packuswb %%xmm1,%%xmm0 \n"
2897 "movdqa %%xmm0,%%xmm1 \n"
2898 "pand %%xmm5,%%xmm0 \n"
2899 "packuswb %%xmm0,%%xmm0 \n"
2900 "psrlw $0x8,%%xmm1 \n"
2901 "packuswb %%xmm1,%%xmm1 \n"
2902 "movq %%xmm0," MEMACCESS(1) " \n"
2903 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2904 "lea " MEMLEA(0x8,1) ",%1 \n"
2905 "sub $0x10,%3 \n"
2906 "jg 1b \n"
2907 : "+r"(src_yuy2), // %0
2908 "+r"(dst_u), // %1
2909 "+r"(dst_v), // %2
2910 "+r"(pix) // %3
2911 :
2912 : "memory", "cc", NACL_R14
2913 "xmm0", "xmm1", "xmm5"
2914 );
2915 }
2916
2917 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
2918 asm volatile (
2919 LABELALIGN
2920 "1: \n"
2921 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2922 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2923 "lea " MEMLEA(0x20,0) ",%0 \n"
2924 "psrlw $0x8,%%xmm0 \n"
2925 "psrlw $0x8,%%xmm1 \n"
2926 "packuswb %%xmm1,%%xmm0 \n"
2927 "movdqu %%xmm0," MEMACCESS(1) " \n"
2928 "lea " MEMLEA(0x10,1) ",%1 \n"
2929 "sub $0x10,%2 \n"
2930 "jg 1b \n"
2931 : "+r"(src_uyvy), // %0
2932 "+r"(dst_y), // %1
2933 "+r"(pix) // %2
2934 :
2935 : "memory", "cc"
2936 , "xmm0", "xmm1"
2937 );
2938 }
2939
2940 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2941 uint8* dst_u, uint8* dst_v, int pix) {
2942 asm volatile (
2943 "pcmpeqb %%xmm5,%%xmm5 \n"
2944 "psrlw $0x8,%%xmm5 \n"
2945 "sub %1,%2 \n"
2946 LABELALIGN
2947 "1: \n"
2948 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2949 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2950 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
2951 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
2952 "lea " MEMLEA(0x20,0) ",%0 \n"
2953 "pavgb %%xmm2,%%xmm0 \n"
2954 "pavgb %%xmm3,%%xmm1 \n"
2955 "pand %%xmm5,%%xmm0 \n"
2956 "pand %%xmm5,%%xmm1 \n"
2957 "packuswb %%xmm1,%%xmm0 \n"
2958 "movdqa %%xmm0,%%xmm1 \n"
2959 "pand %%xmm5,%%xmm0 \n"
2960 "packuswb %%xmm0,%%xmm0 \n"
2961 "psrlw $0x8,%%xmm1 \n"
2962 "packuswb %%xmm1,%%xmm1 \n"
2963 "movq %%xmm0," MEMACCESS(1) " \n"
2964 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2965 "lea " MEMLEA(0x8,1) ",%1 \n"
2966 "sub $0x10,%3 \n"
2967 "jg 1b \n"
2968 : "+r"(src_uyvy), // %0
2969 "+r"(dst_u), // %1
2970 "+r"(dst_v), // %2
2971 "+r"(pix) // %3
2972 : "r"((intptr_t)(stride_uyvy)) // %4
2973 : "memory", "cc", NACL_R14
2974 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2975 );
2976 }
2977
2978 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2979 uint8* dst_u, uint8* dst_v, int pix) {
2980 asm volatile (
2981 "pcmpeqb %%xmm5,%%xmm5 \n"
2982 "psrlw $0x8,%%xmm5 \n"
2983 "sub %1,%2 \n"
2984 LABELALIGN
2985 "1: \n"
2986 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2987 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2988 "lea " MEMLEA(0x20,0) ",%0 \n"
2989 "pand %%xmm5,%%xmm0 \n"
2990 "pand %%xmm5,%%xmm1 \n"
2991 "packuswb %%xmm1,%%xmm0 \n"
2992 "movdqa %%xmm0,%%xmm1 \n"
2993 "pand %%xmm5,%%xmm0 \n"
2994 "packuswb %%xmm0,%%xmm0 \n"
2995 "psrlw $0x8,%%xmm1 \n"
2996 "packuswb %%xmm1,%%xmm1 \n"
2997 "movq %%xmm0," MEMACCESS(1) " \n"
2998 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
2999 "lea " MEMLEA(0x8,1) ",%1 \n"
3000 "sub $0x10,%3 \n"
3001 "jg 1b \n"
3002 : "+r"(src_uyvy), // %0
3003 "+r"(dst_u), // %1
3004 "+r"(dst_v), // %2
3005 "+r"(pix) // %3
3006 :
3007 : "memory", "cc", NACL_R14
3008 "xmm0", "xmm1", "xmm5"
3009 );
3010 }
3011 #endif // HAS_YUY2TOYROW_SSE2
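 
 // For reference, YUY2 packs pixels as Y0 U0 Y1 V0 and UYVY as U0 Y0 V0 Y1.
 // The kernels above reduce to this scalar sketch (the two-row UV variants
 // average vertically with pavgb first):
 //
 //   dst_y[i] = src_yuy2[2 * i];                        // pand 0x00ff
 //   dst_u[i] = AVG(row0[4 * i + 1], row1[4 * i + 1]);  // pavgb
 //   dst_v[i] = AVG(row0[4 * i + 3], row1[4 * i + 3]);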
3012
3013 #ifdef HAS_YUY2TOYROW_AVX2
3014 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
3015 asm volatile (
3016 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3017 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3018 LABELALIGN
3019 "1: \n"
3020 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3021 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3022 "lea " MEMLEA(0x40,0) ",%0 \n"
3023 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3024 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3025 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3026 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3027 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
3028 "lea " MEMLEA(0x20,1) ",%1 \n"
3029 "sub $0x20,%2 \n"
3030 "jg 1b \n"
3031 "vzeroupper \n"
3032 : "+r"(src_yuy2), // %0
3033 "+r"(dst_y), // %1
3034 "+r"(pix) // %2
3035 :
3036 : "memory", "cc"
3037 , "xmm0", "xmm1", "xmm5"
3038 );
3039 }
3040
3041 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3042 uint8* dst_u, uint8* dst_v, int pix) {
3043 asm volatile (
3044 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3045 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3046 "sub %1,%2 \n"
3047 LABELALIGN
3048 "1: \n"
3049 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3050 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3051 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3052 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3053 "lea " MEMLEA(0x40,0) ",%0 \n"
3054 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3055 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3056 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3057 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3058 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3059 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3060 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3061 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3062 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3063 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3064 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3065 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3066 "lea " MEMLEA(0x10,1) ",%1 \n"
3067 "sub $0x20,%3 \n"
3068 "jg 1b \n"
3069 "vzeroupper \n"
3070 : "+r"(src_yuy2), // %0
3071 "+r"(dst_u), // %1
3072 "+r"(dst_v), // %2
3073 "+r"(pix) // %3
3074 : "r"((intptr_t)(stride_yuy2)) // %4
3075 : "memory", "cc", NACL_R14
3076 "xmm0", "xmm1", "xmm5"
3077 );
3078 }
3079
3080 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3081 uint8* dst_u, uint8* dst_v, int pix) {
3082 asm volatile (
3083 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3084 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3085 "sub %1,%2 \n"
3086 LABELALIGN
3087 "1: \n"
3088 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3089 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3090 "lea " MEMLEA(0x40,0) ",%0 \n"
3091 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3092 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3093 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3094 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3095 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3096 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3097 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3098 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3099 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3100 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3101 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3102 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3103 "lea " MEMLEA(0x10,1) ",%1 \n"
3104 "sub $0x20,%3 \n"
3105 "jg 1b \n"
3106 "vzeroupper \n"
3107 : "+r"(src_yuy2), // %0
3108 "+r"(dst_u), // %1
3109 "+r"(dst_v), // %2
3110 "+r"(pix) // %3
3111 :
3112 : "memory", "cc", NACL_R14
3113 "xmm0", "xmm1", "xmm5"
3114 );
3115 }
3116
3117 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
3118 asm volatile (
3119 LABELALIGN
3120 "1: \n"
3121 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3122 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3123 "lea " MEMLEA(0x40,0) ",%0 \n"
3124 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3125 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3126 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3127 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3128 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
3129 "lea " MEMLEA(0x20,1) ",%1 \n"
3130 "sub $0x20,%2 \n"
3131 "jg 1b \n"
3132 "vzeroupper \n"
3133 : "+r"(src_uyvy), // %0
3134 "+r"(dst_y), // %1
3135 "+r"(pix) // %2
3136 :
3137 : "memory", "cc"
3138 , "xmm0", "xmm1", "xmm5"
3139 );
3140 }
3141 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3142 uint8* dst_u, uint8* dst_v, int pix) {
3143 asm volatile (
3144 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3145 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3146 "sub %1,%2 \n"
3147
3148 LABELALIGN
3149 "1: \n"
3150 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3151 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3152 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3153 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3154 "lea " MEMLEA(0x40,0) ",%0 \n"
3155 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3156 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3157 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3158 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3159 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3160 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3161 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3162 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3163 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3164 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3165 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3166 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3167 "lea " MEMLEA(0x10,1) ",%1 \n"
3168 "sub $0x20,%3 \n"
3169 "jg 1b \n"
3170 "vzeroupper \n"
3171 : "+r"(src_uyvy), // %0
3172 "+r"(dst_u), // %1
3173 "+r"(dst_v), // %2
3174 "+r"(pix) // %3
3175 : "r"((intptr_t)(stride_uyvy)) // %4
3176 : "memory", "cc", NACL_R14
3177 "xmm0", "xmm1", "xmm5"
3178 );
3179 }
3180
3181 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3182 uint8* dst_u, uint8* dst_v, int pix) {
3183 asm volatile (
3184 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3185 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3186 "sub %1,%2 \n"
3187 LABELALIGN
3188 "1: \n"
3189 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3190 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3191 "lea " MEMLEA(0x40,0) ",%0 \n"
3192 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3193 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3194 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3195 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3196 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3197 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3198 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3199 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3200 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3201 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3202 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3203 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3204 "lea " MEMLEA(0x10,1) ",%1 \n"
3205 "sub $0x20,%3 \n"
3206 "jg 1b \n"
3207 "vzeroupper \n"
3208 : "+r"(src_uyvy), // %0
3209 "+r"(dst_u), // %1
3210 "+r"(dst_v), // %2
3211 "+r"(pix) // %3
3212 :
3213 : "memory", "cc", NACL_R14
3214 "xmm0", "xmm1", "xmm5"
3215 );
3216 }
3217 #endif // HAS_YUY2TOYROW_AVX2
3218
3219 #ifdef HAS_ARGBBLENDROW_SSE2
3220 // Blend 4 pixels at a time.
3221 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3222 uint8* dst_argb, int width) {
3223 asm volatile (
3224 "pcmpeqb %%xmm7,%%xmm7 \n"
3225 "psrlw $0xf,%%xmm7 \n"
3226 "pcmpeqb %%xmm6,%%xmm6 \n"
3227 "psrlw $0x8,%%xmm6 \n"
3228 "pcmpeqb %%xmm5,%%xmm5 \n"
3229 "psllw $0x8,%%xmm5 \n"
3230 "pcmpeqb %%xmm4,%%xmm4 \n"
3231 "pslld $0x18,%%xmm4 \n"
3232 "sub $0x4,%3 \n"
3233 "jl 49f \n"
3234
3235 // 4 pixel loop.
3236 LABELALIGN
3237 "41: \n"
3238 "movdqu " MEMACCESS(0) ",%%xmm3 \n"
3239 "lea " MEMLEA(0x10,0) ",%0 \n"
3240 "movdqa %%xmm3,%%xmm0 \n"
3241 "pxor %%xmm4,%%xmm3 \n"
3242 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3243 "psrlw $0x8,%%xmm3 \n"
3244 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3245 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3246 "pand %%xmm6,%%xmm2 \n"
3247 "paddw %%xmm7,%%xmm3 \n"
3248 "pmullw %%xmm3,%%xmm2 \n"
3249 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
3250 "lea " MEMLEA(0x10,1) ",%1 \n"
3251 "psrlw $0x8,%%xmm1 \n"
3252 "por %%xmm4,%%xmm0 \n"
3253 "pmullw %%xmm3,%%xmm1 \n"
3254 "psrlw $0x8,%%xmm2 \n"
3255 "paddusb %%xmm2,%%xmm0 \n"
3256 "pand %%xmm5,%%xmm1 \n"
3257 "paddusb %%xmm1,%%xmm0 \n"
3258 "movdqu %%xmm0," MEMACCESS(2) " \n"
3259 "lea " MEMLEA(0x10,2) ",%2 \n"
3260 "sub $0x4,%3 \n"
3261 "jge 41b \n"
3262
3263 "49: \n"
3264 "add $0x3,%3 \n"
3265 "jl 99f \n"
3266
3267 // 1 pixel loop.
3268 "91: \n"
3269 "movd " MEMACCESS(0) ",%%xmm3 \n"
3270 "lea " MEMLEA(0x4,0) ",%0 \n"
3271 "movdqa %%xmm3,%%xmm0 \n"
3272 "pxor %%xmm4,%%xmm3 \n"
3273 "movd " MEMACCESS(1) ",%%xmm2 \n"
3274 "psrlw $0x8,%%xmm3 \n"
3275 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
3276 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
3277 "pand %%xmm6,%%xmm2 \n"
3278 "paddw %%xmm7,%%xmm3 \n"
3279 "pmullw %%xmm3,%%xmm2 \n"
3280 "movd " MEMACCESS(1) ",%%xmm1 \n"
3281 "lea " MEMLEA(0x4,1) ",%1 \n"
3282 "psrlw $0x8,%%xmm1 \n"
3283 "por %%xmm4,%%xmm0 \n"
3284 "pmullw %%xmm3,%%xmm1 \n"
3285 "psrlw $0x8,%%xmm2 \n"
3286 "paddusb %%xmm2,%%xmm0 \n"
3287 "pand %%xmm5,%%xmm1 \n"
3288 "paddusb %%xmm1,%%xmm0 \n"
3289 "movd %%xmm0," MEMACCESS(2) " \n"
3290 "lea " MEMLEA(0x4,2) ",%2 \n"
3291 "sub $0x1,%3 \n"
3292 "jge 91b \n"
3293 "99: \n"
3294 : "+r"(src_argb0), // %0
3295 "+r"(src_argb1), // %1
3296 "+r"(dst_argb), // %2
3297 "+r"(width) // %3
3298 :
3299 : "memory", "cc"
3300 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3301 );
3302 }
3303 #endif // HAS_ARGBBLENDROW_SSE2
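 
 // For reference, the blend above is an "over"-style alpha blend with the
 // destination forced opaque. Scalar sketch per channel, where a is the
 // source alpha (pxor inverts it and paddw adds 1, giving 256 - a):
 //
 //   f = 256 - a;
 //   dst_b = MIN(255, src0_b + ((src1_b * f) >> 8));  // paddusb saturates
 //   dst_g = MIN(255, src0_g + ((src1_g * f) >> 8));
 //   dst_r = MIN(255, src0_r + ((src1_r * f) >> 8));
 //   dst_a = 255;                                     // por 0xff000000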
3304
3305 #ifdef HAS_ARGBBLENDROW_SSSE3
3306 // Shuffle table for isolating alpha.
3307 static uvec8 kShuffleAlpha = {
3308 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3309 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3310 };
3311
3312 // Blend 4 pixels at a time.
3314
3315 // Same as SSE2, but replaces
3316 // psrlw xmm3, 8 // alpha
3317 // pshufhw xmm3, xmm3,0F5h // 8 alpha words
3318 // pshuflw xmm3, xmm3,0F5h
3319 // with..
3320 // pshufb xmm3, kShuffleAlpha // alpha
3321
3322 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3323 uint8* dst_argb, int width) {
3324 asm volatile (
3325 "pcmpeqb %%xmm7,%%xmm7 \n"
3326 "psrlw $0xf,%%xmm7 \n"
3327 "pcmpeqb %%xmm6,%%xmm6 \n"
3328 "psrlw $0x8,%%xmm6 \n"
3329 "pcmpeqb %%xmm5,%%xmm5 \n"
3330 "psllw $0x8,%%xmm5 \n"
3331 "pcmpeqb %%xmm4,%%xmm4 \n"
3332 "pslld $0x18,%%xmm4 \n"
3333 "sub $0x4,%3 \n"
3334 "jl 49f \n"
3335
3336 // 4 pixel loop.
3337 LABELALIGN
3338 "40: \n"
3339 "movdqu " MEMACCESS(0) ",%%xmm3 \n"
3340 "lea " MEMLEA(0x10,0) ",%0 \n"
3341 "movdqa %%xmm3,%%xmm0 \n"
3342 "pxor %%xmm4,%%xmm3 \n"
3343 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3344 "pshufb %4,%%xmm3 \n"
3345 "pand %%xmm6,%%xmm2 \n"
3346 "paddw %%xmm7,%%xmm3 \n"
3347 "pmullw %%xmm3,%%xmm2 \n"
3348 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
3349 "lea " MEMLEA(0x10,1) ",%1 \n"
3350 "psrlw $0x8,%%xmm1 \n"
3351 "por %%xmm4,%%xmm0 \n"
3352 "pmullw %%xmm3,%%xmm1 \n"
3353 "psrlw $0x8,%%xmm2 \n"
3354 "paddusb %%xmm2,%%xmm0 \n"
3355 "pand %%xmm5,%%xmm1 \n"
3356 "paddusb %%xmm1,%%xmm0 \n"
3357 "movdqu %%xmm0," MEMACCESS(2) " \n"
3358 "lea " MEMLEA(0x10,2) ",%2 \n"
3359 "sub $0x4,%3 \n"
3360 "jge 40b \n"
3361
3362 "49: \n"
3363 "add $0x3,%3 \n"
3364 "jl 99f \n"
3365
3366 // 1 pixel loop.
3367 "91: \n"
3368 "movd " MEMACCESS(0) ",%%xmm3 \n"
3369 "lea " MEMLEA(0x4,0) ",%0 \n"
3370 "movdqa %%xmm3,%%xmm0 \n"
3371 "pxor %%xmm4,%%xmm3 \n"
3372 "movd " MEMACCESS(1) ",%%xmm2 \n"
3373 "pshufb %4,%%xmm3 \n"
3374 "pand %%xmm6,%%xmm2 \n"
3375 "paddw %%xmm7,%%xmm3 \n"
3376 "pmullw %%xmm3,%%xmm2 \n"
3377 "movd " MEMACCESS(1) ",%%xmm1 \n"
3378 "lea " MEMLEA(0x4,1) ",%1 \n"
3379 "psrlw $0x8,%%xmm1 \n"
3380 "por %%xmm4,%%xmm0 \n"
3381 "pmullw %%xmm3,%%xmm1 \n"
3382 "psrlw $0x8,%%xmm2 \n"
3383 "paddusb %%xmm2,%%xmm0 \n"
3384 "pand %%xmm5,%%xmm1 \n"
3385 "paddusb %%xmm1,%%xmm0 \n"
3386 "movd %%xmm0," MEMACCESS(2) " \n"
3387 "lea " MEMLEA(0x4,2) ",%2 \n"
3388 "sub $0x1,%3 \n"
3389 "jge 91b \n"
3390 "99: \n"
3391 : "+r"(src_argb0), // %0
3392 "+r"(src_argb1), // %1
3393 "+r"(dst_argb), // %2
3394 "+r"(width) // %3
3395 : "m"(kShuffleAlpha) // %4
3396 : "memory", "cc"
3397 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3398 );
3399 }
3400 #endif // HAS_ARGBBLENDROW_SSSE3
3401
3402 #ifdef HAS_ARGBATTENUATEROW_SSE2
3403 // Attenuate 4 pixels at a time.
3404 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3405 asm volatile (
3406 "pcmpeqb %%xmm4,%%xmm4 \n"
3407 "pslld $0x18,%%xmm4 \n"
3408 "pcmpeqb %%xmm5,%%xmm5 \n"
3409 "psrld $0x8,%%xmm5 \n"
3410
3411 // 4 pixel loop.
3412 LABELALIGN
3413 "1: \n"
3414 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3415 "punpcklbw %%xmm0,%%xmm0 \n"
3416 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
3417 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3418 "pmulhuw %%xmm2,%%xmm0 \n"
3419 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3420 "punpckhbw %%xmm1,%%xmm1 \n"
3421 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
3422 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
3423 "pmulhuw %%xmm2,%%xmm1 \n"
3424 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3425 "lea " MEMLEA(0x10,0) ",%0 \n"
3426 "psrlw $0x8,%%xmm0 \n"
3427 "pand %%xmm4,%%xmm2 \n"
3428 "psrlw $0x8,%%xmm1 \n"
3429 "packuswb %%xmm1,%%xmm0 \n"
3430 "pand %%xmm5,%%xmm0 \n"
3431 "por %%xmm2,%%xmm0 \n"
3432 "movdqu %%xmm0," MEMACCESS(1) " \n"
3433 "lea " MEMLEA(0x10,1) ",%1 \n"
3434 "sub $0x4,%2 \n"
3435 "jg 1b \n"
3436 : "+r"(src_argb), // %0
3437 "+r"(dst_argb), // %1
3438 "+r"(width) // %2
3439 :
3440 : "memory", "cc"
3441 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3442 );
3443 }
3444 #endif // HAS_ARGBATTENUATEROW_SSE2
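
// Scalar sketch (reference only, not compiled) of the attenuation above.
// punpcklbw of a register with itself expands each byte b to the 16-bit
// value b * 0x101, so pmulhuw + psrlw $8 compute roughly b * a / 255:
//   for (int i = 0; i < width; ++i) {
//     uint32 a = src_argb[3];
//     uint32 aa = a | (a << 8);                   // a * 0x101
//     for (int c = 0; c < 3; ++c) {
//       uint32 f = src_argb[c];
//       dst_argb[c] = (uint8)(((f | (f << 8)) * aa) >> 24);
//     }
//     dst_argb[3] = (uint8)a;                     // alpha kept (pand/por)
//     src_argb += 4; dst_argb += 4;
//   }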
3445
3446 #ifdef HAS_ARGBATTENUATEROW_SSSE3
3447 // Shuffle table duplicating alpha.
3448 static uvec8 kShuffleAlpha0 = {
3449 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
3450 };
3451 static uvec8 kShuffleAlpha1 = {
3452 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3453 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
3454 };
3455 // Attenuate 4 pixels at a time.
3456 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3457 asm volatile (
3458 "pcmpeqb %%xmm3,%%xmm3 \n"
3459 "pslld $0x18,%%xmm3 \n"
3460 "movdqa %3,%%xmm4 \n"
3461 "movdqa %4,%%xmm5 \n"
3462
3463 // 4 pixel loop.
3464 LABELALIGN
3465 "1: \n"
3466 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3467 "pshufb %%xmm4,%%xmm0 \n"
3468 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3469 "punpcklbw %%xmm1,%%xmm1 \n"
3470 "pmulhuw %%xmm1,%%xmm0 \n"
3471 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3472 "pshufb %%xmm5,%%xmm1 \n"
3473 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3474 "punpckhbw %%xmm2,%%xmm2 \n"
3475 "pmulhuw %%xmm2,%%xmm1 \n"
3476 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3477 "lea " MEMLEA(0x10,0) ",%0 \n"
3478 "pand %%xmm3,%%xmm2 \n"
3479 "psrlw $0x8,%%xmm0 \n"
3480 "psrlw $0x8,%%xmm1 \n"
3481 "packuswb %%xmm1,%%xmm0 \n"
3482 "por %%xmm2,%%xmm0 \n"
3483 "movdqu %%xmm0," MEMACCESS(1) " \n"
3484 "lea " MEMLEA(0x10,1) ",%1 \n"
3485 "sub $0x4,%2 \n"
3486 "jg 1b \n"
3487 : "+r"(src_argb), // %0
3488 "+r"(dst_argb), // %1
3489 "+r"(width) // %2
3490 : "m"(kShuffleAlpha0), // %3
3491 "m"(kShuffleAlpha1) // %4
3492 : "memory", "cc"
3493 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3494 );
3495 }
3496 #endif // HAS_ARGBATTENUATEROW_SSSE3
3497
3498 #ifdef HAS_ARGBATTENUATEROW_AVX2
3499 // Shuffle table duplicating alpha.
3500 static const uvec8 kShuffleAlpha_AVX2 = {
3501 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
3502 };
3503 // Attenuate 8 pixels at a time.
3504 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
3505 asm volatile (
3506 "vbroadcastf128 %3,%%ymm4 \n"
3507 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3508 "vpslld $0x18,%%ymm5,%%ymm5 \n"
3509 "sub %0,%1 \n"
3510
3511 // 8 pixel loop.
3512 LABELALIGN
3513 "1: \n"
3514 "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
3515 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
3516 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
3517 "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
3518 "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
3519 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3520 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
3521 "vpand %%ymm5,%%ymm6,%%ymm6 \n"
3522 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3523 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3524 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3525 "vpor %%ymm6,%%ymm0,%%ymm0 \n"
3526 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
3527 "lea " MEMLEA(0x20,0) ",%0 \n"
3528 "sub $0x8,%2 \n"
3529 "jg 1b \n"
3530 "vzeroupper \n"
3531 : "+r"(src_argb), // %0
3532 "+r"(dst_argb), // %1
3533 "+r"(width) // %2
3534 : "m"(kShuffleAlpha_AVX2) // %3
3535 : "memory", "cc"
3536 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3537 );
3538 }
3539 #endif // HAS_ARGBATTENUATEROW_AVX2
3540
3541 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
3542 // Unattenuate 4 pixels at a time.
3543 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3544 int width) {
3545 uintptr_t alpha = 0;
3546 asm volatile (
3547 // 4 pixel loop.
3548 LABELALIGN
3549 "1: \n"
3550 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3551 "movzb " MEMACCESS2(0x03,0) ",%3 \n"
3552 "punpcklbw %%xmm0,%%xmm0 \n"
3553 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
3554 "movzb " MEMACCESS2(0x07,0) ",%3 \n"
3555 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
3556 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3557 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3558 "movlhps %%xmm3,%%xmm2 \n"
3559 "pmulhuw %%xmm2,%%xmm0 \n"
3560 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3561 "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
3562 "punpckhbw %%xmm1,%%xmm1 \n"
3563 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
3564 "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
3565 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
3566 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3567 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3568 "movlhps %%xmm3,%%xmm2 \n"
3569 "pmulhuw %%xmm2,%%xmm1 \n"
3570 "lea " MEMLEA(0x10,0) ",%0 \n"
3571 "packuswb %%xmm1,%%xmm0 \n"
3572 "movdqu %%xmm0," MEMACCESS(1) " \n"
3573 "lea " MEMLEA(0x10,1) ",%1 \n"
3574 "sub $0x4,%2 \n"
3575 "jg 1b \n"
3576 : "+r"(src_argb), // %0
3577 "+r"(dst_argb), // %1
3578 "+r"(width), // %2
3579 "+r"(alpha) // %3
3580 : "r"(fixed_invtbl8) // %4
3581 : "memory", "cc", NACL_R14
3582 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3583 );
3584 }
3585 #endif // HAS_ARGBUNATTENUATEROW_SSE2
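
// Scalar sketch (reference only, not compiled) of the unattenuation
// above. fixed_invtbl8, defined elsewhere in libyuv, holds a fixed-point
// reciprocal per alpha value; each channel is scaled by roughly 256 / a
// and clamped by packuswb:
//   for (int i = 0; i < width; ++i) {
//     uint32 a = src_argb[3];
//     uint32 ia = fixed_invtbl8[a] & 0xffff;      // ~65536 / a
//     for (int c = 0; c < 3; ++c) {
//       uint32 v = (src_argb[c] * ia) >> 8;       // undo the premultiply
//       dst_argb[c] = v > 255 ? 255 : (uint8)v;
//     }
//     dst_argb[3] = (uint8)a;
//     src_argb += 4; dst_argb += 4;
//   }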
3586
3587 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
3588 // Shuffle table duplicating alpha.
3589 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
3590 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
3591 };
3592 // Unattenuate 8 pixels at a time.
3593 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
3594 int width) {
3595 uintptr_t alpha = 0;
3596 asm volatile (
3597 "sub %0,%1 \n"
3598 "vbroadcastf128 %5,%%ymm5 \n"
3599
3600 // 8 pixel loop.
3601 LABELALIGN
3602 "1: \n"
3603 // replace VPGATHER
3604 "movzb " MEMACCESS2(0x03,0) ",%3 \n"
3605 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
3606 "movzb " MEMACCESS2(0x07,0) ",%3 \n"
3607 MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
3608 "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
3609 "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
3610 MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
3611 "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
3612 MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
3613 "movzb " MEMACCESS2(0x13,0) ",%3 \n"
3614 "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
3615 MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
3616 "movzb " MEMACCESS2(0x17,0) ",%3 \n"
3617 MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
3618 "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
3619 "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
3620 MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
3621 "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
3622 MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
3623 "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
3624 "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
3625 "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
3626 "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
3627 // end of VPGATHER
3628
3629 "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
3630 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
3631 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
3632 "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
3633 "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
3634 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
3635 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
3636 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3637 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
3638 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3639 MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
3640 "lea " MEMLEA(0x20,0) ",%0 \n"
3641 "sub $0x8,%2 \n"
3642 "jg 1b \n"
3643 "vzeroupper \n"
3644 : "+r"(src_argb), // %0
3645 "+r"(dst_argb), // %1
3646 "+r"(width), // %2
3647 "+r"(alpha) // %3
3648 : "r"(fixed_invtbl8), // %4
3649 "m"(kUnattenShuffleAlpha_AVX2) // %5
3650 : "memory", "cc", NACL_R14
3651 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3652 );
3653 }
3654 #endif // HAS_ARGBUNATTENUATEROW_AVX2
3655
3656 #ifdef HAS_ARGBGRAYROW_SSSE3
3657 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
3658 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3659 asm volatile (
3660 "movdqa %3,%%xmm4 \n"
3661 "movdqa %4,%%xmm5 \n"
3662
3663 // 8 pixel loop.
3664 LABELALIGN
3665 "1: \n"
3666 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3667 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3668 "pmaddubsw %%xmm4,%%xmm0 \n"
3669 "pmaddubsw %%xmm4,%%xmm1 \n"
3670 "phaddw %%xmm1,%%xmm0 \n"
3671 "paddw %%xmm5,%%xmm0 \n"
3672 "psrlw $0x7,%%xmm0 \n"
3673 "packuswb %%xmm0,%%xmm0 \n"
3674 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3675 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
3676 "lea " MEMLEA(0x20,0) ",%0 \n"
3677 "psrld $0x18,%%xmm2 \n"
3678 "psrld $0x18,%%xmm3 \n"
3679 "packuswb %%xmm3,%%xmm2 \n"
3680 "packuswb %%xmm2,%%xmm2 \n"
3681 "movdqa %%xmm0,%%xmm3 \n"
3682 "punpcklbw %%xmm0,%%xmm0 \n"
3683 "punpcklbw %%xmm2,%%xmm3 \n"
3684 "movdqa %%xmm0,%%xmm1 \n"
3685 "punpcklwd %%xmm3,%%xmm0 \n"
3686 "punpckhwd %%xmm3,%%xmm1 \n"
3687 "movdqu %%xmm0," MEMACCESS(1) " \n"
3688 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
3689 "lea " MEMLEA(0x20,1) ",%1 \n"
3690 "sub $0x8,%2 \n"
3691 "jg 1b \n"
3692 : "+r"(src_argb), // %0
3693 "+r"(dst_argb), // %1
3694 "+r"(width) // %2
3695 : "m"(kARGBToYJ), // %3
3696 "m"(kAddYJ64) // %4
3697 : "memory", "cc"
3698 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3699 );
3700 }
3701 #endif // HAS_ARGBGRAYROW_SSSE3
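
// Scalar sketch (reference only, not compiled) of the gray conversion
// above, reading the weights straight from kARGBToYJ and kAddYJ64:
//   for (int i = 0; i < width; ++i) {
//     uint32 b = src_argb[0], g = src_argb[1], r = src_argb[2];
//     uint32 y = (15 * b + 75 * g + 38 * r + 64) >> 7;  // full-range luma
//     dst_argb[0] = dst_argb[1] = dst_argb[2] = (uint8)y;
//     dst_argb[3] = src_argb[3];                  // alpha passes through
//     src_argb += 4; dst_argb += 4;
//   }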
3702
3703 #ifdef HAS_ARGBSEPIAROW_SSSE3
3704 // b = (r * 35 + g * 68 + b * 17) >> 7
3705 // g = (r * 45 + g * 88 + b * 22) >> 7
3706 // r = (r * 50 + g * 98 + b * 24) >> 7
3707 // Constant for ARGB color to sepia tone
3708 static vec8 kARGBToSepiaB = {
3709 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3710 };
3711
3712 static vec8 kARGBToSepiaG = {
3713 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3714 };
3715
3716 static vec8 kARGBToSepiaR = {
3717 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3718 };
3719
3720 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
3721 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3722 asm volatile (
3723 "movdqa %2,%%xmm2 \n"
3724 "movdqa %3,%%xmm3 \n"
3725 "movdqa %4,%%xmm4 \n"
3726
3727 // 8 pixel loop.
3728 LABELALIGN
3729 "1: \n"
3730 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3731 "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
3732 "pmaddubsw %%xmm2,%%xmm0 \n"
3733 "pmaddubsw %%xmm2,%%xmm6 \n"
3734 "phaddw %%xmm6,%%xmm0 \n"
3735 "psrlw $0x7,%%xmm0 \n"
3736 "packuswb %%xmm0,%%xmm0 \n"
3737 "movdqu " MEMACCESS(0) ",%%xmm5 \n"
3738 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3739 "pmaddubsw %%xmm3,%%xmm5 \n"
3740 "pmaddubsw %%xmm3,%%xmm1 \n"
3741 "phaddw %%xmm1,%%xmm5 \n"
3742 "psrlw $0x7,%%xmm5 \n"
3743 "packuswb %%xmm5,%%xmm5 \n"
3744 "punpcklbw %%xmm5,%%xmm0 \n"
3745 "movdqu " MEMACCESS(0) ",%%xmm5 \n"
3746 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3747 "pmaddubsw %%xmm4,%%xmm5 \n"
3748 "pmaddubsw %%xmm4,%%xmm1 \n"
3749 "phaddw %%xmm1,%%xmm5 \n"
3750 "psrlw $0x7,%%xmm5 \n"
3751 "packuswb %%xmm5,%%xmm5 \n"
3752 "movdqu " MEMACCESS(0) ",%%xmm6 \n"
3753 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3754 "psrld $0x18,%%xmm6 \n"
3755 "psrld $0x18,%%xmm1 \n"
3756 "packuswb %%xmm1,%%xmm6 \n"
3757 "packuswb %%xmm6,%%xmm6 \n"
3758 "punpcklbw %%xmm6,%%xmm5 \n"
3759 "movdqa %%xmm0,%%xmm1 \n"
3760 "punpcklwd %%xmm5,%%xmm0 \n"
3761 "punpckhwd %%xmm5,%%xmm1 \n"
3762 "movdqu %%xmm0," MEMACCESS(0) " \n"
3763 "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
3764 "lea " MEMLEA(0x20,0) ",%0 \n"
3765 "sub $0x8,%1 \n"
3766 "jg 1b \n"
3767 : "+r"(dst_argb), // %0
3768 "+r"(width) // %1
3769 : "m"(kARGBToSepiaB), // %2
3770 "m"(kARGBToSepiaG), // %3
3771 "m"(kARGBToSepiaR) // %4
3772 : "memory", "cc"
3773 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3774 );
3775 }
3776 #endif // HAS_ARGBSEPIAROW_SSSE3
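
// Scalar sketch (reference only, not compiled) of the in-place sepia
// mapping above, taken directly from the kARGBToSepia* weights and the
// >> 7 in the kernel; packuswb supplies the clamp and alpha is untouched:
//   for (int i = 0; i < width; ++i) {
//     uint32 b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
//     uint32 sb = (17 * b + 68 * g + 35 * r) >> 7;
//     uint32 sg = (22 * b + 88 * g + 45 * r) >> 7;
//     uint32 sr = (24 * b + 98 * g + 50 * r) >> 7;
//     dst_argb[0] = sb > 255 ? 255 : (uint8)sb;
//     dst_argb[1] = sg > 255 ? 255 : (uint8)sg;
//     dst_argb[2] = sr > 255 ? 255 : (uint8)sr;
//     dst_argb += 4;
//   }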
3777
3778 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3779 // Transform 8 ARGB pixels (32 bytes) with color matrix.
3780 // Same as Sepia except matrix is provided.
3781 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
3782 const int8* matrix_argb, int width) {
3783 asm volatile (
3784 "movdqu " MEMACCESS(3) ",%%xmm5 \n"
3785 "pshufd $0x00,%%xmm5,%%xmm2 \n"
3786 "pshufd $0x55,%%xmm5,%%xmm3 \n"
3787 "pshufd $0xaa,%%xmm5,%%xmm4 \n"
3788 "pshufd $0xff,%%xmm5,%%xmm5 \n"
3789
3790 // 8 pixel loop.
3791 LABELALIGN
3792 "1: \n"
3793 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3794 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
3795 "pmaddubsw %%xmm2,%%xmm0 \n"
3796 "pmaddubsw %%xmm2,%%xmm7 \n"
3797 "movdqu " MEMACCESS(0) ",%%xmm6 \n"
3798 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3799 "pmaddubsw %%xmm3,%%xmm6 \n"
3800 "pmaddubsw %%xmm3,%%xmm1 \n"
3801 "phaddsw %%xmm7,%%xmm0 \n"
3802 "phaddsw %%xmm1,%%xmm6 \n"
3803 "psraw $0x6,%%xmm0 \n"
3804 "psraw $0x6,%%xmm6 \n"
3805 "packuswb %%xmm0,%%xmm0 \n"
3806 "packuswb %%xmm6,%%xmm6 \n"
3807 "punpcklbw %%xmm6,%%xmm0 \n"
3808 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3809 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
3810 "pmaddubsw %%xmm4,%%xmm1 \n"
3811 "pmaddubsw %%xmm4,%%xmm7 \n"
3812 "phaddsw %%xmm7,%%xmm1 \n"
3813 "movdqu " MEMACCESS(0) ",%%xmm6 \n"
3814 "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
3815 "pmaddubsw %%xmm5,%%xmm6 \n"
3816 "pmaddubsw %%xmm5,%%xmm7 \n"
3817 "phaddsw %%xmm7,%%xmm6 \n"
3818 "psraw $0x6,%%xmm1 \n"
3819 "psraw $0x6,%%xmm6 \n"
3820 "packuswb %%xmm1,%%xmm1 \n"
3821 "packuswb %%xmm6,%%xmm6 \n"
3822 "punpcklbw %%xmm6,%%xmm1 \n"
3823 "movdqa %%xmm0,%%xmm6 \n"
3824 "punpcklwd %%xmm1,%%xmm0 \n"
3825 "punpckhwd %%xmm1,%%xmm6 \n"
3826 "movdqu %%xmm0," MEMACCESS(1) " \n"
3827 "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
3828 "lea " MEMLEA(0x20,0) ",%0 \n"
3829 "lea " MEMLEA(0x20,1) ",%1 \n"
3830 "sub $0x8,%2 \n"
3831 "jg 1b \n"
3832 : "+r"(src_argb), // %0
3833 "+r"(dst_argb), // %1
3834 "+r"(width) // %2
3835 : "r"(matrix_argb) // %3
3836 : "memory", "cc"
3837 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3838 );
3839 }
3840 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
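
// Scalar sketch (reference only, not compiled) of the transform above.
// matrix_argb is four rows of signed weights, one row per output
// channel, applied to the input B, G, R, A bytes with a >> 6 and clamp:
//   for (int i = 0; i < width; ++i) {
//     const uint8* p = src_argb + i * 4;
//     for (int c = 0; c < 4; ++c) {
//       const int8* m = matrix_argb + c * 4;
//       int v = (p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3]) >> 6;
//       dst_argb[i * 4 + c] = v < 0 ? 0 : (v > 255 ? 255 : (uint8)v);
//     }
//   }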
3841
3842 #ifdef HAS_ARGBQUANTIZEROW_SSE2
3843 // Quantize 4 ARGB pixels (16 bytes).
3844 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3845 int interval_offset, int width) {
3846 asm volatile (
3847 "movd %2,%%xmm2 \n"
3848 "movd %3,%%xmm3 \n"
3849 "movd %4,%%xmm4 \n"
3850 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3851 "pshufd $0x44,%%xmm2,%%xmm2 \n"
3852 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3853 "pshufd $0x44,%%xmm3,%%xmm3 \n"
3854 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
3855 "pshufd $0x44,%%xmm4,%%xmm4 \n"
3856 "pxor %%xmm5,%%xmm5 \n"
3857 "pcmpeqb %%xmm6,%%xmm6 \n"
3858 "pslld $0x18,%%xmm6 \n"
3859
3860 // 4 pixel loop.
3861 LABELALIGN
3862 "1: \n"
3863 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3864 "punpcklbw %%xmm5,%%xmm0 \n"
3865 "pmulhuw %%xmm2,%%xmm0 \n"
3866 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3867 "punpckhbw %%xmm5,%%xmm1 \n"
3868 "pmulhuw %%xmm2,%%xmm1 \n"
3869 "pmullw %%xmm3,%%xmm0 \n"
3870 "movdqu " MEMACCESS(0) ",%%xmm7 \n"
3871 "pmullw %%xmm3,%%xmm1 \n"
3872 "pand %%xmm6,%%xmm7 \n"
3873 "paddw %%xmm4,%%xmm0 \n"
3874 "paddw %%xmm4,%%xmm1 \n"
3875 "packuswb %%xmm1,%%xmm0 \n"
3876 "por %%xmm7,%%xmm0 \n"
3877 "movdqu %%xmm0," MEMACCESS(0) " \n"
3878 "lea " MEMLEA(0x10,0) ",%0 \n"
3879 "sub $0x4,%1 \n"
3880 "jg 1b \n"
3881 : "+r"(dst_argb), // %0
3882 "+r"(width) // %1
3883 : "r"(scale), // %2
3884 "r"(interval_size), // %3
3885 "r"(interval_offset) // %4
3886 : "memory", "cc"
3887 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3888 );
3889 }
3890 #endif // HAS_ARGBQUANTIZEROW_SSE2
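
// Scalar sketch (reference only, not compiled) of the quantization
// above: scale is a caller-chosen 16.16-style factor, so each color
// channel snaps to a level and is remapped; alpha is kept via pand/por:
//   for (int i = 0; i < width; ++i) {
//     for (int c = 0; c < 3; ++c) {
//       uint32 level = (dst_argb[c] * scale) >> 16;     // pmulhuw
//       dst_argb[c] = (uint8)(level * interval_size + interval_offset);
//     }
//     dst_argb += 4;
//   }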
3891
3892 #ifdef HAS_ARGBSHADEROW_SSE2
3893 // Shade 4 pixels at a time by specified value.
3894 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3895 uint32 value) {
3896 asm volatile (
3897 "movd %3,%%xmm2 \n"
3898 "punpcklbw %%xmm2,%%xmm2 \n"
3899 "punpcklqdq %%xmm2,%%xmm2 \n"
3900
3901 // 4 pixel loop.
3902 LABELALIGN
3903 "1: \n"
3904 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3905 "lea " MEMLEA(0x10,0) ",%0 \n"
3906 "movdqa %%xmm0,%%xmm1 \n"
3907 "punpcklbw %%xmm0,%%xmm0 \n"
3908 "punpckhbw %%xmm1,%%xmm1 \n"
3909 "pmulhuw %%xmm2,%%xmm0 \n"
3910 "pmulhuw %%xmm2,%%xmm1 \n"
3911 "psrlw $0x8,%%xmm0 \n"
3912 "psrlw $0x8,%%xmm1 \n"
3913 "packuswb %%xmm1,%%xmm0 \n"
3914 "movdqu %%xmm0," MEMACCESS(1) " \n"
3915 "lea " MEMLEA(0x10,1) ",%1 \n"
3916 "sub $0x4,%2 \n"
3917 "jg 1b \n"
3918 : "+r"(src_argb), // %0
3919 "+r"(dst_argb), // %1
3920 "+r"(width) // %2
3921 : "r"(value) // %3
3922 : "memory", "cc"
3923 , "xmm0", "xmm1", "xmm2"
3924 );
3925 }
3926 #endif // HAS_ARGBSHADEROW_SSE2
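
// Scalar sketch (reference only, not compiled) of the shade above. The
// packed 'value' carries one 8-bit multiplier per channel, and the same
// b * 0x101 expansion as in attenuation gives roughly f * v / 255:
//   for (int i = 0; i < width; ++i) {
//     for (int c = 0; c < 4; ++c) {
//       uint32 f = src_argb[c];
//       uint32 v = (value >> (c * 8)) & 0xff;
//       dst_argb[c] = (uint8)(((f | (f << 8)) * (v | (v << 8))) >> 24);
//     }
//     src_argb += 4; dst_argb += 4;
//   }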
3927
3928 #ifdef HAS_ARGBMULTIPLYROW_SSE2
3929 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
3930 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3931 uint8* dst_argb, int width) {
3932 asm volatile (
3933 "pxor %%xmm5,%%xmm5 \n"
3934
3935 // 4 pixel loop.
3936 LABELALIGN
3937 "1: \n"
3938 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3939 "lea " MEMLEA(0x10,0) ",%0 \n"
3940 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3941 "lea " MEMLEA(0x10,1) ",%1 \n"
3942 "movdqu %%xmm0,%%xmm1 \n"
3943 "movdqu %%xmm2,%%xmm3 \n"
3944 "punpcklbw %%xmm0,%%xmm0 \n"
3945 "punpckhbw %%xmm1,%%xmm1 \n"
3946 "punpcklbw %%xmm5,%%xmm2 \n"
3947 "punpckhbw %%xmm5,%%xmm3 \n"
3948 "pmulhuw %%xmm2,%%xmm0 \n"
3949 "pmulhuw %%xmm3,%%xmm1 \n"
3950 "packuswb %%xmm1,%%xmm0 \n"
3951 "movdqu %%xmm0," MEMACCESS(2) " \n"
3952 "lea " MEMLEA(0x10,2) ",%2 \n"
3953 "sub $0x4,%3 \n"
3954 "jg 1b \n"
3955 : "+r"(src_argb0), // %0
3956 "+r"(src_argb1), // %1
3957 "+r"(dst_argb), // %2
3958 "+r"(width) // %3
3959 :
3960 : "memory", "cc"
3961 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3962 );
3963 }
3964 #endif // HAS_ARGBMULTIPLYROW_SSE2
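
// Scalar sketch (reference only, not compiled) of the multiply above:
// src0 is expanded to s0 * 0x101, src1 stays zero-extended, and
// pmulhuw keeps the top 16 bits, i.e. roughly s0 * s1 / 255 per byte:
//   for (int i = 0; i < width * 4; ++i) {
//     uint32 s0 = src_argb0[i];
//     dst_argb[i] = (uint8)(((s0 | (s0 << 8)) * src_argb1[i]) >> 16);
//   }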
3965
3966 #ifdef HAS_ARGBMULTIPLYROW_AVX2
3967 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
3968 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
3969 uint8* dst_argb, int width) {
3970 asm volatile (
3971 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
3972
3973 // 8 pixel loop.
3974 LABELALIGN
3975 "1: \n"
3976 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
3977 "lea " MEMLEA(0x20,0) ",%0 \n"
3978 "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
3979 "lea " MEMLEA(0x20,1) ",%1 \n"
3980 "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
3981 "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
3982 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
3983 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
3984 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3985 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
3986 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3987 "vmovdqu %%ymm0," MEMACCESS(2) " \n"
3988 "lea " MEMLEA(0x20,2) ",%2 \n"
3989 "sub $0x8,%3 \n"
3990 "jg 1b \n"
3991 "vzeroupper \n"
3992 : "+r"(src_argb0), // %0
3993 "+r"(src_argb1), // %1
3994 "+r"(dst_argb), // %2
3995 "+r"(width) // %3
3996 :
3997 : "memory", "cc"
3998 #if defined(__AVX2__)
3999 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4000 #endif
4001 );
4002 }
4003 #endif // HAS_ARGBMULTIPLYROW_AVX2
4004
4005 #ifdef HAS_ARGBADDROW_SSE2
4006 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
4007 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4008 uint8* dst_argb, int width) {
4009 asm volatile (
4010 // 4 pixel loop.
4011 LABELALIGN
4012 "1: \n"
4013 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4014 "lea " MEMLEA(0x10,0) ",%0 \n"
4015 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4016 "lea " MEMLEA(0x10,1) ",%1 \n"
4017 "paddusb %%xmm1,%%xmm0 \n"
4018 "movdqu %%xmm0," MEMACCESS(2) " \n"
4019 "lea " MEMLEA(0x10,2) ",%2 \n"
4020 "sub $0x4,%3 \n"
4021 "jg 1b \n"
4022 : "+r"(src_argb0), // %0
4023 "+r"(src_argb1), // %1
4024 "+r"(dst_argb), // %2
4025 "+r"(width) // %3
4026 :
4027 : "memory", "cc"
4028 , "xmm0", "xmm1"
4029 );
4030 }
4031 #endif // HAS_ARGBADDROW_SSE2
4032
4033 #ifdef HAS_ARGBADDROW_AVX2
4034 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
4035 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4036 uint8* dst_argb, int width) {
4037 asm volatile (
4038 // 8 pixel loop.
4039 LABELALIGN
4040 "1: \n"
4041 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
4042 "lea " MEMLEA(0x20,0) ",%0 \n"
4043 "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4044 "lea " MEMLEA(0x20,1) ",%1 \n"
4045 "vmovdqu %%ymm0," MEMACCESS(2) " \n"
4046 "lea " MEMLEA(0x20,2) ",%2 \n"
4047 "sub $0x8,%3 \n"
4048 "jg 1b \n"
4049 "vzeroupper \n"
4050 : "+r"(src_argb0), // %0
4051 "+r"(src_argb1), // %1
4052 "+r"(dst_argb), // %2
4053 "+r"(width) // %3
4054 :
4055 : "memory", "cc"
4056 , "xmm0"
4057 );
4058 }
4059 #endif // HAS_ARGBADDROW_AVX2
4060
4061 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4062 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
4063 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4064 uint8* dst_argb, int width) {
4065 asm volatile (
4066 // 4 pixel loop.
4067 LABELALIGN
4068 "1: \n"
4069 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4070 "lea " MEMLEA(0x10,0) ",%0 \n"
4071 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4072 "lea " MEMLEA(0x10,1) ",%1 \n"
4073 "psubusb %%xmm1,%%xmm0 \n"
4074 "movdqu %%xmm0," MEMACCESS(2) " \n"
4075 "lea " MEMLEA(0x10,2) ",%2 \n"
4076 "sub $0x4,%3 \n"
4077 "jg 1b \n"
4078 : "+r"(src_argb0), // %0
4079 "+r"(src_argb1), // %1
4080 "+r"(dst_argb), // %2
4081 "+r"(width) // %3
4082 :
4083 : "memory", "cc"
4084 , "xmm0", "xmm1"
4085 );
4086 }
4087 #endif // HAS_ARGBSUBTRACTROW_SSE2
4088
4089 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4090 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
4091 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4092 uint8* dst_argb, int width) {
4093 asm volatile (
4094 // 8 pixel loop.
4095 LABELALIGN
4096 "1: \n"
4097 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
4098 "lea " MEMLEA(0x20,0) ",%0 \n"
4099 "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
4100 "lea " MEMLEA(0x20,1) ",%1 \n"
4101 "vmovdqu %%ymm0," MEMACCESS(2) " \n"
4102 "lea " MEMLEA(0x20,2) ",%2 \n"
4103 "sub $0x8,%3 \n"
4104 "jg 1b \n"
4105 "vzeroupper \n"
4106 : "+r"(src_argb0), // %0
4107 "+r"(src_argb1), // %1
4108 "+r"(dst_argb), // %2
4109 "+r"(width) // %3
4110 :
4111 : "memory", "cc"
4112 , "xmm0"
4113 );
4114 }
4115 #endif // HAS_ARGBSUBTRACTROW_AVX2
4116
4117 #ifdef HAS_SOBELXROW_SSE2
4118 // SobelX as a matrix is
4119 // -1 0 1
4120 // -2 0 2
4121 // -1 0 1
4122 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4123 const uint8* src_y2, uint8* dst_sobelx, int width) {
4124 asm volatile (
4125 "sub %0,%1 \n"
4126 "sub %0,%2 \n"
4127 "sub %0,%3 \n"
4128 "pxor %%xmm5,%%xmm5 \n"
4129
4130 // 8 pixel loop.
4131 LABELALIGN
4132 "1: \n"
4133 "movq " MEMACCESS(0) ",%%xmm0 \n"
4134 "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
4135 "punpcklbw %%xmm5,%%xmm0 \n"
4136 "punpcklbw %%xmm5,%%xmm1 \n"
4137 "psubw %%xmm1,%%xmm0 \n"
4138 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
4139 MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
4140 "punpcklbw %%xmm5,%%xmm1 \n"
4141 "punpcklbw %%xmm5,%%xmm2 \n"
4142 "psubw %%xmm2,%%xmm1 \n"
4143 MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
4144 MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
4145 "punpcklbw %%xmm5,%%xmm2 \n"
4146 "punpcklbw %%xmm5,%%xmm3 \n"
4147 "psubw %%xmm3,%%xmm2 \n"
4148 "paddw %%xmm2,%%xmm0 \n"
4149 "paddw %%xmm1,%%xmm0 \n"
4150 "paddw %%xmm1,%%xmm0 \n"
4151 "pxor %%xmm1,%%xmm1 \n"
4152 "psubw %%xmm0,%%xmm1 \n"
4153 "pmaxsw %%xmm1,%%xmm0 \n"
4154 "packuswb %%xmm0,%%xmm0 \n"
4155 MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
4156 "lea " MEMLEA(0x8,0) ",%0 \n"
4157 "sub $0x8,%4 \n"
4158 "jg 1b \n"
4159 : "+r"(src_y0), // %0
4160 "+r"(src_y1), // %1
4161 "+r"(src_y2), // %2
4162 "+r"(dst_sobelx), // %3
4163 "+r"(width) // %4
4164 :
4165 : "memory", "cc", NACL_R14
4166 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4167 );
4168 }
4169 #endif // HAS_SOBELXROW_SSE2
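
// Scalar sketch (reference only, not compiled) of the SobelX kernel
// above: src_y0/1/2 are three consecutive rows, and each output is the
// absolute response of the matrix shown before the function, saturated:
//   for (int i = 0; i < width; ++i) {
//     int sobel = (src_y0[i] - src_y0[i + 2]) +
//                 2 * (src_y1[i] - src_y1[i + 2]) +
//                 (src_y2[i] - src_y2[i + 2]);
//     if (sobel < 0) sobel = -sobel;              // psubw/pmaxsw abs
//     dst_sobelx[i] = sobel > 255 ? 255 : (uint8)sobel;
//   }
// SobelYRow_SSE2 below is the same pattern transposed: differences are
// taken between the two rows at column offsets 0, 1 and 2.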
4170
4171 #ifdef HAS_SOBELYROW_SSE2
4172 // SobelY as a matrix is
4173 // -1 -2 -1
4174 // 0 0 0
4175 // 1 2 1
4176 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4177 uint8* dst_sobely, int width) {
4178 asm volatile (
4179 "sub %0,%1 \n"
4180 "sub %0,%2 \n"
4181 "pxor %%xmm5,%%xmm5 \n"
4182
4183 // 8 pixel loop.
4184 LABELALIGN
4185 "1: \n"
4186 "movq " MEMACCESS(0) ",%%xmm0 \n"
4187 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
4188 "punpcklbw %%xmm5,%%xmm0 \n"
4189 "punpcklbw %%xmm5,%%xmm1 \n"
4190 "psubw %%xmm1,%%xmm0 \n"
4191 "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
4192 MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
4193 "punpcklbw %%xmm5,%%xmm1 \n"
4194 "punpcklbw %%xmm5,%%xmm2 \n"
4195 "psubw %%xmm2,%%xmm1 \n"
4196 "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
4197 MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
4198 "punpcklbw %%xmm5,%%xmm2 \n"
4199 "punpcklbw %%xmm5,%%xmm3 \n"
4200 "psubw %%xmm3,%%xmm2 \n"
4201 "paddw %%xmm2,%%xmm0 \n"
4202 "paddw %%xmm1,%%xmm0 \n"
4203 "paddw %%xmm1,%%xmm0 \n"
4204 "pxor %%xmm1,%%xmm1 \n"
4205 "psubw %%xmm0,%%xmm1 \n"
4206 "pmaxsw %%xmm1,%%xmm0 \n"
4207 "packuswb %%xmm0,%%xmm0 \n"
4208 MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
4209 "lea " MEMLEA(0x8,0) ",%0 \n"
4210 "sub $0x8,%3 \n"
4211 "jg 1b \n"
4212 : "+r"(src_y0), // %0
4213 "+r"(src_y1), // %1
4214 "+r"(dst_sobely), // %2
4215 "+r"(width) // %3
4216 :
4217 : "memory", "cc", NACL_R14
4218 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4219 );
4220 }
4221 #endif // HAS_SOBELYROW_SSE2
4222
4223 #ifdef HAS_SOBELROW_SSE2
4224 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
4225 // A = 255
4226 // R = Sobel
4227 // G = Sobel
4228 // B = Sobel
4229 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4230 uint8* dst_argb, int width) {
4231 asm volatile (
4232 "sub %0,%1 \n"
4233 "pcmpeqb %%xmm5,%%xmm5 \n"
4234 "pslld $0x18,%%xmm5 \n"
4235
4236 // 16 pixel loop.
4237 LABELALIGN
4238 "1: \n"
4239 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4240 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
4241 "lea " MEMLEA(0x10,0) ",%0 \n"
4242 "paddusb %%xmm1,%%xmm0 \n"
4243 "movdqa %%xmm0,%%xmm2 \n"
4244 "punpcklbw %%xmm0,%%xmm2 \n"
4245 "punpckhbw %%xmm0,%%xmm0 \n"
4246 "movdqa %%xmm2,%%xmm1 \n"
4247 "punpcklwd %%xmm2,%%xmm1 \n"
4248 "punpckhwd %%xmm2,%%xmm2 \n"
4249 "por %%xmm5,%%xmm1 \n"
4250 "por %%xmm5,%%xmm2 \n"
4251 "movdqa %%xmm0,%%xmm3 \n"
4252 "punpcklwd %%xmm0,%%xmm3 \n"
4253 "punpckhwd %%xmm0,%%xmm0 \n"
4254 "por %%xmm5,%%xmm3 \n"
4255 "por %%xmm5,%%xmm0 \n"
4256 "movdqu %%xmm1," MEMACCESS(2) " \n"
4257 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
4258 "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
4259 "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
4260 "lea " MEMLEA(0x40,2) ",%2 \n"
4261 "sub $0x10,%3 \n"
4262 "jg 1b \n"
4263 : "+r"(src_sobelx), // %0
4264 "+r"(src_sobely), // %1
4265 "+r"(dst_argb), // %2
4266 "+r"(width) // %3
4267 :
4268 : "memory", "cc", NACL_R14
4269 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4270 );
4271 }
4272 #endif // HAS_SOBELROW_SSE2
4273
4274 #ifdef HAS_SOBELTOPLANEROW_SSE2
4275 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
4276 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4277 uint8* dst_y, int width) {
4278 asm volatile (
4279 "sub %0,%1 \n"
4280 "pcmpeqb %%xmm5,%%xmm5 \n"
4281 "pslld $0x18,%%xmm5 \n"
4282
4283 // 16 pixel loop.
4284 LABELALIGN
4285 "1: \n"
4286 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4287 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
4288 "lea " MEMLEA(0x10,0) ",%0 \n"
4289 "paddusb %%xmm1,%%xmm0 \n"
4290 "movdqu %%xmm0," MEMACCESS(2) " \n"
4291 "lea " MEMLEA(0x10,2) ",%2 \n"
4292 "sub $0x10,%3 \n"
4293 "jg 1b \n"
4294 : "+r"(src_sobelx), // %0
4295 "+r"(src_sobely), // %1
4296 "+r"(dst_y), // %2
4297 "+r"(width) // %3
4298 :
4299 : "memory", "cc", NACL_R14
4300 "xmm0", "xmm1"
4301 );
4302 }
4303 #endif // HAS_SOBELTOPLANEROW_SSE2
4304
4305 #ifdef HAS_SOBELXYROW_SSE2
4306 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
4307 // A = 255
4308 // R = Sobel X
4309 // G = Sobel
4310 // B = Sobel Y
4311 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4312 uint8* dst_argb, int width) {
4313 asm volatile (
4314 "sub %0,%1 \n"
4315 "pcmpeqb %%xmm5,%%xmm5 \n"
4316
4317 // 16 pixel loop.
4318 LABELALIGN
4319 "1: \n"
4320 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4321 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
4322 "lea " MEMLEA(0x10,0) ",%0 \n"
4323 "movdqa %%xmm0,%%xmm2 \n"
4324 "paddusb %%xmm1,%%xmm2 \n"
4325 "movdqa %%xmm0,%%xmm3 \n"
4326 "punpcklbw %%xmm5,%%xmm3 \n"
4327 "punpckhbw %%xmm5,%%xmm0 \n"
4328 "movdqa %%xmm1,%%xmm4 \n"
4329 "punpcklbw %%xmm2,%%xmm4 \n"
4330 "punpckhbw %%xmm2,%%xmm1 \n"
4331 "movdqa %%xmm4,%%xmm6 \n"
4332 "punpcklwd %%xmm3,%%xmm6 \n"
4333 "punpckhwd %%xmm3,%%xmm4 \n"
4334 "movdqa %%xmm1,%%xmm7 \n"
4335 "punpcklwd %%xmm0,%%xmm7 \n"
4336 "punpckhwd %%xmm0,%%xmm1 \n"
4337 "movdqu %%xmm6," MEMACCESS(2) " \n"
4338 "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
4339 "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
4340 "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
4341 "lea " MEMLEA(0x40,2) ",%2 \n"
4342 "sub $0x10,%3 \n"
4343 "jg 1b \n"
4344 : "+r"(src_sobelx), // %0
4345 "+r"(src_sobely), // %1
4346 "+r"(dst_argb), // %2
4347 "+r"(width) // %3
4348 :
4349 : "memory", "cc", NACL_R14
4350 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4351 );
4352 }
4353 #endif // HAS_SOBELXYROW_SSE2
4354
4355 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
4356 // Creates a table of cumulative sums where each value is a sum of all values
4357 // above and to the left of the value, inclusive of the value.
4358 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
4359 const int32* previous_cumsum, int width) {
4360 asm volatile (
4361 "pxor %%xmm0,%%xmm0 \n"
4362 "pxor %%xmm1,%%xmm1 \n"
4363 "sub $0x4,%3 \n"
4364 "jl 49f \n"
4365 "test $0xf,%1 \n"
4366 "jne 49f \n"
4367
4368 // 4 pixel loop.
4369 LABELALIGN
4370 "40: \n"
4371 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
4372 "lea " MEMLEA(0x10,0) ",%0 \n"
4373 "movdqa %%xmm2,%%xmm4 \n"
4374 "punpcklbw %%xmm1,%%xmm2 \n"
4375 "movdqa %%xmm2,%%xmm3 \n"
4376 "punpcklwd %%xmm1,%%xmm2 \n"
4377 "punpckhwd %%xmm1,%%xmm3 \n"
4378 "punpckhbw %%xmm1,%%xmm4 \n"
4379 "movdqa %%xmm4,%%xmm5 \n"
4380 "punpcklwd %%xmm1,%%xmm4 \n"
4381 "punpckhwd %%xmm1,%%xmm5 \n"
4382 "paddd %%xmm2,%%xmm0 \n"
4383 "movdqu " MEMACCESS(2) ",%%xmm2 \n"
4384 "paddd %%xmm0,%%xmm2 \n"
4385 "paddd %%xmm3,%%xmm0 \n"
4386 "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
4387 "paddd %%xmm0,%%xmm3 \n"
4388 "paddd %%xmm4,%%xmm0 \n"
4389 "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
4390 "paddd %%xmm0,%%xmm4 \n"
4391 "paddd %%xmm5,%%xmm0 \n"
4392 "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
4393 "lea " MEMLEA(0x40,2) ",%2 \n"
4394 "paddd %%xmm0,%%xmm5 \n"
4395 "movdqu %%xmm2," MEMACCESS(1) " \n"
4396 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
4397 "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
4398 "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
4399 "lea " MEMLEA(0x40,1) ",%1 \n"
4400 "sub $0x4,%3 \n"
4401 "jge 40b \n"
4402
4403 "49: \n"
4404 "add $0x3,%3 \n"
4405 "jl 19f \n"
4406
4407 // 1 pixel loop.
4408 LABELALIGN
4409 "10: \n"
4410 "movd " MEMACCESS(0) ",%%xmm2 \n"
4411 "lea " MEMLEA(0x4,0) ",%0 \n"
4412 "punpcklbw %%xmm1,%%xmm2 \n"
4413 "punpcklwd %%xmm1,%%xmm2 \n"
4414 "paddd %%xmm2,%%xmm0 \n"
4415 "movdqu " MEMACCESS(2) ",%%xmm2 \n"
4416 "lea " MEMLEA(0x10,2) ",%2 \n"
4417 "paddd %%xmm0,%%xmm2 \n"
4418 "movdqu %%xmm2," MEMACCESS(1) " \n"
4419 "lea " MEMLEA(0x10,1) ",%1 \n"
4420 "sub $0x1,%3 \n"
4421 "jge 10b \n"
4422
4423 "19: \n"
4424 : "+r"(row), // %0
4425 "+r"(cumsum), // %1
4426 "+r"(previous_cumsum), // %2
4427 "+r"(width) // %3
4428 :
4429 : "memory", "cc"
4430 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4431 );
4432 }
4433 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
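
// Scalar sketch (reference only, not compiled) of the row above. A
// running per-channel sum across the row is added to the matching entry
// of the previous row's table, building an integral image:
//   int32 sum[4] = {0, 0, 0, 0};
//   for (int i = 0; i < width; ++i) {
//     for (int c = 0; c < 4; ++c) {
//       sum[c] += row[i * 4 + c];
//       cumsum[i * 4 + c] = sum[c] + previous_cumsum[i * 4 + c];
//     }
//   }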
4434
4435 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4436 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
4437 int width, int area, uint8* dst,
4438 int count) {
4439 asm volatile (
4440 "movd %5,%%xmm5 \n"
4441 "cvtdq2ps %%xmm5,%%xmm5 \n"
4442 "rcpss %%xmm5,%%xmm4 \n"
4443 "pshufd $0x0,%%xmm4,%%xmm4 \n"
4444 "sub $0x4,%3 \n"
4445 "jl 49f \n"
4446 "cmpl $0x80,%5 \n"
4447 "ja 40f \n"
4448
4449 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4450 "pcmpeqb %%xmm6,%%xmm6 \n"
4451 "psrld $0x10,%%xmm6 \n"
4452 "cvtdq2ps %%xmm6,%%xmm6 \n"
4453 "addps %%xmm6,%%xmm5 \n"
4454 "mulps %%xmm4,%%xmm5 \n"
4455 "cvtps2dq %%xmm5,%%xmm5 \n"
4456 "packssdw %%xmm5,%%xmm5 \n"
4457
4458 // 4 pixel small loop.
4459 LABELALIGN
4460 "4: \n"
4461 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4462 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4463 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
4464 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
4465 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
4466 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
4467 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
4468 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
4469 "lea " MEMLEA(0x40,0) ",%0 \n"
4470 "psubd " MEMACCESS(1) ",%%xmm0 \n"
4471 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
4472 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
4473 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
4474 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
4475 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
4476 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
4477 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
4478 "lea " MEMLEA(0x40,1) ",%1 \n"
4479 "packssdw %%xmm1,%%xmm0 \n"
4480 "packssdw %%xmm3,%%xmm2 \n"
4481 "pmulhuw %%xmm5,%%xmm0 \n"
4482 "pmulhuw %%xmm5,%%xmm2 \n"
4483 "packuswb %%xmm2,%%xmm0 \n"
4484 "movdqu %%xmm0," MEMACCESS(2) " \n"
4485 "lea " MEMLEA(0x10,2) ",%2 \n"
4486 "sub $0x4,%3 \n"
4487 "jge 4b \n"
4488 "jmp 49f \n"
4489
4490 // 4 pixel loop.
4491 LABELALIGN
4492 "40: \n"
4493 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4494 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4495 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
4496 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
4497 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
4498 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
4499 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
4500 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
4501 "lea " MEMLEA(0x40,0) ",%0 \n"
4502 "psubd " MEMACCESS(1) ",%%xmm0 \n"
4503 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
4504 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
4505 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
4506 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
4507 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
4508 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
4509 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
4510 "lea " MEMLEA(0x40,1) ",%1 \n"
4511 "cvtdq2ps %%xmm0,%%xmm0 \n"
4512 "cvtdq2ps %%xmm1,%%xmm1 \n"
4513 "mulps %%xmm4,%%xmm0 \n"
4514 "mulps %%xmm4,%%xmm1 \n"
4515 "cvtdq2ps %%xmm2,%%xmm2 \n"
4516 "cvtdq2ps %%xmm3,%%xmm3 \n"
4517 "mulps %%xmm4,%%xmm2 \n"
4518 "mulps %%xmm4,%%xmm3 \n"
4519 "cvtps2dq %%xmm0,%%xmm0 \n"
4520 "cvtps2dq %%xmm1,%%xmm1 \n"
4521 "cvtps2dq %%xmm2,%%xmm2 \n"
4522 "cvtps2dq %%xmm3,%%xmm3 \n"
4523 "packssdw %%xmm1,%%xmm0 \n"
4524 "packssdw %%xmm3,%%xmm2 \n"
4525 "packuswb %%xmm2,%%xmm0 \n"
4526 "movdqu %%xmm0," MEMACCESS(2) " \n"
4527 "lea " MEMLEA(0x10,2) ",%2 \n"
4528 "sub $0x4,%3 \n"
4529 "jge 40b \n"
4530
4531 "49: \n"
4532 "add $0x3,%3 \n"
4533 "jl 19f \n"
4534
4535 // 1 pixel loop.
4536 LABELALIGN
4537 "10: \n"
4538 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4539 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
4540 "lea " MEMLEA(0x10,0) ",%0 \n"
4541 "psubd " MEMACCESS(1) ",%%xmm0 \n"
4542 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
4543 "lea " MEMLEA(0x10,1) ",%1 \n"
4544 "cvtdq2ps %%xmm0,%%xmm0 \n"
4545 "mulps %%xmm4,%%xmm0 \n"
4546 "cvtps2dq %%xmm0,%%xmm0 \n"
4547 "packssdw %%xmm0,%%xmm0 \n"
4548 "packuswb %%xmm0,%%xmm0 \n"
4549 "movd %%xmm0," MEMACCESS(2) " \n"
4550 "lea " MEMLEA(0x4,2) ",%2 \n"
4551 "sub $0x1,%3 \n"
4552 "jge 10b \n"
4553 "19: \n"
4554 : "+r"(topleft), // %0
4555 "+r"(botleft), // %1
4556 "+r"(dst), // %2
4557 "+rm"(count) // %3
4558 : "r"((intptr_t)(width)), // %4
4559 "rm"(area) // %5
4560 : "memory", "cc", NACL_R14
4561 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4562 );
4563 }
4564 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
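
// Scalar sketch (reference only, not compiled) of the averaging above.
// topleft/botleft index into the integral image; 'width' here appears
// to be the box width in int32 units (4 per pixel), so the classic
// four-corner difference recovers each box sum, scaled by ~1 / area:
//   for (int i = 0; i < count; ++i) {
//     for (int c = 0; c < 4; ++c) {
//       int32 sum = topleft[i * 4 + c] - topleft[i * 4 + c + width] -
//                   botleft[i * 4 + c] + botleft[i * 4 + c + width];
//       dst[i * 4 + c] = (uint8)(sum * (1.0f / area));
//     }
//   }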
4565
4566 #ifdef HAS_ARGBAFFINEROW_SSE2
4567 // Copy ARGB pixels from a source image along a (du, dv) slope to one destination row.
4568 LIBYUV_API
4569 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
4570 uint8* dst_argb, const float* src_dudv, int width) {
4571 intptr_t src_argb_stride_temp = src_argb_stride;
4572 intptr_t temp = 0;
4573 asm volatile (
4574 "movq " MEMACCESS(3) ",%%xmm2 \n"
4575 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
4576 "shl $0x10,%1 \n"
4577 "add $0x4,%1 \n"
4578 "movd %1,%%xmm5 \n"
4579 "sub $0x4,%4 \n"
4580 "jl 49f \n"
4581
4582 "pshufd $0x44,%%xmm7,%%xmm7 \n"
4583 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4584 "movdqa %%xmm2,%%xmm0 \n"
4585 "addps %%xmm7,%%xmm0 \n"
4586 "movlhps %%xmm0,%%xmm2 \n"
4587 "movdqa %%xmm7,%%xmm4 \n"
4588 "addps %%xmm4,%%xmm4 \n"
4589 "movdqa %%xmm2,%%xmm3 \n"
4590 "addps %%xmm4,%%xmm3 \n"
4591 "addps %%xmm4,%%xmm4 \n"
4592
4593 // 4 pixel loop.
4594 LABELALIGN
4595 "40: \n"
4596 "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
4597 "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
4598 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
4599 "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
4600 "movd %%xmm0,%k1 \n"
4601 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4602 "movd %%xmm0,%k5 \n"
4603 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4604 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
4605 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
4606 "punpckldq %%xmm6,%%xmm1 \n"
4607 "addps %%xmm4,%%xmm2 \n"
4608 "movq %%xmm1," MEMACCESS(2) " \n"
4609 "movd %%xmm0,%k1 \n"
4610 "pshufd $0x39,%%xmm0,%%xmm0 \n"
4611 "movd %%xmm0,%k5 \n"
4612 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
4613 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
4614 "punpckldq %%xmm6,%%xmm0 \n"
4615 "addps %%xmm4,%%xmm3 \n"
4616 "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
4617 "lea " MEMLEA(0x10,2) ",%2 \n"
4618 "sub $0x4,%4 \n"
4619 "jge 40b \n"
4620
4621 "49: \n"
4622 "add $0x3,%4 \n"
4623 "jl 19f \n"
4624
4625 // 1 pixel loop.
4626 LABELALIGN
4627 "10: \n"
4628 "cvttps2dq %%xmm2,%%xmm0 \n"
4629 "packssdw %%xmm0,%%xmm0 \n"
4630 "pmaddwd %%xmm5,%%xmm0 \n"
4631 "addps %%xmm7,%%xmm2 \n"
4632 "movd %%xmm0,%k1 \n"
4633 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
4634 "movd %%xmm0," MEMACCESS(2) " \n"
4635 "lea " MEMLEA(0x04,2) ",%2 \n"
4636 "sub $0x1,%4 \n"
4637 "jge 10b \n"
4638 "19: \n"
4639 : "+r"(src_argb), // %0
4640 "+r"(src_argb_stride_temp), // %1
4641 "+r"(dst_argb), // %2
4642 "+r"(src_dudv), // %3
4643 "+rm"(width), // %4
4644 "+r"(temp) // %5
4645 :
4646 : "memory", "cc", NACL_R14
4647 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4648 );
4649 }
4650 #endif // HAS_ARGBAFFINEROW_SSE2
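
// Scalar sketch (reference only, not compiled) of the affine row above.
// src_dudv packs the starting texel coordinate (u, v) and its per-pixel
// step (du, dv); each output pixel is a nearest (truncating) fetch:
//   float u = src_dudv[0], v = src_dudv[1];
//   float du = src_dudv[2], dv = src_dudv[3];
//   for (int i = 0; i < width; ++i) {
//     *(uint32*)(dst_argb + i * 4) =
//         *(const uint32*)(src_argb + (int)v * src_argb_stride + (int)u * 4);
//     u += du;
//     v += dv;
//   }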
4651
4652 #ifdef HAS_INTERPOLATEROW_SSSE3
4653 // Bilinear filter 16x2 -> 16x1
4654 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4655 ptrdiff_t src_stride, int dst_width,
4656 int source_y_fraction) {
4657 asm volatile (
4658 "sub %1,%0 \n"
4659 "shr %3 \n"
4660 "cmp $0x0,%3 \n"
4661 "je 100f \n"
4662 "cmp $0x20,%3 \n"
4663 "je 75f \n"
4664 "cmp $0x40,%3 \n"
4665 "je 50f \n"
4666 "cmp $0x60,%3 \n"
4667 "je 25f \n"
4668
4669 "movd %3,%%xmm0 \n"
4670 "neg %3 \n"
4671 "add $0x80,%3 \n"
4672 "movd %3,%%xmm5 \n"
4673 "punpcklbw %%xmm0,%%xmm5 \n"
4674 "punpcklwd %%xmm5,%%xmm5 \n"
4675 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4676
4677 // General purpose row blend.
4678 LABELALIGN
4679 "1: \n"
4680 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4681 MEMOPREG(movdqu,0x00,1,4,1,xmm2)
4682 "movdqa %%xmm0,%%xmm1 \n"
4683 "punpcklbw %%xmm2,%%xmm0 \n"
4684 "punpckhbw %%xmm2,%%xmm1 \n"
4685 "pmaddubsw %%xmm5,%%xmm0 \n"
4686 "pmaddubsw %%xmm5,%%xmm1 \n"
4687 "psrlw $0x7,%%xmm0 \n"
4688 "psrlw $0x7,%%xmm1 \n"
4689 "packuswb %%xmm1,%%xmm0 \n"
4690 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4691 "lea " MEMLEA(0x10,1) ",%1 \n"
4692 "sub $0x10,%2 \n"
4693 "jg 1b \n"
4694 "jmp 99f \n"
4695
4696 // Blend 25 / 75.
4697 LABELALIGN
4698 "25: \n"
4699 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4700 MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4701 "pavgb %%xmm1,%%xmm0 \n"
4702 "pavgb %%xmm1,%%xmm0 \n"
4703 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4704 "lea " MEMLEA(0x10,1) ",%1 \n"
4705 "sub $0x10,%2 \n"
4706 "jg 25b \n"
4707 "jmp 99f \n"
4708
4709 // Blend 50 / 50.
4710 LABELALIGN
4711 "50: \n"
4712 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4713 MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4714 "pavgb %%xmm1,%%xmm0 \n"
4715 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4716 "lea " MEMLEA(0x10,1) ",%1 \n"
4717 "sub $0x10,%2 \n"
4718 "jg 50b \n"
4719 "jmp 99f \n"
4720
4721 // Blend 75 / 25.
4722 LABELALIGN
4723 "75: \n"
4724 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4725 MEMOPREG(movdqu,0x00,1,4,1,xmm0)
4726 "pavgb %%xmm1,%%xmm0 \n"
4727 "pavgb %%xmm1,%%xmm0 \n"
4728 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4729 "lea " MEMLEA(0x10,1) ",%1 \n"
4730 "sub $0x10,%2 \n"
4731 "jg 75b \n"
4732 "jmp 99f \n"
4733
4734 // Blend 100 / 0 - Copy row unchanged.
4735 LABELALIGN
4736 "100: \n"
4737 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4738 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4739 "lea " MEMLEA(0x10,1) ",%1 \n"
4740 "sub $0x10,%2 \n"
4741 "jg 100b \n"
4742
4743 "99: \n"
4744 : "+r"(dst_ptr), // %0
4745 "+r"(src_ptr), // %1
4746 "+r"(dst_width), // %2
4747 "+r"(source_y_fraction) // %3
4748 : "r"((intptr_t)(src_stride)) // %4
4749 : "memory", "cc", NACL_R14
4750 "xmm0", "xmm1", "xmm2", "xmm5"
4751 );
4752 }
4753 #endif // HAS_INTERPOLATEROW_SSSE3
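
// Scalar sketch (reference only, not compiled) of the general blend
// path above. The kernel halves source_y_fraction to a 7-bit weight, so
// up to rounding each output byte is:
//   int f = source_y_fraction;                    // 0 = this row only
//   for (int i = 0; i < dst_width; ++i) {
//     dst_ptr[i] = (uint8)((src_ptr[i] * (256 - f) +
//                           src_ptr[src_stride + i] * f) >> 8);
//   }
// The 0/25/50/75/100% branches are just pavgb shortcuts for the common
// fractions.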
4754
4755 #ifdef HAS_INTERPOLATEROW_AVX2
4756 // Bilinear filter 32x2 -> 32x1
4757 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
4758 ptrdiff_t src_stride, int dst_width,
4759 int source_y_fraction) {
4760 asm volatile (
4761 "shr %3 \n"
4762 "cmp $0x0,%3 \n"
4763 "je 100f \n"
4764 "sub %1,%0 \n"
4765 "cmp $0x20,%3 \n"
4766 "je 75f \n"
4767 "cmp $0x40,%3 \n"
4768 "je 50f \n"
4769 "cmp $0x60,%3 \n"
4770 "je 25f \n"
4771
4772 "vmovd %3,%%xmm0 \n"
4773 "neg %3 \n"
4774 "add $0x80,%3 \n"
4775 "vmovd %3,%%xmm5 \n"
4776 "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
4777 "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
4778 "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
4779 "vpermd %%ymm5,%%ymm0,%%ymm5 \n"
4780
4781 // General purpose row blend.
4782 LABELALIGN
4783 "1: \n"
4784 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
4785 MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
4786 "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
4787 "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
4788 "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n"
4789 "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n"
4790 "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
4791 "vpsrlw $0x7,%%ymm1,%%ymm1 \n"
4792 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4793 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4794 "lea " MEMLEA(0x20,1) ",%1 \n"
4795 "sub $0x20,%2 \n"
4796 "jg 1b \n"
4797 "jmp 99f \n"
4798
4799 // Blend 25 / 75.
4800 LABELALIGN
4801 "25: \n"
4802 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
4803 MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
4804 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
4805 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
4806 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4807 "lea " MEMLEA(0x20,1) ",%1 \n"
4808 "sub $0x20,%2 \n"
4809 "jg 25b \n"
4810 "jmp 99f \n"
4811
4812 // Blend 50 / 50.
4813 LABELALIGN
4814 "50: \n"
4815 "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
4816 VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0
4817 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4818 "lea " MEMLEA(0x20,1) ",%1 \n"
4819 "sub $0x20,%2 \n"
4820 "jg 50b \n"
4821 "jmp 99f \n"
4822
4823 // Blend 75 / 25.
4824 LABELALIGN
4825 "75: \n"
4826 "vmovdqu " MEMACCESS(1) ",%%ymm1 \n"
4827 MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
4828 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
4829 "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
4830 MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
4831 "lea " MEMLEA(0x20,1) ",%1 \n"
4832 "sub $0x20,%2 \n"
4833 "jg 75b \n"
4834 "jmp 99f \n"
4835
4836 // Blend 100 / 0 - Copy row unchanged.
4837 LABELALIGN
4838 "100: \n"
4839 "rep movsb " MEMMOVESTRING(1,0) " \n"
4840 "jmp 999f \n"
4841
4842 "99: \n"
4843 "vzeroupper \n"
4844 "999: \n"
4845 : "+D"(dst_ptr), // %0
4846 "+S"(src_ptr), // %1
4847 "+c"(dst_width), // %2
4848 "+r"(source_y_fraction) // %3
4849 : "r"((intptr_t)(src_stride)) // %4
4850 : "memory", "cc", NACL_R14
4851 "xmm0", "xmm1", "xmm2", "xmm5"
4852 );
4853 }
4854 #endif // HAS_INTERPOLATEROW_AVX2
4855
4856 #ifdef HAS_INTERPOLATEROW_SSE2
4857 // Bilinear filter 16x2 -> 16x1
4858 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
4859 ptrdiff_t src_stride, int dst_width,
4860 int source_y_fraction) {
4861 asm volatile (
4862 "sub %1,%0 \n"
4863 "shr %3 \n"
4864 "cmp $0x0,%3 \n"
4865 "je 100f \n"
4866 "cmp $0x20,%3 \n"
4867 "je 75f \n"
4868 "cmp $0x40,%3 \n"
4869 "je 50f \n"
4870 "cmp $0x60,%3 \n"
4871 "je 25f \n"
4872
4873 "movd %3,%%xmm0 \n"
4874 "neg %3 \n"
4875 "add $0x80,%3 \n"
4876 "movd %3,%%xmm5 \n"
4877 "punpcklbw %%xmm0,%%xmm5 \n"
4878 "punpcklwd %%xmm5,%%xmm5 \n"
4879 "pshufd $0x0,%%xmm5,%%xmm5 \n"
4880 "pxor %%xmm4,%%xmm4 \n"
4881
4882 // General purpose row blend.
4883 LABELALIGN
4884 "1: \n"
4885 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4886 MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
4887 "movdqa %%xmm0,%%xmm1 \n"
4888 "movdqa %%xmm2,%%xmm3 \n"
4889 "punpcklbw %%xmm4,%%xmm2 \n"
4890 "punpckhbw %%xmm4,%%xmm3 \n"
4891 "punpcklbw %%xmm4,%%xmm0 \n"
4892 "punpckhbw %%xmm4,%%xmm1 \n"
4893 "psubw %%xmm0,%%xmm2 \n"
4894 "psubw %%xmm1,%%xmm3 \n"
4895 "paddw %%xmm2,%%xmm2 \n"
4896 "paddw %%xmm3,%%xmm3 \n"
4897 "pmulhw %%xmm5,%%xmm2 \n"
4898 "pmulhw %%xmm5,%%xmm3 \n"
4899 "paddw %%xmm2,%%xmm0 \n"
4900 "paddw %%xmm3,%%xmm1 \n"
4901 "packuswb %%xmm1,%%xmm0 \n"
4902 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4903 "lea " MEMLEA(0x10,1) ",%1 \n"
4904 "sub $0x10,%2 \n"
4905 "jg 1b \n"
4906 "jmp 99f \n"
4907
4908 // Blend 25 / 75.
4909 LABELALIGN
4910 "25: \n"
4911 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4912 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
4913 "pavgb %%xmm1,%%xmm0 \n"
4914 "pavgb %%xmm1,%%xmm0 \n"
4915 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4916 "lea " MEMLEA(0x10,1) ",%1 \n"
4917 "sub $0x10,%2 \n"
4918 "jg 25b \n"
4919 "jmp 99f \n"
4920
4921 // Blend 50 / 50.
4922 LABELALIGN
4923 "50: \n"
4924 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4925 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
4926 "pavgb %%xmm1,%%xmm0 \n"
4927 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4928 "lea " MEMLEA(0x10,1) ",%1 \n"
4929 "sub $0x10,%2 \n"
4930 "jg 50b \n"
4931 "jmp 99f \n"
4932
4933 // Blend 75 / 25.
4934 LABELALIGN
4935 "75: \n"
4936 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
4937 MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
4938 "pavgb %%xmm1,%%xmm0 \n"
4939 "pavgb %%xmm1,%%xmm0 \n"
4940 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4941 "lea " MEMLEA(0x10,1) ",%1 \n"
4942 "sub $0x10,%2 \n"
4943 "jg 75b \n"
4944 "jmp 99f \n"
4945
4946 // Blend 100 / 0 - Copy row unchanged.
4947 LABELALIGN
4948 "100: \n"
4949 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
4950 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
4951 "lea " MEMLEA(0x10,1) ",%1 \n"
4952 "sub $0x10,%2 \n"
4953 "jg 100b \n"
4954
4955 "99: \n"
4956 : "+r"(dst_ptr), // %0
4957 "+r"(src_ptr), // %1
4958 "+r"(dst_width), // %2
4959 "+r"(source_y_fraction) // %3
4960 : "r"((intptr_t)(src_stride)) // %4
4961 : "memory", "cc", NACL_R14
4962 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4963 );
4964 }
4965 #endif // HAS_INTERPOLATEROW_SSE2
4966
4967 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
4968 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
4969 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4970 const uint8* shuffler, int pix) {
4971 asm volatile (
4972 "movdqu " MEMACCESS(3) ",%%xmm5 \n"
4973 LABELALIGN
4974 "1: \n"
4975 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
4976 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
4977 "lea " MEMLEA(0x20,0) ",%0 \n"
4978 "pshufb %%xmm5,%%xmm0 \n"
4979 "pshufb %%xmm5,%%xmm1 \n"
4980 "movdqu %%xmm0," MEMACCESS(1) " \n"
4981 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
4982 "lea " MEMLEA(0x20,1) ",%1 \n"
4983 "sub $0x8,%2 \n"
4984 "jg 1b \n"
4985 : "+r"(src_argb), // %0
4986 "+r"(dst_argb), // %1
4987 "+r"(pix) // %2
4988 : "r"(shuffler) // %3
4989 : "memory", "cc"
4990 , "xmm0", "xmm1", "xmm5"
4991 );
4992 }
4993 #endif // HAS_ARGBSHUFFLEROW_SSSE3
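
// Scalar sketch (reference only, not compiled) of the shuffle above for
// the 4-byte-periodic masks this is used with: shuffler[c] names the
// source byte that lands in output byte c of every pixel:
//   for (int i = 0; i < pix; ++i) {
//     dst_argb[0] = src_argb[shuffler[0]];
//     dst_argb[1] = src_argb[shuffler[1]];
//     dst_argb[2] = src_argb[shuffler[2]];
//     dst_argb[3] = src_argb[shuffler[3]];
//     src_argb += 4; dst_argb += 4;
//   }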
4994
4995 #ifdef HAS_ARGBSHUFFLEROW_AVX2
4996 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
4997 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4998 const uint8* shuffler, int pix) {
4999 asm volatile (
5000 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
5001 LABELALIGN
5002 "1: \n"
5003 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
5004 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
5005 "lea " MEMLEA(0x40,0) ",%0 \n"
5006 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
5007 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
5008 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
5009 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
5010 "lea " MEMLEA(0x40,1) ",%1 \n"
5011 "sub $0x10,%2 \n"
5012 "jg 1b \n"
5013 "vzeroupper \n"
5014 : "+r"(src_argb), // %0
5015 "+r"(dst_argb), // %1
5016 "+r"(pix) // %2
5017 : "r"(shuffler) // %3
5018 : "memory", "cc"
5019 , "xmm0", "xmm1", "xmm5"
5020 );
5021 }
5022 #endif // HAS_ARGBSHUFFLEROW_AVX2
5023
5024 #ifdef HAS_ARGBSHUFFLEROW_SSE2
5025 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5026 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
5027 const uint8* shuffler, int pix) {
5028 uintptr_t pixel_temp = 0u;
5029 asm volatile (
5030 "pxor %%xmm5,%%xmm5 \n"
5031 "mov " MEMACCESS(4) ",%k2 \n"
5032 "cmp $0x3000102,%k2 \n"
5033 "je 3012f \n"
5034 "cmp $0x10203,%k2 \n"
5035 "je 123f \n"
5036 "cmp $0x30201,%k2 \n"
5037 "je 321f \n"
5038 "cmp $0x2010003,%k2 \n"
5039 "je 2103f \n"
5040
5041 LABELALIGN
5042 "1: \n"
5043 "movzb " MEMACCESS(4) ",%2 \n"
5044 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5045 "mov %b2," MEMACCESS(1) " \n"
5046 "movzb " MEMACCESS2(0x1,4) ",%2 \n"
5047 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5048 "mov %b2," MEMACCESS2(0x1,1) " \n"
5049 "movzb " MEMACCESS2(0x2,4) ",%2 \n"
5050 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5051 "mov %b2," MEMACCESS2(0x2,1) " \n"
5052 "movzb " MEMACCESS2(0x3,4) ",%2 \n"
5053 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
5054 "mov %b2," MEMACCESS2(0x3,1) " \n"
5055 "lea " MEMLEA(0x4,0) ",%0 \n"
5056 "lea " MEMLEA(0x4,1) ",%1 \n"
5057 "sub $0x1,%3 \n"
5058 "jg 1b \n"
5059 "jmp 99f \n"
5060
5061 LABELALIGN
5062 "123: \n"
5063 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5064 "lea " MEMLEA(0x10,0) ",%0 \n"
5065 "movdqa %%xmm0,%%xmm1 \n"
5066 "punpcklbw %%xmm5,%%xmm0 \n"
5067 "punpckhbw %%xmm5,%%xmm1 \n"
5068 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
5069 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
5070 "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
5071 "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
5072 "packuswb %%xmm1,%%xmm0 \n"
5073 "movdqu %%xmm0," MEMACCESS(1) " \n"
5074 "lea " MEMLEA(0x10,1) ",%1 \n"
5075 "sub $0x4,%3 \n"
5076 "jg 123b \n"
5077 "jmp 99f \n"
5078
5079 LABELALIGN
5080 "321: \n"
5081 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5082 "lea " MEMLEA(0x10,0) ",%0 \n"
5083 "movdqa %%xmm0,%%xmm1 \n"
5084 "punpcklbw %%xmm5,%%xmm0 \n"
5085 "punpckhbw %%xmm5,%%xmm1 \n"
5086 "pshufhw $0x39,%%xmm0,%%xmm0 \n"
5087 "pshuflw $0x39,%%xmm0,%%xmm0 \n"
5088 "pshufhw $0x39,%%xmm1,%%xmm1 \n"
5089 "pshuflw $0x39,%%xmm1,%%xmm1 \n"
5090 "packuswb %%xmm1,%%xmm0 \n"
5091 "movdqu %%xmm0," MEMACCESS(1) " \n"
5092 "lea " MEMLEA(0x10,1) ",%1 \n"
5093 "sub $0x4,%3 \n"
5094 "jg 321b \n"
5095 "jmp 99f \n"
5096
5097 LABELALIGN
5098 "2103: \n"
5099 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5100 "lea " MEMLEA(0x10,0) ",%0 \n"
5101 "movdqa %%xmm0,%%xmm1 \n"
5102 "punpcklbw %%xmm5,%%xmm0 \n"
5103 "punpckhbw %%xmm5,%%xmm1 \n"
5104 "pshufhw $0x93,%%xmm0,%%xmm0 \n"
5105 "pshuflw $0x93,%%xmm0,%%xmm0 \n"
5106 "pshufhw $0x93,%%xmm1,%%xmm1 \n"
5107 "pshuflw $0x93,%%xmm1,%%xmm1 \n"
5108 "packuswb %%xmm1,%%xmm0 \n"
5109 "movdqu %%xmm0," MEMACCESS(1) " \n"
5110 "lea " MEMLEA(0x10,1) ",%1 \n"
5111 "sub $0x4,%3 \n"
5112 "jg 2103b \n"
5113 "jmp 99f \n"
5114
5115 LABELALIGN
5116 "3012: \n"
5117 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5118 "lea " MEMLEA(0x10,0) ",%0 \n"
5119 "movdqa %%xmm0,%%xmm1 \n"
5120 "punpcklbw %%xmm5,%%xmm0 \n"
5121 "punpckhbw %%xmm5,%%xmm1 \n"
5122 "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
5123 "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
5124 "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
5125 "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
5126 "packuswb %%xmm1,%%xmm0 \n"
5127 "movdqu %%xmm0," MEMACCESS(1) " \n"
5128 "lea " MEMLEA(0x10,1) ",%1 \n"
5129 "sub $0x4,%3 \n"
5130 "jg 3012b \n"
5131
5132 "99: \n"
5133 : "+r"(src_argb), // %0
5134 "+r"(dst_argb), // %1
5135 "+d"(pixel_temp), // %2
5136 "+r"(pix) // %3
5137 : "r"(shuffler) // %4
5138 : "memory", "cc", NACL_R14
5139 "xmm0", "xmm1", "xmm5"
5140 );
5141 }
5142 #endif // HAS_ARGBSHUFFLEROW_SSE2
5143
5144 #ifdef HAS_I422TOYUY2ROW_SSE2
5145 void I422ToYUY2Row_SSE2(const uint8* src_y,
5146 const uint8* src_u,
5147 const uint8* src_v,
5148 uint8* dst_frame, int width) {
5149 asm volatile (
5150 "sub %1,%2 \n"
5151 LABELALIGN
5152 "1: \n"
5153 "movq " MEMACCESS(1) ",%%xmm2 \n"
5154 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
5155 "lea " MEMLEA(0x8,1) ",%1 \n"
5156 "punpcklbw %%xmm3,%%xmm2 \n"
5157 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5158 "lea " MEMLEA(0x10,0) ",%0 \n"
5159 "movdqa %%xmm0,%%xmm1 \n"
5160 "punpcklbw %%xmm2,%%xmm0 \n"
5161 "punpckhbw %%xmm2,%%xmm1 \n"
5162 "movdqu %%xmm0," MEMACCESS(3) " \n"
5163 "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
5164 "lea " MEMLEA(0x20,3) ",%3 \n"
5165 "sub $0x10,%4 \n"
5166 "jg 1b \n"
5167 : "+r"(src_y), // %0
5168 "+r"(src_u), // %1
5169 "+r"(src_v), // %2
5170 "+r"(dst_frame), // %3
5171 "+rm"(width) // %4
5172 :
5173 : "memory", "cc", NACL_R14
5174 "xmm0", "xmm1", "xmm2", "xmm3"
5175 );
5176 }
5177 #endif // HAS_I422TOYUY2ROW_SSE2
5178
5179 #ifdef HAS_I422TOUYVYROW_SSE2
5180 void I422ToUYVYRow_SSE2(const uint8* src_y,
5181 const uint8* src_u,
5182 const uint8* src_v,
5183 uint8* dst_frame, int width) {
5184 asm volatile (
5185 "sub %1,%2 \n"
5186 LABELALIGN
5187 "1: \n"
5188 "movq " MEMACCESS(1) ",%%xmm2 \n"
5189 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
5190 "lea " MEMLEA(0x8,1) ",%1 \n"
5191 "punpcklbw %%xmm3,%%xmm2 \n"
5192 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
5193 "movdqa %%xmm2,%%xmm1 \n"
5194 "lea " MEMLEA(0x10,0) ",%0 \n"
5195 "punpcklbw %%xmm0,%%xmm1 \n"
5196 "punpckhbw %%xmm0,%%xmm2 \n"
5197 "movdqu %%xmm1," MEMACCESS(3) " \n"
5198 "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
5199 "lea " MEMLEA(0x20,3) ",%3 \n"
5200 "sub $0x10,%4 \n"
5201 "jg 1b \n"
5202 : "+r"(src_y), // %0
5203 "+r"(src_u), // %1
5204 "+r"(src_v), // %2
5205 "+r"(dst_frame), // %3
5206 "+rm"(width) // %4
5207 :
5208 : "memory", "cc", NACL_R14
5209 "xmm0", "xmm1", "xmm2", "xmm3"
5210 );
5211 }
5212 #endif // HAS_I422TOUYVYROW_SSE2
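
// The UYVY counterpart, as a scalar sketch (hypothetical helper, kept out of
// the build with #if 0): UYVY stores U0 Y0 V0 Y1 for every two Y samples.
#if 0
static void I422ToUYVYRow_Sketch(const uint8* src_y, const uint8* src_u,
                                 const uint8* src_v, uint8* dst_frame,
                                 int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_frame[0] = src_u[0];
    dst_frame[1] = src_y[0];
    dst_frame[2] = src_v[0];
    dst_frame[3] = src_y[1];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}
#endif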

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "pxor      %%xmm3,%%xmm3                   \n"

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm3,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm4                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
    "addps     " MEMACCESS(3) ",%%xmm0         \n"
    "addps     " MEMACCESS(3) ",%%xmm4         \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm1,%%xmm2                   \n"
    "mulps     %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm2,%%xmm1                   \n"
    "mulps     %%xmm6,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
    "addps     %%xmm2,%%xmm0                   \n"
    "addps     %%xmm6,%%xmm4                   \n"
    "addps     %%xmm1,%%xmm0                   \n"
    "addps     %%xmm5,%%xmm4                   \n"
    "cvttps2dq %%xmm0,%%xmm0                   \n"
    "cvttps2dq %%xmm4,%%xmm4                   \n"
    "packuswb  %%xmm4,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x2,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
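
// A scalar sketch (hypothetical helper, kept out of the build with #if 0) of
// the cubic both polynomial row functions evaluate: poly holds four floats
// per coefficient, one per channel, laid out as C0[4] C1[4] C2[4] C3[4].
#if 0
static void ARGBPolynomialRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                     const float* poly, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int c = i & 3;  // channel within the pixel
    float x = (float)src_argb[i];
    float v = poly[c] + poly[4 + c] * x + poly[8 + c] * x * x +
              poly[12 + c] * x * x * x;
    // The SIMD versions truncate (cvttps2dq) and saturate (packuswb).
    dst_argb[i] = (uint8)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
  }
}
#endif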

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4       \n"
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
    "lea         " MEMLEA(0x8,0) ",%0          \n"
    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X in 8 floats
    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
    "lea         " MEMLEA(0x8,1) ",%1          \n"
    "sub         $0x2,%2                       \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
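
// Note on the FMA sequence above: the cubic is evaluated directly as
// C0 + C1*X + C2*X*X + C3*X*X*X rather than by Horner's rule; X*X and C3*X
// are formed separately so each FMA folds one term into the accumulator.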

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),   // %0
    "+d"(pixel_temp), // %1
    "+r"(width)       // %2
  : "r"(table_argb)   // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86
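
// A scalar sketch (hypothetical helper, kept out of the build with #if 0):
// each channel value indexes an interleaved 256-entry table of 4-byte
// entries, and the matching channel of that entry replaces it in place.
#if 0
static void ARGBColorTableRow_Sketch(uint8* dst_argb, const uint8* table_argb,
                                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];
    dst_argb += 4;
  }
}
#endif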

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table. The alpha byte is left unchanged.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),   // %0
    "+d"(pixel_temp), // %1
    "+r"(width)       // %2
  : "r"(table_argb)   // %3
  : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform ARGB pixels with luma table. Alpha is copied through unchanged.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  uintptr_t pixel_temp = 0u;
  uintptr_t table_temp = 0u;
  asm volatile (
    "movd      %6,%%xmm3                       \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0x8,%%xmm4                     \n"  // 0xff00 word mask
    "pxor      %%xmm5,%%xmm5                   \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
    "pmaddubsw %%xmm3,%%xmm0                   \n"
    "phaddw    %%xmm0,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm0                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS(2) ",%0             \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb (%1,%0,1),%0
    "mov       %b0," MEMACCESS(3) "            \n"
    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x3,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x7,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xb,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"

    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "lea       " MEMLEA(0x10,3) ",%3           \n"
    "sub       $0x4,%4                         \n"
    "jg        1b                              \n"
  : "+d"(pixel_temp),  // %0
    "+a"(table_temp),  // %1
    "+r"(src_argb),    // %2
    "+r"(dst_argb),    // %3
    "+rm"(width)       // %4
  : "r"(luma),         // %5
    "rm"(lumacoeff)    // %6
  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
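
// A scalar sketch (hypothetical helper, kept out of the build with #if 0) of
// the lookup above, assuming lumacoeff packs the B, G and R weights in its
// low three bytes with the high byte zero: the weighted sum is masked to its
// high byte, selecting one 256-byte slice of 'luma'; B, G and R are then
// remapped through that slice while alpha passes through.
#if 0
static void ARGBLumaColorTableRow_Sketch(const uint8* src_argb,
                                         uint8* dst_argb, int width,
                                         const uint8* luma,
                                         uint32 lumacoeff) {
  const uint32 bc = lumacoeff & 0xff;
  const uint32 gc = (lumacoeff >> 8) & 0xff;
  const uint32 rc = (lumacoeff >> 16) & 0xff;
  int x;
  for (x = 0; x < width; ++x) {
    const uint8* table =
        luma + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) &
                0xff00u);
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha copied unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif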

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
