1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC x86 and x64.
19 #if !defined(LIBYUV_DISABLE_X86) && \
20 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
21
22 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
23
24 // Constants for ARGB
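// 7 bit fixed point BT.601 luma coefficients, ordered B,G,R,A to match
// little-endian ARGB memory; ARGBToYRow applies them as
// Y = ((13 * B + 65 * G + 33 * R) >> 7) + 16.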
25 static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
26 13, 65, 33, 0, 13, 65, 33, 0};
27
28 // JPEG full range.
29 static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
30 15, 75, 38, 0, 15, 75, 38, 0};
31 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
32
33 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
34
35 static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
36 112, -74, -38, 0, 112, -74, -38, 0};
37
38 static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
39 127, -84, -43, 0, 127, -84, -43, 0};
40
41 static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
42 -18, -94, 112, 0, -18, -94, 112, 0};
43
44 static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
45 -20, -107, 127, 0, -20, -107, 127, 0};
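// The U and V coefficients are applied in ARGBToUVRow below as
// U = ((112 * B - 74 * G - 38 * R) >> 8) + 128 and
// V = ((112 * R - 94 * G - 18 * B) >> 8) + 128, on 2x2 averaged pixels.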
46
47 // Constants for BGRA
48 static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
49 0, 33, 65, 13, 0, 33, 65, 13};
50
51 static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
52 0, -38, -74, 112, 0, -38, -74, 112};
53
54 static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
55 0, 112, -94, -18, 0, 112, -94, -18};
56
57 // Constants for ABGR
58 static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
59 33, 65, 13, 0, 33, 65, 13, 0};
60
61 static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
62 -38, -74, 112, 0, -38, -74, 112, 0};
63
64 static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
65 112, -94, -18, 0, 112, -94, -18, 0};
66
67 // Constants for RGBA.
68 static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
69 0, 13, 65, 33, 0, 13, 65, 33};
70
71 static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
72 0, 112, -74, -38, 0, 112, -74, -38};
73
74 static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
75 0, -18, -94, 112, 0, -18, -94, 112};
76
77 static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
78 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
79
80 // 7 bit fixed point 0.5, added for rounding before the shift right by 7.
81 static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
82
83 static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
84 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
85
86 static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
87 0x8080u, 0x8080u, 0x8080u, 0x8080u};
88 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
89
90 #ifdef HAS_RGB24TOARGBROW_SSSE3
91
92 // Shuffle table for converting RGB24 to ARGB.
93 static const uvec8 kShuffleMaskRGB24ToARGB = {
94 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
95
96 // Shuffle table for converting RAW to ARGB.
97 static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
98 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
99
100 // Shuffle table for converting RAW to RGB24. First 8.
101 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
102 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
103 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
104
105 // Shuffle table for converting RAW to RGB24. Middle 8.
106 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
107 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
108 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
109
110 // Shuffle table for converting RAW to RGB24. Last 8.
111 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
112 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
113 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
114
115 // Shuffle table for converting ARGB to RGB24.
116 static const uvec8 kShuffleMaskARGBToRGB24 = {
117 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
118
119 // Shuffle table for converting ARGB to RAW.
120 static const uvec8 kShuffleMaskARGBToRAW = {
121 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
122
123 // Shuffle table for converting ARGB to RGB24 in I422ToRGB24. First 8 + next 4.
124 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
125 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
126
127 // YUY2 shuf 16 Y to 32 Y.
128 static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
129 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
130 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
131
132 // YUY2 shuf 8 UV to 16 UV.
133 static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
134 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
135 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
136
137 // UYVY shuf 16 Y to 32 Y.
138 static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
139 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
140 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
141
142 // UYVY shuf 8 UV to 16 UV.
143 static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
144 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
145 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
146
147 // NV21 shuf 8 VU to 16 UV.
148 static const lvec8 kShuffleNV21 = {
149 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
150 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
151 };
152 #endif // HAS_RGB24TOARGBROW_SSSE3
153
154 #ifdef HAS_J400TOARGBROW_SSE2
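// Expand 8 Y (J400) samples to 8 ARGB pixels per iteration: each Y byte is
// replicated to B, G and R, and alpha is forced to 0xff via the 0xff000000
// mask built in xmm5.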
155 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
156 asm volatile(
157 "pcmpeqb %%xmm5,%%xmm5 \n"
158 "pslld $0x18,%%xmm5 \n"
159
160 LABELALIGN
161 "1: \n"
162 "movq (%0),%%xmm0 \n"
163 "lea 0x8(%0),%0 \n"
164 "punpcklbw %%xmm0,%%xmm0 \n"
165 "movdqa %%xmm0,%%xmm1 \n"
166 "punpcklwd %%xmm0,%%xmm0 \n"
167 "punpckhwd %%xmm1,%%xmm1 \n"
168 "por %%xmm5,%%xmm0 \n"
169 "por %%xmm5,%%xmm1 \n"
170 "movdqu %%xmm0,(%1) \n"
171 "movdqu %%xmm1,0x10(%1) \n"
172 "lea 0x20(%1),%1 \n"
173 "sub $0x8,%2 \n"
174 "jg 1b \n"
175 : "+r"(src_y), // %0
176 "+r"(dst_argb), // %1
177 "+r"(width) // %2
178 ::"memory",
179 "cc", "xmm0", "xmm1", "xmm5");
180 }
181 #endif // HAS_J400TOARGBROW_SSE2
182
183 #ifdef HAS_RGB24TOARGBROW_SSSE3
184 void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
185 uint8_t* dst_argb,
186 int width) {
187 asm volatile(
188 "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
189 "pslld $0x18,%%xmm5 \n"
190 "movdqa %3,%%xmm4 \n"
191
192 LABELALIGN
193 "1: \n"
194 "movdqu (%0),%%xmm0 \n"
195 "movdqu 0x10(%0),%%xmm1 \n"
196 "movdqu 0x20(%0),%%xmm3 \n"
197 "lea 0x30(%0),%0 \n"
198 "movdqa %%xmm3,%%xmm2 \n"
199 "palignr $0x8,%%xmm1,%%xmm2 \n"
200 "pshufb %%xmm4,%%xmm2 \n"
201 "por %%xmm5,%%xmm2 \n"
202 "palignr $0xc,%%xmm0,%%xmm1 \n"
203 "pshufb %%xmm4,%%xmm0 \n"
204 "movdqu %%xmm2,0x20(%1) \n"
205 "por %%xmm5,%%xmm0 \n"
206 "pshufb %%xmm4,%%xmm1 \n"
207 "movdqu %%xmm0,(%1) \n"
208 "por %%xmm5,%%xmm1 \n"
209 "palignr $0x4,%%xmm3,%%xmm3 \n"
210 "pshufb %%xmm4,%%xmm3 \n"
211 "movdqu %%xmm1,0x10(%1) \n"
212 "por %%xmm5,%%xmm3 \n"
213 "movdqu %%xmm3,0x30(%1) \n"
214 "lea 0x40(%1),%1 \n"
215 "sub $0x10,%2 \n"
216 "jg 1b \n"
217 : "+r"(src_rgb24), // %0
218 "+r"(dst_argb), // %1
219 "+r"(width) // %2
220 : "m"(kShuffleMaskRGB24ToARGB) // %3
221 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
222 }
223
224 void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
225 asm volatile(
226 "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
227 "pslld $0x18,%%xmm5 \n"
228 "movdqa %3,%%xmm4 \n"
229
230 LABELALIGN
231 "1: \n"
232 "movdqu (%0),%%xmm0 \n"
233 "movdqu 0x10(%0),%%xmm1 \n"
234 "movdqu 0x20(%0),%%xmm3 \n"
235 "lea 0x30(%0),%0 \n"
236 "movdqa %%xmm3,%%xmm2 \n"
237 "palignr $0x8,%%xmm1,%%xmm2 \n"
238 "pshufb %%xmm4,%%xmm2 \n"
239 "por %%xmm5,%%xmm2 \n"
240 "palignr $0xc,%%xmm0,%%xmm1 \n"
241 "pshufb %%xmm4,%%xmm0 \n"
242 "movdqu %%xmm2,0x20(%1) \n"
243 "por %%xmm5,%%xmm0 \n"
244 "pshufb %%xmm4,%%xmm1 \n"
245 "movdqu %%xmm0,(%1) \n"
246 "por %%xmm5,%%xmm1 \n"
247 "palignr $0x4,%%xmm3,%%xmm3 \n"
248 "pshufb %%xmm4,%%xmm3 \n"
249 "movdqu %%xmm1,0x10(%1) \n"
250 "por %%xmm5,%%xmm3 \n"
251 "movdqu %%xmm3,0x30(%1) \n"
252 "lea 0x40(%1),%1 \n"
253 "sub $0x10,%2 \n"
254 "jg 1b \n"
255 : "+r"(src_raw), // %0
256 "+r"(dst_argb), // %1
257 "+r"(width) // %2
258 : "m"(kShuffleMaskRAWToARGB) // %3
259 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
260 }
261
262 void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
263 uint8_t* dst_rgb24,
264 int width) {
265 asm volatile(
266 "movdqa %3,%%xmm3 \n"
267 "movdqa %4,%%xmm4 \n"
268 "movdqa %5,%%xmm5 \n"
269
270 LABELALIGN
271 "1: \n"
272 "movdqu (%0),%%xmm0 \n"
273 "movdqu 0x4(%0),%%xmm1 \n"
274 "movdqu 0x8(%0),%%xmm2 \n"
275 "lea 0x18(%0),%0 \n"
276 "pshufb %%xmm3,%%xmm0 \n"
277 "pshufb %%xmm4,%%xmm1 \n"
278 "pshufb %%xmm5,%%xmm2 \n"
279 "movq %%xmm0,(%1) \n"
280 "movq %%xmm1,0x8(%1) \n"
281 "movq %%xmm2,0x10(%1) \n"
282 "lea 0x18(%1),%1 \n"
283 "sub $0x8,%2 \n"
284 "jg 1b \n"
285 : "+r"(src_raw), // %0
286 "+r"(dst_rgb24), // %1
287 "+r"(width) // %2
288 : "m"(kShuffleMaskRAWToRGB24_0), // %3
289 "m"(kShuffleMaskRAWToRGB24_1), // %4
290 "m"(kShuffleMaskRAWToRGB24_2) // %5
291 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
292 }
293
294 void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
295 asm volatile(
296 "mov $0x1080108,%%eax \n"
297 "movd %%eax,%%xmm5 \n"
298 "pshufd $0x0,%%xmm5,%%xmm5 \n"
299 "mov $0x20802080,%%eax \n"
300 "movd %%eax,%%xmm6 \n"
301 "pshufd $0x0,%%xmm6,%%xmm6 \n"
302 "pcmpeqb %%xmm3,%%xmm3 \n"
303 "psllw $0xb,%%xmm3 \n"
304 "pcmpeqb %%xmm4,%%xmm4 \n"
305 "psllw $0xa,%%xmm4 \n"
306 "psrlw $0x5,%%xmm4 \n"
307 "pcmpeqb %%xmm7,%%xmm7 \n"
308 "psllw $0x8,%%xmm7 \n"
309 "sub %0,%1 \n"
310 "sub %0,%1 \n"
311
312 LABELALIGN
313 "1: \n"
314 "movdqu (%0),%%xmm0 \n"
315 "movdqa %%xmm0,%%xmm1 \n"
316 "movdqa %%xmm0,%%xmm2 \n"
317 "pand %%xmm3,%%xmm1 \n"
318 "psllw $0xb,%%xmm2 \n"
319 "pmulhuw %%xmm5,%%xmm1 \n"
320 "pmulhuw %%xmm5,%%xmm2 \n"
321 "psllw $0x8,%%xmm1 \n"
322 "por %%xmm2,%%xmm1 \n"
323 "pand %%xmm4,%%xmm0 \n"
324 "pmulhuw %%xmm6,%%xmm0 \n"
325 "por %%xmm7,%%xmm0 \n"
326 "movdqa %%xmm1,%%xmm2 \n"
327 "punpcklbw %%xmm0,%%xmm1 \n"
328 "punpckhbw %%xmm0,%%xmm2 \n"
329 "movdqu %%xmm1,0x00(%1,%0,2) \n"
330 "movdqu %%xmm2,0x10(%1,%0,2) \n"
331 "lea 0x10(%0),%0 \n"
332 "sub $0x8,%2 \n"
333 "jg 1b \n"
334 : "+r"(src), // %0
335 "+r"(dst), // %1
336 "+r"(width) // %2
337 :
338 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
339 "xmm6", "xmm7");
340 }
341
342 void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
343 asm volatile(
344 "mov $0x1080108,%%eax \n"
345 "movd %%eax,%%xmm5 \n"
346 "pshufd $0x0,%%xmm5,%%xmm5 \n"
347 "mov $0x42004200,%%eax \n"
348 "movd %%eax,%%xmm6 \n"
349 "pshufd $0x0,%%xmm6,%%xmm6 \n"
350 "pcmpeqb %%xmm3,%%xmm3 \n"
351 "psllw $0xb,%%xmm3 \n"
352 "movdqa %%xmm3,%%xmm4 \n"
353 "psrlw $0x6,%%xmm4 \n"
354 "pcmpeqb %%xmm7,%%xmm7 \n"
355 "psllw $0x8,%%xmm7 \n"
356 "sub %0,%1 \n"
357 "sub %0,%1 \n"
358
359 LABELALIGN
360 "1: \n"
361 "movdqu (%0),%%xmm0 \n"
362 "movdqa %%xmm0,%%xmm1 \n"
363 "movdqa %%xmm0,%%xmm2 \n"
364 "psllw $0x1,%%xmm1 \n"
365 "psllw $0xb,%%xmm2 \n"
366 "pand %%xmm3,%%xmm1 \n"
367 "pmulhuw %%xmm5,%%xmm2 \n"
368 "pmulhuw %%xmm5,%%xmm1 \n"
369 "psllw $0x8,%%xmm1 \n"
370 "por %%xmm2,%%xmm1 \n"
371 "movdqa %%xmm0,%%xmm2 \n"
372 "pand %%xmm4,%%xmm0 \n"
373 "psraw $0x8,%%xmm2 \n"
374 "pmulhuw %%xmm6,%%xmm0 \n"
375 "pand %%xmm7,%%xmm2 \n"
376 "por %%xmm2,%%xmm0 \n"
377 "movdqa %%xmm1,%%xmm2 \n"
378 "punpcklbw %%xmm0,%%xmm1 \n"
379 "punpckhbw %%xmm0,%%xmm2 \n"
380 "movdqu %%xmm1,0x00(%1,%0,2) \n"
381 "movdqu %%xmm2,0x10(%1,%0,2) \n"
382 "lea 0x10(%0),%0 \n"
383 "sub $0x8,%2 \n"
384 "jg 1b \n"
385 : "+r"(src), // %0
386 "+r"(dst), // %1
387 "+r"(width) // %2
388 :
389 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
390 "xmm6", "xmm7");
391 }
392
393 void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
394 asm volatile(
395 "mov $0xf0f0f0f,%%eax \n"
396 "movd %%eax,%%xmm4 \n"
397 "pshufd $0x0,%%xmm4,%%xmm4 \n"
398 "movdqa %%xmm4,%%xmm5 \n"
399 "pslld $0x4,%%xmm5 \n"
400 "sub %0,%1 \n"
401 "sub %0,%1 \n"
402
403 LABELALIGN
404 "1: \n"
405 "movdqu (%0),%%xmm0 \n"
406 "movdqa %%xmm0,%%xmm2 \n"
407 "pand %%xmm4,%%xmm0 \n"
408 "pand %%xmm5,%%xmm2 \n"
409 "movdqa %%xmm0,%%xmm1 \n"
410 "movdqa %%xmm2,%%xmm3 \n"
411 "psllw $0x4,%%xmm1 \n"
412 "psrlw $0x4,%%xmm3 \n"
413 "por %%xmm1,%%xmm0 \n"
414 "por %%xmm3,%%xmm2 \n"
415 "movdqa %%xmm0,%%xmm1 \n"
416 "punpcklbw %%xmm2,%%xmm0 \n"
417 "punpckhbw %%xmm2,%%xmm1 \n"
418 "movdqu %%xmm0,0x00(%1,%0,2) \n"
419 "movdqu %%xmm1,0x10(%1,%0,2) \n"
420 "lea 0x10(%0),%0 \n"
421 "sub $0x8,%2 \n"
422 "jg 1b \n"
423 : "+r"(src), // %0
424 "+r"(dst), // %1
425 "+r"(width) // %2
426 :
427 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
428 }
429
430 void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
431 asm volatile(
432
433 "movdqa %3,%%xmm6 \n"
434
435 LABELALIGN
436 "1: \n"
437 "movdqu (%0),%%xmm0 \n"
438 "movdqu 0x10(%0),%%xmm1 \n"
439 "movdqu 0x20(%0),%%xmm2 \n"
440 "movdqu 0x30(%0),%%xmm3 \n"
441 "lea 0x40(%0),%0 \n"
442 "pshufb %%xmm6,%%xmm0 \n"
443 "pshufb %%xmm6,%%xmm1 \n"
444 "pshufb %%xmm6,%%xmm2 \n"
445 "pshufb %%xmm6,%%xmm3 \n"
446 "movdqa %%xmm1,%%xmm4 \n"
447 "psrldq $0x4,%%xmm1 \n"
448 "pslldq $0xc,%%xmm4 \n"
449 "movdqa %%xmm2,%%xmm5 \n"
450 "por %%xmm4,%%xmm0 \n"
451 "pslldq $0x8,%%xmm5 \n"
452 "movdqu %%xmm0,(%1) \n"
453 "por %%xmm5,%%xmm1 \n"
454 "psrldq $0x8,%%xmm2 \n"
455 "pslldq $0x4,%%xmm3 \n"
456 "por %%xmm3,%%xmm2 \n"
457 "movdqu %%xmm1,0x10(%1) \n"
458 "movdqu %%xmm2,0x20(%1) \n"
459 "lea 0x30(%1),%1 \n"
460 "sub $0x10,%2 \n"
461 "jg 1b \n"
462 : "+r"(src), // %0
463 "+r"(dst), // %1
464 "+r"(width) // %2
465 : "m"(kShuffleMaskARGBToRGB24) // %3
466 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
467 }
468
469 void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
470 asm volatile(
471
472 "movdqa %3,%%xmm6 \n"
473
474 LABELALIGN
475 "1: \n"
476 "movdqu (%0),%%xmm0 \n"
477 "movdqu 0x10(%0),%%xmm1 \n"
478 "movdqu 0x20(%0),%%xmm2 \n"
479 "movdqu 0x30(%0),%%xmm3 \n"
480 "lea 0x40(%0),%0 \n"
481 "pshufb %%xmm6,%%xmm0 \n"
482 "pshufb %%xmm6,%%xmm1 \n"
483 "pshufb %%xmm6,%%xmm2 \n"
484 "pshufb %%xmm6,%%xmm3 \n"
485 "movdqa %%xmm1,%%xmm4 \n"
486 "psrldq $0x4,%%xmm1 \n"
487 "pslldq $0xc,%%xmm4 \n"
488 "movdqa %%xmm2,%%xmm5 \n"
489 "por %%xmm4,%%xmm0 \n"
490 "pslldq $0x8,%%xmm5 \n"
491 "movdqu %%xmm0,(%1) \n"
492 "por %%xmm5,%%xmm1 \n"
493 "psrldq $0x8,%%xmm2 \n"
494 "pslldq $0x4,%%xmm3 \n"
495 "por %%xmm3,%%xmm2 \n"
496 "movdqu %%xmm1,0x10(%1) \n"
497 "movdqu %%xmm2,0x20(%1) \n"
498 "lea 0x30(%1),%1 \n"
499 "sub $0x10,%2 \n"
500 "jg 1b \n"
501 : "+r"(src), // %0
502 "+r"(dst), // %1
503 "+r"(width) // %2
504 : "m"(kShuffleMaskARGBToRAW) // %3
505 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
506 }
507
508 #ifdef HAS_ARGBTORGB24ROW_AVX2
509 // vpermd to pack the two 12 byte halves of each lane into 24 contiguous bytes.
510 static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
511
512 void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
513 asm volatile(
514 "vbroadcastf128 %3,%%ymm6 \n"
515 "vmovdqa %4,%%ymm7 \n"
516
517 LABELALIGN
518 "1: \n"
519 "vmovdqu (%0),%%ymm0 \n"
520 "vmovdqu 0x20(%0),%%ymm1 \n"
521 "vmovdqu 0x40(%0),%%ymm2 \n"
522 "vmovdqu 0x60(%0),%%ymm3 \n"
523 "lea 0x80(%0),%0 \n"
524 "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
525 "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
526 "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
527 "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
528 "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
529 "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
530 "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
531 "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
532 "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
533 "vpor %%ymm4,%%ymm0,%%ymm0 \n"
534 "vmovdqu %%ymm0,(%1) \n"
535 "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
536 "vpermq $0x4f,%%ymm2,%%ymm4 \n"
537 "vpor %%ymm4,%%ymm1,%%ymm1 \n"
538 "vmovdqu %%ymm1,0x20(%1) \n"
539 "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
540 "vpermq $0x93,%%ymm3,%%ymm3 \n"
541 "vpor %%ymm3,%%ymm2,%%ymm2 \n"
542 "vmovdqu %%ymm2,0x40(%1) \n"
543 "lea 0x60(%1),%1 \n"
544 "sub $0x20,%2 \n"
545 "jg 1b \n"
546 "vzeroupper \n"
547 : "+r"(src), // %0
548 "+r"(dst), // %1
549 "+r"(width) // %2
550 : "m"(kShuffleMaskARGBToRGB24), // %3
551 "m"(kPermdRGB24_AVX) // %4
552 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
553 "xmm7");
554 }
555 #endif
556
557 #ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
558 // Permute tables for converting ARGB to RGB24 via vpermt2b.
559 static const ulvec8 kPermARGBToRGB24_0 = {
560 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u,
561 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
562 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
563 static const ulvec8 kPermARGBToRGB24_1 = {
564 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
565 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
566 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
567 static const ulvec8 kPermARGBToRGB24_2 = {
568 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
569 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
570 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
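// vpermt2b selects from the 64 bytes of two source registers: index values
// below 32 read the destination register, 32 and above read the other source.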
571
572 void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
573 asm volatile(
574 "vmovdqa %3,%%ymm5 \n"
575 "vmovdqa %4,%%ymm6 \n"
576 "vmovdqa %5,%%ymm7 \n"
577
578 LABELALIGN
579 "1: \n"
580 "vmovdqu (%0),%%ymm0 \n"
581 "vmovdqu 0x20(%0),%%ymm1 \n"
582 "vmovdqu 0x40(%0),%%ymm2 \n"
583 "vmovdqu 0x60(%0),%%ymm3 \n"
584 "lea 0x80(%0),%0 \n"
585 "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
586 "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
587 "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
588 "vmovdqu %%ymm0,(%1) \n"
589 "vmovdqu %%ymm1,0x20(%1) \n"
590 "vmovdqu %%ymm2,0x40(%1) \n"
591 "lea 0x60(%1),%1 \n"
592 "sub $0x20,%2 \n"
593 "jg 1b \n"
594 "vzeroupper \n"
595 : "+r"(src), // %0
596 "+r"(dst), // %1
597 "+r"(width) // %2
598 : "m"(kPermARGBToRGB24_0), // %3
599 "m"(kPermARGBToRGB24_1), // %4
600 "m"(kPermARGBToRGB24_2) // %5
601 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
602 }
603 #endif
604
605 #ifdef HAS_ARGBTORAWROW_AVX2
606 void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
607 asm volatile(
608 "vbroadcastf128 %3,%%ymm6 \n"
609 "vmovdqa %4,%%ymm7 \n"
610
611 LABELALIGN
612 "1: \n"
613 "vmovdqu (%0),%%ymm0 \n"
614 "vmovdqu 0x20(%0),%%ymm1 \n"
615 "vmovdqu 0x40(%0),%%ymm2 \n"
616 "vmovdqu 0x60(%0),%%ymm3 \n"
617 "lea 0x80(%0),%0 \n"
618 "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
619 "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
620 "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
621 "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
622 "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
623 "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
624 "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
625 "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
626 "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
627 "vpor %%ymm4,%%ymm0,%%ymm0 \n"
628 "vmovdqu %%ymm0,(%1) \n"
629 "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
630 "vpermq $0x4f,%%ymm2,%%ymm4 \n"
631 "vpor %%ymm4,%%ymm1,%%ymm1 \n"
632 "vmovdqu %%ymm1,0x20(%1) \n"
633 "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
634 "vpermq $0x93,%%ymm3,%%ymm3 \n"
635 "vpor %%ymm3,%%ymm2,%%ymm2 \n"
636 "vmovdqu %%ymm2,0x40(%1) \n"
637 "lea 0x60(%1),%1 \n"
638 "sub $0x20,%2 \n"
639 "jg 1b \n"
640 "vzeroupper \n"
641 : "+r"(src), // %0
642 "+r"(dst), // %1
643 "+r"(width) // %2
644 : "m"(kShuffleMaskARGBToRAW), // %3
645 "m"(kPermdRGB24_AVX) // %4
646 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
647 "xmm7");
648 }
649 #endif
650
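// Convert 4 ARGB pixels per iteration to RGB565:
// out = ((R >> 3) << 11) | ((G >> 2) << 5) | (B >> 3).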
651 void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
652 asm volatile(
653 "pcmpeqb %%xmm3,%%xmm3 \n"
654 "psrld $0x1b,%%xmm3 \n"
655 "pcmpeqb %%xmm4,%%xmm4 \n"
656 "psrld $0x1a,%%xmm4 \n"
657 "pslld $0x5,%%xmm4 \n"
658 "pcmpeqb %%xmm5,%%xmm5 \n"
659 "pslld $0xb,%%xmm5 \n"
660
661 LABELALIGN
662 "1: \n"
663 "movdqu (%0),%%xmm0 \n"
664 "movdqa %%xmm0,%%xmm1 \n"
665 "movdqa %%xmm0,%%xmm2 \n"
666 "pslld $0x8,%%xmm0 \n"
667 "psrld $0x3,%%xmm1 \n"
668 "psrld $0x5,%%xmm2 \n"
669 "psrad $0x10,%%xmm0 \n"
670 "pand %%xmm3,%%xmm1 \n"
671 "pand %%xmm4,%%xmm2 \n"
672 "pand %%xmm5,%%xmm0 \n"
673 "por %%xmm2,%%xmm1 \n"
674 "por %%xmm1,%%xmm0 \n"
675 "packssdw %%xmm0,%%xmm0 \n"
676 "lea 0x10(%0),%0 \n"
677 "movq %%xmm0,(%1) \n"
678 "lea 0x8(%1),%1 \n"
679 "sub $0x4,%2 \n"
680 "jg 1b \n"
681 : "+r"(src), // %0
682 "+r"(dst), // %1
683 "+r"(width) // %2
684 ::"memory",
685 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
686 }
687
688 void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
689 uint8_t* dst,
690 const uint32_t dither4,
691 int width) {
692 asm volatile(
693 "movd %3,%%xmm6 \n"
694 "punpcklbw %%xmm6,%%xmm6 \n"
695 "movdqa %%xmm6,%%xmm7 \n"
696 "punpcklwd %%xmm6,%%xmm6 \n"
697 "punpckhwd %%xmm7,%%xmm7 \n"
698 "pcmpeqb %%xmm3,%%xmm3 \n"
699 "psrld $0x1b,%%xmm3 \n"
700 "pcmpeqb %%xmm4,%%xmm4 \n"
701 "psrld $0x1a,%%xmm4 \n"
702 "pslld $0x5,%%xmm4 \n"
703 "pcmpeqb %%xmm5,%%xmm5 \n"
704 "pslld $0xb,%%xmm5 \n"
705
706 LABELALIGN
707 "1: \n"
708 "movdqu (%0),%%xmm0 \n"
709 "paddusb %%xmm6,%%xmm0 \n"
710 "movdqa %%xmm0,%%xmm1 \n"
711 "movdqa %%xmm0,%%xmm2 \n"
712 "pslld $0x8,%%xmm0 \n"
713 "psrld $0x3,%%xmm1 \n"
714 "psrld $0x5,%%xmm2 \n"
715 "psrad $0x10,%%xmm0 \n"
716 "pand %%xmm3,%%xmm1 \n"
717 "pand %%xmm4,%%xmm2 \n"
718 "pand %%xmm5,%%xmm0 \n"
719 "por %%xmm2,%%xmm1 \n"
720 "por %%xmm1,%%xmm0 \n"
721 "packssdw %%xmm0,%%xmm0 \n"
722 "lea 0x10(%0),%0 \n"
723 "movq %%xmm0,(%1) \n"
724 "lea 0x8(%1),%1 \n"
725 "sub $0x4,%2 \n"
726 "jg 1b \n"
727 : "+r"(src), // %0
728 "+r"(dst), // %1
729 "+r"(width) // %2
730 : "m"(dither4) // %3
731 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
732 "xmm7");
733 }
734
735 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
736 void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
737 uint8_t* dst,
738 const uint32_t dither4,
739 int width) {
740 asm volatile(
741 "vbroadcastss %3,%%xmm6 \n"
742 "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
743 "vpermq $0xd8,%%ymm6,%%ymm6 \n"
744 "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
745 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
746 "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
747 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
748 "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
749 "vpslld $0x5,%%ymm4,%%ymm4 \n"
750 "vpslld $0xb,%%ymm3,%%ymm5 \n"
751
752 LABELALIGN
753 "1: \n"
754 "vmovdqu (%0),%%ymm0 \n"
755 "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
756 "vpsrld $0x5,%%ymm0,%%ymm2 \n"
757 "vpsrld $0x3,%%ymm0,%%ymm1 \n"
758 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
759 "vpand %%ymm4,%%ymm2,%%ymm2 \n"
760 "vpand %%ymm3,%%ymm1,%%ymm1 \n"
761 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
762 "vpor %%ymm2,%%ymm1,%%ymm1 \n"
763 "vpor %%ymm1,%%ymm0,%%ymm0 \n"
764 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
765 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
766 "lea 0x20(%0),%0 \n"
767 "vmovdqu %%xmm0,(%1) \n"
768 "lea 0x10(%1),%1 \n"
769 "sub $0x8,%2 \n"
770 "jg 1b \n"
771 "vzeroupper \n"
772 : "+r"(src), // %0
773 "+r"(dst), // %1
774 "+r"(width) // %2
775 : "m"(dither4) // %3
776 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
777 "xmm7");
778 }
779 #endif // HAS_ARGBTORGB565DITHERROW_AVX2
780
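// Convert 4 ARGB pixels per iteration to ARGB1555:
// out = ((A >> 7) << 15) | ((R >> 3) << 10) | ((G >> 3) << 5) | (B >> 3).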
781 void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
782 asm volatile(
783 "pcmpeqb %%xmm4,%%xmm4 \n"
784 "psrld $0x1b,%%xmm4 \n"
785 "movdqa %%xmm4,%%xmm5 \n"
786 "pslld $0x5,%%xmm5 \n"
787 "movdqa %%xmm4,%%xmm6 \n"
788 "pslld $0xa,%%xmm6 \n"
789 "pcmpeqb %%xmm7,%%xmm7 \n"
790 "pslld $0xf,%%xmm7 \n"
791
792 LABELALIGN
793 "1: \n"
794 "movdqu (%0),%%xmm0 \n"
795 "movdqa %%xmm0,%%xmm1 \n"
796 "movdqa %%xmm0,%%xmm2 \n"
797 "movdqa %%xmm0,%%xmm3 \n"
798 "psrad $0x10,%%xmm0 \n"
799 "psrld $0x3,%%xmm1 \n"
800 "psrld $0x6,%%xmm2 \n"
801 "psrld $0x9,%%xmm3 \n"
802 "pand %%xmm7,%%xmm0 \n"
803 "pand %%xmm4,%%xmm1 \n"
804 "pand %%xmm5,%%xmm2 \n"
805 "pand %%xmm6,%%xmm3 \n"
806 "por %%xmm1,%%xmm0 \n"
807 "por %%xmm3,%%xmm2 \n"
808 "por %%xmm2,%%xmm0 \n"
809 "packssdw %%xmm0,%%xmm0 \n"
810 "lea 0x10(%0),%0 \n"
811 "movq %%xmm0,(%1) \n"
812 "lea 0x8(%1),%1 \n"
813 "sub $0x4,%2 \n"
814 "jg 1b \n"
815 : "+r"(src), // %0
816 "+r"(dst), // %1
817 "+r"(width) // %2
818 ::"memory",
819 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
820 }
821
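// Convert 4 ARGB pixels per iteration to ARGB4444:
// out = ((A >> 4) << 12) | ((R >> 4) << 8) | ((G >> 4) << 4) | (B >> 4).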
822 void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
823 asm volatile(
824 "pcmpeqb %%xmm4,%%xmm4 \n"
825 "psllw $0xc,%%xmm4 \n"
826 "movdqa %%xmm4,%%xmm3 \n"
827 "psrlw $0x8,%%xmm3 \n"
828
829 LABELALIGN
830 "1: \n"
831 "movdqu (%0),%%xmm0 \n"
832 "movdqa %%xmm0,%%xmm1 \n"
833 "pand %%xmm3,%%xmm0 \n"
834 "pand %%xmm4,%%xmm1 \n"
835 "psrlq $0x4,%%xmm0 \n"
836 "psrlq $0x8,%%xmm1 \n"
837 "por %%xmm1,%%xmm0 \n"
838 "packuswb %%xmm0,%%xmm0 \n"
839 "lea 0x10(%0),%0 \n"
840 "movq %%xmm0,(%1) \n"
841 "lea 0x8(%1),%1 \n"
842 "sub $0x4,%2 \n"
843 "jg 1b \n"
844 : "+r"(src), // %0
845 "+r"(dst), // %1
846 "+r"(width) // %2
847 ::"memory",
848 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
849 }
850 #endif // HAS_RGB24TOARGBROW_SSSE3
851
852 /*
853
854 ARGBToAR30Row:
855
856 Red Blue
857 With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
858 produce a 10 bit value in the low 10 bits of each 16 bit value. This is what is
859 wanted for the blue channel. The red needs to be shifted 4 left, so multiply by
860 (1024+4)*16 for red.
861
862 Alpha Green
863 Alpha and Green are already in the high bits, so vpand can zero out the other
864 bits, keeping just the 2 upper bits of alpha and the 8 bit green. The same
865 multiplier, (1024+4), could be used for Green, putting the 10 bit green in the
866 lsb. Alpha needs only a simple multiplier to shift it into position; it wants a
867 gap of 10 above the green. Green is 10 bits, so there are 6 bits in the low
868 short. 4 more are needed, so a multiplier of 4 gets the 2 alpha bits into the
869 upper 16 bits, and a further shift of 4 is a multiply by 16, so (4*16) = 64.
870 Then shift the result left 10 to position the A and G channels.
871 */
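// A scalar sketch of the math above (hypothetical helper, not part of the
// library), assuming little-endian ARGB input. Widening 8 bits to 10 by
// replicating the top 2 bits equals (v * 1028) >> 8, which is exactly what
// pmulhuw by 1028 computes when v sits in the high byte of a 16 bit lane.
static inline uint32_t ARGBPixelToAR30(uint32_t b,
                                       uint32_t g,
                                       uint32_t r,
                                       uint32_t a) {
  uint32_t b10 = (b << 2) | (b >> 6);  // 10 bit blue
  uint32_t g10 = (g << 2) | (g >> 6);  // 10 bit green
  uint32_t r10 = (r << 2) | (r >> 6);  // 10 bit red
  uint32_t a2 = a >> 6;                // 2 bit alpha, truncated
  return b10 | (g10 << 10) | (r10 << 20) | (a2 << 30);
}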
872
873 // Shuffle tables to place B and R in the high byte of each 16 bit lane for AR30.
874 static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
875 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
876
877 static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
878 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
879
880 static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
881 static const uint32_t kMaskRB10 = 0x3ff003ff;
882 static const uint32_t kMaskAG10 = 0xc000ff00;
883 static const uint32_t kMulAG10 = 64 * 65536 + 1028;
884
885 void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
886 asm volatile(
887 "movdqa %3,%%xmm2 \n" // shuffler for RB
888 "movd %4,%%xmm3 \n" // multipler for RB
889 "movd %5,%%xmm4 \n" // mask for R10 B10
890 "movd %6,%%xmm5 \n" // mask for AG
891 "movd %7,%%xmm6 \n" // multipler for AG
892 "pshufd $0x0,%%xmm3,%%xmm3 \n"
893 "pshufd $0x0,%%xmm4,%%xmm4 \n"
894 "pshufd $0x0,%%xmm5,%%xmm5 \n"
895 "pshufd $0x0,%%xmm6,%%xmm6 \n"
896 "sub %0,%1 \n"
897
898 "1: \n"
899 "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
900 "movdqa %%xmm0,%%xmm1 \n"
901 "pshufb %%xmm2,%%xmm1 \n" // R0B0
902 "pand %%xmm5,%%xmm0 \n" // A0G0
903 "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
904 "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
905 "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
906 "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
907 "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
908 "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
909 "add $0x10,%0 \n"
910 "sub $0x4,%2 \n"
911 "jg 1b \n"
912
913 : "+r"(src), // %0
914 "+r"(dst), // %1
915 "+r"(width) // %2
916 : "m"(kShuffleRB30), // %3
917 "m"(kMulRB10), // %4
918 "m"(kMaskRB10), // %5
919 "m"(kMaskAG10), // %6
920 "m"(kMulAG10) // %7
921 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
922 }
923
924 void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
925 asm volatile(
926 "movdqa %3,%%xmm2 \n" // shuffler for RB
927 "movd %4,%%xmm3 \n" // multipler for RB
928 "movd %5,%%xmm4 \n" // mask for R10 B10
929 "movd %6,%%xmm5 \n" // mask for AG
930 "movd %7,%%xmm6 \n" // multipler for AG
931 "pshufd $0x0,%%xmm3,%%xmm3 \n"
932 "pshufd $0x0,%%xmm4,%%xmm4 \n"
933 "pshufd $0x0,%%xmm5,%%xmm5 \n"
934 "pshufd $0x0,%%xmm6,%%xmm6 \n"
935 "sub %0,%1 \n"
936
937 "1: \n"
938 "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
939 "movdqa %%xmm0,%%xmm1 \n"
940 "pshufb %%xmm2,%%xmm1 \n" // R0B0
941 "pand %%xmm5,%%xmm0 \n" // A0G0
942 "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
943 "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
944 "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
945 "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
946 "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
947 "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
948 "add $0x10,%0 \n"
949 "sub $0x4,%2 \n"
950 "jg 1b \n"
951
952 : "+r"(src), // %0
953 "+r"(dst), // %1
954 "+r"(width) // %2
955 : "m"(kShuffleBR30), // %3 reversed shuffler
956 "m"(kMulRB10), // %4
957 "m"(kMaskRB10), // %5
958 "m"(kMaskAG10), // %6
959 "m"(kMulAG10) // %7
960 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
961 }
962
963 #ifdef HAS_ARGBTOAR30ROW_AVX2
964 void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
965 asm volatile(
966 "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
967 "vbroadcastss %4,%%ymm3 \n" // multipler for RB
968 "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
969 "vbroadcastss %6,%%ymm5 \n" // mask for AG
970 "vbroadcastss %7,%%ymm6 \n" // multipler for AG
971 "sub %0,%1 \n"
972
973 "1: \n"
974 "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
975 "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
976 "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
977 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
978 "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
979 "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
980 "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
981 "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
982 "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
983 "add $0x20,%0 \n"
984 "sub $0x8,%2 \n"
985 "jg 1b \n"
986 "vzeroupper \n"
987
988 : "+r"(src), // %0
989 "+r"(dst), // %1
990 "+r"(width) // %2
991 : "m"(kShuffleRB30), // %3
992 "m"(kMulRB10), // %4
993 "m"(kMaskRB10), // %5
994 "m"(kMaskAG10), // %6
995 "m"(kMulAG10) // %7
996 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
997 }
998 #endif
999
1000 #ifdef HAS_ABGRTOAR30ROW_AVX2
1001 void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
1002 asm volatile(
1003 "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
1004 "vbroadcastss %4,%%ymm3 \n" // multipler for RB
1005 "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
1006 "vbroadcastss %6,%%ymm5 \n" // mask for AG
1007 "vbroadcastss %7,%%ymm6 \n" // multipler for AG
1008 "sub %0,%1 \n"
1009
1010 "1: \n"
1011 "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
1012 "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
1013 "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
1014 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
1015 "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
1016 "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
1017 "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
1018 "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
1019 "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
1020 "add $0x20,%0 \n"
1021 "sub $0x8,%2 \n"
1022 "jg 1b \n"
1023 "vzeroupper \n"
1024
1025 : "+r"(src), // %0
1026 "+r"(dst), // %1
1027 "+r"(width) // %2
1028 : "m"(kShuffleBR30), // %3 reversed shuffler
1029 "m"(kMulRB10), // %4
1030 "m"(kMaskRB10), // %5
1031 "m"(kMaskAG10), // %6
1032 "m"(kMulAG10) // %7
1033 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1034 }
1035 #endif
1036
1037 #ifdef HAS_ARGBTOYROW_SSSE3
1038 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
1039 void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1040 asm volatile(
1041 "movdqa %3,%%xmm4 \n"
1042 "movdqa %4,%%xmm5 \n"
1043
1044 LABELALIGN
1045 "1: \n"
1046 "movdqu (%0),%%xmm0 \n"
1047 "movdqu 0x10(%0),%%xmm1 \n"
1048 "movdqu 0x20(%0),%%xmm2 \n"
1049 "movdqu 0x30(%0),%%xmm3 \n"
1050 "pmaddubsw %%xmm4,%%xmm0 \n"
1051 "pmaddubsw %%xmm4,%%xmm1 \n"
1052 "pmaddubsw %%xmm4,%%xmm2 \n"
1053 "pmaddubsw %%xmm4,%%xmm3 \n"
1054 "lea 0x40(%0),%0 \n"
1055 "phaddw %%xmm1,%%xmm0 \n"
1056 "phaddw %%xmm3,%%xmm2 \n"
1057 "psrlw $0x7,%%xmm0 \n"
1058 "psrlw $0x7,%%xmm2 \n"
1059 "packuswb %%xmm2,%%xmm0 \n"
1060 "paddb %%xmm5,%%xmm0 \n"
1061 "movdqu %%xmm0,(%1) \n"
1062 "lea 0x10(%1),%1 \n"
1063 "sub $0x10,%2 \n"
1064 "jg 1b \n"
1065 : "+r"(src_argb), // %0
1066 "+r"(dst_y), // %1
1067 "+r"(width) // %2
1068 : "m"(kARGBToY), // %3
1069 "m"(kAddY16) // %4
1070 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1071 }
1072 #endif // HAS_ARGBTOYROW_SSSE3
1073
1074 #ifdef HAS_ARGBTOYJROW_SSSE3
1075 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
1076 // Same as ARGBToYRow but with different coefficients, no add of 16, and rounding.
1077 void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1078 asm volatile(
1079 "movdqa %3,%%xmm4 \n"
1080 "movdqa %4,%%xmm5 \n"
1081
1082 LABELALIGN
1083 "1: \n"
1084 "movdqu (%0),%%xmm0 \n"
1085 "movdqu 0x10(%0),%%xmm1 \n"
1086 "movdqu 0x20(%0),%%xmm2 \n"
1087 "movdqu 0x30(%0),%%xmm3 \n"
1088 "pmaddubsw %%xmm4,%%xmm0 \n"
1089 "pmaddubsw %%xmm4,%%xmm1 \n"
1090 "pmaddubsw %%xmm4,%%xmm2 \n"
1091 "pmaddubsw %%xmm4,%%xmm3 \n"
1092 "lea 0x40(%0),%0 \n"
1093 "phaddw %%xmm1,%%xmm0 \n"
1094 "phaddw %%xmm3,%%xmm2 \n"
1095 "paddw %%xmm5,%%xmm0 \n"
1096 "paddw %%xmm5,%%xmm2 \n"
1097 "psrlw $0x7,%%xmm0 \n"
1098 "psrlw $0x7,%%xmm2 \n"
1099 "packuswb %%xmm2,%%xmm0 \n"
1100 "movdqu %%xmm0,(%1) \n"
1101 "lea 0x10(%1),%1 \n"
1102 "sub $0x10,%2 \n"
1103 "jg 1b \n"
1104 : "+r"(src_argb), // %0
1105 "+r"(dst_y), // %1
1106 "+r"(width) // %2
1107 : "m"(kARGBToYJ), // %3
1108 "m"(kAddYJ64) // %4
1109 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1110 }
1111 #endif // HAS_ARGBTOYJROW_SSSE3
1112
1113 #ifdef HAS_ARGBTOYROW_AVX2
1114 // vpermd to undo the lane interleaving of vphaddw + vpackuswb.
1115 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
1116
1117 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1118 void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1119 asm volatile(
1120 "vbroadcastf128 %3,%%ymm4 \n"
1121 "vbroadcastf128 %4,%%ymm5 \n"
1122 "vmovdqu %5,%%ymm6 \n"
1123
1124 LABELALIGN
1125 "1: \n"
1126 "vmovdqu (%0),%%ymm0 \n"
1127 "vmovdqu 0x20(%0),%%ymm1 \n"
1128 "vmovdqu 0x40(%0),%%ymm2 \n"
1129 "vmovdqu 0x60(%0),%%ymm3 \n"
1130 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
1131 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
1132 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
1133 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
1134 "lea 0x80(%0),%0 \n"
1135 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
1136 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
1137 "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
1138 "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
1139 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
1140 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
1141 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
1142 "vmovdqu %%ymm0,(%1) \n"
1143 "lea 0x20(%1),%1 \n"
1144 "sub $0x20,%2 \n"
1145 "jg 1b \n"
1146 "vzeroupper \n"
1147 : "+r"(src_argb), // %0
1148 "+r"(dst_y), // %1
1149 "+r"(width) // %2
1150 : "m"(kARGBToY), // %3
1151 "m"(kAddY16), // %4
1152 "m"(kPermdARGBToY_AVX) // %5
1153 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1154 }
1155 #endif // HAS_ARGBTOYROW_AVX2
1156
1157 #ifdef HAS_ARGBTOYJROW_AVX2
1158 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1159 void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1160 asm volatile(
1161 "vbroadcastf128 %3,%%ymm4 \n"
1162 "vbroadcastf128 %4,%%ymm5 \n"
1163 "vmovdqu %5,%%ymm6 \n"
1164
1165 LABELALIGN
1166 "1: \n"
1167 "vmovdqu (%0),%%ymm0 \n"
1168 "vmovdqu 0x20(%0),%%ymm1 \n"
1169 "vmovdqu 0x40(%0),%%ymm2 \n"
1170 "vmovdqu 0x60(%0),%%ymm3 \n"
1171 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
1172 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
1173 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
1174 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
1175 "lea 0x80(%0),%0 \n"
1176 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
1177 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
1178 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
1179 "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
1180 "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
1181 "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
1182 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
1183 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
1184 "vmovdqu %%ymm0,(%1) \n"
1185 "lea 0x20(%1),%1 \n"
1186 "sub $0x20,%2 \n"
1187 "jg 1b \n"
1188 "vzeroupper \n"
1189 : "+r"(src_argb), // %0
1190 "+r"(dst_y), // %1
1191 "+r"(width) // %2
1192 : "m"(kARGBToYJ), // %3
1193 "m"(kAddYJ64), // %4
1194 "m"(kPermdARGBToY_AVX) // %5
1195 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1196 }
1197 #endif // HAS_ARGBTOYJROW_AVX2
1198
1199 #ifdef HAS_ARGBTOUVROW_SSSE3
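// Convert 16 ARGB pixels from each of two rows to 8 U and 8 V values:
// 2x2 blocks are averaged with pavgb, then pmaddubsw applies the chroma
// coefficients and the biased results are split into the U and V planes.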
1200 void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
1201 int src_stride_argb,
1202 uint8_t* dst_u,
1203 uint8_t* dst_v,
1204 int width) {
1205 asm volatile(
1206 "movdqa %5,%%xmm3 \n"
1207 "movdqa %6,%%xmm4 \n"
1208 "movdqa %7,%%xmm5 \n"
1209 "sub %1,%2 \n"
1210
1211 LABELALIGN
1212 "1: \n"
1213 "movdqu (%0),%%xmm0 \n"
1214 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
1215 "pavgb %%xmm7,%%xmm0 \n"
1216 "movdqu 0x10(%0),%%xmm1 \n"
1217 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1218 "pavgb %%xmm7,%%xmm1 \n"
1219 "movdqu 0x20(%0),%%xmm2 \n"
1220 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1221 "pavgb %%xmm7,%%xmm2 \n"
1222 "movdqu 0x30(%0),%%xmm6 \n"
1223 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1224 "pavgb %%xmm7,%%xmm6 \n"
1225
1226 "lea 0x40(%0),%0 \n"
1227 "movdqa %%xmm0,%%xmm7 \n"
1228 "shufps $0x88,%%xmm1,%%xmm0 \n"
1229 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1230 "pavgb %%xmm7,%%xmm0 \n"
1231 "movdqa %%xmm2,%%xmm7 \n"
1232 "shufps $0x88,%%xmm6,%%xmm2 \n"
1233 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1234 "pavgb %%xmm7,%%xmm2 \n"
1235 "movdqa %%xmm0,%%xmm1 \n"
1236 "movdqa %%xmm2,%%xmm6 \n"
1237 "pmaddubsw %%xmm4,%%xmm0 \n"
1238 "pmaddubsw %%xmm4,%%xmm2 \n"
1239 "pmaddubsw %%xmm3,%%xmm1 \n"
1240 "pmaddubsw %%xmm3,%%xmm6 \n"
1241 "phaddw %%xmm2,%%xmm0 \n"
1242 "phaddw %%xmm6,%%xmm1 \n"
1243 "psraw $0x8,%%xmm0 \n"
1244 "psraw $0x8,%%xmm1 \n"
1245 "packsswb %%xmm1,%%xmm0 \n"
1246 "paddb %%xmm5,%%xmm0 \n"
1247 "movlps %%xmm0,(%1) \n"
1248 "movhps %%xmm0,0x00(%1,%2,1) \n"
1249 "lea 0x8(%1),%1 \n"
1250 "sub $0x10,%3 \n"
1251 "jg 1b \n"
1252 : "+r"(src_argb0), // %0
1253 "+r"(dst_u), // %1
1254 "+r"(dst_v), // %2
1255 "+rm"(width) // %3
1256 : "r"((intptr_t)(src_stride_argb)), // %4
1257 "m"(kARGBToV), // %5
1258 "m"(kARGBToU), // %6
1259 "m"(kAddUV128) // %7
1260 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1261 }
1262 #endif // HAS_ARGBTOUVROW_SSSE3
1263
1264 #ifdef HAS_ARGBTOUVROW_AVX2
1265 // vpshufb to reorder the output of vphaddw + vpackuswb into packed shorts.
1266 static const lvec8 kShufARGBToUV_AVX = {
1267 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
1268 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
1269 void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
1270 int src_stride_argb,
1271 uint8_t* dst_u,
1272 uint8_t* dst_v,
1273 int width) {
1274 asm volatile(
1275 "vbroadcastf128 %5,%%ymm5 \n"
1276 "vbroadcastf128 %6,%%ymm6 \n"
1277 "vbroadcastf128 %7,%%ymm7 \n"
1278 "sub %1,%2 \n"
1279
1280 LABELALIGN
1281 "1: \n"
1282 "vmovdqu (%0),%%ymm0 \n"
1283 "vmovdqu 0x20(%0),%%ymm1 \n"
1284 "vmovdqu 0x40(%0),%%ymm2 \n"
1285 "vmovdqu 0x60(%0),%%ymm3 \n"
1286 "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
1287 "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
1288 "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
1289 "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
1290 "lea 0x80(%0),%0 \n"
1291 "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
1292 "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
1293 "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
1294 "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
1295 "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
1296 "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
1297
1298 "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
1299 "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
1300 "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
1301 "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
1302 "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
1303 "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
1304 "vpsraw $0x8,%%ymm1,%%ymm1 \n"
1305 "vpsraw $0x8,%%ymm0,%%ymm0 \n"
1306 "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
1307 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
1308 "vpshufb %8,%%ymm0,%%ymm0 \n"
1309 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
1310
1311 "vextractf128 $0x0,%%ymm0,(%1) \n"
1312 "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
1313 "lea 0x10(%1),%1 \n"
1314 "sub $0x20,%3 \n"
1315 "jg 1b \n"
1316 "vzeroupper \n"
1317 : "+r"(src_argb0), // %0
1318 "+r"(dst_u), // %1
1319 "+r"(dst_v), // %2
1320 "+rm"(width) // %3
1321 : "r"((intptr_t)(src_stride_argb)), // %4
1322 "m"(kAddUV128), // %5
1323 "m"(kARGBToV), // %6
1324 "m"(kARGBToU), // %7
1325 "m"(kShufARGBToUV_AVX) // %8
1326 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1327 "xmm7");
1328 }
1329 #endif // HAS_ARGBTOUVROW_AVX2
1330
1331 #ifdef HAS_ARGBTOUVJROW_AVX2
1332 void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
1333 int src_stride_argb,
1334 uint8_t* dst_u,
1335 uint8_t* dst_v,
1336 int width) {
1337 asm volatile(
1338 "vbroadcastf128 %5,%%ymm5 \n"
1339 "vbroadcastf128 %6,%%ymm6 \n"
1340 "vbroadcastf128 %7,%%ymm7 \n"
1341 "sub %1,%2 \n"
1342
1343 LABELALIGN
1344 "1: \n"
1345 "vmovdqu (%0),%%ymm0 \n"
1346 "vmovdqu 0x20(%0),%%ymm1 \n"
1347 "vmovdqu 0x40(%0),%%ymm2 \n"
1348 "vmovdqu 0x60(%0),%%ymm3 \n"
1349 "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
1350 "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
1351 "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
1352 "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
1353 "lea 0x80(%0),%0 \n"
1354 "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
1355 "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
1356 "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
1357 "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
1358 "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
1359 "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
1360
1361 "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
1362 "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
1363 "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
1364 "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
1365 "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
1366 "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
1367 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
1368 "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
1369 "vpsraw $0x8,%%ymm1,%%ymm1 \n"
1370 "vpsraw $0x8,%%ymm0,%%ymm0 \n"
1371 "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
1372 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
1373 "vpshufb %8,%%ymm0,%%ymm0 \n"
1374
1375 "vextractf128 $0x0,%%ymm0,(%1) \n"
1376 "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
1377 "lea 0x10(%1),%1 \n"
1378 "sub $0x20,%3 \n"
1379 "jg 1b \n"
1380 "vzeroupper \n"
1381 : "+r"(src_argb0), // %0
1382 "+r"(dst_u), // %1
1383 "+r"(dst_v), // %2
1384 "+rm"(width) // %3
1385 : "r"((intptr_t)(src_stride_argb)), // %4
1386 "m"(kAddUVJ128), // %5
1387 "m"(kARGBToVJ), // %6
1388 "m"(kARGBToUJ), // %7
1389 "m"(kShufARGBToUV_AVX) // %8
1390 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1391 "xmm7");
1392 }
1393 #endif // HAS_ARGBTOUVJROW_AVX2
1394
1395 #ifdef HAS_ARGBTOUVJROW_SSSE3
1396 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
1397 int src_stride_argb,
1398 uint8_t* dst_u,
1399 uint8_t* dst_v,
1400 int width) {
1401 asm volatile(
1402 "movdqa %5,%%xmm3 \n"
1403 "movdqa %6,%%xmm4 \n"
1404 "movdqa %7,%%xmm5 \n"
1405 "sub %1,%2 \n"
1406
1407 LABELALIGN
1408 "1: \n"
1409 "movdqu (%0),%%xmm0 \n"
1410 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
1411 "pavgb %%xmm7,%%xmm0 \n"
1412 "movdqu 0x10(%0),%%xmm1 \n"
1413 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1414 "pavgb %%xmm7,%%xmm1 \n"
1415 "movdqu 0x20(%0),%%xmm2 \n"
1416 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1417 "pavgb %%xmm7,%%xmm2 \n"
1418 "movdqu 0x30(%0),%%xmm6 \n"
1419 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1420 "pavgb %%xmm7,%%xmm6 \n"
1421
1422 "lea 0x40(%0),%0 \n"
1423 "movdqa %%xmm0,%%xmm7 \n"
1424 "shufps $0x88,%%xmm1,%%xmm0 \n"
1425 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1426 "pavgb %%xmm7,%%xmm0 \n"
1427 "movdqa %%xmm2,%%xmm7 \n"
1428 "shufps $0x88,%%xmm6,%%xmm2 \n"
1429 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1430 "pavgb %%xmm7,%%xmm2 \n"
1431 "movdqa %%xmm0,%%xmm1 \n"
1432 "movdqa %%xmm2,%%xmm6 \n"
1433 "pmaddubsw %%xmm4,%%xmm0 \n"
1434 "pmaddubsw %%xmm4,%%xmm2 \n"
1435 "pmaddubsw %%xmm3,%%xmm1 \n"
1436 "pmaddubsw %%xmm3,%%xmm6 \n"
1437 "phaddw %%xmm2,%%xmm0 \n"
1438 "phaddw %%xmm6,%%xmm1 \n"
1439 "paddw %%xmm5,%%xmm0 \n"
1440 "paddw %%xmm5,%%xmm1 \n"
1441 "psraw $0x8,%%xmm0 \n"
1442 "psraw $0x8,%%xmm1 \n"
1443 "packsswb %%xmm1,%%xmm0 \n"
1444 "movlps %%xmm0,(%1) \n"
1445 "movhps %%xmm0,0x00(%1,%2,1) \n"
1446 "lea 0x8(%1),%1 \n"
1447 "sub $0x10,%3 \n"
1448 "jg 1b \n"
1449 : "+r"(src_argb0), // %0
1450 "+r"(dst_u), // %1
1451 "+r"(dst_v), // %2
1452 "+rm"(width) // %3
1453 : "r"((intptr_t)(src_stride_argb)), // %4
1454 "m"(kARGBToVJ), // %5
1455 "m"(kARGBToUJ), // %6
1456 "m"(kAddUVJ128) // %7
1457 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1458 }
1459 #endif // HAS_ARGBTOUVJROW_SSSE3
1460
1461 #ifdef HAS_ARGBTOUV444ROW_SSSE3
1462 void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
1463 uint8_t* dst_u,
1464 uint8_t* dst_v,
1465 int width) {
1466 asm volatile(
1467 "movdqa %4,%%xmm3 \n"
1468 "movdqa %5,%%xmm4 \n"
1469 "movdqa %6,%%xmm5 \n"
1470 "sub %1,%2 \n"
1471
1472 LABELALIGN
1473 "1: \n"
1474 "movdqu (%0),%%xmm0 \n"
1475 "movdqu 0x10(%0),%%xmm1 \n"
1476 "movdqu 0x20(%0),%%xmm2 \n"
1477 "movdqu 0x30(%0),%%xmm6 \n"
1478 "pmaddubsw %%xmm4,%%xmm0 \n"
1479 "pmaddubsw %%xmm4,%%xmm1 \n"
1480 "pmaddubsw %%xmm4,%%xmm2 \n"
1481 "pmaddubsw %%xmm4,%%xmm6 \n"
1482 "phaddw %%xmm1,%%xmm0 \n"
1483 "phaddw %%xmm6,%%xmm2 \n"
1484 "psraw $0x8,%%xmm0 \n"
1485 "psraw $0x8,%%xmm2 \n"
1486 "packsswb %%xmm2,%%xmm0 \n"
1487 "paddb %%xmm5,%%xmm0 \n"
1488 "movdqu %%xmm0,(%1) \n"
1489 "movdqu (%0),%%xmm0 \n"
1490 "movdqu 0x10(%0),%%xmm1 \n"
1491 "movdqu 0x20(%0),%%xmm2 \n"
1492 "movdqu 0x30(%0),%%xmm6 \n"
1493 "pmaddubsw %%xmm3,%%xmm0 \n"
1494 "pmaddubsw %%xmm3,%%xmm1 \n"
1495 "pmaddubsw %%xmm3,%%xmm2 \n"
1496 "pmaddubsw %%xmm3,%%xmm6 \n"
1497 "phaddw %%xmm1,%%xmm0 \n"
1498 "phaddw %%xmm6,%%xmm2 \n"
1499 "psraw $0x8,%%xmm0 \n"
1500 "psraw $0x8,%%xmm2 \n"
1501 "packsswb %%xmm2,%%xmm0 \n"
1502 "paddb %%xmm5,%%xmm0 \n"
1503 "lea 0x40(%0),%0 \n"
1504 "movdqu %%xmm0,0x00(%1,%2,1) \n"
1505 "lea 0x10(%1),%1 \n"
1506 "sub $0x10,%3 \n"
1507 "jg 1b \n"
1508 : "+r"(src_argb), // %0
1509 "+r"(dst_u), // %1
1510 "+r"(dst_v), // %2
1511 "+rm"(width) // %3
1512 : "m"(kARGBToV), // %4
1513 "m"(kARGBToU), // %5
1514 "m"(kAddUV128) // %6
1515 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
1516 }
1517 #endif // HAS_ARGBTOUV444ROW_SSSE3
1518
1519 void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
1520 asm volatile(
1521 "movdqa %4,%%xmm5 \n"
1522 "movdqa %3,%%xmm4 \n"
1523
1524 LABELALIGN
1525 "1: \n"
1526 "movdqu (%0),%%xmm0 \n"
1527 "movdqu 0x10(%0),%%xmm1 \n"
1528 "movdqu 0x20(%0),%%xmm2 \n"
1529 "movdqu 0x30(%0),%%xmm3 \n"
1530 "pmaddubsw %%xmm4,%%xmm0 \n"
1531 "pmaddubsw %%xmm4,%%xmm1 \n"
1532 "pmaddubsw %%xmm4,%%xmm2 \n"
1533 "pmaddubsw %%xmm4,%%xmm3 \n"
1534 "lea 0x40(%0),%0 \n"
1535 "phaddw %%xmm1,%%xmm0 \n"
1536 "phaddw %%xmm3,%%xmm2 \n"
1537 "psrlw $0x7,%%xmm0 \n"
1538 "psrlw $0x7,%%xmm2 \n"
1539 "packuswb %%xmm2,%%xmm0 \n"
1540 "paddb %%xmm5,%%xmm0 \n"
1541 "movdqu %%xmm0,(%1) \n"
1542 "lea 0x10(%1),%1 \n"
1543 "sub $0x10,%2 \n"
1544 "jg 1b \n"
1545 : "+r"(src_bgra), // %0
1546 "+r"(dst_y), // %1
1547 "+r"(width) // %2
1548 : "m"(kBGRAToY), // %3
1549 "m"(kAddY16) // %4
1550 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1551 }
1552
1553 void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
1554 int src_stride_bgra,
1555 uint8_t* dst_u,
1556 uint8_t* dst_v,
1557 int width) {
1558 asm volatile(
1559 "movdqa %5,%%xmm3 \n"
1560 "movdqa %6,%%xmm4 \n"
1561 "movdqa %7,%%xmm5 \n"
1562 "sub %1,%2 \n"
1563
1564 LABELALIGN
1565 "1: \n"
1566 "movdqu (%0),%%xmm0 \n"
1567 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
1568 "pavgb %%xmm7,%%xmm0 \n"
1569 "movdqu 0x10(%0),%%xmm1 \n"
1570 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1571 "pavgb %%xmm7,%%xmm1 \n"
1572 "movdqu 0x20(%0),%%xmm2 \n"
1573 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1574 "pavgb %%xmm7,%%xmm2 \n"
1575 "movdqu 0x30(%0),%%xmm6 \n"
1576 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1577 "pavgb %%xmm7,%%xmm6 \n"
1578
1579 "lea 0x40(%0),%0 \n"
1580 "movdqa %%xmm0,%%xmm7 \n"
1581 "shufps $0x88,%%xmm1,%%xmm0 \n"
1582 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1583 "pavgb %%xmm7,%%xmm0 \n"
1584 "movdqa %%xmm2,%%xmm7 \n"
1585 "shufps $0x88,%%xmm6,%%xmm2 \n"
1586 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1587 "pavgb %%xmm7,%%xmm2 \n"
1588 "movdqa %%xmm0,%%xmm1 \n"
1589 "movdqa %%xmm2,%%xmm6 \n"
1590 "pmaddubsw %%xmm4,%%xmm0 \n"
1591 "pmaddubsw %%xmm4,%%xmm2 \n"
1592 "pmaddubsw %%xmm3,%%xmm1 \n"
1593 "pmaddubsw %%xmm3,%%xmm6 \n"
1594 "phaddw %%xmm2,%%xmm0 \n"
1595 "phaddw %%xmm6,%%xmm1 \n"
1596 "psraw $0x8,%%xmm0 \n"
1597 "psraw $0x8,%%xmm1 \n"
1598 "packsswb %%xmm1,%%xmm0 \n"
1599 "paddb %%xmm5,%%xmm0 \n"
1600 "movlps %%xmm0,(%1) \n"
1601 "movhps %%xmm0,0x00(%1,%2,1) \n"
1602 "lea 0x8(%1),%1 \n"
1603 "sub $0x10,%3 \n"
1604 "jg 1b \n"
1605 : "+r"(src_bgra0), // %0
1606 "+r"(dst_u), // %1
1607 "+r"(dst_v), // %2
1608 "+rm"(width) // %3
1609 : "r"((intptr_t)(src_stride_bgra)), // %4
1610 "m"(kBGRAToV), // %5
1611 "m"(kBGRAToU), // %6
1612 "m"(kAddUV128) // %7
1613 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1614 }
1615
1616 void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
1617 asm volatile(
1618 "movdqa %4,%%xmm5 \n"
1619 "movdqa %3,%%xmm4 \n"
1620
1621 LABELALIGN
1622 "1: \n"
1623 "movdqu (%0),%%xmm0 \n"
1624 "movdqu 0x10(%0),%%xmm1 \n"
1625 "movdqu 0x20(%0),%%xmm2 \n"
1626 "movdqu 0x30(%0),%%xmm3 \n"
1627 "pmaddubsw %%xmm4,%%xmm0 \n"
1628 "pmaddubsw %%xmm4,%%xmm1 \n"
1629 "pmaddubsw %%xmm4,%%xmm2 \n"
1630 "pmaddubsw %%xmm4,%%xmm3 \n"
1631 "lea 0x40(%0),%0 \n"
1632 "phaddw %%xmm1,%%xmm0 \n"
1633 "phaddw %%xmm3,%%xmm2 \n"
1634 "psrlw $0x7,%%xmm0 \n"
1635 "psrlw $0x7,%%xmm2 \n"
1636 "packuswb %%xmm2,%%xmm0 \n"
1637 "paddb %%xmm5,%%xmm0 \n"
1638 "movdqu %%xmm0,(%1) \n"
1639 "lea 0x10(%1),%1 \n"
1640 "sub $0x10,%2 \n"
1641 "jg 1b \n"
1642 : "+r"(src_abgr), // %0
1643 "+r"(dst_y), // %1
1644 "+r"(width) // %2
1645 : "m"(kABGRToY), // %3
1646 "m"(kAddY16) // %4
1647 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1648 }
1649
1650 void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
1651 asm volatile(
1652 "movdqa %4,%%xmm5 \n"
1653 "movdqa %3,%%xmm4 \n"
1654
1655 LABELALIGN
1656 "1: \n"
1657 "movdqu (%0),%%xmm0 \n"
1658 "movdqu 0x10(%0),%%xmm1 \n"
1659 "movdqu 0x20(%0),%%xmm2 \n"
1660 "movdqu 0x30(%0),%%xmm3 \n"
1661 "pmaddubsw %%xmm4,%%xmm0 \n"
1662 "pmaddubsw %%xmm4,%%xmm1 \n"
1663 "pmaddubsw %%xmm4,%%xmm2 \n"
1664 "pmaddubsw %%xmm4,%%xmm3 \n"
1665 "lea 0x40(%0),%0 \n"
1666 "phaddw %%xmm1,%%xmm0 \n"
1667 "phaddw %%xmm3,%%xmm2 \n"
1668 "psrlw $0x7,%%xmm0 \n"
1669 "psrlw $0x7,%%xmm2 \n"
1670 "packuswb %%xmm2,%%xmm0 \n"
1671 "paddb %%xmm5,%%xmm0 \n"
1672 "movdqu %%xmm0,(%1) \n"
1673 "lea 0x10(%1),%1 \n"
1674 "sub $0x10,%2 \n"
1675 "jg 1b \n"
1676 : "+r"(src_rgba), // %0
1677 "+r"(dst_y), // %1
1678 "+r"(width) // %2
1679 : "m"(kRGBAToY), // %3
1680 "m"(kAddY16) // %4
1681 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1682 }
1683
1684 void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
1685 int src_stride_abgr,
1686 uint8_t* dst_u,
1687 uint8_t* dst_v,
1688 int width) {
1689 asm volatile(
1690 "movdqa %5,%%xmm3 \n"
1691 "movdqa %6,%%xmm4 \n"
1692 "movdqa %7,%%xmm5 \n"
1693 "sub %1,%2 \n"
1694
1695 LABELALIGN
1696 "1: \n"
1697 "movdqu (%0),%%xmm0 \n"
1698 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
1699 "pavgb %%xmm7,%%xmm0 \n"
1700 "movdqu 0x10(%0),%%xmm1 \n"
1701 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1702 "pavgb %%xmm7,%%xmm1 \n"
1703 "movdqu 0x20(%0),%%xmm2 \n"
1704 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1705 "pavgb %%xmm7,%%xmm2 \n"
1706 "movdqu 0x30(%0),%%xmm6 \n"
1707 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1708 "pavgb %%xmm7,%%xmm6 \n"
1709
1710 "lea 0x40(%0),%0 \n"
1711 "movdqa %%xmm0,%%xmm7 \n"
1712 "shufps $0x88,%%xmm1,%%xmm0 \n"
1713 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1714 "pavgb %%xmm7,%%xmm0 \n"
1715 "movdqa %%xmm2,%%xmm7 \n"
1716 "shufps $0x88,%%xmm6,%%xmm2 \n"
1717 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1718 "pavgb %%xmm7,%%xmm2 \n"
1719 "movdqa %%xmm0,%%xmm1 \n"
1720 "movdqa %%xmm2,%%xmm6 \n"
1721 "pmaddubsw %%xmm4,%%xmm0 \n"
1722 "pmaddubsw %%xmm4,%%xmm2 \n"
1723 "pmaddubsw %%xmm3,%%xmm1 \n"
1724 "pmaddubsw %%xmm3,%%xmm6 \n"
1725 "phaddw %%xmm2,%%xmm0 \n"
1726 "phaddw %%xmm6,%%xmm1 \n"
1727 "psraw $0x8,%%xmm0 \n"
1728 "psraw $0x8,%%xmm1 \n"
1729 "packsswb %%xmm1,%%xmm0 \n"
1730 "paddb %%xmm5,%%xmm0 \n"
1731 "movlps %%xmm0,(%1) \n"
1732 "movhps %%xmm0,0x00(%1,%2,1) \n"
1733 "lea 0x8(%1),%1 \n"
1734 "sub $0x10,%3 \n"
1735 "jg 1b \n"
1736 : "+r"(src_abgr0), // %0
1737 "+r"(dst_u), // %1
1738 "+r"(dst_v), // %2
1739 "+rm"(width) // %3
1740 : "r"((intptr_t)(src_stride_abgr)), // %4
1741 "m"(kABGRToV), // %5
1742 "m"(kABGRToU), // %6
1743 "m"(kAddUV128) // %7
1744 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1745 }
1746
1747 void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
1748 int src_stride_rgba,
1749 uint8_t* dst_u,
1750 uint8_t* dst_v,
1751 int width) {
1752 asm volatile(
1753 "movdqa %5,%%xmm3 \n"
1754 "movdqa %6,%%xmm4 \n"
1755 "movdqa %7,%%xmm5 \n"
1756 "sub %1,%2 \n"
1757
1758 LABELALIGN
1759 "1: \n"
1760 "movdqu (%0),%%xmm0 \n"
1761 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
1762 "pavgb %%xmm7,%%xmm0 \n"
1763 "movdqu 0x10(%0),%%xmm1 \n"
1764 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1765 "pavgb %%xmm7,%%xmm1 \n"
1766 "movdqu 0x20(%0),%%xmm2 \n"
1767 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1768 "pavgb %%xmm7,%%xmm2 \n"
1769 "movdqu 0x30(%0),%%xmm6 \n"
1770 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1771 "pavgb %%xmm7,%%xmm6 \n"
1772
1773 "lea 0x40(%0),%0 \n"
1774 "movdqa %%xmm0,%%xmm7 \n"
1775 "shufps $0x88,%%xmm1,%%xmm0 \n"
1776 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1777 "pavgb %%xmm7,%%xmm0 \n"
1778 "movdqa %%xmm2,%%xmm7 \n"
1779 "shufps $0x88,%%xmm6,%%xmm2 \n"
1780 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1781 "pavgb %%xmm7,%%xmm2 \n"
1782 "movdqa %%xmm0,%%xmm1 \n"
1783 "movdqa %%xmm2,%%xmm6 \n"
1784 "pmaddubsw %%xmm4,%%xmm0 \n"
1785 "pmaddubsw %%xmm4,%%xmm2 \n"
1786 "pmaddubsw %%xmm3,%%xmm1 \n"
1787 "pmaddubsw %%xmm3,%%xmm6 \n"
1788 "phaddw %%xmm2,%%xmm0 \n"
1789 "phaddw %%xmm6,%%xmm1 \n"
1790 "psraw $0x8,%%xmm0 \n"
1791 "psraw $0x8,%%xmm1 \n"
1792 "packsswb %%xmm1,%%xmm0 \n"
1793 "paddb %%xmm5,%%xmm0 \n"
1794 "movlps %%xmm0,(%1) \n"
1795 "movhps %%xmm0,0x00(%1,%2,1) \n"
1796 "lea 0x8(%1),%1 \n"
1797 "sub $0x10,%3 \n"
1798 "jg 1b \n"
1799 : "+r"(src_rgba0), // %0
1800 "+r"(dst_u), // %1
1801 "+r"(dst_v), // %2
1802 "+rm"(width) // %3
1803 : "r"((intptr_t)(src_stride_rgba)), // %4
1804 "m"(kRGBAToV), // %5
1805 "m"(kRGBAToU), // %6
1806 "m"(kAddUV128) // %7
1807 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1808 }
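
// Scalar sketch (assumption: illustrative only; names are hypothetical).
// Both UV row functions above share one shape: average each 2x2 pixel block
// (pavgb rounds up), then apply the signed 8-bit chroma coefficients and
// bias the result to 128. For the RGBA variant:
static uint8_t AvgBSketch(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);  // same rounding as pavgb
}
static void RGBAToUVRow_Sketch(const uint8_t* src_rgba0,
                               int src_stride_rgba,
                               uint8_t* dst_u,
                               uint8_t* dst_v,
                               int width) {
  const uint8_t* src_rgba1 = src_rgba0 + src_stride_rgba;
  int x;
  for (x = 0; x < width; x += 2) {
    const uint8_t* p0 = src_rgba0 + 4 * x;  // memory order: A, B, G, R
    const uint8_t* p1 = src_rgba1 + 4 * x;
    int b = AvgBSketch(AvgBSketch(p0[1], p1[1]), AvgBSketch(p0[5], p1[5]));
    int g = AvgBSketch(AvgBSketch(p0[2], p1[2]), AvgBSketch(p0[6], p1[6]));
    int r = AvgBSketch(AvgBSketch(p0[3], p1[3]), AvgBSketch(p0[7], p1[7]));
    // >> 8 below is an arithmetic shift, as psraw performs.
    dst_u[x / 2] = (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    dst_v[x / 2] = (uint8_t)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
  }
}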
1809
1810 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
1811
1812 // Read 8 UV from 444
1813 #define READYUV444 \
1814 "movq (%[u_buf]),%%xmm0 \n" \
1815 "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
1816 "lea 0x8(%[u_buf]),%[u_buf] \n" \
1817 "punpcklbw %%xmm1,%%xmm0 \n" \
1818 "movq (%[y_buf]),%%xmm4 \n" \
1819 "punpcklbw %%xmm4,%%xmm4 \n" \
1820 "lea 0x8(%[y_buf]),%[y_buf] \n"
1821
1822 // Read 4 UV from 422, upsample to 8 UV
1823 #define READYUV422 \
1824 "movd (%[u_buf]),%%xmm0 \n" \
1825 "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
1826 "lea 0x4(%[u_buf]),%[u_buf] \n" \
1827 "punpcklbw %%xmm1,%%xmm0 \n" \
1828 "punpcklwd %%xmm0,%%xmm0 \n" \
1829 "movq (%[y_buf]),%%xmm4 \n" \
1830 "punpcklbw %%xmm4,%%xmm4 \n" \
1831 "lea 0x8(%[y_buf]),%[y_buf] \n"
1832
1833 // Read 4 UV from 422 10 bit, upsample to 8 UV
1834 // TODO(fbarchard): Consider shufb to replace pack/unpack
1835 // TODO(fbarchard): Consider pmulhuw to replace psraw
1836 // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
1837 #define READYUV210 \
1838 "movq (%[u_buf]),%%xmm0 \n" \
1839 "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
1840 "lea 0x8(%[u_buf]),%[u_buf] \n" \
1841 "punpcklwd %%xmm1,%%xmm0 \n" \
1842 "psraw $0x2,%%xmm0 \n" \
1843 "packuswb %%xmm0,%%xmm0 \n" \
1844 "punpcklwd %%xmm0,%%xmm0 \n" \
1845 "movdqu (%[y_buf]),%%xmm4 \n" \
1846 "psllw $0x6,%%xmm4 \n" \
1847 "lea 0x10(%[y_buf]),%[y_buf] \n"
1848
1849 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
1850 #define READYUVA422 \
1851 "movd (%[u_buf]),%%xmm0 \n" \
1852 "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
1853 "lea 0x4(%[u_buf]),%[u_buf] \n" \
1854 "punpcklbw %%xmm1,%%xmm0 \n" \
1855 "punpcklwd %%xmm0,%%xmm0 \n" \
1856 "movq (%[y_buf]),%%xmm4 \n" \
1857 "punpcklbw %%xmm4,%%xmm4 \n" \
1858 "lea 0x8(%[y_buf]),%[y_buf] \n" \
1859 "movq (%[a_buf]),%%xmm5 \n" \
1860 "lea 0x8(%[a_buf]),%[a_buf] \n"
1861
1862 // Read 4 UV from NV12, upsample to 8 UV
1863 #define READNV12 \
1864 "movq (%[uv_buf]),%%xmm0 \n" \
1865 "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
1866 "punpcklwd %%xmm0,%%xmm0 \n" \
1867 "movq (%[y_buf]),%%xmm4 \n" \
1868 "punpcklbw %%xmm4,%%xmm4 \n" \
1869 "lea 0x8(%[y_buf]),%[y_buf] \n"
1870
1871 // Read 4 VU from NV21, upsample to 8 UV
1872 #define READNV21 \
1873 "movq (%[vu_buf]),%%xmm0 \n" \
1874 "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
1875 "pshufb %[kShuffleNV21], %%xmm0 \n" \
1876 "movq (%[y_buf]),%%xmm4 \n" \
1877 "punpcklbw %%xmm4,%%xmm4 \n" \
1878 "lea 0x8(%[y_buf]),%[y_buf] \n"
1879
1880 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
1881 #define READYUY2 \
1882 "movdqu (%[yuy2_buf]),%%xmm4 \n" \
1883 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
1884 "movdqu (%[yuy2_buf]),%%xmm0 \n" \
1885 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
1886 "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
1887
1888 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
1889 #define READUYVY \
1890 "movdqu (%[uyvy_buf]),%%xmm4 \n" \
1891 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
1892 "movdqu (%[uyvy_buf]),%%xmm0 \n" \
1893 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
1894 "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
1895
1896 #if defined(__x86_64__)
1897 #define YUVTORGB_SETUP(yuvconstants) \
1898 "movdqa (%[yuvconstants]),%%xmm8 \n" \
1899 "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
1900 "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
1901 "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
1902 "movdqa 128(%[yuvconstants]),%%xmm12 \n" \
1903 "movdqa 160(%[yuvconstants]),%%xmm13 \n" \
1904 "movdqa 192(%[yuvconstants]),%%xmm14 \n"
1905 // Convert 8 pixels: 8 UV and 8 Y
1906 #define YUVTORGB16(yuvconstants) \
1907 "movdqa %%xmm0,%%xmm1 \n" \
1908 "movdqa %%xmm0,%%xmm2 \n" \
1909 "movdqa %%xmm0,%%xmm3 \n" \
1910 "movdqa %%xmm11,%%xmm0 \n" \
1911 "pmaddubsw %%xmm8,%%xmm1 \n" \
1912 "psubw %%xmm1,%%xmm0 \n" \
1913 "movdqa %%xmm12,%%xmm1 \n" \
1914 "pmaddubsw %%xmm9,%%xmm2 \n" \
1915 "psubw %%xmm2,%%xmm1 \n" \
1916 "movdqa %%xmm13,%%xmm2 \n" \
1917 "pmaddubsw %%xmm10,%%xmm3 \n" \
1918 "psubw %%xmm3,%%xmm2 \n" \
1919 "pmulhuw %%xmm14,%%xmm4 \n" \
1920 "paddsw %%xmm4,%%xmm0 \n" \
1921 "paddsw %%xmm4,%%xmm1 \n" \
1922 "paddsw %%xmm4,%%xmm2 \n"
1923 #define YUVTORGB_REGS \
1924 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
1925
1926 #else
1927 #define YUVTORGB_SETUP(yuvconstants)
1928 // Convert 8 pixels: 8 UV and 8 Y
1929 #define YUVTORGB16(yuvconstants) \
1930 "movdqa %%xmm0,%%xmm1 \n" \
1931 "movdqa %%xmm0,%%xmm2 \n" \
1932 "movdqa %%xmm0,%%xmm3 \n" \
1933 "movdqa 96(%[yuvconstants]),%%xmm0 \n" \
1934 "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \
1935 "psubw %%xmm1,%%xmm0 \n" \
1936 "movdqa 128(%[yuvconstants]),%%xmm1 \n" \
1937 "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \
1938 "psubw %%xmm2,%%xmm1 \n" \
1939 "movdqa 160(%[yuvconstants]),%%xmm2 \n" \
1940 "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \
1941 "psubw %%xmm3,%%xmm2 \n" \
1942 "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
1943 "paddsw %%xmm4,%%xmm0 \n" \
1944 "paddsw %%xmm4,%%xmm1 \n" \
1945 "paddsw %%xmm4,%%xmm2 \n"
1946 #define YUVTORGB_REGS
1947 #endif
1948
1949 #define YUVTORGB(yuvconstants) \
1950 YUVTORGB16(yuvconstants) \
1951 "psraw $0x6,%%xmm0 \n" \
1952 "psraw $0x6,%%xmm1 \n" \
1953 "psraw $0x6,%%xmm2 \n" \
1954 "packuswb %%xmm0,%%xmm0 \n" \
1955 "packuswb %%xmm1,%%xmm1 \n" \
1956 "packuswb %%xmm2,%%xmm2 \n"
1957
1958 // Store 8 ARGB values.
1959 #define STOREARGB \
1960 "punpcklbw %%xmm1,%%xmm0 \n" \
1961 "punpcklbw %%xmm5,%%xmm2 \n" \
1962 "movdqa %%xmm0,%%xmm1 \n" \
1963 "punpcklwd %%xmm2,%%xmm0 \n" \
1964 "punpckhwd %%xmm2,%%xmm1 \n" \
1965 "movdqu %%xmm0,(%[dst_argb]) \n" \
1966 "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
1967 "lea 0x20(%[dst_argb]), %[dst_argb] \n"
1968
1969 // Store 8 RGBA values.
1970 #define STORERGBA \
1971 "pcmpeqb %%xmm5,%%xmm5 \n" \
1972 "punpcklbw %%xmm2,%%xmm1 \n" \
1973 "punpcklbw %%xmm0,%%xmm5 \n" \
1974 "movdqa %%xmm5,%%xmm0 \n" \
1975 "punpcklwd %%xmm1,%%xmm5 \n" \
1976 "punpckhwd %%xmm1,%%xmm0 \n" \
1977 "movdqu %%xmm5,(%[dst_rgba]) \n" \
1978 "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
1979 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
1980
1981 // Store 8 AR30 values.
1982 #define STOREAR30 \
1983 "psraw $0x4,%%xmm0 \n" \
1984 "psraw $0x4,%%xmm1 \n" \
1985 "psraw $0x4,%%xmm2 \n" \
1986 "pminsw %%xmm7,%%xmm0 \n" \
1987 "pminsw %%xmm7,%%xmm1 \n" \
1988 "pminsw %%xmm7,%%xmm2 \n" \
1989 "pmaxsw %%xmm6,%%xmm0 \n" \
1990 "pmaxsw %%xmm6,%%xmm1 \n" \
1991 "pmaxsw %%xmm6,%%xmm2 \n" \
1992 "psllw $0x4,%%xmm2 \n" \
1993 "movdqa %%xmm0,%%xmm3 \n" \
1994 "punpcklwd %%xmm2,%%xmm0 \n" \
1995 "punpckhwd %%xmm2,%%xmm3 \n" \
1996 "movdqa %%xmm1,%%xmm2 \n" \
1997 "punpcklwd %%xmm5,%%xmm1 \n" \
1998 "punpckhwd %%xmm5,%%xmm2 \n" \
1999 "pslld $0xa,%%xmm1 \n" \
2000 "pslld $0xa,%%xmm2 \n" \
2001 "por %%xmm1,%%xmm0 \n" \
2002 "por %%xmm2,%%xmm3 \n" \
2003 "movdqu %%xmm0,(%[dst_ar30]) \n" \
2004 "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
2005 "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
2006
2007 void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
2008 const uint8_t* u_buf,
2009 const uint8_t* v_buf,
2010 uint8_t* dst_argb,
2011 const struct YuvConstants* yuvconstants,
2012 int width) {
2013 asm volatile (
2014 YUVTORGB_SETUP(yuvconstants)
2015 "sub %[u_buf],%[v_buf] \n"
2016 "pcmpeqb %%xmm5,%%xmm5 \n"
2017
2018 LABELALIGN
2019 "1: \n"
2020 READYUV444
2021 YUVTORGB(yuvconstants)
2022 STOREARGB
2023 "sub $0x8,%[width] \n"
2024 "jg 1b \n"
2025 : [y_buf]"+r"(y_buf), // %[y_buf]
2026 [u_buf]"+r"(u_buf), // %[u_buf]
2027 [v_buf]"+r"(v_buf), // %[v_buf]
2028 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2029 [width]"+rm"(width) // %[width]
2030 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2031 : "memory", "cc", YUVTORGB_REGS
2032 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2033 );
2034 }
2035
2036 void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
2037 const uint8_t* u_buf,
2038 const uint8_t* v_buf,
2039 uint8_t* dst_rgb24,
2040 const struct YuvConstants* yuvconstants,
2041 int width) {
2042 asm volatile (
2043 YUVTORGB_SETUP(yuvconstants)
2044 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
2045 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
2046 "sub %[u_buf],%[v_buf] \n"
2047
2048 LABELALIGN
2049 "1: \n"
2050 READYUV422
2051 YUVTORGB(yuvconstants)
2052 "punpcklbw %%xmm1,%%xmm0 \n"
2053 "punpcklbw %%xmm2,%%xmm2 \n"
2054 "movdqa %%xmm0,%%xmm1 \n"
2055 "punpcklwd %%xmm2,%%xmm0 \n"
2056 "punpckhwd %%xmm2,%%xmm1 \n"
2057 "pshufb %%xmm5,%%xmm0 \n"
2058 "pshufb %%xmm6,%%xmm1 \n"
2059 "palignr $0xc,%%xmm0,%%xmm1 \n"
2060 "movq %%xmm0,(%[dst_rgb24]) \n"
2061 "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
2062 "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
2063 "subl $0x8,%[width] \n"
2064 "jg 1b \n"
2065 : [y_buf]"+r"(y_buf), // %[y_buf]
2066 [u_buf]"+r"(u_buf), // %[u_buf]
2067 [v_buf]"+r"(v_buf), // %[v_buf]
2068 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
2069 #if defined(__i386__)
2070 [width]"+m"(width) // %[width]
2071 #else
2072 [width]"+rm"(width) // %[width]
2073 #endif
2074 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2075 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
2076 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
2077 : "memory", "cc", YUVTORGB_REGS
2078 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2079 );
2080 }
2081
2082 void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
2083 const uint8_t* u_buf,
2084 const uint8_t* v_buf,
2085 uint8_t* dst_argb,
2086 const struct YuvConstants* yuvconstants,
2087 int width) {
2088 asm volatile (
2089 YUVTORGB_SETUP(yuvconstants)
2090 "sub %[u_buf],%[v_buf] \n"
2091 "pcmpeqb %%xmm5,%%xmm5 \n"
2092
2093 LABELALIGN
2094 "1: \n"
2095 READYUV422
2096 YUVTORGB(yuvconstants)
2097 STOREARGB
2098 "sub $0x8,%[width] \n"
2099 "jg 1b \n"
2100 : [y_buf]"+r"(y_buf), // %[y_buf]
2101 [u_buf]"+r"(u_buf), // %[u_buf]
2102 [v_buf]"+r"(v_buf), // %[v_buf]
2103 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2104 [width]"+rm"(width) // %[width]
2105 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2106 : "memory", "cc", YUVTORGB_REGS
2107 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2108 );
2109 }
2110
2111 void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
2112 const uint8_t* u_buf,
2113 const uint8_t* v_buf,
2114 uint8_t* dst_ar30,
2115 const struct YuvConstants* yuvconstants,
2116 int width) {
2117 asm volatile (
2118 YUVTORGB_SETUP(yuvconstants)
2119 "sub %[u_buf],%[v_buf] \n"
2120 "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
2121 "psrlw $14,%%xmm5 \n"
2122 "psllw $4,%%xmm5 \n" // 2 alpha bits
2123 "pxor %%xmm6,%%xmm6 \n"
2124 "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
2125 "psrlw $6,%%xmm7 \n" // 1023 for max
2126
2127 LABELALIGN
2128 "1: \n"
2129 READYUV422
2130 YUVTORGB16(yuvconstants)
2131 STOREAR30
2132 "sub $0x8,%[width] \n"
2133 "jg 1b \n"
2134 : [y_buf]"+r"(y_buf), // %[y_buf]
2135 [u_buf]"+r"(u_buf), // %[u_buf]
2136 [v_buf]"+r"(v_buf), // %[v_buf]
2137 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
2138 [width]"+rm"(width) // %[width]
2139 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2140 : "memory", "cc", YUVTORGB_REGS
2141 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2142 );
2143 }
2144
2145 // 10 bit YUV to ARGB
2146 void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
2147 const uint16_t* u_buf,
2148 const uint16_t* v_buf,
2149 uint8_t* dst_argb,
2150 const struct YuvConstants* yuvconstants,
2151 int width) {
2152 asm volatile (
2153 YUVTORGB_SETUP(yuvconstants)
2154 "sub %[u_buf],%[v_buf] \n"
2155 "pcmpeqb %%xmm5,%%xmm5 \n"
2156
2157 LABELALIGN
2158 "1: \n"
2159 READYUV210
2160 YUVTORGB(yuvconstants)
2161 STOREARGB
2162 "sub $0x8,%[width] \n"
2163 "jg 1b \n"
2164 : [y_buf]"+r"(y_buf), // %[y_buf]
2165 [u_buf]"+r"(u_buf), // %[u_buf]
2166 [v_buf]"+r"(v_buf), // %[v_buf]
2167 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2168 [width]"+rm"(width) // %[width]
2169 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2170 : "memory", "cc", YUVTORGB_REGS
2171 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2172 );
2173 }
2174
2175 // 10 bit YUV to AR30
2176 void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
2177 const uint16_t* u_buf,
2178 const uint16_t* v_buf,
2179 uint8_t* dst_ar30,
2180 const struct YuvConstants* yuvconstants,
2181 int width) {
2182 asm volatile (
2183 YUVTORGB_SETUP(yuvconstants)
2184 "sub %[u_buf],%[v_buf] \n"
2185 "pcmpeqb %%xmm5,%%xmm5 \n"
2186 "psrlw $14,%%xmm5 \n"
2187 "psllw $4,%%xmm5 \n" // 2 alpha bits
2188 "pxor %%xmm6,%%xmm6 \n"
2189 "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
2190 "psrlw $6,%%xmm7 \n" // 1023 for max
2191
2192 LABELALIGN
2193 "1: \n"
2194 READYUV210
2195 YUVTORGB16(yuvconstants)
2196 STOREAR30
2197 "sub $0x8,%[width] \n"
2198 "jg 1b \n"
2199 : [y_buf]"+r"(y_buf), // %[y_buf]
2200 [u_buf]"+r"(u_buf), // %[u_buf]
2201 [v_buf]"+r"(v_buf), // %[v_buf]
2202 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
2203 [width]"+rm"(width) // %[width]
2204 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2205 : "memory", "cc", YUVTORGB_REGS
2206 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2207 );
2208 }
2209
2210 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
2211 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
2212 const uint8_t* u_buf,
2213 const uint8_t* v_buf,
2214 const uint8_t* a_buf,
2215 uint8_t* dst_argb,
2216 const struct YuvConstants* yuvconstants,
2217 int width) {
2218 // clang-format off
2219 asm volatile (
2220 YUVTORGB_SETUP(yuvconstants)
2221 "sub %[u_buf],%[v_buf] \n"
2222
2223 LABELALIGN
2224 "1: \n"
2225 READYUVA422
2226 YUVTORGB(yuvconstants)
2227 STOREARGB
2228 "subl $0x8,%[width] \n"
2229 "jg 1b \n"
2230 : [y_buf]"+r"(y_buf), // %[y_buf]
2231 [u_buf]"+r"(u_buf), // %[u_buf]
2232 [v_buf]"+r"(v_buf), // %[v_buf]
2233 [a_buf]"+r"(a_buf), // %[a_buf]
2234 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2235 #if defined(__i386__)
2236 [width]"+m"(width) // %[width]
2237 #else
2238 [width]"+rm"(width) // %[width]
2239 #endif
2240 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2241 : "memory", "cc", YUVTORGB_REGS
2242 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2243 );
2244 // clang-format on
2245 }
2246 #endif // HAS_I422ALPHATOARGBROW_SSSE3
2247
2248 void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
2249 const uint8_t* uv_buf,
2250 uint8_t* dst_argb,
2251 const struct YuvConstants* yuvconstants,
2252 int width) {
2253 // clang-format off
2254 asm volatile (
2255 YUVTORGB_SETUP(yuvconstants)
2256 "pcmpeqb %%xmm5,%%xmm5 \n"
2257
2258 LABELALIGN
2259 "1: \n"
2260 READNV12
2261 YUVTORGB(yuvconstants)
2262 STOREARGB
2263 "sub $0x8,%[width] \n"
2264 "jg 1b \n"
2265 : [y_buf]"+r"(y_buf), // %[y_buf]
2266 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2267 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2268 [width]"+rm"(width) // %[width]
2269 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2270 : "memory", "cc", YUVTORGB_REGS
2271 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2272 );
2273 // clang-format on
2274 }
2275
2276 void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
2277 const uint8_t* vu_buf,
2278 uint8_t* dst_argb,
2279 const struct YuvConstants* yuvconstants,
2280 int width) {
2281 // clang-format off
2282 asm volatile (
2283 YUVTORGB_SETUP(yuvconstants)
2284 "pcmpeqb %%xmm5,%%xmm5 \n"
2285
2286 LABELALIGN
2287 "1: \n"
2288 READNV21
2289 YUVTORGB(yuvconstants)
2290 STOREARGB
2291 "sub $0x8,%[width] \n"
2292 "jg 1b \n"
2293 : [y_buf]"+r"(y_buf), // %[y_buf]
2294 [vu_buf]"+r"(vu_buf), // %[vu_buf]
2295 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2296 [width]"+rm"(width) // %[width]
2297 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2298 [kShuffleNV21]"m"(kShuffleNV21)
2299 : "memory", "cc", YUVTORGB_REGS
2300 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2301 );
2302 // clang-format on
2303 }
2304
2305 void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
2306 uint8_t* dst_argb,
2307 const struct YuvConstants* yuvconstants,
2308 int width) {
2309 // clang-format off
2310 asm volatile (
2311 YUVTORGB_SETUP(yuvconstants)
2312 "pcmpeqb %%xmm5,%%xmm5 \n"
2313
2314 LABELALIGN
2315 "1: \n"
2316 READYUY2
2317 YUVTORGB(yuvconstants)
2318 STOREARGB
2319 "sub $0x8,%[width] \n"
2320 "jg 1b \n"
2321 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
2322 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2323 [width]"+rm"(width) // %[width]
2324 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2325 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2326 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2327 : "memory", "cc", YUVTORGB_REGS
2328 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2329 );
2330 // clang-format on
2331 }
2332
2333 void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
2334 uint8_t* dst_argb,
2335 const struct YuvConstants* yuvconstants,
2336 int width) {
2337 // clang-format off
2338 asm volatile (
2339 YUVTORGB_SETUP(yuvconstants)
2340 "pcmpeqb %%xmm5,%%xmm5 \n"
2341
2342 LABELALIGN
2343 "1: \n"
2344 READUYVY
2345 YUVTORGB(yuvconstants)
2346 STOREARGB
2347 "sub $0x8,%[width] \n"
2348 "jg 1b \n"
2349 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
2350 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2351 [width]"+rm"(width) // %[width]
2352 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2353 [kShuffleUYVYY]"m"(kShuffleUYVYY),
2354 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
2355 : "memory", "cc", YUVTORGB_REGS
2356 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2357 );
2358 // clang-format on
2359 }
2360
2361 void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
2362 const uint8_t* u_buf,
2363 const uint8_t* v_buf,
2364 uint8_t* dst_rgba,
2365 const struct YuvConstants* yuvconstants,
2366 int width) {
2367 asm volatile (
2368 YUVTORGB_SETUP(yuvconstants)
2369 "sub %[u_buf],%[v_buf] \n"
2370 "pcmpeqb %%xmm5,%%xmm5 \n"
2371
2372 LABELALIGN
2373 "1: \n"
2374 READYUV422
2375 YUVTORGB(yuvconstants)
2376 STORERGBA
2377 "sub $0x8,%[width] \n"
2378 "jg 1b \n"
2379 : [y_buf]"+r"(y_buf), // %[y_buf]
2380 [u_buf]"+r"(u_buf), // %[u_buf]
2381 [v_buf]"+r"(v_buf), // %[v_buf]
2382 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
2383 [width]"+rm"(width) // %[width]
2384 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2385 : "memory", "cc", YUVTORGB_REGS
2386 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2387 );
2388 }
2389
2390 #endif // HAS_I422TOARGBROW_SSSE3
2391
2392 // Read 16 UV from 444
2393 #define READYUV444_AVX2 \
2394 "vmovdqu (%[u_buf]),%%xmm0 \n" \
2395 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2396 "lea 0x10(%[u_buf]),%[u_buf] \n" \
2397 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2398 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
2399 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
2400 "vmovdqu (%[y_buf]),%%xmm4 \n" \
2401 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2402 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2403 "lea 0x10(%[y_buf]),%[y_buf] \n"
2404
2405 // Read 8 UV from 422, upsample to 16 UV.
2406 #define READYUV422_AVX2 \
2407 "vmovq (%[u_buf]),%%xmm0 \n" \
2408 "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2409 "lea 0x8(%[u_buf]),%[u_buf] \n" \
2410 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
2411 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2412 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
2413 "vmovdqu (%[y_buf]),%%xmm4 \n" \
2414 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2415 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2416 "lea 0x10(%[y_buf]),%[y_buf] \n"
2417
2418 // Read 8 UV from 210 10 bit, upsample to 16 UV
2419 // TODO(fbarchard): Consider vshufb to replace pack/unpack
2420 // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
2421 #define READYUV210_AVX2 \
2422 "vmovdqu (%[u_buf]),%%xmm0 \n" \
2423 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2424 "lea 0x10(%[u_buf]),%[u_buf] \n" \
2425 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2426 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
2427 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \
2428 "vpsraw $0x2,%%ymm0,%%ymm0 \n" \
2429 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
2430 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
2431 "vmovdqu (%[y_buf]),%%ymm4 \n" \
2432 "vpsllw $0x6,%%ymm4,%%ymm4 \n" \
2433 "lea 0x20(%[y_buf]),%[y_buf] \n"
2434
2435 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
2436 #define READYUVA422_AVX2 \
2437 "vmovq (%[u_buf]),%%xmm0 \n" \
2438 "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2439 "lea 0x8(%[u_buf]),%[u_buf] \n" \
2440 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
2441 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2442 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
2443 "vmovdqu (%[y_buf]),%%xmm4 \n" \
2444 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2445 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2446 "lea 0x10(%[y_buf]),%[y_buf] \n" \
2447 "vmovdqu (%[a_buf]),%%xmm5 \n" \
2448 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
2449 "lea 0x10(%[a_buf]),%[a_buf] \n"
2450
2451 // Read 8 UV from NV12, upsample to 16 UV.
2452 #define READNV12_AVX2 \
2453 "vmovdqu (%[uv_buf]),%%xmm0 \n" \
2454 "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
2455 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2456 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
2457 "vmovdqu (%[y_buf]),%%xmm4 \n" \
2458 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2459 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2460 "lea 0x10(%[y_buf]),%[y_buf] \n"
2461
2462 // Read 8 VU from NV21, upsample to 16 UV.
2463 #define READNV21_AVX2 \
2464 "vmovdqu (%[vu_buf]),%%xmm0 \n" \
2465 "lea 0x10(%[vu_buf]),%[vu_buf] \n" \
2466 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2467 "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
2468 "vmovdqu (%[y_buf]),%%xmm4 \n" \
2469 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2470 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2471 "lea 0x10(%[y_buf]),%[y_buf] \n"
2472
2473 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
2474 #define READYUY2_AVX2 \
2475 "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
2476 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
2477 "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \
2478 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
2479 "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
2480
2481 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2482 #define READUYVY_AVX2 \
2483 "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
2484 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
2485 "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \
2486 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
2487 "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
2488
2489 #if defined(__x86_64__)
2490 #define YUVTORGB_SETUP_AVX2(yuvconstants) \
2491 "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
2492 "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
2493 "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
2494 "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
2495 "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
2496 "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
2497 "vmovdqa 192(%[yuvconstants]),%%ymm14 \n"
2498
2499 #define YUVTORGB16_AVX2(yuvconstants) \
2500 "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
2501 "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
2502 "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
2503 "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
2504 "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
2505 "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
2506 "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
2507 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
2508 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
2509 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
2510
2511 #define YUVTORGB_REGS_AVX2 \
2512 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
2513
2514 #else // Convert 16 pixels: 16 UV and 16 Y.
2515
2516 #define YUVTORGB_SETUP_AVX2(yuvconstants)
2517 #define YUVTORGB16_AVX2(yuvconstants) \
2518 "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \
2519 "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \
2520 "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \
2521 "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \
2522 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
2523 "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \
2524 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
2525 "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \
2526 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
2527 "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
2528 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
2529 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
2530 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
2531 #define YUVTORGB_REGS_AVX2
2532 #endif
2533
2534 #define YUVTORGB_AVX2(yuvconstants) \
2535 YUVTORGB16_AVX2(yuvconstants) \
2536 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
2537 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
2538 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
2539 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
2540 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
2541 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
2542
2543 // Store 16 ARGB values.
2544 #define STOREARGB_AVX2 \
2545 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
2546 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2547 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
2548 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
2549 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
2550 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
2551 "vmovdqu %%ymm1,(%[dst_argb]) \n" \
2552 "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
2553 "lea 0x40(%[dst_argb]), %[dst_argb] \n"
2554
2555 // Store 16 AR30 values.
2556 #define STOREAR30_AVX2 \
2557 "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
2558 "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
2559 "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
2560 "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
2561 "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
2562 "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
2563 "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
2564 "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
2565 "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
2566 "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
2567 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2568 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
2569 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
2570 "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
2571 "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
2572 "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
2573 "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
2574 "vpslld $0xa,%%ymm1,%%ymm1 \n" \
2575 "vpslld $0xa,%%ymm2,%%ymm2 \n" \
2576 "vpor %%ymm1,%%ymm0,%%ymm0 \n" \
2577 "vpor %%ymm2,%%ymm3,%%ymm3 \n" \
2578 "vmovdqu %%ymm0,(%[dst_ar30]) \n" \
2579 "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
2580 "lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
2581
2582 #ifdef HAS_I444TOARGBROW_AVX2
2583 // 16 pixels
2584 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2585 void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
2586 const uint8_t* u_buf,
2587 const uint8_t* v_buf,
2588 uint8_t* dst_argb,
2589 const struct YuvConstants* yuvconstants,
2590 int width) {
2591 asm volatile (
2592 YUVTORGB_SETUP_AVX2(yuvconstants)
2593 "sub %[u_buf],%[v_buf] \n"
2594 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2595
2596 LABELALIGN
2597 "1: \n"
2598 READYUV444_AVX2
2599 YUVTORGB_AVX2(yuvconstants)
2600 STOREARGB_AVX2
2601 "sub $0x10,%[width] \n"
2602 "jg 1b \n"
2603 "vzeroupper \n"
2604 : [y_buf]"+r"(y_buf), // %[y_buf]
2605 [u_buf]"+r"(u_buf), // %[u_buf]
2606 [v_buf]"+r"(v_buf), // %[v_buf]
2607 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2608 [width]"+rm"(width) // %[width]
2609 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2610 : "memory", "cc", YUVTORGB_REGS_AVX2
2611 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2612 );
2613 }
2614 #endif // HAS_I444TOARGBROW_AVX2
2615
2616 #if defined(HAS_I422TOARGBROW_AVX2)
2617 // 16 pixels
2618 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2619 void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
2620 const uint8_t* u_buf,
2621 const uint8_t* v_buf,
2622 uint8_t* dst_argb,
2623 const struct YuvConstants* yuvconstants,
2624 int width) {
2625 asm volatile (
2626 YUVTORGB_SETUP_AVX2(yuvconstants)
2627 "sub %[u_buf],%[v_buf] \n"
2628 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2629
2630 LABELALIGN
2631 "1: \n"
2632 READYUV422_AVX2
2633 YUVTORGB_AVX2(yuvconstants)
2634 STOREARGB_AVX2
2635 "sub $0x10,%[width] \n"
2636 "jg 1b \n"
2637
2638 "vzeroupper \n"
2639 : [y_buf]"+r"(y_buf), // %[y_buf]
2640 [u_buf]"+r"(u_buf), // %[u_buf]
2641 [v_buf]"+r"(v_buf), // %[v_buf]
2642 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2643 [width]"+rm"(width) // %[width]
2644 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2645 : "memory", "cc", YUVTORGB_REGS_AVX2
2646 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2647 );
2648 }
2649 #endif // HAS_I422TOARGBROW_AVX2
2650
2651 #if defined(HAS_I422TOAR30ROW_AVX2)
2652 // 16 pixels
2653 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
2654 void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
2655 const uint8_t* u_buf,
2656 const uint8_t* v_buf,
2657 uint8_t* dst_ar30,
2658 const struct YuvConstants* yuvconstants,
2659 int width) {
2660 asm volatile (
2661 YUVTORGB_SETUP_AVX2(yuvconstants)
2662 "sub %[u_buf],%[v_buf] \n"
2663 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
2664 "vpsrlw $14,%%ymm5,%%ymm5 \n"
2665 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
2666 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
2667 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
2668 "vpsrlw $6,%%ymm7,%%ymm7 \n"
2669
2670 LABELALIGN
2671 "1: \n"
2672 READYUV422_AVX2
2673 YUVTORGB16_AVX2(yuvconstants)
2674 STOREAR30_AVX2
2675 "sub $0x10,%[width] \n"
2676 "jg 1b \n"
2677
2678 "vzeroupper \n"
2679 : [y_buf]"+r"(y_buf), // %[y_buf]
2680 [u_buf]"+r"(u_buf), // %[u_buf]
2681 [v_buf]"+r"(v_buf), // %[v_buf]
2682 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
2683 [width]"+rm"(width) // %[width]
2684 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2685 : "memory", "cc", YUVTORGB_REGS_AVX2
2686 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2687 );
2688 }
2689 #endif // HAS_I422TOAR30ROW_AVX2
2690
2691 #if defined(HAS_I210TOARGBROW_AVX2)
2692 // 16 pixels
2693 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2694 void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
2695 const uint16_t* u_buf,
2696 const uint16_t* v_buf,
2697 uint8_t* dst_argb,
2698 const struct YuvConstants* yuvconstants,
2699 int width) {
2700 asm volatile (
2701 YUVTORGB_SETUP_AVX2(yuvconstants)
2702 "sub %[u_buf],%[v_buf] \n"
2703 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2704
2705 LABELALIGN
2706 "1: \n"
2707 READYUV210_AVX2
2708 YUVTORGB_AVX2(yuvconstants)
2709 STOREARGB_AVX2
2710 "sub $0x10,%[width] \n"
2711 "jg 1b \n"
2712
2713 "vzeroupper \n"
2714 : [y_buf]"+r"(y_buf), // %[y_buf]
2715 [u_buf]"+r"(u_buf), // %[u_buf]
2716 [v_buf]"+r"(v_buf), // %[v_buf]
2717 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2718 [width]"+rm"(width) // %[width]
2719 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2720 : "memory", "cc", YUVTORGB_REGS_AVX2
2721 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2722 );
2723 }
2724 #endif // HAS_I210TOARGBROW_AVX2
2725
2726 #if defined(HAS_I210TOAR30ROW_AVX2)
2727 // 16 pixels
2728 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
2729 void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
2730 const uint16_t* u_buf,
2731 const uint16_t* v_buf,
2732 uint8_t* dst_ar30,
2733 const struct YuvConstants* yuvconstants,
2734 int width) {
2735 asm volatile (
2736 YUVTORGB_SETUP_AVX2(yuvconstants)
2737 "sub %[u_buf],%[v_buf] \n"
2738 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
2739 "vpsrlw $14,%%ymm5,%%ymm5 \n"
2740 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
2741 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
2742 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
2743 "vpsrlw $6,%%ymm7,%%ymm7 \n"
2744
2745 LABELALIGN
2746 "1: \n"
2747 READYUV210_AVX2
2748 YUVTORGB16_AVX2(yuvconstants)
2749 STOREAR30_AVX2
2750 "sub $0x10,%[width] \n"
2751 "jg 1b \n"
2752
2753 "vzeroupper \n"
2754 : [y_buf]"+r"(y_buf), // %[y_buf]
2755 [u_buf]"+r"(u_buf), // %[u_buf]
2756 [v_buf]"+r"(v_buf), // %[v_buf]
2757 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
2758 [width]"+rm"(width) // %[width]
2759 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2760 : "memory", "cc", YUVTORGB_REGS_AVX2
2761 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2762 );
2763 }
2764 #endif // HAS_I210TOAR30ROW_AVX2
2765
2766 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
2767 // 16 pixels
2768 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2769 void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
2770 const uint8_t* u_buf,
2771 const uint8_t* v_buf,
2772 const uint8_t* a_buf,
2773 uint8_t* dst_argb,
2774 const struct YuvConstants* yuvconstants,
2775 int width) {
2776 // clang-format off
2777 asm volatile (
2778 YUVTORGB_SETUP_AVX2(yuvconstants)
2779 "sub %[u_buf],%[v_buf] \n"
2780
2781 LABELALIGN
2782 "1: \n"
2783 READYUVA422_AVX2
2784 YUVTORGB_AVX2(yuvconstants)
2785 STOREARGB_AVX2
2786 "subl $0x10,%[width] \n"
2787 "jg 1b \n"
2788 "vzeroupper \n"
2789 : [y_buf]"+r"(y_buf), // %[y_buf]
2790 [u_buf]"+r"(u_buf), // %[u_buf]
2791 [v_buf]"+r"(v_buf), // %[v_buf]
2792 [a_buf]"+r"(a_buf), // %[a_buf]
2793 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2794 #if defined(__i386__)
2795 [width]"+m"(width) // %[width]
2796 #else
2797 [width]"+rm"(width) // %[width]
2798 #endif
2799 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2800 : "memory", "cc", YUVTORGB_REGS_AVX2
2801 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2802 );
2803 // clang-format on
2804 }
2805 #endif // HAS_I422ALPHATOARGBROW_AVX2
2806
2807 #if defined(HAS_I422TORGBAROW_AVX2)
2808 // 16 pixels
2809 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2810 void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
2811 const uint8_t* u_buf,
2812 const uint8_t* v_buf,
2813 uint8_t* dst_argb,
2814 const struct YuvConstants* yuvconstants,
2815 int width) {
2816 asm volatile (
2817 YUVTORGB_SETUP_AVX2(yuvconstants)
2818 "sub %[u_buf],%[v_buf] \n"
2819 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2820
2821 LABELALIGN
2822 "1: \n"
2823 READYUV422_AVX2
2824 YUVTORGB_AVX2(yuvconstants)
2825
2826 // Step 3: Weave into RGBA
2827 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
2828 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2829 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
2830 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2831 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
2832 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
2833 "vmovdqu %%ymm0,(%[dst_argb]) \n"
2834 "vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
2835 "lea 0x40(%[dst_argb]),%[dst_argb] \n"
2836 "sub $0x10,%[width] \n"
2837 "jg 1b \n"
2838 "vzeroupper \n"
2839 : [y_buf]"+r"(y_buf), // %[y_buf]
2840 [u_buf]"+r"(u_buf), // %[u_buf]
2841 [v_buf]"+r"(v_buf), // %[v_buf]
2842 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2843 [width]"+rm"(width) // %[width]
2844 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2845 : "memory", "cc", YUVTORGB_REGS_AVX2
2846 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2847 );
2848 }
2849 #endif // HAS_I422TORGBAROW_AVX2
2850
2851 #if defined(HAS_NV12TOARGBROW_AVX2)
2852 // 16 pixels.
2853 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2854 void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
2855 const uint8_t* uv_buf,
2856 uint8_t* dst_argb,
2857 const struct YuvConstants* yuvconstants,
2858 int width) {
2859 // clang-format off
2860 asm volatile (
2861 YUVTORGB_SETUP_AVX2(yuvconstants)
2862 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2863
2864 LABELALIGN
2865 "1: \n"
2866 READNV12_AVX2
2867 YUVTORGB_AVX2(yuvconstants)
2868 STOREARGB_AVX2
2869 "sub $0x10,%[width] \n"
2870 "jg 1b \n"
2871 "vzeroupper \n"
2872 : [y_buf]"+r"(y_buf), // %[y_buf]
2873 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2874 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2875 [width]"+rm"(width) // %[width]
2876 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2877 : "memory", "cc", YUVTORGB_REGS_AVX2
2878 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2879 );
2880 // clang-format on
2881 }
2882 #endif // HAS_NV12TOARGBROW_AVX2
2883
2884 #if defined(HAS_NV21TOARGBROW_AVX2)
2885 // 16 pixels.
2886 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2887 void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
2888 const uint8_t* vu_buf,
2889 uint8_t* dst_argb,
2890 const struct YuvConstants* yuvconstants,
2891 int width) {
2892 // clang-format off
2893 asm volatile (
2894 YUVTORGB_SETUP_AVX2(yuvconstants)
2895 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2896
2897 LABELALIGN
2898 "1: \n"
2899 READNV21_AVX2
2900 YUVTORGB_AVX2(yuvconstants)
2901 STOREARGB_AVX2
2902 "sub $0x10,%[width] \n"
2903 "jg 1b \n"
2904 "vzeroupper \n"
2905 : [y_buf]"+r"(y_buf), // %[y_buf]
2906 [vu_buf]"+r"(vu_buf), // %[vu_buf]
2907 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2908 [width]"+rm"(width) // %[width]
2909 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2910 [kShuffleNV21]"m"(kShuffleNV21)
2911 : "memory", "cc", YUVTORGB_REGS_AVX2
2912 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2913 );
2914 // clang-format on
2915 }
2916 #endif // HAS_NV21TOARGBROW_AVX2
2917
2918 #if defined(HAS_YUY2TOARGBROW_AVX2)
2919 // 16 pixels.
2920 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2921 void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
2922 uint8_t* dst_argb,
2923 const struct YuvConstants* yuvconstants,
2924 int width) {
2925 // clang-format off
2926 asm volatile (
2927 YUVTORGB_SETUP_AVX2(yuvconstants)
2928 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2929
2930 LABELALIGN
2931 "1: \n"
2932 READYUY2_AVX2
2933 YUVTORGB_AVX2(yuvconstants)
2934 STOREARGB_AVX2
2935 "sub $0x10,%[width] \n"
2936 "jg 1b \n"
2937 "vzeroupper \n"
2938 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
2939 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2940 [width]"+rm"(width) // %[width]
2941 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2942 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2943 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2944 : "memory", "cc", YUVTORGB_REGS_AVX2
2945 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2946 );
2947 // clang-format on
2948 }
2949 #endif // HAS_YUY2TOARGBROW_AVX2
2950
2951 #if defined(HAS_UYVYTOARGBROW_AVX2)
2952 // 16 pixels.
2953 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2954 void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
2955 uint8_t* dst_argb,
2956 const struct YuvConstants* yuvconstants,
2957 int width) {
2958 // clang-format off
2959 asm volatile (
2960 YUVTORGB_SETUP_AVX2(yuvconstants)
2961 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2962
2963 LABELALIGN
2964 "1: \n"
2965 READUYVY_AVX2
2966 YUVTORGB_AVX2(yuvconstants)
2967 STOREARGB_AVX2
2968 "sub $0x10,%[width] \n"
2969 "jg 1b \n"
2970 "vzeroupper \n"
2971 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
2972 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2973 [width]"+rm"(width) // %[width]
2974 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2975 [kShuffleUYVYY]"m"(kShuffleUYVYY),
2976 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
2977 : "memory", "cc", YUVTORGB_REGS_AVX2
2978 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2979 );
2980 // clang-format on
2981 }
2982 #endif // HAS_UYVYTOARGBROW_AVX2
2983
2984 #ifdef HAS_I400TOARGBROW_SSE2
2985 void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
2986 asm volatile(
2987 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
2988 "movd %%eax,%%xmm2 \n"
2989 "pshufd $0x0,%%xmm2,%%xmm2 \n"
2990 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 *
2991 // 16
2992 "movd %%eax,%%xmm3 \n"
2993 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2994 "pcmpeqb %%xmm4,%%xmm4 \n"
2995 "pslld $0x18,%%xmm4 \n"
2996
2997 LABELALIGN
2998 "1: \n"
2999 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
3000 "movq (%0),%%xmm0 \n"
3001 "lea 0x8(%0),%0 \n"
3002 "punpcklbw %%xmm0,%%xmm0 \n"
3003 "pmulhuw %%xmm2,%%xmm0 \n"
3004 "psubusw %%xmm3,%%xmm0 \n"
3005 "psrlw $6, %%xmm0 \n"
3006 "packuswb %%xmm0,%%xmm0 \n"
3007
3008 // Step 2: Weave into ARGB
3009 "punpcklbw %%xmm0,%%xmm0 \n"
3010 "movdqa %%xmm0,%%xmm1 \n"
3011 "punpcklwd %%xmm0,%%xmm0 \n"
3012 "punpckhwd %%xmm1,%%xmm1 \n"
3013 "por %%xmm4,%%xmm0 \n"
3014 "por %%xmm4,%%xmm1 \n"
3015 "movdqu %%xmm0,(%1) \n"
3016 "movdqu %%xmm1,0x10(%1) \n"
3017 "lea 0x20(%1),%1 \n"
3018
3019 "sub $0x8,%2 \n"
3020 "jg 1b \n"
3021 : "+r"(y_buf), // %0
3022 "+r"(dst_argb), // %1
3023 "+rm"(width) // %2
3024 :
3025 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
3026 }
3027 #endif // HAS_I400TOARGBROW_SSE2
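
// Scalar sketch of the fixed-point math above (assumption: illustrative
// only; name is hypothetical). (y - 16) * 1.164 is evaluated as a 16-bit
// widen, a high multiply, a saturating subtract, and a shift, then the
// result is replicated into B, G and R with opaque alpha.
static void I400ToARGBRow_Sketch(const uint8_t* y_buf,
                                 uint8_t* dst_argb,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t y16 = (uint32_t)y_buf[x] * 0x0101;  // punpcklbw y,y
    uint32_t g = (y16 * 0x4a35) >> 16;           // pmulhuw: * 1.164
    g = (g > 0x0488 ? g - 0x0488 : 0) >> 6;      // psubusw + psrlw $6
    if (g > 255) g = 255;                        // packuswb saturates
    dst_argb[4 * x + 0] = (uint8_t)g;            // B
    dst_argb[4 * x + 1] = (uint8_t)g;            // G
    dst_argb[4 * x + 2] = (uint8_t)g;            // R
    dst_argb[4 * x + 3] = 0xff;                  // A
  }
}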
3028
3029 #ifdef HAS_I400TOARGBROW_AVX2
3030 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
3031 // note: vpunpcklbw mutates and vpackuswb unmutates.
3032 void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
3033 asm volatile(
3034 "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 *
3035 // 16
3036 "vmovd %%eax,%%xmm2 \n"
3037 "vbroadcastss %%xmm2,%%ymm2 \n"
3038 "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
3039 "vmovd %%eax,%%xmm3 \n"
3040 "vbroadcastss %%xmm3,%%ymm3 \n"
3041 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
3042 "vpslld $0x18,%%ymm4,%%ymm4 \n"
3043
3044 LABELALIGN
3045 "1: \n"
3046 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
3047 "vmovdqu (%0),%%xmm0 \n"
3048 "lea 0x10(%0),%0 \n"
3049 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3050 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
3051 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3052 "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
3053 "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
3054 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3055 "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
3056 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3057 "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
3058 "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
3059 "vpor %%ymm4,%%ymm0,%%ymm0 \n"
3060 "vpor %%ymm4,%%ymm1,%%ymm1 \n"
3061 "vmovdqu %%ymm0,(%1) \n"
3062 "vmovdqu %%ymm1,0x20(%1) \n"
3063 "lea 0x40(%1),%1 \n"
3064 "sub $0x10,%2 \n"
3065 "jg 1b \n"
3066 "vzeroupper \n"
3067 : "+r"(y_buf), // %0
3068 "+r"(dst_argb), // %1
3069 "+rm"(width) // %2
3070 :
3071 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
3072 }
3073 #endif // HAS_I400TOARGBROW_AVX2
3074
3075 #ifdef HAS_MIRRORROW_SSSE3
3076 // Shuffle table for reversing the bytes.
3077 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
3078 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3079
3080 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
3081 intptr_t temp_width = (intptr_t)(width);
3082 asm volatile(
3083
3084 "movdqa %3,%%xmm5 \n"
3085
3086 LABELALIGN
3087 "1: \n"
3088 "movdqu -0x10(%0,%2,1),%%xmm0 \n"
3089 "pshufb %%xmm5,%%xmm0 \n"
3090 "movdqu %%xmm0,(%1) \n"
3091 "lea 0x10(%1),%1 \n"
3092 "sub $0x10,%2 \n"
3093 "jg 1b \n"
3094 : "+r"(src), // %0
3095 "+r"(dst), // %1
3096 "+r"(temp_width) // %2
3097 : "m"(kShuffleMirror) // %3
3098 : "memory", "cc", "xmm0", "xmm5");
3099 }
3100 #endif // HAS_MIRRORROW_SSSE3
3101
3102 #ifdef HAS_MIRRORROW_AVX2
3103 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
3104 intptr_t temp_width = (intptr_t)(width);
3105 asm volatile(
3106
3107 "vbroadcastf128 %3,%%ymm5 \n"
3108
3109 LABELALIGN
3110 "1: \n"
3111 "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
3112 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
3113 "vpermq $0x4e,%%ymm0,%%ymm0 \n"
3114 "vmovdqu %%ymm0,(%1) \n"
3115 "lea 0x20(%1),%1 \n"
3116 "sub $0x20,%2 \n"
3117 "jg 1b \n"
3118 "vzeroupper \n"
3119 : "+r"(src), // %0
3120 "+r"(dst), // %1
3121 "+r"(temp_width) // %2
3122 : "m"(kShuffleMirror) // %3
3123 : "memory", "cc", "xmm0", "xmm5");
3124 }
3125 #endif // HAS_MIRRORROW_AVX2
3126
3127 #ifdef HAS_MIRRORUVROW_SSSE3
3128 // Shuffle table for reversing the bytes of UV channels.
3129 static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
3130 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
3131 void MirrorUVRow_SSSE3(const uint8_t* src,
3132 uint8_t* dst_u,
3133 uint8_t* dst_v,
3134 int width) {
3135 intptr_t temp_width = (intptr_t)(width);
3136 asm volatile(
3137 "movdqa %4,%%xmm1 \n"
3138 "lea -0x10(%0,%3,2),%0 \n"
3139 "sub %1,%2 \n"
3140
3141 LABELALIGN
3142 "1: \n"
3143 "movdqu (%0),%%xmm0 \n"
3144 "lea -0x10(%0),%0 \n"
3145 "pshufb %%xmm1,%%xmm0 \n"
3146 "movlpd %%xmm0,(%1) \n"
3147 "movhpd %%xmm0,0x00(%1,%2,1) \n"
3148 "lea 0x8(%1),%1 \n"
3149 "sub $8,%3 \n"
3150 "jg 1b \n"
3151 : "+r"(src), // %0
3152 "+r"(dst_u), // %1
3153 "+r"(dst_v), // %2
3154 "+r"(temp_width) // %3
3155 : "m"(kShuffleMirrorUV) // %4
3156 : "memory", "cc", "xmm0", "xmm1");
3157 }
3158 #endif // HAS_MIRRORUVROW_SSSE3
3159
3160 #ifdef HAS_ARGBMIRRORROW_SSE2
3161
3162 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
3163 intptr_t temp_width = (intptr_t)(width);
3164 asm volatile(
3165
3166 "lea -0x10(%0,%2,4),%0 \n"
3167
3168 LABELALIGN
3169 "1: \n"
3170 "movdqu (%0),%%xmm0 \n"
3171 "pshufd $0x1b,%%xmm0,%%xmm0 \n"
3172 "lea -0x10(%0),%0 \n"
3173 "movdqu %%xmm0,(%1) \n"
3174 "lea 0x10(%1),%1 \n"
3175 "sub $0x4,%2 \n"
3176 "jg 1b \n"
3177 : "+r"(src), // %0
3178 "+r"(dst), // %1
3179 "+r"(temp_width) // %2
3180 :
3181 : "memory", "cc", "xmm0");
3182 }
3183 #endif // HAS_ARGBMIRRORROW_SSE2
3184
3185 #ifdef HAS_ARGBMIRRORROW_AVX2
3186 // Shuffle table for reversing the bytes.
3187 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3188 void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
3189 intptr_t temp_width = (intptr_t)(width);
3190 asm volatile(
3191
3192 "vmovdqu %3,%%ymm5 \n"
3193
3194 LABELALIGN
3195 "1: \n"
3196 "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
3197 "vmovdqu %%ymm0,(%1) \n"
3198 "lea 0x20(%1),%1 \n"
3199 "sub $0x8,%2 \n"
3200 "jg 1b \n"
3201 "vzeroupper \n"
3202 : "+r"(src), // %0
3203 "+r"(dst), // %1
3204 "+r"(temp_width) // %2
3205 : "m"(kARGBShuffleMirror_AVX2) // %3
3206 : "memory", "cc", "xmm0", "xmm5");
3207 }
3208 #endif // HAS_ARGBMIRRORROW_AVX2
3209
3210 #ifdef HAS_SPLITUVROW_AVX2
3211 void SplitUVRow_AVX2(const uint8_t* src_uv,
3212 uint8_t* dst_u,
3213 uint8_t* dst_v,
3214 int width) {
3215 asm volatile(
3216 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3217 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3218 "sub %1,%2 \n"
3219
3220 LABELALIGN
3221 "1: \n"
3222 "vmovdqu (%0),%%ymm0 \n"
3223 "vmovdqu 0x20(%0),%%ymm1 \n"
3224 "lea 0x40(%0),%0 \n"
3225 "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
3226 "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
3227 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3228 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3229 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3230 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
3231 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3232 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
3233 "vmovdqu %%ymm0,(%1) \n"
3234 "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
3235 "lea 0x20(%1),%1 \n"
3236 "sub $0x20,%3 \n"
3237 "jg 1b \n"
3238 "vzeroupper \n"
3239 : "+r"(src_uv), // %0
3240 "+r"(dst_u), // %1
3241 "+r"(dst_v), // %2
3242 "+r"(width) // %3
3243 :
3244 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
3245 }
3246 #endif // HAS_SPLITUVROW_AVX2
3247
3248 #ifdef HAS_SPLITUVROW_SSE2
3249 void SplitUVRow_SSE2(const uint8_t* src_uv,
3250 uint8_t* dst_u,
3251 uint8_t* dst_v,
3252 int width) {
3253 asm volatile(
3254 "pcmpeqb %%xmm5,%%xmm5 \n"
3255 "psrlw $0x8,%%xmm5 \n"
3256 "sub %1,%2 \n"
3257
3258 LABELALIGN
3259 "1: \n"
3260 "movdqu (%0),%%xmm0 \n"
3261 "movdqu 0x10(%0),%%xmm1 \n"
3262 "lea 0x20(%0),%0 \n"
3263 "movdqa %%xmm0,%%xmm2 \n"
3264 "movdqa %%xmm1,%%xmm3 \n"
3265 "pand %%xmm5,%%xmm0 \n"
3266 "pand %%xmm5,%%xmm1 \n"
3267 "packuswb %%xmm1,%%xmm0 \n"
3268 "psrlw $0x8,%%xmm2 \n"
3269 "psrlw $0x8,%%xmm3 \n"
3270 "packuswb %%xmm3,%%xmm2 \n"
3271 "movdqu %%xmm0,(%1) \n"
3272 "movdqu %%xmm2,0x00(%1,%2,1) \n"
3273 "lea 0x10(%1),%1 \n"
3274 "sub $0x10,%3 \n"
3275 "jg 1b \n"
3276 : "+r"(src_uv), // %0
3277 "+r"(dst_u), // %1
3278 "+r"(dst_v), // %2
3279 "+r"(width) // %3
3280 :
3281 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
3282 }
3283 #endif // HAS_SPLITUVROW_SSE2
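
// Scalar sketch (assumption: illustrative only; name is hypothetical). The
// mask-and-pack idiom in both SplitUVRow variants above is a plain
// deinterleave:
static void SplitUVRow_Sketch(const uint8_t* src_uv,
                              uint8_t* dst_u,
                              uint8_t* dst_v,
                              int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x];      // pand with 0x00ff + packuswb
    dst_v[x] = src_uv[2 * x + 1];  // psrlw $8 + packuswb
  }
}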
3284
3285 #ifdef HAS_MERGEUVROW_AVX2
3286 void MergeUVRow_AVX2(const uint8_t* src_u,
3287 const uint8_t* src_v,
3288 uint8_t* dst_uv,
3289 int width) {
3290 asm volatile(
3291
3292 "sub %0,%1 \n"
3293
3294 LABELALIGN
3295 "1: \n"
3296 "vmovdqu (%0),%%ymm0 \n"
3297 "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
3298 "lea 0x20(%0),%0 \n"
3299 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
3300 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
3301 "vextractf128 $0x0,%%ymm2,(%2) \n"
3302 "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
3303 "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
3304 "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
3305 "lea 0x40(%2),%2 \n"
3306 "sub $0x20,%3 \n"
3307 "jg 1b \n"
3308 "vzeroupper \n"
3309 : "+r"(src_u), // %0
3310 "+r"(src_v), // %1
3311 "+r"(dst_uv), // %2
3312 "+r"(width) // %3
3313 :
3314 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3315 }
3316 #endif // HAS_MERGEUVROW_AVX2
3317
3318 #ifdef HAS_MERGEUVROW_SSE2
3319 void MergeUVRow_SSE2(const uint8_t* src_u,
3320 const uint8_t* src_v,
3321 uint8_t* dst_uv,
3322 int width) {
3323 asm volatile(
3324
3325 "sub %0,%1 \n"
3326
3327 LABELALIGN
3328 "1: \n"
3329 "movdqu (%0),%%xmm0 \n"
3330 "movdqu 0x00(%0,%1,1),%%xmm1 \n"
3331 "lea 0x10(%0),%0 \n"
3332 "movdqa %%xmm0,%%xmm2 \n"
3333 "punpcklbw %%xmm1,%%xmm0 \n"
3334 "punpckhbw %%xmm1,%%xmm2 \n"
3335 "movdqu %%xmm0,(%2) \n"
3336 "movdqu %%xmm2,0x10(%2) \n"
3337 "lea 0x20(%2),%2 \n"
3338 "sub $0x10,%3 \n"
3339 "jg 1b \n"
3340 : "+r"(src_u), // %0
3341 "+r"(src_v), // %1
3342 "+r"(dst_uv), // %2
3343 "+r"(width) // %3
3344 :
3345 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3346 }
3347 #endif // HAS_MERGEUVROW_SSE2
3348
3349 // Use scale to convert lsb formats to msb, depending on how many bits there are:
3350 // 128 = 9 bits
3351 // 64 = 10 bits
3352 // 16 = 12 bits
3353 // 1 = 16 bits
3354 #ifdef HAS_MERGEUVROW_16_AVX2
3355 void MergeUVRow_16_AVX2(const uint16_t* src_u,
3356 const uint16_t* src_v,
3357 uint16_t* dst_uv,
3358 int scale,
3359 int width) {
3360 // clang-format off
3361 asm volatile (
3362 "vmovd %4,%%xmm3 \n"
3363 "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
3364 "vbroadcastss %%xmm3,%%ymm3 \n"
3365 "sub %0,%1 \n"
3366
3367 // 16 pixels per loop.
3368 LABELALIGN
3369 "1: \n"
3370 "vmovdqu (%0),%%ymm0 \n"
3371 "vmovdqu (%0,%1,1),%%ymm1 \n"
3372 "add $0x20,%0 \n"
3373
3374 "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
3375 "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
3376 "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
3377 "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
3378 "vextractf128 $0x0,%%ymm2,(%2) \n"
3379 "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
3380 "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
3381 "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
3382 "add $0x40,%2 \n"
3383 "sub $0x10,%3 \n"
3384 "jg 1b \n"
3385 "vzeroupper \n"
3386 : "+r"(src_u), // %0
3387 "+r"(src_v), // %1
3388 "+r"(dst_uv), // %2
3389 "+r"(width) // %3
3390 : "r"(scale) // %4
3391 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
3392 // clang-format on
3393 }
3394 #endif // HAS_MERGEUVROW_16_AVX2
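
// Scalar sketch (assumption: illustrative only; name is hypothetical) of the
// scale table preceding MergeUVRow_16_AVX2. vpmullw moves the significant
// bits to the top of each 16-bit lane while interleaving; e.g. 10-bit input
// with scale 64 is a left shift by 6:
static void MergeUVRow_16_Sketch(const uint16_t* src_u,
                                 const uint16_t* src_v,
                                 uint16_t* dst_uv,
                                 int scale,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = (uint16_t)(src_u[x] * scale);
    dst_uv[2 * x + 1] = (uint16_t)(src_v[x] * scale);
  }
}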
3395
3396 // Use scale to convert lsb formats to msb, depending on how many bits there are:
3397 // 128 = 9 bits
3398 // 64 = 10 bits
3399 // 16 = 12 bits
3400 // 1 = 16 bits
3401 #ifdef HAS_MULTIPLYROW_16_AVX2
3402 void MultiplyRow_16_AVX2(const uint16_t* src_y,
3403 uint16_t* dst_y,
3404 int scale,
3405 int width) {
3406 // clang-format off
3407 asm volatile (
3408 "vmovd %3,%%xmm3 \n"
3409 "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
3410 "vbroadcastss %%xmm3,%%ymm3 \n"
3411 "sub %0,%1 \n"
3412
3413 // 16 pixels per loop.
3414 LABELALIGN
3415 "1: \n"
3416 "vmovdqu (%0),%%ymm0 \n"
3417 "vmovdqu 0x20(%0),%%ymm1 \n"
3418 "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
3419 "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
3420 "vmovdqu %%ymm0,(%0,%1) \n"
3421 "vmovdqu %%ymm1,0x20(%0,%1) \n"
3422 "add $0x40,%0 \n"
3423 "sub $0x20,%2 \n"
3424 "jg 1b \n"
3425 "vzeroupper \n"
3426 : "+r"(src_y), // %0
3427 "+r"(dst_y), // %1
3428 "+r"(width) // %2
3429 : "r"(scale) // %3
3430 : "memory", "cc", "xmm0", "xmm1", "xmm3");
3431 // clang-format on
3432 }
3433 #endif // HAS_MULTIPLYROW_16_AVX2
3434
3435 // Use scale to convert lsb formats to msb, depending on how many bits there are:
3436 // 32768 = 9 bits
3437 // 16384 = 10 bits
3438 // 4096 = 12 bits
3439 // 256 = 16 bits
3440 void Convert16To8Row_SSSE3(const uint16_t* src_y,
3441 uint8_t* dst_y,
3442 int scale,
3443 int width) {
3444 // clang-format off
3445 asm volatile (
3446 "movd %3,%%xmm2 \n"
3447 "punpcklwd %%xmm2,%%xmm2 \n"
3448 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3449
3450 // 16 pixels per loop.
3451 LABELALIGN
3452 "1: \n"
3453 "movdqu (%0),%%xmm0 \n"
3454 "movdqu 0x10(%0),%%xmm1 \n"
3455 "add $0x20,%0 \n"
3456 "pmulhuw %%xmm2,%%xmm0 \n"
3457 "pmulhuw %%xmm2,%%xmm1 \n"
3458 "packuswb %%xmm1,%%xmm0 \n"
3459 "movdqu %%xmm0,(%1) \n"
3460 "add $0x10,%1 \n"
3461 "sub $0x10,%2 \n"
3462 "jg 1b \n"
3463 : "+r"(src_y), // %0
3464 "+r"(dst_y), // %1
3465 "+r"(width) // %2
3466 : "r"(scale) // %3
3467 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3468 // clang-format on
3469 }
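
// Scalar form of the pmulhuw trick above: the high 16 bits of (v * scale)
// drop an msb-aligned 16 bit sample to 8 bits, and packuswb saturates to
// 0..255. A sketch only; the _Sketch name is illustrative.
static void Convert16To8Row_Sketch(const uint16_t* src_y,
                                   uint8_t* dst_y,
                                   int scale,  // e.g. 16384 for 10 bit input.
                                   int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t v = ((uint32_t)src_y[x] * (uint32_t)scale) >> 16;  // pmulhuw.
    dst_y[x] = (uint8_t)(v > 255 ? 255 : v);                    // packuswb.
  }
}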
3470
3471 #ifdef HAS_CONVERT16TO8ROW_AVX2
3472 void Convert16To8Row_AVX2(const uint16_t* src_y,
3473 uint8_t* dst_y,
3474 int scale,
3475 int width) {
3476 // clang-format off
3477 asm volatile (
3478 "vmovd %3,%%xmm2 \n"
3479 "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
3480 "vbroadcastss %%xmm2,%%ymm2 \n"
3481
3482 // 32 pixels per loop.
3483 LABELALIGN
3484 "1: \n"
3485 "vmovdqu (%0),%%ymm0 \n"
3486 "vmovdqu 0x20(%0),%%ymm1 \n"
3487 "add $0x40,%0 \n"
3488 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3489 "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
3490 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
3491 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3492 "vmovdqu %%ymm0,(%1) \n"
3493 "add $0x20,%1 \n"
3494 "sub $0x20,%2 \n"
3495 "jg 1b \n"
3496 "vzeroupper \n"
3497 : "+r"(src_y), // %0
3498 "+r"(dst_y), // %1
3499 "+r"(width) // %2
3500 : "r"(scale) // %3
3501 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3502 // clang-format on
3503 }
3504 #endif // HAS_CONVERT16TO8ROW_AVX2
3505
3506 // Use scale to convert to lsb formats, depending on how many bits there are:
3507 // 512 = 9 bits
3508 // 1024 = 10 bits
3509 // 4096 = 12 bits
3510 // TODO(fbarchard): reduce to SSE2
3511 void Convert8To16Row_SSE2(const uint8_t* src_y,
3512 uint16_t* dst_y,
3513 int scale,
3514 int width) {
3515 // clang-format off
3516 asm volatile (
3517 "movd %3,%%xmm2 \n"
3518 "punpcklwd %%xmm2,%%xmm2 \n"
3519 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3520
3521 // 16 pixels per loop.
3522 LABELALIGN
3523 "1: \n"
3524 "movdqu (%0),%%xmm0 \n"
3525 "movdqa %%xmm0,%%xmm1 \n"
3526 "punpcklbw %%xmm0,%%xmm0 \n"
3527 "punpckhbw %%xmm1,%%xmm1 \n"
3528 "add $0x10,%0 \n"
3529 "pmulhuw %%xmm2,%%xmm0 \n"
3530 "pmulhuw %%xmm2,%%xmm1 \n"
3531 "movdqu %%xmm0,(%1) \n"
3532 "movdqu %%xmm1,0x10(%1) \n"
3533 "add $0x20,%1 \n"
3534 "sub $0x10,%2 \n"
3535 "jg 1b \n"
3536 : "+r"(src_y), // %0
3537 "+r"(dst_y), // %1
3538 "+r"(width) // %2
3539 : "r"(scale) // %3
3540 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3541 // clang-format on
3542 }
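
// Scalar form of the widening above: punpcklbw of a register with itself
// replicates each byte into both halves of a word (v * 0x0101), and pmulhuw
// then scales down to the requested lsb alignment. A sketch only; the
// _Sketch name is illustrative.
static void Convert8To16Row_Sketch(const uint8_t* src_y,
                                   uint16_t* dst_y,
                                   int scale,  // e.g. 1024 for 10 bit output.
                                   int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t v = (uint32_t)src_y[x] * 0x0101u;           // Duplicate the byte.
    dst_y[x] = (uint16_t)((v * (uint32_t)scale) >> 16);  // pmulhuw.
  }
}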
3543
3544 #ifdef HAS_CONVERT8TO16ROW_AVX2
3545 void Convert8To16Row_AVX2(const uint8_t* src_y,
3546 uint16_t* dst_y,
3547 int scale,
3548 int width) {
3549 // clang-format off
3550 asm volatile (
3551 "vmovd %3,%%xmm2 \n"
3552 "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
3553 "vbroadcastss %%xmm2,%%ymm2 \n"
3554
3555 // 32 pixels per loop.
3556 LABELALIGN
3557 "1: \n"
3558 "vmovdqu (%0),%%ymm0 \n"
3559 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3560 "add $0x20,%0 \n"
3561 "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
3562 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
3563 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3564 "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
3565 "vmovdqu %%ymm0,(%1) \n"
3566 "vmovdqu %%ymm1,0x20(%1) \n"
3567 "add $0x40,%1 \n"
3568 "sub $0x20,%2 \n"
3569 "jg 1b \n"
3570 "vzeroupper \n"
3571 : "+r"(src_y), // %0
3572 "+r"(dst_y), // %1
3573 "+r"(width) // %2
3574 : "r"(scale) // %3
3575 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3576 // clang-format on
3577 }
3578 #endif // HAS_CONVERT8TO16ROW_AVX2
3579
3580 #ifdef HAS_SPLITRGBROW_SSSE3
3581
3582 // Shuffle tables for converting RGB to planar.
3583 static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
3584 128u, 128u, 128u, 128u, 128u, 128u,
3585 128u, 128u, 128u, 128u};
3586 static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
3587 2u, 5u, 8u, 11u, 14u, 128u,
3588 128u, 128u, 128u, 128u};
3589 static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
3590 128u, 128u, 128u, 128u, 128u, 1u,
3591 4u, 7u, 10u, 13u};
3592
3593 static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
3594 128u, 128u, 128u, 128u, 128u, 128u,
3595 128u, 128u, 128u, 128u};
3596 static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
3597 3u, 6u, 9u, 12u, 15u, 128u,
3598 128u, 128u, 128u, 128u};
3599 static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
3600 128u, 128u, 128u, 128u, 128u, 2u,
3601 5u, 8u, 11u, 14u};
3602
3603 static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
3604 128u, 128u, 128u, 128u, 128u, 128u,
3605 128u, 128u, 128u, 128u};
3606 static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
3607 4u, 7u, 10u, 13u, 128u, 128u,
3608 128u, 128u, 128u, 128u};
3609 static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
3610 128u, 128u, 128u, 128u, 0u, 3u,
3611 6u, 9u, 12u, 15u};
3612
3613 void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
3614 uint8_t* dst_r,
3615 uint8_t* dst_g,
3616 uint8_t* dst_b,
3617 int width) {
3618 asm volatile(
3619
3620 LABELALIGN
3621 "1: \n"
3622 "movdqu (%0),%%xmm0 \n"
3623 "movdqu 0x10(%0),%%xmm1 \n"
3624 "movdqu 0x20(%0),%%xmm2 \n"
3625 "pshufb %5, %%xmm0 \n"
3626 "pshufb %6, %%xmm1 \n"
3627 "pshufb %7, %%xmm2 \n"
3628 "por %%xmm1,%%xmm0 \n"
3629 "por %%xmm2,%%xmm0 \n"
3630 "movdqu %%xmm0,(%1) \n"
3631 "lea 0x10(%1),%1 \n"
3632
3633 "movdqu (%0),%%xmm0 \n"
3634 "movdqu 0x10(%0),%%xmm1 \n"
3635 "movdqu 0x20(%0),%%xmm2 \n"
3636 "pshufb %8, %%xmm0 \n"
3637 "pshufb %9, %%xmm1 \n"
3638 "pshufb %10, %%xmm2 \n"
3639 "por %%xmm1,%%xmm0 \n"
3640 "por %%xmm2,%%xmm0 \n"
3641 "movdqu %%xmm0,(%2) \n"
3642 "lea 0x10(%2),%2 \n"
3643
3644 "movdqu (%0),%%xmm0 \n"
3645 "movdqu 0x10(%0),%%xmm1 \n"
3646 "movdqu 0x20(%0),%%xmm2 \n"
3647 "pshufb %11, %%xmm0 \n"
3648 "pshufb %12, %%xmm1 \n"
3649 "pshufb %13, %%xmm2 \n"
3650 "por %%xmm1,%%xmm0 \n"
3651 "por %%xmm2,%%xmm0 \n"
3652 "movdqu %%xmm0,(%3) \n"
3653 "lea 0x10(%3),%3 \n"
3654 "lea 0x30(%0),%0 \n"
3655 "sub $0x10,%4 \n"
3656 "jg 1b \n"
3657 : "+r"(src_rgb), // %0
3658 "+r"(dst_r), // %1
3659 "+r"(dst_g), // %2
3660 "+r"(dst_b), // %3
3661 "+r"(width) // %4
3662 : "m"(kShuffleMaskRGBToR0), // %5
3663 "m"(kShuffleMaskRGBToR1), // %6
3664 "m"(kShuffleMaskRGBToR2), // %7
3665 "m"(kShuffleMaskRGBToG0), // %8
3666 "m"(kShuffleMaskRGBToG1), // %9
3667 "m"(kShuffleMaskRGBToG2), // %10
3668 "m"(kShuffleMaskRGBToB0), // %11
3669 "m"(kShuffleMaskRGBToB1), // %12
3670 "m"(kShuffleMaskRGBToB2) // %13
3671 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3672 }
3673 #endif // HAS_SPLITRGBROW_SSSE3
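
// In the pshufb masks above, index 128 (bit 7 set) zeroes the destination
// byte, so each of the three shuffles contributes a disjoint part of the
// register and por combines them; MergeRGBRow_SSSE3 below inverts the
// process with the same masking idea. The whole kernel is a scalar
// deinterleave. A sketch only; the _Sketch name is illustrative.
static void SplitRGBRow_Sketch(const uint8_t* src_rgb,
                               uint8_t* dst_r,
                               uint8_t* dst_g,
                               uint8_t* dst_b,
                               int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_r[x] = src_rgb[3 * x + 0];
    dst_g[x] = src_rgb[3 * x + 1];
    dst_b[x] = src_rgb[3 * x + 2];
  }
}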
3674
3675 #ifdef HAS_MERGERGBROW_SSSE3
3676
3677 // Shuffle tables for converting planar to RGB.
3678 static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
3679 2u, 128u, 128u, 3u, 128u, 128u,
3680 4u, 128u, 128u, 5u};
3681 static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
3682 128u, 2u, 128u, 128u, 3u, 128u,
3683 128u, 4u, 128u, 128u};
3684 static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
3685 128u, 128u, 2u, 128u, 128u, 3u,
3686 128u, 128u, 4u, 128u};
3687
3688 static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
3689 7u, 128u, 128u, 8u, 128u, 128u,
3690 9u, 128u, 128u, 10u};
3691 static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
3692 128u, 7u, 128u, 128u, 8u, 128u,
3693 128u, 9u, 128u, 128u};
3694 static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
3695 128u, 128u, 8u, 128u, 128u, 9u,
3696 128u, 128u, 10u, 128u};
3697
3698 static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
3699 12u, 128u, 128u, 13u, 128u, 128u,
3700 14u, 128u, 128u, 15u};
3701 static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
3702 128u, 13u, 128u, 128u, 14u, 128u,
3703 128u, 15u, 128u, 128u};
3704 static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
3705 128u, 128u, 13u, 128u, 128u, 14u,
3706 128u, 128u, 15u, 128u};
3707
3708 void MergeRGBRow_SSSE3(const uint8_t* src_r,
3709 const uint8_t* src_g,
3710 const uint8_t* src_b,
3711 uint8_t* dst_rgb,
3712 int width) {
3713 asm volatile(
3714
3715 LABELALIGN
3716 "1: \n"
3717 "movdqu (%0),%%xmm0 \n"
3718 "movdqu (%1),%%xmm1 \n"
3719 "movdqu (%2),%%xmm2 \n"
3720 "pshufb %5, %%xmm0 \n"
3721 "pshufb %6, %%xmm1 \n"
3722 "pshufb %7, %%xmm2 \n"
3723 "por %%xmm1,%%xmm0 \n"
3724 "por %%xmm2,%%xmm0 \n"
3725 "movdqu %%xmm0,(%3) \n"
3726
3727 "movdqu (%0),%%xmm0 \n"
3728 "movdqu (%1),%%xmm1 \n"
3729 "movdqu (%2),%%xmm2 \n"
3730 "pshufb %8, %%xmm0 \n"
3731 "pshufb %9, %%xmm1 \n"
3732 "pshufb %10, %%xmm2 \n"
3733 "por %%xmm1,%%xmm0 \n"
3734 "por %%xmm2,%%xmm0 \n"
3735 "movdqu %%xmm0,16(%3) \n"
3736
3737 "movdqu (%0),%%xmm0 \n"
3738 "movdqu (%1),%%xmm1 \n"
3739 "movdqu (%2),%%xmm2 \n"
3740 "pshufb %11, %%xmm0 \n"
3741 "pshufb %12, %%xmm1 \n"
3742 "pshufb %13, %%xmm2 \n"
3743 "por %%xmm1,%%xmm0 \n"
3744 "por %%xmm2,%%xmm0 \n"
3745 "movdqu %%xmm0,32(%3) \n"
3746
3747 "lea 0x10(%0),%0 \n"
3748 "lea 0x10(%1),%1 \n"
3749 "lea 0x10(%2),%2 \n"
3750 "lea 0x30(%3),%3 \n"
3751 "sub $0x10,%4 \n"
3752 "jg 1b \n"
3753 : "+r"(src_r), // %0
3754 "+r"(src_g), // %1
3755 "+r"(src_b), // %2
3756 "+r"(dst_rgb), // %3
3757 "+r"(width) // %4
3758 : "m"(kShuffleMaskRToRGB0), // %5
3759 "m"(kShuffleMaskGToRGB0), // %6
3760 "m"(kShuffleMaskBToRGB0), // %7
3761 "m"(kShuffleMaskRToRGB1), // %8
3762 "m"(kShuffleMaskGToRGB1), // %9
3763 "m"(kShuffleMaskBToRGB1), // %10
3764 "m"(kShuffleMaskRToRGB2), // %11
3765 "m"(kShuffleMaskGToRGB2), // %12
3766 "m"(kShuffleMaskBToRGB2) // %13
3767 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3768 }
3769 #endif // HAS_MERGERGBROW_SSSE3
3770
3771 #ifdef HAS_COPYROW_SSE2
3772 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
3773 asm volatile(
3774 "test $0xf,%0 \n"
3775 "jne 2f \n"
3776 "test $0xf,%1 \n"
3777 "jne 2f \n"
3778
3779 LABELALIGN
3780 "1: \n"
3781 "movdqa (%0),%%xmm0 \n"
3782 "movdqa 0x10(%0),%%xmm1 \n"
3783 "lea 0x20(%0),%0 \n"
3784 "movdqa %%xmm0,(%1) \n"
3785 "movdqa %%xmm1,0x10(%1) \n"
3786 "lea 0x20(%1),%1 \n"
3787 "sub $0x20,%2 \n"
3788 "jg 1b \n"
3789 "jmp 9f \n"
3790
3791 LABELALIGN
3792 "2: \n"
3793 "movdqu (%0),%%xmm0 \n"
3794 "movdqu 0x10(%0),%%xmm1 \n"
3795 "lea 0x20(%0),%0 \n"
3796 "movdqu %%xmm0,(%1) \n"
3797 "movdqu %%xmm1,0x10(%1) \n"
3798 "lea 0x20(%1),%1 \n"
3799 "sub $0x20,%2 \n"
3800 "jg 2b \n"
3801
3802 LABELALIGN "9: \n"
3803 : "+r"(src), // %0
3804 "+r"(dst), // %1
3805 "+r"(width) // %2
3806 :
3807 : "memory", "cc", "xmm0", "xmm1");
3808 }
3809 #endif // HAS_COPYROW_SSE2
3810
3811 #ifdef HAS_COPYROW_AVX
3812 void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
3813 asm volatile(
3814
3815 LABELALIGN
3816 "1: \n"
3817 "vmovdqu (%0),%%ymm0 \n"
3818 "vmovdqu 0x20(%0),%%ymm1 \n"
3819 "lea 0x40(%0),%0 \n"
3820 "vmovdqu %%ymm0,(%1) \n"
3821 "vmovdqu %%ymm1,0x20(%1) \n"
3822 "lea 0x40(%1),%1 \n"
3823 "sub $0x40,%2 \n"
3824 "jg 1b \n"
3825 : "+r"(src), // %0
3826 "+r"(dst), // %1
3827 "+r"(width) // %2
3828 :
3829 : "memory", "cc", "xmm0", "xmm1");
3830 }
3831 #endif // HAS_COPYROW_AVX
3832
3833 #ifdef HAS_COPYROW_ERMS
3834 // Width can be any value (a multiple of 1); rep movsb copies byte by byte.
3835 void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
3836 size_t width_tmp = (size_t)(width);
3837 asm volatile(
3838
3839 "rep movsb \n"
3840 : "+S"(src), // %0
3841 "+D"(dst), // %1
3842 "+c"(width_tmp) // %2
3843 :
3844 : "memory", "cc");
3845 }
3846 #endif // HAS_COPYROW_ERMS
3847
3848 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3849 // width in pixels
3850 void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
3851 asm volatile(
3852 "pcmpeqb %%xmm0,%%xmm0 \n"
3853 "pslld $0x18,%%xmm0 \n"
3854 "pcmpeqb %%xmm1,%%xmm1 \n"
3855 "psrld $0x8,%%xmm1 \n"
3856
3857 LABELALIGN
3858 "1: \n"
3859 "movdqu (%0),%%xmm2 \n"
3860 "movdqu 0x10(%0),%%xmm3 \n"
3861 "lea 0x20(%0),%0 \n"
3862 "movdqu (%1),%%xmm4 \n"
3863 "movdqu 0x10(%1),%%xmm5 \n"
3864 "pand %%xmm0,%%xmm2 \n"
3865 "pand %%xmm0,%%xmm3 \n"
3866 "pand %%xmm1,%%xmm4 \n"
3867 "pand %%xmm1,%%xmm5 \n"
3868 "por %%xmm4,%%xmm2 \n"
3869 "por %%xmm5,%%xmm3 \n"
3870 "movdqu %%xmm2,(%1) \n"
3871 "movdqu %%xmm3,0x10(%1) \n"
3872 "lea 0x20(%1),%1 \n"
3873 "sub $0x8,%2 \n"
3874 "jg 1b \n"
3875 : "+r"(src), // %0
3876 "+r"(dst), // %1
3877 "+r"(width) // %2
3878 :
3879 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
3880 }
3881 #endif // HAS_ARGBCOPYALPHAROW_SSE2
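
// The two masks built above are complementary: 0xff000000 keeps the source
// alpha, 0x00ffffff keeps the destination color, and por merges them.
// Scalar sketch (the _Sketch name is illustrative):
static void ARGBCopyAlphaRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[4 * x + 3] = src[4 * x + 3];  // Alpha is byte 3; B,G,R stay intact.
  }
}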
3882
3883 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3884 // width in pixels
3885 void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
3886 asm volatile(
3887 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
3888 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
3889
3890 LABELALIGN
3891 "1: \n"
3892 "vmovdqu (%0),%%ymm1 \n"
3893 "vmovdqu 0x20(%0),%%ymm2 \n"
3894 "lea 0x40(%0),%0 \n"
3895 "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
3896 "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
3897 "vmovdqu %%ymm1,(%1) \n"
3898 "vmovdqu %%ymm2,0x20(%1) \n"
3899 "lea 0x40(%1),%1 \n"
3900 "sub $0x10,%2 \n"
3901 "jg 1b \n"
3902 "vzeroupper \n"
3903 : "+r"(src), // %0
3904 "+r"(dst), // %1
3905 "+r"(width) // %2
3906 :
3907 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3908 }
3909 #endif // HAS_ARGBCOPYALPHAROW_AVX2
3910
3911 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
3912 // width in pixels
3913 void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
3914 uint8_t* dst_a,
3915 int width) {
3916 asm volatile(
3917
3918 LABELALIGN
3919 "1: \n"
3920 "movdqu (%0), %%xmm0 \n"
3921 "movdqu 0x10(%0), %%xmm1 \n"
3922 "lea 0x20(%0), %0 \n"
3923 "psrld $0x18, %%xmm0 \n"
3924 "psrld $0x18, %%xmm1 \n"
3925 "packssdw %%xmm1, %%xmm0 \n"
3926 "packuswb %%xmm0, %%xmm0 \n"
3927 "movq %%xmm0,(%1) \n"
3928 "lea 0x8(%1), %1 \n"
3929 "sub $0x8, %2 \n"
3930 "jg 1b \n"
3931 : "+r"(src_argb), // %0
3932 "+r"(dst_a), // %1
3933 "+rm"(width) // %2
3934 :
3935 : "memory", "cc", "xmm0", "xmm1");
3936 }
3937 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
3938
3939 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
3940 static const uvec8 kShuffleAlphaShort_AVX2 = {
3941 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
3942 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
3943
3944 void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
3945 uint8_t* dst_a,
3946 int width) {
3947 asm volatile(
3948 "vmovdqa %3,%%ymm4 \n"
3949 "vbroadcastf128 %4,%%ymm5 \n"
3950
3951 LABELALIGN
3952 "1: \n"
3953 "vmovdqu (%0), %%ymm0 \n"
3954 "vmovdqu 0x20(%0), %%ymm1 \n"
3955 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
3956 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
3957 "vmovdqu 0x40(%0), %%ymm2 \n"
3958 "vmovdqu 0x60(%0), %%ymm3 \n"
3959 "lea 0x80(%0), %0 \n"
3960 "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
3961 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
3962 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
3963 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
3964 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
3965 "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
3966 "vmovdqu %%ymm0,(%1) \n"
3967 "lea 0x20(%1),%1 \n"
3968 "sub $0x20, %2 \n"
3969 "jg 1b \n"
3970 "vzeroupper \n"
3971 : "+r"(src_argb), // %0
3972 "+r"(dst_a), // %1
3973 "+rm"(width) // %2
3974 : "m"(kPermdARGBToY_AVX), // %3
3975 "m"(kShuffleAlphaShort_AVX2) // %4
3976 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
3977 }
3978 #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
3979
3980 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3981 // width in pixels
3982 void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
3983 asm volatile(
3984 "pcmpeqb %%xmm0,%%xmm0 \n"
3985 "pslld $0x18,%%xmm0 \n"
3986 "pcmpeqb %%xmm1,%%xmm1 \n"
3987 "psrld $0x8,%%xmm1 \n"
3988
3989 LABELALIGN
3990 "1: \n"
3991 "movq (%0),%%xmm2 \n"
3992 "lea 0x8(%0),%0 \n"
3993 "punpcklbw %%xmm2,%%xmm2 \n"
3994 "punpckhwd %%xmm2,%%xmm3 \n"
3995 "punpcklwd %%xmm2,%%xmm2 \n"
3996 "movdqu (%1),%%xmm4 \n"
3997 "movdqu 0x10(%1),%%xmm5 \n"
3998 "pand %%xmm0,%%xmm2 \n"
3999 "pand %%xmm0,%%xmm3 \n"
4000 "pand %%xmm1,%%xmm4 \n"
4001 "pand %%xmm1,%%xmm5 \n"
4002 "por %%xmm4,%%xmm2 \n"
4003 "por %%xmm5,%%xmm3 \n"
4004 "movdqu %%xmm2,(%1) \n"
4005 "movdqu %%xmm3,0x10(%1) \n"
4006 "lea 0x20(%1),%1 \n"
4007 "sub $0x8,%2 \n"
4008 "jg 1b \n"
4009 : "+r"(src), // %0
4010 "+r"(dst), // %1
4011 "+r"(width) // %2
4012 :
4013 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4014 }
4015 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
4016
4017 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
4018 // width in pixels
4019 void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
4020 asm volatile(
4021 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
4022 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
4023
4024 LABELALIGN
4025 "1: \n"
4026 "vpmovzxbd (%0),%%ymm1 \n"
4027 "vpmovzxbd 0x8(%0),%%ymm2 \n"
4028 "lea 0x10(%0),%0 \n"
4029 "vpslld $0x18,%%ymm1,%%ymm1 \n"
4030 "vpslld $0x18,%%ymm2,%%ymm2 \n"
4031 "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
4032 "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
4033 "vmovdqu %%ymm1,(%1) \n"
4034 "vmovdqu %%ymm2,0x20(%1) \n"
4035 "lea 0x40(%1),%1 \n"
4036 "sub $0x10,%2 \n"
4037 "jg 1b \n"
4038 "vzeroupper \n"
4039 : "+r"(src), // %0
4040 "+r"(dst), // %1
4041 "+r"(width) // %2
4042 :
4043 : "memory", "cc", "xmm0", "xmm1", "xmm2");
4044 }
4045 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
4046
4047 #ifdef HAS_SETROW_X86
4048 void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
4049 size_t width_tmp = (size_t)(width >> 2);
4050 const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
4051 asm volatile(
4052
4053 "rep stosl \n"
4054 : "+D"(dst), // %0
4055 "+c"(width_tmp) // %1
4056 : "a"(v32) // %2
4057 : "memory", "cc");
4058 }
4059
4060 void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
4061 size_t width_tmp = (size_t)(width);
4062 asm volatile(
4063
4064 "rep stosb \n"
4065 : "+D"(dst), // %0
4066 "+c"(width_tmp) // %1
4067 : "a"(v8) // %2
4068 : "memory", "cc");
4069 }
4070
4071 void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
4072 size_t width_tmp = (size_t)(width);
4073 asm volatile(
4074
4075 "rep stosl \n"
4076 : "+D"(dst_argb), // %0
4077 "+c"(width_tmp) // %1
4078 : "a"(v32) // %2
4079 : "memory", "cc");
4080 }
4081 #endif // HAS_SETROW_X86
4082
4083 #ifdef HAS_YUY2TOYROW_SSE2
4084 void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
4085 asm volatile(
4086 "pcmpeqb %%xmm5,%%xmm5 \n"
4087 "psrlw $0x8,%%xmm5 \n"
4088
4089 LABELALIGN
4090 "1: \n"
4091 "movdqu (%0),%%xmm0 \n"
4092 "movdqu 0x10(%0),%%xmm1 \n"
4093 "lea 0x20(%0),%0 \n"
4094 "pand %%xmm5,%%xmm0 \n"
4095 "pand %%xmm5,%%xmm1 \n"
4096 "packuswb %%xmm1,%%xmm0 \n"
4097 "movdqu %%xmm0,(%1) \n"
4098 "lea 0x10(%1),%1 \n"
4099 "sub $0x10,%2 \n"
4100 "jg 1b \n"
4101 : "+r"(src_yuy2), // %0
4102 "+r"(dst_y), // %1
4103 "+r"(width) // %2
4104 :
4105 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4106 }
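
// YUY2 stores each macropixel as Y0 U Y1 V, so luma is the even bytes; the
// pand with 0x00ff words plus packuswb above is an even-byte gather.
// Scalar sketch (the _Sketch name is illustrative):
static void YUY2ToYRow_Sketch(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = src_yuy2[2 * x];  // Even bytes are Y; odd bytes alternate U,V.
  }
}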
4107
4108 void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
4109 int stride_yuy2,
4110 uint8_t* dst_u,
4111 uint8_t* dst_v,
4112 int width) {
4113 asm volatile(
4114 "pcmpeqb %%xmm5,%%xmm5 \n"
4115 "psrlw $0x8,%%xmm5 \n"
4116 "sub %1,%2 \n"
4117
4118 LABELALIGN
4119 "1: \n"
4120 "movdqu (%0),%%xmm0 \n"
4121 "movdqu 0x10(%0),%%xmm1 \n"
4122 "movdqu 0x00(%0,%4,1),%%xmm2 \n"
4123 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
4124 "lea 0x20(%0),%0 \n"
4125 "pavgb %%xmm2,%%xmm0 \n"
4126 "pavgb %%xmm3,%%xmm1 \n"
4127 "psrlw $0x8,%%xmm0 \n"
4128 "psrlw $0x8,%%xmm1 \n"
4129 "packuswb %%xmm1,%%xmm0 \n"
4130 "movdqa %%xmm0,%%xmm1 \n"
4131 "pand %%xmm5,%%xmm0 \n"
4132 "packuswb %%xmm0,%%xmm0 \n"
4133 "psrlw $0x8,%%xmm1 \n"
4134 "packuswb %%xmm1,%%xmm1 \n"
4135 "movq %%xmm0,(%1) \n"
4136 "movq %%xmm1,0x00(%1,%2,1) \n"
4137 "lea 0x8(%1),%1 \n"
4138 "sub $0x10,%3 \n"
4139 "jg 1b \n"
4140 : "+r"(src_yuy2), // %0
4141 "+r"(dst_u), // %1
4142 "+r"(dst_v), // %2
4143 "+r"(width) // %3
4144 : "r"((intptr_t)(stride_yuy2)) // %4
4145 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4146 }
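
// The UV variant averages chroma from two rows (the pavgb against the row
// at stride_yuy2) before splitting U from V. Scalar sketch, assuming the
// even width the SIMD path also requires; the _Sketch name is illustrative.
static void YUY2ToUVRow_Sketch(const uint8_t* src_yuy2,
                               int stride_yuy2,
                               uint8_t* dst_u,
                               uint8_t* dst_v,
                               int width) {
  int x;
  for (x = 0; x < width; x += 2) {  // One U,V pair per 2 pixels (4 bytes).
    const uint8_t* row0 = src_yuy2 + x * 2;
    const uint8_t* row1 = row0 + stride_yuy2;
    dst_u[x / 2] = (uint8_t)((row0[1] + row1[1] + 1) >> 1);  // pavgb rounds.
    dst_v[x / 2] = (uint8_t)((row0[3] + row1[3] + 1) >> 1);
  }
}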
4147
4148 void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
4149 uint8_t* dst_u,
4150 uint8_t* dst_v,
4151 int width) {
4152 asm volatile(
4153 "pcmpeqb %%xmm5,%%xmm5 \n"
4154 "psrlw $0x8,%%xmm5 \n"
4155 "sub %1,%2 \n"
4156
4157 LABELALIGN
4158 "1: \n"
4159 "movdqu (%0),%%xmm0 \n"
4160 "movdqu 0x10(%0),%%xmm1 \n"
4161 "lea 0x20(%0),%0 \n"
4162 "psrlw $0x8,%%xmm0 \n"
4163 "psrlw $0x8,%%xmm1 \n"
4164 "packuswb %%xmm1,%%xmm0 \n"
4165 "movdqa %%xmm0,%%xmm1 \n"
4166 "pand %%xmm5,%%xmm0 \n"
4167 "packuswb %%xmm0,%%xmm0 \n"
4168 "psrlw $0x8,%%xmm1 \n"
4169 "packuswb %%xmm1,%%xmm1 \n"
4170 "movq %%xmm0,(%1) \n"
4171 "movq %%xmm1,0x00(%1,%2,1) \n"
4172 "lea 0x8(%1),%1 \n"
4173 "sub $0x10,%3 \n"
4174 "jg 1b \n"
4175 : "+r"(src_yuy2), // %0
4176 "+r"(dst_u), // %1
4177 "+r"(dst_v), // %2
4178 "+r"(width) // %3
4179 :
4180 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4181 }
4182
4183 void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
4184 asm volatile(
4185
4186 LABELALIGN
4187 "1: \n"
4188 "movdqu (%0),%%xmm0 \n"
4189 "movdqu 0x10(%0),%%xmm1 \n"
4190 "lea 0x20(%0),%0 \n"
4191 "psrlw $0x8,%%xmm0 \n"
4192 "psrlw $0x8,%%xmm1 \n"
4193 "packuswb %%xmm1,%%xmm0 \n"
4194 "movdqu %%xmm0,(%1) \n"
4195 "lea 0x10(%1),%1 \n"
4196 "sub $0x10,%2 \n"
4197 "jg 1b \n"
4198 : "+r"(src_uyvy), // %0
4199 "+r"(dst_y), // %1
4200 "+r"(width) // %2
4201 :
4202 : "memory", "cc", "xmm0", "xmm1");
4203 }
4204
4205 void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
4206 int stride_uyvy,
4207 uint8_t* dst_u,
4208 uint8_t* dst_v,
4209 int width) {
4210 asm volatile(
4211 "pcmpeqb %%xmm5,%%xmm5 \n"
4212 "psrlw $0x8,%%xmm5 \n"
4213 "sub %1,%2 \n"
4214
4215 LABELALIGN
4216 "1: \n"
4217 "movdqu (%0),%%xmm0 \n"
4218 "movdqu 0x10(%0),%%xmm1 \n"
4219 "movdqu 0x00(%0,%4,1),%%xmm2 \n"
4220 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
4221 "lea 0x20(%0),%0 \n"
4222 "pavgb %%xmm2,%%xmm0 \n"
4223 "pavgb %%xmm3,%%xmm1 \n"
4224 "pand %%xmm5,%%xmm0 \n"
4225 "pand %%xmm5,%%xmm1 \n"
4226 "packuswb %%xmm1,%%xmm0 \n"
4227 "movdqa %%xmm0,%%xmm1 \n"
4228 "pand %%xmm5,%%xmm0 \n"
4229 "packuswb %%xmm0,%%xmm0 \n"
4230 "psrlw $0x8,%%xmm1 \n"
4231 "packuswb %%xmm1,%%xmm1 \n"
4232 "movq %%xmm0,(%1) \n"
4233 "movq %%xmm1,0x00(%1,%2,1) \n"
4234 "lea 0x8(%1),%1 \n"
4235 "sub $0x10,%3 \n"
4236 "jg 1b \n"
4237 : "+r"(src_uyvy), // %0
4238 "+r"(dst_u), // %1
4239 "+r"(dst_v), // %2
4240 "+r"(width) // %3
4241 : "r"((intptr_t)(stride_uyvy)) // %4
4242 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4243 }
4244
4245 void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
4246 uint8_t* dst_u,
4247 uint8_t* dst_v,
4248 int width) {
4249 asm volatile(
4250 "pcmpeqb %%xmm5,%%xmm5 \n"
4251 "psrlw $0x8,%%xmm5 \n"
4252 "sub %1,%2 \n"
4253
4254 LABELALIGN
4255 "1: \n"
4256 "movdqu (%0),%%xmm0 \n"
4257 "movdqu 0x10(%0),%%xmm1 \n"
4258 "lea 0x20(%0),%0 \n"
4259 "pand %%xmm5,%%xmm0 \n"
4260 "pand %%xmm5,%%xmm1 \n"
4261 "packuswb %%xmm1,%%xmm0 \n"
4262 "movdqa %%xmm0,%%xmm1 \n"
4263 "pand %%xmm5,%%xmm0 \n"
4264 "packuswb %%xmm0,%%xmm0 \n"
4265 "psrlw $0x8,%%xmm1 \n"
4266 "packuswb %%xmm1,%%xmm1 \n"
4267 "movq %%xmm0,(%1) \n"
4268 "movq %%xmm1,0x00(%1,%2,1) \n"
4269 "lea 0x8(%1),%1 \n"
4270 "sub $0x10,%3 \n"
4271 "jg 1b \n"
4272 : "+r"(src_uyvy), // %0
4273 "+r"(dst_u), // %1
4274 "+r"(dst_v), // %2
4275 "+r"(width) // %3
4276 :
4277 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4278 }
4279 #endif // HAS_YUY2TOYROW_SSE2
4280
4281 #ifdef HAS_YUY2TOYROW_AVX2
4282 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
4283 asm volatile(
4284 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4285 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
4286
4287 LABELALIGN
4288 "1: \n"
4289 "vmovdqu (%0),%%ymm0 \n"
4290 "vmovdqu 0x20(%0),%%ymm1 \n"
4291 "lea 0x40(%0),%0 \n"
4292 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
4293 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
4294 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4295 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4296 "vmovdqu %%ymm0,(%1) \n"
4297 "lea 0x20(%1),%1 \n"
4298 "sub $0x20,%2 \n"
4299 "jg 1b \n"
4300 "vzeroupper \n"
4301 : "+r"(src_yuy2), // %0
4302 "+r"(dst_y), // %1
4303 "+r"(width) // %2
4304 :
4305 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4306 }
4307
4308 void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
4309 int stride_yuy2,
4310 uint8_t* dst_u,
4311 uint8_t* dst_v,
4312 int width) {
4313 asm volatile(
4314 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4315 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
4316 "sub %1,%2 \n"
4317
4318 LABELALIGN
4319 "1: \n"
4320 "vmovdqu (%0),%%ymm0 \n"
4321 "vmovdqu 0x20(%0),%%ymm1 \n"
4322 "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
4323 "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
4324 "lea 0x40(%0),%0 \n"
4325 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4326 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
4327 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4328 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4329 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
4330 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4331 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
4332 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
4333 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
4334 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4335 "vextractf128 $0x0,%%ymm1,(%1) \n"
4336 "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
4337 "lea 0x10(%1),%1 \n"
4338 "sub $0x20,%3 \n"
4339 "jg 1b \n"
4340 "vzeroupper \n"
4341 : "+r"(src_yuy2), // %0
4342 "+r"(dst_u), // %1
4343 "+r"(dst_v), // %2
4344 "+r"(width) // %3
4345 : "r"((intptr_t)(stride_yuy2)) // %4
4346 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4347 }
4348
4349 void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
4350 uint8_t* dst_u,
4351 uint8_t* dst_v,
4352 int width) {
4353 asm volatile(
4354 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4355 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
4356 "sub %1,%2 \n"
4357
4358 LABELALIGN
4359 "1: \n"
4360 "vmovdqu (%0),%%ymm0 \n"
4361 "vmovdqu 0x20(%0),%%ymm1 \n"
4362 "lea 0x40(%0),%0 \n"
4363 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4364 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
4365 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4366 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4367 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
4368 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4369 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
4370 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
4371 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
4372 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4373 "vextractf128 $0x0,%%ymm1,(%1) \n"
4374 "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
4375 "lea 0x10(%1),%1 \n"
4376 "sub $0x20,%3 \n"
4377 "jg 1b \n"
4378 "vzeroupper \n"
4379 : "+r"(src_yuy2), // %0
4380 "+r"(dst_u), // %1
4381 "+r"(dst_v), // %2
4382 "+r"(width) // %3
4383 :
4384 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4385 }
4386
4387 void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
4388 asm volatile(
4389
4390 LABELALIGN
4391 "1: \n"
4392 "vmovdqu (%0),%%ymm0 \n"
4393 "vmovdqu 0x20(%0),%%ymm1 \n"
4394 "lea 0x40(%0),%0 \n"
4395 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4396 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
4397 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4398 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4399 "vmovdqu %%ymm0,(%1) \n"
4400 "lea 0x20(%1),%1 \n"
4401 "sub $0x20,%2 \n"
4402 "jg 1b \n"
4403 "vzeroupper \n"
4404 : "+r"(src_uyvy), // %0
4405 "+r"(dst_y), // %1
4406 "+r"(width) // %2
4407 :
4408 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4409 }
4410 void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
4411 int stride_uyvy,
4412 uint8_t* dst_u,
4413 uint8_t* dst_v,
4414 int width) {
4415 asm volatile(
4416 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4417 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
4418 "sub %1,%2 \n"
4419
4420 LABELALIGN
4421 "1: \n"
4422 "vmovdqu (%0),%%ymm0 \n"
4423 "vmovdqu 0x20(%0),%%ymm1 \n"
4424 "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
4425 "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
4426 "lea 0x40(%0),%0 \n"
4427 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
4428 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
4429 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4430 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4431 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
4432 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4433 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
4434 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
4435 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
4436 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4437 "vextractf128 $0x0,%%ymm1,(%1) \n"
4438 "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
4439 "lea 0x10(%1),%1 \n"
4440 "sub $0x20,%3 \n"
4441 "jg 1b \n"
4442 "vzeroupper \n"
4443 : "+r"(src_uyvy), // %0
4444 "+r"(dst_u), // %1
4445 "+r"(dst_v), // %2
4446 "+r"(width) // %3
4447 : "r"((intptr_t)(stride_uyvy)) // %4
4448 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4449 }
4450
4451 void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
4452 uint8_t* dst_u,
4453 uint8_t* dst_v,
4454 int width) {
4455 asm volatile(
4456 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4457 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
4458 "sub %1,%2 \n"
4459
4460 LABELALIGN
4461 "1: \n"
4462 "vmovdqu (%0),%%ymm0 \n"
4463 "vmovdqu 0x20(%0),%%ymm1 \n"
4464 "lea 0x40(%0),%0 \n"
4465 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
4466 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
4467 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4468 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4469 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
4470 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4471 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
4472 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
4473 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
4474 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4475 "vextractf128 $0x0,%%ymm1,(%1) \n"
4476 "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
4477 "lea 0x10(%1),%1 \n"
4478 "sub $0x20,%3 \n"
4479 "jg 1b \n"
4480 "vzeroupper \n"
4481 : "+r"(src_uyvy), // %0
4482 "+r"(dst_u), // %1
4483 "+r"(dst_v), // %2
4484 "+r"(width) // %3
4485 :
4486 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4487 }
4488 #endif // HAS_YUY2TOYROW_AVX2
4489
4490 #ifdef HAS_ARGBBLENDROW_SSSE3
4491 // Shuffle table for isolating alpha.
4492 static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4493 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
4494
4495 // Blend 4 pixels at a time, with a 1 pixel tail loop for the remainder.
4496 void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
4497 const uint8_t* src_argb1,
4498 uint8_t* dst_argb,
4499 int width) {
4500 asm volatile(
4501 "pcmpeqb %%xmm7,%%xmm7 \n"
4502 "psrlw $0xf,%%xmm7 \n"
4503 "pcmpeqb %%xmm6,%%xmm6 \n"
4504 "psrlw $0x8,%%xmm6 \n"
4505 "pcmpeqb %%xmm5,%%xmm5 \n"
4506 "psllw $0x8,%%xmm5 \n"
4507 "pcmpeqb %%xmm4,%%xmm4 \n"
4508 "pslld $0x18,%%xmm4 \n"
4509 "sub $0x4,%3 \n"
4510 "jl 49f \n"
4511
4512 // 4 pixel loop.
4513 LABELALIGN
4514 "40: \n"
4515 "movdqu (%0),%%xmm3 \n"
4516 "lea 0x10(%0),%0 \n"
4517 "movdqa %%xmm3,%%xmm0 \n"
4518 "pxor %%xmm4,%%xmm3 \n"
4519 "movdqu (%1),%%xmm2 \n"
4520 "pshufb %4,%%xmm3 \n"
4521 "pand %%xmm6,%%xmm2 \n"
4522 "paddw %%xmm7,%%xmm3 \n"
4523 "pmullw %%xmm3,%%xmm2 \n"
4524 "movdqu (%1),%%xmm1 \n"
4525 "lea 0x10(%1),%1 \n"
4526 "psrlw $0x8,%%xmm1 \n"
4527 "por %%xmm4,%%xmm0 \n"
4528 "pmullw %%xmm3,%%xmm1 \n"
4529 "psrlw $0x8,%%xmm2 \n"
4530 "paddusb %%xmm2,%%xmm0 \n"
4531 "pand %%xmm5,%%xmm1 \n"
4532 "paddusb %%xmm1,%%xmm0 \n"
4533 "movdqu %%xmm0,(%2) \n"
4534 "lea 0x10(%2),%2 \n"
4535 "sub $0x4,%3 \n"
4536 "jge 40b \n"
4537
4538 "49: \n"
4539 "add $0x3,%3 \n"
4540 "jl 99f \n"
4541
4542 // 1 pixel loop.
4543 "91: \n"
4544 "movd (%0),%%xmm3 \n"
4545 "lea 0x4(%0),%0 \n"
4546 "movdqa %%xmm3,%%xmm0 \n"
4547 "pxor %%xmm4,%%xmm3 \n"
4548 "movd (%1),%%xmm2 \n"
4549 "pshufb %4,%%xmm3 \n"
4550 "pand %%xmm6,%%xmm2 \n"
4551 "paddw %%xmm7,%%xmm3 \n"
4552 "pmullw %%xmm3,%%xmm2 \n"
4553 "movd (%1),%%xmm1 \n"
4554 "lea 0x4(%1),%1 \n"
4555 "psrlw $0x8,%%xmm1 \n"
4556 "por %%xmm4,%%xmm0 \n"
4557 "pmullw %%xmm3,%%xmm1 \n"
4558 "psrlw $0x8,%%xmm2 \n"
4559 "paddusb %%xmm2,%%xmm0 \n"
4560 "pand %%xmm5,%%xmm1 \n"
4561 "paddusb %%xmm1,%%xmm0 \n"
4562 "movd %%xmm0,(%2) \n"
4563 "lea 0x4(%2),%2 \n"
4564 "sub $0x1,%3 \n"
4565 "jge 91b \n"
4566 "99: \n"
4567 : "+r"(src_argb0), // %0
4568 "+r"(src_argb1), // %1
4569 "+r"(dst_argb), // %2
4570 "+r"(width) // %3
4571 : "m"(kShuffleAlpha) // %4
4572 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
4573 "xmm7");
4574 }
4575 #endif // HAS_ARGBBLENDROW_SSSE3
4576
4577 #ifdef HAS_BLENDPLANEROW_SSSE3
4578 // Blend 8 pixels at a time.
4579 // unsigned version of math
4580 // =((A2*C2)+(B2*(255-C2))+255)/256
4581 // signed version of math
4582 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4583 void BlendPlaneRow_SSSE3(const uint8_t* src0,
4584 const uint8_t* src1,
4585 const uint8_t* alpha,
4586 uint8_t* dst,
4587 int width) {
4588 asm volatile(
4589 "pcmpeqb %%xmm5,%%xmm5 \n"
4590 "psllw $0x8,%%xmm5 \n"
4591 "mov $0x80808080,%%eax \n"
4592 "movd %%eax,%%xmm6 \n"
4593 "pshufd $0x0,%%xmm6,%%xmm6 \n"
4594 "mov $0x807f807f,%%eax \n"
4595 "movd %%eax,%%xmm7 \n"
4596 "pshufd $0x0,%%xmm7,%%xmm7 \n"
4597 "sub %2,%0 \n"
4598 "sub %2,%1 \n"
4599 "sub %2,%3 \n"
4600
4601 // 8 pixel loop.
4602 LABELALIGN
4603 "1: \n"
4604 "movq (%2),%%xmm0 \n"
4605 "punpcklbw %%xmm0,%%xmm0 \n"
4606 "pxor %%xmm5,%%xmm0 \n"
4607 "movq (%0,%2,1),%%xmm1 \n"
4608 "movq (%1,%2,1),%%xmm2 \n"
4609 "punpcklbw %%xmm2,%%xmm1 \n"
4610 "psubb %%xmm6,%%xmm1 \n"
4611 "pmaddubsw %%xmm1,%%xmm0 \n"
4612 "paddw %%xmm7,%%xmm0 \n"
4613 "psrlw $0x8,%%xmm0 \n"
4614 "packuswb %%xmm0,%%xmm0 \n"
4615 "movq %%xmm0,(%3,%2,1) \n"
4616 "lea 0x8(%2),%2 \n"
4617 "sub $0x8,%4 \n"
4618 "jg 1b \n"
4619 : "+r"(src0), // %0
4620 "+r"(src1), // %1
4621 "+r"(alpha), // %2
4622 "+r"(dst), // %3
4623 "+rm"(width) // %4
4624 ::"memory",
4625 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
4626 }
4627 #endif // HAS_BLENDPLANEROW_SSSE3
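
// Scalar form of the unsigned formula quoted above; the signed pmaddubsw
// variant in the asm reproduces it after re-biasing both sources by 128.
// A sketch only; the _Sketch name is illustrative.
static void BlendPlaneRow_Sketch(const uint8_t* src0,
                                 const uint8_t* src1,
                                 const uint8_t* alpha,
                                 uint8_t* dst,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int a = alpha[x];
    dst[x] = (uint8_t)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}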
4628
4629 #ifdef HAS_BLENDPLANEROW_AVX2
4630 // Blend 32 pixels at a time.
4631 // unsigned version of math
4632 // =((A2*C2)+(B2*(255-C2))+255)/256
4633 // signed version of math
4634 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4635 void BlendPlaneRow_AVX2(const uint8_t* src0,
4636 const uint8_t* src1,
4637 const uint8_t* alpha,
4638 uint8_t* dst,
4639 int width) {
4640 asm volatile(
4641 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4642 "vpsllw $0x8,%%ymm5,%%ymm5 \n"
4643 "mov $0x80808080,%%eax \n"
4644 "vmovd %%eax,%%xmm6 \n"
4645 "vbroadcastss %%xmm6,%%ymm6 \n"
4646 "mov $0x807f807f,%%eax \n"
4647 "vmovd %%eax,%%xmm7 \n"
4648 "vbroadcastss %%xmm7,%%ymm7 \n"
4649 "sub %2,%0 \n"
4650 "sub %2,%1 \n"
4651 "sub %2,%3 \n"
4652
4653 // 32 pixel loop.
4654 LABELALIGN
4655 "1: \n"
4656 "vmovdqu (%2),%%ymm0 \n"
4657 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
4658 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
4659 "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
4660 "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
4661 "vmovdqu (%0,%2,1),%%ymm1 \n"
4662 "vmovdqu (%1,%2,1),%%ymm2 \n"
4663 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
4664 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
4665 "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
4666 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
4667 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
4668 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
4669 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
4670 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
4671 "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
4672 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4673 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
4674 "vmovdqu %%ymm0,(%3,%2,1) \n"
4675 "lea 0x20(%2),%2 \n"
4676 "sub $0x20,%4 \n"
4677 "jg 1b \n"
4678 "vzeroupper \n"
4679 : "+r"(src0), // %0
4680 "+r"(src1), // %1
4681 "+r"(alpha), // %2
4682 "+r"(dst), // %3
4683 "+rm"(width) // %4
4684 ::"memory",
4685 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
4686 "xmm7");
4687 }
4688 #endif // HAS_BLENDPLANEROW_AVX2
4689
4690 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4691 // Shuffle table duplicating alpha
4692 static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
4693 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
4694 static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4695 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
4696 // Attenuate 4 pixels at a time.
4697 void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
4698 uint8_t* dst_argb,
4699 int width) {
4700 asm volatile(
4701 "pcmpeqb %%xmm3,%%xmm3 \n"
4702 "pslld $0x18,%%xmm3 \n"
4703 "movdqa %3,%%xmm4 \n"
4704 "movdqa %4,%%xmm5 \n"
4705
4706 // 4 pixel loop.
4707 LABELALIGN
4708 "1: \n"
4709 "movdqu (%0),%%xmm0 \n"
4710 "pshufb %%xmm4,%%xmm0 \n"
4711 "movdqu (%0),%%xmm1 \n"
4712 "punpcklbw %%xmm1,%%xmm1 \n"
4713 "pmulhuw %%xmm1,%%xmm0 \n"
4714 "movdqu (%0),%%xmm1 \n"
4715 "pshufb %%xmm5,%%xmm1 \n"
4716 "movdqu (%0),%%xmm2 \n"
4717 "punpckhbw %%xmm2,%%xmm2 \n"
4718 "pmulhuw %%xmm2,%%xmm1 \n"
4719 "movdqu (%0),%%xmm2 \n"
4720 "lea 0x10(%0),%0 \n"
4721 "pand %%xmm3,%%xmm2 \n"
4722 "psrlw $0x8,%%xmm0 \n"
4723 "psrlw $0x8,%%xmm1 \n"
4724 "packuswb %%xmm1,%%xmm0 \n"
4725 "por %%xmm2,%%xmm0 \n"
4726 "movdqu %%xmm0,(%1) \n"
4727 "lea 0x10(%1),%1 \n"
4728 "sub $0x4,%2 \n"
4729 "jg 1b \n"
4730 : "+r"(src_argb), // %0
4731 "+r"(dst_argb), // %1
4732 "+r"(width) // %2
4733 : "m"(kShuffleAlpha0), // %3
4734 "m"(kShuffleAlpha1) // %4
4735 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4736 }
4737 #endif // HAS_ARGBATTENUATEROW_SSSE3
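
// Scalar form of the attenuate math above: pixel and alpha bytes are each
// widened to v*257 words, pmulhuw keeps the high 16 bits and psrlw $0x8
// finishes the >>24, i.e. roughly (v * a) / 255 premultiplication, with the
// original alpha merged back in. A sketch only; the _Sketch name is
// illustrative.
static void ARGBAttenuateRow_Sketch(const uint8_t* src_argb,
                                    uint8_t* dst_argb,
                                    int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    uint32_t a = src_argb[4 * x + 3];
    for (c = 0; c < 3; ++c) {  // B, G, R.
      uint32_t v = src_argb[4 * x + c];
      dst_argb[4 * x + c] = (uint8_t)(((v * 257u) * (a * 257u)) >> 24);
    }
    dst_argb[4 * x + 3] = (uint8_t)a;  // Alpha passes through unchanged.
  }
}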
4738
4739 #ifdef HAS_ARGBATTENUATEROW_AVX2
4740 // Shuffle table duplicating alpha.
4741 static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
4742 128u, 128u, 14u, 15u, 14u, 15u,
4743 14u, 15u, 128u, 128u};
4744 // Attenuate 8 pixels at a time.
4745 void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
4746 uint8_t* dst_argb,
4747 int width) {
4748 asm volatile(
4749 "vbroadcastf128 %3,%%ymm4 \n"
4750 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4751 "vpslld $0x18,%%ymm5,%%ymm5 \n"
4752 "sub %0,%1 \n"
4753
4754 // 8 pixel loop.
4755 LABELALIGN
4756 "1: \n"
4757 "vmovdqu (%0),%%ymm6 \n"
4758 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
4759 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
4760 "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
4761 "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
4762 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
4763 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
4764 "vpand %%ymm5,%%ymm6,%%ymm6 \n"
4765 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4766 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
4767 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4768 "vpor %%ymm6,%%ymm0,%%ymm0 \n"
4769 "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
4770 "lea 0x20(%0),%0 \n"
4771 "sub $0x8,%2 \n"
4772 "jg 1b \n"
4773 "vzeroupper \n"
4774 : "+r"(src_argb), // %0
4775 "+r"(dst_argb), // %1
4776 "+r"(width) // %2
4777 : "m"(kShuffleAlpha_AVX2) // %3
4778 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
4779 }
4780 #endif // HAS_ARGBATTENUATEROW_AVX2
4781
4782 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4783 // Unattenuate 4 pixels at a time.
4784 void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
4785 uint8_t* dst_argb,
4786 int width) {
4787 uintptr_t alpha;
4788 asm volatile(
4789 // 4 pixel loop.
4790 LABELALIGN
4791 "1: \n"
4792 "movdqu (%0),%%xmm0 \n"
4793 "movzb 0x03(%0),%3 \n"
4794 "punpcklbw %%xmm0,%%xmm0 \n"
4795 "movd 0x00(%4,%3,4),%%xmm2 \n"
4796 "movzb 0x07(%0),%3 \n"
4797 "movd 0x00(%4,%3,4),%%xmm3 \n"
4798 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
4799 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
4800 "movlhps %%xmm3,%%xmm2 \n"
4801 "pmulhuw %%xmm2,%%xmm0 \n"
4802 "movdqu (%0),%%xmm1 \n"
4803 "movzb 0x0b(%0),%3 \n"
4804 "punpckhbw %%xmm1,%%xmm1 \n"
4805 "movd 0x00(%4,%3,4),%%xmm2 \n"
4806 "movzb 0x0f(%0),%3 \n"
4807 "movd 0x00(%4,%3,4),%%xmm3 \n"
4808 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
4809 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
4810 "movlhps %%xmm3,%%xmm2 \n"
4811 "pmulhuw %%xmm2,%%xmm1 \n"
4812 "lea 0x10(%0),%0 \n"
4813 "packuswb %%xmm1,%%xmm0 \n"
4814 "movdqu %%xmm0,(%1) \n"
4815 "lea 0x10(%1),%1 \n"
4816 "sub $0x4,%2 \n"
4817 "jg 1b \n"
4818 : "+r"(src_argb), // %0
4819 "+r"(dst_argb), // %1
4820 "+r"(width), // %2
4821 "=&r"(alpha) // %3
4822 : "r"(fixed_invtbl8) // %4
4823 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4824 }
4825 #endif // HAS_ARGBUNATTENUATEROW_SSE2
4826
4827 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
4828 // Shuffle table duplicating alpha.
4829 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
4830 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
4831 // Unattenuate 8 pixels at a time.
4832 void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
4833 uint8_t* dst_argb,
4834 int width) {
4835 uintptr_t alpha;
4836 asm volatile(
4837 "sub %0,%1 \n"
4838 "vbroadcastf128 %5,%%ymm5 \n"
4839
4840 // 8 pixel loop.
4841 LABELALIGN
4842 "1: \n"
4843 // Manual gather of fixed_invtbl8 entries; replaces VPGATHER.
4844 "movzb 0x03(%0),%3 \n"
4845 "vmovd 0x00(%4,%3,4),%%xmm0 \n"
4846 "movzb 0x07(%0),%3 \n"
4847 "vmovd 0x00(%4,%3,4),%%xmm1 \n"
4848 "movzb 0x0b(%0),%3 \n"
4849 "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
4850 "vmovd 0x00(%4,%3,4),%%xmm2 \n"
4851 "movzb 0x0f(%0),%3 \n"
4852 "vmovd 0x00(%4,%3,4),%%xmm3 \n"
4853 "movzb 0x13(%0),%3 \n"
4854 "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
4855 "vmovd 0x00(%4,%3,4),%%xmm0 \n"
4856 "movzb 0x17(%0),%3 \n"
4857 "vmovd 0x00(%4,%3,4),%%xmm1 \n"
4858 "movzb 0x1b(%0),%3 \n"
4859 "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
4860 "vmovd 0x00(%4,%3,4),%%xmm2 \n"
4861 "movzb 0x1f(%0),%3 \n"
4862 "vmovd 0x00(%4,%3,4),%%xmm3 \n"
4863 "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
4864 "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
4865 "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
4866 "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
4867 // end of VPGATHER
4868
4869 "vmovdqu (%0),%%ymm6 \n"
4870 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
4871 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
4872 "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
4873 "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
4874 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
4875 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
4876 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
4877 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
4878 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4879 "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
4880 "lea 0x20(%0),%0 \n"
4881 "sub $0x8,%2 \n"
4882 "jg 1b \n"
4883 "vzeroupper \n"
4884 : "+r"(src_argb), // %0
4885 "+r"(dst_argb), // %1
4886 "+r"(width), // %2
4887 "=&r"(alpha) // %3
4888 : "r"(fixed_invtbl8), // %4
4889 "m"(kUnattenShuffleAlpha_AVX2) // %5
4890 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
4891 "xmm7");
4892 }
4893 #endif // HAS_ARGBUNATTENUATEROW_AVX2
4894
4895 #ifdef HAS_ARGBGRAYROW_SSSE3
4896 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
4897 void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
4898 asm volatile(
4899 "movdqa %3,%%xmm4 \n"
4900 "movdqa %4,%%xmm5 \n"
4901
4902 // 8 pixel loop.
4903 LABELALIGN
4904 "1: \n"
4905 "movdqu (%0),%%xmm0 \n"
4906 "movdqu 0x10(%0),%%xmm1 \n"
4907 "pmaddubsw %%xmm4,%%xmm0 \n"
4908 "pmaddubsw %%xmm4,%%xmm1 \n"
4909 "phaddw %%xmm1,%%xmm0 \n"
4910 "paddw %%xmm5,%%xmm0 \n"
4911 "psrlw $0x7,%%xmm0 \n"
4912 "packuswb %%xmm0,%%xmm0 \n"
4913 "movdqu (%0),%%xmm2 \n"
4914 "movdqu 0x10(%0),%%xmm3 \n"
4915 "lea 0x20(%0),%0 \n"
4916 "psrld $0x18,%%xmm2 \n"
4917 "psrld $0x18,%%xmm3 \n"
4918 "packuswb %%xmm3,%%xmm2 \n"
4919 "packuswb %%xmm2,%%xmm2 \n"
4920 "movdqa %%xmm0,%%xmm3 \n"
4921 "punpcklbw %%xmm0,%%xmm0 \n"
4922 "punpcklbw %%xmm2,%%xmm3 \n"
4923 "movdqa %%xmm0,%%xmm1 \n"
4924 "punpcklwd %%xmm3,%%xmm0 \n"
4925 "punpckhwd %%xmm3,%%xmm1 \n"
4926 "movdqu %%xmm0,(%1) \n"
4927 "movdqu %%xmm1,0x10(%1) \n"
4928 "lea 0x20(%1),%1 \n"
4929 "sub $0x8,%2 \n"
4930 "jg 1b \n"
4931 : "+r"(src_argb), // %0
4932 "+r"(dst_argb), // %1
4933 "+r"(width) // %2
4934 : "m"(kARGBToYJ), // %3
4935 "m"(kAddYJ64) // %4
4936 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4937 }
4938 #endif // HAS_ARGBGRAYROW_SSSE3
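
// Scalar form of the gray conversion above: a full range 7 bit luma using
// the kARGBToYJ weights with kAddYJ64 rounding, written to all three color
// channels. A sketch only; the _Sketch name is illustrative.
static void ARGBGrayRow_Sketch(const uint8_t* src_argb,
                               uint8_t* dst_argb,
                               int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint8_t* p = src_argb + 4 * x;  // B, G, R, A in memory.
    uint8_t y = (uint8_t)((p[0] * 15 + p[1] * 75 + p[2] * 38 + 64) >> 7);
    dst_argb[4 * x + 0] = y;
    dst_argb[4 * x + 1] = y;
    dst_argb[4 * x + 2] = y;
    dst_argb[4 * x + 3] = p[3];  // Alpha is copied through.
  }
}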
4939
4940 #ifdef HAS_ARGBSEPIAROW_SSSE3
4941 // b = (r * 35 + g * 68 + b * 17) >> 7
4942 // g = (r * 45 + g * 88 + b * 22) >> 7
4943 // r = (r * 50 + g * 98 + b * 24) >> 7
4944 // Constant for ARGB color to sepia tone
4945 static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
4946 17, 68, 35, 0, 17, 68, 35, 0};
4947
4948 static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
4949 22, 88, 45, 0, 22, 88, 45, 0};
4950
4951 static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
4952 24, 98, 50, 0, 24, 98, 50, 0};
4953
4954 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
4955 void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
4956 asm volatile(
4957 "movdqa %2,%%xmm2 \n"
4958 "movdqa %3,%%xmm3 \n"
4959 "movdqa %4,%%xmm4 \n"
4960
4961 // 8 pixel loop.
4962 LABELALIGN
4963 "1: \n"
4964 "movdqu (%0),%%xmm0 \n"
4965 "movdqu 0x10(%0),%%xmm6 \n"
4966 "pmaddubsw %%xmm2,%%xmm0 \n"
4967 "pmaddubsw %%xmm2,%%xmm6 \n"
4968 "phaddw %%xmm6,%%xmm0 \n"
4969 "psrlw $0x7,%%xmm0 \n"
4970 "packuswb %%xmm0,%%xmm0 \n"
4971 "movdqu (%0),%%xmm5 \n"
4972 "movdqu 0x10(%0),%%xmm1 \n"
4973 "pmaddubsw %%xmm3,%%xmm5 \n"
4974 "pmaddubsw %%xmm3,%%xmm1 \n"
4975 "phaddw %%xmm1,%%xmm5 \n"
4976 "psrlw $0x7,%%xmm5 \n"
4977 "packuswb %%xmm5,%%xmm5 \n"
4978 "punpcklbw %%xmm5,%%xmm0 \n"
4979 "movdqu (%0),%%xmm5 \n"
4980 "movdqu 0x10(%0),%%xmm1 \n"
4981 "pmaddubsw %%xmm4,%%xmm5 \n"
4982 "pmaddubsw %%xmm4,%%xmm1 \n"
4983 "phaddw %%xmm1,%%xmm5 \n"
4984 "psrlw $0x7,%%xmm5 \n"
4985 "packuswb %%xmm5,%%xmm5 \n"
4986 "movdqu (%0),%%xmm6 \n"
4987 "movdqu 0x10(%0),%%xmm1 \n"
4988 "psrld $0x18,%%xmm6 \n"
4989 "psrld $0x18,%%xmm1 \n"
4990 "packuswb %%xmm1,%%xmm6 \n"
4991 "packuswb %%xmm6,%%xmm6 \n"
4992 "punpcklbw %%xmm6,%%xmm5 \n"
4993 "movdqa %%xmm0,%%xmm1 \n"
4994 "punpcklwd %%xmm5,%%xmm0 \n"
4995 "punpckhwd %%xmm5,%%xmm1 \n"
4996 "movdqu %%xmm0,(%0) \n"
4997 "movdqu %%xmm1,0x10(%0) \n"
4998 "lea 0x20(%0),%0 \n"
4999 "sub $0x8,%1 \n"
5000 "jg 1b \n"
5001 : "+r"(dst_argb), // %0
5002 "+r"(width) // %1
5003 : "m"(kARGBToSepiaB), // %2
5004 "m"(kARGBToSepiaG), // %3
5005 "m"(kARGBToSepiaR) // %4
5006 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
5007 }
5008 #endif // HAS_ARGBSEPIAROW_SSSE3
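
// Scalar form of the sepia formulas quoted above: each output channel is a
// 7 bit fixed point dot product of the input B,G,R, saturated by packuswb.
// A sketch only; the _Sketch name is illustrative.
static void ARGBSepiaRow_Sketch(uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t* p = dst_argb + 4 * x;  // In place; B, G, R, A in memory.
    int b = p[0], g = p[1], r = p[2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    p[0] = (uint8_t)(sb > 255 ? 255 : sb);
    p[1] = (uint8_t)(sg > 255 ? 255 : sg);
    p[2] = (uint8_t)(sr > 255 ? 255 : sr);  // Alpha (p[3]) is unchanged.
  }
}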
5009
5010 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
5011 // Transform 8 ARGB pixels (32 bytes) with a color matrix.
5012 // Same as Sepia except matrix is provided.
5013 void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
5014 uint8_t* dst_argb,
5015 const int8_t* matrix_argb,
5016 int width) {
5017 asm volatile(
5018 "movdqu (%3),%%xmm5 \n"
5019 "pshufd $0x00,%%xmm5,%%xmm2 \n"
5020 "pshufd $0x55,%%xmm5,%%xmm3 \n"
5021 "pshufd $0xaa,%%xmm5,%%xmm4 \n"
5022 "pshufd $0xff,%%xmm5,%%xmm5 \n"
5023
5024 // 8 pixel loop.
5025 LABELALIGN
5026 "1: \n"
5027 "movdqu (%0),%%xmm0 \n"
5028 "movdqu 0x10(%0),%%xmm7 \n"
5029 "pmaddubsw %%xmm2,%%xmm0 \n"
5030 "pmaddubsw %%xmm2,%%xmm7 \n"
5031 "movdqu (%0),%%xmm6 \n"
5032 "movdqu 0x10(%0),%%xmm1 \n"
5033 "pmaddubsw %%xmm3,%%xmm6 \n"
5034 "pmaddubsw %%xmm3,%%xmm1 \n"
5035 "phaddsw %%xmm7,%%xmm0 \n"
5036 "phaddsw %%xmm1,%%xmm6 \n"
5037 "psraw $0x6,%%xmm0 \n"
5038 "psraw $0x6,%%xmm6 \n"
5039 "packuswb %%xmm0,%%xmm0 \n"
5040 "packuswb %%xmm6,%%xmm6 \n"
5041 "punpcklbw %%xmm6,%%xmm0 \n"
5042 "movdqu (%0),%%xmm1 \n"
5043 "movdqu 0x10(%0),%%xmm7 \n"
5044 "pmaddubsw %%xmm4,%%xmm1 \n"
5045 "pmaddubsw %%xmm4,%%xmm7 \n"
5046 "phaddsw %%xmm7,%%xmm1 \n"
5047 "movdqu (%0),%%xmm6 \n"
5048 "movdqu 0x10(%0),%%xmm7 \n"
5049 "pmaddubsw %%xmm5,%%xmm6 \n"
5050 "pmaddubsw %%xmm5,%%xmm7 \n"
5051 "phaddsw %%xmm7,%%xmm6 \n"
5052 "psraw $0x6,%%xmm1 \n"
5053 "psraw $0x6,%%xmm6 \n"
5054 "packuswb %%xmm1,%%xmm1 \n"
5055 "packuswb %%xmm6,%%xmm6 \n"
5056 "punpcklbw %%xmm6,%%xmm1 \n"
5057 "movdqa %%xmm0,%%xmm6 \n"
5058 "punpcklwd %%xmm1,%%xmm0 \n"
5059 "punpckhwd %%xmm1,%%xmm6 \n"
5060 "movdqu %%xmm0,(%1) \n"
5061 "movdqu %%xmm6,0x10(%1) \n"
5062 "lea 0x20(%0),%0 \n"
5063 "lea 0x20(%1),%1 \n"
5064 "sub $0x8,%2 \n"
5065 "jg 1b \n"
5066 : "+r"(src_argb), // %0
5067 "+r"(dst_argb), // %1
5068 "+r"(width) // %2
5069 : "r"(matrix_argb) // %3
5070 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
5071 "xmm7");
5072 }
5073 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
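
// Scalar form of the transform above: matrix_argb holds four rows of four
// signed coefficients with 6 fractional bits, one row per output channel,
// applied to the B,G,R,A bytes. This sketch ignores pmaddubsw/phaddsw
// intermediate saturation; the _Sketch name is illustrative.
static void ARGBColorMatrixRow_Sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      const int8_t* matrix_argb,
                                      int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    const uint8_t* p = src_argb + 4 * x;  // B, G, R, A in memory.
    for (c = 0; c < 4; ++c) {
      const int8_t* m = matrix_argb + 4 * c;  // One row per output channel.
      int v = (p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3]) >> 6;
      dst_argb[4 * x + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}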
5074
5075 #ifdef HAS_ARGBQUANTIZEROW_SSE2
5076 // Quantize 4 ARGB pixels (16 bytes).
5077 void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
5078 int scale,
5079 int interval_size,
5080 int interval_offset,
5081 int width) {
5082 asm volatile(
5083 "movd %2,%%xmm2 \n"
5084 "movd %3,%%xmm3 \n"
5085 "movd %4,%%xmm4 \n"
5086 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
5087 "pshufd $0x44,%%xmm2,%%xmm2 \n"
5088 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
5089 "pshufd $0x44,%%xmm3,%%xmm3 \n"
5090 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
5091 "pshufd $0x44,%%xmm4,%%xmm4 \n"
5092 "pxor %%xmm5,%%xmm5 \n"
5093 "pcmpeqb %%xmm6,%%xmm6 \n"
5094 "pslld $0x18,%%xmm6 \n"
5095
5096 // 4 pixel loop.
5097 LABELALIGN
5098 "1: \n"
5099 "movdqu (%0),%%xmm0 \n"
5100 "punpcklbw %%xmm5,%%xmm0 \n"
5101 "pmulhuw %%xmm2,%%xmm0 \n"
5102 "movdqu (%0),%%xmm1 \n"
5103 "punpckhbw %%xmm5,%%xmm1 \n"
5104 "pmulhuw %%xmm2,%%xmm1 \n"
5105 "pmullw %%xmm3,%%xmm0 \n"
5106 "movdqu (%0),%%xmm7 \n"
5107 "pmullw %%xmm3,%%xmm1 \n"
5108 "pand %%xmm6,%%xmm7 \n"
5109 "paddw %%xmm4,%%xmm0 \n"
5110 "paddw %%xmm4,%%xmm1 \n"
5111 "packuswb %%xmm1,%%xmm0 \n"
5112 "por %%xmm7,%%xmm0 \n"
5113 "movdqu %%xmm0,(%0) \n"
5114 "lea 0x10(%0),%0 \n"
5115 "sub $0x4,%1 \n"
5116 "jg 1b \n"
5117 : "+r"(dst_argb), // %0
5118 "+r"(width) // %1
5119 : "r"(scale), // %2
5120 "r"(interval_size), // %3
5121 "r"(interval_offset) // %4
5122 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
5123 "xmm7");
5124 }
5125 #endif // HAS_ARGBQUANTIZEROW_SSE2
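
// Scalar form of the quantize math above, assuming scale is typically
// 65536 / interval_size so that (v * scale) >> 16 is the interval index.
// The sketch quantizes B,G,R and leaves alpha unchanged, ignoring the final
// packuswb saturation; the _Sketch name is illustrative.
static void ARGBQuantizeRow_Sketch(uint8_t* dst_argb,
                                   int scale,
                                   int interval_size,
                                   int interval_offset,
                                   int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 3; ++c) {  // B, G, R; byte 3 (alpha) is left as is.
      uint8_t* p = dst_argb + 4 * x + c;
      *p = (uint8_t)(((*p * scale) >> 16) * interval_size + interval_offset);
    }
  }
}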
5126
5127 #ifdef HAS_ARGBSHADEROW_SSE2
5128 // Shade 4 pixels at a time by specified value.
5129 void ARGBShadeRow_SSE2(const uint8_t* src_argb,
5130 uint8_t* dst_argb,
5131 int width,
5132 uint32_t value) {
5133 asm volatile(
5134 "movd %3,%%xmm2 \n"
5135 "punpcklbw %%xmm2,%%xmm2 \n"
5136 "punpcklqdq %%xmm2,%%xmm2 \n"
5137
5138 // 4 pixel loop.
5139 LABELALIGN
5140 "1: \n"
5141 "movdqu (%0),%%xmm0 \n"
5142 "lea 0x10(%0),%0 \n"
5143 "movdqa %%xmm0,%%xmm1 \n"
5144 "punpcklbw %%xmm0,%%xmm0 \n"
5145 "punpckhbw %%xmm1,%%xmm1 \n"
5146 "pmulhuw %%xmm2,%%xmm0 \n"
5147 "pmulhuw %%xmm2,%%xmm1 \n"
5148 "psrlw $0x8,%%xmm0 \n"
5149 "psrlw $0x8,%%xmm1 \n"
5150 "packuswb %%xmm1,%%xmm0 \n"
5151 "movdqu %%xmm0,(%1) \n"
5152 "lea 0x10(%1),%1 \n"
5153 "sub $0x4,%2 \n"
5154 "jg 1b \n"
5155 : "+r"(src_argb), // %0
5156 "+r"(dst_argb), // %1
5157 "+r"(width) // %2
5158 : "r"(value) // %3
5159 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5160 }
5161 #endif // HAS_ARGBSHADEROW_SSE2
5162
5163 #ifdef HAS_ARGBMULTIPLYROW_SSE2
5164 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
5165 void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
5166 const uint8_t* src_argb1,
5167 uint8_t* dst_argb,
5168 int width) {
5169 asm volatile(
5170
5171 "pxor %%xmm5,%%xmm5 \n"
5172
5173 // 4 pixel loop.
5174 LABELALIGN
5175 "1: \n"
5176 "movdqu (%0),%%xmm0 \n"
5177 "lea 0x10(%0),%0 \n"
5178 "movdqu (%1),%%xmm2 \n"
5179 "lea 0x10(%1),%1 \n"
5180 "movdqu %%xmm0,%%xmm1 \n"
5181 "movdqu %%xmm2,%%xmm3 \n"
5182 "punpcklbw %%xmm0,%%xmm0 \n"
5183 "punpckhbw %%xmm1,%%xmm1 \n"
5184 "punpcklbw %%xmm5,%%xmm2 \n"
5185 "punpckhbw %%xmm5,%%xmm3 \n"
5186 "pmulhuw %%xmm2,%%xmm0 \n"
5187 "pmulhuw %%xmm3,%%xmm1 \n"
5188 "packuswb %%xmm1,%%xmm0 \n"
5189 "movdqu %%xmm0,(%2) \n"
5190 "lea 0x10(%2),%2 \n"
5191 "sub $0x4,%3 \n"
5192 "jg 1b \n"
5193 : "+r"(src_argb0), // %0
5194 "+r"(src_argb1), // %1
5195 "+r"(dst_argb), // %2
5196 "+r"(width) // %3
5197 :
5198 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
5199 }
5200 #endif // HAS_ARGBMULTIPLYROW_SSE2
5201
5202 #ifdef HAS_ARGBMULTIPLYROW_AVX2
5203 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
5204 void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
5205 const uint8_t* src_argb1,
5206 uint8_t* dst_argb,
5207 int width) {
5208 asm volatile(
5209
5210 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
5211
5212 // 8 pixel loop.
5213 LABELALIGN
5214 "1: \n"
5215 "vmovdqu (%0),%%ymm1 \n"
5216 "lea 0x20(%0),%0 \n"
5217 "vmovdqu (%1),%%ymm3 \n"
5218 "lea 0x20(%1),%1 \n"
5219 "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
5220 "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
5221 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
5222 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
5223 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
5224 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
5225 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
5226 "vmovdqu %%ymm0,(%2) \n"
5227 "lea 0x20(%2),%2 \n"
5228 "sub $0x8,%3 \n"
5229 "jg 1b \n"
5230 "vzeroupper \n"
5231 : "+r"(src_argb0), // %0
5232 "+r"(src_argb1), // %1
5233 "+r"(dst_argb), // %2
5234 "+r"(width) // %3
5235 :
5236 : "memory", "cc"
5237 #if defined(__AVX2__)
5238 ,
5239 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
5240 #endif
5241 );
5242 }
5243 #endif // HAS_ARGBMULTIPLYROW_AVX2
5244
5245 #ifdef HAS_ARGBADDROW_SSE2
5246 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
5247 void ARGBAddRow_SSE2(const uint8_t* src_argb0,
5248 const uint8_t* src_argb1,
5249 uint8_t* dst_argb,
5250 int width) {
5251 asm volatile(
5252 // 4 pixel loop.
5253 LABELALIGN
5254 "1: \n"
5255 "movdqu (%0),%%xmm0 \n"
5256 "lea 0x10(%0),%0 \n"
5257 "movdqu (%1),%%xmm1 \n"
5258 "lea 0x10(%1),%1 \n"
5259 "paddusb %%xmm1,%%xmm0 \n"
5260 "movdqu %%xmm0,(%2) \n"
5261 "lea 0x10(%2),%2 \n"
5262 "sub $0x4,%3 \n"
5263 "jg 1b \n"
5264 : "+r"(src_argb0), // %0
5265 "+r"(src_argb1), // %1
5266 "+r"(dst_argb), // %2
5267 "+r"(width) // %3
5268 :
5269 : "memory", "cc", "xmm0", "xmm1");
5270 }
5271 #endif // HAS_ARGBADDROW_SSE2
5272
5273 #ifdef HAS_ARGBADDROW_AVX2
5274 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
5275 void ARGBAddRow_AVX2(const uint8_t* src_argb0,
5276 const uint8_t* src_argb1,
5277 uint8_t* dst_argb,
5278 int width) {
5279 asm volatile(
5280 // 8 pixel loop.
5281 LABELALIGN
5282 "1: \n"
5283 "vmovdqu (%0),%%ymm0 \n"
5284 "lea 0x20(%0),%0 \n"
5285 "vpaddusb (%1),%%ymm0,%%ymm0 \n"
5286 "lea 0x20(%1),%1 \n"
5287 "vmovdqu %%ymm0,(%2) \n"
5288 "lea 0x20(%2),%2 \n"
5289 "sub $0x8,%3 \n"
5290 "jg 1b \n"
5291 "vzeroupper \n"
5292 : "+r"(src_argb0), // %0
5293 "+r"(src_argb1), // %1
5294 "+r"(dst_argb), // %2
5295 "+r"(width) // %3
5296 :
5297 : "memory", "cc", "xmm0");
5298 }
5299 #endif // HAS_ARGBADDROW_AVX2

#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movdqu (%1),%%xmm1 \n"
      "lea 0x10(%1),%1 \n"
      "psubusb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2

#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "lea 0x20(%0),%0 \n"
      "vpsubusb (%1),%%ymm0,%%ymm0 \n"
      "lea 0x20(%1),%1 \n"
      "vmovdqu %%ymm0,(%2) \n"
      "lea 0x20(%2),%2 \n"
      "sub $0x8,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb0),  // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "memory", "cc", "xmm0");
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
void SobelXRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width) {
  asm volatile(
      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "sub %0,%3 \n"
5371 "pxor %%xmm5,%%xmm5 \n"
5372
5373 // 8 pixel loop.
5374 LABELALIGN
5375 "1: \n"
5376 "movq (%0),%%xmm0 \n"
5377 "movq 0x2(%0),%%xmm1 \n"
5378 "punpcklbw %%xmm5,%%xmm0 \n"
5379 "punpcklbw %%xmm5,%%xmm1 \n"
5380 "psubw %%xmm1,%%xmm0 \n"
5381 "movq 0x00(%0,%1,1),%%xmm1 \n"
5382 "movq 0x02(%0,%1,1),%%xmm2 \n"
5383 "punpcklbw %%xmm5,%%xmm1 \n"
5384 "punpcklbw %%xmm5,%%xmm2 \n"
5385 "psubw %%xmm2,%%xmm1 \n"
5386 "movq 0x00(%0,%2,1),%%xmm2 \n"
5387 "movq 0x02(%0,%2,1),%%xmm3 \n"
5388 "punpcklbw %%xmm5,%%xmm2 \n"
5389 "punpcklbw %%xmm5,%%xmm3 \n"
5390 "psubw %%xmm3,%%xmm2 \n"
5391 "paddw %%xmm2,%%xmm0 \n"
5392 "paddw %%xmm1,%%xmm0 \n"
5393 "paddw %%xmm1,%%xmm0 \n"
5394 "pxor %%xmm1,%%xmm1 \n"
5395 "psubw %%xmm0,%%xmm1 \n"
5396 "pmaxsw %%xmm1,%%xmm0 \n"
5397 "packuswb %%xmm0,%%xmm0 \n"
5398 "movq %%xmm0,0x00(%0,%3,1) \n"
5399 "lea 0x8(%0),%0 \n"
5400 "sub $0x8,%4 \n"
5401 "jg 1b \n"
5402 : "+r"(src_y0), // %0
5403 "+r"(src_y1), // %1
5404 "+r"(src_y2), // %2
5405 "+r"(dst_sobelx), // %3
5406 "+r"(width) // %4
5407 :
5408 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
5409 }
5410 #endif // HAS_SOBELXROW_SSE2
5411
5412 #ifdef HAS_SOBELYROW_SSE2
5413 // SobelY as a matrix is
5414 // -1 -2 -1
5415 // 0 0 0
5416 // 1 2 1
SobelYRow_SSE2(const uint8_t * src_y0,const uint8_t * src_y1,uint8_t * dst_sobely,int width)5417 void SobelYRow_SSE2(const uint8_t* src_y0,
5418 const uint8_t* src_y1,
5419 uint8_t* dst_sobely,
5420 int width) {
5421 asm volatile(
5422 "sub %0,%1 \n"
5423 "sub %0,%2 \n"
5424 "pxor %%xmm5,%%xmm5 \n"
5425
5426 // 8 pixel loop.
5427 LABELALIGN
5428 "1: \n"
5429 "movq (%0),%%xmm0 \n"
5430 "movq 0x00(%0,%1,1),%%xmm1 \n"
5431 "punpcklbw %%xmm5,%%xmm0 \n"
5432 "punpcklbw %%xmm5,%%xmm1 \n"
5433 "psubw %%xmm1,%%xmm0 \n"
5434 "movq 0x1(%0),%%xmm1 \n"
5435 "movq 0x01(%0,%1,1),%%xmm2 \n"
5436 "punpcklbw %%xmm5,%%xmm1 \n"
5437 "punpcklbw %%xmm5,%%xmm2 \n"
5438 "psubw %%xmm2,%%xmm1 \n"
5439 "movq 0x2(%0),%%xmm2 \n"
5440 "movq 0x02(%0,%1,1),%%xmm3 \n"
5441 "punpcklbw %%xmm5,%%xmm2 \n"
5442 "punpcklbw %%xmm5,%%xmm3 \n"
5443 "psubw %%xmm3,%%xmm2 \n"
5444 "paddw %%xmm2,%%xmm0 \n"
5445 "paddw %%xmm1,%%xmm0 \n"
5446 "paddw %%xmm1,%%xmm0 \n"
5447 "pxor %%xmm1,%%xmm1 \n"
5448 "psubw %%xmm0,%%xmm1 \n"
5449 "pmaxsw %%xmm1,%%xmm0 \n"
5450 "packuswb %%xmm0,%%xmm0 \n"
5451 "movq %%xmm0,0x00(%0,%2,1) \n"
5452 "lea 0x8(%0),%0 \n"
5453 "sub $0x8,%3 \n"
5454 "jg 1b \n"
5455 : "+r"(src_y0), // %0
5456 "+r"(src_y1), // %1
5457 "+r"(dst_sobely), // %2
5458 "+r"(width) // %3
5459 :
5460 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
5461 }
5462 #endif // HAS_SOBELYROW_SSE2
5463
5464 #ifdef HAS_SOBELROW_SSE2
5465 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5466 // A = 255
5467 // R = Sobel
5468 // G = Sobel
5469 // B = Sobel
SobelRow_SSE2(const uint8_t * src_sobelx,const uint8_t * src_sobely,uint8_t * dst_argb,int width)5470 void SobelRow_SSE2(const uint8_t* src_sobelx,
5471 const uint8_t* src_sobely,
5472 uint8_t* dst_argb,
5473 int width) {
5474 asm volatile(
5475 "sub %0,%1 \n"
5476 "pcmpeqb %%xmm5,%%xmm5 \n"
5477 "pslld $0x18,%%xmm5 \n"

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%1,1),%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "paddusb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm2 \n"
      "punpckhbw %%xmm0,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "punpcklwd %%xmm2,%%xmm1 \n"
      "punpckhwd %%xmm2,%%xmm2 \n"
      "por %%xmm5,%%xmm1 \n"
      "por %%xmm5,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm3 \n"
      "punpcklwd %%xmm0,%%xmm3 \n"
      "punpckhwd %%xmm0,%%xmm0 \n"
      "por %%xmm5,%%xmm3 \n"
      "por %%xmm5,%%xmm0 \n"
      "movdqu %%xmm1,(%2) \n"
      "movdqu %%xmm2,0x10(%2) \n"
      "movdqu %%xmm3,0x20(%2) \n"
      "movdqu %%xmm0,0x30(%2) \n"
      "lea 0x40(%2),%2 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SOBELROW_SSE2

#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
                          int width) {
  asm volatile(
      "sub %0,%1 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0x18,%%xmm5 \n"

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%1,1),%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "paddusb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_y),       // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_SOBELTOPLANEROW_SSE2

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_SSE2(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      "sub %0,%1 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%1,1),%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "paddusb %%xmm1,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm3 \n"
      "punpcklbw %%xmm5,%%xmm3 \n"
      "punpckhbw %%xmm5,%%xmm0 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "punpcklbw %%xmm2,%%xmm4 \n"
      "punpckhbw %%xmm2,%%xmm1 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "punpcklwd %%xmm3,%%xmm6 \n"
      "punpckhwd %%xmm3,%%xmm4 \n"
      "movdqa %%xmm1,%%xmm7 \n"
      "punpcklwd %%xmm0,%%xmm7 \n"
      "punpckhwd %%xmm0,%%xmm1 \n"
      "movdqu %%xmm6,(%2) \n"
      "movdqu %%xmm4,0x10(%2) \n"
      "movdqu %%xmm7,0x20(%2) \n"
      "movdqu %%xmm1,0x30(%2) \n"
      "lea 0x40(%2),%2 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
#endif  // HAS_SOBELXYROW_SSE2

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
                                  int32_t* cumsum,
                                  const int32_t* previous_cumsum,
                                  int width) {
  asm volatile(
      "pxor %%xmm0,%%xmm0 \n"
      "pxor %%xmm1,%%xmm1 \n"
      "sub $0x4,%3 \n"
      "jl 49f \n"
      "test $0xf,%1 \n"
      "jne 49f \n"

      // 4 pixel loop.
      LABELALIGN
      "40: \n"
      "movdqu (%0),%%xmm2 \n"
      "lea 0x10(%0),%0 \n"
      "movdqa %%xmm2,%%xmm4 \n"
      "punpcklbw %%xmm1,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm1,%%xmm2 \n"
      "punpckhwd %%xmm1,%%xmm3 \n"
      "punpckhbw %%xmm1,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "punpcklwd %%xmm1,%%xmm4 \n"
      "punpckhwd %%xmm1,%%xmm5 \n"
5626 "paddd %%xmm2,%%xmm0 \n"
5627 "movdqu (%2),%%xmm2 \n"
5628 "paddd %%xmm0,%%xmm2 \n"
5629 "paddd %%xmm3,%%xmm0 \n"
5630 "movdqu 0x10(%2),%%xmm3 \n"
5631 "paddd %%xmm0,%%xmm3 \n"
5632 "paddd %%xmm4,%%xmm0 \n"
5633 "movdqu 0x20(%2),%%xmm4 \n"
5634 "paddd %%xmm0,%%xmm4 \n"
5635 "paddd %%xmm5,%%xmm0 \n"
5636 "movdqu 0x30(%2),%%xmm5 \n"
5637 "lea 0x40(%2),%2 \n"
5638 "paddd %%xmm0,%%xmm5 \n"
5639 "movdqu %%xmm2,(%1) \n"
5640 "movdqu %%xmm3,0x10(%1) \n"
5641 "movdqu %%xmm4,0x20(%1) \n"
5642 "movdqu %%xmm5,0x30(%1) \n"
5643 "lea 0x40(%1),%1 \n"
5644 "sub $0x4,%3 \n"
5645 "jge 40b \n"
5646
5647 "49: \n"
5648 "add $0x3,%3 \n"
5649 "jl 19f \n"
5650
5651 // 1 pixel loop.
5652 LABELALIGN
5653 "10: \n"
5654 "movd (%0),%%xmm2 \n"
5655 "lea 0x4(%0),%0 \n"
5656 "punpcklbw %%xmm1,%%xmm2 \n"
5657 "punpcklwd %%xmm1,%%xmm2 \n"
5658 "paddd %%xmm2,%%xmm0 \n"
5659 "movdqu (%2),%%xmm2 \n"
5660 "lea 0x10(%2),%2 \n"
5661 "paddd %%xmm0,%%xmm2 \n"
5662 "movdqu %%xmm2,(%1) \n"
5663 "lea 0x10(%1),%1 \n"
5664 "sub $0x1,%3 \n"
5665 "jge 10b \n"
5666
5667 "19: \n"
5668 : "+r"(row), // %0
5669 "+r"(cumsum), // %1
5670 "+r"(previous_cumsum), // %2
5671 "+r"(width) // %3
5672 :
5673 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
5674 }
5675 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2

#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
                                    const int32_t* botleft,
                                    int width,
                                    int area,
                                    uint8_t* dst,
                                    int count) {
  asm volatile(
      "movd %5,%%xmm5 \n"
      "cvtdq2ps %%xmm5,%%xmm5 \n"
      "rcpss %%xmm5,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "sub $0x4,%3 \n"
      "jl 49f \n"
      "cmpl $0x80,%5 \n"
      "ja 40f \n"

      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "pcmpeqb %%xmm6,%%xmm6 \n"
      "psrld $0x10,%%xmm6 \n"
      "cvtdq2ps %%xmm6,%%xmm6 \n"
      "addps %%xmm6,%%xmm5 \n"
      "mulps %%xmm4,%%xmm5 \n"
      "cvtps2dq %%xmm5,%%xmm5 \n"
      "packssdw %%xmm5,%%xmm5 \n"

      // 4 pixel small loop.
      LABELALIGN
      "4: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "psubd 0x00(%0,%4,4),%%xmm0 \n"
      "psubd 0x10(%0,%4,4),%%xmm1 \n"
      "psubd 0x20(%0,%4,4),%%xmm2 \n"
      "psubd 0x30(%0,%4,4),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "psubd (%1),%%xmm0 \n"
      "psubd 0x10(%1),%%xmm1 \n"
      "psubd 0x20(%1),%%xmm2 \n"
      "psubd 0x30(%1),%%xmm3 \n"
      "paddd 0x00(%1,%4,4),%%xmm0 \n"
      "paddd 0x10(%1,%4,4),%%xmm1 \n"
      "paddd 0x20(%1,%4,4),%%xmm2 \n"
      "paddd 0x30(%1,%4,4),%%xmm3 \n"
      "lea 0x40(%1),%1 \n"
      "packssdw %%xmm1,%%xmm0 \n"
      "packssdw %%xmm3,%%xmm2 \n"
      "pmulhuw %%xmm5,%%xmm0 \n"
      "pmulhuw %%xmm5,%%xmm2 \n"
      "packuswb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jge 4b \n"
      "jmp 49f \n"

      // 4 pixel loop
      LABELALIGN
      "40: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "psubd 0x00(%0,%4,4),%%xmm0 \n"
      "psubd 0x10(%0,%4,4),%%xmm1 \n"
      "psubd 0x20(%0,%4,4),%%xmm2 \n"
      "psubd 0x30(%0,%4,4),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "psubd (%1),%%xmm0 \n"
      "psubd 0x10(%1),%%xmm1 \n"
      "psubd 0x20(%1),%%xmm2 \n"
      "psubd 0x30(%1),%%xmm3 \n"
      "paddd 0x00(%1,%4,4),%%xmm0 \n"
      "paddd 0x10(%1,%4,4),%%xmm1 \n"
      "paddd 0x20(%1,%4,4),%%xmm2 \n"
      "paddd 0x30(%1,%4,4),%%xmm3 \n"
      "lea 0x40(%1),%1 \n"
      "cvtdq2ps %%xmm0,%%xmm0 \n"
      "cvtdq2ps %%xmm1,%%xmm1 \n"
      "mulps %%xmm4,%%xmm0 \n"
      "mulps %%xmm4,%%xmm1 \n"
      "cvtdq2ps %%xmm2,%%xmm2 \n"
      "cvtdq2ps %%xmm3,%%xmm3 \n"
      "mulps %%xmm4,%%xmm2 \n"
      "mulps %%xmm4,%%xmm3 \n"
      "cvtps2dq %%xmm0,%%xmm0 \n"
      "cvtps2dq %%xmm1,%%xmm1 \n"
      "cvtps2dq %%xmm2,%%xmm2 \n"
      "cvtps2dq %%xmm3,%%xmm3 \n"
      "packssdw %%xmm1,%%xmm0 \n"
      "packssdw %%xmm3,%%xmm2 \n"
      "packuswb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jge 40b \n"

      "49: \n"
      "add $0x3,%3 \n"
      "jl 19f \n"

      // 1 pixel loop
      LABELALIGN
      "10: \n"
      "movdqu (%0),%%xmm0 \n"
      "psubd 0x00(%0,%4,4),%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "psubd (%1),%%xmm0 \n"
      "paddd 0x00(%1,%4,4),%%xmm0 \n"
      "lea 0x10(%1),%1 \n"
      "cvtdq2ps %%xmm0,%%xmm0 \n"
      "mulps %%xmm4,%%xmm0 \n"
      "cvtps2dq %%xmm0,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movd %%xmm0,(%2) \n"
      "lea 0x4(%2),%2 \n"
      "sub $0x1,%3 \n"
      "jge 10b \n"
      "19: \n"
      : "+r"(topleft),           // %0
        "+r"(botleft),           // %1
        "+r"(dst),               // %2
        "+rm"(count)             // %3
      : "r"((intptr_t)(width)),  // %4
        "rm"(area)               // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2

#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8_t* src_argb,
                        int src_argb_stride,
                        uint8_t* dst_argb,
                        const float* src_dudv,
                        int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile(
      "movq (%3),%%xmm2 \n"
      "movq 0x08(%3),%%xmm7 \n"
      "shl $0x10,%1 \n"
      "add $0x4,%1 \n"
      "movd %1,%%xmm5 \n"
5825 "sub $0x4,%4 \n"
5826 "jl 49f \n"
5827
5828 "pshufd $0x44,%%xmm7,%%xmm7 \n"
5829 "pshufd $0x0,%%xmm5,%%xmm5 \n"
5830 "movdqa %%xmm2,%%xmm0 \n"
5831 "addps %%xmm7,%%xmm0 \n"
5832 "movlhps %%xmm0,%%xmm2 \n"
5833 "movdqa %%xmm7,%%xmm4 \n"
5834 "addps %%xmm4,%%xmm4 \n"
5835 "movdqa %%xmm2,%%xmm3 \n"
5836 "addps %%xmm4,%%xmm3 \n"
5837 "addps %%xmm4,%%xmm4 \n"
5838
5839 // 4 pixel loop
5840 LABELALIGN
5841 "40: \n"
5842 "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
5843 "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
5844 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
5845 "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
5846 "movd %%xmm0,%k1 \n"
5847 "pshufd $0x39,%%xmm0,%%xmm0 \n"
5848 "movd %%xmm0,%k5 \n"
5849 "pshufd $0x39,%%xmm0,%%xmm0 \n"
5850 "movd 0x00(%0,%1,1),%%xmm1 \n"
5851 "movd 0x00(%0,%5,1),%%xmm6 \n"
5852 "punpckldq %%xmm6,%%xmm1 \n"
5853 "addps %%xmm4,%%xmm2 \n"
5854 "movq %%xmm1,(%2) \n"
5855 "movd %%xmm0,%k1 \n"
5856 "pshufd $0x39,%%xmm0,%%xmm0 \n"
5857 "movd %%xmm0,%k5 \n"
5858 "movd 0x00(%0,%1,1),%%xmm0 \n"
5859 "movd 0x00(%0,%5,1),%%xmm6 \n"
5860 "punpckldq %%xmm6,%%xmm0 \n"
5861 "addps %%xmm4,%%xmm3 \n"
5862 "movq %%xmm0,0x08(%2) \n"
5863 "lea 0x10(%2),%2 \n"
5864 "sub $0x4,%4 \n"
5865 "jge 40b \n"
5866
5867 "49: \n"
5868 "add $0x3,%4 \n"
5869 "jl 19f \n"
5870
5871 // 1 pixel loop
5872 LABELALIGN
5873 "10: \n"
5874 "cvttps2dq %%xmm2,%%xmm0 \n"
5875 "packssdw %%xmm0,%%xmm0 \n"
5876 "pmaddwd %%xmm5,%%xmm0 \n"
5877 "addps %%xmm7,%%xmm2 \n"
5878 "movd %%xmm0,%k1 \n"
5879 "movd 0x00(%0,%1,1),%%xmm0 \n"
5880 "movd %%xmm0,(%2) \n"
5881 "lea 0x04(%2),%2 \n"
5882 "sub $0x1,%4 \n"
5883 "jge 10b \n"
5884 "19: \n"
5885 : "+r"(src_argb), // %0
5886 "+r"(src_argb_stride_temp), // %1
5887 "+r"(dst_argb), // %2
5888 "+r"(src_dudv), // %3
5889 "+rm"(width), // %4
5890 "=&r"(temp) // %5
5891 :
5892 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
5893 "xmm7");
5894 }
5895 #endif // HAS_ARGBAFFINEROW_SSE2

#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSSE3(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile(
      "sub %1,%0 \n"
      "cmp $0x0,%3 \n"
      "je 100f \n"
      "cmp $0x80,%3 \n"
      "je 50f \n"

      "movd %3,%%xmm0 \n"
      "neg %3 \n"
      "add $0x100,%3 \n"
      "movd %3,%%xmm5 \n"
      "punpcklbw %%xmm0,%%xmm5 \n"
      "punpcklwd %%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "mov $0x80808080,%%eax \n"
      "movd %%eax,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"

      // General purpose row blend.
      LABELALIGN
      "1: \n"
      "movdqu (%1),%%xmm0 \n"
      "movdqu 0x00(%1,%4,1),%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"
      "punpckhbw %%xmm2,%%xmm1 \n"
      "psubb %%xmm4,%%xmm0 \n"
      "psubb %%xmm4,%%xmm1 \n"
      "movdqa %%xmm5,%%xmm2 \n"
      "movdqa %%xmm5,%%xmm3 \n"
      "pmaddubsw %%xmm0,%%xmm2 \n"
      "pmaddubsw %%xmm1,%%xmm3 \n"
      "paddw %%xmm4,%%xmm2 \n"
      "paddw %%xmm4,%%xmm3 \n"
      "psrlw $0x8,%%xmm2 \n"
      "psrlw $0x8,%%xmm3 \n"
      "packuswb %%xmm3,%%xmm2 \n"
      "movdqu %%xmm2,0x00(%1,%0,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "jmp 99f \n"

      // Blend 50 / 50.
      LABELALIGN
      "50: \n"
      "movdqu (%1),%%xmm0 \n"
      "movdqu 0x00(%1,%4,1),%%xmm1 \n"
      "pavgb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,0x00(%1,%0,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 50b \n"
      "jmp 99f \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100: \n"
      "movdqu (%1),%%xmm0 \n"
      "movdqu %%xmm0,0x00(%1,%0,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 100b \n"

      "99: \n"
      : "+r"(dst_ptr),               // %0
        "+r"(src_ptr),               // %1
        "+rm"(dst_width),            // %2
        "+r"(source_y_fraction)      // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
        "xmm5");
}
#endif  // HAS_INTERPOLATEROW_SSSE3

#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
void InterpolateRow_AVX2(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  asm volatile(
      "cmp $0x0,%3 \n"
      "je 100f \n"
      "sub %1,%0 \n"
      "cmp $0x80,%3 \n"
      "je 50f \n"

      "vmovd %3,%%xmm0 \n"
      "neg %3 \n"
      "add $0x100,%3 \n"
      "vmovd %3,%%xmm5 \n"
      "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
      "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
      "vbroadcastss %%xmm5,%%ymm5 \n"
      "mov $0x80808080,%%eax \n"
      "vmovd %%eax,%%xmm4 \n"
      "vbroadcastss %%xmm4,%%ymm4 \n"

      // General purpose row blend.
      LABELALIGN
      "1: \n"
      "vmovdqu (%1),%%ymm0 \n"
      "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
      "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
      "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
      "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
      "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
      "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "jmp 99f \n"

      // Blend 50 / 50.
      LABELALIGN
      "50: \n"
      "vmovdqu (%1),%%ymm0 \n"
      "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 50b \n"
      "jmp 99f \n"

      // Blend 100 / 0 - Copy row unchanged.
      LABELALIGN
      "100: \n"
      "rep movsb \n"
6040 "jmp 999f \n"
6041
6042 "99: \n"
6043 "vzeroupper \n"
6044 "999: \n"
6045 : "+D"(dst_ptr), // %0
6046 "+S"(src_ptr), // %1
6047 "+cm"(dst_width), // %2
6048 "+r"(source_y_fraction) // %3
6049 : "r"((intptr_t)(src_stride)) // %4
6050 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
6051 }
6052 #endif // HAS_INTERPOLATEROW_AVX2

#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
  asm volatile(

      "movdqu (%3),%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pshufb %%xmm5,%%xmm0 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_ARGBSHUFFLEROW_SSSE3

#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile(

      "vbroadcastf128 (%3),%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

#ifdef HAS_I422TOYUY2ROW_SSE2
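// Interleaves 8 bit Y with 4:2:2 U and V into YUY2 (Y0 U Y1 V) samples.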
void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movq (%1),%%xmm2 \n"
      "movq 0x00(%1,%2,1),%%xmm1 \n"
      "add $0x8,%1 \n"
      "punpcklbw %%xmm1,%%xmm2 \n"
      "movdqu (%0),%%xmm0 \n"
      "add $0x10,%0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"
      "punpckhbw %%xmm2,%%xmm1 \n"
      "movdqu %%xmm0,(%3) \n"
      "movdqu %%xmm1,0x10(%3) \n"
      "lea 0x20(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOYUY2ROW_SSE2

#ifdef HAS_I422TOUYVYROW_SSE2
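// Interleaves 8 bit Y with 4:2:2 U and V into UYVY (U Y0 V Y1) samples.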
void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movq (%1),%%xmm2 \n"
      "movq 0x00(%1,%2,1),%%xmm1 \n"
      "add $0x8,%1 \n"
      "punpcklbw %%xmm1,%%xmm2 \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "add $0x10,%0 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm2 \n"
      "movdqu %%xmm1,(%3) \n"
      "movdqu %%xmm2,0x10(%3) \n"
      "lea 0x20(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_SSE2

#ifdef HAS_I422TOYUY2ROW_AVX2
void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vpmovzxbw (%1),%%ymm1 \n"
      "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
      "add $0x10,%1 \n"
      "vpsllw $0x8,%%ymm2,%%ymm2 \n"
      "vpor %%ymm1,%%ymm2,%%ymm2 \n"
      "vmovdqu (%0),%%ymm0 \n"
      "add $0x20,%0 \n"
      "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
      "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
      "vextractf128 $0x0,%%ymm1,(%3) \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
      "lea 0x40(%3),%3 \n"
      "sub $0x20,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOYUY2ROW_AVX2

#ifdef HAS_I422TOUYVYROW_AVX2
void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vpmovzxbw (%1),%%ymm1 \n"
      "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
      "add $0x10,%1 \n"
      "vpsllw $0x8,%%ymm2,%%ymm2 \n"
      "vpor %%ymm1,%%ymm2,%%ymm2 \n"
      "vmovdqu (%0),%%ymm0 \n"
      "add $0x20,%0 \n"
      "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
      "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
      "vextractf128 $0x0,%%ymm1,(%3) \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
      "lea 0x40(%3),%3 \n"
      "sub $0x20,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_AVX2

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
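// Evaluates dst = C0 + C1 * x + C2 * x^2 + C3 * x^3 per channel. poly
// points at four vec4 float coefficient rows: C0 at poly + 0, C1 at
// poly + 4, C2 at poly + 8 and C3 at poly + 12.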
void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(

      "pxor %%xmm3,%%xmm3 \n"

      // 2 pixel loop.
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm3,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "punpcklwd %%xmm3,%%xmm0 \n"
      "punpckhwd %%xmm3,%%xmm4 \n"
      "cvtdq2ps %%xmm0,%%xmm0 \n"
      "cvtdq2ps %%xmm4,%%xmm4 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "mulps 0x10(%3),%%xmm0 \n"
      "mulps 0x10(%3),%%xmm4 \n"
      "addps (%3),%%xmm0 \n"
      "addps (%3),%%xmm4 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "movdqa %%xmm5,%%xmm6 \n"
      "mulps %%xmm1,%%xmm2 \n"
      "mulps %%xmm5,%%xmm6 \n"
      "mulps %%xmm2,%%xmm1 \n"
      "mulps %%xmm6,%%xmm5 \n"
      "mulps 0x20(%3),%%xmm2 \n"
      "mulps 0x20(%3),%%xmm6 \n"
      "mulps 0x30(%3),%%xmm1 \n"
      "mulps 0x30(%3),%%xmm5 \n"
      "addps %%xmm2,%%xmm0 \n"
      "addps %%xmm6,%%xmm4 \n"
      "addps %%xmm1,%%xmm0 \n"
      "addps %%xmm5,%%xmm4 \n"
      "cvttps2dq %%xmm0,%%xmm0 \n"
      "cvttps2dq %%xmm4,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x2,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(
      "vbroadcastf128 (%3),%%ymm4 \n"
      "vbroadcastf128 0x10(%3),%%ymm5 \n"
      "vbroadcastf128 0x20(%3),%%ymm6 \n"
      "vbroadcastf128 0x30(%3),%%ymm7 \n"

      // 2 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxbd (%0),%%ymm0 \n"  // 2 ARGB pixels
      "lea 0x8(%0),%0 \n"
      "vcvtdq2ps %%ymm0,%%ymm0 \n"           // X 8 floats
      "vmulps %%ymm0,%%ymm0,%%ymm2 \n"       // X * X
      "vmulps %%ymm7,%%ymm0,%%ymm3 \n"       // C3 * X
      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n"  // result = C0 + C1 * X
      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n"  // result += C2 * X * X
      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n"  // result += C3 * X * X * X
      "vcvttps2dq %%ymm0,%%ymm0 \n"
      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
      "vmovq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x2,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_HALFFLOATROW_SSE2
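// 1.9259299444e-34f is 2^-112. Multiplying a float by it moves the
// exponent from float bias (127) to half float bias (15), so the top bits
// of the product, shifted right by 13, form the half float value.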
static float kScaleBias = 1.9259299444e-34f;
void HalfFloatRow_SSE2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "movd %3,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"
      "sub %0,%1 \n"

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm2 \n"  // 8 shorts
      "add $0x10,%0 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm5,%%xmm2 \n"  // 8 ints in xmm2/xmm3
      "cvtdq2ps %%xmm2,%%xmm2 \n"   // 8 floats
      "punpckhwd %%xmm5,%%xmm3 \n"
      "cvtdq2ps %%xmm3,%%xmm3 \n"
      "mulps %%xmm4,%%xmm2 \n"
      "mulps %%xmm4,%%xmm3 \n"
      "psrld $0xd,%%xmm2 \n"
      "psrld $0xd,%%xmm3 \n"
      "packssdw %%xmm3,%%xmm2 \n"
      "movdqu %%xmm2,-0x10(%0,%1,1) \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(scale)   // %3
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_SSE2

#ifdef HAS_HALFFLOATROW_AVX2
void HalfFloatRow_AVX2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "vbroadcastss %3, %%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
      "sub %0,%1 \n"

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm2 \n"  // 16 shorts
      "add $0x20,%0 \n"
      "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n"  // mutates
      "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
      "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
      "vpsrld $0xd,%%ymm3,%%ymm3 \n"
      "vpsrld $0xd,%%ymm2,%%ymm2 \n"
      "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n"  // unmutates
      "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"

      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_AVX2

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloatRow_F16C(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "vbroadcastss %3, %%ymm4 \n"
      "sub %0,%1 \n"

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxwd (%0),%%ymm2 \n"  // 16 shorts -> 16 ints
      "vpmovzxwd 0x10(%0),%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
      "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
      "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
      "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
      "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
      "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
      "add $0x20,%0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
  asm volatile(
      "sub %0,%1 \n"
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxwd (%0),%%ymm2 \n"  // 16 shorts -> 16 ints
      "vpmovzxwd 0x10(%0),%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
      "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
      "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
      "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
      "add $0x20,%0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm2", "xmm3");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8_t* dst_argb,
                           const uint8_t* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
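      // Each channel byte indexes a 256 entry x 4 byte table; offsets
      // 0..3 within an entry hold the remapped B, G, R and A values.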
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"
      "movzb 0x00(%3,%1,4),%1 \n"
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"
      "mov %b1,-0x2(%0) \n"
      "movzb -0x1(%0),%1 \n"
      "movzb 0x03(%3,%1,4),%1 \n"
      "mov %b1,-0x1(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8_t* dst_argb,
                          const uint8_t* table_argb,
                          int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"
      "movzb 0x00(%3,%1,4),%1 \n"
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"
      "mov %b1,-0x2(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int width,
                                 const uint8_t* luma,
                                 uint32_t lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile(
      "movd %6,%%xmm3 \n"
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0x8,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"

      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%2),%%xmm0 \n"
      "pmaddubsw %%xmm3,%%xmm0 \n"
      "phaddw %%xmm0,%%xmm0 \n"
      "pand %%xmm4,%%xmm0 \n"
      "punpcklwd %%xmm5,%%xmm0 \n"
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"

      "movzb (%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,(%3) \n"
      "movzb 0x1(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x1(%3) \n"
      "movzb 0x2(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x2(%3) \n"
      "movzb 0x3(%2),%0 \n"
      "mov %b0,0x3(%3) \n"

      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"

      "movzb 0x4(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x4(%3) \n"
      "movzb 0x5(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x5(%3) \n"
      "movzb 0x6(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x6(%3) \n"
      "movzb 0x7(%2),%0 \n"
      "mov %b0,0x7(%3) \n"

      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"

      "movzb 0x8(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x8(%3) \n"
      "movzb 0x9(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x9(%3) \n"
      "movzb 0xa(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xa(%3) \n"
      "movzb 0xb(%2),%0 \n"
      "mov %b0,0xb(%3) \n"

      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"

      "movzb 0xc(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xc(%3) \n"
      "movzb 0xd(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xd(%3) \n"
      "movzb 0xe(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xe(%3) \n"
      "movzb 0xf(%2),%0 \n"
      "mov %b0,0xf(%3) \n"
      "lea 0x10(%2),%2 \n"
      "lea 0x10(%3),%3 \n"
      "sub $0x4,%4 \n"
      "jg 1b \n"
      : "=&d"(pixel_temp),  // %0
        "=&a"(table_temp),  // %1
        "+r"(src_argb),     // %2
        "+r"(dst_argb),     // %3
        "+rm"(width)        // %4
      : "r"(luma),          // %5
        "rm"(lumacoeff)     // %6
      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif