/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
                               25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};

// JPEG full range.
static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
                                29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};

static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
                                0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
                              -18, -94, 112, 0, -18, -94, 112, 0};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
                               0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
                               66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
                               0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
                               0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};

static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                               0x8080u, 0x8080u, 0x8080u, 0x8080u};

#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
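
// A pshufb shuffle table maps each destination byte to a source byte
// index; an index with the high bit set (128) produces zero. A hedged
// scalar model of how the tables in this file are applied (illustrative
// only, not part of the build):
//
//   // out[i] = zero if mask[i] has bit 7 set, else in[mask[i] & 15].
//   static void ScalarPshufb128(const uint8_t in[16], const uint8_t mask[16],
//                               uint8_t out[16]) {
//     for (int i = 0; i < 16; ++i) {
//       out[i] = (mask[i] & 0x80) ? 0u : in[mask[i] & 0x0f];
//     }
//   }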

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGBA.
static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
                                            14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
                                    10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
                                    6, 6, 8, 8, 10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
                                     11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
                                     5, 7, 9, 11, 9, 11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
                                    11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
                                    7, 7, 9, 9, 11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
                                     10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
                                     4, 6, 8, 10, 8, 10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5 \n"
      "pslld       $0x18,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movq        (%0),%%xmm0 \n"
      "lea         0x8(%0),%0 \n"
      "punpcklbw   %%xmm0,%%xmm0 \n"
      "movdqa      %%xmm0,%%xmm1 \n"
      "punpcklwd   %%xmm0,%%xmm0 \n"
      "punpckhwd   %%xmm1,%%xmm1 \n"
      "por         %%xmm5,%%xmm0 \n"
      "por         %%xmm5,%%xmm1 \n"
      "movdqu      %%xmm0,(%1) \n"
      "movdqu      %%xmm1,0x10(%1) \n"
      "lea         0x20(%1),%1 \n"
      "sub         $0x8,%2 \n"
      "jg          1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      ::"memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_J400TOARGBROW_SSE2
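
// For reference, a hedged scalar sketch of J400ToARGBRow above: each gray
// sample is replicated into B, G and R, and the 0xff000000 mask in xmm5
// supplies opaque alpha (illustrative only):
//
//   for (int x = 0; x < width; ++x) {
//     uint8_t y = src_y[x];
//     dst_argb[4 * x + 0] = y;    // B
//     dst_argb[4 * x + 1] = y;    // G
//     dst_argb[4 * x + 2] = y;    // R
//     dst_argb[4 * x + 3] = 255;  // A
//   }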

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld       $0x18,%%xmm5 \n"
      "movdqa      %3,%%xmm4 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqu      0x10(%0),%%xmm1 \n"
      "movdqu      0x20(%0),%%xmm3 \n"
      "lea         0x30(%0),%0 \n"
      "movdqa      %%xmm3,%%xmm2 \n"
      "palignr     $0x8,%%xmm1,%%xmm2 \n"
      "pshufb      %%xmm4,%%xmm2 \n"
      "por         %%xmm5,%%xmm2 \n"
      "palignr     $0xc,%%xmm0,%%xmm1 \n"
      "pshufb      %%xmm4,%%xmm0 \n"
      "movdqu      %%xmm2,0x20(%1) \n"
      "por         %%xmm5,%%xmm0 \n"
      "pshufb      %%xmm4,%%xmm1 \n"
      "movdqu      %%xmm0,(%1) \n"
      "por         %%xmm5,%%xmm1 \n"
      "palignr     $0x4,%%xmm3,%%xmm3 \n"
      "pshufb      %%xmm4,%%xmm3 \n"
      "movdqu      %%xmm1,0x10(%1) \n"
      "por         %%xmm5,%%xmm3 \n"
      "movdqu      %%xmm3,0x30(%1) \n"
      "lea         0x40(%1),%1 \n"
      "sub         $0x10,%2 \n"
      "jg          1b \n"
      : "+r"(src_rgb24),              // %0
        "+r"(dst_argb),               // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskRGB24ToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld       $0x18,%%xmm5 \n"
      "movdqa      %3,%%xmm4 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqu      0x10(%0),%%xmm1 \n"
      "movdqu      0x20(%0),%%xmm3 \n"
      "lea         0x30(%0),%0 \n"
      "movdqa      %%xmm3,%%xmm2 \n"
      "palignr     $0x8,%%xmm1,%%xmm2 \n"
      "pshufb      %%xmm4,%%xmm2 \n"
      "por         %%xmm5,%%xmm2 \n"
      "palignr     $0xc,%%xmm0,%%xmm1 \n"
      "pshufb      %%xmm4,%%xmm0 \n"
      "movdqu      %%xmm2,0x20(%1) \n"
      "por         %%xmm5,%%xmm0 \n"
      "pshufb      %%xmm4,%%xmm1 \n"
      "movdqu      %%xmm0,(%1) \n"
      "por         %%xmm5,%%xmm1 \n"
      "palignr     $0x4,%%xmm3,%%xmm3 \n"
      "pshufb      %%xmm4,%%xmm3 \n"
      "movdqu      %%xmm1,0x10(%1) \n"
      "por         %%xmm5,%%xmm3 \n"
      "movdqu      %%xmm3,0x30(%1) \n"
      "lea         0x40(%1),%1 \n"
      "sub         $0x10,%2 \n"
      "jg          1b \n"
      : "+r"(src_raw),              // %0
        "+r"(dst_argb),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskRAWToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// Same code as RAWToARGB with different shuffler and A in low bits
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5 \n"  // 0x000000ff
      "psrld       $0x18,%%xmm5 \n"
      "movdqa      %3,%%xmm4 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqu      0x10(%0),%%xmm1 \n"
      "movdqu      0x20(%0),%%xmm3 \n"
      "lea         0x30(%0),%0 \n"
      "movdqa      %%xmm3,%%xmm2 \n"
      "palignr     $0x8,%%xmm1,%%xmm2 \n"
      "pshufb      %%xmm4,%%xmm2 \n"
      "por         %%xmm5,%%xmm2 \n"
      "palignr     $0xc,%%xmm0,%%xmm1 \n"
      "pshufb      %%xmm4,%%xmm0 \n"
      "movdqu      %%xmm2,0x20(%1) \n"
      "por         %%xmm5,%%xmm0 \n"
      "pshufb      %%xmm4,%%xmm1 \n"
      "movdqu      %%xmm0,(%1) \n"
      "por         %%xmm5,%%xmm1 \n"
      "palignr     $0x4,%%xmm3,%%xmm3 \n"
      "pshufb      %%xmm4,%%xmm3 \n"
      "movdqu      %%xmm1,0x10(%1) \n"
      "por         %%xmm5,%%xmm3 \n"
      "movdqu      %%xmm3,0x30(%1) \n"
      "lea         0x40(%1),%1 \n"
      "sub         $0x10,%2 \n"
      "jg          1b \n"
      : "+r"(src_raw),              // %0
        "+r"(dst_rgba),             // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskRAWToRGBA)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "movdqa      %3,%%xmm3 \n"
      "movdqa      %4,%%xmm4 \n"
      "movdqa      %5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqu      0x4(%0),%%xmm1 \n"
      "movdqu      0x8(%0),%%xmm2 \n"
      "lea         0x18(%0),%0 \n"
      "pshufb      %%xmm3,%%xmm0 \n"
      "pshufb      %%xmm4,%%xmm1 \n"
      "pshufb      %%xmm5,%%xmm2 \n"
      "movq        %%xmm0,(%1) \n"
      "movq        %%xmm1,0x8(%1) \n"
      "movq        %%xmm2,0x10(%1) \n"
      "lea         0x18(%1),%1 \n"
      "sub         $0x8,%2 \n"
      "jg          1b \n"
      : "+r"(src_raw),                  // %0
        "+r"(dst_rgb24),                // %1
        "+r"(width)                     // %2
      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
        "m"(kShuffleMaskRAWToRGB24_1),  // %4
        "m"(kShuffleMaskRAWToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0x1080108,%%eax \n"
      "movd        %%eax,%%xmm5 \n"
      "pshufd      $0x0,%%xmm5,%%xmm5 \n"
      "mov         $0x20802080,%%eax \n"
      "movd        %%eax,%%xmm6 \n"
      "pshufd      $0x0,%%xmm6,%%xmm6 \n"
      "pcmpeqb     %%xmm3,%%xmm3 \n"
      "psllw       $0xb,%%xmm3 \n"
      "pcmpeqb     %%xmm4,%%xmm4 \n"
      "psllw       $0xa,%%xmm4 \n"
      "psrlw       $0x5,%%xmm4 \n"
      "pcmpeqb     %%xmm7,%%xmm7 \n"
      "psllw       $0x8,%%xmm7 \n"
      "sub         %0,%1 \n"
      "sub         %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqa      %%xmm0,%%xmm1 \n"
      "movdqa      %%xmm0,%%xmm2 \n"
      "pand        %%xmm3,%%xmm1 \n"
      "psllw       $0xb,%%xmm2 \n"
      "pmulhuw     %%xmm5,%%xmm1 \n"
      "pmulhuw     %%xmm5,%%xmm2 \n"
      "psllw       $0x8,%%xmm1 \n"
      "por         %%xmm2,%%xmm1 \n"
      "pand        %%xmm4,%%xmm0 \n"
      "pmulhuw     %%xmm6,%%xmm0 \n"
      "por         %%xmm7,%%xmm0 \n"
      "movdqa      %%xmm1,%%xmm2 \n"
      "punpcklbw   %%xmm0,%%xmm1 \n"
      "punpckhbw   %%xmm0,%%xmm2 \n"
      "movdqu      %%xmm1,0x00(%1,%0,2) \n"
      "movdqu      %%xmm2,0x10(%1,%0,2) \n"
      "lea         0x10(%0),%0 \n"
      "sub         $0x8,%2 \n"
      "jg          1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
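
// The magic constants in RGB565ToARGBRow_SSE2 implement bit replication
// with pmulhuw, which computes (a * b) >> 16 per unsigned 16-bit lane.
// With a 5-bit field v shifted to bits 11..15, multiplying by 0x0108
// gives (v << 3) | (v >> 2), the usual 5-to-8-bit expansion; the 6-bit
// green field at bits 5..10 times 0x2080 gives (v << 2) | (v >> 4).
// ARGB1555ToARGBRow_SSE2 below reuses the same trick, with 0x4200 for
// its mid-position 5-bit fields. A hedged scalar check (illustrative
// only):
//
//   static uint8_t Expand5(uint8_t v) {  // v in [0, 31]
//     return (uint8_t)(((uint32_t)(v << 11) * 0x0108u) >> 16);  // v<<3|v>>2
//   }
//   static uint8_t Expand6(uint8_t v) {  // v in [0, 63]
//     return (uint8_t)(((uint32_t)(v << 5) * 0x2080u) >> 16);   // v<<2|v>>4
//   }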

void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0x1080108,%%eax \n"
      "movd        %%eax,%%xmm5 \n"
      "pshufd      $0x0,%%xmm5,%%xmm5 \n"
      "mov         $0x42004200,%%eax \n"
      "movd        %%eax,%%xmm6 \n"
      "pshufd      $0x0,%%xmm6,%%xmm6 \n"
      "pcmpeqb     %%xmm3,%%xmm3 \n"
      "psllw       $0xb,%%xmm3 \n"
      "movdqa      %%xmm3,%%xmm4 \n"
      "psrlw       $0x6,%%xmm4 \n"
      "pcmpeqb     %%xmm7,%%xmm7 \n"
      "psllw       $0x8,%%xmm7 \n"
      "sub         %0,%1 \n"
      "sub         %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqa      %%xmm0,%%xmm1 \n"
      "movdqa      %%xmm0,%%xmm2 \n"
      "psllw       $0x1,%%xmm1 \n"
      "psllw       $0xb,%%xmm2 \n"
      "pand        %%xmm3,%%xmm1 \n"
      "pmulhuw     %%xmm5,%%xmm2 \n"
      "pmulhuw     %%xmm5,%%xmm1 \n"
      "psllw       $0x8,%%xmm1 \n"
      "por         %%xmm2,%%xmm1 \n"
      "movdqa      %%xmm0,%%xmm2 \n"
      "pand        %%xmm4,%%xmm0 \n"
      "psraw       $0x8,%%xmm2 \n"
      "pmulhuw     %%xmm6,%%xmm0 \n"
      "pand        %%xmm7,%%xmm2 \n"
      "por         %%xmm2,%%xmm0 \n"
      "movdqa      %%xmm1,%%xmm2 \n"
      "punpcklbw   %%xmm0,%%xmm1 \n"
      "punpckhbw   %%xmm0,%%xmm2 \n"
      "movdqu      %%xmm1,0x00(%1,%0,2) \n"
      "movdqu      %%xmm2,0x10(%1,%0,2) \n"
      "lea         0x10(%0),%0 \n"
      "sub         $0x8,%2 \n"
      "jg          1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov         $0xf0f0f0f,%%eax \n"
      "movd        %%eax,%%xmm4 \n"
      "pshufd      $0x0,%%xmm4,%%xmm4 \n"
      "movdqa      %%xmm4,%%xmm5 \n"
      "pslld       $0x4,%%xmm5 \n"
      "sub         %0,%1 \n"
      "sub         %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqa      %%xmm0,%%xmm2 \n"
      "pand        %%xmm4,%%xmm0 \n"
      "pand        %%xmm5,%%xmm2 \n"
      "movdqa      %%xmm0,%%xmm1 \n"
      "movdqa      %%xmm2,%%xmm3 \n"
      "psllw       $0x4,%%xmm1 \n"
      "psrlw       $0x4,%%xmm3 \n"
      "por         %%xmm1,%%xmm0 \n"
      "por         %%xmm3,%%xmm2 \n"
      "movdqa      %%xmm0,%%xmm1 \n"
      "punpcklbw   %%xmm2,%%xmm0 \n"
      "punpckhbw   %%xmm2,%%xmm1 \n"
      "movdqu      %%xmm0,0x00(%1,%0,2) \n"
      "movdqu      %%xmm1,0x10(%1,%0,2) \n"
      "lea         0x10(%0),%0 \n"
      "sub         $0x8,%2 \n"
      "jg          1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa      %3,%%xmm6 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqu      0x10(%0),%%xmm1 \n"
      "movdqu      0x20(%0),%%xmm2 \n"
      "movdqu      0x30(%0),%%xmm3 \n"
      "lea         0x40(%0),%0 \n"
      "pshufb      %%xmm6,%%xmm0 \n"
      "pshufb      %%xmm6,%%xmm1 \n"
      "pshufb      %%xmm6,%%xmm2 \n"
      "pshufb      %%xmm6,%%xmm3 \n"
      "movdqa      %%xmm1,%%xmm4 \n"
      "psrldq      $0x4,%%xmm1 \n"
      "pslldq      $0xc,%%xmm4 \n"
      "movdqa      %%xmm2,%%xmm5 \n"
      "por         %%xmm4,%%xmm0 \n"
      "pslldq      $0x8,%%xmm5 \n"
      "movdqu      %%xmm0,(%1) \n"
      "por         %%xmm5,%%xmm1 \n"
      "psrldq      $0x8,%%xmm2 \n"
      "pslldq      $0x4,%%xmm3 \n"
      "por         %%xmm3,%%xmm2 \n"
      "movdqu      %%xmm1,0x10(%1) \n"
      "movdqu      %%xmm2,0x20(%1) \n"
      "lea         0x30(%1),%1 \n"
      "sub         $0x10,%2 \n"
      "jg          1b \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "m"(kShuffleMaskARGBToRGB24)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa      %3,%%xmm6 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqu      0x10(%0),%%xmm1 \n"
      "movdqu      0x20(%0),%%xmm2 \n"
      "movdqu      0x30(%0),%%xmm3 \n"
      "lea         0x40(%0),%0 \n"
      "pshufb      %%xmm6,%%xmm0 \n"
      "pshufb      %%xmm6,%%xmm1 \n"
      "pshufb      %%xmm6,%%xmm2 \n"
      "pshufb      %%xmm6,%%xmm3 \n"
      "movdqa      %%xmm1,%%xmm4 \n"
      "psrldq      $0x4,%%xmm1 \n"
      "pslldq      $0xc,%%xmm4 \n"
      "movdqa      %%xmm2,%%xmm5 \n"
      "por         %%xmm4,%%xmm0 \n"
      "pslldq      $0x8,%%xmm5 \n"
      "movdqu      %%xmm0,(%1) \n"
      "por         %%xmm5,%%xmm1 \n"
      "psrldq      $0x8,%%xmm2 \n"
      "pslldq      $0x4,%%xmm3 \n"
      "por         %%xmm3,%%xmm2 \n"
      "movdqu      %%xmm1,0x10(%1) \n"
      "movdqu      %%xmm2,0x20(%1) \n"
      "lea         0x30(%1),%1 \n"
      "sub         $0x10,%2 \n"
      "jg          1b \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2
      : "m"(kShuffleMaskARGBToRAW)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTORGB24ROW_AVX2
// vpermd to pack the 12 payload bytes of each 16-byte lane into 24
// contiguous bytes.
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};

void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"
      "vmovdqa     %4,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu     (%0),%%ymm0 \n"
      "vmovdqu     0x20(%0),%%ymm1 \n"
      "vmovdqu     0x40(%0),%%ymm2 \n"
      "vmovdqu     0x60(%0),%%ymm3 \n"
      "lea         0x80(%0),%0 \n"
      "vpshufb     %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb     %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb     %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb     %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd      %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd      %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd      %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd      %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq      $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor        %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu     %%ymm0,(%1) \n"
      "vpermq      $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq      $0x4f,%%ymm2,%%ymm4 \n"
      "vpor        %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu     %%ymm1,0x20(%1) \n"
      "vpermq      $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq      $0x93,%%ymm3,%%ymm3 \n"
      "vpor        %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu     %%ymm2,0x40(%1) \n"
      "lea         0x60(%1),%1 \n"
      "sub         $0x20,%2 \n"
      "jg          1b \n"
      "vzeroupper  \n"
      : "+r"(src),                     // %0
        "+r"(dst),                     // %1
        "+r"(width)                    // %2
      : "m"(kShuffleMaskARGBToRGB24),  // %3
        "m"(kPermdRGB24_AVX)           // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
// Shuffle tables for converting ARGB to RGB24.
static const ulvec8 kPermARGBToRGB24_0 = {
    0u,  1u,  2u,  4u,  5u,  6u,  8u,  9u,  10u, 12u, 13u,
    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
static const ulvec8 kPermARGBToRGB24_1 = {
    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
static const ulvec8 kPermARGBToRGB24_2 = {
    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
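
// vpermt2b gathers bytes from the 64-byte concatenation of two source
// registers: index values 0..31 select from the first table operand,
// 32..63 from the second. A hedged scalar model of the 256-bit form used
// below (illustrative only):
//
//   static void ScalarPermT2B(const uint8_t a[32], const uint8_t b[32],
//                             const uint8_t idx[32], uint8_t out[32]) {
//     for (int i = 0; i < 32; ++i) {
//       out[i] = (idx[i] < 32) ? a[idx[i]] : b[idx[i] - 32];
//     }
//   }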

void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vmovdqa     %3,%%ymm5 \n"
      "vmovdqa     %4,%%ymm6 \n"
      "vmovdqa     %5,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu     (%0),%%ymm0 \n"
      "vmovdqu     0x20(%0),%%ymm1 \n"
      "vmovdqu     0x40(%0),%%ymm2 \n"
      "vmovdqu     0x60(%0),%%ymm3 \n"
      "lea         0x80(%0),%0 \n"
      "vpermt2b    %%ymm1,%%ymm5,%%ymm0 \n"
      "vpermt2b    %%ymm2,%%ymm6,%%ymm1 \n"
      "vpermt2b    %%ymm3,%%ymm7,%%ymm2 \n"
      "vmovdqu     %%ymm0,(%1) \n"
      "vmovdqu     %%ymm1,0x20(%1) \n"
      "vmovdqu     %%ymm2,0x40(%1) \n"
      "lea         0x60(%1),%1 \n"
      "sub         $0x20,%2 \n"
      "jg          1b \n"
      "vzeroupper  \n"
      : "+r"(src),                 // %0
        "+r"(dst),                 // %1
        "+r"(width)                // %2
      : "m"(kPermARGBToRGB24_0),   // %3
        "m"(kPermARGBToRGB24_1),   // %4
        "m"(kPermARGBToRGB24_2)    // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
}
#endif

#ifdef HAS_ARGBTORAWROW_AVX2
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"
      "vmovdqa     %4,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu     (%0),%%ymm0 \n"
      "vmovdqu     0x20(%0),%%ymm1 \n"
      "vmovdqu     0x40(%0),%%ymm2 \n"
      "vmovdqu     0x60(%0),%%ymm3 \n"
      "lea         0x80(%0),%0 \n"
      "vpshufb     %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb     %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb     %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb     %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd      %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd      %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd      %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd      %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq      $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor        %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu     %%ymm0,(%1) \n"
      "vpermq      $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq      $0x4f,%%ymm2,%%ymm4 \n"
      "vpor        %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu     %%ymm1,0x20(%1) \n"
      "vpermq      $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq      $0x93,%%ymm3,%%ymm3 \n"
      "vpor        %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu     %%ymm2,0x40(%1) \n"
      "lea         0x60(%1),%1 \n"
      "sub         $0x20,%2 \n"
      "jg          1b \n"
      "vzeroupper  \n"
      : "+r"(src),                   // %0
        "+r"(dst),                   // %1
        "+r"(width)                  // %2
      : "m"(kShuffleMaskARGBToRAW),  // %3
        "m"(kPermdRGB24_AVX)         // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm3,%%xmm3 \n"
      "psrld       $0x1b,%%xmm3 \n"
      "pcmpeqb     %%xmm4,%%xmm4 \n"
      "psrld       $0x1a,%%xmm4 \n"
      "pslld       $0x5,%%xmm4 \n"
      "pcmpeqb     %%xmm5,%%xmm5 \n"
      "pslld       $0xb,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqa      %%xmm0,%%xmm1 \n"
      "movdqa      %%xmm0,%%xmm2 \n"
      "pslld       $0x8,%%xmm0 \n"
      "psrld       $0x3,%%xmm1 \n"
      "psrld       $0x5,%%xmm2 \n"
      "psrad       $0x10,%%xmm0 \n"
      "pand        %%xmm3,%%xmm1 \n"
      "pand        %%xmm4,%%xmm2 \n"
      "pand        %%xmm5,%%xmm0 \n"
      "por         %%xmm2,%%xmm1 \n"
      "por         %%xmm1,%%xmm0 \n"
      "packssdw    %%xmm0,%%xmm0 \n"
      "lea         0x10(%0),%0 \n"
      "movq        %%xmm0,(%1) \n"
      "lea         0x8(%1),%1 \n"
      "sub         $0x4,%2 \n"
      "jg          1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      ::"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
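
// For reference, a hedged scalar sketch of ARGBToRGB565Row_SSE2 above
// (low bits are truncated; illustrative only):
//
//   for (int x = 0; x < width; ++x) {
//     uint8_t b = src[4 * x + 0];
//     uint8_t g = src[4 * x + 1];
//     uint8_t r = src[4 * x + 2];
//     uint16_t rgb565 =
//         (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
//     memcpy(dst + 2 * x, &rgb565, 2);  // little-endian store
//   }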

void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "movd        %3,%%xmm6 \n"
      "punpcklbw   %%xmm6,%%xmm6 \n"
      "movdqa      %%xmm6,%%xmm7 \n"
      "punpcklwd   %%xmm6,%%xmm6 \n"
      "punpckhwd   %%xmm7,%%xmm7 \n"
      "pcmpeqb     %%xmm3,%%xmm3 \n"
      "psrld       $0x1b,%%xmm3 \n"
      "pcmpeqb     %%xmm4,%%xmm4 \n"
      "psrld       $0x1a,%%xmm4 \n"
      "pslld       $0x5,%%xmm4 \n"
      "pcmpeqb     %%xmm5,%%xmm5 \n"
      "pslld       $0xb,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "paddusb     %%xmm6,%%xmm0 \n"
      "movdqa      %%xmm0,%%xmm1 \n"
      "movdqa      %%xmm0,%%xmm2 \n"
      "pslld       $0x8,%%xmm0 \n"
      "psrld       $0x3,%%xmm1 \n"
      "psrld       $0x5,%%xmm2 \n"
      "psrad       $0x10,%%xmm0 \n"
      "pand        %%xmm3,%%xmm1 \n"
      "pand        %%xmm4,%%xmm2 \n"
      "pand        %%xmm5,%%xmm0 \n"
      "por         %%xmm2,%%xmm1 \n"
      "por         %%xmm1,%%xmm0 \n"
      "packssdw    %%xmm0,%%xmm0 \n"
      "lea         0x10(%0),%0 \n"
      "movq        %%xmm0,(%1) \n"
      "lea         0x8(%1),%1 \n"
      "sub         $0x4,%2 \n"
      "jg          1b \n"
      : "+r"(src),      // %0
        "+r"(dst),      // %1
        "+r"(width)     // %2
      : "m"(dither4)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
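
// dither4 packs four per-pixel dither bytes; the setup code above
// replicates them across byte lanes so each pixel in a group of 4 gets
// its own offset, added with unsigned saturation (paddusb) before the
// 565 truncation. A hedged scalar sketch (illustrative only):
//
//   for (int x = 0; x < width; ++x) {
//     int d = (int)((dither4 >> ((x & 3) * 8)) & 0xff);
//     int b = src[4 * x + 0] + d; if (b > 255) b = 255;
//     int g = src[4 * x + 1] + d; if (g > 255) g = 255;
//     int r = src[4 * x + 2] + d; if (r > 255) r = 255;
//     uint16_t rgb565 =
//         (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
//     memcpy(dst + 2 * x, &rgb565, 2);
//   }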

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "vbroadcastss %3,%%xmm6 \n"
      "vpunpcklbw  %%xmm6,%%xmm6,%%xmm6 \n"
      "vpermq      $0xd8,%%ymm6,%%ymm6 \n"
      "vpunpcklwd  %%ymm6,%%ymm6,%%ymm6 \n"
      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3 \n"
      "vpsrld      $0x1b,%%ymm3,%%ymm3 \n"
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrld      $0x1a,%%ymm4,%%ymm4 \n"
      "vpslld      $0x5,%%ymm4,%%ymm4 \n"
      "vpslld      $0xb,%%ymm3,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu     (%0),%%ymm0 \n"
      "vpaddusb    %%ymm6,%%ymm0,%%ymm0 \n"
      "vpsrld      $0x5,%%ymm0,%%ymm2 \n"
      "vpsrld      $0x3,%%ymm0,%%ymm1 \n"
      "vpsrld      $0x8,%%ymm0,%%ymm0 \n"
      "vpand       %%ymm4,%%ymm2,%%ymm2 \n"
      "vpand       %%ymm3,%%ymm1,%%ymm1 \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0 \n"
      "vpor        %%ymm2,%%ymm1,%%ymm1 \n"
      "vpor        %%ymm1,%%ymm0,%%ymm0 \n"
      "vpackusdw   %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0 \n"
      "lea         0x20(%0),%0 \n"
      "vmovdqu     %%xmm0,(%1) \n"
      "lea         0x10(%1),%1 \n"
      "sub         $0x8,%2 \n"
      "jg          1b \n"
      "vzeroupper  \n"
      : "+r"(src),      // %0
        "+r"(dst),      // %1
        "+r"(width)     // %2
      : "m"(dither4)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4 \n"
      "psrld       $0x1b,%%xmm4 \n"
      "movdqa      %%xmm4,%%xmm5 \n"
      "pslld       $0x5,%%xmm5 \n"
      "movdqa      %%xmm4,%%xmm6 \n"
      "pslld       $0xa,%%xmm6 \n"
      "pcmpeqb     %%xmm7,%%xmm7 \n"
      "pslld       $0xf,%%xmm7 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqa      %%xmm0,%%xmm1 \n"
      "movdqa      %%xmm0,%%xmm2 \n"
      "movdqa      %%xmm0,%%xmm3 \n"
      "psrad       $0x10,%%xmm0 \n"
      "psrld       $0x3,%%xmm1 \n"
      "psrld       $0x6,%%xmm2 \n"
      "psrld       $0x9,%%xmm3 \n"
      "pand        %%xmm7,%%xmm0 \n"
      "pand        %%xmm4,%%xmm1 \n"
      "pand        %%xmm5,%%xmm2 \n"
      "pand        %%xmm6,%%xmm3 \n"
      "por         %%xmm1,%%xmm0 \n"
      "por         %%xmm3,%%xmm2 \n"
      "por         %%xmm2,%%xmm0 \n"
      "packssdw    %%xmm0,%%xmm0 \n"
      "lea         0x10(%0),%0 \n"
      "movq        %%xmm0,(%1) \n"
      "lea         0x8(%1),%1 \n"
      "sub         $0x4,%2 \n"
      "jg          1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      ::"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4 \n"
      "psllw       $0xc,%%xmm4 \n"
      "movdqa      %%xmm4,%%xmm3 \n"
      "psrlw       $0x8,%%xmm3 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqa      %%xmm0,%%xmm1 \n"
      "pand        %%xmm3,%%xmm0 \n"
      "pand        %%xmm4,%%xmm1 \n"
      "psrlq       $0x4,%%xmm0 \n"
      "psrlq       $0x8,%%xmm1 \n"
      "por         %%xmm1,%%xmm0 \n"
      "packuswb    %%xmm0,%%xmm0 \n"
      "lea         0x10(%0),%0 \n"
      "movq        %%xmm0,(%1) \n"
      "lea         0x8(%1),%1 \n"
      "sub         $0x4,%2 \n"
      "jg          1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      ::"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_RGB24TOARGBROW_SSSE3

/*

ARGBToAR30Row:

Red Blue
With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
wanted for the blue channel. The red needs to be shifted 4 left, so multiply by
(1024+4)*16 for red.

Alpha Green
Alpha and Green are already in the high bits, so vpand can zero out the other
bits, keeping just the 2 upper bits of alpha and the 8 bits of green. The same
multiplier could be used for Green - (1024+4), putting the 10 bit green in the
lsb. Alpha needs a simple multiplier to shift it into position. It wants a gap
of 10 above the green. Green is 10 bits, so there are 6 bits in the low short.
4 more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits,
and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the
result left 10 to position the A and G channels.
*/

// Shuffle tables to position B and R (or R and B for ABGR input) in the
// high byte of each 16-bit lane, for the AR30 conversions below.
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};

static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};

static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
static const uint32_t kMaskRB10 = 0x3ff003ff;
static const uint32_t kMaskAG10 = 0xc000ff00;
static const uint32_t kMulAG10 = 64 * 65536 + 1028;
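
// A hedged check of the constants above: pmulhuw computes (a * b) >> 16
// per unsigned 16-bit lane, so with an 8-bit channel v in the high byte
// of a lane, (v << 8) * 1028 (1024+4) yields (v << 2) | (v >> 6), the
// 8-to-10-bit expansion; red uses 1028 * 16 to land 4 bits higher
// (illustrative only):
//
//   static uint16_t Expand8To10(uint8_t v) {
//     return (uint16_t)(((uint32_t)(v << 8) * 1028u) >> 16);  // v<<2|v>>6
//   }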

void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm2 \n"  // shuffler for RB
      "movd        %4,%%xmm3 \n"  // multiplier for RB
      "movd        %5,%%xmm4 \n"  // mask for R10 B10
      "movd        %6,%%xmm5 \n"  // mask for AG
      "movd        %7,%%xmm6 \n"  // multiplier for AG
      "pshufd      $0x0,%%xmm3,%%xmm3 \n"
      "pshufd      $0x0,%%xmm4,%%xmm4 \n"
      "pshufd      $0x0,%%xmm5,%%xmm5 \n"
      "pshufd      $0x0,%%xmm6,%%xmm6 \n"
      "sub         %0,%1 \n"

      "1: \n"
      "movdqu      (%0),%%xmm0 \n"  // fetch 4 ARGB pixels
      "movdqa      %%xmm0,%%xmm1 \n"
      "pshufb      %%xmm2,%%xmm1 \n"  // R0B0
      "pand        %%xmm5,%%xmm0 \n"  // A0G0
      "pmulhuw     %%xmm3,%%xmm1 \n"  // X2 R16 X4 B10
      "pmulhuw     %%xmm6,%%xmm0 \n"  // X10 A2 X10 G10
      "pand        %%xmm4,%%xmm1 \n"  // X2 R10 X10 B10
      "pslld       $10,%%xmm0 \n"     // A2 x10 G10 x10
      "por         %%xmm1,%%xmm0 \n"  // A2 R10 G10 B10
      "movdqu      %%xmm0,(%1,%0) \n"  // store 4 AR30 pixels
      "add         $0x10,%0 \n"
      "sub         $0x4,%2 \n"
      "jg          1b \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa      %3,%%xmm2 \n"  // shuffler for RB
      "movd        %4,%%xmm3 \n"  // multiplier for RB
      "movd        %5,%%xmm4 \n"  // mask for R10 B10
      "movd        %6,%%xmm5 \n"  // mask for AG
      "movd        %7,%%xmm6 \n"  // multiplier for AG
      "pshufd      $0x0,%%xmm3,%%xmm3 \n"
      "pshufd      $0x0,%%xmm4,%%xmm4 \n"
      "pshufd      $0x0,%%xmm5,%%xmm5 \n"
      "pshufd      $0x0,%%xmm6,%%xmm6 \n"
      "sub         %0,%1 \n"

      "1: \n"
      "movdqu      (%0),%%xmm0 \n"  // fetch 4 ABGR pixels
      "movdqa      %%xmm0,%%xmm1 \n"
      "pshufb      %%xmm2,%%xmm1 \n"  // R0B0
      "pand        %%xmm5,%%xmm0 \n"  // A0G0
      "pmulhuw     %%xmm3,%%xmm1 \n"  // X2 R16 X4 B10
      "pmulhuw     %%xmm6,%%xmm0 \n"  // X10 A2 X10 G10
      "pand        %%xmm4,%%xmm1 \n"  // X2 R10 X10 B10
      "pslld       $10,%%xmm0 \n"     // A2 x10 G10 x10
      "por         %%xmm1,%%xmm0 \n"  // A2 R10 G10 B10
      "movdqu      %%xmm0,(%1,%0) \n"  // store 4 AR30 pixels
      "add         $0x10,%0 \n"
      "sub         $0x4,%2 \n"
      "jg          1b \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3 reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTOAR30ROW_AVX2
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2 \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3 \n"    // multiplier for RB
      "vbroadcastss %5,%%ymm4 \n"    // mask for R10 B10
      "vbroadcastss %6,%%ymm5 \n"    // mask for AG
      "vbroadcastss %7,%%ymm6 \n"    // multiplier for AG
      "sub         %0,%1 \n"

      "1: \n"
      "vmovdqu     (%0),%%ymm0 \n"  // fetch 8 ARGB pixels
      "vpshufb     %%ymm2,%%ymm0,%%ymm1 \n"  // R0B0
      "vpand       %%ymm5,%%ymm0,%%ymm0 \n"  // A0G0
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1 \n"  // X2 R16 X4 B10
      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0 \n"  // X10 A2 X10 G10
      "vpand       %%ymm4,%%ymm1,%%ymm1 \n"  // X2 R10 X10 B10
      "vpslld      $10,%%ymm0,%%ymm0 \n"     // A2 x10 G10 x10
      "vpor        %%ymm1,%%ymm0,%%ymm0 \n"  // A2 R10 G10 B10
      "vmovdqu     %%ymm0,(%1,%0) \n"  // store 8 AR30 pixels
      "add         $0x20,%0 \n"
      "sub         $0x8,%2 \n"
      "jg          1b \n"
      "vzeroupper  \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#ifdef HAS_ABGRTOAR30ROW_AVX2
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2 \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3 \n"    // multiplier for RB
      "vbroadcastss %5,%%ymm4 \n"    // mask for R10 B10
      "vbroadcastss %6,%%ymm5 \n"    // mask for AG
      "vbroadcastss %7,%%ymm6 \n"    // multiplier for AG
      "sub         %0,%1 \n"

      "1: \n"
      "vmovdqu     (%0),%%ymm0 \n"  // fetch 8 ABGR pixels
      "vpshufb     %%ymm2,%%ymm0,%%ymm1 \n"  // R0B0
      "vpand       %%ymm5,%%ymm0,%%ymm0 \n"  // A0G0
      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1 \n"  // X2 R16 X4 B10
      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0 \n"  // X10 A2 X10 G10
      "vpand       %%ymm4,%%ymm1,%%ymm1 \n"  // X2 R10 X10 B10
      "vpslld      $10,%%ymm0,%%ymm0 \n"     // A2 x10 G10 x10
      "vpor        %%ymm1,%%ymm0,%%ymm0 \n"  // A2 R10 G10 B10
      "vmovdqu     %%ymm0,(%1,%0) \n"  // store 8 AR30 pixels
      "add         $0x20,%0 \n"
      "sub         $0x8,%2 \n"
      "jg          1b \n"
      "vzeroupper  \n"

      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(width)         // %2
      : "m"(kShuffleBR30),  // %3 reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
                                         10, 9, 8, 11, 14, 13, 12, 15};

static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3,
                                           6, 6, 5, 5, 4, 4, 7, 7};
static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11,
                                           14, 14, 13, 13, 12, 12, 15, 15};

void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
                         uint16_t* dst_ar64,
                         int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqa      %%xmm0,%%xmm1 \n"
      "punpcklbw   %%xmm0,%%xmm0 \n"
      "punpckhbw   %%xmm1,%%xmm1 \n"
      "movdqu      %%xmm0,(%1) \n"
      "movdqu      %%xmm1,0x10(%1) \n"
      "lea         0x10(%0),%0 \n"
      "lea         0x20(%1),%1 \n"
      "sub         $0x4,%2 \n"
      "jg          1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
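
// Scalar sketch of the widening above: punpcklbw/punpckhbw against
// itself turns each 8-bit channel v into the 16-bit value (v << 8) | v,
// so 0x00 maps to 0x0000 and 0xff to 0xffff (illustrative only):
//
//   for (int i = 0; i < width * 4; ++i) {
//     dst_ar64[i] = (uint16_t)(src_argb[i] * 0x0101);
//   }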

void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
                         uint16_t* dst_ab64,
                         int width) {
  asm volatile(

      "movdqa      %3,%%xmm2 \n"
      "movdqa      %4,%%xmm3 \n" LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqa      %%xmm0,%%xmm1 \n"
      "pshufb      %%xmm2,%%xmm0 \n"
      "pshufb      %%xmm3,%%xmm1 \n"
      "movdqu      %%xmm0,(%1) \n"
      "movdqu      %%xmm1,0x10(%1) \n"
      "lea         0x10(%0),%0 \n"
      "lea         0x20(%1),%1 \n"
      "sub         $0x4,%2 \n"
      "jg          1b \n"
      : "+r"(src_argb),              // %0
        "+r"(dst_ab64),              // %1
        "+r"(width)                  // %2
      : "m"(kShuffleARGBToAB64Lo),   // %3
        "m"(kShuffleARGBToAB64Hi)    // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}

void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqu      0x10(%0),%%xmm1 \n"
      "psrlw       $8,%%xmm0 \n"
      "psrlw       $8,%%xmm1 \n"
      "packuswb    %%xmm1,%%xmm0 \n"
      "movdqu      %%xmm0,(%1) \n"
      "lea         0x20(%0),%0 \n"
      "lea         0x10(%1),%1 \n"
      "sub         $0x4,%2 \n"
      "jg          1b \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}

void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(

      "movdqa      %3,%%xmm2 \n" LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqu      0x10(%0),%%xmm1 \n"
      "psrlw       $8,%%xmm0 \n"
      "psrlw       $8,%%xmm1 \n"
      "packuswb    %%xmm1,%%xmm0 \n"
      "pshufb      %%xmm2,%%xmm0 \n"
      "movdqu      %%xmm0,(%1) \n"
      "lea         0x20(%0),%0 \n"
      "lea         0x10(%1),%1 \n"
      "sub         $0x4,%2 \n"
      "jg          1b \n"
      : "+r"(src_ab64),            // %0
        "+r"(dst_argb),            // %1
        "+r"(width)                // %2
      : "m"(kShuffleARGBToABGR)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}

#ifdef HAS_ARGBTOAR64ROW_AVX2
void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
                        uint16_t* dst_ar64,
                        int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "vmovdqu     (%0),%%ymm0 \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0 \n"
      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1 \n"
      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0 \n"
      "vmovdqu     %%ymm0,(%1) \n"
      "vmovdqu     %%ymm1,0x20(%1) \n"
      "lea         0x20(%0),%0 \n"
      "lea         0x40(%1),%1 \n"
      "sub         $0x8,%2 \n"
      "jg          1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif

#ifdef HAS_ARGBTOAB64ROW_AVX2
void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
                        uint16_t* dst_ab64,
                        int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm2 \n"
      "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN
      "1: \n"
      "vmovdqu     (%0),%%ymm0 \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb     %%ymm3,%%ymm0,%%ymm1 \n"
      "vpshufb     %%ymm2,%%ymm0,%%ymm0 \n"
      "vmovdqu     %%ymm0,(%1) \n"
      "vmovdqu     %%ymm1,0x20(%1) \n"
      "lea         0x20(%0),%0 \n"
      "lea         0x40(%1),%1 \n"
      "sub         $0x8,%2 \n"
      "jg          1b \n"
      : "+r"(src_argb),              // %0
        "+r"(dst_ab64),              // %1
        "+r"(width)                  // %2
      : "m"(kShuffleARGBToAB64Lo),   // %3
        "m"(kShuffleARGBToAB64Hi)    // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif

#ifdef HAS_AR64TOARGBROW_AVX2
void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "vmovdqu     (%0),%%ymm0 \n"
      "vmovdqu     0x20(%0),%%ymm1 \n"
      "vpsrlw      $8,%%ymm0,%%ymm0 \n"
      "vpsrlw      $8,%%ymm1,%%ymm1 \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu     %%ymm0,(%1) \n"
      "lea         0x40(%0),%0 \n"
      "lea         0x20(%1),%1 \n"
      "sub         $0x8,%2 \n"
      "jg          1b \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif

#ifdef HAS_AB64TOARGBROW_AVX2
void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN
      "1: \n"
      "vmovdqu     (%0),%%ymm0 \n"
      "vmovdqu     0x20(%0),%%ymm1 \n"
      "vpsrlw      $8,%%ymm0,%%ymm0 \n"
      "vpsrlw      $8,%%ymm1,%%ymm1 \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb     %%ymm2,%%ymm0,%%ymm0 \n"
      "vmovdqu     %%ymm0,(%1) \n"
      "lea         0x40(%0),%0 \n"
      "lea         0x20(%1),%1 \n"
      "sub         $0x8,%2 \n"
      "jg          1b \n"
      : "+r"(src_ab64),            // %0
        "+r"(dst_argb),            // %1
        "+r"(width)                // %2
      : "m"(kShuffleARGBToABGR)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif

// clang-format off

// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
// The round parameter names the register holding the value to add before the
// final shift.
#define RGBTOY(round) \
  "1:                                        \n" \
  "movdqu      (%0),%%xmm0                   \n" \
  "movdqu      0x10(%0),%%xmm1               \n" \
  "movdqu      0x20(%0),%%xmm2               \n" \
  "movdqu      0x30(%0),%%xmm3               \n" \
  "psubb       %%xmm5,%%xmm0                 \n" \
  "psubb       %%xmm5,%%xmm1                 \n" \
  "psubb       %%xmm5,%%xmm2                 \n" \
  "psubb       %%xmm5,%%xmm3                 \n" \
  "movdqu      %%xmm4,%%xmm6                 \n" \
  "pmaddubsw   %%xmm0,%%xmm6                 \n" \
  "movdqu      %%xmm4,%%xmm0                 \n" \
  "pmaddubsw   %%xmm1,%%xmm0                 \n" \
  "movdqu      %%xmm4,%%xmm1                 \n" \
  "pmaddubsw   %%xmm2,%%xmm1                 \n" \
  "movdqu      %%xmm4,%%xmm2                 \n" \
  "pmaddubsw   %%xmm3,%%xmm2                 \n" \
  "lea         0x40(%0),%0                   \n" \
  "phaddw      %%xmm0,%%xmm6                 \n" \
  "phaddw      %%xmm2,%%xmm1                 \n" \
  "prefetcht0  1280(%0)                      \n" \
  "paddw       %%" #round ",%%xmm6           \n" \
  "paddw       %%" #round ",%%xmm1           \n" \
  "psrlw       $0x8,%%xmm6                   \n" \
  "psrlw       $0x8,%%xmm1                   \n" \
  "packuswb    %%xmm1,%%xmm6                 \n" \
  "movdqu      %%xmm6,(%1)                   \n" \
  "lea         0x10(%1),%1                   \n" \
  "sub         $0x10,%2                      \n" \
  "jg          1b                            \n"

#define RGBTOY_AVX2(round) \
  "1:                                        \n" \
  "vmovdqu     (%0),%%ymm0                   \n" \
  "vmovdqu     0x20(%0),%%ymm1               \n" \
  "vmovdqu     0x40(%0),%%ymm2               \n" \
  "vmovdqu     0x60(%0),%%ymm3               \n" \
  "vpsubb      %%ymm5, %%ymm0, %%ymm0        \n" \
  "vpsubb      %%ymm5, %%ymm1, %%ymm1        \n" \
  "vpsubb      %%ymm5, %%ymm2, %%ymm2        \n" \
  "vpsubb      %%ymm5, %%ymm3, %%ymm3        \n" \
  "vpmaddubsw  %%ymm0,%%ymm4,%%ymm0          \n" \
  "vpmaddubsw  %%ymm1,%%ymm4,%%ymm1          \n" \
  "vpmaddubsw  %%ymm2,%%ymm4,%%ymm2          \n" \
  "vpmaddubsw  %%ymm3,%%ymm4,%%ymm3          \n" \
  "lea         0x80(%0),%0                   \n" \
  "vphaddw     %%ymm1,%%ymm0,%%ymm0          \n" /* mutates. */ \
  "vphaddw     %%ymm3,%%ymm2,%%ymm2          \n" \
  "prefetcht0  1280(%0)                      \n" \
  "vpaddw      %%" #round ",%%ymm0,%%ymm0    \n" /* Add .5 for rounding. */ \
  "vpaddw      %%" #round ",%%ymm2,%%ymm2    \n" \
  "vpsrlw      $0x8,%%ymm0,%%ymm0            \n" \
  "vpsrlw      $0x8,%%ymm2,%%ymm2            \n" \
  "vpackuswb   %%ymm2,%%ymm0,%%ymm0          \n" /* mutates. */ \
  "vpermd      %%ymm0,%%ymm6,%%ymm0          \n" /* unmutate. */ \
  "vmovdqu     %%ymm0,(%1)                   \n" \
  "lea         0x20(%1),%1                   \n" \
  "sub         $0x20,%2                      \n" \
  "jg          1b                            \n" \
  "vzeroupper                                \n"

// clang-format on
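
// For reference, hedged scalar equivalents of what RGBTOY computes with
// the tables above. The psubb of kSub128 re-centers the unsigned pixels
// so pmaddubsw's signed accumulation cannot saturate, and the paddw of
// kAddY16 (0x7e80) or kSub128 (0x8080) undoes the bias before the shift
// (illustrative only):
//
//   // BT.601 studio range: kARGBToY with kAddY16.
//   static uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
//     return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
//   }
//   // JPEG full range: kARGBToYJ with kSub128.
//   static uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
//     return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);
//   }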

#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4 \n"
      "movdqa      %4,%%xmm5 \n"
      "movdqa      %5,%%xmm7 \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4 \n"
      "movdqa      %4,%%xmm5 \n"

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBTOYJROW_SSSE3

#ifdef HAS_RGBATOYJROW_SSSE3
// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa      %3,%%xmm4 \n"
      "movdqa      %4,%%xmm5 \n"

      LABELALIGN RGBTOY(xmm5)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_RGBATOYJROW_SSSE3

#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
// vpermd to undo the lane interleave left by vphaddw + vpackuswb.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
#endif

#ifdef HAS_ARGBTOYROW_AVX2

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vbroadcastf128 %5,%%ymm7 \n"
      "vmovdqu     %6,%%ymm6 \n"

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vbroadcastf128 %5,%%ymm7 \n"
      "vmovdqu     %6,%%ymm6 \n"

      LABELALIGN RGBTOY_AVX2(ymm7)
      : "+r"(src_abgr),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kABGRToY),          // %3
        "m"(kSub128),           // %4
        "m"(kAddY16),           // %5
        "m"(kPermdARGBToY_AVX)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vmovdqu     %5,%%ymm6 \n"

      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_argb),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kARGBToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kPermdARGBToY_AVX)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYJROW_AVX2

#ifdef HAS_RGBATOYJROW_AVX2
// Convert 32 RGBA pixels (128 bytes) to 32 YJ values.
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vmovdqu     %5,%%ymm6 \n"

      LABELALIGN RGBTOY_AVX2(ymm5)
      : "+r"(src_rgba),         // %0
        "+r"(dst_y),            // %1
        "+r"(width)             // %2
      : "m"(kRGBAToYJ),         // %3
        "m"(kSub128),           // %4
        "m"(kPermdARGBToY_AVX)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_RGBATOYJROW_AVX2

#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa      %5,%%xmm3 \n"
      "movdqa      %6,%%xmm4 \n"
      "movdqa      %7,%%xmm5 \n"
      "sub         %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu      (%0),%%xmm0 \n"
      "movdqu      0x00(%0,%4,1),%%xmm7 \n"
      "pavgb       %%xmm7,%%xmm0 \n"
      "movdqu      0x10(%0),%%xmm1 \n"
      "movdqu      0x10(%0,%4,1),%%xmm7 \n"
      "pavgb       %%xmm7,%%xmm1 \n"
      "movdqu      0x20(%0),%%xmm2 \n"
      "movdqu      0x20(%0,%4,1),%%xmm7 \n"
      "pavgb       %%xmm7,%%xmm2 \n"
      "movdqu      0x30(%0),%%xmm6 \n"
      "movdqu      0x30(%0,%4,1),%%xmm7 \n"
      "pavgb       %%xmm7,%%xmm6 \n"

      "lea         0x40(%0),%0 \n"
      "movdqa      %%xmm0,%%xmm7 \n"
      "shufps      $0x88,%%xmm1,%%xmm0 \n"
      "shufps      $0xdd,%%xmm1,%%xmm7 \n"
      "pavgb       %%xmm7,%%xmm0 \n"
      "movdqa      %%xmm2,%%xmm7 \n"
      "shufps      $0x88,%%xmm6,%%xmm2 \n"
      "shufps      $0xdd,%%xmm6,%%xmm7 \n"
      "pavgb       %%xmm7,%%xmm2 \n"
      "movdqa      %%xmm0,%%xmm1 \n"
      "movdqa      %%xmm2,%%xmm6 \n"
      "pmaddubsw   %%xmm4,%%xmm0 \n"
      "pmaddubsw   %%xmm4,%%xmm2 \n"
      "pmaddubsw   %%xmm3,%%xmm1 \n"
      "pmaddubsw   %%xmm3,%%xmm6 \n"
      "phaddw      %%xmm2,%%xmm0 \n"
      "phaddw      %%xmm6,%%xmm1 \n"
      "psraw       $0x8,%%xmm0 \n"
      "psraw       $0x8,%%xmm1 \n"
      "packsswb    %%xmm1,%%xmm0 \n"
      "paddb       %%xmm5,%%xmm0 \n"
      "movlps      %%xmm0,(%1) \n"
      "movhps      %%xmm0,0x00(%1,%2,1) \n"
      "lea         0x8(%1),%1 \n"
      "sub         $0x10,%3 \n"
      "jg          1b \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kARGBToV),                     // %5
        "m"(kARGBToU),                     // %6
        "m"(kAddUV128)                     // %7
1570 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
#endif  // HAS_ARGBTOUVROW_SSSE3
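
// For reference, hedged scalar equivalents of the U/V computation above,
// applied to the 2x2-averaged pixels (illustrative only):
//
//   static uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
//     return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
//   }
//   static uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
//     return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
//   }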

#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
void ARGBToUVRow_AVX2(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5 \n"
      "vbroadcastf128 %6,%%ymm6 \n"
      "vbroadcastf128 %7,%%ymm7 \n"
      "sub         %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu     (%0),%%ymm0 \n"
      "vmovdqu     0x20(%0),%%ymm1 \n"
      "vmovdqu     0x40(%0),%%ymm2 \n"
      "vmovdqu     0x60(%0),%%ymm3 \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
      "lea         0x80(%0),%0 \n"
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0 \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2 \n"

      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1 \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3 \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0 \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2 \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0 \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1 \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0 \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0 \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb     %8,%%ymm0,%%ymm0 \n"
      "vpaddb      %%ymm5,%%ymm0,%%ymm0 \n"

      "vextractf128 $0x0,%%ymm0,(%1) \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
      "lea         0x10(%1),%1 \n"
      "sub         $0x20,%3 \n"
      "jg          1b \n"
      "vzeroupper  \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kAddUV128),                    // %5
        "m"(kARGBToV),                     // %6
        "m"(kARGBToU),                     // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVROW_AVX2

#ifdef HAS_ABGRTOUVROW_AVX2
void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5 \n"
      "vbroadcastf128 %6,%%ymm6 \n"
      "vbroadcastf128 %7,%%ymm7 \n"
      "sub         %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu     (%0),%%ymm0 \n"
      "vmovdqu     0x20(%0),%%ymm1 \n"
      "vmovdqu     0x40(%0),%%ymm2 \n"
      "vmovdqu     0x60(%0),%%ymm3 \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
      "lea         0x80(%0),%0 \n"
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0 \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2 \n"

      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1 \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3 \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0 \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2 \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0 \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1 \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0 \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0 \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb     %8,%%ymm0,%%ymm0 \n"
      "vpaddb      %%ymm5,%%ymm0,%%ymm0 \n"

      "vextractf128 $0x0,%%ymm0,(%1) \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
      "lea         0x10(%1),%1 \n"
      "sub         $0x20,%3 \n"
      "jg          1b \n"
      "vzeroupper  \n"
      : "+r"(src_abgr),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kAddUV128),                    // %5
        "m"(kABGRToV),                     // %6
        "m"(kABGRToU),                     // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOUVROW_AVX2

#ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5 \n"
      "vbroadcastf128 %6,%%ymm6 \n"
      "vbroadcastf128 %7,%%ymm7 \n"
      "sub         %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu     (%0),%%ymm0 \n"
      "vmovdqu     0x20(%0),%%ymm1 \n"
      "vmovdqu     0x40(%0),%%ymm2 \n"
      "vmovdqu     0x60(%0),%%ymm3 \n"
      "vpavgb      0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb      0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "vpavgb      0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
      "vpavgb      0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
      "lea         0x80(%0),%0 \n"
      "vshufps     $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
      "vshufps     $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
      "vpavgb      %%ymm4,%%ymm0,%%ymm0 \n"
      "vshufps     $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
      "vshufps     $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
      "vpavgb      %%ymm4,%%ymm2,%%ymm2 \n"

      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm1 \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm3 \n"
      "vpmaddubsw  %%ymm6,%%ymm0,%%ymm0 \n"
      "vpmaddubsw  %%ymm6,%%ymm2,%%ymm2 \n"
      "vphaddw     %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw     %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw      %%ymm5,%%ymm0,%%ymm0 \n"
      "vpaddw      %%ymm5,%%ymm1,%%ymm1 \n"
      "vpsraw      $0x8,%%ymm1,%%ymm1 \n"
      "vpsraw      $0x8,%%ymm0,%%ymm0 \n"
      "vpacksswb   %%ymm0,%%ymm1,%%ymm0 \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb     %8,%%ymm0,%%ymm0 \n"

      "vextractf128 $0x0,%%ymm0,(%1) \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
      "lea         0x10(%1),%1 \n"
      "sub         $0x20,%3 \n"
      "jg          1b \n"
      "vzeroupper  \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kSub128),                      // %5
        "m"(kARGBToVJ),                    // %6
        "m"(kARGBToUJ),                    // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVJROW_AVX2
1767
1768 #ifdef HAS_ARGBTOUVJROW_SSSE3
1769 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
1770 int src_stride_argb,
1771 uint8_t* dst_u,
1772 uint8_t* dst_v,
1773 int width) {
1774 asm volatile(
1775 "movdqa %5,%%xmm3 \n"
1776 "movdqa %6,%%xmm4 \n"
1777 "movdqa %7,%%xmm5 \n"
1778 "sub %1,%2 \n"
1779
1780 LABELALIGN
1781 "1: \n"
1782 "movdqu (%0),%%xmm0 \n"
1783 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
1784 "pavgb %%xmm7,%%xmm0 \n"
1785 "movdqu 0x10(%0),%%xmm1 \n"
1786 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1787 "pavgb %%xmm7,%%xmm1 \n"
1788 "movdqu 0x20(%0),%%xmm2 \n"
1789 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1790 "pavgb %%xmm7,%%xmm2 \n"
1791 "movdqu 0x30(%0),%%xmm6 \n"
1792 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1793 "pavgb %%xmm7,%%xmm6 \n"
1794
1795 "lea 0x40(%0),%0 \n"
1796 "movdqa %%xmm0,%%xmm7 \n"
1797 "shufps $0x88,%%xmm1,%%xmm0 \n"
1798 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1799 "pavgb %%xmm7,%%xmm0 \n"
1800 "movdqa %%xmm2,%%xmm7 \n"
1801 "shufps $0x88,%%xmm6,%%xmm2 \n"
1802 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1803 "pavgb %%xmm7,%%xmm2 \n"
1804 "movdqa %%xmm0,%%xmm1 \n"
1805 "movdqa %%xmm2,%%xmm6 \n"
1806 "pmaddubsw %%xmm4,%%xmm0 \n"
1807 "pmaddubsw %%xmm4,%%xmm2 \n"
1808 "pmaddubsw %%xmm3,%%xmm1 \n"
1809 "pmaddubsw %%xmm3,%%xmm6 \n"
1810 "phaddw %%xmm2,%%xmm0 \n"
1811 "phaddw %%xmm6,%%xmm1 \n"
1812 "paddw %%xmm5,%%xmm0 \n"
1813 "paddw %%xmm5,%%xmm1 \n"
1814 "psraw $0x8,%%xmm0 \n"
1815 "psraw $0x8,%%xmm1 \n"
1816 "packsswb %%xmm1,%%xmm0 \n"
1817 "movlps %%xmm0,(%1) \n"
1818 "movhps %%xmm0,0x00(%1,%2,1) \n"
1819 "lea 0x8(%1),%1 \n"
1820 "sub $0x10,%3 \n"
1821 "jg 1b \n"
1822 : "+r"(src_argb), // %0
1823 "+r"(dst_u), // %1
1824 "+r"(dst_v), // %2
1825 "+rm"(width) // %3
1826 : "r"((intptr_t)(src_stride_argb)), // %4
1827 "m"(kARGBToVJ), // %5
1828 "m"(kARGBToUJ), // %6
1829 "m"(kSub128) // %7
1830 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1831 }
1832 #endif // HAS_ARGBTOUVJROW_SSSE3
1833
1834 #ifdef HAS_ARGBTOUV444ROW_SSSE3
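// 4:4:4 sampling: U and V are computed for every pixel, so there is no
// row averaging and no stride operand. The same 16 pixels are read twice,
// once through the U coefficients (xmm4) for dst_u and once through the V
// coefficients (xmm3) for dst_v.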
1835 void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
1836 uint8_t* dst_u,
1837 uint8_t* dst_v,
1838 int width) {
1839 asm volatile(
1840 "movdqa %4,%%xmm3 \n"
1841 "movdqa %5,%%xmm4 \n"
1842 "movdqa %6,%%xmm5 \n"
1843 "sub %1,%2 \n"
1844
1845 LABELALIGN
1846 "1: \n"
1847 "movdqu (%0),%%xmm0 \n"
1848 "movdqu 0x10(%0),%%xmm1 \n"
1849 "movdqu 0x20(%0),%%xmm2 \n"
1850 "movdqu 0x30(%0),%%xmm6 \n"
1851 "pmaddubsw %%xmm4,%%xmm0 \n"
1852 "pmaddubsw %%xmm4,%%xmm1 \n"
1853 "pmaddubsw %%xmm4,%%xmm2 \n"
1854 "pmaddubsw %%xmm4,%%xmm6 \n"
1855 "phaddw %%xmm1,%%xmm0 \n"
1856 "phaddw %%xmm6,%%xmm2 \n"
1857 "psraw $0x8,%%xmm0 \n"
1858 "psraw $0x8,%%xmm2 \n"
1859 "packsswb %%xmm2,%%xmm0 \n"
1860 "paddb %%xmm5,%%xmm0 \n"
1861 "movdqu %%xmm0,(%1) \n"
1862 "movdqu (%0),%%xmm0 \n"
1863 "movdqu 0x10(%0),%%xmm1 \n"
1864 "movdqu 0x20(%0),%%xmm2 \n"
1865 "movdqu 0x30(%0),%%xmm6 \n"
1866 "pmaddubsw %%xmm3,%%xmm0 \n"
1867 "pmaddubsw %%xmm3,%%xmm1 \n"
1868 "pmaddubsw %%xmm3,%%xmm2 \n"
1869 "pmaddubsw %%xmm3,%%xmm6 \n"
1870 "phaddw %%xmm1,%%xmm0 \n"
1871 "phaddw %%xmm6,%%xmm2 \n"
1872 "psraw $0x8,%%xmm0 \n"
1873 "psraw $0x8,%%xmm2 \n"
1874 "packsswb %%xmm2,%%xmm0 \n"
1875 "paddb %%xmm5,%%xmm0 \n"
1876 "lea 0x40(%0),%0 \n"
1877 "movdqu %%xmm0,0x00(%1,%2,1) \n"
1878 "lea 0x10(%1),%1 \n"
1879 "sub $0x10,%3 \n"
1880 "jg 1b \n"
1881 : "+r"(src_argb), // %0
1882 "+r"(dst_u), // %1
1883 "+r"(dst_v), // %2
1884 "+rm"(width) // %3
1885 : "m"(kARGBToV), // %4
1886 "m"(kARGBToU), // %5
1887 "m"(kAddUV128) // %6
1888 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
1889 }
1890 #endif // HAS_ARGBTOUV444ROW_SSSE3
1891
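// The *ToYRow_SSSE3 functions below share the RGBTOY loop body and differ
// only in the channel-ordered coefficient table loaded into xmm4.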
1892 void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
1893 asm volatile(
1894 "movdqa %3,%%xmm4 \n"
1895 "movdqa %4,%%xmm5 \n"
1896 "movdqa %5,%%xmm7 \n"
1897
1898 LABELALIGN RGBTOY(xmm7)
1899 : "+r"(src_bgra), // %0
1900 "+r"(dst_y), // %1
1901 "+r"(width) // %2
1902 : "m"(kBGRAToY), // %3
1903 "m"(kSub128), // %4
1904 "m"(kAddY16) // %5
1905 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1906 "xmm7");
1907 }
1908
1909 void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
1910 int src_stride_bgra,
1911 uint8_t* dst_u,
1912 uint8_t* dst_v,
1913 int width) {
1914 asm volatile(
1915 "movdqa %5,%%xmm3 \n"
1916 "movdqa %6,%%xmm4 \n"
1917 "movdqa %7,%%xmm5 \n"
1918 "sub %1,%2 \n"
1919
1920 LABELALIGN
1921 "1: \n"
1922 "movdqu (%0),%%xmm0 \n"
1923 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
1924 "pavgb %%xmm7,%%xmm0 \n"
1925 "movdqu 0x10(%0),%%xmm1 \n"
1926 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1927 "pavgb %%xmm7,%%xmm1 \n"
1928 "movdqu 0x20(%0),%%xmm2 \n"
1929 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1930 "pavgb %%xmm7,%%xmm2 \n"
1931 "movdqu 0x30(%0),%%xmm6 \n"
1932 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1933 "pavgb %%xmm7,%%xmm6 \n"
1934
1935 "lea 0x40(%0),%0 \n"
1936 "movdqa %%xmm0,%%xmm7 \n"
1937 "shufps $0x88,%%xmm1,%%xmm0 \n"
1938 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1939 "pavgb %%xmm7,%%xmm0 \n"
1940 "movdqa %%xmm2,%%xmm7 \n"
1941 "shufps $0x88,%%xmm6,%%xmm2 \n"
1942 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1943 "pavgb %%xmm7,%%xmm2 \n"
1944 "movdqa %%xmm0,%%xmm1 \n"
1945 "movdqa %%xmm2,%%xmm6 \n"
1946 "pmaddubsw %%xmm4,%%xmm0 \n"
1947 "pmaddubsw %%xmm4,%%xmm2 \n"
1948 "pmaddubsw %%xmm3,%%xmm1 \n"
1949 "pmaddubsw %%xmm3,%%xmm6 \n"
1950 "phaddw %%xmm2,%%xmm0 \n"
1951 "phaddw %%xmm6,%%xmm1 \n"
1952 "psraw $0x8,%%xmm0 \n"
1953 "psraw $0x8,%%xmm1 \n"
1954 "packsswb %%xmm1,%%xmm0 \n"
1955 "paddb %%xmm5,%%xmm0 \n"
1956 "movlps %%xmm0,(%1) \n"
1957 "movhps %%xmm0,0x00(%1,%2,1) \n"
1958 "lea 0x8(%1),%1 \n"
1959 "sub $0x10,%3 \n"
1960 "jg 1b \n"
1961 : "+r"(src_bgra), // %0
1962 "+r"(dst_u), // %1
1963 "+r"(dst_v), // %2
1964 "+rm"(width) // %3
1965 : "r"((intptr_t)(src_stride_bgra)), // %4
1966 "m"(kBGRAToV), // %5
1967 "m"(kBGRAToU), // %6
1968 "m"(kAddUV128) // %7
1969 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1970 }
1971
1972 void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
1973 asm volatile(
1974 "movdqa %3,%%xmm4 \n"
1975 "movdqa %4,%%xmm5 \n"
1976 "movdqa %5,%%xmm7 \n"
1977
1978 LABELALIGN RGBTOY(xmm7)
1979 : "+r"(src_abgr), // %0
1980 "+r"(dst_y), // %1
1981 "+r"(width) // %2
1982 : "m"(kABGRToY), // %3
1983 "m"(kSub128), // %4
1984 "m"(kAddY16) // %5
1985 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1986 "xmm7");
1987 }
1988
1989 void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
1990 asm volatile(
1991 "movdqa %3,%%xmm4 \n"
1992 "movdqa %4,%%xmm5 \n"
1993 "movdqa %5,%%xmm7 \n"
1994
1995 LABELALIGN RGBTOY(xmm7)
1996 : "+r"(src_rgba), // %0
1997 "+r"(dst_y), // %1
1998 "+r"(width) // %2
1999 : "m"(kRGBAToY), // %3
2000 "m"(kSub128), // %4
2001 "m"(kAddY16) // %5
2002 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2003 "xmm7");
2004 }
2005
2006 void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
2007 int src_stride_abgr,
2008 uint8_t* dst_u,
2009 uint8_t* dst_v,
2010 int width) {
2011 asm volatile(
2012 "movdqa %5,%%xmm3 \n"
2013 "movdqa %6,%%xmm4 \n"
2014 "movdqa %7,%%xmm5 \n"
2015 "sub %1,%2 \n"
2016
2017 LABELALIGN
2018 "1: \n"
2019 "movdqu (%0),%%xmm0 \n"
2020 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
2021 "pavgb %%xmm7,%%xmm0 \n"
2022 "movdqu 0x10(%0),%%xmm1 \n"
2023 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
2024 "pavgb %%xmm7,%%xmm1 \n"
2025 "movdqu 0x20(%0),%%xmm2 \n"
2026 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
2027 "pavgb %%xmm7,%%xmm2 \n"
2028 "movdqu 0x30(%0),%%xmm6 \n"
2029 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
2030 "pavgb %%xmm7,%%xmm6 \n"
2031
2032 "lea 0x40(%0),%0 \n"
2033 "movdqa %%xmm0,%%xmm7 \n"
2034 "shufps $0x88,%%xmm1,%%xmm0 \n"
2035 "shufps $0xdd,%%xmm1,%%xmm7 \n"
2036 "pavgb %%xmm7,%%xmm0 \n"
2037 "movdqa %%xmm2,%%xmm7 \n"
2038 "shufps $0x88,%%xmm6,%%xmm2 \n"
2039 "shufps $0xdd,%%xmm6,%%xmm7 \n"
2040 "pavgb %%xmm7,%%xmm2 \n"
2041 "movdqa %%xmm0,%%xmm1 \n"
2042 "movdqa %%xmm2,%%xmm6 \n"
2043 "pmaddubsw %%xmm4,%%xmm0 \n"
2044 "pmaddubsw %%xmm4,%%xmm2 \n"
2045 "pmaddubsw %%xmm3,%%xmm1 \n"
2046 "pmaddubsw %%xmm3,%%xmm6 \n"
2047 "phaddw %%xmm2,%%xmm0 \n"
2048 "phaddw %%xmm6,%%xmm1 \n"
2049 "psraw $0x8,%%xmm0 \n"
2050 "psraw $0x8,%%xmm1 \n"
2051 "packsswb %%xmm1,%%xmm0 \n"
2052 "paddb %%xmm5,%%xmm0 \n"
2053 "movlps %%xmm0,(%1) \n"
2054 "movhps %%xmm0,0x00(%1,%2,1) \n"
2055 "lea 0x8(%1),%1 \n"
2056 "sub $0x10,%3 \n"
2057 "jg 1b \n"
2058 : "+r"(src_abgr), // %0
2059 "+r"(dst_u), // %1
2060 "+r"(dst_v), // %2
2061 "+rm"(width) // %3
2062 : "r"((intptr_t)(src_stride_abgr)), // %4
2063 "m"(kABGRToV), // %5
2064 "m"(kABGRToU), // %6
2065 "m"(kAddUV128) // %7
2066 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
2067 }
2068
2069 void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
2070 int src_stride_rgba,
2071 uint8_t* dst_u,
2072 uint8_t* dst_v,
2073 int width) {
2074 asm volatile(
2075 "movdqa %5,%%xmm3 \n"
2076 "movdqa %6,%%xmm4 \n"
2077 "movdqa %7,%%xmm5 \n"
2078 "sub %1,%2 \n"
2079
2080 LABELALIGN
2081 "1: \n"
2082 "movdqu (%0),%%xmm0 \n"
2083 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
2084 "pavgb %%xmm7,%%xmm0 \n"
2085 "movdqu 0x10(%0),%%xmm1 \n"
2086 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
2087 "pavgb %%xmm7,%%xmm1 \n"
2088 "movdqu 0x20(%0),%%xmm2 \n"
2089 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
2090 "pavgb %%xmm7,%%xmm2 \n"
2091 "movdqu 0x30(%0),%%xmm6 \n"
2092 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
2093 "pavgb %%xmm7,%%xmm6 \n"
2094
2095 "lea 0x40(%0),%0 \n"
2096 "movdqa %%xmm0,%%xmm7 \n"
2097 "shufps $0x88,%%xmm1,%%xmm0 \n"
2098 "shufps $0xdd,%%xmm1,%%xmm7 \n"
2099 "pavgb %%xmm7,%%xmm0 \n"
2100 "movdqa %%xmm2,%%xmm7 \n"
2101 "shufps $0x88,%%xmm6,%%xmm2 \n"
2102 "shufps $0xdd,%%xmm6,%%xmm7 \n"
2103 "pavgb %%xmm7,%%xmm2 \n"
2104 "movdqa %%xmm0,%%xmm1 \n"
2105 "movdqa %%xmm2,%%xmm6 \n"
2106 "pmaddubsw %%xmm4,%%xmm0 \n"
2107 "pmaddubsw %%xmm4,%%xmm2 \n"
2108 "pmaddubsw %%xmm3,%%xmm1 \n"
2109 "pmaddubsw %%xmm3,%%xmm6 \n"
2110 "phaddw %%xmm2,%%xmm0 \n"
2111 "phaddw %%xmm6,%%xmm1 \n"
2112 "psraw $0x8,%%xmm0 \n"
2113 "psraw $0x8,%%xmm1 \n"
2114 "packsswb %%xmm1,%%xmm0 \n"
2115 "paddb %%xmm5,%%xmm0 \n"
2116 "movlps %%xmm0,(%1) \n"
2117 "movhps %%xmm0,0x00(%1,%2,1) \n"
2118 "lea 0x8(%1),%1 \n"
2119 "sub $0x10,%3 \n"
2120 "jg 1b \n"
2121 : "+r"(src_rgba), // %0
2122 "+r"(dst_u), // %1
2123 "+r"(dst_v), // %2
2124 "+rm"(width) // %3
2125 : "r"((intptr_t)(src_stride_rgba)), // %4
2126 "m"(kRGBAToV), // %5
2127 "m"(kRGBAToU), // %6
2128 "m"(kAddUV128) // %7
2129 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
2130 }
2131
2132 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
2133
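// The READ* macros below share a register convention: xmm3 receives the
// interleaved UV bytes, xmm4 the Y samples and, in the alpha variants,
// xmm5 the alpha bytes, ready for YUVTORGB and STOREARGB.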
2134 // Read 8 UV from 444
2135 #define READYUV444 \
2136 "movq (%[u_buf]),%%xmm3 \n" \
2137 "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2138 "lea 0x8(%[u_buf]),%[u_buf] \n" \
2139 "punpcklbw %%xmm1,%%xmm3 \n" \
2140 "movq (%[y_buf]),%%xmm4 \n" \
2141 "punpcklbw %%xmm4,%%xmm4 \n" \
2142 "lea 0x8(%[y_buf]),%[y_buf] \n"
2143
2144 // Read 4 UV from 422, upsample to 8 UV
2145 #define READYUV422 \
2146 "movd (%[u_buf]),%%xmm3 \n" \
2147 "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2148 "lea 0x4(%[u_buf]),%[u_buf] \n" \
2149 "punpcklbw %%xmm1,%%xmm3 \n" \
2150 "punpcklwd %%xmm3,%%xmm3 \n" \
2151 "movq (%[y_buf]),%%xmm4 \n" \
2152 "punpcklbw %%xmm4,%%xmm4 \n" \
2153 "lea 0x8(%[y_buf]),%[y_buf] \n"
2154
2155 // Read 4 UV from 422 10 bit, upsample to 8 UV
2156 // TODO(fbarchard): Consider shufb to replace pack/unpack
2157 // TODO(fbarchard): Consider pmulhuw to replace psraw
2158 // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
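// 10 bit layout: UV words are shifted right by 2 to give 8 bit chroma for
// pmaddubsw, while Y words are shifted left by 6 so the 10 bit value fills
// the top of a 16 bit lane for the pmulhuw Y gain multiply in YUVTORGB.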
2159 #define READYUV210 \
2160 "movq (%[u_buf]),%%xmm3 \n" \
2161 "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2162 "lea 0x8(%[u_buf]),%[u_buf] \n" \
2163 "punpcklwd %%xmm1,%%xmm3 \n" \
2164 "psraw $2,%%xmm3 \n" \
2165 "packuswb %%xmm3,%%xmm3 \n" \
2166 "punpcklwd %%xmm3,%%xmm3 \n" \
2167 "movdqu (%[y_buf]),%%xmm4 \n" \
2168 "psllw $6,%%xmm4 \n" \
2169 "lea 0x10(%[y_buf]),%[y_buf] \n"
2170
2171 #define READYUVA210 \
2172 "movq (%[u_buf]),%%xmm3 \n" \
2173 "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2174 "lea 0x8(%[u_buf]),%[u_buf] \n" \
2175 "punpcklwd %%xmm1,%%xmm3 \n" \
2176 "psraw $2,%%xmm3 \n" \
2177 "packuswb %%xmm3,%%xmm3 \n" \
2178 "punpcklwd %%xmm3,%%xmm3 \n" \
2179 "movdqu (%[y_buf]),%%xmm4 \n" \
2180 "psllw $6,%%xmm4 \n" \
2181 "lea 0x10(%[y_buf]),%[y_buf] \n" \
2182 "movdqu (%[a_buf]),%%xmm5 \n" \
2183 "psraw $2,%%xmm5 \n" \
2184 "packuswb %%xmm5,%%xmm5 \n" \
2185 "lea 0x10(%[a_buf]),%[a_buf] \n"
2186
2187 // Read 8 UV from 444 10 bit
2188 #define READYUV410 \
2189 "movdqu (%[u_buf]),%%xmm3 \n" \
2190 "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
2191 "lea 0x10(%[u_buf]),%[u_buf] \n" \
2192 "psraw $2,%%xmm3 \n" \
2193 "psraw $2,%%xmm2 \n" \
2194 "movdqa %%xmm3,%%xmm1 \n" \
2195 "punpcklwd %%xmm2,%%xmm3 \n" \
2196 "punpckhwd %%xmm2,%%xmm1 \n" \
2197 "packuswb %%xmm1,%%xmm3 \n" \
2198 "movdqu (%[y_buf]),%%xmm4 \n" \
2199 "psllw $6,%%xmm4 \n" \
2200 "lea 0x10(%[y_buf]),%[y_buf] \n"
2201
2202 // Read 8 UV from 444 10 bit. With 8 Alpha.
2203 #define READYUVA410 \
2204 "movdqu (%[u_buf]),%%xmm3 \n" \
2205 "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
2206 "lea 0x10(%[u_buf]),%[u_buf] \n" \
2207 "psraw $2,%%xmm3 \n" \
2208 "psraw $2,%%xmm2 \n" \
2209 "movdqa %%xmm3,%%xmm1 \n" \
2210 "punpcklwd %%xmm2,%%xmm3 \n" \
2211 "punpckhwd %%xmm2,%%xmm1 \n" \
2212 "packuswb %%xmm1,%%xmm3 \n" \
2213 "movdqu (%[y_buf]),%%xmm4 \n" \
2214 "psllw $0x6,%%xmm4 \n" \
2215 "lea 0x10(%[y_buf]),%[y_buf] \n" \
2216 "movdqu (%[a_buf]),%%xmm5 \n" \
2217 "psraw $2,%%xmm5 \n" \
2218 "packuswb %%xmm5,%%xmm5 \n" \
2219 "lea 0x10(%[a_buf]),%[a_buf] \n"
2220
2221 // Read 4 UV from 422 12 bit, upsample to 8 UV
2222 #define READYUV212 \
2223 "movq (%[u_buf]),%%xmm3 \n" \
2224 "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2225 "lea 0x8(%[u_buf]),%[u_buf] \n" \
2226 "punpcklwd %%xmm1,%%xmm3 \n" \
2227 "psraw $0x4,%%xmm3 \n" \
2228 "packuswb %%xmm3,%%xmm3 \n" \
2229 "punpcklwd %%xmm3,%%xmm3 \n" \
2230 "movdqu (%[y_buf]),%%xmm4 \n" \
2231 "psllw $0x4,%%xmm4 \n" \
2232 "lea 0x10(%[y_buf]),%[y_buf] \n"
2233
2234 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
2235 #define READYUVA422 \
2236 "movd (%[u_buf]),%%xmm3 \n" \
2237 "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2238 "lea 0x4(%[u_buf]),%[u_buf] \n" \
2239 "punpcklbw %%xmm1,%%xmm3 \n" \
2240 "punpcklwd %%xmm3,%%xmm3 \n" \
2241 "movq (%[y_buf]),%%xmm4 \n" \
2242 "punpcklbw %%xmm4,%%xmm4 \n" \
2243 "lea 0x8(%[y_buf]),%[y_buf] \n" \
2244 "movq (%[a_buf]),%%xmm5 \n" \
2245 "lea 0x8(%[a_buf]),%[a_buf] \n"
2246
2247 // Read 8 UV from 444. With 8 Alpha.
2248 #define READYUVA444 \
2249 "movq (%[u_buf]),%%xmm3 \n" \
2250 "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2251 "lea 0x8(%[u_buf]),%[u_buf] \n" \
2252 "punpcklbw %%xmm1,%%xmm3 \n" \
2253 "movq (%[y_buf]),%%xmm4 \n" \
2254 "punpcklbw %%xmm4,%%xmm4 \n" \
2255 "lea 0x8(%[y_buf]),%[y_buf] \n" \
2256 "movq (%[a_buf]),%%xmm5 \n" \
2257 "lea 0x8(%[a_buf]),%[a_buf] \n"
2258
2259 // Read 4 UV from NV12, upsample to 8 UV
2260 #define READNV12 \
2261 "movq (%[uv_buf]),%%xmm3 \n" \
2262 "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
2263 "punpcklwd %%xmm3,%%xmm3 \n" \
2264 "movq (%[y_buf]),%%xmm4 \n" \
2265 "punpcklbw %%xmm4,%%xmm4 \n" \
2266 "lea 0x8(%[y_buf]),%[y_buf] \n"
2267
2268 // Read 4 VU from NV21, upsample to 8 UV
2269 #define READNV21 \
2270 "movq (%[vu_buf]),%%xmm3 \n" \
2271 "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
2272 "pshufb %[kShuffleNV21], %%xmm3 \n" \
2273 "movq (%[y_buf]),%%xmm4 \n" \
2274 "punpcklbw %%xmm4,%%xmm4 \n" \
2275 "lea 0x8(%[y_buf]),%[y_buf] \n"
2276
2277 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
2278 #define READYUY2 \
2279 "movdqu (%[yuy2_buf]),%%xmm4 \n" \
2280 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
2281 "movdqu (%[yuy2_buf]),%%xmm3 \n" \
2282 "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \
2283 "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
2284
2285 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
2286 #define READUYVY \
2287 "movdqu (%[uyvy_buf]),%%xmm4 \n" \
2288 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
2289 "movdqu (%[uyvy_buf]),%%xmm3 \n" \
2290 "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \
2291 "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
2292
2293 // Read 4 UV from P210, upsample to 8 UV
2294 #define READP210 \
2295 "movdqu (%[uv_buf]),%%xmm3 \n" \
2296 "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
2297 "psrlw $0x8,%%xmm3 \n" \
2298 "packuswb %%xmm3,%%xmm3 \n" \
2299 "punpcklwd %%xmm3,%%xmm3 \n" \
2300 "movdqu (%[y_buf]),%%xmm4 \n" \
2301 "lea 0x10(%[y_buf]),%[y_buf] \n"
2302
2303 // Read 8 UV from P410
2304 #define READP410 \
2305 "movdqu (%[uv_buf]),%%xmm3 \n" \
2306 "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \
2307 "lea 0x20(%[uv_buf]),%[uv_buf] \n" \
2308 "psrlw $0x8,%%xmm3 \n" \
2309 "psrlw $0x8,%%xmm1 \n" \
2310 "packuswb %%xmm1,%%xmm3 \n" \
2311 "movdqu (%[y_buf]),%%xmm4 \n" \
2312 "lea 0x10(%[y_buf]),%[y_buf] \n"
2313
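// On x86_64 the conversion constants are preloaded once into xmm8-xmm13 by
// YUVTORGB_SETUP. 32 bit builds only have xmm0-xmm7, so YUVTORGB16 rebuilds
// the 0x80 UV bias and reloads the constants from memory every 8 pixels.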
2314 #if defined(__x86_64__)
2315 #define YUVTORGB_SETUP(yuvconstants) \
2316 "pcmpeqb %%xmm13,%%xmm13 \n" \
2317 "movdqa (%[yuvconstants]),%%xmm8 \n" \
2318 "pxor %%xmm12,%%xmm12 \n" \
2319 "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
2320 "psllw $7,%%xmm13 \n" \
2321 "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
2322 "pshufb %%xmm12,%%xmm13 \n" \
2323 "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
2324 "movdqa 128(%[yuvconstants]),%%xmm12 \n"
2325
2326 // Convert 8 pixels: 8 UV and 8 Y
2327 #define YUVTORGB16(yuvconstants) \
2328 "psubb %%xmm13,%%xmm3 \n" \
2329 "pmulhuw %%xmm11,%%xmm4 \n" \
2330 "movdqa %%xmm8,%%xmm0 \n" \
2331 "movdqa %%xmm9,%%xmm1 \n" \
2332 "movdqa %%xmm10,%%xmm2 \n" \
2333 "paddw %%xmm12,%%xmm4 \n" \
2334 "pmaddubsw %%xmm3,%%xmm0 \n" \
2335 "pmaddubsw %%xmm3,%%xmm1 \n" \
2336 "pmaddubsw %%xmm3,%%xmm2 \n" \
2337 "paddsw %%xmm4,%%xmm0 \n" \
2338 "paddsw %%xmm4,%%xmm2 \n" \
2339 "psubsw %%xmm1,%%xmm4 \n" \
2340 "movdqa %%xmm4,%%xmm1 \n"
2341
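// YUVTORGB16 leaves xmm0 = B, xmm1 = G, xmm2 = R as signed 16 bit values
// with 6 fractional bits: Y is scaled by the gain at byte offset 96 and
// biased by the constant at offset 128 of yuvconstants; the UV terms from
// the tables at offsets 0/32/64 are added for B and R, subtracted for G.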
2342 #define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
2343
2344 #else
2345 #define YUVTORGB_SETUP(yuvconstants)
2346 // Convert 8 pixels: 8 UV and 8 Y
2347 #define YUVTORGB16(yuvconstants) \
2348 "pcmpeqb %%xmm0,%%xmm0 \n" \
2349 "pxor %%xmm1,%%xmm1 \n" \
2350 "psllw $7,%%xmm0 \n" \
2351 "pshufb %%xmm1,%%xmm0 \n" \
2352 "psubb %%xmm0,%%xmm3 \n" \
2353 "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \
2354 "movdqa (%[yuvconstants]),%%xmm0 \n" \
2355 "movdqa 32(%[yuvconstants]),%%xmm1 \n" \
2356 "movdqa 64(%[yuvconstants]),%%xmm2 \n" \
2357 "pmaddubsw %%xmm3,%%xmm0 \n" \
2358 "pmaddubsw %%xmm3,%%xmm1 \n" \
2359 "pmaddubsw %%xmm3,%%xmm2 \n" \
2360 "movdqa 128(%[yuvconstants]),%%xmm3 \n" \
2361 "paddw %%xmm3,%%xmm4 \n" \
2362 "paddsw %%xmm4,%%xmm0 \n" \
2363 "paddsw %%xmm4,%%xmm2 \n" \
2364 "psubsw %%xmm1,%%xmm4 \n" \
2365 "movdqa %%xmm4,%%xmm1 \n"
2366
2367 #define YUVTORGB_REGS
2368 #endif
2369
2370 #define YUVTORGB(yuvconstants) \
2371 YUVTORGB16(yuvconstants) \
2372 "psraw $0x6,%%xmm0 \n" \
2373 "psraw $0x6,%%xmm1 \n" \
2374 "psraw $0x6,%%xmm2 \n" \
2375 "packuswb %%xmm0,%%xmm0 \n" \
2376 "packuswb %%xmm1,%%xmm1 \n" \
2377 "packuswb %%xmm2,%%xmm2 \n"
2378
2379 // Store 8 ARGB values.
2380 #define STOREARGB \
2381 "punpcklbw %%xmm1,%%xmm0 \n" \
2382 "punpcklbw %%xmm5,%%xmm2 \n" \
2383 "movdqa %%xmm0,%%xmm1 \n" \
2384 "punpcklwd %%xmm2,%%xmm0 \n" \
2385 "punpckhwd %%xmm2,%%xmm1 \n" \
2386 "movdqu %%xmm0,(%[dst_argb]) \n" \
2387 "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
2388 "lea 0x20(%[dst_argb]), %[dst_argb] \n"
2389
2390 // Store 8 RGBA values.
2391 #define STORERGBA \
2392 "pcmpeqb %%xmm5,%%xmm5 \n" \
2393 "punpcklbw %%xmm2,%%xmm1 \n" \
2394 "punpcklbw %%xmm0,%%xmm5 \n" \
2395 "movdqa %%xmm5,%%xmm0 \n" \
2396 "punpcklwd %%xmm1,%%xmm5 \n" \
2397 "punpckhwd %%xmm1,%%xmm0 \n" \
2398 "movdqu %%xmm5,(%[dst_rgba]) \n" \
2399 "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
2400 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
2401
2402 // Store 8 AR30 values.
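// Each output dword packs A<<30 | R<<20 | G<<10 | B: the 16 bit channels
// are shifted down to 10 bits, clamped to [0,1023] with pminsw/pmaxsw and
// merged with the 2 bit alpha constant prepared in xmm5.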
2403 #define STOREAR30 \
2404 "psraw $0x4,%%xmm0 \n" \
2405 "psraw $0x4,%%xmm1 \n" \
2406 "psraw $0x4,%%xmm2 \n" \
2407 "pminsw %%xmm7,%%xmm0 \n" \
2408 "pminsw %%xmm7,%%xmm1 \n" \
2409 "pminsw %%xmm7,%%xmm2 \n" \
2410 "pmaxsw %%xmm6,%%xmm0 \n" \
2411 "pmaxsw %%xmm6,%%xmm1 \n" \
2412 "pmaxsw %%xmm6,%%xmm2 \n" \
2413 "psllw $0x4,%%xmm2 \n" \
2414 "movdqa %%xmm0,%%xmm3 \n" \
2415 "punpcklwd %%xmm2,%%xmm0 \n" \
2416 "punpckhwd %%xmm2,%%xmm3 \n" \
2417 "movdqa %%xmm1,%%xmm2 \n" \
2418 "punpcklwd %%xmm5,%%xmm1 \n" \
2419 "punpckhwd %%xmm5,%%xmm2 \n" \
2420 "pslld $0xa,%%xmm1 \n" \
2421 "pslld $0xa,%%xmm2 \n" \
2422 "por %%xmm1,%%xmm0 \n" \
2423 "por %%xmm2,%%xmm3 \n" \
2424 "movdqu %%xmm0,(%[dst_ar30]) \n" \
2425 "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
2426 "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
2427
2428 void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
2429 const uint8_t* u_buf,
2430 const uint8_t* v_buf,
2431 uint8_t* dst_argb,
2432 const struct YuvConstants* yuvconstants,
2433 int width) {
2434 asm volatile (
2435 YUVTORGB_SETUP(yuvconstants)
2436 "sub %[u_buf],%[v_buf] \n"
2437 "pcmpeqb %%xmm5,%%xmm5 \n"
2438
2439 LABELALIGN
2440 "1: \n"
2441 READYUV444
2442 YUVTORGB(yuvconstants)
2443 STOREARGB
2444 "sub $0x8,%[width] \n"
2445 "jg 1b \n"
2446 : [y_buf]"+r"(y_buf), // %[y_buf]
2447 [u_buf]"+r"(u_buf), // %[u_buf]
2448 [v_buf]"+r"(v_buf), // %[v_buf]
2449 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2450 [width]"+rm"(width) // %[width]
2451 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2452 : "memory", "cc", YUVTORGB_REGS
2453 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2454 );
2455 }
2456
2457 #ifdef HAS_I444ALPHATOARGBROW_SSSE3
2458 void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
2459 const uint8_t* u_buf,
2460 const uint8_t* v_buf,
2461 const uint8_t* a_buf,
2462 uint8_t* dst_argb,
2463 const struct YuvConstants* yuvconstants,
2464 int width) {
2465 // clang-format off
2466 asm volatile (
2467 YUVTORGB_SETUP(yuvconstants)
2468 "sub %[u_buf],%[v_buf] \n"
2469
2470 LABELALIGN
2471 "1: \n"
2472 READYUVA444
2473 YUVTORGB(yuvconstants)
2474 STOREARGB
2475 "subl $0x8,%[width] \n"
2476 "jg 1b \n"
2477 : [y_buf]"+r"(y_buf), // %[y_buf]
2478 [u_buf]"+r"(u_buf), // %[u_buf]
2479 [v_buf]"+r"(v_buf), // %[v_buf]
2480 [a_buf]"+r"(a_buf), // %[a_buf]
2481 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2482 #if defined(__i386__)
2483 [width]"+m"(width) // %[width]
2484 #else
2485 [width]"+rm"(width) // %[width]
2486 #endif
2487 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2488 : "memory", "cc", YUVTORGB_REGS
2489 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2490 );
2491 // clang-format on
2492 }
2493 #endif // HAS_I444ALPHATOARGBROW_SSSE3
2494
2495 void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
2496 const uint8_t* u_buf,
2497 const uint8_t* v_buf,
2498 uint8_t* dst_rgb24,
2499 const struct YuvConstants* yuvconstants,
2500 int width) {
2501 asm volatile (
2502 YUVTORGB_SETUP(yuvconstants)
2503 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
2504 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
2505 "sub %[u_buf],%[v_buf] \n"
2506
2507 LABELALIGN
2508 "1: \n"
2509 READYUV422
2510 YUVTORGB(yuvconstants)
2511 "punpcklbw %%xmm1,%%xmm0 \n"
2512 "punpcklbw %%xmm2,%%xmm2 \n"
2513 "movdqa %%xmm0,%%xmm1 \n"
2514 "punpcklwd %%xmm2,%%xmm0 \n"
2515 "punpckhwd %%xmm2,%%xmm1 \n"
2516 "pshufb %%xmm5,%%xmm0 \n"
2517 "pshufb %%xmm6,%%xmm1 \n"
2518 "palignr $0xc,%%xmm0,%%xmm1 \n"
2519 "movq %%xmm0,(%[dst_rgb24]) \n"
2520 "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
2521 "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
2522 "subl $0x8,%[width] \n"
2523 "jg 1b \n"
2524 : [y_buf]"+r"(y_buf), // %[y_buf]
2525 [u_buf]"+r"(u_buf), // %[u_buf]
2526 [v_buf]"+r"(v_buf), // %[v_buf]
2527 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
2528 #if defined(__i386__)
2529 [width]"+m"(width) // %[width]
2530 #else
2531 [width]"+rm"(width) // %[width]
2532 #endif
2533 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2534 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
2535 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
2536 : "memory", "cc", YUVTORGB_REGS
2537 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2538 );
2539 }
2540
2541 void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
2542 const uint8_t* u_buf,
2543 const uint8_t* v_buf,
2544 uint8_t* dst_argb,
2545 const struct YuvConstants* yuvconstants,
2546 int width) {
2547 asm volatile (
2548 YUVTORGB_SETUP(yuvconstants)
2549 "sub %[u_buf],%[v_buf] \n"
2550 "pcmpeqb %%xmm5,%%xmm5 \n"
2551
2552 LABELALIGN
2553 "1: \n"
2554 READYUV422
2555 YUVTORGB(yuvconstants)
2556 STOREARGB
2557 "sub $0x8,%[width] \n"
2558 "jg 1b \n"
2559 : [y_buf]"+r"(y_buf), // %[y_buf]
2560 [u_buf]"+r"(u_buf), // %[u_buf]
2561 [v_buf]"+r"(v_buf), // %[v_buf]
2562 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2563 [width]"+rm"(width) // %[width]
2564 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2565 : "memory", "cc", YUVTORGB_REGS
2566 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2567 );
2568 }
2569
2570 void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
2571 const uint8_t* u_buf,
2572 const uint8_t* v_buf,
2573 uint8_t* dst_ar30,
2574 const struct YuvConstants* yuvconstants,
2575 int width) {
2576 asm volatile (
2577 YUVTORGB_SETUP(yuvconstants)
2578 "sub %[u_buf],%[v_buf] \n"
2579 "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
2580 "psrlw $14,%%xmm5 \n"
2581 "psllw $4,%%xmm5 \n" // 2 alpha bits
2582 "pxor %%xmm6,%%xmm6 \n" // 0 for min
2583 "pcmpeqb %%xmm7,%%xmm7 \n"
2584 "psrlw $6,%%xmm7 \n" // 1023 for max
2585
2586 LABELALIGN
2587 "1: \n"
2588 READYUV422
2589 YUVTORGB16(yuvconstants)
2590 STOREAR30
2591 "sub $0x8,%[width] \n"
2592 "jg 1b \n"
2593 : [y_buf]"+r"(y_buf), // %[y_buf]
2594 [u_buf]"+r"(u_buf), // %[u_buf]
2595 [v_buf]"+r"(v_buf), // %[v_buf]
2596 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
2597 [width]"+rm"(width) // %[width]
2598 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2599 : "memory", "cc", YUVTORGB_REGS
2600 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2601 );
2602 }
2603
2604 // 10 bit YUV to ARGB
2605 void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
2606 const uint16_t* u_buf,
2607 const uint16_t* v_buf,
2608 uint8_t* dst_argb,
2609 const struct YuvConstants* yuvconstants,
2610 int width) {
2611 asm volatile (
2612 YUVTORGB_SETUP(yuvconstants)
2613 "sub %[u_buf],%[v_buf] \n"
2614 "pcmpeqb %%xmm5,%%xmm5 \n"
2615
2616 LABELALIGN
2617 "1: \n"
2618 READYUV210
2619 YUVTORGB(yuvconstants)
2620 STOREARGB
2621 "sub $0x8,%[width] \n"
2622 "jg 1b \n"
2623 : [y_buf]"+r"(y_buf), // %[y_buf]
2624 [u_buf]"+r"(u_buf), // %[u_buf]
2625 [v_buf]"+r"(v_buf), // %[v_buf]
2626 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2627 [width]"+rm"(width) // %[width]
2628 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2629 : "memory", "cc", YUVTORGB_REGS
2630 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2631 );
2632 }
2633
2634 // 12 bit YUV to ARGB
2635 void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf,
2636 const uint16_t* u_buf,
2637 const uint16_t* v_buf,
2638 uint8_t* dst_argb,
2639 const struct YuvConstants* yuvconstants,
2640 int width) {
2641 asm volatile (
2642 YUVTORGB_SETUP(yuvconstants)
2643 "sub %[u_buf],%[v_buf] \n"
2644 "pcmpeqb %%xmm5,%%xmm5 \n"
2645
2646 LABELALIGN
2647 "1: \n"
2648 READYUV212
2649 YUVTORGB(yuvconstants)
2650 STOREARGB
2651 "sub $0x8,%[width] \n"
2652 "jg 1b \n"
2653 : [y_buf]"+r"(y_buf), // %[y_buf]
2654 [u_buf]"+r"(u_buf), // %[u_buf]
2655 [v_buf]"+r"(v_buf), // %[v_buf]
2656 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2657 [width]"+rm"(width) // %[width]
2658 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2659 : "memory", "cc", YUVTORGB_REGS
2660 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2661 );
2662 }
2663
2664 // 10 bit YUV to AR30
2665 void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
2666 const uint16_t* u_buf,
2667 const uint16_t* v_buf,
2668 uint8_t* dst_ar30,
2669 const struct YuvConstants* yuvconstants,
2670 int width) {
2671 asm volatile (
2672 YUVTORGB_SETUP(yuvconstants)
2673 "sub %[u_buf],%[v_buf] \n"
2674 "pcmpeqb %%xmm5,%%xmm5 \n"
2675 "psrlw $14,%%xmm5 \n"
2676 "psllw $4,%%xmm5 \n" // 2 alpha bits
2677 "pxor %%xmm6,%%xmm6 \n" // 0 for min
2678 "pcmpeqb %%xmm7,%%xmm7 \n"
2679 "psrlw $6,%%xmm7 \n" // 1023 for max
2680
2681 LABELALIGN
2682 "1: \n"
2683 READYUV210
2684 YUVTORGB16(yuvconstants)
2685 STOREAR30
2686 "sub $0x8,%[width] \n"
2687 "jg 1b \n"
2688 : [y_buf]"+r"(y_buf), // %[y_buf]
2689 [u_buf]"+r"(u_buf), // %[u_buf]
2690 [v_buf]"+r"(v_buf), // %[v_buf]
2691 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
2692 [width]"+rm"(width) // %[width]
2693 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2694 : "memory", "cc", YUVTORGB_REGS
2695 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2696 );
2697 }
2698
2699 // 12 bit YUV to AR30
2700 void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
2701 const uint16_t* u_buf,
2702 const uint16_t* v_buf,
2703 uint8_t* dst_ar30,
2704 const struct YuvConstants* yuvconstants,
2705 int width) {
2706 asm volatile (
2707 YUVTORGB_SETUP(yuvconstants)
2708 "sub %[u_buf],%[v_buf] \n"
2709 "pcmpeqb %%xmm5,%%xmm5 \n"
2710 "psrlw $14,%%xmm5 \n"
2711 "psllw $4,%%xmm5 \n" // 2 alpha bits
2712 "pxor %%xmm6,%%xmm6 \n" // 0 for min
2713 "pcmpeqb %%xmm7,%%xmm7 \n"
2714 "psrlw $6,%%xmm7 \n" // 1023 for max
2715
2716 LABELALIGN
2717 "1: \n"
2718 READYUV212
2719 YUVTORGB16(yuvconstants)
2720 STOREAR30
2721 "sub $0x8,%[width] \n"
2722 "jg 1b \n"
2723 : [y_buf]"+r"(y_buf), // %[y_buf]
2724 [u_buf]"+r"(u_buf), // %[u_buf]
2725 [v_buf]"+r"(v_buf), // %[v_buf]
2726 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
2727 [width]"+rm"(width) // %[width]
2728 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2729 : "memory", "cc", YUVTORGB_REGS
2730 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2731 );
2732 }
2733
2734 // 10 bit YUV to ARGB
2735 void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
2736 const uint16_t* u_buf,
2737 const uint16_t* v_buf,
2738 uint8_t* dst_argb,
2739 const struct YuvConstants* yuvconstants,
2740 int width) {
2741 asm volatile (
2742 YUVTORGB_SETUP(yuvconstants)
2743 "sub %[u_buf],%[v_buf] \n"
2744 "pcmpeqb %%xmm5,%%xmm5 \n"
2745
2746 LABELALIGN
2747 "1: \n"
2748 READYUV410
2749 YUVTORGB(yuvconstants)
2750 STOREARGB
2751 "sub $0x8,%[width] \n"
2752 "jg 1b \n"
2753 : [y_buf]"+r"(y_buf), // %[y_buf]
2754 [u_buf]"+r"(u_buf), // %[u_buf]
2755 [v_buf]"+r"(v_buf), // %[v_buf]
2756 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2757 [width]"+rm"(width) // %[width]
2758 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2759 : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2760 );
2761 }
2762
2763 #ifdef HAS_I210ALPHATOARGBROW_SSSE3
2764 // 10 bit YUVA to ARGB
2765 void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
2766 const uint16_t* u_buf,
2767 const uint16_t* v_buf,
2768 const uint16_t* a_buf,
2769 uint8_t* dst_argb,
2770 const struct YuvConstants* yuvconstants,
2771 int width) {
2772 asm volatile(
2773 YUVTORGB_SETUP(yuvconstants)
2774 "sub %[u_buf],%[v_buf] \n"
2775 
2776 LABELALIGN
2777 "1: \n"
2778 READYUVA210 YUVTORGB(yuvconstants) STOREARGB
2779 "subl $0x8,%[width] \n" "jg 1b \n"
2780 : [y_buf] "+r"(y_buf), // %[y_buf]
2781 [u_buf] "+r"(u_buf), // %[u_buf]
2782 [v_buf] "+r"(v_buf), // %[v_buf]
2783 [a_buf] "+r"(a_buf), // %[a_buf]
2784 [dst_argb] "+r"(dst_argb), // %[dst_argb]
2785 #if defined(__i386__)
2786 [width] "+m"(width) // %[width]
2787 #else
2788 [width] "+rm"(width) // %[width]
2789 #endif
2790 : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
2791 : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
2792 "xmm5");
2793 }
2794 #endif
2795
2796 #ifdef HAS_I410ALPHATOARGBROW_SSSE3
2797 // 10 bit YUVA to ARGB
2798 void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
2799 const uint16_t* u_buf,
2800 const uint16_t* v_buf,
2801 const uint16_t* a_buf,
2802 uint8_t* dst_argb,
2803 const struct YuvConstants* yuvconstants,
2804 int width) {
2805 // clang-format off
2806 asm volatile(
2807 YUVTORGB_SETUP(yuvconstants)
2808 "sub %[u_buf],%[v_buf] \n"
2809
2810 LABELALIGN
2811 "1: \n"
2812 READYUVA410
2813 YUVTORGB(yuvconstants)
2814 STOREARGB
2815 "subl $0x8,%[width] \n"
2816 "jg 1b \n"
2817 : [y_buf] "+r"(y_buf), // %[y_buf]
2818 [u_buf] "+r"(u_buf), // %[u_buf]
2819 [v_buf] "+r"(v_buf), // %[v_buf]
2820 [a_buf] "+r"(a_buf), // %[a_buf]
2821 [dst_argb] "+r"(dst_argb), // %[dst_argb]
2822 #if defined(__i386__)
2823 [width] "+m"(width) // %[width]
2824 #else
2825 [width] "+rm"(width) // %[width]
2826 #endif
2827 : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
2828 : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
2829 "xmm5");
2830 // clang-format on
2831 }
2832 #endif
2833
2834 // 10 bit YUV to AR30
2835 void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
2836 const uint16_t* u_buf,
2837 const uint16_t* v_buf,
2838 uint8_t* dst_ar30,
2839 const struct YuvConstants* yuvconstants,
2840 int width) {
2841 asm volatile (
2842 YUVTORGB_SETUP(yuvconstants)
2843 "sub %[u_buf],%[v_buf] \n"
2844 "pcmpeqb %%xmm5,%%xmm5 \n"
2845 "psrlw $14,%%xmm5 \n"
2846 "psllw $4,%%xmm5 \n" // 2 alpha bits
2847 "pxor %%xmm6,%%xmm6 \n" // 0 for min
2848 "pcmpeqb %%xmm7,%%xmm7 \n"
2849 "psrlw $6,%%xmm7 \n" // 1023 for max
2850
2851 LABELALIGN
2852 "1: \n"
2853 READYUV410
2854 YUVTORGB16(yuvconstants)
2855 STOREAR30
2856 "sub $0x8,%[width] \n"
2857 "jg 1b \n"
2858 : [y_buf]"+r"(y_buf), // %[y_buf]
2859 [u_buf]"+r"(u_buf), // %[u_buf]
2860 [v_buf]"+r"(v_buf), // %[v_buf]
2861 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
2862 [width]"+rm"(width) // %[width]
2863 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2864 : "memory", "cc", YUVTORGB_REGS
2865 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2866 );
2867 }
2868
2869 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
2870 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
2871 const uint8_t* u_buf,
2872 const uint8_t* v_buf,
2873 const uint8_t* a_buf,
2874 uint8_t* dst_argb,
2875 const struct YuvConstants* yuvconstants,
2876 int width) {
2877 // clang-format off
2878 asm volatile (
2879 YUVTORGB_SETUP(yuvconstants)
2880 "sub %[u_buf],%[v_buf] \n"
2881
2882 LABELALIGN
2883 "1: \n"
2884 READYUVA422
2885 YUVTORGB(yuvconstants)
2886 STOREARGB
2887 "subl $0x8,%[width] \n"
2888 "jg 1b \n"
2889 : [y_buf]"+r"(y_buf), // %[y_buf]
2890 [u_buf]"+r"(u_buf), // %[u_buf]
2891 [v_buf]"+r"(v_buf), // %[v_buf]
2892 [a_buf]"+r"(a_buf), // %[a_buf]
2893 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2894 #if defined(__i386__)
2895 [width]"+m"(width) // %[width]
2896 #else
2897 [width]"+rm"(width) // %[width]
2898 #endif
2899 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2900 : "memory", "cc", YUVTORGB_REGS
2901 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2902 );
2903 // clang-format on
2904 }
2905 #endif // HAS_I422ALPHATOARGBROW_SSSE3
2906
2907 void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
2908 const uint8_t* uv_buf,
2909 uint8_t* dst_argb,
2910 const struct YuvConstants* yuvconstants,
2911 int width) {
2912 // clang-format off
2913 asm volatile (
2914 YUVTORGB_SETUP(yuvconstants)
2915 "pcmpeqb %%xmm5,%%xmm5 \n"
2916
2917 LABELALIGN
2918 "1: \n"
2919 READNV12
2920 YUVTORGB(yuvconstants)
2921 STOREARGB
2922 "sub $0x8,%[width] \n"
2923 "jg 1b \n"
2924 : [y_buf]"+r"(y_buf), // %[y_buf]
2925 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2926 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2927 [width]"+rm"(width) // %[width]
2928 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2929 : "memory", "cc", YUVTORGB_REGS
2930 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2931 );
2932 // clang-format on
2933 }
2934
2935 void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
2936 const uint8_t* vu_buf,
2937 uint8_t* dst_argb,
2938 const struct YuvConstants* yuvconstants,
2939 int width) {
2940 // clang-format off
2941 asm volatile (
2942 YUVTORGB_SETUP(yuvconstants)
2943 "pcmpeqb %%xmm5,%%xmm5 \n"
2944
2945 LABELALIGN
2946 "1: \n"
2947 READNV21
2948 YUVTORGB(yuvconstants)
2949 STOREARGB
2950 "sub $0x8,%[width] \n"
2951 "jg 1b \n"
2952 : [y_buf]"+r"(y_buf), // %[y_buf]
2953 [vu_buf]"+r"(vu_buf), // %[vu_buf]
2954 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2955 [width]"+rm"(width) // %[width]
2956 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2957 [kShuffleNV21]"m"(kShuffleNV21)
2958 : "memory", "cc", YUVTORGB_REGS
2959 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2960 );
2961 // clang-format on
2962 }
2963
2964 void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
2965 uint8_t* dst_argb,
2966 const struct YuvConstants* yuvconstants,
2967 int width) {
2968 // clang-format off
2969 asm volatile (
2970 YUVTORGB_SETUP(yuvconstants)
2971 "pcmpeqb %%xmm5,%%xmm5 \n"
2972
2973 LABELALIGN
2974 "1: \n"
2975 READYUY2
2976 YUVTORGB(yuvconstants)
2977 STOREARGB
2978 "sub $0x8,%[width] \n"
2979 "jg 1b \n"
2980 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
2981 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2982 [width]"+rm"(width) // %[width]
2983 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2984 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2985 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2986 : "memory", "cc", YUVTORGB_REGS
2987 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2988 );
2989 // clang-format on
2990 }
2991
2992 void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
2993 uint8_t* dst_argb,
2994 const struct YuvConstants* yuvconstants,
2995 int width) {
2996 // clang-format off
2997 asm volatile (
2998 YUVTORGB_SETUP(yuvconstants)
2999 "pcmpeqb %%xmm5,%%xmm5 \n"
3000
3001 LABELALIGN
3002 "1: \n"
3003 READUYVY
3004 YUVTORGB(yuvconstants)
3005 STOREARGB
3006 "sub $0x8,%[width] \n"
3007 "jg 1b \n"
3008 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
3009 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3010 [width]"+rm"(width) // %[width]
3011 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
3012 [kShuffleUYVYY]"m"(kShuffleUYVYY),
3013 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
3014 : "memory", "cc", YUVTORGB_REGS
3015 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3016 );
3017 // clang-format on
3018 }
3019
3020 void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
3021 const uint16_t* uv_buf,
3022 uint8_t* dst_argb,
3023 const struct YuvConstants* yuvconstants,
3024 int width) {
3025 asm volatile(
3026 YUVTORGB_SETUP(yuvconstants)
3027 "pcmpeqb %%xmm5,%%xmm5 \n"
3028 
3029 LABELALIGN
3030 "1: \n"
3031 READP210 YUVTORGB(yuvconstants) STOREARGB
3032 "sub $0x8,%[width] \n" "jg 1b \n"
3033 : [y_buf] "+r"(y_buf), // %[y_buf]
3034 [uv_buf] "+r"(uv_buf), // %[uv_buf]
3035 [dst_argb] "+r"(dst_argb), // %[dst_argb]
3036 [width] "+rm"(width) // %[width]
3037 : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
3038 : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
3039 "xmm5");
3040 }
3041
3042 void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
3043 const uint16_t* uv_buf,
3044 uint8_t* dst_argb,
3045 const struct YuvConstants* yuvconstants,
3046 int width) {
3047 asm volatile(
3048 YUVTORGB_SETUP(yuvconstants)
3049 "pcmpeqb %%xmm5,%%xmm5 \n"
3050 
3051 LABELALIGN
3052 "1: \n"
3053 READP410 YUVTORGB(yuvconstants) STOREARGB
3054 "sub $0x8,%[width] \n" "jg 1b \n"
3055 : [y_buf] "+r"(y_buf), // %[y_buf]
3056 [uv_buf] "+r"(uv_buf), // %[uv_buf]
3057 [dst_argb] "+r"(dst_argb), // %[dst_argb]
3058 [width] "+rm"(width) // %[width]
3059 : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
3060 : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
3061 "xmm5");
3062 }
3063
3064 void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
3065 const uint16_t* uv_buf,
3066 uint8_t* dst_ar30,
3067 const struct YuvConstants* yuvconstants,
3068 int width) {
3069 asm volatile (
3070 YUVTORGB_SETUP(yuvconstants)
3071 "pcmpeqb %%xmm5,%%xmm5 \n"
3072 "psrlw $14,%%xmm5 \n"
3073 "psllw $4,%%xmm5 \n" // 2 alpha bits
3074 "pxor %%xmm6,%%xmm6 \n" // 0 for min
3075 "pcmpeqb %%xmm7,%%xmm7 \n"
3076 "psrlw $6,%%xmm7 \n" // 1023 for max
3077
3078 LABELALIGN
3079 "1: \n"
3080 READP210
3081 YUVTORGB16(yuvconstants)
3082 STOREAR30
3083 "sub $0x8,%[width] \n"
3084 "jg 1b \n"
3085 : [y_buf]"+r"(y_buf), // %[y_buf]
3086 [uv_buf]"+r"(uv_buf), // %[uv_buf]
3087 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
3088 [width]"+rm"(width) // %[width]
3089 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3090 : "memory", "cc", YUVTORGB_REGS
3091 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3092 );
3093 }
3094
3095 void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
3096 const uint16_t* uv_buf,
3097 uint8_t* dst_ar30,
3098 const struct YuvConstants* yuvconstants,
3099 int width) {
3100 asm volatile (
3101 YUVTORGB_SETUP(yuvconstants)
3102 "pcmpeqb %%xmm5,%%xmm5 \n"
3103 "psrlw $14,%%xmm5 \n"
3104 "psllw $4,%%xmm5 \n" // 2 alpha bits
3105 "pxor %%xmm6,%%xmm6 \n" // 0 for min
3106 "pcmpeqb %%xmm7,%%xmm7 \n"
3107 "psrlw $6,%%xmm7 \n" // 1023 for max
3108
3109 LABELALIGN
3110 "1: \n"
3111 READP410
3112 YUVTORGB16(yuvconstants)
3113 STOREAR30
3114 "sub $0x8,%[width] \n"
3115 "jg 1b \n"
3116 : [y_buf]"+r"(y_buf), // %[y_buf]
3117 [uv_buf]"+r"(uv_buf), // %[uv_buf]
3118 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
3119 [width]"+rm"(width) // %[width]
3120 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3121 : "memory", "cc", YUVTORGB_REGS
3122 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3123 );
3124 }
3125
3126 void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
3127 const uint8_t* u_buf,
3128 const uint8_t* v_buf,
3129 uint8_t* dst_rgba,
3130 const struct YuvConstants* yuvconstants,
3131 int width) {
3132 asm volatile (
3133 YUVTORGB_SETUP(yuvconstants)
3134 "sub %[u_buf],%[v_buf] \n"
3135 "pcmpeqb %%xmm5,%%xmm5 \n"
3136
3137 LABELALIGN
3138 "1: \n"
3139 READYUV422
3140 YUVTORGB(yuvconstants)
3141 STORERGBA
3142 "sub $0x8,%[width] \n"
3143 "jg 1b \n"
3144 : [y_buf]"+r"(y_buf), // %[y_buf]
3145 [u_buf]"+r"(u_buf), // %[u_buf]
3146 [v_buf]"+r"(v_buf), // %[v_buf]
3147 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
3148 [width]"+rm"(width) // %[width]
3149 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3150 : "memory", "cc", YUVTORGB_REGS
3151 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3152 );
3153 }
3154
3155 #endif // HAS_I422TOARGBROW_SSSE3
3156
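// AVX2 variants of the READ* macros use the same register convention as
// the SSE versions (ymm3 = UV, ymm4 = Y, ymm5 = alpha) but process 16
// pixels per loop iteration.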
3157 // Read 16 UV from 444
3158 #define READYUV444_AVX2 \
3159 "vmovdqu (%[u_buf]),%%xmm3 \n" \
3160 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
3161 "lea 0x10(%[u_buf]),%[u_buf] \n" \
3162 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3163 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
3164 "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
3165 "vmovdqu (%[y_buf]),%%xmm4 \n" \
3166 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
3167 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
3168 "lea 0x10(%[y_buf]),%[y_buf] \n"
3169
3170 // Read 8 UV from 422, upsample to 16 UV.
3171 #define READYUV422_AVX2 \
3172 "vmovq (%[u_buf]),%%xmm3 \n" \
3173 "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
3174 "lea 0x8(%[u_buf]),%[u_buf] \n" \
3175 "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
3176 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3177 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
3178 "vmovdqu (%[y_buf]),%%xmm4 \n" \
3179 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
3180 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
3181 "lea 0x10(%[y_buf]),%[y_buf] \n"
3182
3183 #define READYUV422_AVX512BW \
3184 "vmovdqu (%[u_buf]),%%xmm3 \n" \
3185 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
3186 "vpermq %%zmm3,%%zmm16,%%zmm3 \n" \
3187 "vpermq %%zmm1,%%zmm16,%%zmm1 \n" \
3188 "lea 0x10(%[u_buf]),%[u_buf] \n" \
3189 "vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \
3190 "vpermq $0xd8,%%zmm3,%%zmm3 \n" \
3191 "vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \
3192 "vmovdqu8 (%[y_buf]),%%ymm4 \n" \
3193 "vpermq %%zmm4,%%zmm17,%%zmm4 \n" \
3194 "vpermq $0xd8,%%zmm4,%%zmm4 \n" \
3195 "vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \
3196 "lea 0x20(%[y_buf]),%[y_buf] \n"
3197
3198 // Read 8 UV from 210, upsample to 16 UV
3199 // TODO(fbarchard): Consider vshufb to replace pack/unpack
3200 // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
3201 #define READYUV210_AVX2 \
3202 "vmovdqu (%[u_buf]),%%xmm3 \n" \
3203 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
3204 "lea 0x10(%[u_buf]),%[u_buf] \n" \
3205 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3206 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
3207 "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
3208 "vpsraw $2,%%ymm3,%%ymm3 \n" \
3209 "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
3210 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
3211 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3212 "vpsllw $6,%%ymm4,%%ymm4 \n" \
3213 "lea 0x20(%[y_buf]),%[y_buf] \n"
3214
3215 // Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
3216 #define READYUVA210_AVX2 \
3217 "vmovdqu (%[u_buf]),%%xmm3 \n" \
3218 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
3219 "lea 0x10(%[u_buf]),%[u_buf] \n" \
3220 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3221 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
3222 "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
3223 "vpsraw $2,%%ymm3,%%ymm3 \n" \
3224 "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
3225 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
3226 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3227 "vpsllw $6,%%ymm4,%%ymm4 \n" \
3228 "lea 0x20(%[y_buf]),%[y_buf] \n" \
3229 "vmovdqu (%[a_buf]),%%ymm5 \n" \
3230 "vpsraw $2,%%ymm5,%%ymm5 \n" \
3231 "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
3232 "lea 0x20(%[a_buf]),%[a_buf] \n"
3233
3234 // Read 16 UV from 410
3235 #define READYUV410_AVX2 \
3236 "vmovdqu (%[u_buf]),%%ymm3 \n" \
3237 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
3238 "lea 0x20(%[u_buf]),%[u_buf] \n" \
3239 "vpsraw $2,%%ymm3,%%ymm3 \n" \
3240 "vpsraw $2,%%ymm2,%%ymm2 \n" \
3241 "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
3242 "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
3243 "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
3244 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3245 "vpsllw $6,%%ymm4,%%ymm4 \n" \
3246 "lea 0x20(%[y_buf]),%[y_buf] \n"
3247
3248 // Read 8 UV from 212 12 bit, upsample to 16 UV
3249 #define READYUV212_AVX2 \
3250 "vmovdqu (%[u_buf]),%%xmm3 \n" \
3251 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
3252 "lea 0x10(%[u_buf]),%[u_buf] \n" \
3253 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3254 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
3255 "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
3256 "vpsraw $0x4,%%ymm3,%%ymm3 \n" \
3257 "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
3258 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
3259 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3260 "vpsllw $0x4,%%ymm4,%%ymm4 \n" \
3261 "lea 0x20(%[y_buf]),%[y_buf] \n"
3262
3263 // Read 16 UV from 410. With 16 Alpha.
3264 #define READYUVA410_AVX2 \
3265 "vmovdqu (%[u_buf]),%%ymm3 \n" \
3266 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
3267 "lea 0x20(%[u_buf]),%[u_buf] \n" \
3268 "vpsraw $2,%%ymm3,%%ymm3 \n" \
3269 "vpsraw $2,%%ymm2,%%ymm2 \n" \
3270 "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
3271 "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
3272 "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
3273 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3274 "vpsllw $6,%%ymm4,%%ymm4 \n" \
3275 "lea 0x20(%[y_buf]),%[y_buf] \n" \
3276 "vmovdqu (%[a_buf]),%%ymm5 \n" \
3277 "vpsraw $2,%%ymm5,%%ymm5 \n" \
3278 "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
3279 "lea 0x20(%[a_buf]),%[a_buf] \n"
3280
3281 // Read 16 UV from 444. With 16 Alpha.
3282 #define READYUVA444_AVX2 \
3283 "vmovdqu (%[u_buf]),%%xmm3 \n" \
3284 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
3285 "lea 0x10(%[u_buf]),%[u_buf] \n" \
3286 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3287 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
3288 "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
3289 "vmovdqu (%[y_buf]),%%xmm4 \n" \
3290 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
3291 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
3292 "lea 0x10(%[y_buf]),%[y_buf] \n" \
3293 "vmovdqu (%[a_buf]),%%xmm5 \n" \
3294 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
3295 "lea 0x10(%[a_buf]),%[a_buf] \n"
3296
3297 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
3298 #define READYUVA422_AVX2 \
3299 "vmovq (%[u_buf]),%%xmm3 \n" \
3300 "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
3301 "lea 0x8(%[u_buf]),%[u_buf] \n" \
3302 "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
3303 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3304 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
3305 "vmovdqu (%[y_buf]),%%xmm4 \n" \
3306 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
3307 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
3308 "lea 0x10(%[y_buf]),%[y_buf] \n" \
3309 "vmovdqu (%[a_buf]),%%xmm5 \n" \
3310 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
3311 "lea 0x10(%[a_buf]),%[a_buf] \n"
3312
3313 // Read 8 UV from NV12, upsample to 16 UV.
3314 #define READNV12_AVX2 \
3315 "vmovdqu (%[uv_buf]),%%xmm3 \n" \
3316 "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
3317 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3318 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
3319 "vmovdqu (%[y_buf]),%%xmm4 \n" \
3320 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
3321 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
3322 "lea 0x10(%[y_buf]),%[y_buf] \n"
3323
3324 // Read 8 VU from NV21, upsample to 16 UV.
3325 #define READNV21_AVX2 \
3326 "vmovdqu (%[vu_buf]),%%xmm3 \n" \
3327 "lea 0x10(%[vu_buf]),%[vu_buf] \n" \
3328 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3329 "vpshufb %[kShuffleNV21], %%ymm3, %%ymm3 \n" \
3330 "vmovdqu (%[y_buf]),%%xmm4 \n" \
3331 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
3332 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
3333 "lea 0x10(%[y_buf]),%[y_buf] \n"
3334
3335 // Read 8 UV from P210, upsample to 16 UV
3336 #define READP210_AVX2 \
3337 "vmovdqu (%[uv_buf]),%%ymm3 \n" \
3338 "lea 0x20(%[uv_buf]),%[uv_buf] \n" \
3339 "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \
3340 "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
3341 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
3342 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3343 "lea 0x20(%[y_buf]),%[y_buf] \n"
3344
3345 // Read 16 UV from P410
3346 #define READP410_AVX2 \
3347 "vmovdqu (%[uv_buf]),%%ymm3 \n" \
3348 "vmovdqu 0x20(%[uv_buf]),%%ymm1 \n" \
3349 "lea 0x40(%[uv_buf]),%[uv_buf] \n" \
3350 "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \
3351 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" \
3352 "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
3353 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3354 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3355 "lea 0x20(%[y_buf]),%[y_buf] \n"
3356
3357 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
3358 #define READYUY2_AVX2 \
3359 "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
3360 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
3361 "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \
3362 "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \
3363 "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
3364
3365 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
3366 #define READUYVY_AVX2 \
3367 "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
3368 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
3369 "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \
3370 "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \
3371 "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
3372
3373 // TODO(fbarchard): Remove broadcastb
3374 #if defined(__x86_64__)
3375 #define YUVTORGB_SETUP_AVX2(yuvconstants) \
3376 "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
3377 "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
3378 "vpsllw $7,%%xmm13,%%xmm13 \n" \
3379 "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
3380 "vpbroadcastb %%xmm13,%%ymm13 \n" \
3381 "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
3382 "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
3383 "vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
3384
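// The AVX512BW setup broadcasts each 8 byte constant to a full zmm register
// and loads three permutation tables (the [quadsplitperm], [dquadsplitperm]
// and [unperm] operands) into zmm16-zmm18 for the vpermq based rearranging
// in READYUV422_AVX512BW and STOREARGB_AVX512BW.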
3385 #define YUVTORGB_SETUP_AVX512BW(yuvconstants) \
3386 "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
3387 "movdqa (%[yuvconstants]),%%xmm8 \n" \
3388 "vpbroadcastq %%xmm8, %%zmm8 \n" \
3389 "vpsllw $7,%%xmm13,%%xmm13 \n" \
3390 "vpbroadcastb %%xmm13,%%zmm13 \n" \
3391 "movq 32(%[yuvconstants]),%%xmm9 \n" \
3392 "vpbroadcastq %%xmm9,%%zmm9 \n" \
3393 "movq 64(%[yuvconstants]),%%xmm10 \n" \
3394 "vpbroadcastq %%xmm10,%%zmm10 \n" \
3395 "movq 96(%[yuvconstants]),%%xmm11 \n" \
3396 "vpbroadcastq %%xmm11,%%zmm11 \n" \
3397 "movq 128(%[yuvconstants]),%%xmm12 \n" \
3398 "vpbroadcastq %%xmm12,%%zmm12 \n" \
3399 "vmovdqu8 (%[quadsplitperm]),%%zmm16 \n" \
3400 "vmovdqu8 (%[dquadsplitperm]),%%zmm17 \n" \
3401 "vmovdqu8 (%[unperm]),%%zmm18 \n"
3402
3403 #define YUVTORGB16_AVX2(yuvconstants) \
3404 "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
3405 "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
3406 "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \
3407 "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \
3408 "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \
3409 "vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \
3410 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
3411 "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
3412 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
3413
3414 #define YUVTORGB16_AVX512BW(yuvconstants) \
3415 "vpsubb %%zmm13,%%zmm3,%%zmm3 \n" \
3416 "vpmulhuw %%zmm11,%%zmm4,%%zmm4 \n" \
3417 "vpmaddubsw %%zmm3,%%zmm8,%%zmm0 \n" \
3418 "vpmaddubsw %%zmm3,%%zmm9,%%zmm1 \n" \
3419 "vpmaddubsw %%zmm3,%%zmm10,%%zmm2 \n" \
3420 "vpaddw %%zmm4,%%zmm12,%%zmm4 \n" \
3421 "vpaddsw %%zmm4,%%zmm0,%%zmm0 \n" \
3422 "vpsubsw %%zmm1,%%zmm4,%%zmm1 \n" \
3423 "vpaddsw %%zmm4,%%zmm2,%%zmm2 \n"
3424
3425 #define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
3426 #define YUVTORGB_REGS_AVX512BW \
3427 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm16", "xmm17", "xmm18",
3428
3429 #else  // 32-bit: convert 16 pixels (16 UV, 16 Y); constants are reloaded from memory each pass.
3430
3431 #define YUVTORGB_SETUP_AVX2(yuvconstants)
3432 #define YUVTORGB16_AVX2(yuvconstants) \
3433 "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \
3434 "vpsllw $7,%%xmm0,%%xmm0 \n" \
3435 "vpbroadcastb %%xmm0,%%ymm0 \n" \
3436 "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \
3437 "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
3438 "vmovdqa (%[yuvconstants]),%%ymm0 \n" \
3439 "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \
3440 "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \
3441 "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \
3442 "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \
3443 "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \
3444 "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \
3445 "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \
3446 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
3447 "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
3448 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
3449
3450 #define YUVTORGB_REGS_AVX2
3451 #endif
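// The fixed-point core above computes, per pixel in 16-bit lanes:
//   y16 = ((Y * 0x0101) * YG >> 16) + YGB   (vpunpcklbw, vpmulhuw, vpaddw)
//   B = y16 + UB*(U-128); G = y16 - UG*(U-128) - VG*(V-128);
//   R = y16 + VR*(V-128); each saturated, then shifted right 6 and packed.
// A hedged scalar sketch of the same arithmetic for matrices where VB and
// UR are zero (as in the BT.601 tables); the coefficient parameters stand
// in for the YuvConstants fields and are assumptions of this sketch.
#if defined(ENABLE_ROW_SKETCHES)
static uint8_t ClampShift6_Sketch(int v) {
  v >>= 6;  // Drop the 6 fraction bits, as vpsraw $0x6 does.
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvPixelToRgb_C_Sketch(uint8_t y, uint8_t u, uint8_t v,
                                   int ub, int ug, int vg, int vr,
                                   int yg, int ygb,
                                   uint8_t* b, uint8_t* g, uint8_t* r) {
  int y16 = ((y * 0x0101 * yg) >> 16) + ygb;
  *b = ClampShift6_Sketch(y16 + ub * (u - 128));
  *g = ClampShift6_Sketch(y16 - ug * (u - 128) - vg * (v - 128));
  *r = ClampShift6_Sketch(y16 + vr * (v - 128));
}
#endif  // ENABLE_ROW_SKETCHES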
3452
3453 #define YUVTORGB_AVX2(yuvconstants) \
3454 YUVTORGB16_AVX2(yuvconstants) \
3455 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
3456 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
3457 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
3458 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
3459 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
3460 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
3461
3462 #define YUVTORGB_AVX512BW(yuvconstants) \
3463 YUVTORGB16_AVX512BW(yuvconstants) \
3464 "vpsraw $0x6,%%zmm0,%%zmm0 \n" \
3465 "vpsraw $0x6,%%zmm1,%%zmm1 \n" \
3466 "vpsraw $0x6,%%zmm2,%%zmm2 \n" \
3467 "vpackuswb %%zmm0,%%zmm0,%%zmm0 \n" \
3468 "vpackuswb %%zmm1,%%zmm1,%%zmm1 \n" \
3469 "vpackuswb %%zmm2,%%zmm2,%%zmm2 \n"
3470
3471 // Store 16 ARGB values.
3472 #define STOREARGB_AVX2 \
3473 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
3474 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
3475 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
3476 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
3477 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
3478 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
3479 "vmovdqu %%ymm1,(%[dst_argb]) \n" \
3480 "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
3481 "lea 0x40(%[dst_argb]), %[dst_argb] \n"
3482
3483 // Store 32 ARGB values.
3484 #define STOREARGB_AVX512BW \
3485 "vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" \
3486 "vpermq %%zmm0,%%zmm18,%%zmm0 \n" \
3487 "vpunpcklbw %%zmm5,%%zmm2,%%zmm2 \n" \
3488 "vpermq %%zmm2,%%zmm18,%%zmm2 \n" \
3489 "vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \
3490 "vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \
3491 "vmovdqu8 %%zmm1,(%[dst_argb]) \n" \
3492 "vmovdqu8 %%zmm0,0x40(%[dst_argb]) \n" \
3493 "lea 0x80(%[dst_argb]), %[dst_argb] \n"
3494
3495 // Store 16 AR30 values.
3496 #define STOREAR30_AVX2 \
3497 "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
3498 "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
3499 "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
3500 "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
3501 "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
3502 "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
3503 "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
3504 "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
3505 "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
3506 "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
3507 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
3508 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
3509 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
3510 "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
3511 "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
3512 "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
3513 "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
3514 "vpslld $0xa,%%ymm1,%%ymm1 \n" \
3515 "vpslld $0xa,%%ymm2,%%ymm2 \n" \
3516 "vpor %%ymm1,%%ymm0,%%ymm0 \n" \
3517 "vpor %%ymm2,%%ymm3,%%ymm3 \n" \
3518 "vmovdqu %%ymm0,(%[dst_ar30]) \n" \
3519 "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
3520 "lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
3521
3522 #ifdef HAS_I444TOARGBROW_AVX2
3523 // 16 pixels
3524 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
3525 void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
3526 const uint8_t* u_buf,
3527 const uint8_t* v_buf,
3528 uint8_t* dst_argb,
3529 const struct YuvConstants* yuvconstants,
3530 int width) {
3531 asm volatile (
3532 YUVTORGB_SETUP_AVX2(yuvconstants)
3533 "sub %[u_buf],%[v_buf] \n"
3534 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3535
3536 LABELALIGN
3537 "1: \n"
3538 READYUV444_AVX2
3539 YUVTORGB_AVX2(yuvconstants)
3540 STOREARGB_AVX2
3541 "sub $0x10,%[width] \n"
3542 "jg 1b \n"
3543 "vzeroupper \n"
3544 : [y_buf]"+r"(y_buf), // %[y_buf]
3545 [u_buf]"+r"(u_buf), // %[u_buf]
3546 [v_buf]"+r"(v_buf), // %[v_buf]
3547 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3548 [width]"+rm"(width) // %[width]
3549 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3550 : "memory", "cc", YUVTORGB_REGS_AVX2
3551 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3552 );
3553 }
3554 #endif // HAS_I444TOARGBROW_AVX2
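// Row functions convert exactly one row; a caller steps each plane by its
// stride and dispatches per-CPU. A hedged usage sketch (the frame-level
// wrapper below is not a libyuv API; width is assumed to be a multiple of
// 16, matching the "sub $0x10" loop above):
#if defined(ENABLE_ROW_SKETCHES) && defined(HAS_I444TOARGBROW_AVX2)
static void I444ToARGBFrame_Sketch(const uint8_t* y, int y_stride,
                                   const uint8_t* u, int u_stride,
                                   const uint8_t* v, int v_stride,
                                   uint8_t* dst_argb, int dst_stride,
                                   const struct YuvConstants* yuvconstants,
                                   int width, int height) {
  int row;
  for (row = 0; row < height; ++row) {
    I444ToARGBRow_AVX2(y, u, v, dst_argb, yuvconstants, width);
    y += y_stride;
    u += u_stride;
    v += v_stride;
    dst_argb += dst_stride;
  }
}
#endif  // ENABLE_ROW_SKETCHES && HAS_I444TOARGBROW_AVX2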
3555
3556 #if defined(HAS_I422TOARGBROW_AVX2)
3557 // 16 pixels
3558 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
3559 void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
3560 const uint8_t* u_buf,
3561 const uint8_t* v_buf,
3562 uint8_t* dst_argb,
3563 const struct YuvConstants* yuvconstants,
3564 int width) {
3565 asm volatile (
3566 YUVTORGB_SETUP_AVX2(yuvconstants)
3567 "sub %[u_buf],%[v_buf] \n"
3568 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3569
3570 LABELALIGN
3571 "1: \n"
3572 READYUV422_AVX2
3573 YUVTORGB_AVX2(yuvconstants)
3574 STOREARGB_AVX2
3575 "sub $0x10,%[width] \n"
3576 "jg 1b \n"
3577
3578 "vzeroupper \n"
3579 : [y_buf]"+r"(y_buf), // %[y_buf]
3580 [u_buf]"+r"(u_buf), // %[u_buf]
3581 [v_buf]"+r"(v_buf), // %[v_buf]
3582 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3583 [width]"+rm"(width) // %[width]
3584 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3585 : "memory", "cc", YUVTORGB_REGS_AVX2
3586 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3587 );
3588 }
3589 #endif // HAS_I422TOARGBROW_AVX2
3590
3591 #if defined(HAS_I422TOARGBROW_AVX512BW)
3592 static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2};
3593 static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4};
3594 static const uint64_t kUnpermuteAVX512[8] = {0, 4, 1, 5, 2, 6, 3, 7};
3595
3596 // 32 pixels
3597 // 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 ARGB (128
3598 // bytes).
3599 void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
3600 const uint8_t* u_buf,
3601 const uint8_t* v_buf,
3602 uint8_t* dst_argb,
3603 const struct YuvConstants* yuvconstants,
3604 int width) {
3605 asm volatile (
3606 YUVTORGB_SETUP_AVX512BW(yuvconstants)
3607 "sub %[u_buf],%[v_buf] \n"
3608 "vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n"
3609 "vpbroadcastq %%xmm5,%%zmm5 \n"
3610
3611 LABELALIGN
3612 "1: \n"
3613 READYUV422_AVX512BW
3614 YUVTORGB_AVX512BW(yuvconstants)
3615 STOREARGB_AVX512BW
3616 "sub $0x20,%[width] \n"
3617 "jg 1b \n"
3618
3619 "vzeroupper \n"
3620 : [y_buf]"+r"(y_buf), // %[y_buf]
3621 [u_buf]"+r"(u_buf), // %[u_buf]
3622 [v_buf]"+r"(v_buf), // %[v_buf]
3623 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3624 [width]"+rm"(width) // %[width]
3625 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
3626 [quadsplitperm]"r"(kSplitQuadWords), // %[quadsplitperm]
3627 [dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm]
3628 [unperm]"r"(kUnpermuteAVX512) // %[unperm]
3629 : "memory", "cc", YUVTORGB_REGS_AVX512BW
3630 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3631 );
3632 }
3633 #endif // HAS_I422TOARGBROW_AVX512BW
3634
3635 #if defined(HAS_I422TOAR30ROW_AVX2)
3636 // 16 pixels
3637 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
3638 void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
3639 const uint8_t* u_buf,
3640 const uint8_t* v_buf,
3641 uint8_t* dst_ar30,
3642 const struct YuvConstants* yuvconstants,
3643 int width) {
3644 asm volatile (
3645 YUVTORGB_SETUP_AVX2(yuvconstants)
3646 "sub %[u_buf],%[v_buf] \n"
3647 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
3648 "vpsrlw $14,%%ymm5,%%ymm5 \n"
3649 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
3650 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
3651 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
3652 "vpsrlw $6,%%ymm7,%%ymm7 \n"
3653
3654 LABELALIGN
3655 "1: \n"
3656 READYUV422_AVX2
3657 YUVTORGB16_AVX2(yuvconstants)
3658 STOREAR30_AVX2
3659 "sub $0x10,%[width] \n"
3660 "jg 1b \n"
3661
3662 "vzeroupper \n"
3663 : [y_buf]"+r"(y_buf), // %[y_buf]
3664 [u_buf]"+r"(u_buf), // %[u_buf]
3665 [v_buf]"+r"(v_buf), // %[v_buf]
3666 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
3667 [width]"+rm"(width) // %[width]
3668 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3669 : "memory", "cc", YUVTORGB_REGS_AVX2
3670 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3671 );
3672 }
3673 #endif // HAS_I422TOAR30ROW_AVX2
3674
3675 #if defined(HAS_I210TOARGBROW_AVX2)
3676 // 16 pixels
3677 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
3678 void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
3679 const uint16_t* u_buf,
3680 const uint16_t* v_buf,
3681 uint8_t* dst_argb,
3682 const struct YuvConstants* yuvconstants,
3683 int width) {
3684 asm volatile (
3685 YUVTORGB_SETUP_AVX2(yuvconstants)
3686 "sub %[u_buf],%[v_buf] \n"
3687 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3688
3689 LABELALIGN
3690 "1: \n"
3691 READYUV210_AVX2
3692 YUVTORGB_AVX2(yuvconstants)
3693 STOREARGB_AVX2
3694 "sub $0x10,%[width] \n"
3695 "jg 1b \n"
3696
3697 "vzeroupper \n"
3698 : [y_buf]"+r"(y_buf), // %[y_buf]
3699 [u_buf]"+r"(u_buf), // %[u_buf]
3700 [v_buf]"+r"(v_buf), // %[v_buf]
3701 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3702 [width]"+rm"(width) // %[width]
3703 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3704 : "memory", "cc", YUVTORGB_REGS_AVX2
3705 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3706 );
3707 }
3708 #endif // HAS_I210TOARGBROW_AVX2
3709
3710 #if defined(HAS_I212TOARGBROW_AVX2)
3711 // 16 pixels
3712 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
3713 void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf,
3714 const uint16_t* u_buf,
3715 const uint16_t* v_buf,
3716 uint8_t* dst_argb,
3717 const struct YuvConstants* yuvconstants,
3718 int width) {
3719 asm volatile (
3720 YUVTORGB_SETUP_AVX2(yuvconstants)
3721 "sub %[u_buf],%[v_buf] \n"
3722 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3723
3724 LABELALIGN
3725 "1: \n"
3726 READYUV212_AVX2
3727 YUVTORGB_AVX2(yuvconstants)
3728 STOREARGB_AVX2
3729 "sub $0x10,%[width] \n"
3730 "jg 1b \n"
3731
3732 "vzeroupper \n"
3733 : [y_buf]"+r"(y_buf), // %[y_buf]
3734 [u_buf]"+r"(u_buf), // %[u_buf]
3735 [v_buf]"+r"(v_buf), // %[v_buf]
3736 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3737 [width]"+rm"(width) // %[width]
3738 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3739 : "memory", "cc", YUVTORGB_REGS_AVX2
3740 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3741 );
3742 }
3743 #endif // HAS_I212TOARGBROW_AVX2
3744
3745 #if defined(HAS_I210TOAR30ROW_AVX2)
3746 // 16 pixels
3747 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
3748 void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
3749 const uint16_t* u_buf,
3750 const uint16_t* v_buf,
3751 uint8_t* dst_ar30,
3752 const struct YuvConstants* yuvconstants,
3753 int width) {
3754 asm volatile (
3755 YUVTORGB_SETUP_AVX2(yuvconstants)
3756 "sub %[u_buf],%[v_buf] \n"
3757 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
3758 "vpsrlw $14,%%ymm5,%%ymm5 \n"
3759 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
3760 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
3761 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
3762 "vpsrlw $6,%%ymm7,%%ymm7 \n"
3763
3764 LABELALIGN
3765 "1: \n"
3766 READYUV210_AVX2
3767 YUVTORGB16_AVX2(yuvconstants)
3768 STOREAR30_AVX2
3769 "sub $0x10,%[width] \n"
3770 "jg 1b \n"
3771
3772 "vzeroupper \n"
3773 : [y_buf]"+r"(y_buf), // %[y_buf]
3774 [u_buf]"+r"(u_buf), // %[u_buf]
3775 [v_buf]"+r"(v_buf), // %[v_buf]
3776 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
3777 [width]"+rm"(width) // %[width]
3778 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3779 : "memory", "cc", YUVTORGB_REGS_AVX2
3780 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3781 );
3782 }
3783 #endif // HAS_I210TOAR30ROW_AVX2
3784
3785 #if defined(HAS_I212TOAR30ROW_AVX2)
3786 // 16 pixels
3787 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
3788 void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
3789 const uint16_t* u_buf,
3790 const uint16_t* v_buf,
3791 uint8_t* dst_ar30,
3792 const struct YuvConstants* yuvconstants,
3793 int width) {
3794 asm volatile (
3795 YUVTORGB_SETUP_AVX2(yuvconstants)
3796 "sub %[u_buf],%[v_buf] \n"
3797 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
3798 "vpsrlw $14,%%ymm5,%%ymm5 \n"
3799 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
3800 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
3801 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
3802 "vpsrlw $6,%%ymm7,%%ymm7 \n"
3803
3804 LABELALIGN
3805 "1: \n"
3806 READYUV212_AVX2
3807 YUVTORGB16_AVX2(yuvconstants)
3808 STOREAR30_AVX2
3809 "sub $0x10,%[width] \n"
3810 "jg 1b \n"
3811
3812 "vzeroupper \n"
3813 : [y_buf]"+r"(y_buf), // %[y_buf]
3814 [u_buf]"+r"(u_buf), // %[u_buf]
3815 [v_buf]"+r"(v_buf), // %[v_buf]
3816 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
3817 [width]"+rm"(width) // %[width]
3818 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3819 : "memory", "cc", YUVTORGB_REGS_AVX2
3820 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3821 );
3822 }
3823 #endif // HAS_I212TOAR30ROW_AVX2
3824
3825 #if defined(HAS_I410TOARGBROW_AVX2)
3826 // 16 pixels
3827 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
3828 void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
3829 const uint16_t* u_buf,
3830 const uint16_t* v_buf,
3831 uint8_t* dst_argb,
3832 const struct YuvConstants* yuvconstants,
3833 int width) {
3834 asm volatile (
3835 YUVTORGB_SETUP_AVX2(yuvconstants)
3836 "sub %[u_buf],%[v_buf] \n"
3837 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3838
3839 LABELALIGN
3840 "1: \n"
3841 READYUV410_AVX2
3842 YUVTORGB_AVX2(yuvconstants)
3843 STOREARGB_AVX2
3844 "sub $0x10,%[width] \n"
3845 "jg 1b \n"
3846 "vzeroupper \n"
3847
3848 : [y_buf]"+r"(y_buf), // %[y_buf]
3849 [u_buf]"+r"(u_buf), // %[u_buf]
3850 [v_buf]"+r"(v_buf), // %[v_buf]
3851 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3852 [width]"+rm"(width) // %[width]
3853 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3854 : "memory", "cc", YUVTORGB_REGS_AVX2
3855 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3856 );
3857 }
3858 #endif // HAS_I410TOARGBROW_AVX2
3859
3860 #if defined(HAS_I210ALPHATOARGBROW_AVX2)
3861 // 16 pixels
3862 // 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
3863 void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
3864 const uint16_t* u_buf,
3865 const uint16_t* v_buf,
3866 const uint16_t* a_buf,
3867 uint8_t* dst_argb,
3868 const struct YuvConstants* yuvconstants,
3869 int width) {
3870 asm volatile(
3871 YUVTORGB_SETUP_AVX2(yuvconstants)
3872 "sub %[u_buf],%[v_buf] \n"
3873
3874 LABELALIGN "1: \n" READYUVA210_AVX2
3875 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
3876 "subl $0x10,%[width] \n"
3877 "jg 1b \n"
3878 "vzeroupper \n"
3879
3880 : [y_buf] "+r"(y_buf), // %[y_buf]
3881 [u_buf] "+r"(u_buf), // %[u_buf]
3882 [v_buf] "+r"(v_buf), // %[v_buf]
3883 [a_buf] "+r"(a_buf), // %[a_buf]
3884 [dst_argb] "+r"(dst_argb), // %[dst_argb]
3885 #if defined(__i386__)
3886 [width] "+m"(width) // %[width]
3887 #else
3888 [width] "+rm"(width) // %[width]
3889 #endif
3890 : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
3891 : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
3892 "xmm4", "xmm5");
3893 }
3894 #endif // HAS_I210ALPHATOARGBROW_AVX2
3895
3896 #if defined(HAS_I410ALPHATOARGBROW_AVX2)
3897 // 16 pixels
3898 // 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
3899 void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
3900 const uint16_t* u_buf,
3901 const uint16_t* v_buf,
3902 const uint16_t* a_buf,
3903 uint8_t* dst_argb,
3904 const struct YuvConstants* yuvconstants,
3905 int width) {
3906 asm volatile(
3907 YUVTORGB_SETUP_AVX2(yuvconstants)
3908 "sub %[u_buf],%[v_buf] \n"
3909
3910 LABELALIGN "1: \n" READYUVA410_AVX2
3911 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
3912 "subl $0x10,%[width] \n"
3913 "jg 1b \n"
3914 "vzeroupper \n"
3915
3916 : [y_buf] "+r"(y_buf), // %[y_buf]
3917 [u_buf] "+r"(u_buf), // %[u_buf]
3918 [v_buf] "+r"(v_buf), // %[v_buf]
3919 [a_buf] "+r"(a_buf), // %[a_buf]
3920 [dst_argb] "+r"(dst_argb), // %[dst_argb]
3921 #if defined(__i386__)
3922 [width] "+m"(width) // %[width]
3923 #else
3924 [width] "+rm"(width) // %[width]
3925 #endif
3926 : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
3927 : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
3928 "xmm4", "xmm5");
3929 }
3930 #endif // HAS_I410ALPHATOARGBROW_AVX2
3931
3932 #if defined(HAS_I410TOAR30ROW_AVX2)
3933 // 16 pixels
3934 // 16 UV values with 16 Y producing 16 AR30 (64 bytes).
3935 void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
3936 const uint16_t* u_buf,
3937 const uint16_t* v_buf,
3938 uint8_t* dst_ar30,
3939 const struct YuvConstants* yuvconstants,
3940 int width) {
3941 asm volatile (
3942 YUVTORGB_SETUP_AVX2(yuvconstants)
3943 "sub %[u_buf],%[v_buf] \n"
3944 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
3945 "vpsrlw $14,%%ymm5,%%ymm5 \n"
3946 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
3947 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
3948 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
3949 "vpsrlw $6,%%ymm7,%%ymm7 \n"
3950
3951 LABELALIGN
3952 "1: \n"
3953 READYUV410_AVX2
3954 YUVTORGB16_AVX2(yuvconstants)
3955 STOREAR30_AVX2
3956 "sub $0x10,%[width] \n"
3957 "jg 1b \n"
3958
3959 "vzeroupper \n"
3960 : [y_buf]"+r"(y_buf), // %[y_buf]
3961 [u_buf]"+r"(u_buf), // %[u_buf]
3962 [v_buf]"+r"(v_buf), // %[v_buf]
3963 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
3964 [width]"+rm"(width) // %[width]
3965 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3966 : "memory", "cc", YUVTORGB_REGS_AVX2
3967 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3968 );
3969 }
3970 #endif // HAS_I410TOAR30ROW_AVX2
3971
3972 #if defined(HAS_I444ALPHATOARGBROW_AVX2)
3973 // 16 pixels
3974 // 16 UV values with 16 Y and 16 A producing 16 ARGB.
3975 void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
3976 const uint8_t* u_buf,
3977 const uint8_t* v_buf,
3978 const uint8_t* a_buf,
3979 uint8_t* dst_argb,
3980 const struct YuvConstants* yuvconstants,
3981 int width) {
3982 // clang-format off
3983 asm volatile (
3984 YUVTORGB_SETUP_AVX2(yuvconstants)
3985 "sub %[u_buf],%[v_buf] \n"
3986
3987 LABELALIGN
3988 "1: \n"
3989 READYUVA444_AVX2
3990 YUVTORGB_AVX2(yuvconstants)
3991 STOREARGB_AVX2
3992 "subl $0x10,%[width] \n"
3993 "jg 1b \n"
3994 "vzeroupper \n"
3995 : [y_buf]"+r"(y_buf), // %[y_buf]
3996 [u_buf]"+r"(u_buf), // %[u_buf]
3997 [v_buf]"+r"(v_buf), // %[v_buf]
3998 [a_buf]"+r"(a_buf), // %[a_buf]
3999 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4000 #if defined(__i386__)
4001 [width]"+m"(width) // %[width]
4002 #else
4003 [width]"+rm"(width) // %[width]
4004 #endif
4005 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4006 : "memory", "cc", YUVTORGB_REGS_AVX2
4007 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4008 );
4009 // clang-format on
4010 }
4011 #endif // HAS_I444ALPHATOARGBROW_AVX2
4012
4013 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
4014 // 16 pixels
4015 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
4016 void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
4017 const uint8_t* u_buf,
4018 const uint8_t* v_buf,
4019 const uint8_t* a_buf,
4020 uint8_t* dst_argb,
4021 const struct YuvConstants* yuvconstants,
4022 int width) {
4023 // clang-format off
4024 asm volatile (
4025 YUVTORGB_SETUP_AVX2(yuvconstants)
4026 "sub %[u_buf],%[v_buf] \n"
4027
4028 LABELALIGN
4029 "1: \n"
4030 READYUVA422_AVX2
4031 YUVTORGB_AVX2(yuvconstants)
4032 STOREARGB_AVX2
4033 "subl $0x10,%[width] \n"
4034 "jg 1b \n"
4035 "vzeroupper \n"
4036 : [y_buf]"+r"(y_buf), // %[y_buf]
4037 [u_buf]"+r"(u_buf), // %[u_buf]
4038 [v_buf]"+r"(v_buf), // %[v_buf]
4039 [a_buf]"+r"(a_buf), // %[a_buf]
4040 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4041 #if defined(__i386__)
4042 [width]"+m"(width) // %[width]
4043 #else
4044 [width]"+rm"(width) // %[width]
4045 #endif
4046 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4047 : "memory", "cc", YUVTORGB_REGS_AVX2
4048 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4049 );
4050 // clang-format on
4051 }
4052 #endif // HAS_I422ALPHATOARGBROW_AVX2
4053
4054 #if defined(HAS_I422TORGBAROW_AVX2)
4055 // 16 pixels
4056 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
4057 void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
4058 const uint8_t* u_buf,
4059 const uint8_t* v_buf,
4060 uint8_t* dst_argb,
4061 const struct YuvConstants* yuvconstants,
4062 int width) {
4063 asm volatile (
4064 YUVTORGB_SETUP_AVX2(yuvconstants)
4065 "sub %[u_buf],%[v_buf] \n"
4066 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4067
4068 LABELALIGN
4069 "1: \n"
4070 READYUV422_AVX2
4071 YUVTORGB_AVX2(yuvconstants)
4072
4073 // Step 3: Weave into RGBA
4074 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
4075 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
4076 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
4077 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
4078 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
4079 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
4080 "vmovdqu %%ymm0,(%[dst_argb]) \n"
4081 "vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
4082 "lea 0x40(%[dst_argb]),%[dst_argb] \n"
4083 "sub $0x10,%[width] \n"
4084 "jg 1b \n"
4085 "vzeroupper \n"
4086 : [y_buf]"+r"(y_buf), // %[y_buf]
4087 [u_buf]"+r"(u_buf), // %[u_buf]
4088 [v_buf]"+r"(v_buf), // %[v_buf]
4089 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4090 [width]"+rm"(width) // %[width]
4091 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4092 : "memory", "cc", YUVTORGB_REGS_AVX2
4093 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4094 );
4095 }
4096 #endif // HAS_I422TORGBAROW_AVX2
4097
4098 #if defined(HAS_NV12TOARGBROW_AVX2)
4099 // 16 pixels.
4100 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
4101 void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
4102 const uint8_t* uv_buf,
4103 uint8_t* dst_argb,
4104 const struct YuvConstants* yuvconstants,
4105 int width) {
4106 // clang-format off
4107 asm volatile (
4108 YUVTORGB_SETUP_AVX2(yuvconstants)
4109 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4110
4111 LABELALIGN
4112 "1: \n"
4113 READNV12_AVX2
4114 YUVTORGB_AVX2(yuvconstants)
4115 STOREARGB_AVX2
4116 "sub $0x10,%[width] \n"
4117 "jg 1b \n"
4118 "vzeroupper \n"
4119 : [y_buf]"+r"(y_buf), // %[y_buf]
4120 [uv_buf]"+r"(uv_buf), // %[uv_buf]
4121 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4122 [width]"+rm"(width) // %[width]
4123 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4124 : "memory", "cc", YUVTORGB_REGS_AVX2
4125 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4126 );
4127 // clang-format on
4128 }
4129 #endif // HAS_NV12TOARGBROW_AVX2
4130
4131 #if defined(HAS_NV21TOARGBROW_AVX2)
4132 // 16 pixels.
4133 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
4134 void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
4135 const uint8_t* vu_buf,
4136 uint8_t* dst_argb,
4137 const struct YuvConstants* yuvconstants,
4138 int width) {
4139 // clang-format off
4140 asm volatile (
4141 YUVTORGB_SETUP_AVX2(yuvconstants)
4142 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4143
4144 LABELALIGN
4145 "1: \n"
4146 READNV21_AVX2
4147 YUVTORGB_AVX2(yuvconstants)
4148 STOREARGB_AVX2
4149 "sub $0x10,%[width] \n"
4150 "jg 1b \n"
4151 "vzeroupper \n"
4152 : [y_buf]"+r"(y_buf), // %[y_buf]
4153 [vu_buf]"+r"(vu_buf), // %[vu_buf]
4154 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4155 [width]"+rm"(width) // %[width]
4156 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
4157 [kShuffleNV21]"m"(kShuffleNV21)
4158 : "memory", "cc", YUVTORGB_REGS_AVX2
4159 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4160 );
4161 // clang-format on
4162 }
4163 #endif // HAS_NV21TOARGBROW_AVX2
4164
4165 #if defined(HAS_YUY2TOARGBROW_AVX2)
4166 // 16 pixels.
4167 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
4168 void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
4169 uint8_t* dst_argb,
4170 const struct YuvConstants* yuvconstants,
4171 int width) {
4172 // clang-format off
4173 asm volatile (
4174 YUVTORGB_SETUP_AVX2(yuvconstants)
4175 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4176
4177 LABELALIGN
4178 "1: \n"
4179 READYUY2_AVX2
4180 YUVTORGB_AVX2(yuvconstants)
4181 STOREARGB_AVX2
4182 "sub $0x10,%[width] \n"
4183 "jg 1b \n"
4184 "vzeroupper \n"
4185 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
4186 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4187 [width]"+rm"(width) // %[width]
4188 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
4189 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
4190 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
4191 : "memory", "cc", YUVTORGB_REGS_AVX2
4192 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4193 );
4194 // clang-format on
4195 }
4196 #endif // HAS_YUY2TOARGBROW_AVX2
4197
4198 #if defined(HAS_UYVYTOARGBROW_AVX2)
4199 // 16 pixels.
4200 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
4201 void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
4202 uint8_t* dst_argb,
4203 const struct YuvConstants* yuvconstants,
4204 int width) {
4205 // clang-format off
4206 asm volatile (
4207 YUVTORGB_SETUP_AVX2(yuvconstants)
4208 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4209
4210 LABELALIGN
4211 "1: \n"
4212 READUYVY_AVX2
4213 YUVTORGB_AVX2(yuvconstants)
4214 STOREARGB_AVX2
4215 "sub $0x10,%[width] \n"
4216 "jg 1b \n"
4217 "vzeroupper \n"
4218 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
4219 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4220 [width]"+rm"(width) // %[width]
4221 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
4222 [kShuffleUYVYY]"m"(kShuffleUYVYY),
4223 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
4224 : "memory", "cc", YUVTORGB_REGS_AVX2
4225 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4226 );
4227 // clang-format on
4228 }
4229 #endif // HAS_UYVYTOARGBROW_AVX2
4230
4231 #if defined(HAS_P210TOARGBROW_AVX2)
4232 // 16 pixels.
4233 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
4234 void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
4235 const uint16_t* uv_buf,
4236 uint8_t* dst_argb,
4237 const struct YuvConstants* yuvconstants,
4238 int width) {
4239 // clang-format off
4240 asm volatile (
4241 YUVTORGB_SETUP_AVX2(yuvconstants)
4242 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4243
4244 LABELALIGN
4245 "1: \n"
4246 READP210_AVX2
4247 YUVTORGB_AVX2(yuvconstants)
4248 STOREARGB_AVX2
4249 "sub $0x10,%[width] \n"
4250 "jg 1b \n"
4251 "vzeroupper \n"
4252 : [y_buf]"+r"(y_buf), // %[y_buf]
4253 [uv_buf]"+r"(uv_buf), // %[uv_buf]
4254 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4255 [width]"+rm"(width) // %[width]
4256 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4257 : "memory", "cc", YUVTORGB_REGS_AVX2
4258 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4259 );
4260 // clang-format on
4261 }
4262 #endif // HAS_P210TOARGBROW_AVX2
4263
4264 #if defined(HAS_P410TOARGBROW_AVX2)
4265 // 16 pixels.
4266 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
4267 void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
4268 const uint16_t* uv_buf,
4269 uint8_t* dst_argb,
4270 const struct YuvConstants* yuvconstants,
4271 int width) {
4272 // clang-format off
4273 asm volatile (
4274 YUVTORGB_SETUP_AVX2(yuvconstants)
4275 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4276
4277 LABELALIGN
4278 "1: \n"
4279 READP410_AVX2
4280 YUVTORGB_AVX2(yuvconstants)
4281 STOREARGB_AVX2
4282 "sub $0x10,%[width] \n"
4283 "jg 1b \n"
4284 "vzeroupper \n"
4285 : [y_buf]"+r"(y_buf), // %[y_buf]
4286 [uv_buf]"+r"(uv_buf), // %[uv_buf]
4287 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4288 [width]"+rm"(width) // %[width]
4289 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4290 : "memory", "cc", YUVTORGB_REGS_AVX2
4291 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4292 );
4293 // clang-format on
4294 }
4295 #endif // HAS_P410TOARGBROW_AVX2
4296
4297 #if defined(HAS_P210TOAR30ROW_AVX2)
4298 // 16 pixels
4299 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
4300 void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
4301 const uint16_t* uv_buf,
4302 uint8_t* dst_ar30,
4303 const struct YuvConstants* yuvconstants,
4304 int width) {
4305 asm volatile (
4306 YUVTORGB_SETUP_AVX2(yuvconstants)
4307 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
4308 "vpsrlw $14,%%ymm5,%%ymm5 \n"
4309 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
4310 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
4311 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
4312 "vpsrlw $6,%%ymm7,%%ymm7 \n"
4313
4314 LABELALIGN
4315 "1: \n"
4316 READP210_AVX2
4317 YUVTORGB16_AVX2(yuvconstants)
4318 STOREAR30_AVX2
4319 "sub $0x10,%[width] \n"
4320 "jg 1b \n"
4321
4322 "vzeroupper \n"
4323 : [y_buf]"+r"(y_buf), // %[y_buf]
4324 [uv_buf]"+r"(uv_buf), // %[uv_buf]
4325 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
4326 [width]"+rm"(width) // %[width]
4327 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4328 : "memory", "cc", YUVTORGB_REGS_AVX2
4329 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4330 );
4331 }
4332 #endif // HAS_P210TOAR30ROW_AVX2
4333
4334 #if defined(HAS_P410TOAR30ROW_AVX2)
4335 // 16 pixels
4336 // 16 UV values with 16 Y producing 16 AR30 (64 bytes).
4337 void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
4338 const uint16_t* uv_buf,
4339 uint8_t* dst_ar30,
4340 const struct YuvConstants* yuvconstants,
4341 int width) {
4342 asm volatile (
4343 YUVTORGB_SETUP_AVX2(yuvconstants)
4344 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
4345 "vpsrlw $14,%%ymm5,%%ymm5 \n"
4346 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
4347 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
4348 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
4349 "vpsrlw $6,%%ymm7,%%ymm7 \n"
4350
4351 LABELALIGN
4352 "1: \n"
4353 READP410_AVX2
4354 YUVTORGB16_AVX2(yuvconstants)
4355 STOREAR30_AVX2
4356 "sub $0x10,%[width] \n"
4357 "jg 1b \n"
4358
4359 "vzeroupper \n"
4360 : [y_buf]"+r"(y_buf), // %[y_buf]
4361 [uv_buf]"+r"(uv_buf), // %[uv_buf]
4362 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
4363 [width]"+rm"(width) // %[width]
4364 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4365 : "memory", "cc", YUVTORGB_REGS_AVX2
4366 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4367 );
4368 }
4369 #endif // HAS_P410TOAR30ROW_AVX2
4370
4371 #ifdef HAS_I400TOARGBROW_SSE2
4372 void I400ToARGBRow_SSE2(const uint8_t* y_buf,
4373 uint8_t* dst_argb,
4374 const struct YuvConstants* yuvconstants,
4375 int width) {
4376 asm volatile(
4377 "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164
4378 "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16
4379 "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
4380 "pslld $0x18,%%xmm4 \n"
4381
4382 LABELALIGN
4383 "1: \n"
4384 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
4385 "movq (%0),%%xmm0 \n"
4386 "lea 0x8(%0),%0 \n"
4387 "punpcklbw %%xmm0,%%xmm0 \n"
4388 "pmulhuw %%xmm2,%%xmm0 \n"
4389 "paddsw %%xmm3,%%xmm0 \n"
4390 "psraw $6, %%xmm0 \n"
4391 "packuswb %%xmm0,%%xmm0 \n"
4392
4393 // Step 2: Weave into ARGB
4394 "punpcklbw %%xmm0,%%xmm0 \n"
4395 "movdqa %%xmm0,%%xmm1 \n"
4396 "punpcklwd %%xmm0,%%xmm0 \n"
4397 "punpckhwd %%xmm1,%%xmm1 \n"
4398 "por %%xmm4,%%xmm0 \n"
4399 "por %%xmm4,%%xmm1 \n"
4400 "movdqu %%xmm0,(%1) \n"
4401 "movdqu %%xmm1,0x10(%1) \n"
4402 "lea 0x20(%1),%1 \n"
4403
4404 "sub $0x8,%2 \n"
4405 "jg 1b \n"
4406 : "+r"(y_buf), // %0
4407 "+r"(dst_argb), // %1
4408 "+rm"(width) // %2
4409 : "r"(yuvconstants) // %3
4410 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
4411 }
4412 #endif // HAS_I400TOARGBROW_SSE2
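// A hedged scalar sketch of the grey expansion above: scale Y by yg, add the
// ygb bias, drop 6 fraction bits, then replicate into B/G/R under opaque
// alpha (hypothetical ENABLE_ROW_SKETCHES guard):
#if defined(ENABLE_ROW_SKETCHES)
static uint32_t I400PixelToARGB_Sketch(uint8_t y, int yg, int ygb) {
  int g = ((((y * 0x0101) * yg) >> 16) + ygb) >> 6;  // pmulhuw, paddsw, psraw.
  g = g < 0 ? 0 : (g > 255 ? 255 : g);               // packuswb clamp.
  return 0xff000000u | ((uint32_t)g << 16) | ((uint32_t)g << 8) | (uint32_t)g;
}
#endif  // ENABLE_ROW_SKETCHES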
4413
4414 #ifdef HAS_I400TOARGBROW_AVX2
4415 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
4416 // note: vpunpcklbw mutates and vpackuswb unmutates.
4417 void I400ToARGBRow_AVX2(const uint8_t* y_buf,
4418 uint8_t* dst_argb,
4419 const struct YuvConstants* yuvconstants,
4420 int width) {
4421 asm volatile(
4422 "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164
4423 "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
4424 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
4425 "vpslld $0x18,%%ymm4,%%ymm4 \n"
4426
4427 LABELALIGN
4428 "1: \n"
4429 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
4430 "vmovdqu (%0),%%xmm0 \n"
4431 "lea 0x10(%0),%0 \n"
4432 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4433 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
4434 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
4435 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n"
4436 "vpsraw $0x6,%%ymm0,%%ymm0 \n"
4437 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
4438 "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
4439 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
4440 "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
4441 "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
4442 "vpor %%ymm4,%%ymm0,%%ymm0 \n"
4443 "vpor %%ymm4,%%ymm1,%%ymm1 \n"
4444 "vmovdqu %%ymm0,(%1) \n"
4445 "vmovdqu %%ymm1,0x20(%1) \n"
4446 "lea 0x40(%1),%1 \n"
4447 "sub $0x10,%2 \n"
4448 "jg 1b \n"
4449 "vzeroupper \n"
4450 : "+r"(y_buf), // %0
4451 "+r"(dst_argb), // %1
4452 "+rm"(width) // %2
4453 : "r"(yuvconstants) // %3
4454 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
4455 }
4456 #endif // HAS_I400TOARGBROW_AVX2
4457
4458 #ifdef HAS_MIRRORROW_SSSE3
4459 // Shuffle table for reversing the bytes.
4460 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
4461 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
4462
4463 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
4464 intptr_t temp_width = (intptr_t)(width);
4465 asm volatile(
4466
4467 "movdqa %3,%%xmm5 \n"
4468
4469 LABELALIGN
4470 "1: \n"
4471 "movdqu -0x10(%0,%2,1),%%xmm0 \n"
4472 "pshufb %%xmm5,%%xmm0 \n"
4473 "movdqu %%xmm0,(%1) \n"
4474 "lea 0x10(%1),%1 \n"
4475 "sub $0x10,%2 \n"
4476 "jg 1b \n"
4477 : "+r"(src), // %0
4478 "+r"(dst), // %1
4479 "+r"(temp_width) // %2
4480 : "m"(kShuffleMirror) // %3
4481 : "memory", "cc", "xmm0", "xmm5");
4482 }
4483 #endif // HAS_MIRRORROW_SSSE3
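// MirrorRow never advances %0; the shrinking width index walks the 16-byte
// load window backwards from src + width while pshufb reverses the bytes
// within it. Scalar equivalent (hypothetical ENABLE_ROW_SKETCHES guard):
#if defined(ENABLE_ROW_SKETCHES)
static void MirrorRow_C_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}
#endif  // ENABLE_ROW_SKETCHES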
4484
4485 #ifdef HAS_MIRRORROW_AVX2
4486 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
4487 intptr_t temp_width = (intptr_t)(width);
4488 asm volatile(
4489
4490 "vbroadcastf128 %3,%%ymm5 \n"
4491
4492 LABELALIGN
4493 "1: \n"
4494 "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
4495 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
4496 "vpermq $0x4e,%%ymm0,%%ymm0 \n"
4497 "vmovdqu %%ymm0,(%1) \n"
4498 "lea 0x20(%1),%1 \n"
4499 "sub $0x20,%2 \n"
4500 "jg 1b \n"
4501 "vzeroupper \n"
4502 : "+r"(src), // %0
4503 "+r"(dst), // %1
4504 "+r"(temp_width) // %2
4505 : "m"(kShuffleMirror) // %3
4506 : "memory", "cc", "xmm0", "xmm5");
4507 }
4508 #endif // HAS_MIRRORROW_AVX2
4509
4510 #ifdef HAS_MIRRORUVROW_SSSE3
4511 // Shuffle table for reversing the UV.
4512 static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
4513 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
4514
4515 void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
4516 intptr_t temp_width = (intptr_t)(width);
4517 asm volatile(
4518
4519 "movdqa %3,%%xmm5 \n"
4520
4521 LABELALIGN
4522 "1: \n"
4523 "movdqu -0x10(%0,%2,2),%%xmm0 \n"
4524 "pshufb %%xmm5,%%xmm0 \n"
4525 "movdqu %%xmm0,(%1) \n"
4526 "lea 0x10(%1),%1 \n"
4527 "sub $0x8,%2 \n"
4528 "jg 1b \n"
4529 : "+r"(src_uv), // %0
4530 "+r"(dst_uv), // %1
4531 "+r"(temp_width) // %2
4532 : "m"(kShuffleMirrorUV) // %3
4533 : "memory", "cc", "xmm0", "xmm5");
4534 }
4535 #endif // HAS_MIRRORUVROW_SSSE3
4536
4537 #ifdef HAS_MIRRORUVROW_AVX2
4538 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
4539 intptr_t temp_width = (intptr_t)(width);
4540 asm volatile(
4541
4542 "vbroadcastf128 %3,%%ymm5 \n"
4543
4544 LABELALIGN
4545 "1: \n"
4546 "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
4547 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
4548 "vpermq $0x4e,%%ymm0,%%ymm0 \n"
4549 "vmovdqu %%ymm0,(%1) \n"
4550 "lea 0x20(%1),%1 \n"
4551 "sub $0x10,%2 \n"
4552 "jg 1b \n"
4553 "vzeroupper \n"
4554 : "+r"(src_uv), // %0
4555 "+r"(dst_uv), // %1
4556 "+r"(temp_width) // %2
4557 : "m"(kShuffleMirrorUV) // %3
4558 : "memory", "cc", "xmm0", "xmm5");
4559 }
4560 #endif // HAS_MIRRORUVROW_AVX2
4561
4562 #ifdef HAS_MIRRORSPLITUVROW_SSSE3
4563 // Shuffle table for reversing the bytes of UV channels.
4564 static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
4565 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
4566 void MirrorSplitUVRow_SSSE3(const uint8_t* src,
4567 uint8_t* dst_u,
4568 uint8_t* dst_v,
4569 int width) {
4570 intptr_t temp_width = (intptr_t)(width);
4571 asm volatile(
4572 "movdqa %4,%%xmm1 \n"
4573 "lea -0x10(%0,%3,2),%0 \n"
4574 "sub %1,%2 \n"
4575
4576 LABELALIGN
4577 "1: \n"
4578 "movdqu (%0),%%xmm0 \n"
4579 "lea -0x10(%0),%0 \n"
4580 "pshufb %%xmm1,%%xmm0 \n"
4581 "movlpd %%xmm0,(%1) \n"
4582 "movhpd %%xmm0,0x00(%1,%2,1) \n"
4583 "lea 0x8(%1),%1 \n"
4584 "sub $8,%3 \n"
4585 "jg 1b \n"
4586 : "+r"(src), // %0
4587 "+r"(dst_u), // %1
4588 "+r"(dst_v), // %2
4589 "+r"(temp_width) // %3
4590 : "m"(kShuffleMirrorSplitUV) // %4
4591 : "memory", "cc", "xmm0", "xmm1");
4592 }
4593 #endif // HAS_MIRRORSPLITUVROW_SSSE3
4594
4595 #ifdef HAS_RGB24MIRRORROW_SSSE3
4596
4597 // Shuffle first 5 pixels to last 5 mirrored. First byte is zeroed.
4598 static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
4599 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
4600
4601 // Shuffle last 5 pixels to first 5 mirrored. Last byte is zeroed.
4602 static const uvec8 kShuffleMirrorRGB1 = {
4603 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
4604
4605 // Shuffle 5 pixels at a time (15 bytes)
4606 void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
4607 uint8_t* dst_rgb24,
4608 int width) {
4609 intptr_t temp_width = (intptr_t)(width);
4610 src_rgb24 += width * 3 - 48;
4611 asm volatile(
4612 "movdqa %3,%%xmm4 \n"
4613 "movdqa %4,%%xmm5 \n"
4614
4615 LABELALIGN
4616 "1: \n"
4617 "movdqu (%0),%%xmm0 \n" // first 5
4618 "movdqu 15(%0),%%xmm1 \n" // next 5
4619 "movdqu 30(%0),%%xmm2 \n" // next 5
4620 "movdqu 32(%0),%%xmm3 \n" // last 1 special
4621 "pshufb %%xmm4,%%xmm0 \n"
4622 "pshufb %%xmm4,%%xmm1 \n"
4623 "pshufb %%xmm4,%%xmm2 \n"
4624 "pshufb %%xmm5,%%xmm3 \n"
4625 "lea -0x30(%0),%0 \n"
4626 "movdqu %%xmm0,32(%1) \n" // last 5
4627 "movdqu %%xmm1,17(%1) \n" // next 5
4628 "movdqu %%xmm2,2(%1) \n" // next 5
4629 "movlpd %%xmm3,0(%1) \n" // first 1
4630 "lea 0x30(%1),%1 \n"
4631 "sub $0x10,%2 \n"
4632 "jg 1b \n"
4633 : "+r"(src_rgb24), // %0
4634 "+r"(dst_rgb24), // %1
4635 "+r"(temp_width) // %2
4636 : "m"(kShuffleMirrorRGB0), // %3
4637 "m"(kShuffleMirrorRGB1) // %4
4638 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4639 }
4640 #endif // HAS_RGB24MIRRORROW_SSSE3
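// An RGB24 pixel is 3 bytes, so a 16-byte register covers 5 whole pixels
// with one spare byte; the tables above zero the spare lane (index 128) and
// the row is mirrored in 15-byte groups. Scalar equivalent (hypothetical
// ENABLE_ROW_SKETCHES guard):
#if defined(ENABLE_ROW_SKETCHES)
static void RGB24MirrorRow_C_Sketch(const uint8_t* src_rgb24,
                                    uint8_t* dst_rgb24,
                                    int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint8_t* p = src_rgb24 + (width - 1 - i) * 3;
    dst_rgb24[i * 3 + 0] = p[0];  // Pixel order reverses; the byte order
    dst_rgb24[i * 3 + 1] = p[1];  // within each pixel does not.
    dst_rgb24[i * 3 + 2] = p[2];
  }
}
#endif  // ENABLE_ROW_SKETCHES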
4641
4642 #ifdef HAS_ARGBMIRRORROW_SSE2
4643
4644 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
4645 intptr_t temp_width = (intptr_t)(width);
4646 asm volatile(
4647
4648 "lea -0x10(%0,%2,4),%0 \n"
4649
4650 LABELALIGN
4651 "1: \n"
4652 "movdqu (%0),%%xmm0 \n"
4653 "pshufd $0x1b,%%xmm0,%%xmm0 \n"
4654 "lea -0x10(%0),%0 \n"
4655 "movdqu %%xmm0,(%1) \n"
4656 "lea 0x10(%1),%1 \n"
4657 "sub $0x4,%2 \n"
4658 "jg 1b \n"
4659 : "+r"(src), // %0
4660 "+r"(dst), // %1
4661 "+r"(temp_width) // %2
4662 :
4663 : "memory", "cc", "xmm0");
4664 }
4665 #endif // HAS_ARGBMIRRORROW_SSE2
4666
4667 #ifdef HAS_ARGBMIRRORROW_AVX2
4668 // Permute table for reversing the ARGB pixels (32-bit lanes).
4669 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
4670 void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
4671 intptr_t temp_width = (intptr_t)(width);
4672 asm volatile(
4673
4674 "vmovdqu %3,%%ymm5 \n"
4675
4676 LABELALIGN
4677 "1: \n"
4678 "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
4679 "vmovdqu %%ymm0,(%1) \n"
4680 "lea 0x20(%1),%1 \n"
4681 "sub $0x8,%2 \n"
4682 "jg 1b \n"
4683 "vzeroupper \n"
4684 : "+r"(src), // %0
4685 "+r"(dst), // %1
4686 "+r"(temp_width) // %2
4687 : "m"(kARGBShuffleMirror_AVX2) // %3
4688 : "memory", "cc", "xmm0", "xmm5");
4689 }
4690 #endif // HAS_ARGBMIRRORROW_AVX2
4691
4692 #ifdef HAS_SPLITUVROW_AVX2
4693 void SplitUVRow_AVX2(const uint8_t* src_uv,
4694 uint8_t* dst_u,
4695 uint8_t* dst_v,
4696 int width) {
4697 asm volatile(
4698 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4699 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
4700 "sub %1,%2 \n"
4701
4702 LABELALIGN
4703 "1: \n"
4704 "vmovdqu (%0),%%ymm0 \n"
4705 "vmovdqu 0x20(%0),%%ymm1 \n"
4706 "lea 0x40(%0),%0 \n"
4707 "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
4708 "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
4709 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
4710 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
4711 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4712 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
4713 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4714 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
4715 "vmovdqu %%ymm0,(%1) \n"
4716 "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
4717 "lea 0x20(%1),%1 \n"
4718 "sub $0x20,%3 \n"
4719 "jg 1b \n"
4720 "vzeroupper \n"
4721 : "+r"(src_uv), // %0
4722 "+r"(dst_u), // %1
4723 "+r"(dst_v), // %2
4724 "+r"(width) // %3
4725 :
4726 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4727 }
4728 #endif // HAS_SPLITUVROW_AVX2
4729
4730 #ifdef HAS_SPLITUVROW_SSE2
4731 void SplitUVRow_SSE2(const uint8_t* src_uv,
4732 uint8_t* dst_u,
4733 uint8_t* dst_v,
4734 int width) {
4735 asm volatile(
4736 "pcmpeqb %%xmm5,%%xmm5 \n"
4737 "psrlw $0x8,%%xmm5 \n"
4738 "sub %1,%2 \n"
4739
4740 LABELALIGN
4741 "1: \n"
4742 "movdqu (%0),%%xmm0 \n"
4743 "movdqu 0x10(%0),%%xmm1 \n"
4744 "lea 0x20(%0),%0 \n"
4745 "movdqa %%xmm0,%%xmm2 \n"
4746 "movdqa %%xmm1,%%xmm3 \n"
4747 "pand %%xmm5,%%xmm0 \n"
4748 "pand %%xmm5,%%xmm1 \n"
4749 "packuswb %%xmm1,%%xmm0 \n"
4750 "psrlw $0x8,%%xmm2 \n"
4751 "psrlw $0x8,%%xmm3 \n"
4752 "packuswb %%xmm3,%%xmm2 \n"
4753 "movdqu %%xmm0,(%1) \n"
4754 "movdqu %%xmm2,0x00(%1,%2,1) \n"
4755 "lea 0x10(%1),%1 \n"
4756 "sub $0x10,%3 \n"
4757 "jg 1b \n"
4758 : "+r"(src_uv), // %0
4759 "+r"(dst_u), // %1
4760 "+r"(dst_v), // %2
4761 "+r"(width) // %3
4762 :
4763 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4764 }
4765 #endif // HAS_SPLITUVROW_SSE2
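// Both SplitUV variants mask the even bytes for U (pand with 0x00ff words)
// and shift the odd bytes down for V (psrlw $8) before re-packing. Scalar
// equivalent (hypothetical ENABLE_ROW_SKETCHES guard):
#if defined(ENABLE_ROW_SKETCHES)
static void SplitUVRow_C_Sketch(const uint8_t* src_uv,
                                uint8_t* dst_u,
                                uint8_t* dst_v,
                                int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_u[i] = src_uv[2 * i + 0];
    dst_v[i] = src_uv[2 * i + 1];
  }
}
#endif  // ENABLE_ROW_SKETCHES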
4766
4767 #ifdef HAS_DETILEROW_SSE2
4768 void DetileRow_SSE2(const uint8_t* src,
4769 ptrdiff_t src_tile_stride,
4770 uint8_t* dst,
4771 int width) {
4772 asm volatile(
4773 "1: \n"
4774 "movdqu (%0),%%xmm0 \n"
4775 "sub $0x10,%2 \n"
4776 "lea (%0,%3),%0 \n"
4777 "movdqu %%xmm0,(%1) \n"
4778 "lea 0x10(%1),%1 \n"
4779 "jg 1b \n"
4780 : "+r"(src), // %0
4781 "+r"(dst), // %1
4782 "+r"(width) // %2
4783 : "r"(src_tile_stride) // %3
4784 : "cc", "memory", "xmm0");
4785 }
4786 #endif // HAS_DETILEROW_SSE2
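// DetileRow gathers one image row out of a column of tiles: each iteration
// copies a 16-byte tile row, then hops a whole tile stride (tile width *
// tile height bytes) to the same row of the next tile. Scalar sketch
// (hypothetical ENABLE_ROW_SKETCHES guard):
#if defined(ENABLE_ROW_SKETCHES)
static void DetileRow_C_Sketch(const uint8_t* src,
                               ptrdiff_t src_tile_stride,
                               uint8_t* dst,
                               int width) {
  int i, j;
  for (i = 0; i < width; i += 16) {
    for (j = 0; j < 16; ++j) {
      dst[i + j] = src[j];
    }
    src += src_tile_stride;  // Next tile, same row within the tile.
  }
}
#endif  // ENABLE_ROW_SKETCHES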
4787
4788 #ifdef HAS_DETILESPLITUVROW_SSSE3
4789 // TODO(greenjustin): Look into generating these constants instead of loading
4790 // them since this can cause branch mispredicts for fPIC code on 32-bit
4791 // machines.
4792 static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14,
4793 1, 3, 5, 7, 9, 11, 13, 15};
4794
4795 // TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very
4796 // slow on older SSE2 processors.
4797 void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
4798 ptrdiff_t src_tile_stride,
4799 uint8_t* dst_u,
4800 uint8_t* dst_v,
4801 int width) {
4802 asm volatile(
4803 "movdqu %4,%%xmm1 \n"
4804 "1: \n"
4805 "movdqu (%0),%%xmm0 \n"
4806 "lea (%0, %5),%0 \n"
4807 "pshufb %%xmm1,%%xmm0 \n"
4808 "movq %%xmm0,(%1) \n"
4809 "lea 0x8(%1),%1 \n"
4810 "movhps %%xmm0,(%2) \n"
4811 "lea 0x8(%2),%2 \n"
4812 "sub $0x10,%3 \n"
4813 "jg 1b \n"
4814 : "+r"(src_uv), // %0
4815 "+r"(dst_u), // %1
4816 "+r"(dst_v), // %2
4817 "+r"(width) // %3
4818 : "m"(kDeinterlaceUV), // %4
4819 "r"(src_tile_stride) // %5
4820 : "cc", "memory", "xmm0", "xmm1");
4821 }
4822 #endif // HAS_DETILESPLITUVROW_SSSE3
4823
4824 #ifdef HAS_MERGEUVROW_AVX2
4825 void MergeUVRow_AVX2(const uint8_t* src_u,
4826 const uint8_t* src_v,
4827 uint8_t* dst_uv,
4828 int width) {
4829 asm volatile(
4830
4831 "sub %0,%1 \n"
4832
4833 LABELALIGN
4834 "1: \n"
4835 "vmovdqu (%0),%%ymm0 \n"
4836 "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
4837 "lea 0x20(%0),%0 \n"
4838 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
4839 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
4840 "vextractf128 $0x0,%%ymm2,(%2) \n"
4841 "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
4842 "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
4843 "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
4844 "lea 0x40(%2),%2 \n"
4845 "sub $0x20,%3 \n"
4846 "jg 1b \n"
4847 "vzeroupper \n"
4848 : "+r"(src_u), // %0
4849 "+r"(src_v), // %1
4850 "+r"(dst_uv), // %2
4851 "+r"(width) // %3
4852 :
4853 : "memory", "cc", "xmm0", "xmm1", "xmm2");
4854 }
4855 #endif // HAS_MERGEUVROW_AVX2
4856
4857 #ifdef HAS_MERGEUVROW_SSE2
4858 void MergeUVRow_SSE2(const uint8_t* src_u,
4859 const uint8_t* src_v,
4860 uint8_t* dst_uv,
4861 int width) {
4862 asm volatile(
4863
4864 "sub %0,%1 \n"
4865
4866 LABELALIGN
4867 "1: \n"
4868 "movdqu (%0),%%xmm0 \n"
4869 "movdqu 0x00(%0,%1,1),%%xmm1 \n"
4870 "lea 0x10(%0),%0 \n"
4871 "movdqa %%xmm0,%%xmm2 \n"
4872 "punpcklbw %%xmm1,%%xmm0 \n"
4873 "punpckhbw %%xmm1,%%xmm2 \n"
4874 "movdqu %%xmm0,(%2) \n"
4875 "movdqu %%xmm2,0x10(%2) \n"
4876 "lea 0x20(%2),%2 \n"
4877 "sub $0x10,%3 \n"
4878 "jg 1b \n"
4879 : "+r"(src_u), // %0
4880 "+r"(src_v), // %1
4881 "+r"(dst_uv), // %2
4882 "+r"(width) // %3
4883 :
4884 : "memory", "cc", "xmm0", "xmm1", "xmm2");
4885 }
4886 #endif // HAS_MERGEUVROW_SSE2
4887
4888 #ifdef HAS_MERGEUVROW_16_AVX2
4889 void MergeUVRow_16_AVX2(const uint16_t* src_u,
4890 const uint16_t* src_v,
4891 uint16_t* dst_uv,
4892 int depth,
4893 int width) {
4894 depth = 16 - depth;
4895 // clang-format off
4896 asm volatile (
4897 "vmovd %4,%%xmm3 \n"
4898 "sub %0,%1 \n"
4899
4900 // 16 pixels per loop.
4901 LABELALIGN
4902 "1: \n"
4903 "vmovdqu (%0),%%ymm0 \n"
4904 "vmovdqu (%0,%1,1),%%ymm1 \n"
4905 "add $0x20,%0 \n"
4906
4907 "vpsllw %%xmm3,%%ymm0,%%ymm0 \n"
4908 "vpsllw %%xmm3,%%ymm1,%%ymm1 \n"
4909 "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
4910 "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
4911 "vextractf128 $0x0,%%ymm2,(%2) \n"
4912 "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
4913 "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
4914 "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
4915 "add $0x40,%2 \n"
4916 "sub $0x10,%3 \n"
4917 "jg 1b \n"
4918 "vzeroupper \n"
4919 : "+r"(src_u), // %0
4920 "+r"(src_v), // %1
4921 "+r"(dst_uv), // %2
4922 "+r"(width) // %3
4923 : "r"(depth) // %4
4924 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
4925 // clang-format on
4926 }
4927 #endif // HAS_MERGEUVROW_16_AVX2
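// MergeUVRow_16 shifts both channels left by (16 - depth) so lsb-justified
// samples become msb-justified before interleaving, e.g. 10-bit 0x03ff << 6
// = 0xffc0. Scalar equivalent (hypothetical ENABLE_ROW_SKETCHES guard):
#if defined(ENABLE_ROW_SKETCHES)
static void MergeUVRow_16_C_Sketch(const uint16_t* src_u,
                                   const uint16_t* src_v,
                                   uint16_t* dst_uv,
                                   int depth,
                                   int width) {
  int shift = 16 - depth;  // Same pre-computation as the AVX2 version.
  int i;
  for (i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = (uint16_t)(src_u[i] << shift);
    dst_uv[2 * i + 1] = (uint16_t)(src_v[i] << shift);
  }
}
#endif  // ENABLE_ROW_SKETCHES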
4928
4929 #ifdef HAS_SPLITUVROW_16_AVX2
4930 const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13,
4931 2, 3, 6, 7, 10, 11, 14, 15};
4932 void SplitUVRow_16_AVX2(const uint16_t* src_uv,
4933 uint16_t* dst_u,
4934 uint16_t* dst_v,
4935 int depth,
4936 int width) {
4937 depth = 16 - depth;
4938 // clang-format off
4939 asm volatile (
4940 "vmovd %4,%%xmm3 \n"
4941 "vbroadcastf128 %5,%%ymm4 \n"
4942 "sub %1,%2 \n"
4943
4944 // 16 pixels per loop.
4945 LABELALIGN
4946 "1: \n"
4947 "vmovdqu (%0),%%ymm0 \n"
4948 "vmovdqu 0x20(%0),%%ymm1 \n"
4949 "add $0x40,%0 \n"
4950
4951 "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n"
4952 "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n"
4953 "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
4954 "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
4955 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4956 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
4957 "vextractf128 $0x0,%%ymm0,(%1) \n"
4958 "vextractf128 $0x0,%%ymm1,0x10(%1) \n"
4959 "vextractf128 $0x1,%%ymm0,(%1,%2) \n"
4960 "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n"
4961 "add $0x20,%1 \n"
4962 "sub $0x10,%3 \n"
4963 "jg 1b \n"
4964 "vzeroupper \n"
4965 : "+r"(src_uv), // %0
4966 "+r"(dst_u), // %1
4967 "+r"(dst_v), // %2
4968 "+r"(width) // %3
4969 : "r"(depth), // %4
4970 "m"(kSplitUVShuffle16) // %5
4971 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
4972 // clang-format on
4973 }
4974 #endif // HAS_SPLITUVROW_16_AVX2
4975
4976 // Use scale to convert lsb formats to msb, depending on how many bits there are:
4977 // 128 = 9 bits
4978 // 64 = 10 bits
4979 // 16 = 12 bits
4980 // 1 = 16 bits
4981 #ifdef HAS_MULTIPLYROW_16_AVX2
4982 void MultiplyRow_16_AVX2(const uint16_t* src_y,
4983 uint16_t* dst_y,
4984 int scale,
4985 int width) {
4986 // clang-format off
4987 asm volatile (
4988 "vmovd %3,%%xmm3 \n"
4989 "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
4990 "vbroadcastss %%xmm3,%%ymm3 \n"
4991 "sub %0,%1 \n"
4992
4993 // 32 pixels per loop.
4994 LABELALIGN
4995 "1: \n"
4996 "vmovdqu (%0),%%ymm0 \n"
4997 "vmovdqu 0x20(%0),%%ymm1 \n"
4998 "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
4999 "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
5000 "vmovdqu %%ymm0,(%0,%1) \n"
5001 "vmovdqu %%ymm1,0x20(%0,%1) \n"
5002 "add $0x40,%0 \n"
5003 "sub $0x20,%2 \n"
5004 "jg 1b \n"
5005 "vzeroupper \n"
5006 : "+r"(src_y), // %0
5007 "+r"(dst_y), // %1
5008 "+r"(width) // %2
5009 : "r"(scale) // %3
5010 : "memory", "cc", "xmm0", "xmm1", "xmm3");
5011 // clang-format on
5012 }
5013 #endif // HAS_MULTIPLYROW_16_AVX2
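// The scale factors listed above are 65536 >> depth, i.e. the multiplier
// that moves a depth-bit lsb-justified sample up to bit 15. A hedged helper
// sketch (hypothetical ENABLE_ROW_SKETCHES guard):
#if defined(ENABLE_ROW_SKETCHES)
static int MsbScaleForDepth_Sketch(int depth) {  // depth in [9, 16].
  return 65536 >> depth;  // 9 -> 128, 10 -> 64, 12 -> 16, 16 -> 1.
}
#endif  // ENABLE_ROW_SKETCHES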
5014
5015 // Use scale to convert msb formats to lsb, depending on how many bits there are:
5016 // 512 = 9 bits
5017 // 1024 = 10 bits
5018 // 4096 = 12 bits
5019 // 65536 = 16 bits
5020 #ifdef HAS_DIVIDEROW_16_AVX2
5021 void DivideRow_16_AVX2(const uint16_t* src_y,
5022 uint16_t* dst_y,
5023 int scale,
5024 int width) {
5025 // clang-format off
5026 asm volatile (
5027 "vmovd %3,%%xmm3 \n"
5028 "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
5029 "vbroadcastss %%xmm3,%%ymm3 \n"
5030 "sub %0,%1 \n"
5031
5032 // 32 pixels per loop.
5033 LABELALIGN
5034 "1: \n"
5035 "vmovdqu (%0),%%ymm0 \n"
5036 "vmovdqu 0x20(%0),%%ymm1 \n"
5037 "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n"
5038 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
5039 "vmovdqu %%ymm0,(%0,%1) \n"
5040 "vmovdqu %%ymm1,0x20(%0,%1) \n"
5041 "add $0x40,%0 \n"
5042 "sub $0x20,%2 \n"
5043 "jg 1b \n"
5044 "vzeroupper \n"
5045 : "+r"(src_y), // %0
5046 "+r"(dst_y), // %1
5047 "+r"(width), // %2
5048 "+r"(scale) // %3
5049 :
5050 : "memory", "cc", "xmm0", "xmm1", "xmm3");
5051 // clang-format on
5052 }
5053 #endif // HAS_DIVIDEROW_16_AVX2
5054
5055 // Use scale to convert lsb formats to msb, depending on how many bits there are:
5056 // 32768 = 9 bits
5057 // 16384 = 10 bits
5058 // 4096 = 12 bits
5059 // 256 = 16 bits
5060 void Convert16To8Row_SSSE3(const uint16_t* src_y,
5061 uint8_t* dst_y,
5062 int scale,
5063 int width) {
5064 // clang-format off
5065 asm volatile (
5066 "movd %3,%%xmm2 \n"
5067 "punpcklwd %%xmm2,%%xmm2 \n"
5068 "pshufd $0x0,%%xmm2,%%xmm2 \n"
5069
5070 // 32 pixels per loop.
5071 LABELALIGN
5072 "1: \n"
5073 "movdqu (%0),%%xmm0 \n"
5074 "movdqu 0x10(%0),%%xmm1 \n"
5075 "add $0x20,%0 \n"
5076 "pmulhuw %%xmm2,%%xmm0 \n"
5077 "pmulhuw %%xmm2,%%xmm1 \n"
5078 "packuswb %%xmm1,%%xmm0 \n"
5079 "movdqu %%xmm0,(%1) \n"
5080 "add $0x10,%1 \n"
5081 "sub $0x10,%2 \n"
5082 "jg 1b \n"
5083 : "+r"(src_y), // %0
5084 "+r"(dst_y), // %1
5085 "+r"(width) // %2
5086 : "r"(scale) // %3
5087 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5088 // clang-format on
5089 }
5090
5091 #ifdef HAS_CONVERT16TO8ROW_AVX2
5092 void Convert16To8Row_AVX2(const uint16_t* src_y,
5093 uint8_t* dst_y,
5094 int scale,
5095 int width) {
5096 // clang-format off
5097 asm volatile (
5098 "vmovd %3,%%xmm2 \n"
5099 "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
5100 "vbroadcastss %%xmm2,%%ymm2 \n"
5101
5102 // 32 pixels per loop.
5103 LABELALIGN
5104 "1: \n"
5105 "vmovdqu (%0),%%ymm0 \n"
5106 "vmovdqu 0x20(%0),%%ymm1 \n"
5107 "add $0x40,%0 \n"
5108 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
5109 "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
5110 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
5111 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
5112 "vmovdqu %%ymm0,(%1) \n"
5113 "add $0x20,%1 \n"
5114 "sub $0x20,%2 \n"
5115 "jg 1b \n"
5116 "vzeroupper \n"
5117 : "+r"(src_y), // %0
5118 "+r"(dst_y), // %1
5119 "+r"(width) // %2
5120 : "r"(scale) // %3
5121 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5122 // clang-format on
5123 }
5124 #endif // HAS_CONVERT16TO8ROW_AVX2
5125
// Use scale to convert to lsb formats depending how many bits there are:
// 512 = 9 bits
// 1024 = 10 bits
// 4096 = 12 bits
// TODO(fbarchard): reduce to SSE2
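// A scalar sketch (illustrative only) of the widening in Convert8To16Row:
// punpcklbw against itself duplicates each byte into a 16-bit lane
// (x * 0x0101), and pmulhuw then scales that back down:
//   uint16_t wide = (uint16_t)(src_y[i] * 0x0101u);
//   dst_y[i] = (uint16_t)(((uint32_t)wide * (uint32_t)scale) >> 16);
// e.g. scale = 1024 yields 10-bit values in the low bits (255 -> 1023).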
void Convert8To16Row_SSE2(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
    "movd %3,%%xmm2 \n"
    "punpcklwd %%xmm2,%%xmm2 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"

    // 16 pixels per loop.
    LABELALIGN
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "add $0x10,%0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0,(%1) \n"
    "movdqu %%xmm1,0x10(%1) \n"
    "add $0x20,%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),  // %0
    "+r"(dst_y),  // %1
    "+r"(width)   // %2
  : "r"(scale)    // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}

#ifdef HAS_CONVERT8TO16ROW_AVX2
void Convert8To16Row_AVX2(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
    "vmovd %3,%%xmm2 \n"
    "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
    "vbroadcastss %%xmm2,%%ymm2 \n"

    // 32 pixels per loop.
    LABELALIGN
    "1: \n"
    "vmovdqu (%0),%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "add $0x20,%0 \n"
    "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
    "vmovdqu %%ymm0,(%1) \n"
    "vmovdqu %%ymm1,0x20(%1) \n"
    "add $0x40,%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_y),  // %0
    "+r"(dst_y),  // %1
    "+r"(width)   // %2
  : "r"(scale)    // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
#endif  // HAS_CONVERT8TO16ROW_AVX2

#ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.
static const uvec8 kSplitRGBShuffle[9] = {
    {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u,
     128u, 128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u,
     7u, 10u, 13u},
    {1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
     128u, 128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u,
     128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u,
     8u, 11u, 14u},
    {2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
     128u, 128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u,
     128u, 128u, 128u},
    {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u,
     9u, 12u, 15u}};

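// How the nine masks are used (a sketch, not library code): each output plane
// is gathered from the same three 16-byte loads of interleaved RGB. A 128
// entry makes pshufb write zero, so the three shuffled results can be OR'd:
//   plane0 = pshufb(in0, mask[0]) | pshufb(in1, mask[1]) | pshufb(in2, mask[2])
// and masks [3..5] and [6..8] gather the second and third channels.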
void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "pshufb 0(%5),%%xmm0 \n"
      "pshufb 16(%5),%%xmm1 \n"
      "pshufb 32(%5),%%xmm2 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"

      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "pshufb 48(%5),%%xmm0 \n"
      "pshufb 64(%5),%%xmm1 \n"
      "pshufb 80(%5),%%xmm2 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"

      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "pshufb 96(%5),%%xmm0 \n"
      "pshufb 112(%5),%%xmm1 \n"
      "pshufb 128(%5),%%xmm2 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%3) \n"
      "lea 0x10(%3),%3 \n"
      "lea 0x30(%0),%0 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      : "+r"(src_rgb),  // %0
        "+r"(dst_r),    // %1
        "+r"(dst_g),    // %2
        "+r"(dst_b),    // %3
        "+r"(width)     // %4
      : "r"(&kSplitRGBShuffle[0])  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_SPLITRGBROW_SSSE3

#ifdef HAS_MERGERGBROW_SSSE3
// Shuffle table for converting Planar to RGB.
static const uvec8 kMergeRGBShuffle[9] = {
    {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u,
     128u, 5u},
    {128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u,
     128u, 128u},
    {128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u,
     128u, 4u, 128u},
    {128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u,
     128u, 10u, 128u},
    {5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u,
     128u, 10u},
    {128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u,
     128u, 128u},
    {128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u,
     15u, 128u, 128u},
    {128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u,
     128u, 15u, 128u},
    {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u,
     128u, 128u, 15u}};

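// Sketch of the merge (illustrative only): for each 16 bytes taken from a
// plane, three pshufb/por rounds scatter the bytes into 48 bytes of packed
// RGB. In mask[0] above, entry 0 routes plane byte 0 to output byte 0, and
// each 128 leaves a zero hole for the other two planes to OR into.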
void MergeRGBRow_SSSE3(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_rgb,
                       int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu (%1),%%xmm1 \n"
      "movdqu (%2),%%xmm2 \n"
      "pshufb (%5),%%xmm0 \n"
      "pshufb 16(%5),%%xmm1 \n"
      "pshufb 32(%5),%%xmm2 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%3) \n"

      "movdqu (%0),%%xmm0 \n"
      "movdqu (%1),%%xmm1 \n"
      "movdqu (%2),%%xmm2 \n"
      "pshufb 48(%5),%%xmm0 \n"
      "pshufb 64(%5),%%xmm1 \n"
      "pshufb 80(%5),%%xmm2 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,16(%3) \n"

      "movdqu (%0),%%xmm0 \n"
      "movdqu (%1),%%xmm1 \n"
      "movdqu (%2),%%xmm2 \n"
      "pshufb 96(%5),%%xmm0 \n"
      "pshufb 112(%5),%%xmm1 \n"
      "pshufb 128(%5),%%xmm2 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,32(%3) \n"

      "lea 0x10(%0),%0 \n"
      "lea 0x10(%1),%1 \n"
      "lea 0x10(%2),%2 \n"
      "lea 0x30(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      : "+r"(src_r),    // %0
        "+r"(src_g),    // %1
        "+r"(src_b),    // %2
        "+r"(dst_rgb),  // %3
        "+r"(width)     // %4
      : "r"(&kMergeRGBShuffle[0])  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGERGBROW_SSSE3

#ifdef HAS_MERGEARGBROW_SSE2
void MergeARGBRow_SSE2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(

      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "sub %0,%3 \n"

      LABELALIGN
      "1: \n"

      "movq (%0,%2),%%xmm0 \n"  // B
      "movq (%0),%%xmm1 \n"  // R
      "movq (%0,%1),%%xmm2 \n"  // G
      "punpcklbw %%xmm1,%%xmm0 \n"  // BR
      "movq (%0,%3),%%xmm1 \n"  // A
      "punpcklbw %%xmm1,%%xmm2 \n"  // GA
      "movdqa %%xmm0,%%xmm1 \n"  // BR
      "punpckhbw %%xmm2,%%xmm1 \n"  // BGRA (hi)
      "punpcklbw %%xmm2,%%xmm0 \n"  // BGRA (lo)
      "movdqu %%xmm0,(%4) \n"
      "movdqu %%xmm1,16(%4) \n"

      "lea 8(%0),%0 \n"
      "lea 32(%4),%4 \n"
      "sub $0x8,%5 \n"
      "jg 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif

#ifdef HAS_MERGEXRGBROW_SSE2
void MergeXRGBRow_SSE2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(

      LABELALIGN
      "1: \n"

      "movq (%2),%%xmm0 \n"  // B
      "movq (%0),%%xmm1 \n"  // R
      "movq (%1),%%xmm2 \n"  // G
      "punpcklbw %%xmm1,%%xmm0 \n"  // BR
      "pcmpeqd %%xmm1,%%xmm1 \n"  // A(255)
      "punpcklbw %%xmm1,%%xmm2 \n"  // GA
      "movdqa %%xmm0,%%xmm1 \n"  // BR
      "punpckhbw %%xmm2,%%xmm1 \n"  // BGRA (hi)
      "punpcklbw %%xmm2,%%xmm0 \n"  // BGRA (lo)
      "movdqu %%xmm0,(%3) \n"
      "movdqu %%xmm1,16(%3) \n"

      "lea 8(%0),%0 \n"
      "lea 8(%1),%1 \n"
      "lea 8(%2),%2 \n"
      "lea 32(%3),%3 \n"
      "sub $0x8,%4 \n"
      "jg 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGEXRGBROW_SSE2

#ifdef HAS_MERGEARGBROW_AVX2
void MergeARGBRow_AVX2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(

      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "sub %0,%3 \n"

      LABELALIGN
      "1: \n"

5451 "vmovdqu (%0,%2),%%xmm0 \n" // B
5452 "vmovdqu (%0,%1),%%xmm1 \n" // R
5453 "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G
5454 "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // A
5455 "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
5456 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
5457 "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
5458 "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
5459 "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
5460 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
5461 "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
5462 "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
5463 "vmovdqu %%ymm0,(%4) \n" // First 8
5464 "vmovdqu %%ymm1,32(%4) \n" // Next 8
5465
5466 "lea 16(%0),%0 \n"
5467 "lea 64(%4),%4 \n"
5468 "sub $0x10,%5 \n"
5469 "jg 1b \n"
5470 "vzeroupper \n"
5471 : "+r"(src_r), // %0
5472 "+r"(src_g), // %1
5473 "+r"(src_b), // %2
5474 "+r"(src_a), // %3
5475 "+r"(dst_argb), // %4
5476 "+r"(width) // %5
5477 :
5478 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5479 }
5480 #endif
5481
#ifdef HAS_MERGEXRGBROW_AVX2
void MergeXRGBRow_AVX2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(

      LABELALIGN
      "1: \n"

5493 "vmovdqu (%2),%%xmm0 \n" // B
5494 "vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n" // A(255)
5495 "vinserti128 $0,(%1),%%ymm1,%%ymm1 \n" // R
5496 "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G
5497 "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
5498 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
5499 "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
5500 "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
5501 "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
5502 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
5503 "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
5504 "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
5505 "vmovdqu %%ymm0,(%3) \n" // First 8
5506 "vmovdqu %%ymm1,32(%3) \n" // Next 8
5507
5508 "lea 16(%0),%0 \n"
5509 "lea 16(%1),%1 \n"
5510 "lea 16(%2),%2 \n"
5511 "lea 64(%3),%3 \n"
5512 "sub $0x10,%4 \n"
5513 "jg 1b \n"
5514 "vzeroupper \n"
5515 : "+r"(src_r), // %0
5516 "+r"(src_g), // %1
5517 "+r"(src_b), // %2
5518 "+r"(dst_argb), // %3
5519 "+rm"(width) // %4
5520 :
5521 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5522 }
#endif  // HAS_MERGEXRGBROW_AVX2

#ifdef HAS_SPLITARGBROW_SSE2
void SplitARGBRow_SSE2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       uint8_t* dst_a,
                       int width) {
  asm volatile(

      "sub %1,%2 \n"
      "sub %1,%3 \n"
      "sub %1,%4 \n"

      LABELALIGN
      "1: \n"

      "movdqu (%0),%%xmm0 \n"    // 00-0F
      "movdqu 16(%0),%%xmm1 \n"  // 10-1F
      "movdqa %%xmm0,%%xmm2 \n"
      "punpcklqdq %%xmm1,%%xmm0 \n"  // 00-07 10-17
      "punpckhqdq %%xmm1,%%xmm2 \n"  // 08-0F 18-1F
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"  // 08192A3B4C5D6E7F (lo)
      "punpckhbw %%xmm2,%%xmm1 \n"  // 08192A3B4C5D6E7F (hi)
      "movdqa %%xmm0,%%xmm2 \n"
      "punpcklqdq %%xmm1,%%xmm0 \n"  // 08192A3B08192A3B
      "punpckhqdq %%xmm1,%%xmm2 \n"  // 4C5D6E7F4C5D6E7F
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"  // 048C159D26AE37BF (lo)
      "punpckhbw %%xmm2,%%xmm1 \n"  // 048C159D26AE37BF (hi)
      "movdqa %%xmm0,%%xmm2 \n"
      "punpckldq %%xmm1,%%xmm0 \n"  // 048C048C159D159D (BG)
      "punpckhdq %%xmm1,%%xmm2 \n"  // 26AE26AE37BF37BF (RA)
      "movlps %%xmm0,(%1,%3) \n"  // B
      "movhps %%xmm0,(%1,%2) \n"  // G
      "movlps %%xmm2,(%1) \n"  // R
      "movhps %%xmm2,(%1,%4) \n"  // A

      "lea 32(%0),%0 \n"
      "lea 8(%1),%1 \n"
      "sub $0x8,%5 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),     // %1
        "+r"(dst_g),     // %2
        "+r"(dst_b),     // %3
        "+r"(dst_a),     // %4
        "+rm"(width)     // %5
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif

#ifdef HAS_SPLITXRGBROW_SSE2
void SplitXRGBRow_SSE2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(

      LABELALIGN
      "1: \n"

      "movdqu (%0),%%xmm0 \n"    // 00-0F
      "movdqu 16(%0),%%xmm1 \n"  // 10-1F
      "movdqa %%xmm0,%%xmm2 \n"
      "punpcklqdq %%xmm1,%%xmm0 \n"  // 00-07 10-17
      "punpckhqdq %%xmm1,%%xmm2 \n"  // 08-0F 18-1F
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"  // 08192A3B4C5D6E7F (lo)
      "punpckhbw %%xmm2,%%xmm1 \n"  // 08192A3B4C5D6E7F (hi)
      "movdqa %%xmm0,%%xmm2 \n"
      "punpcklqdq %%xmm1,%%xmm0 \n"  // 08192A3B08192A3B
      "punpckhqdq %%xmm1,%%xmm2 \n"  // 4C5D6E7F4C5D6E7F
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"  // 048C159D26AE37BF (lo)
      "punpckhbw %%xmm2,%%xmm1 \n"  // 048C159D26AE37BF (hi)
      "movdqa %%xmm0,%%xmm2 \n"
      "punpckldq %%xmm1,%%xmm0 \n"  // 048C048C159D159D (BG)
      "punpckhdq %%xmm1,%%xmm2 \n"  // 26AE26AE37BF37BF (RA)
      "movlps %%xmm0,(%3) \n"  // B
      "movhps %%xmm0,(%2) \n"  // G
      "movlps %%xmm2,(%1) \n"  // R

      "lea 32(%0),%0 \n"
      "lea 8(%1),%1 \n"
      "lea 8(%2),%2 \n"
      "lea 8(%3),%3 \n"
      "sub $0x8,%4 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),     // %1
        "+r"(dst_g),     // %2
        "+r"(dst_b),     // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif

static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13,
                                            2, 6, 10, 14, 3, 7, 11, 15};
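// One pshufb with this mask transposes 4 ARGB pixels (B,G,R,A bytes in
// memory) into planar order within the register. A per-16-byte sketch
// (illustrative only):
//   out[0..3] = in[0], in[4], in[8],  in[12]  // channel 0 of each pixel
//   out[4..7] = in[1], in[5], in[9],  in[13]  // channel 1, and so on.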
#ifdef HAS_SPLITARGBROW_SSSE3
void SplitARGBRow_SSSE3(const uint8_t* src_argb,
                        uint8_t* dst_r,
                        uint8_t* dst_g,
                        uint8_t* dst_b,
                        uint8_t* dst_a,
                        int width) {
  asm volatile(

      "movdqa %6,%%xmm3 \n"
      "sub %1,%2 \n"
      "sub %1,%3 \n"
      "sub %1,%4 \n"

      LABELALIGN
      "1: \n"

      "movdqu (%0),%%xmm0 \n"    // 00-0F
      "movdqu 16(%0),%%xmm1 \n"  // 10-1F
      "pshufb %%xmm3,%%xmm0 \n"  // 048C159D26AE37BF (lo)
      "pshufb %%xmm3,%%xmm1 \n"  // 048C159D26AE37BF (hi)
      "movdqa %%xmm0,%%xmm2 \n"
      "punpckldq %%xmm1,%%xmm0 \n"  // 048C048C159D159D (BG)
      "punpckhdq %%xmm1,%%xmm2 \n"  // 26AE26AE37BF37BF (RA)
      "movlps %%xmm0,(%1,%3) \n"  // B
      "movhps %%xmm0,(%1,%2) \n"  // G
      "movlps %%xmm2,(%1) \n"  // R
      "movhps %%xmm2,(%1,%4) \n"  // A

      "lea 32(%0),%0 \n"
      "lea 8(%1),%1 \n"
      "subl $0x8,%5 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),     // %1
        "+r"(dst_g),     // %2
        "+r"(dst_b),     // %3
        "+r"(dst_a),     // %4
#if defined(__i386__)
        "+m"(width)  // %5
#else
        "+rm"(width)  // %5
#endif
      : "m"(kShuffleMaskARGBSplit)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif

#ifdef HAS_SPLITXRGBROW_SSSE3
void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
                        uint8_t* dst_r,
                        uint8_t* dst_g,
                        uint8_t* dst_b,
                        int width) {
  asm volatile(

      "movdqa %5,%%xmm3 \n"

      LABELALIGN
      "1: \n"

      "movdqu (%0),%%xmm0 \n"    // 00-0F
      "movdqu 16(%0),%%xmm1 \n"  // 10-1F
      "pshufb %%xmm3,%%xmm0 \n"  // 048C159D26AE37BF (lo)
      "pshufb %%xmm3,%%xmm1 \n"  // 048C159D26AE37BF (hi)
      "movdqa %%xmm0,%%xmm2 \n"
      "punpckldq %%xmm1,%%xmm0 \n"  // 048C048C159D159D (BG)
      "punpckhdq %%xmm1,%%xmm2 \n"  // 26AE26AE37BF37BF (RA)
      "movlps %%xmm0,(%3) \n"  // B
      "movhps %%xmm0,(%2) \n"  // G
      "movlps %%xmm2,(%1) \n"  // R

      "lea 32(%0),%0 \n"
      "lea 8(%1),%1 \n"
      "lea 8(%2),%2 \n"
      "lea 8(%3),%3 \n"
      "sub $0x8,%4 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),     // %1
        "+r"(dst_g),     // %2
        "+r"(dst_b),     // %3
        "+r"(width)      // %4
      : "m"(kShuffleMaskARGBSplit)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif

#ifdef HAS_SPLITARGBROW_AVX2
static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7};
void SplitARGBRow_AVX2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       uint8_t* dst_a,
                       int width) {
  asm volatile(

      "sub %1,%2 \n"
      "sub %1,%3 \n"
      "sub %1,%4 \n"
      "vmovdqa %7,%%ymm3 \n"
      "vbroadcastf128 %6,%%ymm4 \n"

      LABELALIGN
      "1: \n"

      "vmovdqu (%0),%%xmm0 \n"    // 00-0F
      "vmovdqu 16(%0),%%xmm1 \n"  // 10-1F
      "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n"  // 00-0F 20-2F
      "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n"  // 10-1F 30-3F
      "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
      "vpermd %%ymm0,%%ymm3,%%ymm0 \n"
      "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
      "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"  // GA
      "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"  // BR
      "vmovdqu %%xmm0,(%1,%3) \n"  // B
      "vextracti128 $1,%%ymm0,(%1) \n"  // R
      "vmovdqu %%xmm2,(%1,%2) \n"  // G
      "vextracti128 $1,%%ymm2,(%1,%4) \n"  // A
      "lea 64(%0),%0 \n"
      "lea 16(%1),%1 \n"
      "subl $0x10,%5 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),     // %1
        "+r"(dst_g),     // %2
        "+r"(dst_b),     // %3
        "+r"(dst_a),     // %4
#if defined(__i386__)
        "+m"(width)  // %5
#else
        "+rm"(width)  // %5
#endif
      : "m"(kShuffleMaskARGBSplit),   // %6
        "m"(kShuffleMaskARGBPermute)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_SPLITXRGBROW_AVX2
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(

      "vmovdqa %6,%%ymm3 \n"
      "vbroadcastf128 %5,%%ymm4 \n"

      LABELALIGN
      "1: \n"

      "vmovdqu (%0),%%xmm0 \n"    // 00-0F
      "vmovdqu 16(%0),%%xmm1 \n"  // 10-1F
      "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n"  // 00-0F 20-2F
      "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n"  // 10-1F 30-3F
      "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
      "vpermd %%ymm0,%%ymm3,%%ymm0 \n"
      "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
      "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"  // GA
      "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"  // BR
      "vmovdqu %%xmm0,(%3) \n"  // B
      "vextracti128 $1,%%ymm0,(%1) \n"  // R
      "vmovdqu %%xmm2,(%2) \n"  // G

      "lea 64(%0),%0 \n"
      "lea 16(%1),%1 \n"
      "lea 16(%2),%2 \n"
      "lea 16(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),     // %1
        "+r"(dst_g),     // %2
        "+r"(dst_b),     // %3
        "+r"(width)      // %4
      : "m"(kShuffleMaskARGBSplit),   // %5
        "m"(kShuffleMaskARGBPermute)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_MERGEXR30ROW_AVX2
void MergeXR30Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint8_t* dst_ar30,
                       int depth,
                       int width) {
  int shift = depth - 10;
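  // Packing sketch (illustrative only), assuming little-endian AR30 with
  // 10-bit channels after the shift: each output dword is
  //   (3u << 30) | (min(r, 1023) << 20) | (min(g, 1023) << 10) | min(b, 1023)
  // i.e. alpha is forced to 3 (opaque) by the constants built in ymm5.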
  asm volatile(

      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
      "vpsrlw $14,%%ymm5,%%ymm5 \n"
      "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
      "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"
      "vpsrlw $6,%%ymm6,%%ymm6 \n"
      "vmovd %5,%%xmm4 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu (%0,%1),%%ymm1 \n"
      "vmovdqu (%0,%2),%%ymm2 \n"
      "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n"
      "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n"
      "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n"
      "vpminuw %%ymm0,%%ymm6,%%ymm0 \n"
      "vpminuw %%ymm1,%%ymm6,%%ymm1 \n"
      "vpminuw %%ymm2,%%ymm6,%%ymm2 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm2,%%ymm2 \n"
      "vpsllw $0x4,%%ymm0,%%ymm0 \n"  // Shift R to target bit
      "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n"  // RB
      "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n"
      "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n"  // AG
      "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n"
      "vpslld $0xa,%%ymm1,%%ymm1 \n"  // Shift AG to target bit
      "vpslld $0xa,%%ymm2,%%ymm2 \n"
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"  // Combine
      "vpor %%ymm2,%%ymm3,%%ymm3 \n"
      "vmovdqu %%ymm0,(%3) \n"
      "vmovdqu %%ymm3,0x20(%3) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x40(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)      // %4
#if defined(__i386__)
      : "m"(shift)  // %5
#else
      : "rm"(shift)  // %5
#endif
5875 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
5876 }
5877 #endif

#ifdef HAS_MERGEAR64ROW_AVX2
static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7};
void MergeAR64Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       const uint16_t* src_a,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  mask = (mask << 16) + mask;
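  // Scalar sketch of one output pixel (illustrative only): each channel is
  // clamped to the depth's maximum and shifted up to 16 bits, then stored
  // interleaved:
  //   uint16_t b = (uint16_t)(min(src_b[i], mask & 0xffff) << shift);
  //   dst_ar64[4 * i + 0] = b;  // likewise g, r, a at +1, +2, +3.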
  asm volatile(

      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "sub %0,%3 \n"
      "vmovdqa %8,%%ymm5 \n"
      "vmovd %6,%%xmm6 \n"
      "vbroadcastss %7,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"     // R
      "vmovdqu (%0,%1),%%ymm1 \n"  // G
      "vmovdqu (%0,%2),%%ymm2 \n"  // B
      "vmovdqu (%0,%3),%%ymm3 \n"  // A
      "vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
      "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
      "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
      "vpminuw %%ymm3,%%ymm7,%%ymm3 \n"
      "vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
      "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
      "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
      "vpsllw %%xmm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm5,%%ymm0 \n"
      "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm5,%%ymm3 \n"
      "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n"  // BG(low)
      "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"  // BG(hi)
      "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n"  // RA(low)
      "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n"  // RA(hi)
      "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n"  // BGRA(1)
      "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n"  // BGRA(3)
      "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n"  // BGRA(2)
      "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n"  // BGRA(4)
      "vmovdqu %%ymm3,(%4) \n"
      "vmovdqu %%ymm2,0x20(%4) \n"
      "vmovdqu %%ymm4,0x40(%4) \n"
      "vmovdqu %%ymm1,0x60(%4) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x80(%4),%4 \n"
      "subl $0x10,%5 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_ar64),  // %4
#if defined(__i386__)
        "+m"(width)  // %5
#else
        "+rm"(width)  // %5
#endif
      : "m"(shift),            // %6
        "m"(mask),             // %7
        "m"(MergeAR64Permute)  // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
#endif

#ifdef HAS_MERGEXR64ROW_AVX2
void MergeXR64Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  mask = (mask << 16) + mask;
  asm volatile(

      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "vmovdqa %7,%%ymm5 \n"
      "vmovd %5,%%xmm6 \n"
      "vbroadcastss %6,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"     // R
      "vmovdqu (%0,%1),%%ymm1 \n"  // G
      "vmovdqu (%0,%2),%%ymm2 \n"  // B
      "vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
      "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
      "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
      "vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
      "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
      "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
      "vpermd %%ymm0,%%ymm5,%%ymm0 \n"
      "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"  // A (0xffff)
      "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n"  // BG(low)
      "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"  // BG(hi)
      "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n"  // RA(low)
      "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n"  // RA(hi)
      "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n"  // BGRA(1)
      "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n"  // BGRA(3)
      "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n"  // BGRA(2)
      "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n"  // BGRA(4)
      "vmovdqu %%ymm3,(%3) \n"
      "vmovdqu %%ymm2,0x20(%3) \n"
      "vmovdqu %%ymm4,0x40(%3) \n"
      "vmovdqu %%ymm1,0x60(%3) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x80(%3),%3 \n"
      "subl $0x10,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar64),  // %3
        "+r"(width)      // %4
      : "m"(shift),            // %5
        "m"(mask),             // %6
        "m"(MergeAR64Permute)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
#endif

#ifdef HAS_MERGEARGB16TO8ROW_AVX2
static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11,
                                            4, 12, 5, 13, 6, 14, 7, 15};
void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            const uint16_t* src_a,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = depth - 8;
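  // Scalar sketch (illustrative only): each 16-bit channel drops its low
  // (depth - 8) bits, saturates to 8 bits via vpackuswb, and is stored as
  // interleaved B,G,R,A bytes:
  //   dst_argb[4 * i + 0] = (uint8_t)min(src_b[i] >> shift, 255);  // etc.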
  asm volatile(

      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "sub %0,%3 \n"
      "vbroadcastf128 %7,%%ymm5 \n"
      "vmovd %6,%%xmm6 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"     // R
      "vmovdqu (%0,%1),%%ymm1 \n"  // G
      "vmovdqu (%0,%2),%%ymm2 \n"  // B
      "vmovdqu (%0,%3),%%ymm3 \n"  // A
      "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"
      "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
      "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
      "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n"
      "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n"  // BG (planar)
      "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"  // RA (planar)
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"  // BG (interleave)
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // RA (interleave)
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n"  // BGRA (low)
      "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n"  // BGRA (hi)
      "vmovdqu %%ymm2,(%4) \n"
      "vmovdqu %%ymm0,0x20(%4) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x40(%4),%4 \n"
      "subl $0x10,%5 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
#if defined(__i386__)
        "+m"(width)  // %5
#else
        "+rm"(width)  // %5
#endif
      : "m"(shift),                 // %6
        "m"(MergeARGB16To8Shuffle)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif

#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = depth - 8;
  asm volatile(

      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "vbroadcastf128 %6,%%ymm5 \n"
      "vmovd %5,%%xmm6 \n"
      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
      "vpsrlw $8,%%ymm3,%%ymm3 \n"  // A (0xff)

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"     // R
      "vmovdqu (%0,%1),%%ymm1 \n"  // G
      "vmovdqu (%0,%2),%%ymm2 \n"  // B
      "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"
      "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
      "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
      "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n"  // BG (planar)
      "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"  // RA (planar)
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"  // BG (interleave)
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // RA (interleave)
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n"  // BGRA (low)
      "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n"  // BGRA (hi)
      "vmovdqu %%ymm2,(%3) \n"
      "vmovdqu %%ymm0,0x20(%3) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x40(%3),%3 \n"
      "subl $0x10,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : "m"(shift),                 // %5
        "m"(MergeARGB16To8Shuffle)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif

#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "test $0xf,%0 \n"
      "jne 2f \n"
      "test $0xf,%1 \n"
      "jne 2f \n"

      LABELALIGN
      "1: \n"
      "movdqa (%0),%%xmm0 \n"
      "movdqa 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "movdqa %%xmm0,(%1) \n"
      "movdqa %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "jmp 9f \n"

      LABELALIGN
      "2: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 2b \n"

      LABELALIGN
      "9: \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_AVX
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x40,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_COPYROW_AVX

#ifdef HAS_COPYROW_ERMS
// Multiple of 1.
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile(

      "rep movsb \n"
      : "+S"(src),       // %0
        "+D"(dst),       // %1
        "+c"(width_tmp)  // %2
      :
      : "memory", "cc");
}
#endif  // HAS_COPYROW_ERMS

#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm0,%%xmm0 \n"
      "pslld $0x18,%%xmm0 \n"
      "pcmpeqb %%xmm1,%%xmm1 \n"
      "psrld $0x8,%%xmm1 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm2 \n"
      "movdqu 0x10(%0),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "movdqu (%1),%%xmm4 \n"
      "movdqu 0x10(%1),%%xmm5 \n"
      "pand %%xmm0,%%xmm2 \n"
      "pand %%xmm0,%%xmm3 \n"
      "pand %%xmm1,%%xmm4 \n"
      "pand %%xmm1,%%xmm5 \n"
      "por %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm2,(%1) \n"
      "movdqu %%xmm3,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBCOPYALPHAROW_SSE2

#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpsrld $0x8,%%ymm0,%%ymm0 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm1 \n"
      "vmovdqu 0x20(%0),%%ymm2 \n"
      "lea 0x40(%0),%0 \n"
      "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
      "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm1,(%1) \n"
      "vmovdqu %%ymm2,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_ARGBCOPYALPHAROW_AVX2

#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "psrld $0x18,%%xmm0 \n"
      "psrld $0x18,%%xmm1 \n"
      "packssdw %%xmm1,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+rm"(width)     // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2

#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
static const uvec8 kShuffleAlphaShort_AVX2 = {
    3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};

void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "vmovdqa %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // vpsrld $0x18, %%ymm0
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n"  // mutates
      "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
      "vpackssdw %%ymm3,%%ymm2,%%ymm2 \n"  // mutates
      "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n"  // mutates.
      "vpermd %%ymm0,%%ymm4,%%ymm0 \n"  // unmutate.
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+rm"(width)     // %2
      : "m"(kPermdARGBToY_AVX),       // %3
        "m"(kShuffleAlphaShort_AVX2)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2

#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm0,%%xmm0 \n"
      "pslld $0x18,%%xmm0 \n"
      "pcmpeqb %%xmm1,%%xmm1 \n"
      "psrld $0x8,%%xmm1 \n"

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm2 \n"
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm2,%%xmm2 \n"
      "punpckhwd %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm2,%%xmm2 \n"
      "movdqu (%1),%%xmm4 \n"
      "movdqu 0x10(%1),%%xmm5 \n"
      "pand %%xmm0,%%xmm2 \n"
      "pand %%xmm0,%%xmm3 \n"
      "pand %%xmm1,%%xmm4 \n"
      "pand %%xmm1,%%xmm5 \n"
      "por %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm2,(%1) \n"
      "movdqu %%xmm3,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpsrld $0x8,%%ymm0,%%ymm0 \n"

      LABELALIGN
      "1: \n"
      "vpmovzxbd (%0),%%ymm1 \n"
      "vpmovzxbd 0x8(%0),%%ymm2 \n"
      "lea 0x10(%0),%0 \n"
      "vpslld $0x18,%%ymm1,%%ymm1 \n"
      "vpslld $0x18,%%ymm2,%%ymm2 \n"
      "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
      "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm1,(%1) \n"
      "vmovdqu %%ymm2,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2

#ifdef HAS_SETROW_X86
void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width >> 2);
  const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
  asm volatile(

      "rep stosl \n"
      : "+D"(dst),       // %0
        "+c"(width_tmp)  // %1
      : "a"(v32)         // %2
      : "memory", "cc");
}

void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile(

      "rep stosb \n"
      : "+D"(dst),       // %0
        "+c"(width_tmp)  // %1
      : "a"(v8)          // %2
      : "memory", "cc");
}

void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile(

      "rep stosl \n"
      : "+D"(dst_argb),  // %0
        "+c"(width_tmp)  // %1
      : "a"(v32)         // %2
      : "memory", "cc");
}
#endif  // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pand %%xmm5,%%xmm0 \n"
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x00(%0,%4,1),%%xmm2 \n"
      "movdqu 0x10(%0,%4,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "pavgb %%xmm3,%%xmm1 \n"
      "psrlw $0x8,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm1 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}

void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "psrlw $0x8,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm1 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "psrlw $0x8,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}

void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x00(%0,%4,1),%%xmm2 \n"
      "movdqu 0x10(%0,%4,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "pavgb %%xmm3,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm1 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}

void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pand %%xmm5,%%xmm0 \n"
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm1 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_YUY2TOYROW_SSE2

#ifdef HAS_YUY2TOYROW_AVX2
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm1 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm1,(%1) \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm1 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm1,(%1) \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm1 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm1,(%1) \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm1 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm1,(%1) \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_YUY2TOYROW_AVX2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};

// Blend 4 pixels at a time, with a 1 pixel loop for the remainder.
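// Scalar sketch of the blend below (illustrative only): a premultiplied
// "over" with the result alpha forced opaque. src_argb is the foreground,
// src_argb1 the background:
//   f = 256 - src_a;
//   dst_b = min(255, src_b + ((src1_b * f) >> 8));  // likewise g, r
//   dst_a = 255;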
void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
                        const uint8_t* src_argb1,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psrlw $0xf,%%xmm7 \n"
      "pcmpeqb %%xmm6,%%xmm6 \n"
      "psrlw $0x8,%%xmm6 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psllw $0x8,%%xmm5 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "pslld $0x18,%%xmm4 \n"
      "sub $0x4,%3 \n"
      "jl 49f \n"

      // 4 pixel loop.
      LABELALIGN
      "40: \n"
      "movdqu (%0),%%xmm3 \n"
      "lea 0x10(%0),%0 \n"
      "movdqa %%xmm3,%%xmm0 \n"
      "pxor %%xmm4,%%xmm3 \n"
      "movdqu (%1),%%xmm2 \n"
      "pshufb %4,%%xmm3 \n"
      "pand %%xmm6,%%xmm2 \n"
      "paddw %%xmm7,%%xmm3 \n"
      "pmullw %%xmm3,%%xmm2 \n"
      "movdqu (%1),%%xmm1 \n"
      "lea 0x10(%1),%1 \n"
      "psrlw $0x8,%%xmm1 \n"
      "por %%xmm4,%%xmm0 \n"
      "pmullw %%xmm3,%%xmm1 \n"
      "psrlw $0x8,%%xmm2 \n"
      "paddusb %%xmm2,%%xmm0 \n"
      "pand %%xmm5,%%xmm1 \n"
      "paddusb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jge 40b \n"

      "49: \n"
      "add $0x3,%3 \n"
      "jl 99f \n"

      // 1 pixel loop.
      "91: \n"
      "movd (%0),%%xmm3 \n"
      "lea 0x4(%0),%0 \n"
      "movdqa %%xmm3,%%xmm0 \n"
      "pxor %%xmm4,%%xmm3 \n"
      "movd (%1),%%xmm2 \n"
      "pshufb %4,%%xmm3 \n"
      "pand %%xmm6,%%xmm2 \n"
      "paddw %%xmm7,%%xmm3 \n"
      "pmullw %%xmm3,%%xmm2 \n"
      "movd (%1),%%xmm1 \n"
      "lea 0x4(%1),%1 \n"
      "psrlw $0x8,%%xmm1 \n"
      "por %%xmm4,%%xmm0 \n"
      "pmullw %%xmm3,%%xmm1 \n"
      "psrlw $0x8,%%xmm2 \n"
      "paddusb %%xmm2,%%xmm0 \n"
      "pand %%xmm5,%%xmm1 \n"
      "paddusb %%xmm1,%%xmm0 \n"
      "movd %%xmm0,(%2) \n"
      "lea 0x4(%2),%2 \n"
      "sub $0x1,%3 \n"
      "jge 91b \n"
      "99: \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      : "m"(kShuffleAlpha)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
#endif  // HAS_ARGBBLENDROW_SSSE3

#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
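// A scalar sketch of the loop below (illustrative only):
//   int32_t v = (src0[i] - 128) * a[i] + (src1[i] - 128) * (255 - a[i]);
//   dst[i] = (uint8_t)((v + 32768 + 127) >> 8);
// pmaddubsw forms both products at once from interleaved (src0,src1) bytes.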
void BlendPlaneRow_SSSE3(const uint8_t* src0,
                         const uint8_t* src1,
                         const uint8_t* alpha,
                         uint8_t* dst,
                         int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psllw $0x8,%%xmm5 \n"
      "mov $0x80808080,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "mov $0x807f807f,%%eax \n"
      "movd %%eax,%%xmm7 \n"
      "pshufd $0x0,%%xmm7,%%xmm7 \n"
      "sub %2,%0 \n"
      "sub %2,%1 \n"
      "sub %2,%3 \n"

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "movq (%2),%%xmm0 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"
      "pxor %%xmm5,%%xmm0 \n"
      "movq (%0,%2,1),%%xmm1 \n"
      "movq (%1,%2,1),%%xmm2 \n"
      "punpcklbw %%xmm2,%%xmm1 \n"
      "psubb %%xmm6,%%xmm1 \n"
      "pmaddubsw %%xmm1,%%xmm0 \n"
      "paddw %%xmm7,%%xmm0 \n"
      "psrlw $0x8,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%3,%2,1) \n"
      "lea 0x8(%2),%2 \n"
      "sub $0x8,%4 \n"
      "jg 1b \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_BLENDPLANEROW_SSSE3

6985 #ifdef HAS_BLENDPLANEROW_AVX2
6986 // Blend 32 pixels at a time.
6987 // unsigned version of math
6988 // =((A2*C2)+(B2*(255-C2))+255)/256
6989 // signed version of math
6990 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
6991 void BlendPlaneRow_AVX2(const uint8_t* src0,
6992 const uint8_t* src1,
6993 const uint8_t* alpha,
6994 uint8_t* dst,
6995 int width) {
6996 asm volatile(
6997 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
6998 "vpsllw $0x8,%%ymm5,%%ymm5 \n"
6999 "mov $0x80808080,%%eax \n"
7000 "vmovd %%eax,%%xmm6 \n"
7001 "vbroadcastss %%xmm6,%%ymm6 \n"
7002 "mov $0x807f807f,%%eax \n"
7003 "vmovd %%eax,%%xmm7 \n"
7004 "vbroadcastss %%xmm7,%%ymm7 \n"
7005 "sub %2,%0 \n"
7006 "sub %2,%1 \n"
7007 "sub %2,%3 \n"
7008
7009 // 32 pixel loop.
7010 LABELALIGN
7011 "1: \n"
7012 "vmovdqu (%2),%%ymm0 \n"
7013 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
7014 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
7015 "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
7016 "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
7017 "vmovdqu (%0,%2,1),%%ymm1 \n"
7018 "vmovdqu (%1,%2,1),%%ymm2 \n"
7019 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
7020 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
7021 "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
7022 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
7023 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
7024 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
7025 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
7026 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
7027 "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
7028 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
7029 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
7030 "vmovdqu %%ymm0,(%3,%2,1) \n"
7031 "lea 0x20(%2),%2 \n"
7032 "sub $0x20,%4 \n"
7033 "jg 1b \n"
7034 "vzeroupper \n"
7035 : "+r"(src0), // %0
7036 "+r"(src1), // %1
7037 "+r"(alpha), // %2
7038 "+r"(dst), // %3
7039 "+rm"(width) // %4
7040 ::"memory",
7041 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7042 "xmm7");
7043 }
7044 #endif // HAS_BLENDPLANEROW_AVX2
7045
7046 #ifdef HAS_ARGBATTENUATEROW_SSSE3
7047 // Shuffle table duplicating alpha.
7048 static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
7049 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
7050 static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
7051 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
7052 // Attenuate 4 pixels at a time.
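// Conceptually each color channel is scaled by its own alpha, roughly
//   dst[c] = src[c] * src[3] / 255   for c = B, G, R, alpha unchanged;
// the kernel approximates the divide in 8.8 fixed point via pmulhuw.
// Sketch only; exact rounding differs.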
7053 void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
7054 uint8_t* dst_argb,
7055 int width) {
7056 asm volatile(
7057 "pcmpeqb %%xmm3,%%xmm3 \n"
7058 "pslld $0x18,%%xmm3 \n"
7059 "movdqa %3,%%xmm4 \n"
7060 "movdqa %4,%%xmm5 \n"
7061
7062 // 4 pixel loop.
7063 LABELALIGN
7064 "1: \n"
7065 "movdqu (%0),%%xmm0 \n"
7066 "pshufb %%xmm4,%%xmm0 \n"
7067 "movdqu (%0),%%xmm1 \n"
7068 "punpcklbw %%xmm1,%%xmm1 \n"
7069 "pmulhuw %%xmm1,%%xmm0 \n"
7070 "movdqu (%0),%%xmm1 \n"
7071 "pshufb %%xmm5,%%xmm1 \n"
7072 "movdqu (%0),%%xmm2 \n"
7073 "punpckhbw %%xmm2,%%xmm2 \n"
7074 "pmulhuw %%xmm2,%%xmm1 \n"
7075 "movdqu (%0),%%xmm2 \n"
7076 "lea 0x10(%0),%0 \n"
7077 "pand %%xmm3,%%xmm2 \n"
7078 "psrlw $0x8,%%xmm0 \n"
7079 "psrlw $0x8,%%xmm1 \n"
7080 "packuswb %%xmm1,%%xmm0 \n"
7081 "por %%xmm2,%%xmm0 \n"
7082 "movdqu %%xmm0,(%1) \n"
7083 "lea 0x10(%1),%1 \n"
7084 "sub $0x4,%2 \n"
7085 "jg 1b \n"
7086 : "+r"(src_argb), // %0
7087 "+r"(dst_argb), // %1
7088 "+r"(width) // %2
7089 : "m"(kShuffleAlpha0), // %3
7090 "m"(kShuffleAlpha1) // %4
7091 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
7092 }
7093 #endif // HAS_ARGBATTENUATEROW_SSSE3
7094
7095 #ifdef HAS_ARGBATTENUATEROW_AVX2
7096 // Shuffle table duplicating alpha.
7097 static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
7098 128u, 128u, 14u, 15u, 14u, 15u,
7099 14u, 15u, 128u, 128u};
7100 // Attenuate 8 pixels at a time.
7101 void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
7102 uint8_t* dst_argb,
7103 int width) {
7104 asm volatile(
7105 "vbroadcastf128 %3,%%ymm4 \n"
7106 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
7107 "vpslld $0x18,%%ymm5,%%ymm5 \n"
7108 "sub %0,%1 \n"
7109
7110 // 8 pixel loop.
7111 LABELALIGN
7112 "1: \n"
7113 "vmovdqu (%0),%%ymm6 \n"
7114 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
7115 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
7116 "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
7117 "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
7118 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
7119 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
7120 "vpand %%ymm5,%%ymm6,%%ymm6 \n"
7121 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
7122 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
7123 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
7124 "vpor %%ymm6,%%ymm0,%%ymm0 \n"
7125 "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
7126 "lea 0x20(%0),%0 \n"
7127 "sub $0x8,%2 \n"
7128 "jg 1b \n"
7129 "vzeroupper \n"
7130 : "+r"(src_argb), // %0
7131 "+r"(dst_argb), // %1
7132 "+r"(width) // %2
7133 : "m"(kShuffleAlpha_AVX2) // %3
7134 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
7135 }
7136 #endif // HAS_ARGBATTENUATEROW_AVX2
7137
7138 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
7139 // Unattenuate 4 pixels at a time.
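// Conceptually the inverse of attenuation, roughly
//   dst[c] = min(255, src[c] * 255 / src[3])   for c = B, G, R;
// fixed_invtbl8 supplies a fixed-point reciprocal of the alpha value so the
// divide becomes a pmulhuw multiply. Sketch only; exact rounding differs.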
7140 void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
7141 uint8_t* dst_argb,
7142 int width) {
7143 uintptr_t alpha;
7144 asm volatile(
7145 // 4 pixel loop.
7146 LABELALIGN
7147 "1: \n"
7148 "movdqu (%0),%%xmm0 \n"
7149 "movzb 0x03(%0),%3 \n"
7150 "punpcklbw %%xmm0,%%xmm0 \n"
7151 "movd 0x00(%4,%3,4),%%xmm2 \n"
7152 "movzb 0x07(%0),%3 \n"
7153 "movd 0x00(%4,%3,4),%%xmm3 \n"
7154 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
7155 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
7156 "movlhps %%xmm3,%%xmm2 \n"
7157 "pmulhuw %%xmm2,%%xmm0 \n"
7158 "movdqu (%0),%%xmm1 \n"
7159 "movzb 0x0b(%0),%3 \n"
7160 "punpckhbw %%xmm1,%%xmm1 \n"
7161 "movd 0x00(%4,%3,4),%%xmm2 \n"
7162 "movzb 0x0f(%0),%3 \n"
7163 "movd 0x00(%4,%3,4),%%xmm3 \n"
7164 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
7165 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
7166 "movlhps %%xmm3,%%xmm2 \n"
7167 "pmulhuw %%xmm2,%%xmm1 \n"
7168 "lea 0x10(%0),%0 \n"
7169 "packuswb %%xmm1,%%xmm0 \n"
7170 "movdqu %%xmm0,(%1) \n"
7171 "lea 0x10(%1),%1 \n"
7172 "sub $0x4,%2 \n"
7173 "jg 1b \n"
7174 : "+r"(src_argb), // %0
7175 "+r"(dst_argb), // %1
7176 "+r"(width), // %2
7177 "=&r"(alpha) // %3
7178 : "r"(fixed_invtbl8) // %4
7179 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
7180 }
7181 #endif // HAS_ARGBUNATTENUATEROW_SSE2
7182
7183 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
7184 // Shuffle table duplicating alpha.
7185 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
7186 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
7187 // Unattenuate 8 pixels at a time.
7188 void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
7189 uint8_t* dst_argb,
7190 int width) {
7191 uintptr_t alpha;
7192 asm volatile(
7193 "sub %0,%1 \n"
7194 "vbroadcastf128 %5,%%ymm5 \n"
7195
7196 // 8 pixel loop.
7197 LABELALIGN
7198 "1: \n"
7199 // replace VPGATHER
7200 "movzb 0x03(%0),%3 \n"
7201 "vmovd 0x00(%4,%3,4),%%xmm0 \n"
7202 "movzb 0x07(%0),%3 \n"
7203 "vmovd 0x00(%4,%3,4),%%xmm1 \n"
7204 "movzb 0x0b(%0),%3 \n"
7205 "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
7206 "vmovd 0x00(%4,%3,4),%%xmm2 \n"
7207 "movzb 0x0f(%0),%3 \n"
7208 "vmovd 0x00(%4,%3,4),%%xmm3 \n"
7209 "movzb 0x13(%0),%3 \n"
7210 "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
7211 "vmovd 0x00(%4,%3,4),%%xmm0 \n"
7212 "movzb 0x17(%0),%3 \n"
7213 "vmovd 0x00(%4,%3,4),%%xmm1 \n"
7214 "movzb 0x1b(%0),%3 \n"
7215 "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
7216 "vmovd 0x00(%4,%3,4),%%xmm2 \n"
7217 "movzb 0x1f(%0),%3 \n"
7218 "vmovd 0x00(%4,%3,4),%%xmm3 \n"
7219 "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
7220 "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
7221 "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
7222 "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
7223 // end of VPGATHER
7224
7225 "vmovdqu (%0),%%ymm6 \n"
7226 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
7227 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
7228 "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
7229 "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
7230 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
7231 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
7232 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
7233 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
7234 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
7235 "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
7236 "lea 0x20(%0),%0 \n"
7237 "sub $0x8,%2 \n"
7238 "jg 1b \n"
7239 "vzeroupper \n"
7240 : "+r"(src_argb), // %0
7241 "+r"(dst_argb), // %1
7242 "+r"(width), // %2
7243 "=&r"(alpha) // %3
7244 : "r"(fixed_invtbl8), // %4
7245 "m"(kUnattenShuffleAlpha_AVX2) // %5
7246 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7247 "xmm7");
7248 }
7249 #endif // HAS_ARGBUNATTENUATEROW_AVX2
7250
7251 #ifdef HAS_ARGBGRAYROW_SSSE3
7252 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
7253 void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
7254 asm volatile(
7255 "movdqa %3,%%xmm4 \n"
7256 "movdqa %4,%%xmm5 \n"
7257
7258 // 8 pixel loop.
7259 LABELALIGN
7260 "1: \n"
7261 "movdqu (%0),%%xmm0 \n"
7262 "movdqu 0x10(%0),%%xmm1 \n"
7263 "psubb %%xmm5,%%xmm0 \n"
7264 "psubb %%xmm5,%%xmm1 \n"
7265 "movdqu %%xmm4,%%xmm6 \n"
7266 "pmaddubsw %%xmm0,%%xmm6 \n"
7267 "movdqu %%xmm4,%%xmm0 \n"
7268 "pmaddubsw %%xmm1,%%xmm0 \n"
7269 "phaddw %%xmm0,%%xmm6 \n"
7270 "paddw %%xmm5,%%xmm6 \n"
7271 "psrlw $0x8,%%xmm6 \n"
7272 "packuswb %%xmm6,%%xmm6 \n"
7273 "movdqu (%0),%%xmm2 \n"
7274 "movdqu 0x10(%0),%%xmm3 \n"
7275 "lea 0x20(%0),%0 \n"
7276 "psrld $0x18,%%xmm2 \n"
7277 "psrld $0x18,%%xmm3 \n"
7278 "packuswb %%xmm3,%%xmm2 \n"
7279 "packuswb %%xmm2,%%xmm2 \n"
7280 "movdqa %%xmm6,%%xmm3 \n"
7281 "punpcklbw %%xmm6,%%xmm6 \n"
7282 "punpcklbw %%xmm2,%%xmm3 \n"
7283 "movdqa %%xmm6,%%xmm1 \n"
7284 "punpcklwd %%xmm3,%%xmm6 \n"
7285 "punpckhwd %%xmm3,%%xmm1 \n"
7286 "movdqu %%xmm6,(%1) \n"
7287 "movdqu %%xmm1,0x10(%1) \n"
7288 "lea 0x20(%1),%1 \n"
7289 "sub $0x8,%2 \n"
7290 "jg 1b \n"
7291 : "+r"(src_argb), // %0
7292 "+r"(dst_argb), // %1
7293 "+r"(width) // %2
7294 : "m"(kARGBToYJ), // %3
7295 "m"(kSub128) // %4
7296 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
7297 }
7298 #endif // HAS_ARGBGRAYROW_SSSE3
7299
7300 #ifdef HAS_ARGBSEPIAROW_SSSE3
7301 // b = (r * 35 + g * 68 + b * 17) >> 7
7302 // g = (r * 45 + g * 88 + b * 22) >> 7
7303 // r = (r * 50 + g * 98 + b * 24) >> 7
7304 // Constant for ARGB color to sepia tone
7305 static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
7306 17, 68, 35, 0, 17, 68, 35, 0};
7307
7308 static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
7309 22, 88, 45, 0, 22, 88, 45, 0};
7310
7311 static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
7312 24, 98, 50, 0, 24, 98, 50, 0};
7313
7314 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
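// Scalar sketch of the sepia math above (alpha is preserved; min is
// pseudocode for unsigned saturation):
//   new_b = min(255, (17 * b + 68 * g + 35 * r) >> 7);
//   new_g = min(255, (22 * b + 88 * g + 45 * r) >> 7);
//   new_r = min(255, (24 * b + 98 * g + 50 * r) >> 7);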
7315 void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
7316 asm volatile(
7317 "movdqa %2,%%xmm2 \n"
7318 "movdqa %3,%%xmm3 \n"
7319 "movdqa %4,%%xmm4 \n"
7320
7321 // 8 pixel loop.
7322 LABELALIGN
7323 "1: \n"
7324 "movdqu (%0),%%xmm0 \n"
7325 "movdqu 0x10(%0),%%xmm6 \n"
7326 "pmaddubsw %%xmm2,%%xmm0 \n"
7327 "pmaddubsw %%xmm2,%%xmm6 \n"
7328 "phaddw %%xmm6,%%xmm0 \n"
7329 "psrlw $0x7,%%xmm0 \n"
7330 "packuswb %%xmm0,%%xmm0 \n"
7331 "movdqu (%0),%%xmm5 \n"
7332 "movdqu 0x10(%0),%%xmm1 \n"
7333 "pmaddubsw %%xmm3,%%xmm5 \n"
7334 "pmaddubsw %%xmm3,%%xmm1 \n"
7335 "phaddw %%xmm1,%%xmm5 \n"
7336 "psrlw $0x7,%%xmm5 \n"
7337 "packuswb %%xmm5,%%xmm5 \n"
7338 "punpcklbw %%xmm5,%%xmm0 \n"
7339 "movdqu (%0),%%xmm5 \n"
7340 "movdqu 0x10(%0),%%xmm1 \n"
7341 "pmaddubsw %%xmm4,%%xmm5 \n"
7342 "pmaddubsw %%xmm4,%%xmm1 \n"
7343 "phaddw %%xmm1,%%xmm5 \n"
7344 "psrlw $0x7,%%xmm5 \n"
7345 "packuswb %%xmm5,%%xmm5 \n"
7346 "movdqu (%0),%%xmm6 \n"
7347 "movdqu 0x10(%0),%%xmm1 \n"
7348 "psrld $0x18,%%xmm6 \n"
7349 "psrld $0x18,%%xmm1 \n"
7350 "packuswb %%xmm1,%%xmm6 \n"
7351 "packuswb %%xmm6,%%xmm6 \n"
7352 "punpcklbw %%xmm6,%%xmm5 \n"
7353 "movdqa %%xmm0,%%xmm1 \n"
7354 "punpcklwd %%xmm5,%%xmm0 \n"
7355 "punpckhwd %%xmm5,%%xmm1 \n"
7356 "movdqu %%xmm0,(%0) \n"
7357 "movdqu %%xmm1,0x10(%0) \n"
7358 "lea 0x20(%0),%0 \n"
7359 "sub $0x8,%1 \n"
7360 "jg 1b \n"
7361 : "+r"(dst_argb), // %0
7362 "+r"(width) // %1
7363 : "m"(kARGBToSepiaB), // %2
7364 "m"(kARGBToSepiaG), // %3
7365 "m"(kARGBToSepiaR) // %4
7366 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
7367 }
7368 #endif // HAS_ARGBSEPIAROW_SSSE3
7369
7370 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
7371 // Transform 8 ARGB pixels (32 bytes) with color matrix.
7372 // Same as Sepia except matrix is provided.
7373 void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
7374 uint8_t* dst_argb,
7375 const int8_t* matrix_argb,
7376 int width) {
7377 asm volatile(
7378 "movdqu (%3),%%xmm5 \n"
7379 "pshufd $0x00,%%xmm5,%%xmm2 \n"
7380 "pshufd $0x55,%%xmm5,%%xmm3 \n"
7381 "pshufd $0xaa,%%xmm5,%%xmm4 \n"
7382 "pshufd $0xff,%%xmm5,%%xmm5 \n"
7383
7384 // 8 pixel loop.
7385 LABELALIGN
7386 "1: \n"
7387 "movdqu (%0),%%xmm0 \n"
7388 "movdqu 0x10(%0),%%xmm7 \n"
7389 "pmaddubsw %%xmm2,%%xmm0 \n"
7390 "pmaddubsw %%xmm2,%%xmm7 \n"
7391 "movdqu (%0),%%xmm6 \n"
7392 "movdqu 0x10(%0),%%xmm1 \n"
7393 "pmaddubsw %%xmm3,%%xmm6 \n"
7394 "pmaddubsw %%xmm3,%%xmm1 \n"
7395 "phaddsw %%xmm7,%%xmm0 \n"
7396 "phaddsw %%xmm1,%%xmm6 \n"
7397 "psraw $0x6,%%xmm0 \n"
7398 "psraw $0x6,%%xmm6 \n"
7399 "packuswb %%xmm0,%%xmm0 \n"
7400 "packuswb %%xmm6,%%xmm6 \n"
7401 "punpcklbw %%xmm6,%%xmm0 \n"
7402 "movdqu (%0),%%xmm1 \n"
7403 "movdqu 0x10(%0),%%xmm7 \n"
7404 "pmaddubsw %%xmm4,%%xmm1 \n"
7405 "pmaddubsw %%xmm4,%%xmm7 \n"
7406 "phaddsw %%xmm7,%%xmm1 \n"
7407 "movdqu (%0),%%xmm6 \n"
7408 "movdqu 0x10(%0),%%xmm7 \n"
7409 "pmaddubsw %%xmm5,%%xmm6 \n"
7410 "pmaddubsw %%xmm5,%%xmm7 \n"
7411 "phaddsw %%xmm7,%%xmm6 \n"
7412 "psraw $0x6,%%xmm1 \n"
7413 "psraw $0x6,%%xmm6 \n"
7414 "packuswb %%xmm1,%%xmm1 \n"
7415 "packuswb %%xmm6,%%xmm6 \n"
7416 "punpcklbw %%xmm6,%%xmm1 \n"
7417 "movdqa %%xmm0,%%xmm6 \n"
7418 "punpcklwd %%xmm1,%%xmm0 \n"
7419 "punpckhwd %%xmm1,%%xmm6 \n"
7420 "movdqu %%xmm0,(%1) \n"
7421 "movdqu %%xmm6,0x10(%1) \n"
7422 "lea 0x20(%0),%0 \n"
7423 "lea 0x20(%1),%1 \n"
7424 "sub $0x8,%2 \n"
7425 "jg 1b \n"
7426 : "+r"(src_argb), // %0
7427 "+r"(dst_argb), // %1
7428 "+r"(width) // %2
7429 : "r"(matrix_argb) // %3
7430 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7431 "xmm7");
7432 }
7433 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
7434
7435 #ifdef HAS_ARGBQUANTIZEROW_SSE2
7436 // Quantize 4 ARGB pixels (16 bytes).
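// Scalar sketch of the quantize math for each of B, G and R (alpha is
// preserved):
//   v = (v * scale) >> 16;                   // pmulhuw
//   dst = v * interval_size + interval_offset;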
7437 void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
7438 int scale,
7439 int interval_size,
7440 int interval_offset,
7441 int width) {
7442 asm volatile(
7443 "movd %2,%%xmm2 \n"
7444 "movd %3,%%xmm3 \n"
7445 "movd %4,%%xmm4 \n"
7446 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
7447 "pshufd $0x44,%%xmm2,%%xmm2 \n"
7448 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
7449 "pshufd $0x44,%%xmm3,%%xmm3 \n"
7450 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
7451 "pshufd $0x44,%%xmm4,%%xmm4 \n"
7452 "pxor %%xmm5,%%xmm5 \n"
7453 "pcmpeqb %%xmm6,%%xmm6 \n"
7454 "pslld $0x18,%%xmm6 \n"
7455
7456 // 4 pixel loop.
7457 LABELALIGN
7458 "1: \n"
7459 "movdqu (%0),%%xmm0 \n"
7460 "punpcklbw %%xmm5,%%xmm0 \n"
7461 "pmulhuw %%xmm2,%%xmm0 \n"
7462 "movdqu (%0),%%xmm1 \n"
7463 "punpckhbw %%xmm5,%%xmm1 \n"
7464 "pmulhuw %%xmm2,%%xmm1 \n"
7465 "pmullw %%xmm3,%%xmm0 \n"
7466 "movdqu (%0),%%xmm7 \n"
7467 "pmullw %%xmm3,%%xmm1 \n"
7468 "pand %%xmm6,%%xmm7 \n"
7469 "paddw %%xmm4,%%xmm0 \n"
7470 "paddw %%xmm4,%%xmm1 \n"
7471 "packuswb %%xmm1,%%xmm0 \n"
7472 "por %%xmm7,%%xmm0 \n"
7473 "movdqu %%xmm0,(%0) \n"
7474 "lea 0x10(%0),%0 \n"
7475 "sub $0x4,%1 \n"
7476 "jg 1b \n"
7477 : "+r"(dst_argb), // %0
7478 "+r"(width) // %1
7479 : "r"(scale), // %2
7480 "r"(interval_size), // %3
7481 "r"(interval_offset) // %4
7482 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7483 "xmm7");
7484 }
7485 #endif // HAS_ARGBQUANTIZEROW_SSE2
7486
7487 #ifdef HAS_ARGBSHADEROW_SSE2
7488 // Shade 4 pixels at a time by specified value.
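// Conceptually multiplies each channel by the matching byte of 'value',
// roughly dst[c] = src[c] * value_byte[c] / 255; the kernel duplicates
// bytes into 8.8 fixed point and uses pmulhuw. Sketch only.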
7489 void ARGBShadeRow_SSE2(const uint8_t* src_argb,
7490 uint8_t* dst_argb,
7491 int width,
7492 uint32_t value) {
7493 asm volatile(
7494 "movd %3,%%xmm2 \n"
7495 "punpcklbw %%xmm2,%%xmm2 \n"
7496 "punpcklqdq %%xmm2,%%xmm2 \n"
7497
7498 // 4 pixel loop.
7499 LABELALIGN
7500 "1: \n"
7501 "movdqu (%0),%%xmm0 \n"
7502 "lea 0x10(%0),%0 \n"
7503 "movdqa %%xmm0,%%xmm1 \n"
7504 "punpcklbw %%xmm0,%%xmm0 \n"
7505 "punpckhbw %%xmm1,%%xmm1 \n"
7506 "pmulhuw %%xmm2,%%xmm0 \n"
7507 "pmulhuw %%xmm2,%%xmm1 \n"
7508 "psrlw $0x8,%%xmm0 \n"
7509 "psrlw $0x8,%%xmm1 \n"
7510 "packuswb %%xmm1,%%xmm0 \n"
7511 "movdqu %%xmm0,(%1) \n"
7512 "lea 0x10(%1),%1 \n"
7513 "sub $0x4,%2 \n"
7514 "jg 1b \n"
7515 : "+r"(src_argb), // %0
7516 "+r"(dst_argb), // %1
7517 "+r"(width) // %2
7518 : "r"(value) // %3
7519 : "memory", "cc", "xmm0", "xmm1", "xmm2");
7520 }
7521 #endif // HAS_ARGBSHADEROW_SSE2
7522
7523 #ifdef HAS_ARGBMULTIPLYROW_SSE2
7524 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
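// Scalar sketch (illustrative): each channel is multiplied as
//   dst[c] = (src0[c] * 257 * src1[c]) >> 16,  // ~= src0[c] * src1[c] / 255
// where the * 257 comes from duplicating a byte into a 16-bit word.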
7525 void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
7526 const uint8_t* src_argb1,
7527 uint8_t* dst_argb,
7528 int width) {
7529 asm volatile(
7530
7531 "pxor %%xmm5,%%xmm5 \n"
7532
7533 // 4 pixel loop.
7534 LABELALIGN
7535 "1: \n"
7536 "movdqu (%0),%%xmm0 \n"
7537 "lea 0x10(%0),%0 \n"
7538 "movdqu (%1),%%xmm2 \n"
7539 "lea 0x10(%1),%1 \n"
7540 "movdqu %%xmm0,%%xmm1 \n"
7541 "movdqu %%xmm2,%%xmm3 \n"
7542 "punpcklbw %%xmm0,%%xmm0 \n"
7543 "punpckhbw %%xmm1,%%xmm1 \n"
7544 "punpcklbw %%xmm5,%%xmm2 \n"
7545 "punpckhbw %%xmm5,%%xmm3 \n"
7546 "pmulhuw %%xmm2,%%xmm0 \n"
7547 "pmulhuw %%xmm3,%%xmm1 \n"
7548 "packuswb %%xmm1,%%xmm0 \n"
7549 "movdqu %%xmm0,(%2) \n"
7550 "lea 0x10(%2),%2 \n"
7551 "sub $0x4,%3 \n"
7552 "jg 1b \n"
7553 : "+r"(src_argb), // %0
7554 "+r"(src_argb1), // %1
7555 "+r"(dst_argb), // %2
7556 "+r"(width) // %3
7557 :
7558 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
7559 }
7560 #endif // HAS_ARGBMULTIPLYROW_SSE2
7561
7562 #ifdef HAS_ARGBMULTIPLYROW_AVX2
7563 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
7564 void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
7565 const uint8_t* src_argb1,
7566 uint8_t* dst_argb,
7567 int width) {
7568 asm volatile(
7569
7570 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
7571
7572 // 8 pixel loop.
7573 LABELALIGN
7574 "1: \n"
7575 "vmovdqu (%0),%%ymm1 \n"
7576 "lea 0x20(%0),%0 \n"
7577 "vmovdqu (%1),%%ymm3 \n"
7578 "lea 0x20(%1),%1 \n"
7579 "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
7580 "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
7581 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
7582 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
7583 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
7584 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
7585 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
7586 "vmovdqu %%ymm0,(%2) \n"
7587 "lea 0x20(%2),%2 \n"
7588 "sub $0x8,%3 \n"
7589 "jg 1b \n"
7590 "vzeroupper \n"
7591 : "+r"(src_argb), // %0
7592 "+r"(src_argb1), // %1
7593 "+r"(dst_argb), // %2
7594 "+r"(width) // %3
7595 :
7596 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
7597 }
7598 #endif // HAS_ARGBMULTIPLYROW_AVX2
7599
7600 #ifdef HAS_ARGBADDROW_SSE2
7601 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
7602 void ARGBAddRow_SSE2(const uint8_t* src_argb,
7603 const uint8_t* src_argb1,
7604 uint8_t* dst_argb,
7605 int width) {
7606 asm volatile(
7607 // 4 pixel loop.
7608 LABELALIGN
7609 "1: \n"
7610 "movdqu (%0),%%xmm0 \n"
7611 "lea 0x10(%0),%0 \n"
7612 "movdqu (%1),%%xmm1 \n"
7613 "lea 0x10(%1),%1 \n"
7614 "paddusb %%xmm1,%%xmm0 \n"
7615 "movdqu %%xmm0,(%2) \n"
7616 "lea 0x10(%2),%2 \n"
7617 "sub $0x4,%3 \n"
7618 "jg 1b \n"
7619 : "+r"(src_argb), // %0
7620 "+r"(src_argb1), // %1
7621 "+r"(dst_argb), // %2
7622 "+r"(width) // %3
7623 :
7624 : "memory", "cc", "xmm0", "xmm1");
7625 }
7626 #endif // HAS_ARGBADDROW_SSE2
7627
7628 #ifdef HAS_ARGBADDROW_AVX2
7629 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
7630 void ARGBAddRow_AVX2(const uint8_t* src_argb,
7631 const uint8_t* src_argb1,
7632 uint8_t* dst_argb,
7633 int width) {
7634 asm volatile(
7635 // 8 pixel loop.
7636 LABELALIGN
7637 "1: \n"
7638 "vmovdqu (%0),%%ymm0 \n"
7639 "lea 0x20(%0),%0 \n"
7640 "vpaddusb (%1),%%ymm0,%%ymm0 \n"
7641 "lea 0x20(%1),%1 \n"
7642 "vmovdqu %%ymm0,(%2) \n"
7643 "lea 0x20(%2),%2 \n"
7644 "sub $0x8,%3 \n"
7645 "jg 1b \n"
7646 "vzeroupper \n"
7647 : "+r"(src_argb), // %0
7648 "+r"(src_argb1), // %1
7649 "+r"(dst_argb), // %2
7650 "+r"(width) // %3
7651 :
7652 : "memory", "cc", "xmm0");
7653 }
7654 #endif // HAS_ARGBADDROW_AVX2
7655
7656 #ifdef HAS_ARGBSUBTRACTROW_SSE2
7657 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
7658 void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
7659 const uint8_t* src_argb1,
7660 uint8_t* dst_argb,
7661 int width) {
7662 asm volatile(
7663 // 4 pixel loop.
7664 LABELALIGN
7665 "1: \n"
7666 "movdqu (%0),%%xmm0 \n"
7667 "lea 0x10(%0),%0 \n"
7668 "movdqu (%1),%%xmm1 \n"
7669 "lea 0x10(%1),%1 \n"
7670 "psubusb %%xmm1,%%xmm0 \n"
7671 "movdqu %%xmm0,(%2) \n"
7672 "lea 0x10(%2),%2 \n"
7673 "sub $0x4,%3 \n"
7674 "jg 1b \n"
7675 : "+r"(src_argb), // %0
7676 "+r"(src_argb1), // %1
7677 "+r"(dst_argb), // %2
7678 "+r"(width) // %3
7679 :
7680 : "memory", "cc", "xmm0", "xmm1");
7681 }
7682 #endif // HAS_ARGBSUBTRACTROW_SSE2
7683
7684 #ifdef HAS_ARGBSUBTRACTROW_AVX2
7685 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
7686 void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
7687 const uint8_t* src_argb1,
7688 uint8_t* dst_argb,
7689 int width) {
7690 asm volatile(
7691 // 8 pixel loop.
7692 LABELALIGN
7693 "1: \n"
7694 "vmovdqu (%0),%%ymm0 \n"
7695 "lea 0x20(%0),%0 \n"
7696 "vpsubusb (%1),%%ymm0,%%ymm0 \n"
7697 "lea 0x20(%1),%1 \n"
7698 "vmovdqu %%ymm0,(%2) \n"
7699 "lea 0x20(%2),%2 \n"
7700 "sub $0x8,%3 \n"
7701 "jg 1b \n"
7702 "vzeroupper \n"
7703 : "+r"(src_argb), // %0
7704 "+r"(src_argb1), // %1
7705 "+r"(dst_argb), // %2
7706 "+r"(width) // %3
7707 :
7708 : "memory", "cc", "xmm0");
7709 }
7710 #endif // HAS_ARGBSUBTRACTROW_AVX2
7711
7712 #ifdef HAS_SOBELXROW_SSE2
7713 // SobelX as a matrix is
7714 // -1 0 1
7715 // -2 0 2
7716 // -1 0 1
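// Scalar sketch, with row0/row1/row2 naming the three input rows:
//   sobel = (row0[i] - row0[i + 2]) + 2 * (row1[i] - row1[i + 2]) +
//           (row2[i] - row2[i + 2]);
//   dst[i] = min(255, abs(sobel));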
7717 void SobelXRow_SSE2(const uint8_t* src_y0,
7718 const uint8_t* src_y1,
7719 const uint8_t* src_y2,
7720 uint8_t* dst_sobelx,
7721 int width) {
7722 asm volatile(
7723 "sub %0,%1 \n"
7724 "sub %0,%2 \n"
7725 "sub %0,%3 \n"
7726 "pxor %%xmm5,%%xmm5 \n"
7727
7728 // 8 pixel loop.
7729 LABELALIGN
7730 "1: \n"
7731 "movq (%0),%%xmm0 \n"
7732 "movq 0x2(%0),%%xmm1 \n"
7733 "punpcklbw %%xmm5,%%xmm0 \n"
7734 "punpcklbw %%xmm5,%%xmm1 \n"
7735 "psubw %%xmm1,%%xmm0 \n"
7736 "movq 0x00(%0,%1,1),%%xmm1 \n"
7737 "movq 0x02(%0,%1,1),%%xmm2 \n"
7738 "punpcklbw %%xmm5,%%xmm1 \n"
7739 "punpcklbw %%xmm5,%%xmm2 \n"
7740 "psubw %%xmm2,%%xmm1 \n"
7741 "movq 0x00(%0,%2,1),%%xmm2 \n"
7742 "movq 0x02(%0,%2,1),%%xmm3 \n"
7743 "punpcklbw %%xmm5,%%xmm2 \n"
7744 "punpcklbw %%xmm5,%%xmm3 \n"
7745 "psubw %%xmm3,%%xmm2 \n"
7746 "paddw %%xmm2,%%xmm0 \n"
7747 "paddw %%xmm1,%%xmm0 \n"
7748 "paddw %%xmm1,%%xmm0 \n"
7749 "pxor %%xmm1,%%xmm1 \n"
7750 "psubw %%xmm0,%%xmm1 \n"
7751 "pmaxsw %%xmm1,%%xmm0 \n"
7752 "packuswb %%xmm0,%%xmm0 \n"
7753 "movq %%xmm0,0x00(%0,%3,1) \n"
7754 "lea 0x8(%0),%0 \n"
7755 "sub $0x8,%4 \n"
7756 "jg 1b \n"
7757 : "+r"(src_y0), // %0
7758 "+r"(src_y1), // %1
7759 "+r"(src_y2), // %2
7760 "+r"(dst_sobelx), // %3
7761 "+r"(width) // %4
7762 :
7763 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
7764 }
7765 #endif // HAS_SOBELXROW_SSE2
7766
7767 #ifdef HAS_SOBELYROW_SSE2
7768 // SobelY as a matrix is
7769 // -1 -2 -1
7770 // 0 0 0
7771 // 1 2 1
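// Scalar sketch, with row0/row1 naming the two input rows:
//   sobel = (row0[i] - row1[i]) + 2 * (row0[i + 1] - row1[i + 1]) +
//           (row0[i + 2] - row1[i + 2]);
//   dst[i] = min(255, abs(sobel));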
7772 void SobelYRow_SSE2(const uint8_t* src_y0,
7773 const uint8_t* src_y1,
7774 uint8_t* dst_sobely,
7775 int width) {
7776 asm volatile(
7777 "sub %0,%1 \n"
7778 "sub %0,%2 \n"
7779 "pxor %%xmm5,%%xmm5 \n"
7780
7781 // 8 pixel loop.
7782 LABELALIGN
7783 "1: \n"
7784 "movq (%0),%%xmm0 \n"
7785 "movq 0x00(%0,%1,1),%%xmm1 \n"
7786 "punpcklbw %%xmm5,%%xmm0 \n"
7787 "punpcklbw %%xmm5,%%xmm1 \n"
7788 "psubw %%xmm1,%%xmm0 \n"
7789 "movq 0x1(%0),%%xmm1 \n"
7790 "movq 0x01(%0,%1,1),%%xmm2 \n"
7791 "punpcklbw %%xmm5,%%xmm1 \n"
7792 "punpcklbw %%xmm5,%%xmm2 \n"
7793 "psubw %%xmm2,%%xmm1 \n"
7794 "movq 0x2(%0),%%xmm2 \n"
7795 "movq 0x02(%0,%1,1),%%xmm3 \n"
7796 "punpcklbw %%xmm5,%%xmm2 \n"
7797 "punpcklbw %%xmm5,%%xmm3 \n"
7798 "psubw %%xmm3,%%xmm2 \n"
7799 "paddw %%xmm2,%%xmm0 \n"
7800 "paddw %%xmm1,%%xmm0 \n"
7801 "paddw %%xmm1,%%xmm0 \n"
7802 "pxor %%xmm1,%%xmm1 \n"
7803 "psubw %%xmm0,%%xmm1 \n"
7804 "pmaxsw %%xmm1,%%xmm0 \n"
7805 "packuswb %%xmm0,%%xmm0 \n"
7806 "movq %%xmm0,0x00(%0,%2,1) \n"
7807 "lea 0x8(%0),%0 \n"
7808 "sub $0x8,%3 \n"
7809 "jg 1b \n"
7810 : "+r"(src_y0), // %0
7811 "+r"(src_y1), // %1
7812 "+r"(dst_sobely), // %2
7813 "+r"(width) // %3
7814 :
7815 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
7816 }
7817 #endif // HAS_SOBELYROW_SSE2
7818
7819 #ifdef HAS_SOBELROW_SSE2
7820 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
7821 // A = 255
7822 // R = Sobel
7823 // G = Sobel
7824 // B = Sobel
7825 void SobelRow_SSE2(const uint8_t* src_sobelx,
7826 const uint8_t* src_sobely,
7827 uint8_t* dst_argb,
7828 int width) {
7829 asm volatile(
7830 "sub %0,%1 \n"
7831 "pcmpeqb %%xmm5,%%xmm5 \n"
7832 "pslld $0x18,%%xmm5 \n"
7833
7834 // 16 pixel loop.
7835 LABELALIGN
7836 "1: \n"
7837 "movdqu (%0),%%xmm0 \n"
7838 "movdqu 0x00(%0,%1,1),%%xmm1 \n"
7839 "lea 0x10(%0),%0 \n"
7840 "paddusb %%xmm1,%%xmm0 \n"
7841 "movdqa %%xmm0,%%xmm2 \n"
7842 "punpcklbw %%xmm0,%%xmm2 \n"
7843 "punpckhbw %%xmm0,%%xmm0 \n"
7844 "movdqa %%xmm2,%%xmm1 \n"
7845 "punpcklwd %%xmm2,%%xmm1 \n"
7846 "punpckhwd %%xmm2,%%xmm2 \n"
7847 "por %%xmm5,%%xmm1 \n"
7848 "por %%xmm5,%%xmm2 \n"
7849 "movdqa %%xmm0,%%xmm3 \n"
7850 "punpcklwd %%xmm0,%%xmm3 \n"
7851 "punpckhwd %%xmm0,%%xmm0 \n"
7852 "por %%xmm5,%%xmm3 \n"
7853 "por %%xmm5,%%xmm0 \n"
7854 "movdqu %%xmm1,(%2) \n"
7855 "movdqu %%xmm2,0x10(%2) \n"
7856 "movdqu %%xmm3,0x20(%2) \n"
7857 "movdqu %%xmm0,0x30(%2) \n"
7858 "lea 0x40(%2),%2 \n"
7859 "sub $0x10,%3 \n"
7860 "jg 1b \n"
7861 : "+r"(src_sobelx), // %0
7862 "+r"(src_sobely), // %1
7863 "+r"(dst_argb), // %2
7864 "+r"(width) // %3
7865 :
7866 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
7867 }
7868 #endif // HAS_SOBELROW_SSE2
7869
7870 #ifdef HAS_SOBELTOPLANEROW_SSE2
7871 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
7872 void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
7873 const uint8_t* src_sobely,
7874 uint8_t* dst_y,
7875 int width) {
7876 asm volatile(
7877 "sub %0,%1 \n"
7878 "pcmpeqb %%xmm5,%%xmm5 \n"
7879 "pslld $0x18,%%xmm5 \n"
7880
7881 // 16 pixel loop.
7882 LABELALIGN
7883 "1: \n"
7884 "movdqu (%0),%%xmm0 \n"
7885 "movdqu 0x00(%0,%1,1),%%xmm1 \n"
7886 "lea 0x10(%0),%0 \n"
7887 "paddusb %%xmm1,%%xmm0 \n"
7888 "movdqu %%xmm0,(%2) \n"
7889 "lea 0x10(%2),%2 \n"
7890 "sub $0x10,%3 \n"
7891 "jg 1b \n"
7892 : "+r"(src_sobelx), // %0
7893 "+r"(src_sobely), // %1
7894 "+r"(dst_y), // %2
7895 "+r"(width) // %3
7896 :
7897 : "memory", "cc", "xmm0", "xmm1");
7898 }
7899 #endif // HAS_SOBELTOPLANEROW_SSE2
7900
7901 #ifdef HAS_SOBELXYROW_SSE2
7902 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
7903 // A = 255
7904 // R = Sobel X
7905 // G = Sobel
7906 // B = Sobel Y
7907 void SobelXYRow_SSE2(const uint8_t* src_sobelx,
7908 const uint8_t* src_sobely,
7909 uint8_t* dst_argb,
7910 int width) {
7911 asm volatile(
7912 "sub %0,%1 \n"
7913 "pcmpeqb %%xmm5,%%xmm5 \n"
7914
7915 // 16 pixel loop.
7916 LABELALIGN
7917 "1: \n"
7918 "movdqu (%0),%%xmm0 \n"
7919 "movdqu 0x00(%0,%1,1),%%xmm1 \n"
7920 "lea 0x10(%0),%0 \n"
7921 "movdqa %%xmm0,%%xmm2 \n"
7922 "paddusb %%xmm1,%%xmm2 \n"
7923 "movdqa %%xmm0,%%xmm3 \n"
7924 "punpcklbw %%xmm5,%%xmm3 \n"
7925 "punpckhbw %%xmm5,%%xmm0 \n"
7926 "movdqa %%xmm1,%%xmm4 \n"
7927 "punpcklbw %%xmm2,%%xmm4 \n"
7928 "punpckhbw %%xmm2,%%xmm1 \n"
7929 "movdqa %%xmm4,%%xmm6 \n"
7930 "punpcklwd %%xmm3,%%xmm6 \n"
7931 "punpckhwd %%xmm3,%%xmm4 \n"
7932 "movdqa %%xmm1,%%xmm7 \n"
7933 "punpcklwd %%xmm0,%%xmm7 \n"
7934 "punpckhwd %%xmm0,%%xmm1 \n"
7935 "movdqu %%xmm6,(%2) \n"
7936 "movdqu %%xmm4,0x10(%2) \n"
7937 "movdqu %%xmm7,0x20(%2) \n"
7938 "movdqu %%xmm1,0x30(%2) \n"
7939 "lea 0x40(%2),%2 \n"
7940 "sub $0x10,%3 \n"
7941 "jg 1b \n"
7942 : "+r"(src_sobelx), // %0
7943 "+r"(src_sobely), // %1
7944 "+r"(dst_argb), // %2
7945 "+r"(width) // %3
7946 :
7947 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7948 "xmm7");
7949 }
7950 #endif // HAS_SOBELXYROW_SSE2
7951
7952 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
7953 // Creates a table of cumulative sums where each value is a sum of all values
7954 // above and to the left of the value, inclusive of the value.
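// Scalar sketch, per channel (4 channels per ARGB pixel):
//   running += row[x];
//   cumsum[x] = previous_cumsum[x] + running;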
7955 void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
7956 int32_t* cumsum,
7957 const int32_t* previous_cumsum,
7958 int width) {
7959 asm volatile(
7960 "pxor %%xmm0,%%xmm0 \n"
7961 "pxor %%xmm1,%%xmm1 \n"
7962 "sub $0x4,%3 \n"
7963 "jl 49f \n"
7964 "test $0xf,%1 \n"
7965 "jne 49f \n"
7966
7967 // 4 pixel loop.
7968 LABELALIGN
7969 "40: \n"
7970 "movdqu (%0),%%xmm2 \n"
7971 "lea 0x10(%0),%0 \n"
7972 "movdqa %%xmm2,%%xmm4 \n"
7973 "punpcklbw %%xmm1,%%xmm2 \n"
7974 "movdqa %%xmm2,%%xmm3 \n"
7975 "punpcklwd %%xmm1,%%xmm2 \n"
7976 "punpckhwd %%xmm1,%%xmm3 \n"
7977 "punpckhbw %%xmm1,%%xmm4 \n"
7978 "movdqa %%xmm4,%%xmm5 \n"
7979 "punpcklwd %%xmm1,%%xmm4 \n"
7980 "punpckhwd %%xmm1,%%xmm5 \n"
7981 "paddd %%xmm2,%%xmm0 \n"
7982 "movdqu (%2),%%xmm2 \n"
7983 "paddd %%xmm0,%%xmm2 \n"
7984 "paddd %%xmm3,%%xmm0 \n"
7985 "movdqu 0x10(%2),%%xmm3 \n"
7986 "paddd %%xmm0,%%xmm3 \n"
7987 "paddd %%xmm4,%%xmm0 \n"
7988 "movdqu 0x20(%2),%%xmm4 \n"
7989 "paddd %%xmm0,%%xmm4 \n"
7990 "paddd %%xmm5,%%xmm0 \n"
7991 "movdqu 0x30(%2),%%xmm5 \n"
7992 "lea 0x40(%2),%2 \n"
7993 "paddd %%xmm0,%%xmm5 \n"
7994 "movdqu %%xmm2,(%1) \n"
7995 "movdqu %%xmm3,0x10(%1) \n"
7996 "movdqu %%xmm4,0x20(%1) \n"
7997 "movdqu %%xmm5,0x30(%1) \n"
7998 "lea 0x40(%1),%1 \n"
7999 "sub $0x4,%3 \n"
8000 "jge 40b \n"
8001
8002 "49: \n"
8003 "add $0x3,%3 \n"
8004 "jl 19f \n"
8005
8006 // 1 pixel loop.
8007 LABELALIGN
8008 "10: \n"
8009 "movd (%0),%%xmm2 \n"
8010 "lea 0x4(%0),%0 \n"
8011 "punpcklbw %%xmm1,%%xmm2 \n"
8012 "punpcklwd %%xmm1,%%xmm2 \n"
8013 "paddd %%xmm2,%%xmm0 \n"
8014 "movdqu (%2),%%xmm2 \n"
8015 "lea 0x10(%2),%2 \n"
8016 "paddd %%xmm0,%%xmm2 \n"
8017 "movdqu %%xmm2,(%1) \n"
8018 "lea 0x10(%1),%1 \n"
8019 "sub $0x1,%3 \n"
8020 "jge 10b \n"
8021
8022 "19: \n"
8023 : "+r"(row), // %0
8024 "+r"(cumsum), // %1
8025 "+r"(previous_cumsum), // %2
8026 "+r"(width) // %3
8027 :
8028 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
8029 }
8030 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
8031
8032 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
8033 void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
8034 const int32_t* botleft,
8035 int width,
8036 int area,
8037 uint8_t* dst,
8038 int count) {
8039 asm volatile(
8040 "movd %5,%%xmm5 \n"
8041 "cvtdq2ps %%xmm5,%%xmm5 \n"
8042 "rcpss %%xmm5,%%xmm4 \n"
8043 "pshufd $0x0,%%xmm4,%%xmm4 \n"
8044 "sub $0x4,%3 \n"
8045 "jl 49f \n"
8046 "cmpl $0x80,%5 \n"
8047 "ja 40f \n"
8048
8049 "pshufd $0x0,%%xmm5,%%xmm5 \n"
8050 "pcmpeqb %%xmm6,%%xmm6 \n"
8051 "psrld $0x10,%%xmm6 \n"
8052 "cvtdq2ps %%xmm6,%%xmm6 \n"
8053 "addps %%xmm6,%%xmm5 \n"
8054 "mulps %%xmm4,%%xmm5 \n"
8055 "cvtps2dq %%xmm5,%%xmm5 \n"
8056 "packssdw %%xmm5,%%xmm5 \n"
8057
8058 // 4 pixel small loop.
8059 LABELALIGN
8060 "4: \n"
8061 "movdqu (%0),%%xmm0 \n"
8062 "movdqu 0x10(%0),%%xmm1 \n"
8063 "movdqu 0x20(%0),%%xmm2 \n"
8064 "movdqu 0x30(%0),%%xmm3 \n"
8065 "psubd 0x00(%0,%4,4),%%xmm0 \n"
8066 "psubd 0x10(%0,%4,4),%%xmm1 \n"
8067 "psubd 0x20(%0,%4,4),%%xmm2 \n"
8068 "psubd 0x30(%0,%4,4),%%xmm3 \n"
8069 "lea 0x40(%0),%0 \n"
8070 "psubd (%1),%%xmm0 \n"
8071 "psubd 0x10(%1),%%xmm1 \n"
8072 "psubd 0x20(%1),%%xmm2 \n"
8073 "psubd 0x30(%1),%%xmm3 \n"
8074 "paddd 0x00(%1,%4,4),%%xmm0 \n"
8075 "paddd 0x10(%1,%4,4),%%xmm1 \n"
8076 "paddd 0x20(%1,%4,4),%%xmm2 \n"
8077 "paddd 0x30(%1,%4,4),%%xmm3 \n"
8078 "lea 0x40(%1),%1 \n"
8079 "packssdw %%xmm1,%%xmm0 \n"
8080 "packssdw %%xmm3,%%xmm2 \n"
8081 "pmulhuw %%xmm5,%%xmm0 \n"
8082 "pmulhuw %%xmm5,%%xmm2 \n"
8083 "packuswb %%xmm2,%%xmm0 \n"
8084 "movdqu %%xmm0,(%2) \n"
8085 "lea 0x10(%2),%2 \n"
8086 "sub $0x4,%3 \n"
8087 "jge 4b \n"
8088 "jmp 49f \n"
8089
8090 // 4 pixel loop
8091 LABELALIGN
8092 "40: \n"
8093 "movdqu (%0),%%xmm0 \n"
8094 "movdqu 0x10(%0),%%xmm1 \n"
8095 "movdqu 0x20(%0),%%xmm2 \n"
8096 "movdqu 0x30(%0),%%xmm3 \n"
8097 "psubd 0x00(%0,%4,4),%%xmm0 \n"
8098 "psubd 0x10(%0,%4,4),%%xmm1 \n"
8099 "psubd 0x20(%0,%4,4),%%xmm2 \n"
8100 "psubd 0x30(%0,%4,4),%%xmm3 \n"
8101 "lea 0x40(%0),%0 \n"
8102 "psubd (%1),%%xmm0 \n"
8103 "psubd 0x10(%1),%%xmm1 \n"
8104 "psubd 0x20(%1),%%xmm2 \n"
8105 "psubd 0x30(%1),%%xmm3 \n"
8106 "paddd 0x00(%1,%4,4),%%xmm0 \n"
8107 "paddd 0x10(%1,%4,4),%%xmm1 \n"
8108 "paddd 0x20(%1,%4,4),%%xmm2 \n"
8109 "paddd 0x30(%1,%4,4),%%xmm3 \n"
8110 "lea 0x40(%1),%1 \n"
8111 "cvtdq2ps %%xmm0,%%xmm0 \n"
8112 "cvtdq2ps %%xmm1,%%xmm1 \n"
8113 "mulps %%xmm4,%%xmm0 \n"
8114 "mulps %%xmm4,%%xmm1 \n"
8115 "cvtdq2ps %%xmm2,%%xmm2 \n"
8116 "cvtdq2ps %%xmm3,%%xmm3 \n"
8117 "mulps %%xmm4,%%xmm2 \n"
8118 "mulps %%xmm4,%%xmm3 \n"
8119 "cvtps2dq %%xmm0,%%xmm0 \n"
8120 "cvtps2dq %%xmm1,%%xmm1 \n"
8121 "cvtps2dq %%xmm2,%%xmm2 \n"
8122 "cvtps2dq %%xmm3,%%xmm3 \n"
8123 "packssdw %%xmm1,%%xmm0 \n"
8124 "packssdw %%xmm3,%%xmm2 \n"
8125 "packuswb %%xmm2,%%xmm0 \n"
8126 "movdqu %%xmm0,(%2) \n"
8127 "lea 0x10(%2),%2 \n"
8128 "sub $0x4,%3 \n"
8129 "jge 40b \n"
8130
8131 "49: \n"
8132 "add $0x3,%3 \n"
8133 "jl 19f \n"
8134
8135 // 1 pixel loop
8136 LABELALIGN
8137 "10: \n"
8138 "movdqu (%0),%%xmm0 \n"
8139 "psubd 0x00(%0,%4,4),%%xmm0 \n"
8140 "lea 0x10(%0),%0 \n"
8141 "psubd (%1),%%xmm0 \n"
8142 "paddd 0x00(%1,%4,4),%%xmm0 \n"
8143 "lea 0x10(%1),%1 \n"
8144 "cvtdq2ps %%xmm0,%%xmm0 \n"
8145 "mulps %%xmm4,%%xmm0 \n"
8146 "cvtps2dq %%xmm0,%%xmm0 \n"
8147 "packssdw %%xmm0,%%xmm0 \n"
8148 "packuswb %%xmm0,%%xmm0 \n"
8149 "movd %%xmm0,(%2) \n"
8150 "lea 0x4(%2),%2 \n"
8151 "sub $0x1,%3 \n"
8152 "jge 10b \n"
8153 "19: \n"
8154 : "+r"(topleft), // %0
8155 "+r"(botleft), // %1
8156 "+r"(dst), // %2
8157 "+rm"(count) // %3
8158 : "r"((intptr_t)(width)), // %4
8159 "rm"(area) // %5
8160 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
8161 }
8162 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
8163
8164 #ifdef HAS_ARGBAFFINEROW_SSE2
8165 // Copy ARGB pixels from source image with slope to a row of destination.
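// Scalar sketch: src_dudv holds {u, v, du, dv}; each destination pixel is
// fetched from the truncated source coordinate and the coordinate is then
// stepped by the slope (illustrative, matching "off = x*4 + y*stride"):
//   dst[i] = src_at((int)u, (int)v);  u += du;  v += dv;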
8166 LIBYUV_API
8167 void ARGBAffineRow_SSE2(const uint8_t* src_argb,
8168 int src_argb_stride,
8169 uint8_t* dst_argb,
8170 const float* src_dudv,
8171 int width) {
8172 intptr_t src_argb_stride_temp = src_argb_stride;
8173 intptr_t temp;
8174 asm volatile(
8175 "movq (%3),%%xmm2 \n"
8176 "movq 0x08(%3),%%xmm7 \n"
8177 "shl $0x10,%1 \n"
8178 "add $0x4,%1 \n"
8179 "movd %1,%%xmm5 \n"
8180 "sub $0x4,%4 \n"
8181 "jl 49f \n"
8182
8183 "pshufd $0x44,%%xmm7,%%xmm7 \n"
8184 "pshufd $0x0,%%xmm5,%%xmm5 \n"
8185 "movdqa %%xmm2,%%xmm0 \n"
8186 "addps %%xmm7,%%xmm0 \n"
8187 "movlhps %%xmm0,%%xmm2 \n"
8188 "movdqa %%xmm7,%%xmm4 \n"
8189 "addps %%xmm4,%%xmm4 \n"
8190 "movdqa %%xmm2,%%xmm3 \n"
8191 "addps %%xmm4,%%xmm3 \n"
8192 "addps %%xmm4,%%xmm4 \n"
8193
8194 // 4 pixel loop
8195 LABELALIGN
8196 "40: \n"
8197 "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
8198 "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
8199 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
8200 "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
8201 "movd %%xmm0,%k1 \n"
8202 "pshufd $0x39,%%xmm0,%%xmm0 \n"
8203 "movd %%xmm0,%k5 \n"
8204 "pshufd $0x39,%%xmm0,%%xmm0 \n"
8205 "movd 0x00(%0,%1,1),%%xmm1 \n"
8206 "movd 0x00(%0,%5,1),%%xmm6 \n"
8207 "punpckldq %%xmm6,%%xmm1 \n"
8208 "addps %%xmm4,%%xmm2 \n"
8209 "movq %%xmm1,(%2) \n"
8210 "movd %%xmm0,%k1 \n"
8211 "pshufd $0x39,%%xmm0,%%xmm0 \n"
8212 "movd %%xmm0,%k5 \n"
8213 "movd 0x00(%0,%1,1),%%xmm0 \n"
8214 "movd 0x00(%0,%5,1),%%xmm6 \n"
8215 "punpckldq %%xmm6,%%xmm0 \n"
8216 "addps %%xmm4,%%xmm3 \n"
8217 "movq %%xmm0,0x08(%2) \n"
8218 "lea 0x10(%2),%2 \n"
8219 "sub $0x4,%4 \n"
8220 "jge 40b \n"
8221
8222 "49: \n"
8223 "add $0x3,%4 \n"
8224 "jl 19f \n"
8225
8226 // 1 pixel loop
8227 LABELALIGN
8228 "10: \n"
8229 "cvttps2dq %%xmm2,%%xmm0 \n"
8230 "packssdw %%xmm0,%%xmm0 \n"
8231 "pmaddwd %%xmm5,%%xmm0 \n"
8232 "addps %%xmm7,%%xmm2 \n"
8233 "movd %%xmm0,%k1 \n"
8234 "movd 0x00(%0,%1,1),%%xmm0 \n"
8235 "movd %%xmm0,(%2) \n"
8236 "lea 0x04(%2),%2 \n"
8237 "sub $0x1,%4 \n"
8238 "jge 10b \n"
8239 "19: \n"
8240 : "+r"(src_argb), // %0
8241 "+r"(src_argb_stride_temp), // %1
8242 "+r"(dst_argb), // %2
8243 "+r"(src_dudv), // %3
8244 "+rm"(width), // %4
8245 "=&r"(temp) // %5
8246 :
8247 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
8248 "xmm7");
8249 }
8250 #endif // HAS_ARGBAFFINEROW_SSE2
8251
8252 #ifdef HAS_INTERPOLATEROW_SSSE3
8253 // Bilinear filter 16x2 -> 16x1
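// Scalar sketch: blends each byte of two rows by an 8-bit fraction f
// (source_y_fraction), approximately (rounding aside)
//   dst[i] = (src[i] * (256 - f) + src[i + stride] * f) >> 8;
// with fast paths below for f == 0 (plain copy) and f == 128 (pavgb).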
8254 void InterpolateRow_SSSE3(uint8_t* dst_ptr,
8255 const uint8_t* src_ptr,
8256 ptrdiff_t src_stride,
8257 int width,
8258 int source_y_fraction) {
8259 asm volatile(
8260 "sub %1,%0 \n"
8261 "cmp $0x0,%3 \n"
8262 "je 100f \n"
8263 "cmp $0x80,%3 \n"
8264 "je 50f \n"
8265
8266 "movd %3,%%xmm0 \n"
8267 "neg %3 \n"
8268 "add $0x100,%3 \n"
8269 "movd %3,%%xmm5 \n"
8270 "punpcklbw %%xmm0,%%xmm5 \n"
8271 "punpcklwd %%xmm5,%%xmm5 \n"
8272 "pshufd $0x0,%%xmm5,%%xmm5 \n"
8273 "mov $0x80808080,%%eax \n"
8274 "movd %%eax,%%xmm4 \n"
8275 "pshufd $0x0,%%xmm4,%%xmm4 \n"
8276
8277 // General purpose row blend.
8278 LABELALIGN
8279 "1: \n"
8280 "movdqu (%1),%%xmm0 \n"
8281 "movdqu 0x00(%1,%4,1),%%xmm2 \n"
8282 "movdqa %%xmm0,%%xmm1 \n"
8283 "punpcklbw %%xmm2,%%xmm0 \n"
8284 "punpckhbw %%xmm2,%%xmm1 \n"
8285 "psubb %%xmm4,%%xmm0 \n"
8286 "psubb %%xmm4,%%xmm1 \n"
8287 "movdqa %%xmm5,%%xmm2 \n"
8288 "movdqa %%xmm5,%%xmm3 \n"
8289 "pmaddubsw %%xmm0,%%xmm2 \n"
8290 "pmaddubsw %%xmm1,%%xmm3 \n"
8291 "paddw %%xmm4,%%xmm2 \n"
8292 "paddw %%xmm4,%%xmm3 \n"
8293 "psrlw $0x8,%%xmm2 \n"
8294 "psrlw $0x8,%%xmm3 \n"
8295 "packuswb %%xmm3,%%xmm2 \n"
8296 "movdqu %%xmm2,0x00(%1,%0,1) \n"
8297 "lea 0x10(%1),%1 \n"
8298 "sub $0x10,%2 \n"
8299 "jg 1b \n"
8300 "jmp 99f \n"
8301
8302 // Blend 50 / 50.
8303 LABELALIGN
8304 "50: \n"
8305 "movdqu (%1),%%xmm0 \n"
8306 "movdqu 0x00(%1,%4,1),%%xmm1 \n"
8307 "pavgb %%xmm1,%%xmm0 \n"
8308 "movdqu %%xmm0,0x00(%1,%0,1) \n"
8309 "lea 0x10(%1),%1 \n"
8310 "sub $0x10,%2 \n"
8311 "jg 50b \n"
8312 "jmp 99f \n"
8313
8314 // Blend 100 / 0 - Copy row unchanged.
8315 LABELALIGN
8316 "100: \n"
8317 "movdqu (%1),%%xmm0 \n"
8318 "movdqu %%xmm0,0x00(%1,%0,1) \n"
8319 "lea 0x10(%1),%1 \n"
8320 "sub $0x10,%2 \n"
8321 "jg 100b \n"
8322
8323 "99: \n"
8324 : "+r"(dst_ptr), // %0
8325 "+r"(src_ptr), // %1
8326 "+rm"(width), // %2
8327 "+r"(source_y_fraction) // %3
8328 : "r"((intptr_t)(src_stride)) // %4
8329 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
8330 }
8331 #endif // HAS_INTERPOLATEROW_SSSE3
8332
8333 #ifdef HAS_INTERPOLATEROW_AVX2
8334 // Bilinear filter 32x2 -> 32x1
8335 void InterpolateRow_AVX2(uint8_t* dst_ptr,
8336 const uint8_t* src_ptr,
8337 ptrdiff_t src_stride,
8338 int width,
8339 int source_y_fraction) {
8340 asm volatile(
8341 "sub %1,%0 \n"
8342 "cmp $0x0,%3 \n"
8343 "je 100f \n"
8344 "cmp $0x80,%3 \n"
8345 "je 50f \n"
8346
8347 "vmovd %3,%%xmm0 \n"
8348 "neg %3 \n"
8349 "add $0x100,%3 \n"
8350 "vmovd %3,%%xmm5 \n"
8351 "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
8352 "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
8353 "vbroadcastss %%xmm5,%%ymm5 \n"
8354 "mov $0x80808080,%%eax \n"
8355 "vmovd %%eax,%%xmm4 \n"
8356 "vbroadcastss %%xmm4,%%ymm4 \n"
8357
8358 // General purpose row blend.
8359 LABELALIGN
8360 "1: \n"
8361 "vmovdqu (%1),%%ymm0 \n"
8362 "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
8363 "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
8364 "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
8365 "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
8366 "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
8367 "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
8368 "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
8369 "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
8370 "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
8371 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
8372 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
8373 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
8374 "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
8375 "lea 0x20(%1),%1 \n"
8376 "sub $0x20,%2 \n"
8377 "jg 1b \n"
8378 "jmp 99f \n"
8379
8380 // Blend 50 / 50.
8381 LABELALIGN
8382 "50: \n"
8383 "vmovdqu (%1),%%ymm0 \n"
8384 "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
8385 "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
8386 "lea 0x20(%1),%1 \n"
8387 "sub $0x20,%2 \n"
8388 "jg 50b \n"
8389 "jmp 99f \n"
8390
8391 // Blend 100 / 0 - Copy row unchanged.
8392 LABELALIGN
8393 "100: \n"
8394 "vmovdqu (%1),%%ymm0 \n"
8395 "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
8396 "lea 0x20(%1),%1 \n"
8397 "sub $0x20,%2 \n"
8398 "jg 100b \n"
8399
8400 "99: \n"
8401 "vzeroupper \n"
8402 : "+r"(dst_ptr), // %0
8403 "+r"(src_ptr), // %1
8404 "+r"(width), // %2
8405 "+r"(source_y_fraction) // %3
8406 : "r"((intptr_t)(src_stride)) // %4
8407 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
8408 }
8409 #endif // HAS_INTERPOLATEROW_AVX2
8410
8411 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
8412 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
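// The 16-byte shuffler is a pshufb mask; for example (illustrative mask,
// not one of the named tables), {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}
// reverses the byte order of each 4-byte pixel.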
8413 void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
8414 uint8_t* dst_argb,
8415 const uint8_t* shuffler,
8416 int width) {
8417 asm volatile(
8418
8419 "movdqu (%3),%%xmm5 \n"
8420
8421 LABELALIGN
8422 "1: \n"
8423 "movdqu (%0),%%xmm0 \n"
8424 "movdqu 0x10(%0),%%xmm1 \n"
8425 "lea 0x20(%0),%0 \n"
8426 "pshufb %%xmm5,%%xmm0 \n"
8427 "pshufb %%xmm5,%%xmm1 \n"
8428 "movdqu %%xmm0,(%1) \n"
8429 "movdqu %%xmm1,0x10(%1) \n"
8430 "lea 0x20(%1),%1 \n"
8431 "sub $0x8,%2 \n"
8432 "jg 1b \n"
8433 : "+r"(src_argb), // %0
8434 "+r"(dst_argb), // %1
8435 "+r"(width) // %2
8436 : "r"(shuffler) // %3
8437 : "memory", "cc", "xmm0", "xmm1", "xmm5");
8438 }
8439 #endif // HAS_ARGBSHUFFLEROW_SSSE3
8440
8441 #ifdef HAS_ARGBSHUFFLEROW_AVX2
8442 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
8443 void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
8444 uint8_t* dst_argb,
8445 const uint8_t* shuffler,
8446 int width) {
8447 asm volatile(
8448
8449 "vbroadcastf128 (%3),%%ymm5 \n"
8450
8451 LABELALIGN
8452 "1: \n"
8453 "vmovdqu (%0),%%ymm0 \n"
8454 "vmovdqu 0x20(%0),%%ymm1 \n"
8455 "lea 0x40(%0),%0 \n"
8456 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
8457 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
8458 "vmovdqu %%ymm0,(%1) \n"
8459 "vmovdqu %%ymm1,0x20(%1) \n"
8460 "lea 0x40(%1),%1 \n"
8461 "sub $0x10,%2 \n"
8462 "jg 1b \n"
8463 "vzeroupper \n"
8464 : "+r"(src_argb), // %0
8465 "+r"(dst_argb), // %1
8466 "+r"(width) // %2
8467 : "r"(shuffler) // %3
8468 : "memory", "cc", "xmm0", "xmm1", "xmm5");
8469 }
8470 #endif // HAS_ARGBSHUFFLEROW_AVX2
8471
8472 #ifdef HAS_I422TOYUY2ROW_SSE2
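// Packs I422 planes into YUY2; conceptually, per 2 pixels:
//   dst[0] = y[0]; dst[1] = u[0]; dst[2] = y[1]; dst[3] = v[0];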
8473 void I422ToYUY2Row_SSE2(const uint8_t* src_y,
8474 const uint8_t* src_u,
8475 const uint8_t* src_v,
8476 uint8_t* dst_yuy2,
8477 int width) {
8478 asm volatile(
8479
8480 "sub %1,%2 \n"
8481
8482 LABELALIGN
8483 "1: \n"
8484 "movq (%1),%%xmm2 \n"
8485 "movq 0x00(%1,%2,1),%%xmm1 \n"
8486 "add $0x8,%1 \n"
8487 "punpcklbw %%xmm1,%%xmm2 \n"
8488 "movdqu (%0),%%xmm0 \n"
8489 "add $0x10,%0 \n"
8490 "movdqa %%xmm0,%%xmm1 \n"
8491 "punpcklbw %%xmm2,%%xmm0 \n"
8492 "punpckhbw %%xmm2,%%xmm1 \n"
8493 "movdqu %%xmm0,(%3) \n"
8494 "movdqu %%xmm1,0x10(%3) \n"
8495 "lea 0x20(%3),%3 \n"
8496 "sub $0x10,%4 \n"
8497 "jg 1b \n"
8498 : "+r"(src_y), // %0
8499 "+r"(src_u), // %1
8500 "+r"(src_v), // %2
8501 "+r"(dst_yuy2), // %3
8502 "+rm"(width) // %4
8503 :
8504 : "memory", "cc", "xmm0", "xmm1", "xmm2");
8505 }
8506 #endif // HAS_I422TOYUY2ROW_SSE2
8507
8508 #ifdef HAS_I422TOUYVYROW_SSE2
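// Packs I422 planes into UYVY; conceptually, per 2 pixels:
//   dst[0] = u[0]; dst[1] = y[0]; dst[2] = v[0]; dst[3] = y[1];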
8509 void I422ToUYVYRow_SSE2(const uint8_t* src_y,
8510 const uint8_t* src_u,
8511 const uint8_t* src_v,
8512 uint8_t* dst_uyvy,
8513 int width) {
8514 asm volatile(
8515
8516 "sub %1,%2 \n"
8517
8518 LABELALIGN
8519 "1: \n"
8520 "movq (%1),%%xmm2 \n"
8521 "movq 0x00(%1,%2,1),%%xmm1 \n"
8522 "add $0x8,%1 \n"
8523 "punpcklbw %%xmm1,%%xmm2 \n"
8524 "movdqu (%0),%%xmm0 \n"
8525 "movdqa %%xmm2,%%xmm1 \n"
8526 "add $0x10,%0 \n"
8527 "punpcklbw %%xmm0,%%xmm1 \n"
8528 "punpckhbw %%xmm0,%%xmm2 \n"
8529 "movdqu %%xmm1,(%3) \n"
8530 "movdqu %%xmm2,0x10(%3) \n"
8531 "lea 0x20(%3),%3 \n"
8532 "sub $0x10,%4 \n"
8533 "jg 1b \n"
8534 : "+r"(src_y), // %0
8535 "+r"(src_u), // %1
8536 "+r"(src_v), // %2
8537 "+r"(dst_uyvy), // %3
8538 "+rm"(width) // %4
8539 :
8540 : "memory", "cc", "xmm0", "xmm1", "xmm2");
8541 }
8542 #endif // HAS_I422TOUYVYROW_SSE2
8543
8544 #ifdef HAS_I422TOYUY2ROW_AVX2
8545 void I422ToYUY2Row_AVX2(const uint8_t* src_y,
8546 const uint8_t* src_u,
8547 const uint8_t* src_v,
8548 uint8_t* dst_yuy2,
8549 int width) {
8550 asm volatile(
8551
8552 "sub %1,%2 \n"
8553
8554 LABELALIGN
8555 "1: \n"
8556 "vpmovzxbw (%1),%%ymm1 \n"
8557 "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
8558 "add $0x10,%1 \n"
8559 "vpsllw $0x8,%%ymm2,%%ymm2 \n"
8560 "vpor %%ymm1,%%ymm2,%%ymm2 \n"
8561 "vmovdqu (%0),%%ymm0 \n"
8562 "add $0x20,%0 \n"
8563 "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
8564 "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
8565 "vextractf128 $0x0,%%ymm1,(%3) \n"
8566 "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
8567 "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
8568 "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
8569 "lea 0x40(%3),%3 \n"
8570 "sub $0x20,%4 \n"
8571 "jg 1b \n"
8572 "vzeroupper \n"
8573 : "+r"(src_y), // %0
8574 "+r"(src_u), // %1
8575 "+r"(src_v), // %2
8576 "+r"(dst_yuy2), // %3
8577 "+rm"(width) // %4
8578 :
8579 : "memory", "cc", "xmm0", "xmm1", "xmm2");
8580 }
8581 #endif // HAS_I422TOYUY2ROW_AVX2
8582
8583 #ifdef HAS_I422TOUYVYROW_AVX2
8584 void I422ToUYVYRow_AVX2(const uint8_t* src_y,
8585 const uint8_t* src_u,
8586 const uint8_t* src_v,
8587 uint8_t* dst_uyvy,
8588 int width) {
8589 asm volatile(
8590
8591 "sub %1,%2 \n"
8592
8593 LABELALIGN
8594 "1: \n"
8595 "vpmovzxbw (%1),%%ymm1 \n"
8596 "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
8597 "add $0x10,%1 \n"
8598 "vpsllw $0x8,%%ymm2,%%ymm2 \n"
8599 "vpor %%ymm1,%%ymm2,%%ymm2 \n"
8600 "vmovdqu (%0),%%ymm0 \n"
8601 "add $0x20,%0 \n"
8602 "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
8603 "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
8604 "vextractf128 $0x0,%%ymm1,(%3) \n"
8605 "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
8606 "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
8607 "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
8608 "lea 0x40(%3),%3 \n"
8609 "sub $0x20,%4 \n"
8610 "jg 1b \n"
8611 "vzeroupper \n"
8612 : "+r"(src_y), // %0
8613 "+r"(src_u), // %1
8614 "+r"(src_v), // %2
8615 "+r"(dst_uyvy), // %3
8616 "+rm"(width) // %4
8617 :
8618 : "memory", "cc", "xmm0", "xmm1", "xmm2");
8619 }
8620 #endif // HAS_I422TOUYVYROW_AVX2
8621
8622 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
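// Evaluates a cubic polynomial on each byte. Scalar sketch, with poly
// laid out as [C0 x4][C1 x4][C2 x4][C3 x4] (one coefficient per channel):
//   float x = src[i];
//   float v = c0 + c1 * x + c2 * x * x + c3 * x * x * x;
//   dst[i] = (uint8_t)v;  // packed with unsigned saturation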
8623 void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
8624 uint8_t* dst_argb,
8625 const float* poly,
8626 int width) {
8627 asm volatile(
8628
8629 "pxor %%xmm3,%%xmm3 \n"
8630
8631 // 2 pixel loop.
8632 LABELALIGN
8633 "1: \n"
8634 "movq (%0),%%xmm0 \n"
8635 "lea 0x8(%0),%0 \n"
8636 "punpcklbw %%xmm3,%%xmm0 \n"
8637 "movdqa %%xmm0,%%xmm4 \n"
8638 "punpcklwd %%xmm3,%%xmm0 \n"
8639 "punpckhwd %%xmm3,%%xmm4 \n"
8640 "cvtdq2ps %%xmm0,%%xmm0 \n"
8641 "cvtdq2ps %%xmm4,%%xmm4 \n"
8642 "movdqa %%xmm0,%%xmm1 \n"
8643 "movdqa %%xmm4,%%xmm5 \n"
8644 "mulps 0x10(%3),%%xmm0 \n"
8645 "mulps 0x10(%3),%%xmm4 \n"
8646 "addps (%3),%%xmm0 \n"
8647 "addps (%3),%%xmm4 \n"
8648 "movdqa %%xmm1,%%xmm2 \n"
8649 "movdqa %%xmm5,%%xmm6 \n"
8650 "mulps %%xmm1,%%xmm2 \n"
8651 "mulps %%xmm5,%%xmm6 \n"
8652 "mulps %%xmm2,%%xmm1 \n"
8653 "mulps %%xmm6,%%xmm5 \n"
8654 "mulps 0x20(%3),%%xmm2 \n"
8655 "mulps 0x20(%3),%%xmm6 \n"
8656 "mulps 0x30(%3),%%xmm1 \n"
8657 "mulps 0x30(%3),%%xmm5 \n"
8658 "addps %%xmm2,%%xmm0 \n"
8659 "addps %%xmm6,%%xmm4 \n"
8660 "addps %%xmm1,%%xmm0 \n"
8661 "addps %%xmm5,%%xmm4 \n"
8662 "cvttps2dq %%xmm0,%%xmm0 \n"
8663 "cvttps2dq %%xmm4,%%xmm4 \n"
8664 "packuswb %%xmm4,%%xmm0 \n"
8665 "packuswb %%xmm0,%%xmm0 \n"
8666 "movq %%xmm0,(%1) \n"
8667 "lea 0x8(%1),%1 \n"
8668 "sub $0x2,%2 \n"
8669 "jg 1b \n"
8670 : "+r"(src_argb), // %0
8671 "+r"(dst_argb), // %1
8672 "+r"(width) // %2
8673 : "r"(poly) // %3
8674 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
8675 }
8676 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
8677
8678 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
8679 void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
8680 uint8_t* dst_argb,
8681 const float* poly,
8682 int width) {
8683 asm volatile(
8684 "vbroadcastf128 (%3),%%ymm4 \n"
8685 "vbroadcastf128 0x10(%3),%%ymm5 \n"
8686 "vbroadcastf128 0x20(%3),%%ymm6 \n"
8687 "vbroadcastf128 0x30(%3),%%ymm7 \n"
8688
8689 // 2 pixel loop.
8690 LABELALIGN
8691 "1: \n"
8692 "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels
8693 "lea 0x8(%0),%0 \n"
8694 "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
8695 "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
8696 "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
8697 "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
8698 "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
8699 "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X *
8700 // X
8701 "vcvttps2dq %%ymm0,%%ymm0 \n"
8702 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
8703 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
8704 "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
8705 "vmovq %%xmm0,(%1) \n"
8706 "lea 0x8(%1),%1 \n"
8707 "sub $0x2,%2 \n"
8708 "jg 1b \n"
8709 "vzeroupper \n"
8710 : "+r"(src_argb), // %0
8711 "+r"(dst_argb), // %1
8712 "+r"(width) // %2
8713 : "r"(poly) // %3
8714 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
8715 "xmm7");
8716 }
8717 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
8718
8719 #ifdef HAS_HALFFLOATROW_SSE2
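// kScaleBias is 2^-112. Multiplying by it rebiases the float exponent from
// 127 to 15, so the raw float bits shifted right by 13 (the psrld $0xd
// below) line up as an IEEE half float.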
8720 static float kScaleBias = 1.9259299444e-34f;
8721 void HalfFloatRow_SSE2(const uint16_t* src,
8722 uint16_t* dst,
8723 float scale,
8724 int width) {
8725 scale *= kScaleBias;
8726 asm volatile(
8727 "movd %3,%%xmm4 \n"
8728 "pshufd $0x0,%%xmm4,%%xmm4 \n"
8729 "pxor %%xmm5,%%xmm5 \n"
8730 "sub %0,%1 \n"
8731
8732 // 8 pixel loop.
8733 LABELALIGN
8734 "1: \n"
8735 "movdqu (%0),%%xmm2 \n" // 8 shorts
8736 "add $0x10,%0 \n"
8737 "movdqa %%xmm2,%%xmm3 \n"
8738 "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1
8739 "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
8740 "punpckhwd %%xmm5,%%xmm3 \n"
8741 "cvtdq2ps %%xmm3,%%xmm3 \n"
8742 "mulps %%xmm4,%%xmm2 \n"
8743 "mulps %%xmm4,%%xmm3 \n"
8744 "psrld $0xd,%%xmm2 \n"
8745 "psrld $0xd,%%xmm3 \n"
8746 "packssdw %%xmm3,%%xmm2 \n"
8747 "movdqu %%xmm2,-0x10(%0,%1,1) \n"
8748 "sub $0x8,%2 \n"
8749 "jg 1b \n"
8750 : "+r"(src), // %0
8751 "+r"(dst), // %1
8752 "+r"(width) // %2
8753 : "m"(scale) // %3
8754 : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
8755 }
8756 #endif // HAS_HALFFLOATROW_SSE2
8757
8758 #ifdef HAS_HALFFLOATROW_AVX2
8759 void HalfFloatRow_AVX2(const uint16_t* src,
8760 uint16_t* dst,
8761 float scale,
8762 int width) {
8763 scale *= kScaleBias;
8764 asm volatile(
8765 "vbroadcastss %3, %%ymm4 \n"
8766 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
8767 "sub %0,%1 \n"
8768
8769 // 16 pixel loop.
8770 LABELALIGN
8771 "1: \n"
8772 "vmovdqu (%0),%%ymm2 \n" // 16 shorts
8773 "add $0x20,%0 \n"
8774 "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
8775 "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
8776 "vcvtdq2ps %%ymm3,%%ymm3 \n"
8777 "vcvtdq2ps %%ymm2,%%ymm2 \n"
8778 "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
8779 "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
8780 "vpsrld $0xd,%%ymm3,%%ymm3 \n"
8781 "vpsrld $0xd,%%ymm2,%%ymm2 \n"
8782 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
8783 "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
8784 "sub $0x10,%2 \n"
8785 "jg 1b \n"
8786
8787 "vzeroupper \n"
8788 : "+r"(src), // %0
8789 "+r"(dst), // %1
8790 "+r"(width) // %2
8791 #if defined(__x86_64__)
8792 : "x"(scale) // %3
8793 #else
8794 : "m"(scale) // %3
8795 #endif
8796 : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
8797 }
8798 #endif // HAS_HALFFLOATROW_AVX2
8799
8800 #ifdef HAS_HALFFLOATROW_F16C
8801 void HalfFloatRow_F16C(const uint16_t* src,
8802 uint16_t* dst,
8803 float scale,
8804 int width) {
8805 asm volatile(
8806 "vbroadcastss %3, %%ymm4 \n"
8807 "sub %0,%1 \n"
8808
8809 // 16 pixel loop.
8810 LABELALIGN
8811 "1: \n"
8812 "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
8813 "vpmovzxwd 0x10(%0),%%ymm3 \n"
8814 "vcvtdq2ps %%ymm2,%%ymm2 \n"
8815 "vcvtdq2ps %%ymm3,%%ymm3 \n"
8816 "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
8817 "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
8818 "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
8819 "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
8820 "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
8821 "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
8822 "add $0x20,%0 \n"
8823 "sub $0x10,%2 \n"
8824 "jg 1b \n"
8825 "vzeroupper \n"
8826 : "+r"(src), // %0
8827 "+r"(dst), // %1
8828 "+r"(width) // %2
8829 #if defined(__x86_64__)
8830 : "x"(scale) // %3
8831 #else
8832 : "m"(scale) // %3
8833 #endif
8834 : "memory", "cc", "xmm2", "xmm3", "xmm4");
8835 }
8836 #endif // HAS_HALFFLOATROW_F16C
8837
8838 #ifdef HAS_HALFFLOATROW_F16C
8839 void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
8840 asm volatile(
8841 "sub %0,%1 \n"
8842 // 16 pixel loop.
8843 LABELALIGN
8844 "1: \n"
8845 "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
8846 "vpmovzxwd 0x10(%0),%%ymm3 \n"
8847 "vcvtdq2ps %%ymm2,%%ymm2 \n"
8848 "vcvtdq2ps %%ymm3,%%ymm3 \n"
8849 "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
8850 "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
8851 "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
8852 "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
8853 "add $0x20,%0 \n"
8854 "sub $0x10,%2 \n"
8855 "jg 1b \n"
8856 "vzeroupper \n"
8857 : "+r"(src), // %0
8858 "+r"(dst), // %1
8859 "+r"(width) // %2
8860 :
8861 : "memory", "cc", "xmm2", "xmm3");
8862 }
8863 #endif // HAS_HALFFLOATROW_F16C
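
// A hedged intrinsics sketch of the F16C path above, for illustration only
// (not one of the library's row functions; assumes <immintrin.h> has been
// included at file scope and AVX2 plus F16C are enabled). The immediate 3
// passed to _mm256_cvtps_ph matches the "vcvtps2ph $3" above, i.e. round
// toward zero. width is assumed to be a positive multiple of 8.
#if defined(__AVX2__) && defined(__F16C__)
static void HalfFloatRow_F16C_Sketch(const uint16_t* src,
                                     uint16_t* dst,
                                     float scale,
                                     int width) {
  const __m256 vscale = _mm256_set1_ps(scale);
  int i;
  for (i = 0; i < width; i += 8) {
    // Zero-extend 8 uint16 values to 32-bit lanes and convert to float.
    __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
    __m256 vf = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(v));
    // Scale, then narrow to half floats with round-toward-zero.
    _mm_storeu_si128((__m128i*)(dst + i),
                     _mm256_cvtps_ph(_mm256_mul_ps(vf, vscale), 3));
  }
}
#endif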

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8_t* dst_argb,
                           const uint8_t* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"
      "movzb 0x00(%3,%1,4),%1 \n"
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"
      "mov %b1,-0x2(%0) \n"
      "movzb -0x1(%0),%1 \n"
      "movzb 0x03(%3,%1,4),%1 \n"
      "mov %b1,-0x1(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif // HAS_ARGBCOLORTABLEROW_X86
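
// A scalar sketch of the in-place lookup above, for illustration only (the
// _Sketch name is not part of the library). table_argb holds 256 entries of
// 4 bytes; each channel indexes its own byte within the matching entry,
// exactly as the movzb pairs above do.
static void ARGBColorTableRow_Sketch(uint8_t* dst_argb,
                                     const uint8_t* table_argb,
                                     int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];
    dst_argb += 4;
  }
}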

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8_t* dst_argb,
                          const uint8_t* table_argb,
                          int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"
      "movzb 0x00(%3,%1,4),%1 \n"
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"
      "mov %b1,-0x2(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform ARGB pixels with luma table.
void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int width,
                                 const uint8_t* luma,
                                 uint32_t lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile(
      "movd %6,%%xmm3 \n"
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0x8,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"

      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%2),%%xmm0 \n"
      "pmaddubsw %%xmm3,%%xmm0 \n"
      "phaddw %%xmm0,%%xmm0 \n"
      "pand %%xmm4,%%xmm0 \n"
      "punpcklwd %%xmm5,%%xmm0 \n"
      "movd %%xmm0,%k1 \n" // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"

      "movzb (%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,(%3) \n"
      "movzb 0x1(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x1(%3) \n"
      "movzb 0x2(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x2(%3) \n"
      "movzb 0x3(%2),%0 \n"
      "mov %b0,0x3(%3) \n"

      "movd %%xmm0,%k1 \n" // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"

      "movzb 0x4(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x4(%3) \n"
      "movzb 0x5(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x5(%3) \n"
      "movzb 0x6(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x6(%3) \n"
      "movzb 0x7(%2),%0 \n"
      "mov %b0,0x7(%3) \n"

      "movd %%xmm0,%k1 \n" // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"

      "movzb 0x8(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x8(%3) \n"
      "movzb 0x9(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x9(%3) \n"
      "movzb 0xa(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xa(%3) \n"
      "movzb 0xb(%2),%0 \n"
      "mov %b0,0xb(%3) \n"

      "movd %%xmm0,%k1 \n" // 32 bit offset
      "add %5,%1 \n"

      "movzb 0xc(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xc(%3) \n"
      "movzb 0xd(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xd(%3) \n"
      "movzb 0xe(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xe(%3) \n"
      "movzb 0xf(%2),%0 \n"
      "mov %b0,0xf(%3) \n"
      "lea 0x10(%2),%2 \n"
      "lea 0x10(%3),%3 \n"
      "sub $0x4,%4 \n"
      "jg 1b \n"
      : "=&d"(pixel_temp),  // %0
        "=&a"(table_temp),  // %1
        "+r"(src_argb),     // %2
        "+r"(dst_argb),     // %3
        "+rm"(width)        // %4
      : "r"(luma),          // %5
        "rm"(lumacoeff)     // %6
      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
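
// A hedged scalar sketch of the transform above, for illustration only; the
// _Sketch name and the table layout are inferred from the asm, not from a
// public contract. lumacoeff packs four byte weights, one per channel; the
// weighted sum, masked to its high byte (the pand with 0xff00 above),
// selects a 256-byte row of the luma table through which B, G and R are
// mapped. Alpha is copied unchanged, matching the asm's fourth byte.
static void ARGBLumaColorTableRow_Sketch(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width,
                                         const uint8_t* luma,
                                         uint32_t lumacoeff) {
  const uint32_t bc = lumacoeff & 0xff;
  const uint32_t gc = (lumacoeff >> 8) & 0xff;
  const uint32_t rc = (lumacoeff >> 16) & 0xff;
  int i;
  for (i = 0; i < width; ++i) {
    const uint8_t* row =
        luma +
        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0xff00);
    dst_argb[0] = row[src_argb[0]];
    dst_argb[1] = row[src_argb[1]];
    dst_argb[2] = row[src_argb[2]];
    dst_argb[3] = src_argb[3];  // Alpha passes through untouched.
    src_argb += 4;
    dst_argb += 4;
  }
}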

static const uvec8 kYUV24Shuffle[3] = {
    {8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12},
    {9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15},
    {2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7}};

// Convert biplanar NV21 to packed YUV24
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory
void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
                          const uint8_t* src_vu,
                          uint8_t* dst_yuv24,
                          int width) {
  asm volatile(
      "sub %0,%1 \n"
      "movdqa (%4),%%xmm4 \n" // 3 shuffler constants
      "movdqa 16(%4),%%xmm5 \n"
      "movdqa 32(%4),%%xmm6 \n"
      "1: \n"
      "movdqu (%0),%%xmm2 \n" // load 16 Y values
      "movdqu (%0,%1),%%xmm3 \n" // load 8 VU values
      "lea 16(%0),%0 \n"
      "movdqa %%xmm2,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "shufps $0x44,%%xmm3,%%xmm0 \n" // Y 0..7, UV 0..3
      "shufps $0x99,%%xmm3,%%xmm1 \n" // Y 4..11, UV 2..5
      "shufps $0xee,%%xmm3,%%xmm2 \n" // Y 8..15, UV 4..7
      "pshufb %%xmm4, %%xmm0 \n" // weave into YUV24
      "pshufb %%xmm5, %%xmm1 \n"
      "pshufb %%xmm6, %%xmm2 \n"
      "movdqu %%xmm0,(%2) \n"
      "movdqu %%xmm1,16(%2) \n"
      "movdqu %%xmm2,32(%2) \n"
      "lea 48(%2),%2 \n"
      "sub $16,%3 \n" // 16 pixels per loop
      "jg 1b \n"
      : "+r"(src_y),            // %0
        "+r"(src_vu),           // %1
        "+r"(dst_yuv24),        // %2
        "+r"(width)             // %3
      : "r"(&kYUV24Shuffle[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
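
// A scalar sketch of the conversion, for illustration only (the _Sketch
// name is not part of the library): each output pixel is three bytes, V
// then U then Y, with each VU pair shared by two adjacent Y samples.
static void NV21ToYUV24Row_Sketch(const uint8_t* src_y,
                                  const uint8_t* src_vu,
                                  uint8_t* dst_yuv24,
                                  int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_yuv24[0] = src_vu[(i & ~1) + 0];  // V
    dst_yuv24[1] = src_vu[(i & ~1) + 1];  // U
    dst_yuv24[2] = src_y[i];              // Y
    dst_yuv24 += 3;
  }
}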

// Convert biplanar NV21 to packed YUV24
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
                         int width) {
  asm volatile(
      "sub %0,%1 \n"
      "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants
      "vbroadcastf128 16(%4),%%ymm5 \n"
      "vbroadcastf128 32(%4),%%ymm6 \n"

      "1: \n"
      "vmovdqu (%0),%%ymm2 \n" // load 32 Y values
      "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
      "lea 32(%0),%0 \n"
      "vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n" // Y 0..7, UV 0..3
      "vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n" // Y 4..11, UV 2..5
      "vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n" // Y 8..15, UV 4..7
      "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // weave into YUV24
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n"
      "vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n"
      "vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm3,(%2) \n"
      "vmovdqu %%ymm0,32(%2) \n"
      "vmovdqu %%ymm1,64(%2) \n"
      "lea 96(%2),%2 \n"
      "sub $32,%3 \n" // 32 pixels per loop
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),            // %0
        "+r"(src_vu),           // %1
        "+r"(dst_yuv24),        // %2
        "+r"(width)             // %3
      : "r"(&kYUV24Shuffle[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_NV21ToYUV24ROW_AVX512
// The following VBMI VEX256 code tests okay with the Intel SDE emulator.
static const lvec8 kYUV24Perm[3] = {
    {32, 33, 0, 32, 33, 1, 34, 35, 2, 34, 35, 3, 36, 37, 4, 36,
     37, 5, 38, 39, 6, 38, 39, 7, 40, 41, 8, 40, 41, 9, 42, 43},
    {10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
     48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52},
    {53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
     26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31}};

void NV21ToYUV24Row_AVX512(const uint8_t* src_y,
                           const uint8_t* src_vu,
                           uint8_t* dst_yuv24,
                           int width) {
  asm volatile(
      "sub %0,%1 \n"
      "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants
      "vmovdqa 32(%4),%%ymm5 \n"
      "vmovdqa 64(%4),%%ymm6 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm2 \n" // load 32 Y values
      "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
      "lea 32(%0),%0 \n"
      "vmovdqa %%ymm2, %%ymm0 \n"
      "vmovdqa %%ymm2, %%ymm1 \n"
      "vpermt2b %%ymm3,%%ymm4,%%ymm0 \n"
      "vpermt2b %%ymm3,%%ymm5,%%ymm1 \n"
      "vpermt2b %%ymm3,%%ymm6,%%ymm2 \n"
      "vmovdqu %%ymm0,(%2) \n"
      "vmovdqu %%ymm1,32(%2) \n"
      "vmovdqu %%ymm2,64(%2) \n"
      "lea 96(%2),%2 \n"
      "sub $32,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),         // %0
        "+r"(src_vu),        // %1
        "+r"(dst_yuv24),     // %2
        "+r"(width)          // %3
      : "r"(&kYUV24Perm[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#endif // HAS_NV21ToYUV24ROW_AVX512

#ifdef HAS_SWAPUVROW_SSSE3

// Shuffle table for swapping each UV byte pair to VU.
static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};

// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      "movdqu %3,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pshufb %%xmm5,%%xmm0 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_uv),        // %0
        "+r"(dst_vu),        // %1
        "+r"(width)          // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_SWAPUVROW_SSSE3
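
// A scalar sketch of the swap, for illustration only (the _Sketch name is
// not part of the library): each UV byte pair in an NV12 chroma row becomes
// a VU pair for NV21. width counts UV pairs, as in the row functions above.
static void SwapUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_vu,
                             int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_vu[0] = src_uv[1];  // V first.
    dst_vu[1] = src_uv[0];  // Then U.
    src_uv += 2;
    dst_vu += 2;
  }
}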

#ifdef HAS_SWAPUVROW_AVX2
void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uv),        // %0
        "+r"(dst_vu),        // %1
        "+r"(width)          // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_SWAPUVROW_AVX2

void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
                          int src_stride_u,
                          const uint8_t* src_v,
                          int src_stride_v,
                          uint8_t* dst_uv,
                          int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrlw $0xf,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n" // load 16 U values
      "movdqu (%1),%%xmm1 \n" // load 16 V values
      "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row
      "movdqu 0(%1,%5,1),%%xmm3 \n"
      "lea 0x10(%0),%0 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n" // half size
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "lea 0x10(%1),%1 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm3,%%xmm1 \n"
      "psrlw $0x1,%%xmm0 \n"
      "psrlw $0x1,%%xmm1 \n"
      "pavgw %%xmm5,%%xmm0 \n"
      "pavgw %%xmm5,%%xmm1 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "packuswb %%xmm1,%%xmm1 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n" // store 8 UV pixels
      "lea 0x10(%2),%2 \n"
      "sub $0x10,%3 \n" // 16 src pixels per loop
      "jg 1b \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void HalfMergeUVRow_AVX2(const uint8_t* src_u,
                         int src_stride_u,
                         const uint8_t* src_v,
                         int src_stride_v,
                         uint8_t* dst_uv,
                         int width) {
  asm volatile(
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n" // load 32 U values
      "vmovdqu (%1),%%ymm1 \n" // load 32 V values
      "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
      "vmovdqu 0(%1,%5,1),%%ymm3 \n"
      "lea 0x20(%0),%0 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "lea 0x20(%1),%1 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
      "lea 0x20(%2),%2 \n"
      "sub $0x20,%3 \n" // 32 src pixels per loop
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
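
// A scalar sketch of the half merge, for illustration only (the _Sketch
// name is not part of the library). The asm's psrlw $1 followed by pavgw
// with zero computes ((sum >> 1) + 1) >> 1, which equals the rounded 2x2
// average (sum + 2) >> 2 written out below.
static void HalfMergeUVRow_Sketch(const uint8_t* src_u,
                                  int src_stride_u,
                                  const uint8_t* src_v,
                                  int src_stride_v,
                                  uint8_t* dst_uv,
                                  int width) {
  int i;
  for (i = 0; i < width - 1; i += 2) {  // width counts source pixels.
    dst_uv[0] = (uint8_t)((src_u[0] + src_u[1] + src_u[src_stride_u] +
                           src_u[src_stride_u + 1] + 2) >> 2);
    dst_uv[1] = (uint8_t)((src_v[0] + src_v[1] + src_v[src_stride_v] +
                           src_v[src_stride_v + 1] + 2) >> 2);
    src_u += 2;
    src_v += 2;
    dst_uv += 2;
  }
}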

void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
  asm volatile(
      "pxor %%xmm1,%%xmm1 \n"

      LABELALIGN
      "1: \n"
      "movd (%0),%%xmm0 \n" // load float
      "maxss %%xmm1, %%xmm0 \n" // clamp to zero
      "add $0x4,%0 \n"
      "movd %%xmm0, (%1) \n" // store float
      "add $0x4,%1 \n"
      "sub $0x4,%2 \n" // 1 float per loop
      "jg 1b \n"
      : "+r"(src_x),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
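
// A scalar sketch of the clamp, for illustration only (the _Sketch name is
// not part of the library); width is taken as a count of floats here, while
// the asm above decrements its counter by 4 per float. NaN compares false
// and so also maps to zero, matching the operand order of the maxss above.
static void ClampFloatToZero_Sketch(const float* src_x,
                                    float* dst_y,
                                    int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const float v = src_x[i];
    dst_y[i] = (v > 0.f) ? v : 0.f;
  }
}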

#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif