/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
                               25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
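// Note: "ARGB" in libyuv means little-endian byte order B,G,R,A in memory,
// so the coefficient vectors here are laid out as {B, G, R, A}. kARGBToY
// holds the BT.601 limited-range weights, roughly 0.098 B + 0.504 G +
// 0.257 R scaled by 256; the +16 offset is applied later via kAddY16.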

// JPEG full range.
static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
                                29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};

static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u,
                                77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u};

static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
                                0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kABGRToUJ = {-43, -84, 127, 0, -43, -84, 127, 0,
                               -43, -84, 127, 0, -43, -84, 127, 0};

static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
                              -18, -94, 112, 0, -18, -94, 112, 0};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

static const vec8 kABGRToVJ = {127, -107, -20, 0, 127, -107, -20, 0,
                               127, -107, -20, 0, 127, -107, -20, 0};

// Constants for BGRA
static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
                               0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
                               66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
                               0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
                               0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};

static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                               0x8080u, 0x8080u, 0x8080u, 0x8080u};
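// How the bias constants fit together (a sketch of the arithmetic, not new
// behavior): the row loops subtract 128 from every byte (psubb with kSub128
// read as bytes) so pmaddubsw can treat the pixels as signed. For BT.601 Y
// the weights sum to 220, so kAddY16 = 220 * 128 + 16 * 256 + 128 = 0x7e80
// restores the bias, adds the +16 offset, and adds a +0.5 rounding term in a
// single paddw. For the JPEG variants the weights sum to 256, giving
// 256 * 128 + 128 = 0x8080, which is why kSub128 doubles as the rounding
// constant there.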

#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGBA.
static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
                                            14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6,
                                    8, 8, 10, 10, 12, 12, 14, 14,
                                    0, 0, 2, 2, 4, 4, 6, 6,
                                    8, 8, 10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7,
                                     9, 11, 9, 11, 13, 15, 13, 15,
                                     1, 3, 1, 3, 5, 7, 5, 7,
                                     9, 11, 9, 11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7,
                                    9, 9, 11, 11, 13, 13, 15, 15,
                                    1, 1, 3, 3, 5, 5, 7, 7,
                                    9, 9, 11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6,
                                     8, 10, 8, 10, 12, 14, 12, 14,
                                     0, 2, 0, 2, 4, 6, 4, 6,
                                     8, 10, 8, 10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0x18,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklwd %%xmm0,%%xmm0 \n"
      "punpckhwd %%xmm1,%%xmm1 \n"
      "por %%xmm5,%%xmm0 \n"
      "por %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_J400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm3 \n"
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      : "m"(kShuffleMaskRGB24ToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm3 \n"
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "m"(kShuffleMaskRAWToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// Same code as RAWToARGB with different shuffler and A in low bits
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0x000000ff
      "psrld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm3 \n"
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_rgba),  // %1
        "+r"(width)      // %2
      : "m"(kShuffleMaskRAWToRGBA)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "movdqa %3,%%xmm3 \n"
      "movdqa %4,%%xmm4 \n"
      "movdqa %5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x4(%0),%%xmm1 \n"
      "movdqu 0x8(%0),%%xmm2 \n"
      "lea 0x18(%0),%0 \n"
      "pshufb %%xmm3,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "pshufb %%xmm5,%%xmm2 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x8(%1) \n"
      "movq %%xmm2,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),    // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
        "m"(kShuffleMaskRAWToRGB24_1),  // %4
        "m"(kShuffleMaskRAWToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0x1080108,%%eax \n"
      "movd %%eax,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "mov $0x20802080,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psllw $0xb,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0xa,%%xmm4 \n"
      "psrlw $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psllw $0x8,%%xmm7 \n"
      "sub %0,%1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm3,%%xmm1 \n"
      "psllw $0xb,%%xmm2 \n"
      "pmulhuw %%xmm5,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm2 \n"
      "psllw $0x8,%%xmm1 \n"
      "por %%xmm2,%%xmm1 \n"
      "pand %%xmm4,%%xmm0 \n"
      "pmulhuw %%xmm6,%%xmm0 \n"
      "por %%xmm7,%%xmm0 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm2 \n"
      "movdqu %%xmm1,0x00(%1,%0,2) \n"
      "movdqu %%xmm2,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0x1080108,%%eax \n"
      "movd %%eax,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "mov $0x42004200,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psllw $0xb,%%xmm3 \n"
      "movdqa %%xmm3,%%xmm4 \n"
      "psrlw $0x6,%%xmm4 \n"
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psllw $0x8,%%xmm7 \n"
      "sub %0,%1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "psllw $0x1,%%xmm1 \n"
      "psllw $0xb,%%xmm2 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm2 \n"
      "pmulhuw %%xmm5,%%xmm1 \n"
      "psllw $0x8,%%xmm1 \n"
      "por %%xmm2,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm4,%%xmm0 \n"
      "psraw $0x8,%%xmm2 \n"
      "pmulhuw %%xmm6,%%xmm0 \n"
      "pand %%xmm7,%%xmm2 \n"
      "por %%xmm2,%%xmm0 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm2 \n"
      "movdqu %%xmm1,0x00(%1,%0,2) \n"
      "movdqu %%xmm2,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0xf0f0f0f,%%eax \n"
      "movd %%eax,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "pslld $0x4,%%xmm5 \n"
      "sub %0,%1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm4,%%xmm0 \n"
      "pand %%xmm5,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "psllw $0x4,%%xmm1 \n"
      "psrlw $0x4,%%xmm3 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm3,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"
      "punpckhbw %%xmm2,%%xmm1 \n"
      "movdqu %%xmm0,0x00(%1,%0,2) \n"
      "movdqu %%xmm1,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa %3,%%xmm6 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "pshufb %%xmm6,%%xmm0 \n"
      "pshufb %%xmm6,%%xmm1 \n"
      "pshufb %%xmm6,%%xmm2 \n"
      "pshufb %%xmm6,%%xmm3 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "psrldq $0x4,%%xmm1 \n"
      "pslldq $0xc,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "por %%xmm4,%%xmm0 \n"
      "pslldq $0x8,%%xmm5 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "psrldq $0x8,%%xmm2 \n"
      "pslldq $0x4,%%xmm3 \n"
      "por %%xmm3,%%xmm2 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "lea 0x30(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskARGBToRGB24)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

      "movdqa %3,%%xmm6 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "pshufb %%xmm6,%%xmm0 \n"
      "pshufb %%xmm6,%%xmm1 \n"
      "pshufb %%xmm6,%%xmm2 \n"
      "pshufb %%xmm6,%%xmm3 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "psrldq $0x4,%%xmm1 \n"
      "pslldq $0xc,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "por %%xmm4,%%xmm0 \n"
      "pslldq $0x8,%%xmm5 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "psrldq $0x8,%%xmm2 \n"
      "pslldq $0x4,%%xmm3 \n"
      "por %%xmm3,%%xmm2 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "lea 0x30(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskARGBToRAW)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTORGB24ROW_AVX2
// vpermd for 12+12 to 24
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};

void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"
      "vmovdqa %4,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpshufb %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vpermq $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq $0x4f,%%ymm2,%%ymm4 \n"
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vpermq $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq $0x93,%%ymm3,%%ymm3 \n"
      "vpor %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskARGBToRGB24),  // %3
        "m"(kPermdRGB24_AVX)           // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
// Shuffle table for converting ARGBToRGB24
static const ulvec8 kPermARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u,
    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
static const ulvec8 kPermARGBToRGB24_1 = {
    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
static const ulvec8 kPermARGBToRGB24_2 = {
    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};

void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vmovdqa %3,%%ymm5 \n"
      "vmovdqa %4,%%ymm6 \n"
      "vmovdqa %5,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
      "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
      "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kPermARGBToRGB24_0),  // %3
        "m"(kPermARGBToRGB24_1),  // %4
        "m"(kPermARGBToRGB24_2)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
}
#endif

#ifdef HAS_ARGBTORAWROW_AVX2
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"
      "vmovdqa %4,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpshufb %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vpermq $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq $0x4f,%%ymm2,%%ymm4 \n"
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vpermq $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq $0x93,%%ymm3,%%ymm3 \n"
      "vpor %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskARGBToRAW),  // %3
        "m"(kPermdRGB24_AVX)         // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0xb,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pslld $0x8,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x5,%%xmm2 \n"
      "psrad $0x10,%%xmm0 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pand %%xmm4,%%xmm2 \n"
      "pand %%xmm5,%%xmm0 \n"
      "por %%xmm2,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
                                uint32_t dither4,
                                int width) {
  asm volatile(
      "movd %3,%%xmm6 \n"
      "punpcklbw %%xmm6,%%xmm6 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "punpcklwd %%xmm6,%%xmm6 \n"
      "punpckhwd %%xmm7,%%xmm7 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0xb,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "paddusb %%xmm6,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pslld $0x8,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x5,%%xmm2 \n"
      "psrad $0x10,%%xmm0 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pand %%xmm4,%%xmm2 \n"
      "pand %%xmm5,%%xmm0 \n"
      "por %%xmm2,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                uint8_t* dst,
                                uint32_t dither4,
                                int width) {
  asm volatile(
      "vbroadcastss %3,%%xmm6 \n"
      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
      "vpermq $0xd8,%%ymm6,%%ymm6 \n"
      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
      "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
      "vpslld $0x5,%%ymm4,%%ymm4 \n"
      "vpslld $0xb,%%ymm3,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
      "vpsrld $0x5,%%ymm0,%%ymm2 \n"
      "vpsrld $0x3,%%ymm0,%%ymm1 \n"
      "vpsrld $0x8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm4,%%ymm2,%%ymm2 \n"
      "vpand %%ymm3,%%ymm1,%%ymm1 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpor %%ymm2,%%ymm1,%%ymm1 \n"
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"
      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "lea 0x20(%0),%0 \n"
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1b,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "pslld $0x5,%%xmm5 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "pslld $0xa,%%xmm6 \n"
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "pslld $0xf,%%xmm7 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm3 \n"
      "psrad $0x10,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x6,%%xmm2 \n"
      "psrld $0x9,%%xmm3 \n"
      "pand %%xmm7,%%xmm0 \n"
      "pand %%xmm4,%%xmm1 \n"
      "pand %%xmm5,%%xmm2 \n"
      "pand %%xmm6,%%xmm3 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm3,%%xmm2 \n"
      "por %%xmm2,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0xc,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm3 \n"
      "psrlw $0x8,%%xmm3 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm3,%%xmm0 \n"
      "pand %%xmm4,%%xmm1 \n"
      "psrlq $0x4,%%xmm0 \n"
      "psrlq $0x8,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_RGB24TOARGBROW_SSSE3

/*

ARGBToAR30Row:

Red Blue
With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
wanted for the blue channel. The red needs to be shifted left by 4, so
multiply by (1024+4)*16 for red.

Alpha Green
Alpha and Green are already in the high bits, so vpand can zero out the other
bits, keeping just the 2 upper bits of alpha and the 8 bit green. The same
multiplier, (1024+4), puts the 10 bit green in the lsb. Alpha needs a
multiplier that shifts it into position with a gap of 10 bits above the
green: green is 10 bits, so there are 6 bits in the low short and 4 more are
needed. A multiplier of 4 moves the 2 alpha bits into the upper 16 bits, and
a further shift of 4 is a multiply by 16, so the multiplier is 4*16 = 64.
Finally, shift the combined result left 10 to position the A and G channels.
*/

// Shuffle tables to place the R and B channels in the upper byte of each
// 16 bit lane (kShuffleBR30 is the swapped variant for ABGR input).
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};

static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};

static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
static const uint32_t kMaskRB10 = 0x3ff003ff;
static const uint32_t kMaskAG10 = 0xc000ff00;
static const uint32_t kMulAG10 = 64 * 65536 + 1028;
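
// A scalar sketch of the packing these constants implement (illustrative
// only; the helper below is ours and is not used by the SIMD paths).
// pmulhuw computes (a * b) >> 16, so with an 8 bit channel v held in the
// upper byte of a 16 bit lane (i.e. v << 8), multiplying by 1028 yields
// (v << 2) | (v >> 6): the usual 8-to-10 bit replication, mapping 255 to
// 1023.
static inline uint32_t ARGBPixelToAR30_C(uint32_t argb) {
  uint32_t b = argb & 0xff;
  uint32_t g = (argb >> 8) & 0xff;
  uint32_t r = (argb >> 16) & 0xff;
  uint32_t a = argb >> 30;  // top 2 bits of alpha
  uint32_t b10 = (b << 2) | (b >> 6);
  uint32_t g10 = (g << 2) | (g >> 6);
  uint32_t r10 = (r << 2) | (r >> 6);
  return (a << 30) | (r10 << 20) | (g10 << 10) | b10;  // A2 R10 G10 B10
}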

void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm2 \n"  // shuffler for RB
      "movd %4,%%xmm3 \n"    // multiplier for RB
      "movd %5,%%xmm4 \n"    // mask for R10 B10
      "movd %6,%%xmm5 \n"    // mask for AG
      "movd %7,%%xmm6 \n"    // multiplier for AG
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "sub %0,%1 \n"

      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // fetch 4 ARGB pixels
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm1 \n"   // R0B0
      "pand %%xmm5,%%xmm0 \n"     // A0G0
      "pmulhuw %%xmm3,%%xmm1 \n"  // X2 R16 X4 B10
      "pmulhuw %%xmm6,%%xmm0 \n"  // X10 A2 X10 G10
      "pand %%xmm4,%%xmm1 \n"     // X2 R10 X10 B10
      "pslld $10,%%xmm0 \n"       // A2 x10 G10 x10
      "por %%xmm1,%%xmm0 \n"      // A2 R10 G10 B10
      "movdqu %%xmm0,(%1,%0) \n"  // store 4 AR30 pixels
      "add $0x10,%0 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"

      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm2 \n"  // shuffler for RB
      "movd %4,%%xmm3 \n"    // multiplier for RB
      "movd %5,%%xmm4 \n"    // mask for R10 B10
      "movd %6,%%xmm5 \n"    // mask for AG
      "movd %7,%%xmm6 \n"    // multiplier for AG
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "sub %0,%1 \n"

      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // fetch 4 ABGR pixels
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm1 \n"   // R0B0
      "pand %%xmm5,%%xmm0 \n"     // A0G0
      "pmulhuw %%xmm3,%%xmm1 \n"  // X2 R16 X4 B10
      "pmulhuw %%xmm6,%%xmm0 \n"  // X10 A2 X10 G10
      "pand %%xmm4,%%xmm1 \n"     // X2 R10 X10 B10
      "pslld $10,%%xmm0 \n"       // A2 x10 G10 x10
      "por %%xmm1,%%xmm0 \n"      // A2 R10 G10 B10
      "movdqu %%xmm0,(%1,%0) \n"  // store 4 AR30 pixels
      "add $0x10,%0 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"

      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleBR30),  // %3 reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTOAR30ROW_AVX2
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2 \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3 \n"    // multiplier for RB
      "vbroadcastss %5,%%ymm4 \n"    // mask for R10 B10
      "vbroadcastss %6,%%ymm5 \n"    // mask for AG
      "vbroadcastss %7,%%ymm6 \n"    // multiplier for AG
      "sub %0,%1 \n"

      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // fetch 8 ARGB pixels
      "vpshufb %%ymm2,%%ymm0,%%ymm1 \n"   // R0B0
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"     // A0G0
      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"  // X2 R16 X4 B10
      "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n"  // X10 A2 X10 G10
      "vpand %%ymm4,%%ymm1,%%ymm1 \n"     // X2 R10 X10 B10
      "vpslld $10,%%ymm0,%%ymm0 \n"       // A2 x10 G10 x10
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"      // A2 R10 G10 B10
      "vmovdqu %%ymm0,(%1,%0) \n"         // store 8 AR30 pixels
      "add $0x20,%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"

      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleRB30),  // %3
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#ifdef HAS_ABGRTOAR30ROW_AVX2
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2 \n"  // shuffler for RB
      "vbroadcastss %4,%%ymm3 \n"    // multiplier for RB
      "vbroadcastss %5,%%ymm4 \n"    // mask for R10 B10
      "vbroadcastss %6,%%ymm5 \n"    // mask for AG
      "vbroadcastss %7,%%ymm6 \n"    // multiplier for AG
      "sub %0,%1 \n"

      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // fetch 8 ABGR pixels
      "vpshufb %%ymm2,%%ymm0,%%ymm1 \n"   // R0B0
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"     // A0G0
      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"  // X2 R16 X4 B10
      "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n"  // X10 A2 X10 G10
      "vpand %%ymm4,%%ymm1,%%ymm1 \n"     // X2 R10 X10 B10
      "vpslld $10,%%ymm0,%%ymm0 \n"       // A2 x10 G10 x10
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"      // A2 R10 G10 B10
      "vmovdqu %%ymm0,(%1,%0) \n"         // store 8 AR30 pixels
      "add $0x20,%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"

      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(kShuffleBR30),  // %3 reversed shuffler
        "m"(kMulRB10),      // %4
        "m"(kMaskRB10),     // %5
        "m"(kMaskAG10),     // %6
        "m"(kMulAG10)       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
                                         10, 9, 8, 11, 14, 13, 12, 15};

static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3,
                                           6, 6, 5, 5, 4, 4, 7, 7};
static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11,
                                           14, 14, 13, 13, 12, 12, 15, 15};
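
// The AR64 conversions widen 8 bit channels to 16 bits by pairing each byte
// with itself (punpcklbw/punpckhbw of a register with itself), i.e.
// v16 = (v << 8) | v = v * 257, which maps 0xff to 0xffff exactly. The AB64
// variants achieve the same replication through the Lo/Hi shuffle tables
// above while also swapping R and B.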

void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
                         uint16_t* dst_ar64,
                         int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"
      "punpckhbw %%xmm1,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}

void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
                         uint16_t* dst_ab64,
                         int width) {
  asm volatile(
      "movdqa %3,%%xmm2 \n"
      "movdqa %4,%%xmm3 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm0 \n"
      "pshufb %%xmm3,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x10(%0),%0 \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ab64),  // %1
        "+r"(width)      // %2
      : "m"(kShuffleARGBToAB64Lo),  // %3
        "m"(kShuffleARGBToAB64Hi)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}

void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "psrlw $8,%%xmm0 \n"
      "psrlw $8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}

void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(
      "movdqa %3,%%xmm2 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "psrlw $8,%%xmm0 \n"
      "psrlw $8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "pshufb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_ab64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "m"(kShuffleARGBToABGR)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}

#ifdef HAS_ARGBTOAR64ROW_AVX2
void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
                        uint16_t* dst_ar64,
                        int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif

#ifdef HAS_ARGBTOAB64ROW_AVX2
void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
                        uint16_t* dst_ab64,
                        int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2 \n"
      "vbroadcastf128 %4,%%ymm3 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm3,%%ymm0,%%ymm1 \n"
      "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ab64),  // %1
        "+r"(width)      // %2
      : "m"(kShuffleARGBToAB64Lo),  // %3
        "m"(kShuffleARGBToAB64Hi)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif

#ifdef HAS_AR64TOARGBROW_AVX2
void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vpsrlw $8,%%ymm0,%%ymm0 \n"
      "vpsrlw $8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x40(%0),%0 \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif

#ifdef HAS_AB64TOARGBROW_AVX2
void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vpsrlw $8,%%ymm0,%%ymm0 \n"
      "vpsrlw $8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x40(%0),%0 \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ab64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "m"(kShuffleARGBToABGR)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif

// clang-format off

// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
// round parameter is register containing value to add before shift.
#define RGBTOY(round) \
  "1: \n" \
  "movdqu (%0),%%xmm0 \n" \
  "movdqu 0x10(%0),%%xmm1 \n" \
  "movdqu 0x20(%0),%%xmm2 \n" \
  "movdqu 0x30(%0),%%xmm3 \n" \
  "psubb %%xmm5,%%xmm0 \n" \
  "psubb %%xmm5,%%xmm1 \n" \
  "psubb %%xmm5,%%xmm2 \n" \
  "psubb %%xmm5,%%xmm3 \n" \
  "movdqu %%xmm4,%%xmm6 \n" \
  "pmaddubsw %%xmm0,%%xmm6 \n" \
  "movdqu %%xmm4,%%xmm0 \n" \
  "pmaddubsw %%xmm1,%%xmm0 \n" \
  "movdqu %%xmm4,%%xmm1 \n" \
  "pmaddubsw %%xmm2,%%xmm1 \n" \
  "movdqu %%xmm4,%%xmm2 \n" \
  "pmaddubsw %%xmm3,%%xmm2 \n" \
  "lea 0x40(%0),%0 \n" \
  "phaddw %%xmm0,%%xmm6 \n" \
  "phaddw %%xmm2,%%xmm1 \n" \
  "prefetcht0 1280(%0) \n" \
  "paddw %%" #round ",%%xmm6 \n" \
  "paddw %%" #round ",%%xmm1 \n" \
  "psrlw $0x8,%%xmm6 \n" \
  "psrlw $0x8,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm6 \n" \
  "movdqu %%xmm6,(%1) \n" \
  "lea 0x10(%1),%1 \n" \
  "sub $0x10,%2 \n" \
  "jg 1b \n"

#define RGBTOY_AVX2(round) \
  "1: \n" \
  "vmovdqu (%0),%%ymm0 \n" \
  "vmovdqu 0x20(%0),%%ymm1 \n" \
  "vmovdqu 0x40(%0),%%ymm2 \n" \
  "vmovdqu 0x60(%0),%%ymm3 \n" \
  "vpsubb %%ymm5,%%ymm0,%%ymm0 \n" \
  "vpsubb %%ymm5,%%ymm1,%%ymm1 \n" \
  "vpsubb %%ymm5,%%ymm2,%%ymm2 \n" \
  "vpsubb %%ymm5,%%ymm3,%%ymm3 \n" \
  "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
  "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
  "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
  "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
  "lea 0x80(%0),%0 \n" \
  "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
  "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
  "prefetcht0 1280(%0) \n" \
  "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
  "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
  "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
  "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
  "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
  "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
  "vmovdqu %%ymm0,(%1) \n" \
  "lea 0x20(%1),%1 \n" \
  "sub $0x20,%2 \n" \
  "jg 1b \n" \
  "vzeroupper \n"

// clang-format on
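
// A scalar reference for what RGBTOY computes per pixel with the kARGBToY
// coefficients (a documentation sketch; this helper is ours and is not used
// by the SIMD paths):
static inline uint8_t RGBToY_C(uint8_t r, uint8_t g, uint8_t b) {
  // BT.601 limited range: weights sum to 220, +16 offset, +0.5 rounding.
  return (uint8_t)(((66 * r + 129 * g + 25 * b + 128) >> 8) + 16);
}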

#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"
      "movdqa %5,%%xmm7 \n"

      LABELALIGN
      RGBTOY(xmm7)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToY),  // %3
        "m"(kSub128),   // %4
        "m"(kAddY16)    // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"

      LABELALIGN
      RGBTOY(xmm5)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBTOYJROW_SSSE3

#ifdef HAS_ABGRTOYJROW_SSSE3
// Convert 16 ABGR pixels (64 bytes) to 16 YJ values.
// Same as ABGRToYRow but different coefficients, no add 16.
void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"

      LABELALIGN
      RGBTOY(xmm5)
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kABGRToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ABGRTOYJROW_SSSE3

#ifdef HAS_RGBATOYJROW_SSSE3
// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYJRow but with RGBA coefficient order.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"

      LABELALIGN
      RGBTOY(xmm5)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_RGBATOYJROW_SSSE3

#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \
    defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
// vpermd pattern that restores linear order after the lane-wise
// vphaddw + vpackuswb mutations.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
#endif

#ifdef HAS_ARGBTOYROW_AVX2

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vbroadcastf128 %5,%%ymm7 \n"
      "vmovdqu %6,%%ymm6 \n"

      LABELALIGN
      RGBTOY_AVX2(ymm7)
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToY),            // %3
        "m"(kSub128),             // %4
        "m"(kAddY16),             // %5
        "m"(kPermdARGBToY_AVX)    // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vbroadcastf128 %5,%%ymm7 \n"
      "vmovdqu %6,%%ymm6 \n"

      LABELALIGN
      RGBTOY_AVX2(ymm7)
      "vzeroupper \n"
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kABGRToY),            // %3
        "m"(kSub128),             // %4
        "m"(kAddY16),             // %5
        "m"(kPermdARGBToY_AVX)    // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vmovdqu %5,%%ymm6 \n"

      LABELALIGN
      RGBTOY_AVX2(ymm5)
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),           // %3
        "m"(kSub128),             // %4
        "m"(kPermdARGBToY_AVX)    // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYJROW_AVX2

#ifdef HAS_ABGRTOYJROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vmovdqu %5,%%ymm6 \n"

      LABELALIGN
      RGBTOY_AVX2(ymm5)
      "vzeroupper \n"
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kABGRToYJ),           // %3
        "m"(kSub128),             // %4
        "m"(kPermdARGBToY_AVX)    // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOYJROW_AVX2

#ifdef HAS_RGBATOYJROW_AVX2
// Convert 32 RGBA pixels (128 bytes) to 32 Y values.
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vmovdqu %5,%%ymm6 \n"

      LABELALIGN
      RGBTOY_AVX2(ymm5)
      "vzeroupper \n"
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToYJ),           // %3
        "m"(kSub128),             // %4
        "m"(kPermdARGBToY_AVX)    // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_RGBATOYJROW_AVX2

#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa %5,%%xmm3 \n"
      "movdqa %6,%%xmm4 \n"
      "movdqa %7,%%xmm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x10(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x20(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "movdqu 0x30(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"

      "lea 0x40(%0),%0 \n"
      "movdqa %%xmm0,%%xmm7 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm7 \n"
      "shufps $0x88,%%xmm6,%%xmm2 \n"
      "shufps $0xdd,%%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm6 \n"
      "phaddw %%xmm2,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm1 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm1 \n"
      "packsswb %%xmm1,%%xmm0 \n"
      "paddb %%xmm5,%%xmm0 \n"
      "movlps %%xmm0,(%1) \n"
      "movhps %%xmm0,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+rm"(width)     // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kARGBToV),   // %5
        "m"(kARGBToU),   // %6
        "m"(kAddUV128)   // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVROW_SSSE3
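
// Scalar reference for the U/V math above (illustrative; BT.601 limited
// range as in kARGBToU / kARGBToV, helper names ours). The SIMD path first
// averages each 2x2 block of pixels with pavgb, then applies these weighted
// sums via pmaddubsw / psraw and re-centers on 128 with kAddUV128. The >> 8
// below assumes an arithmetic shift of negative values, matching psraw.
static inline uint8_t RGBToU_C(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static inline uint8_t RGBToV_C(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}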

#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \
    defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2)
// vpshufb shuffle to reorder the short results of vphaddw + vpackuswb.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
#endif

#if defined(HAS_ARGBTOUVROW_AVX2)
void ARGBToUVRow_AVX2(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5 \n"
      "vbroadcastf128 %6,%%ymm6 \n"
      "vbroadcastf128 %7,%%ymm7 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
      "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
      "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
      "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
      "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
      "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
      "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

      "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
      "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
      "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpsraw $0x8,%%ymm1,%%ymm1 \n"
      "vpsraw $0x8,%%ymm0,%%ymm0 \n"
      "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb %8,%%ymm0,%%ymm0 \n"
      "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"

      "vextractf128 $0x0,%%ymm0,(%1) \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+rm"(width)     // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kAddUV128),  // %5
        "m"(kARGBToV),   // %6
        "m"(kARGBToU),   // %7
        "m"(kShufARGBToUV_AVX)  // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVROW_AVX2

#ifdef HAS_ABGRTOUVROW_AVX2
void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5 \n"
      "vbroadcastf128 %6,%%ymm6 \n"
      "vbroadcastf128 %7,%%ymm7 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
      "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
      "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
      "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
      "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
      "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
      "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

      "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
      "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
      "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpsraw $0x8,%%ymm1,%%ymm1 \n"
      "vpsraw $0x8,%%ymm0,%%ymm0 \n"
      "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb %8,%%ymm0,%%ymm0 \n"
      "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"

      "vextractf128 $0x0,%%ymm0,(%1) \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_abgr),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+rm"(width)     // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kAddUV128),  // %5
        "m"(kABGRToV),   // %6
        "m"(kABGRToU),   // %7
        "m"(kShufARGBToUV_AVX)  // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOUVROW_AVX2

#ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5 \n"
      "vbroadcastf128 %6,%%ymm6 \n"
      "vbroadcastf128 %7,%%ymm7 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
      "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
      "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
      "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
      "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
      "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
      "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

      "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
      "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
      "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
      "vpsraw $0x8,%%ymm1,%%ymm1 \n"
      "vpsraw $0x8,%%ymm0,%%ymm0 \n"
      "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb %8,%%ymm0,%%ymm0 \n"

      "vextractf128 $0x0,%%ymm0,(%1) \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kSub128),                      // %5
        "m"(kARGBToVJ),                    // %6
        "m"(kARGBToUJ),                    // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOUVJROW_AVX2

// TODO(fbarchard): Pass kABGRToVJ / kABGRToUJ as matrix
#ifdef HAS_ABGRTOUVJROW_AVX2
void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
                       int src_stride_abgr,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "vbroadcastf128 %5,%%ymm5 \n"
      "vbroadcastf128 %6,%%ymm6 \n"
      "vbroadcastf128 %7,%%ymm7 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
      "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
      "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
      "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
      "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
      "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
      "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

      "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
      "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
      "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
      "vpsraw $0x8,%%ymm1,%%ymm1 \n"
      "vpsraw $0x8,%%ymm0,%%ymm0 \n"
      "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb %8,%%ymm0,%%ymm0 \n"

      "vextractf128 $0x0,%%ymm0,(%1) \n"
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_abgr),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kSub128),                      // %5
        "m"(kABGRToVJ),                    // %6
        "m"(kABGRToUJ),                    // %7
        "m"(kShufARGBToUV_AVX)             // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOUVJROW_AVX2

#ifdef HAS_ARGBTOUVJROW_SSSE3
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
                        int src_stride_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  asm volatile(
      "movdqa %5,%%xmm3 \n"
      "movdqa %6,%%xmm4 \n"
      "movdqa %7,%%xmm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x10(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x20(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "movdqu 0x30(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"

      "lea 0x40(%0),%0 \n"
      "movdqa %%xmm0,%%xmm7 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm7 \n"
      "shufps $0x88,%%xmm6,%%xmm2 \n"
      "shufps $0xdd,%%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm6 \n"
      "phaddw %%xmm2,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm1 \n"
      "paddw %%xmm5,%%xmm0 \n"
      "paddw %%xmm5,%%xmm1 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm1 \n"
      "packsswb %%xmm1,%%xmm0 \n"
      "movlps %%xmm0,(%1) \n"
      "movhps %%xmm0,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "m"(kARGBToVJ),                    // %5
        "m"(kARGBToUJ),                    // %6
        "m"(kSub128)                       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
#endif  // HAS_ARGBTOUVJROW_SSSE3

#ifdef HAS_ABGRTOUVJROW_SSSE3
void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr,
                        int src_stride_abgr,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  asm volatile(
      "movdqa %5,%%xmm3 \n"
      "movdqa %6,%%xmm4 \n"
      "movdqa %7,%%xmm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x10(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x20(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "movdqu 0x30(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"

      "lea 0x40(%0),%0 \n"
      "movdqa %%xmm0,%%xmm7 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm7 \n"
      "shufps $0x88,%%xmm6,%%xmm2 \n"
      "shufps $0xdd,%%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm6 \n"
      "phaddw %%xmm2,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm1 \n"
      "paddw %%xmm5,%%xmm0 \n"
      "paddw %%xmm5,%%xmm1 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm1 \n"
      "packsswb %%xmm1,%%xmm0 \n"
      "movlps %%xmm0,(%1) \n"
      "movhps %%xmm0,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_abgr),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kABGRToVJ),                    // %5
        "m"(kABGRToUJ),                    // %6
        "m"(kSub128)                       // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
#endif  // HAS_ABGRTOUVJROW_SSSE3
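
// Note on the J (full range) variants above: instead of a paddb of 128 after
// the pack, they add kSub128 (0x8080 per 16-bit word) before the arithmetic
// shift. Bitwise, the psraw/packsswb pair then reduces to an unsigned
//   u8 = (sum + 0x8080) >> 8
// which folds the +128 chroma bias and round-to-nearest into a single add.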

#ifdef HAS_ARGBTOUV444ROW_SSSE3
void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  asm volatile(
      "movdqa %4,%%xmm3 \n"
      "movdqa %5,%%xmm4 \n"
      "movdqa %6,%%xmm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm6 \n"
      "phaddw %%xmm1,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm2 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm2 \n"
      "packsswb %%xmm2,%%xmm0 \n"
      "paddb %%xmm5,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "pmaddubsw %%xmm3,%%xmm0 \n"
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm2 \n"
      "pmaddubsw %%xmm3,%%xmm6 \n"
      "phaddw %%xmm1,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm2 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm2 \n"
      "packsswb %%xmm2,%%xmm0 \n"
      "paddb %%xmm5,%%xmm0 \n"
      "lea 0x40(%0),%0 \n"
      "movdqu %%xmm0,0x00(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_u),      // %1
        "+r"(dst_v),      // %2
        "+rm"(width)      // %3
      : "m"(kARGBToV),    // %4
        "m"(kARGBToU),    // %5
        "m"(kAddUV128)    // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
}
#endif  // HAS_ARGBTOUV444ROW_SSSE3
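
// The 444 variant applies the same dot products as the sketch above, but to
// every pixel with no 2x2 averaging, e.g. per pixel:
//   u = ((112 * b - 74 * g - 38 * r) >> 8) + 128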

void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"
      "movdqa %5,%%xmm7 \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_bgra),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kBGRAToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
                       int src_stride_bgra,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa %5,%%xmm3 \n"
      "movdqa %6,%%xmm4 \n"
      "movdqa %7,%%xmm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x10(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x20(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "movdqu 0x30(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"

      "lea 0x40(%0),%0 \n"
      "movdqa %%xmm0,%%xmm7 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm7 \n"
      "shufps $0x88,%%xmm6,%%xmm2 \n"
      "shufps $0xdd,%%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm6 \n"
      "phaddw %%xmm2,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm1 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm1 \n"
      "packsswb %%xmm1,%%xmm0 \n"
      "paddb %%xmm5,%%xmm0 \n"
      "movlps %%xmm0,(%1) \n"
      "movhps %%xmm0,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_bgra),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_bgra)),  // %4
        "m"(kBGRAToV),                     // %5
        "m"(kBGRAToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}

void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"
      "movdqa %5,%%xmm7 \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_abgr),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kABGRToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"
      "movdqa %5,%%xmm7 \n"

      LABELALIGN RGBTOY(xmm7)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
                       int src_stride_abgr,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa %5,%%xmm3 \n"
      "movdqa %6,%%xmm4 \n"
      "movdqa %7,%%xmm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x10(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x20(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "movdqu 0x30(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"

      "lea 0x40(%0),%0 \n"
      "movdqa %%xmm0,%%xmm7 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm7 \n"
      "shufps $0x88,%%xmm6,%%xmm2 \n"
      "shufps $0xdd,%%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm6 \n"
      "phaddw %%xmm2,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm1 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm1 \n"
      "packsswb %%xmm1,%%xmm0 \n"
      "paddb %%xmm5,%%xmm0 \n"
      "movlps %%xmm0,(%1) \n"
      "movhps %%xmm0,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_abgr),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_abgr)),  // %4
        "m"(kABGRToV),                     // %5
        "m"(kABGRToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}

void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
                       int src_stride_rgba,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile(
      "movdqa %5,%%xmm3 \n"
      "movdqa %6,%%xmm4 \n"
      "movdqa %7,%%xmm5 \n"
      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x10(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x20(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm6 \n"
      "movdqu 0x30(%0,%4,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"

      "lea 0x40(%0),%0 \n"
      "movdqa %%xmm0,%%xmm7 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm7 \n"
      "shufps $0x88,%%xmm6,%%xmm2 \n"
      "shufps $0xdd,%%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm3,%%xmm1 \n"
      "pmaddubsw %%xmm3,%%xmm6 \n"
      "phaddw %%xmm2,%%xmm0 \n"
      "phaddw %%xmm6,%%xmm1 \n"
      "psraw $0x8,%%xmm0 \n"
      "psraw $0x8,%%xmm1 \n"
      "packsswb %%xmm1,%%xmm0 \n"
      "paddb %%xmm5,%%xmm0 \n"
      "movlps %%xmm0,(%1) \n"
      "movhps %%xmm0,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_rgba),                    // %0
        "+r"(dst_u),                       // %1
        "+r"(dst_v),                       // %2
        "+rm"(width)                       // %3
      : "r"((intptr_t)(src_stride_rgba)),  // %4
        "m"(kRGBAToV),                     // %5
        "m"(kRGBAToU),                     // %6
        "m"(kAddUV128)                     // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}

#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

// Read 8 UV from 444
#define READYUV444 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n"
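
// In the READ* macros, "punpcklbw %%xmm4,%%xmm4" duplicates each 8-bit Y into
// both halves of a 16-bit lane, i.e. y16 = y8 * 0x0101, mapping 0..255 onto
// 0..65535 so the later pmulhuw by the Y coefficient works in 16-bit fixed
// point.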

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
  "movd (%[u_buf]),%%xmm3 \n" \
  "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x4(%[u_buf]),%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n"

// Read 4 UV from 422 10 bit, upsample to 8 UV
#define READYUV210 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklwd %%xmm1,%%xmm3 \n" \
  "psraw $2,%%xmm3 \n" \
  "packuswb %%xmm3,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "movdqa %%xmm4,%%xmm2 \n" \
  "psllw $6,%%xmm4 \n" \
  "psrlw $4,%%xmm2 \n" \
  "paddw %%xmm2,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"
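
// The psllw $6 / psrlw $4 / paddw sequence above widens 10-bit Y to the full
// 16-bit range by bit replication: y16 = (y10 << 6) + (y10 >> 4), mapping
// 0..1023 onto 0..65535.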

// Read 4 UV from 422 10 bit, upsample to 8 UV. With 8 Alpha.
#define READYUVA210 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklwd %%xmm1,%%xmm3 \n" \
  "psraw $2,%%xmm3 \n" \
  "packuswb %%xmm3,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "movdqa %%xmm4,%%xmm2 \n" \
  "psllw $6,%%xmm4 \n" \
  "psrlw $4,%%xmm2 \n" \
  "paddw %%xmm2,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n" \
  "movdqu (%[a_buf]),%%xmm5 \n" \
  "psraw $2,%%xmm5 \n" \
  "packuswb %%xmm5,%%xmm5 \n" \
  "lea 0x10(%[a_buf]),%[a_buf] \n"

// Read 8 UV from 444 10 bit
#define READYUV410 \
  "movdqu (%[u_buf]),%%xmm3 \n" \
  "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "psraw $2,%%xmm3 \n" \
  "psraw $2,%%xmm2 \n" \
  "movdqa %%xmm3,%%xmm1 \n" \
  "punpcklwd %%xmm2,%%xmm3 \n" \
  "punpckhwd %%xmm2,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "movdqa %%xmm4,%%xmm2 \n" \
  "psllw $6,%%xmm4 \n" \
  "psrlw $4,%%xmm2 \n" \
  "paddw %%xmm2,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 UV from 444 10 bit. With 8 Alpha.
#define READYUVA410 \
  "movdqu (%[u_buf]),%%xmm3 \n" \
  "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "psraw $2,%%xmm3 \n" \
  "psraw $2,%%xmm2 \n" \
  "movdqa %%xmm3,%%xmm1 \n" \
  "punpcklwd %%xmm2,%%xmm3 \n" \
  "punpckhwd %%xmm2,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "movdqa %%xmm4,%%xmm2 \n" \
  "psllw $6,%%xmm4 \n" \
  "psrlw $4,%%xmm2 \n" \
  "paddw %%xmm2,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n" \
  "movdqu (%[a_buf]),%%xmm5 \n" \
  "psraw $2,%%xmm5 \n" \
  "packuswb %%xmm5,%%xmm5 \n" \
  "lea 0x10(%[a_buf]),%[a_buf] \n"

// Read 4 UV from 422 12 bit, upsample to 8 UV
#define READYUV212 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklwd %%xmm1,%%xmm3 \n" \
  "psraw $0x4,%%xmm3 \n" \
  "packuswb %%xmm3,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "movdqa %%xmm4,%%xmm2 \n" \
  "psllw $4,%%xmm4 \n" \
  "psrlw $8,%%xmm2 \n" \
  "paddw %%xmm2,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"
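
// Same idea as READYUV210 but for 12-bit samples: UV is shifted down by 4
// instead of 2, and y16 = (y12 << 4) + (y12 >> 8) maps 0..4095 onto 0..65535.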

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
  "movd (%[u_buf]),%%xmm3 \n" \
  "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x4(%[u_buf]),%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n" \
  "movq (%[a_buf]),%%xmm5 \n" \
  "lea 0x8(%[a_buf]),%[a_buf] \n"

// Read 8 UV from 444. With 8 Alpha.
#define READYUVA444 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n" \
  "movq (%[a_buf]),%%xmm5 \n" \
  "lea 0x8(%[a_buf]),%[a_buf] \n"

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 \
  "movq (%[uv_buf]),%%xmm3 \n" \
  "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n"

// Read 4 VU from NV21, upsample to 8 UV
#define READNV21 \
  "movq (%[vu_buf]),%%xmm3 \n" \
  "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
  "pshufb %[kShuffleNV21], %%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n"

// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
#define READYUY2 \
  "movdqu (%[yuy2_buf]),%%xmm4 \n" \
  "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
  "movdqu (%[yuy2_buf]),%%xmm3 \n" \
  "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \
  "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"

// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
#define READUYVY \
  "movdqu (%[uyvy_buf]),%%xmm4 \n" \
  "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
  "movdqu (%[uyvy_buf]),%%xmm3 \n" \
  "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \
  "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"

// Read 4 UV from P210, upsample to 8 UV
#define READP210 \
  "movdqu (%[uv_buf]),%%xmm3 \n" \
  "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
  "psrlw $0x8,%%xmm3 \n" \
  "packuswb %%xmm3,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 UV from P410
#define READP410 \
  "movdqu (%[uv_buf]),%%xmm3 \n" \
  "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \
  "lea 0x20(%[uv_buf]),%[uv_buf] \n" \
  "psrlw $0x8,%%xmm3 \n" \
  "psrlw $0x8,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"
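
// P210/P410 carry 10-bit UV in the upper bits of 16-bit lanes, so the psrlw
// $0x8 / packuswb pair above keeps just the high byte: u8 = uv16 >> 8. Y is
// passed through at full 16-bit precision for the YUVTORGB16 stage.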

#if defined(__x86_64__)
#define YUVTORGB_SETUP(yuvconstants) \
  "pcmpeqb %%xmm13,%%xmm13 \n" \
  "movdqa (%[yuvconstants]),%%xmm8 \n" \
  "pxor %%xmm12,%%xmm12 \n" \
  "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
  "psllw $7,%%xmm13 \n" \
  "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
  "pshufb %%xmm12,%%xmm13 \n" \
  "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
  "movdqa 128(%[yuvconstants]),%%xmm12 \n"

// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants) \
  "psubb %%xmm13,%%xmm3 \n" \
  "pmulhuw %%xmm11,%%xmm4 \n" \
  "movdqa %%xmm8,%%xmm0 \n" \
  "movdqa %%xmm9,%%xmm1 \n" \
  "movdqa %%xmm10,%%xmm2 \n" \
  "paddw %%xmm12,%%xmm4 \n" \
  "pmaddubsw %%xmm3,%%xmm0 \n" \
  "pmaddubsw %%xmm3,%%xmm1 \n" \
  "pmaddubsw %%xmm3,%%xmm2 \n" \
  "paddsw %%xmm4,%%xmm0 \n" \
  "paddsw %%xmm4,%%xmm2 \n" \
  "psubsw %%xmm1,%%xmm4 \n" \
  "movdqa %%xmm4,%%xmm1 \n"

#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",

#else
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants) \
  "pcmpeqb %%xmm0,%%xmm0 \n" \
  "pxor %%xmm1,%%xmm1 \n" \
  "psllw $7,%%xmm0 \n" \
  "pshufb %%xmm1,%%xmm0 \n" \
  "psubb %%xmm0,%%xmm3 \n" \
  "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \
  "movdqa (%[yuvconstants]),%%xmm0 \n" \
  "movdqa 32(%[yuvconstants]),%%xmm1 \n" \
  "movdqa 64(%[yuvconstants]),%%xmm2 \n" \
  "pmaddubsw %%xmm3,%%xmm0 \n" \
  "pmaddubsw %%xmm3,%%xmm1 \n" \
  "pmaddubsw %%xmm3,%%xmm2 \n" \
  "movdqa 128(%[yuvconstants]),%%xmm3 \n" \
  "paddw %%xmm3,%%xmm4 \n" \
  "paddsw %%xmm4,%%xmm0 \n" \
  "paddsw %%xmm4,%%xmm2 \n" \
  "psubsw %%xmm1,%%xmm4 \n" \
  "movdqa %%xmm4,%%xmm1 \n"

#define YUVTORGB_REGS
#endif
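
// Scalar sketch (not part of the build; names are illustrative) of what
// YUVTORGB16 computes per pixel. The coefficient parameters correspond to the
// words loaded from yuvconstants at offsets 0/32/64 (UV to B/G/R), 96 (Y
// scale) and 128 (Y bias). All results carry 6 fractional bits; the YUVTORGB
// wrapper below shifts right by 6 and packs with unsigned saturation.
#if 0
static int Clamp16(int v) {  // paddsw/psubsw saturation
  return v < -32768 ? -32768 : v > 32767 ? 32767 : v;
}
static void YuvPixel_Reference(uint8_t u, uint8_t v, uint16_t y16,
                               int ub, int ug, int vg, int vr,
                               int ycoeff, int ybias,
                               int16_t* b, int16_t* g, int16_t* r) {
  int y = (int)(((uint32_t)y16 * (uint32_t)ycoeff) >> 16) + ybias;  // pmulhuw
  int du = u - 128;  // psubb of the 0x80 splat
  int dv = v - 128;
  *b = (int16_t)Clamp16(y + ub * du);              // paddsw
  *g = (int16_t)Clamp16(y - (ug * du + vg * dv));  // psubsw
  *r = (int16_t)Clamp16(y + vr * dv);              // paddsw
}
#endif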

#define YUVTORGB(yuvconstants) \
  YUVTORGB16(yuvconstants) \
  "psraw $0x6,%%xmm0 \n" \
  "psraw $0x6,%%xmm1 \n" \
  "psraw $0x6,%%xmm2 \n" \
  "packuswb %%xmm0,%%xmm0 \n" \
  "packuswb %%xmm1,%%xmm1 \n" \
  "packuswb %%xmm2,%%xmm2 \n"

// Store 8 ARGB values.
#define STOREARGB \
  "punpcklbw %%xmm1,%%xmm0 \n" \
  "punpcklbw %%xmm5,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm1 \n" \
  "punpcklwd %%xmm2,%%xmm0 \n" \
  "punpckhwd %%xmm2,%%xmm1 \n" \
  "movdqu %%xmm0,(%[dst_argb]) \n" \
  "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
  "lea 0x20(%[dst_argb]), %[dst_argb] \n"

// Store 8 RGBA values.
#define STORERGBA \
  "pcmpeqb %%xmm5,%%xmm5 \n" \
  "punpcklbw %%xmm2,%%xmm1 \n" \
  "punpcklbw %%xmm0,%%xmm5 \n" \
  "movdqa %%xmm5,%%xmm0 \n" \
  "punpcklwd %%xmm1,%%xmm5 \n" \
  "punpckhwd %%xmm1,%%xmm0 \n" \
  "movdqu %%xmm5,(%[dst_rgba]) \n" \
  "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
  "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"

// Store 8 RGB24 values.
#define STORERGB24 \
  "punpcklbw %%xmm1,%%xmm0 \n" \
  "punpcklbw %%xmm2,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm1 \n" \
  "punpcklwd %%xmm2,%%xmm0 \n" \
  "punpckhwd %%xmm2,%%xmm1 \n" \
  "pshufb %%xmm5,%%xmm0 \n" \
  "pshufb %%xmm6,%%xmm1 \n" \
  "palignr $0xc,%%xmm0,%%xmm1 \n" \
  "movq %%xmm0,(%[dst_rgb24]) \n" \
  "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" \
  "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"

// Store 8 AR30 values.
#define STOREAR30 \
  "psraw $0x4,%%xmm0 \n" \
  "psraw $0x4,%%xmm1 \n" \
  "psraw $0x4,%%xmm2 \n" \
  "pminsw %%xmm7,%%xmm0 \n" \
  "pminsw %%xmm7,%%xmm1 \n" \
  "pminsw %%xmm7,%%xmm2 \n" \
  "pmaxsw %%xmm6,%%xmm0 \n" \
  "pmaxsw %%xmm6,%%xmm1 \n" \
  "pmaxsw %%xmm6,%%xmm2 \n" \
  "psllw $0x4,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm3 \n" \
  "punpcklwd %%xmm2,%%xmm0 \n" \
  "punpckhwd %%xmm2,%%xmm3 \n" \
  "movdqa %%xmm1,%%xmm2 \n" \
  "punpcklwd %%xmm5,%%xmm1 \n" \
  "punpckhwd %%xmm5,%%xmm2 \n" \
  "pslld $0xa,%%xmm1 \n" \
  "pslld $0xa,%%xmm2 \n" \
  "por %%xmm1,%%xmm0 \n" \
  "por %%xmm2,%%xmm3 \n" \
  "movdqu %%xmm0,(%[dst_ar30]) \n" \
  "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
  "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
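
// Bit layout produced by STOREAR30 (2.10.10.10, B in the least significant
// bits): pixel = (3u << 30) | (r10 << 20) | (g10 << 10) | b10, where each
// channel has first been shifted down to 10 bits and clamped to 0..1023.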

void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READYUV444
      YUVTORGB(yuvconstants)
      STOREARGB
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [dst_argb]"+r"(dst_argb),        // %[dst_argb]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
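
// Typical usage (illustrative only): the row functions convert one scanline
// per call, 8 pixels per loop iteration; libyuv's planar converters use the
// _Any wrappers for widths that are not a multiple of 8.
#if 0
for (int yy = 0; yy < height; ++yy) {
  I444ToARGBRow_SSSE3(src_y, src_u, src_v, dst_argb, &kYuvI601Constants,
                      width);
  src_y += src_stride_y;
  src_u += src_stride_u;
  src_v += src_stride_v;
  dst_argb += dst_stride_argb;
}
#endif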

#ifdef HAS_I444ALPHATOARGBROW_SSSE3
void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     const uint8_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"

      LABELALIGN
      "1: \n"
      READYUVA444
      YUVTORGB(yuvconstants)
      STOREARGB
      "subl $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [a_buf]"+r"(a_buf),              // %[a_buf]
        [dst_argb]"+r"(dst_argb),        // %[dst_argb]
#if defined(__i386__)
        [width]"+m"(width)               // %[width]
#else
        [width]"+rm"(width)              // %[width]
#endif
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  // clang-format on
}
#endif  // HAS_I444ALPHATOARGBROW_SSSE3

void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
                                 uint8_t* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
      "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
      "sub %[u_buf],%[v_buf] \n"

      LABELALIGN
      "1: \n"
      READYUV422
      YUVTORGB(yuvconstants)
      STORERGB24
      "subl $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),               // %[y_buf]
        [u_buf]"+r"(u_buf),               // %[u_buf]
        [v_buf]"+r"(v_buf),               // %[v_buf]
        [dst_rgb24]"+r"(dst_rgb24),       // %[dst_rgb24]
#if defined(__i386__)
        [width]"+m"(width)                // %[width]
#else
        [width]"+rm"(width)               // %[width]
#endif
      : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
        [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
        [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
                                 uint8_t* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
      "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
      "sub %[u_buf],%[v_buf] \n"

      LABELALIGN
      "1: \n"
      READYUV444
      YUVTORGB(yuvconstants)
      STORERGB24
      "subl $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),               // %[y_buf]
        [u_buf]"+r"(u_buf),               // %[u_buf]
        [v_buf]"+r"(v_buf),               // %[v_buf]
        [dst_rgb24]"+r"(dst_rgb24),       // %[dst_rgb24]
#if defined(__i386__)
        [width]"+m"(width)                // %[width]
#else
        [width]"+rm"(width)               // %[width]
#endif
      : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
        [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
        [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READYUV422
      YUVTORGB(yuvconstants)
      STOREARGB
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [dst_argb]"+r"(dst_argb),        // %[dst_argb]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"  // AR30 constants
      "psrlw $14,%%xmm5 \n"
      "psllw $4,%%xmm5 \n"  // 2 alpha bits
      "pxor %%xmm6,%%xmm6 \n"  // 0 for min
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psrlw $6,%%xmm7 \n"  // 1023 for max

      LABELALIGN
      "1: \n"
      READYUV422
      YUVTORGB16(yuvconstants)
      STOREAR30
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}

// 10 bit YUV to ARGB
void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READYUV210
      YUVTORGB(yuvconstants)
      STOREARGB
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [dst_argb]"+r"(dst_argb),        // %[dst_argb]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// 12 bit YUV to ARGB
void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READYUV212
      YUVTORGB(yuvconstants)
      STOREARGB
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [dst_argb]"+r"(dst_argb),        // %[dst_argb]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// 10 bit YUV to AR30
void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $14,%%xmm5 \n"
      "psllw $4,%%xmm5 \n"  // 2 alpha bits
      "pxor %%xmm6,%%xmm6 \n"  // 0 for min
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psrlw $6,%%xmm7 \n"  // 1023 for max

      LABELALIGN
      "1: \n"
      READYUV210
      YUVTORGB16(yuvconstants)
      STOREAR30
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}

// 12 bit YUV to AR30
void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $14,%%xmm5 \n"
      "psllw $4,%%xmm5 \n"  // 2 alpha bits
      "pxor %%xmm6,%%xmm6 \n"  // 0 for min
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psrlw $6,%%xmm7 \n"  // 1023 for max

      LABELALIGN
      "1: \n"
      READYUV212
      YUVTORGB16(yuvconstants)
      STOREAR30
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}

// 10 bit YUV to ARGB
void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READYUV410
      YUVTORGB(yuvconstants)
      STOREARGB
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [dst_argb]"+r"(dst_argb),        // %[dst_argb]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

#ifdef HAS_I210ALPHATOARGBROW_SSSE3
// 10 bit YUVA to ARGB
void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                     const uint16_t* u_buf,
                                     const uint16_t* v_buf,
                                     const uint16_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"

      LABELALIGN
      "1: \n"
      READYUVA210
      YUVTORGB(yuvconstants)
      STOREARGB
      "subl $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [a_buf]"+r"(a_buf),              // %[a_buf]
        [dst_argb]"+r"(dst_argb),        // %[dst_argb]
#if defined(__i386__)
        [width]"+m"(width)               // %[width]
#else
        [width]"+rm"(width)              // %[width]
#endif
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  // clang-format on
}
#endif  // HAS_I210ALPHATOARGBROW_SSSE3

#ifdef HAS_I410ALPHATOARGBROW_SSSE3
// 10 bit YUVA to ARGB
void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                     const uint16_t* u_buf,
                                     const uint16_t* v_buf,
                                     const uint16_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"

      LABELALIGN
      "1: \n"
      READYUVA410
      YUVTORGB(yuvconstants)
      STOREARGB
      "subl $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [a_buf]"+r"(a_buf),              // %[a_buf]
        [dst_argb]"+r"(dst_argb),        // %[dst_argb]
#if defined(__i386__)
        [width]"+m"(width)               // %[width]
#else
        [width]"+rm"(width)              // %[width]
#endif
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  // clang-format on
}
#endif  // HAS_I410ALPHATOARGBROW_SSSE3

// 10 bit YUV to AR30
void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $14,%%xmm5 \n"
      "psllw $4,%%xmm5 \n"  // 2 alpha bits
      "pxor %%xmm6,%%xmm6 \n"  // 0 for min
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psrlw $6,%%xmm7 \n"  // 1023 for max

      LABELALIGN
      "1: \n"
      READYUV410
      YUVTORGB16(yuvconstants)
      STOREAR30
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     const uint8_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"

      LABELALIGN
      "1: \n"
      READYUVA422
      YUVTORGB(yuvconstants)
      STOREARGB
      "subl $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [a_buf]"+r"(a_buf),              // %[a_buf]
        [dst_argb]"+r"(dst_argb),        // %[dst_argb]
#if defined(__i386__)
        [width]"+m"(width)               // %[width]
#else
        [width]"+rm"(width)              // %[width]
#endif
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_SSSE3

void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READNV12
      YUVTORGB(yuvconstants)
      STOREARGB
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [uv_buf]"+r"(uv_buf),            // %[uv_buf]
        [dst_argb]"+r"(dst_argb),        // %[dst_argb]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  // clang-format on
}

void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* vu_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READNV21
      YUVTORGB(yuvconstants)
      STOREARGB
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),               // %[y_buf]
        [vu_buf]"+r"(vu_buf),             // %[vu_buf]
        [dst_argb]"+r"(dst_argb),         // %[dst_argb]
        [width]"+rm"(width)               // %[width]
      : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
        [kShuffleNV21]"m"(kShuffleNV21)
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  // clang-format on
}

void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READYUY2
      YUVTORGB(yuvconstants)
      STOREARGB
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [yuy2_buf]"+r"(yuy2_buf),         // %[yuy2_buf]
        [dst_argb]"+r"(dst_argb),         // %[dst_argb]
        [width]"+rm"(width)               // %[width]
      : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
        [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
        [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  // clang-format on
}

void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READUYVY
      YUVTORGB(yuvconstants)
      STOREARGB
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [uyvy_buf]"+r"(uyvy_buf),         // %[uyvy_buf]
        [dst_argb]"+r"(dst_argb),         // %[dst_argb]
        [width]"+rm"(width)               // %[width]
      : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
        [kShuffleUYVYY]"m"(kShuffleUYVYY),
        [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  // clang-format on
}

void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READP210
      YUVTORGB(yuvconstants)
      STOREARGB
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [uv_buf]"+r"(uv_buf),            // %[uv_buf]
        [dst_argb]"+r"(dst_argb),        // %[dst_argb]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READP410
      YUVTORGB(yuvconstants)
      STOREARGB
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [uv_buf]"+r"(uv_buf),            // %[uv_buf]
        [dst_argb]"+r"(dst_argb),        // %[dst_argb]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $14,%%xmm5 \n"
      "psllw $4,%%xmm5 \n"  // 2 alpha bits
      "pxor %%xmm6,%%xmm6 \n"  // 0 for min
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psrlw $6,%%xmm7 \n"  // 1023 for max

      LABELALIGN
      "1: \n"
      READP210
      YUVTORGB16(yuvconstants)
      STOREAR30
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [uv_buf]"+r"(uv_buf),            // %[uv_buf]
        [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}

void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $14,%%xmm5 \n"
      "psllw $4,%%xmm5 \n"  // 2 alpha bits
      "pxor %%xmm6,%%xmm6 \n"  // 0 for min
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psrlw $6,%%xmm7 \n"  // 1023 for max

      LABELALIGN
      "1: \n"
      READP410
      YUVTORGB16(yuvconstants)
      STOREAR30
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [uv_buf]"+r"(uv_buf),            // %[uv_buf]
        [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}

void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READYUV422
      YUVTORGB(yuvconstants)
      STORERGBA
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf]"+r"(y_buf),              // %[y_buf]
        [u_buf]"+r"(u_buf),              // %[u_buf]
        [v_buf]"+r"(v_buf),              // %[v_buf]
        [dst_rgba]"+r"(dst_rgba),        // %[dst_rgba]
        [width]"+rm"(width)              // %[width]
      : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

#endif  // HAS_I422TOARGBROW_SSSE3
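
// The AVX2 read macros below mirror the SSSE3 macros above but read and
// convert 16 pixels per iteration using ymm registers, with an extra vpermq
// to undo the lane split that 256-bit unpack instructions introduce.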

// Read 16 UV from 444
#define READYUV444_AVX2 \
  "vmovdqu (%[u_buf]),%%xmm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 \
  "vmovq (%[u_buf]),%%xmm3 \n" \
  "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

#define READYUV422_AVX512BW \
  "vmovdqu (%[u_buf]),%%xmm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "vpermq %%zmm3,%%zmm16,%%zmm3 \n" \
  "vpermq %%zmm1,%%zmm16,%%zmm1 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \
  "vpermq $0xd8,%%zmm3,%%zmm3 \n" \
  "vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \
  "vmovdqu8 (%[y_buf]),%%ymm4 \n" \
  "vpermq %%zmm4,%%zmm17,%%zmm4 \n" \
  "vpermq $0xd8,%%zmm4,%%zmm4 \n" \
  "vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \
  "lea 0x20(%[y_buf]),%[y_buf] \n"
3430
3431 // Read 8 UV from 210, upsample to 16 UV
3432 // TODO(fbarchard): Consider vshufb to replace pack/unpack
3433 // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
3434 #define READYUV210_AVX2 \
3435 "vmovdqu (%[u_buf]),%%xmm3 \n" \
3436 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
3437 "lea 0x10(%[u_buf]),%[u_buf] \n" \
3438 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3439 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
3440 "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
3441 "vpsraw $2,%%ymm3,%%ymm3 \n" \
3442 "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
3443 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
3444 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3445 "vpsllw $6,%%ymm4,%%ymm2 \n" \
3446 "vpsrlw $4,%%ymm4,%%ymm4 \n" \
3447 "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
3448 "lea 0x20(%[y_buf]),%[y_buf] \n"
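// The two shifts above widen 10-bit Y to the 16-bit range expected by
// vpmulhuw: (y << 6) + (y >> 4) = y * 64.0625, and 1023 maps to exactly
// 65535 (1023 * 64 = 65472, plus 1023 >> 4 = 63).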
3449
3450 // Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
3451 #define READYUVA210_AVX2 \
3452 "vmovdqu (%[u_buf]),%%xmm3 \n" \
3453 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
3454 "lea 0x10(%[u_buf]),%[u_buf] \n" \
3455 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3456 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
3457 "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
3458 "vpsraw $2,%%ymm3,%%ymm3 \n" \
3459 "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
3460 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
3461 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3462 "vpsllw $6,%%ymm4,%%ymm2 \n" \
3463 "vpsrlw $4,%%ymm4,%%ymm4 \n" \
3464 "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
3465 "lea 0x20(%[y_buf]),%[y_buf] \n" \
3466 "vmovdqu (%[a_buf]),%%ymm5 \n" \
3467 "vpsraw $2,%%ymm5,%%ymm5 \n" \
3468 "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
3469 "lea 0x20(%[a_buf]),%[a_buf] \n"
3470
3471 // Read 16 UV from 410
3472 #define READYUV410_AVX2 \
3473 "vmovdqu (%[u_buf]),%%ymm3 \n" \
3474 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
3475 "lea 0x20(%[u_buf]),%[u_buf] \n" \
3476 "vpsraw $2,%%ymm3,%%ymm3 \n" \
3477 "vpsraw $2,%%ymm2,%%ymm2 \n" \
3478 "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
3479 "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
3480 "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
3481 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3482 "vpsllw $6,%%ymm4,%%ymm2 \n" \
3483 "vpsrlw $4,%%ymm4,%%ymm4 \n" \
3484 "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
3485 "lea 0x20(%[y_buf]),%[y_buf] \n"
3486
3487 // Read 8 UV from 212 (12 bit), upsample to 16 UV
3488 #define READYUV212_AVX2 \
3489 "vmovdqu (%[u_buf]),%%xmm3 \n" \
3490 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
3491 "lea 0x10(%[u_buf]),%[u_buf] \n" \
3492 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3493 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
3494 "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
3495 "vpsraw $0x4,%%ymm3,%%ymm3 \n" \
3496 "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
3497 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
3498 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3499 "vpsllw $4,%%ymm4,%%ymm2 \n" \
3500 "vpsrlw $8,%%ymm4,%%ymm4 \n" \
3501 "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
3502 "lea 0x20(%[y_buf]),%[y_buf] \n"
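// Same widening trick as READYUV210_AVX2, for 12-bit data:
// (y << 4) + (y >> 8) maps 4095 to exactly 65535 (65520 + 15).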
3503
3504 // Read 16 UV from 410. With 16 Alpha.
3505 #define READYUVA410_AVX2 \
3506 "vmovdqu (%[u_buf]),%%ymm3 \n" \
3507 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
3508 "lea 0x20(%[u_buf]),%[u_buf] \n" \
3509 "vpsraw $2,%%ymm3,%%ymm3 \n" \
3510 "vpsraw $2,%%ymm2,%%ymm2 \n" \
3511 "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
3512 "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
3513 "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
3514 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3515 "vpsllw $6,%%ymm4,%%ymm2 \n" \
3516 "vpsrlw $4,%%ymm4,%%ymm4 \n" \
3517 "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
3518 "lea 0x20(%[y_buf]),%[y_buf] \n" \
3519 "vmovdqu (%[a_buf]),%%ymm5 \n" \
3520 "vpsraw $2,%%ymm5,%%ymm5 \n" \
3521 "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
3522 "lea 0x20(%[a_buf]),%[a_buf] \n"
3523
3524 // Read 16 UV from 444. With 16 Alpha.
3525 #define READYUVA444_AVX2 \
3526 "vmovdqu (%[u_buf]),%%xmm3 \n" \
3527 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
3528 "lea 0x10(%[u_buf]),%[u_buf] \n" \
3529 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3530 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
3531 "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
3532 "vmovdqu (%[y_buf]),%%xmm4 \n" \
3533 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
3534 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
3535 "lea 0x10(%[y_buf]),%[y_buf] \n" \
3536 "vmovdqu (%[a_buf]),%%xmm5 \n" \
3537 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
3538 "lea 0x10(%[a_buf]),%[a_buf] \n"
3539
3540 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
3541 #define READYUVA422_AVX2 \
3542 "vmovq (%[u_buf]),%%xmm3 \n" \
3543 "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
3544 "lea 0x8(%[u_buf]),%[u_buf] \n" \
3545 "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
3546 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3547 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
3548 "vmovdqu (%[y_buf]),%%xmm4 \n" \
3549 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
3550 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
3551 "lea 0x10(%[y_buf]),%[y_buf] \n" \
3552 "vmovdqu (%[a_buf]),%%xmm5 \n" \
3553 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
3554 "lea 0x10(%[a_buf]),%[a_buf] \n"
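// In the A-variant readers above, ymm5 returns per-pixel alpha, which is why
// their callers omit the vpcmpeqb that would otherwise fill ymm5 with opaque
// 0xff alpha before the loop.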
3555
3556 // Read 8 UV from NV12, upsample to 16 UV.
3557 #define READNV12_AVX2 \
3558 "vmovdqu (%[uv_buf]),%%xmm3 \n" \
3559 "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
3560 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3561 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
3562 "vmovdqu (%[y_buf]),%%xmm4 \n" \
3563 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
3564 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
3565 "lea 0x10(%[y_buf]),%[y_buf] \n"
3566
3567 // Read 8 VU from NV21, upsample to 16 UV.
3568 #define READNV21_AVX2 \
3569 "vmovdqu (%[vu_buf]),%%xmm3 \n" \
3570 "lea 0x10(%[vu_buf]),%[vu_buf] \n" \
3571 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3572 "vpshufb %[kShuffleNV21], %%ymm3, %%ymm3 \n" \
3573 "vmovdqu (%[y_buf]),%%xmm4 \n" \
3574 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
3575 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
3576 "lea 0x10(%[y_buf]),%[y_buf] \n"
3577
3578 // Read 8 UV from P210, upsample to 16 UV
3579 #define READP210_AVX2 \
3580 "vmovdqu (%[uv_buf]),%%ymm3 \n" \
3581 "lea 0x20(%[uv_buf]),%[uv_buf] \n" \
3582 "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \
3583 "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
3584 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
3585 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3586 "lea 0x20(%[y_buf]),%[y_buf] \n"
3587
3588 // Read 16 UV from P410
3589 #define READP410_AVX2 \
3590 "vmovdqu (%[uv_buf]),%%ymm3 \n" \
3591 "vmovdqu 0x20(%[uv_buf]),%%ymm1 \n" \
3592 "lea 0x40(%[uv_buf]),%[uv_buf] \n" \
3593 "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \
3594 "vpsrlw $0x8,%%ymm1,%%ymm1 \n" \
3595 "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
3596 "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
3597 "vmovdqu (%[y_buf]),%%ymm4 \n" \
3598 "lea 0x20(%[y_buf]),%[y_buf] \n"
3599
3600 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
3601 #define READYUY2_AVX2 \
3602 "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
3603 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
3604 "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \
3605 "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \
3606 "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
3607
3608 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
3609 #define READUYVY_AVX2 \
3610 "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
3611 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
3612 "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \
3613 "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \
3614 "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
3615
3616 // TODO(fbarchard): Remove broadcastb
3617 #if defined(__x86_64__)
3618 #define YUVTORGB_SETUP_AVX2(yuvconstants) \
3619 "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
3620 "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
3621 "vpsllw $7,%%xmm13,%%xmm13 \n" \
3622 "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
3623 "vpbroadcastb %%xmm13,%%ymm13 \n" \
3624 "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
3625 "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
3626 "vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
3627
3628 #define YUVTORGB_SETUP_AVX512BW(yuvconstants) \
3629 "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
3630 "movdqa (%[yuvconstants]),%%xmm8 \n" \
3631 "vpbroadcastq %%xmm8, %%zmm8 \n" \
3632 "vpsllw $7,%%xmm13,%%xmm13 \n" \
3633 "vpbroadcastb %%xmm13,%%zmm13 \n" \
3634 "movq 32(%[yuvconstants]),%%xmm9 \n" \
3635 "vpbroadcastq %%xmm9,%%zmm9 \n" \
3636 "movq 64(%[yuvconstants]),%%xmm10 \n" \
3637 "vpbroadcastq %%xmm10,%%zmm10 \n" \
3638 "movq 96(%[yuvconstants]),%%xmm11 \n" \
3639 "vpbroadcastq %%xmm11,%%zmm11 \n" \
3640 "movq 128(%[yuvconstants]),%%xmm12 \n" \
3641 "vpbroadcastq %%xmm12,%%zmm12 \n" \
3642 "vmovdqu8 (%[quadsplitperm]),%%zmm16 \n" \
3643 "vmovdqu8 (%[dquadsplitperm]),%%zmm17 \n" \
3644 "vmovdqu8 (%[unperm]),%%zmm18 \n"
3645
3646 #define YUVTORGB16_AVX2(yuvconstants) \
3647 "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
3648 "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
3649 "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \
3650 "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \
3651 "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \
3652 "vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \
3653 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
3654 "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
3655 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
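// Scalar sketch of the 6-bit fixed-point math above for the 8-bit readers
// (illustrative only; ub..vr, yg and ygb stand for the packed coefficients of
// struct YuvConstants, whose signs and layout vary per colorspace preset):
/*
  int y16 = ((y * 0x0101) * yg >> 16) + ygb;        // vpmulhuw + vpaddw
  int b = y16 + (ub * (u - 128) + vb * (v - 128));  // vpmaddubsw + vpaddsw
  int g = y16 - (ug * (u - 128) + vg * (v - 128));  // vpmaddubsw + vpsubsw
  int r = y16 + (ur * (u - 128) + vr * (v - 128));  // vpmaddubsw + vpaddsw
  // YUVTORGB_AVX2 then emits clamp(b >> 6), clamp(g >> 6), clamp(r >> 6).
*/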
3656
3657 #define YUVTORGB16_AVX512BW(yuvconstants) \
3658 "vpsubb %%zmm13,%%zmm3,%%zmm3 \n" \
3659 "vpmulhuw %%zmm11,%%zmm4,%%zmm4 \n" \
3660 "vpmaddubsw %%zmm3,%%zmm8,%%zmm0 \n" \
3661 "vpmaddubsw %%zmm3,%%zmm9,%%zmm1 \n" \
3662 "vpmaddubsw %%zmm3,%%zmm10,%%zmm2 \n" \
3663 "vpaddw %%zmm4,%%zmm12,%%zmm4 \n" \
3664 "vpaddsw %%zmm4,%%zmm0,%%zmm0 \n" \
3665 "vpsubsw %%zmm1,%%zmm4,%%zmm1 \n" \
3666 "vpaddsw %%zmm4,%%zmm2,%%zmm2 \n"
3667
3668 #define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
3669 #define YUVTORGB_REGS_AVX512BW \
3670 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm16", "xmm17", "xmm18",
3671
3672 #else // 32 bit: convert 16 pixels (16 UV and 16 Y), reloading the constants from memory on each use.
3673
3674 #define YUVTORGB_SETUP_AVX2(yuvconstants)
3675 #define YUVTORGB16_AVX2(yuvconstants) \
3676 "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \
3677 "vpsllw $7,%%xmm0,%%xmm0 \n" \
3678 "vpbroadcastb %%xmm0,%%ymm0 \n" \
3679 "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \
3680 "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
3681 "vmovdqa (%[yuvconstants]),%%ymm0 \n" \
3682 "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \
3683 "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \
3684 "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \
3685 "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \
3686 "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \
3687 "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \
3688 "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \
3689 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
3690 "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
3691 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
3692
3693 #define YUVTORGB_REGS_AVX2
3694 #endif // defined(__x86_64__)
3695
3696 #define YUVTORGB_AVX2(yuvconstants) \
3697 YUVTORGB16_AVX2(yuvconstants) \
3698 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
3699 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
3700 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
3701 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
3702 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
3703 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
3704
3705 #define YUVTORGB_AVX512BW(yuvconstants) \
3706 YUVTORGB16_AVX512BW(yuvconstants) \
3707 "vpsraw $0x6,%%zmm0,%%zmm0 \n" \
3708 "vpsraw $0x6,%%zmm1,%%zmm1 \n" \
3709 "vpsraw $0x6,%%zmm2,%%zmm2 \n" \
3710 "vpackuswb %%zmm0,%%zmm0,%%zmm0 \n" \
3711 "vpackuswb %%zmm1,%%zmm1,%%zmm1 \n" \
3712 "vpackuswb %%zmm2,%%zmm2,%%zmm2 \n"
3713
3714 // Store 16 ARGB values.
3715 #define STOREARGB_AVX2 \
3716 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
3717 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
3718 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
3719 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
3720 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
3721 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
3722 "vmovdqu %%ymm1,(%[dst_argb]) \n" \
3723 "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
3724 "lea 0x40(%[dst_argb]), %[dst_argb] \n"
3725
3726 // Store 32 ARGB values.
3727 #define STOREARGB_AVX512BW \
3728 "vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" \
3729 "vpermq %%zmm0,%%zmm18,%%zmm0 \n" \
3730 "vpunpcklbw %%zmm5,%%zmm2,%%zmm2 \n" \
3731 "vpermq %%zmm2,%%zmm18,%%zmm2 \n" \
3732 "vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \
3733 "vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \
3734 "vmovdqu8 %%zmm1,(%[dst_argb]) \n" \
3735 "vmovdqu8 %%zmm0,0x40(%[dst_argb]) \n" \
3736 "lea 0x80(%[dst_argb]), %[dst_argb] \n"
3737
3738 // Store 16 AR30 values.
3739 #define STOREAR30_AVX2 \
3740 "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
3741 "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
3742 "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
3743 "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
3744 "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
3745 "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
3746 "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
3747 "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
3748 "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
3749 "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
3750 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
3751 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
3752 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
3753 "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
3754 "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
3755 "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
3756 "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
3757 "vpslld $0xa,%%ymm1,%%ymm1 \n" \
3758 "vpslld $0xa,%%ymm2,%%ymm2 \n" \
3759 "vpor %%ymm1,%%ymm0,%%ymm0 \n" \
3760 "vpor %%ymm2,%%ymm3,%%ymm3 \n" \
3761 "vmovdqu %%ymm0,(%[dst_ar30]) \n" \
3762 "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
3763 "lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
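// Each AR30 dword above is A:2 R:10 G:10 B:10, i.e.
//   (3u << 30) | (r << 20) | (g << 10) | b
// assembled as (b | r << 20) from the first unpack pair and
// ((g | 0x30 << 16) << 10) = (g << 10) | (3u << 30) from the second.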
3764
3765 #ifdef HAS_I444TOARGBROW_AVX2
3766 // 16 pixels
3767 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
3768 void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
3769 const uint8_t* u_buf,
3770 const uint8_t* v_buf,
3771 uint8_t* dst_argb,
3772 const struct YuvConstants* yuvconstants,
3773 int width) {
3774 asm volatile (
3775 YUVTORGB_SETUP_AVX2(yuvconstants)
3776 "sub %[u_buf],%[v_buf] \n"
3777 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3778
3779 LABELALIGN
3780 "1: \n"
3781 READYUV444_AVX2
3782 YUVTORGB_AVX2(yuvconstants)
3783 STOREARGB_AVX2
3784 "sub $0x10,%[width] \n"
3785 "jg 1b \n"
3786 "vzeroupper \n"
3787 : [y_buf]"+r"(y_buf), // %[y_buf]
3788 [u_buf]"+r"(u_buf), // %[u_buf]
3789 [v_buf]"+r"(v_buf), // %[v_buf]
3790 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3791 [width]"+rm"(width) // %[width]
3792 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3793 : "memory", "cc", YUVTORGB_REGS_AVX2
3794 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3795 );
3796 }
3797 #endif // HAS_I444TOARGBROW_AVX2
3798
3799 #if defined(HAS_I422TOARGBROW_AVX2)
3800 // 16 pixels
3801 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
3802 void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
3803 const uint8_t* u_buf,
3804 const uint8_t* v_buf,
3805 uint8_t* dst_argb,
3806 const struct YuvConstants* yuvconstants,
3807 int width) {
3808 asm volatile (
3809 YUVTORGB_SETUP_AVX2(yuvconstants)
3810 "sub %[u_buf],%[v_buf] \n"
3811 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3812
3813 LABELALIGN
3814 "1: \n"
3815 READYUV422_AVX2
3816 YUVTORGB_AVX2(yuvconstants)
3817 STOREARGB_AVX2
3818 "sub $0x10,%[width] \n"
3819 "jg 1b \n"
3820
3821 "vzeroupper \n"
3822 : [y_buf]"+r"(y_buf), // %[y_buf]
3823 [u_buf]"+r"(u_buf), // %[u_buf]
3824 [v_buf]"+r"(v_buf), // %[v_buf]
3825 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3826 [width]"+rm"(width) // %[width]
3827 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3828 : "memory", "cc", YUVTORGB_REGS_AVX2
3829 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3830 );
3831 }
3832 #endif // HAS_I422TOARGBROW_AVX2
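// Hedged usage sketch (not part of this file): driving the row kernel above
// for one row. Assumes the caller has already checked for AVX2 and that width
// is a multiple of the kernel's 16-pixel step; kYuvI601Constants is the
// BT.601 preset declared in row.h.
/*
  void ConvertI422RowBT601(const uint8_t* y, const uint8_t* u,
                           const uint8_t* v, uint8_t* argb, int width) {
    I422ToARGBRow_AVX2(y, u, v, argb, &kYuvI601Constants, width);
  }
*/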
3833
3834 #if defined(HAS_I422TOARGBROW_AVX512BW)
3835 static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2};
3836 static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4};
3837 static const uint64_t kUnpermuteAVX512[8] = {0, 4, 1, 5, 2, 6, 3, 7};
3838
3839 // 32 pixels
3840 // 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 ARGB (128
3841 // bytes).
3842 void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
3843 const uint8_t* u_buf,
3844 const uint8_t* v_buf,
3845 uint8_t* dst_argb,
3846 const struct YuvConstants* yuvconstants,
3847 int width) {
3848 asm volatile (
3849 YUVTORGB_SETUP_AVX512BW(yuvconstants)
3850 "sub %[u_buf],%[v_buf] \n"
3851 "vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n"
3852 "vpbroadcastq %%xmm5,%%zmm5 \n"
3853
3854 LABELALIGN
3855 "1: \n"
3856 READYUV422_AVX512BW
3857 YUVTORGB_AVX512BW(yuvconstants)
3858 STOREARGB_AVX512BW
3859 "sub $0x20,%[width] \n"
3860 "jg 1b \n"
3861
3862 "vzeroupper \n"
3863 : [y_buf]"+r"(y_buf), // %[y_buf]
3864 [u_buf]"+r"(u_buf), // %[u_buf]
3865 [v_buf]"+r"(v_buf), // %[v_buf]
3866 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3867 [width]"+rm"(width) // %[width]
3868 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
3869 [quadsplitperm]"r"(kSplitQuadWords), // %[quadsplitperm]
3870 [dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm]
3871 [unperm]"r"(kUnpermuteAVX512) // %[unperm]
3872 : "memory", "cc", YUVTORGB_REGS_AVX512BW
3873 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3874 );
3875 }
3876 #endif // HAS_I422TOARGBROW_AVX512BW
3877
3878 #if defined(HAS_I422TOAR30ROW_AVX2)
3879 // 16 pixels
3880 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
3881 void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
3882 const uint8_t* u_buf,
3883 const uint8_t* v_buf,
3884 uint8_t* dst_ar30,
3885 const struct YuvConstants* yuvconstants,
3886 int width) {
3887 asm volatile (
3888 YUVTORGB_SETUP_AVX2(yuvconstants)
3889 "sub %[u_buf],%[v_buf] \n"
3890 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
3891 "vpsrlw $14,%%ymm5,%%ymm5 \n"
3892 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
3893 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
3894 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
3895 "vpsrlw $6,%%ymm7,%%ymm7 \n"
3896
3897 LABELALIGN
3898 "1: \n"
3899 READYUV422_AVX2
3900 YUVTORGB16_AVX2(yuvconstants)
3901 STOREAR30_AVX2
3902 "sub $0x10,%[width] \n"
3903 "jg 1b \n"
3904
3905 "vzeroupper \n"
3906 : [y_buf]"+r"(y_buf), // %[y_buf]
3907 [u_buf]"+r"(u_buf), // %[u_buf]
3908 [v_buf]"+r"(v_buf), // %[v_buf]
3909 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
3910 [width]"+rm"(width) // %[width]
3911 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3912 : "memory", "cc", YUVTORGB_REGS_AVX2
3913 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3914 );
3915 }
3916 #endif // HAS_I422TOAR30ROW_AVX2
3917
3918 #if defined(HAS_I210TOARGBROW_AVX2)
3919 // 16 pixels
3920 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
3921 void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
3922 const uint16_t* u_buf,
3923 const uint16_t* v_buf,
3924 uint8_t* dst_argb,
3925 const struct YuvConstants* yuvconstants,
3926 int width) {
3927 asm volatile (
3928 YUVTORGB_SETUP_AVX2(yuvconstants)
3929 "sub %[u_buf],%[v_buf] \n"
3930 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3931
3932 LABELALIGN
3933 "1: \n"
3934 READYUV210_AVX2
3935 YUVTORGB_AVX2(yuvconstants)
3936 STOREARGB_AVX2
3937 "sub $0x10,%[width] \n"
3938 "jg 1b \n"
3939
3940 "vzeroupper \n"
3941 : [y_buf]"+r"(y_buf), // %[y_buf]
3942 [u_buf]"+r"(u_buf), // %[u_buf]
3943 [v_buf]"+r"(v_buf), // %[v_buf]
3944 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3945 [width]"+rm"(width) // %[width]
3946 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3947 : "memory", "cc", YUVTORGB_REGS_AVX2
3948 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3949 );
3950 }
3951 #endif // HAS_I210TOARGBROW_AVX2
3952
3953 #if defined(HAS_I212TOARGBROW_AVX2)
3954 // 16 pixels
3955 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
3956 void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf,
3957 const uint16_t* u_buf,
3958 const uint16_t* v_buf,
3959 uint8_t* dst_argb,
3960 const struct YuvConstants* yuvconstants,
3961 int width) {
3962 asm volatile (
3963 YUVTORGB_SETUP_AVX2(yuvconstants)
3964 "sub %[u_buf],%[v_buf] \n"
3965 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3966
3967 LABELALIGN
3968 "1: \n"
3969 READYUV212_AVX2
3970 YUVTORGB_AVX2(yuvconstants)
3971 STOREARGB_AVX2
3972 "sub $0x10,%[width] \n"
3973 "jg 1b \n"
3974
3975 "vzeroupper \n"
3976 : [y_buf]"+r"(y_buf), // %[y_buf]
3977 [u_buf]"+r"(u_buf), // %[u_buf]
3978 [v_buf]"+r"(v_buf), // %[v_buf]
3979 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3980 [width]"+rm"(width) // %[width]
3981 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
3982 : "memory", "cc", YUVTORGB_REGS_AVX2
3983 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3984 );
3985 }
3986 #endif // HAS_I212TOARGBROW_AVX2
3987
3988 #if defined(HAS_I210TOAR30ROW_AVX2)
3989 // 16 pixels
3990 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
3991 void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
3992 const uint16_t* u_buf,
3993 const uint16_t* v_buf,
3994 uint8_t* dst_ar30,
3995 const struct YuvConstants* yuvconstants,
3996 int width) {
3997 asm volatile (
3998 YUVTORGB_SETUP_AVX2(yuvconstants)
3999 "sub %[u_buf],%[v_buf] \n"
4000 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
4001 "vpsrlw $14,%%ymm5,%%ymm5 \n"
4002 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
4003 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
4004 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
4005 "vpsrlw $6,%%ymm7,%%ymm7 \n"
4006
4007 LABELALIGN
4008 "1: \n"
4009 READYUV210_AVX2
4010 YUVTORGB16_AVX2(yuvconstants)
4011 STOREAR30_AVX2
4012 "sub $0x10,%[width] \n"
4013 "jg 1b \n"
4014
4015 "vzeroupper \n"
4016 : [y_buf]"+r"(y_buf), // %[y_buf]
4017 [u_buf]"+r"(u_buf), // %[u_buf]
4018 [v_buf]"+r"(v_buf), // %[v_buf]
4019 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
4020 [width]"+rm"(width) // %[width]
4021 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4022 : "memory", "cc", YUVTORGB_REGS_AVX2
4023 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4024 );
4025 }
4026 #endif // HAS_I210TOAR30ROW_AVX2
4027
4028 #if defined(HAS_I212TOAR30ROW_AVX2)
4029 // 16 pixels
4030 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
4031 void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
4032 const uint16_t* u_buf,
4033 const uint16_t* v_buf,
4034 uint8_t* dst_ar30,
4035 const struct YuvConstants* yuvconstants,
4036 int width) {
4037 asm volatile (
4038 YUVTORGB_SETUP_AVX2(yuvconstants)
4039 "sub %[u_buf],%[v_buf] \n"
4040 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
4041 "vpsrlw $14,%%ymm5,%%ymm5 \n"
4042 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
4043 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
4044 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
4045 "vpsrlw $6,%%ymm7,%%ymm7 \n"
4046
4047 LABELALIGN
4048 "1: \n"
4049 READYUV212_AVX2
4050 YUVTORGB16_AVX2(yuvconstants)
4051 STOREAR30_AVX2
4052 "sub $0x10,%[width] \n"
4053 "jg 1b \n"
4054
4055 "vzeroupper \n"
4056 : [y_buf]"+r"(y_buf), // %[y_buf]
4057 [u_buf]"+r"(u_buf), // %[u_buf]
4058 [v_buf]"+r"(v_buf), // %[v_buf]
4059 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
4060 [width]"+rm"(width) // %[width]
4061 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4062 : "memory", "cc", YUVTORGB_REGS_AVX2
4063 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4064 );
4065 }
4066 #endif // HAS_I212TOAR30ROW_AVX2
4067
4068 #if defined(HAS_I410TOARGBROW_AVX2)
4069 // 16 pixels
4070 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
4071 void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
4072 const uint16_t* u_buf,
4073 const uint16_t* v_buf,
4074 uint8_t* dst_argb,
4075 const struct YuvConstants* yuvconstants,
4076 int width) {
4077 asm volatile (
4078 YUVTORGB_SETUP_AVX2(yuvconstants)
4079 "sub %[u_buf],%[v_buf] \n"
4080 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4081
4082 LABELALIGN
4083 "1: \n"
4084 READYUV410_AVX2
4085 YUVTORGB_AVX2(yuvconstants)
4086 STOREARGB_AVX2
4087 "sub $0x10,%[width] \n"
4088 "jg 1b \n"
4089 "vzeroupper \n"
4090
4091 : [y_buf]"+r"(y_buf), // %[y_buf]
4092 [u_buf]"+r"(u_buf), // %[u_buf]
4093 [v_buf]"+r"(v_buf), // %[v_buf]
4094 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4095 [width]"+rm"(width) // %[width]
4096 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4097 : "memory", "cc", YUVTORGB_REGS_AVX2
4098 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4099 );
4100 }
4101 #endif // HAS_I410TOARGBROW_AVX2
4102
4103 #if defined(HAS_I210ALPHATOARGBROW_AVX2)
4104 // 16 pixels
4105 // 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
4106 void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
4107 const uint16_t* u_buf,
4108 const uint16_t* v_buf,
4109 const uint16_t* a_buf,
4110 uint8_t* dst_argb,
4111 const struct YuvConstants* yuvconstants,
4112 int width) {
4113 asm volatile(
4114 YUVTORGB_SETUP_AVX2(
4115 yuvconstants) "sub %[u_buf],%[v_buf] \n"
4116
4117 LABELALIGN "1: \n" READYUVA210_AVX2
4118 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
4119 "subl $0x10,%[width] \n"
4120 "jg 1b \n"
4121 "vzeroupper \n"
4122
4123 : [y_buf] "+r"(y_buf), // %[y_buf]
4124 [u_buf] "+r"(u_buf), // %[u_buf]
4125 [v_buf] "+r"(v_buf), // %[v_buf]
4126 [a_buf] "+r"(a_buf), // %[a_buf]
4127 [dst_argb] "+r"(dst_argb), // %[dst_argb]
4128 #if defined(__i386__)
4129 [width] "+m"(width) // %[width]
4130 #else
4131 [width] "+rm"(width) // %[width]
4132 #endif
4133 : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
4134 : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
4135 "xmm4", "xmm5");
4136 }
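// Note: the alpha variants carry one more pointer operand than the plain
// converters; on i386 that appears to exhaust the general registers, so width
// is pinned to memory ("+m") and subl spells out the operand size explicitly.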
4137 #endif // HAS_I210ALPHATOARGBROW_AVX2
4138
4139 #if defined(HAS_I410ALPHATOARGBROW_AVX2)
4140 // 16 pixels
4141 // 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
4142 void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
4143 const uint16_t* u_buf,
4144 const uint16_t* v_buf,
4145 const uint16_t* a_buf,
4146 uint8_t* dst_argb,
4147 const struct YuvConstants* yuvconstants,
4148 int width) {
4149 asm volatile(
4150 YUVTORGB_SETUP_AVX2(
4151 yuvconstants) "sub %[u_buf],%[v_buf] \n"
4152
4153 LABELALIGN "1: \n" READYUVA410_AVX2
4154 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
4155 "subl $0x10,%[width] \n"
4156 "jg 1b \n"
4157 "vzeroupper \n"
4158
4159 : [y_buf] "+r"(y_buf), // %[y_buf]
4160 [u_buf] "+r"(u_buf), // %[u_buf]
4161 [v_buf] "+r"(v_buf), // %[v_buf]
4162 [a_buf] "+r"(a_buf), // %[a_buf]
4163 [dst_argb] "+r"(dst_argb), // %[dst_argb]
4164 #if defined(__i386__)
4165 [width] "+m"(width) // %[width]
4166 #else
4167 [width] "+rm"(width) // %[width]
4168 #endif
4169 : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
4170 : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
4171 "xmm4", "xmm5");
4172 }
4173 #endif // HAS_I410ALPHATOARGBROW_AVX2
4174
4175 #if defined(HAS_I410TOAR30ROW_AVX2)
4176 // 16 pixels
4177 // 16 UV values with 16 Y producing 16 AR30 (64 bytes).
4178 void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
4179 const uint16_t* u_buf,
4180 const uint16_t* v_buf,
4181 uint8_t* dst_ar30,
4182 const struct YuvConstants* yuvconstants,
4183 int width) {
4184 asm volatile (
4185 YUVTORGB_SETUP_AVX2(yuvconstants)
4186 "sub %[u_buf],%[v_buf] \n"
4187 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
4188 "vpsrlw $14,%%ymm5,%%ymm5 \n"
4189 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
4190 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
4191 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
4192 "vpsrlw $6,%%ymm7,%%ymm7 \n"
4193
4194 LABELALIGN
4195 "1: \n"
4196 READYUV410_AVX2
4197 YUVTORGB16_AVX2(yuvconstants)
4198 STOREAR30_AVX2
4199 "sub $0x10,%[width] \n"
4200 "jg 1b \n"
4201
4202 "vzeroupper \n"
4203 : [y_buf]"+r"(y_buf), // %[y_buf]
4204 [u_buf]"+r"(u_buf), // %[u_buf]
4205 [v_buf]"+r"(v_buf), // %[v_buf]
4206 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
4207 [width]"+rm"(width) // %[width]
4208 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4209 : "memory", "cc", YUVTORGB_REGS_AVX2
4210 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4211 );
4212 }
4213 #endif // HAS_I410TOAR30ROW_AVX2
4214
4215 #if defined(HAS_I444ALPHATOARGBROW_AVX2)
4216 // 16 pixels
4217 // 16 UV values with 16 Y and 16 A producing 16 ARGB.
4218 void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
4219 const uint8_t* u_buf,
4220 const uint8_t* v_buf,
4221 const uint8_t* a_buf,
4222 uint8_t* dst_argb,
4223 const struct YuvConstants* yuvconstants,
4224 int width) {
4225 // clang-format off
4226 asm volatile (
4227 YUVTORGB_SETUP_AVX2(yuvconstants)
4228 "sub %[u_buf],%[v_buf] \n"
4229
4230 LABELALIGN
4231 "1: \n"
4232 READYUVA444_AVX2
4233 YUVTORGB_AVX2(yuvconstants)
4234 STOREARGB_AVX2
4235 "subl $0x10,%[width] \n"
4236 "jg 1b \n"
4237 "vzeroupper \n"
4238 : [y_buf]"+r"(y_buf), // %[y_buf]
4239 [u_buf]"+r"(u_buf), // %[u_buf]
4240 [v_buf]"+r"(v_buf), // %[v_buf]
4241 [a_buf]"+r"(a_buf), // %[a_buf]
4242 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4243 #if defined(__i386__)
4244 [width]"+m"(width) // %[width]
4245 #else
4246 [width]"+rm"(width) // %[width]
4247 #endif
4248 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4249 : "memory", "cc", YUVTORGB_REGS_AVX2
4250 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4251 );
4252 // clang-format on
4253 }
4254 #endif // HAS_I444ALPHATOARGBROW_AVX2
4255
4256 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
4257 // 16 pixels
4258 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
4259 void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
4260 const uint8_t* u_buf,
4261 const uint8_t* v_buf,
4262 const uint8_t* a_buf,
4263 uint8_t* dst_argb,
4264 const struct YuvConstants* yuvconstants,
4265 int width) {
4266 // clang-format off
4267 asm volatile (
4268 YUVTORGB_SETUP_AVX2(yuvconstants)
4269 "sub %[u_buf],%[v_buf] \n"
4270
4271 LABELALIGN
4272 "1: \n"
4273 READYUVA422_AVX2
4274 YUVTORGB_AVX2(yuvconstants)
4275 STOREARGB_AVX2
4276 "subl $0x10,%[width] \n"
4277 "jg 1b \n"
4278 "vzeroupper \n"
4279 : [y_buf]"+r"(y_buf), // %[y_buf]
4280 [u_buf]"+r"(u_buf), // %[u_buf]
4281 [v_buf]"+r"(v_buf), // %[v_buf]
4282 [a_buf]"+r"(a_buf), // %[a_buf]
4283 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4284 #if defined(__i386__)
4285 [width]"+m"(width) // %[width]
4286 #else
4287 [width]"+rm"(width) // %[width]
4288 #endif
4289 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4290 : "memory", "cc", YUVTORGB_REGS_AVX2
4291 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4292 );
4293 // clang-format on
4294 }
4295 #endif // HAS_I422ALPHATOARGBROW_AVX2
4296
4297 #if defined(HAS_I422TORGBAROW_AVX2)
4298 // 16 pixels
4299 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
4300 void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
4301 const uint8_t* u_buf,
4302 const uint8_t* v_buf,
4303 uint8_t* dst_argb,
4304 const struct YuvConstants* yuvconstants,
4305 int width) {
4306 asm volatile (
4307 YUVTORGB_SETUP_AVX2(yuvconstants)
4308 "sub %[u_buf],%[v_buf] \n"
4309 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4310
4311 LABELALIGN
4312 "1: \n"
4313 READYUV422_AVX2
4314 YUVTORGB_AVX2(yuvconstants)
4315
4316 // Step 3: Weave into RGBA
4317 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
4318 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
4319 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
4320 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
4321 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
4322 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
4323 "vmovdqu %%ymm0,(%[dst_argb]) \n"
4324 "vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
4325 "lea 0x40(%[dst_argb]),%[dst_argb] \n"
4326 "sub $0x10,%[width] \n"
4327 "jg 1b \n"
4328 "vzeroupper \n"
4329 : [y_buf]"+r"(y_buf), // %[y_buf]
4330 [u_buf]"+r"(u_buf), // %[u_buf]
4331 [v_buf]"+r"(v_buf), // %[v_buf]
4332 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4333 [width]"+rm"(width) // %[width]
4334 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4335 : "memory", "cc", YUVTORGB_REGS_AVX2
4336 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4337 );
4338 }
4339 #endif // HAS_I422TORGBAROW_AVX2
4340
4341 #if defined(HAS_NV12TOARGBROW_AVX2)
4342 // 16 pixels.
4343 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
4344 void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
4345 const uint8_t* uv_buf,
4346 uint8_t* dst_argb,
4347 const struct YuvConstants* yuvconstants,
4348 int width) {
4349 // clang-format off
4350 asm volatile (
4351 YUVTORGB_SETUP_AVX2(yuvconstants)
4352 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4353
4354 LABELALIGN
4355 "1: \n"
4356 READNV12_AVX2
4357 YUVTORGB_AVX2(yuvconstants)
4358 STOREARGB_AVX2
4359 "sub $0x10,%[width] \n"
4360 "jg 1b \n"
4361 "vzeroupper \n"
4362 : [y_buf]"+r"(y_buf), // %[y_buf]
4363 [uv_buf]"+r"(uv_buf), // %[uv_buf]
4364 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4365 [width]"+rm"(width) // %[width]
4366 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4367 : "memory", "cc", YUVTORGB_REGS_AVX2
4368       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4369 );
4370 // clang-format on
4371 }
4372 #endif // HAS_NV12TOARGBROW_AVX2
4373
4374 #if defined(HAS_NV21TOARGBROW_AVX2)
4375 // 16 pixels.
4376 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
4377 void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
4378 const uint8_t* vu_buf,
4379 uint8_t* dst_argb,
4380 const struct YuvConstants* yuvconstants,
4381 int width) {
4382 // clang-format off
4383 asm volatile (
4384 YUVTORGB_SETUP_AVX2(yuvconstants)
4385 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4386
4387 LABELALIGN
4388 "1: \n"
4389 READNV21_AVX2
4390 YUVTORGB_AVX2(yuvconstants)
4391 STOREARGB_AVX2
4392 "sub $0x10,%[width] \n"
4393 "jg 1b \n"
4394 "vzeroupper \n"
4395 : [y_buf]"+r"(y_buf), // %[y_buf]
4396 [vu_buf]"+r"(vu_buf), // %[vu_buf]
4397 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4398 [width]"+rm"(width) // %[width]
4399 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
4400 [kShuffleNV21]"m"(kShuffleNV21)
4401 : "memory", "cc", YUVTORGB_REGS_AVX2
4402       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4403 );
4404 // clang-format on
4405 }
4406 #endif // HAS_NV21TOARGBROW_AVX2
4407
4408 #if defined(HAS_YUY2TOARGBROW_AVX2)
4409 // 16 pixels.
4410 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
4411 void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
4412 uint8_t* dst_argb,
4413 const struct YuvConstants* yuvconstants,
4414 int width) {
4415 // clang-format off
4416 asm volatile (
4417 YUVTORGB_SETUP_AVX2(yuvconstants)
4418 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4419
4420 LABELALIGN
4421 "1: \n"
4422 READYUY2_AVX2
4423 YUVTORGB_AVX2(yuvconstants)
4424 STOREARGB_AVX2
4425 "sub $0x10,%[width] \n"
4426 "jg 1b \n"
4427 "vzeroupper \n"
4428 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
4429 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4430 [width]"+rm"(width) // %[width]
4431 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
4432 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
4433 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
4434 : "memory", "cc", YUVTORGB_REGS_AVX2
4435       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4436 );
4437 // clang-format on
4438 }
4439 #endif // HAS_YUY2TOARGBROW_AVX2
4440
4441 #if defined(HAS_UYVYTOARGBROW_AVX2)
4442 // 16 pixels.
4443 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
4444 void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
4445 uint8_t* dst_argb,
4446 const struct YuvConstants* yuvconstants,
4447 int width) {
4448 // clang-format off
4449 asm volatile (
4450 YUVTORGB_SETUP_AVX2(yuvconstants)
4451 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4452
4453 LABELALIGN
4454 "1: \n"
4455 READUYVY_AVX2
4456 YUVTORGB_AVX2(yuvconstants)
4457 STOREARGB_AVX2
4458 "sub $0x10,%[width] \n"
4459 "jg 1b \n"
4460 "vzeroupper \n"
4461 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
4462 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4463 [width]"+rm"(width) // %[width]
4464 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
4465 [kShuffleUYVYY]"m"(kShuffleUYVYY),
4466 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
4467 : "memory", "cc", YUVTORGB_REGS_AVX2
4468 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4469 );
4470 // clang-format on
4471 }
4472 #endif // HAS_UYVYTOARGBROW_AVX2
4473
4474 #if defined(HAS_P210TOARGBROW_AVX2)
4475 // 16 pixels.
4476 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
4477 void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
4478 const uint16_t* uv_buf,
4479 uint8_t* dst_argb,
4480 const struct YuvConstants* yuvconstants,
4481 int width) {
4482 // clang-format off
4483 asm volatile (
4484 YUVTORGB_SETUP_AVX2(yuvconstants)
4485 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4486
4487 LABELALIGN
4488 "1: \n"
4489 READP210_AVX2
4490 YUVTORGB_AVX2(yuvconstants)
4491 STOREARGB_AVX2
4492 "sub $0x10,%[width] \n"
4493 "jg 1b \n"
4494 "vzeroupper \n"
4495 : [y_buf]"+r"(y_buf), // %[y_buf]
4496 [uv_buf]"+r"(uv_buf), // %[uv_buf]
4497 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4498 [width]"+rm"(width) // %[width]
4499 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4500 : "memory", "cc", YUVTORGB_REGS_AVX2
4501       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4502 );
4503 // clang-format on
4504 }
4505 #endif // HAS_P210TOARGBROW_AVX2
4506
4507 #if defined(HAS_P410TOARGBROW_AVX2)
4508 // 16 pixels.
4509 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
4510 void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
4511 const uint16_t* uv_buf,
4512 uint8_t* dst_argb,
4513 const struct YuvConstants* yuvconstants,
4514 int width) {
4515 // clang-format off
4516 asm volatile (
4517 YUVTORGB_SETUP_AVX2(yuvconstants)
4518 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4519
4520 LABELALIGN
4521 "1: \n"
4522 READP410_AVX2
4523 YUVTORGB_AVX2(yuvconstants)
4524 STOREARGB_AVX2
4525 "sub $0x10,%[width] \n"
4526 "jg 1b \n"
4527 "vzeroupper \n"
4528 : [y_buf]"+r"(y_buf), // %[y_buf]
4529 [uv_buf]"+r"(uv_buf), // %[uv_buf]
4530 [dst_argb]"+r"(dst_argb), // %[dst_argb]
4531 [width]"+rm"(width) // %[width]
4532 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4533 : "memory", "cc", YUVTORGB_REGS_AVX2
4534       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4535 );
4536 // clang-format on
4537 }
4538 #endif // HAS_P410TOARGBROW_AVX2
4539
4540 #if defined(HAS_P210TOAR30ROW_AVX2)
4541 // 16 pixels
4542 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
4543 void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
4544 const uint16_t* uv_buf,
4545 uint8_t* dst_ar30,
4546 const struct YuvConstants* yuvconstants,
4547 int width) {
4548 asm volatile (
4549 YUVTORGB_SETUP_AVX2(yuvconstants)
4550 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
4551 "vpsrlw $14,%%ymm5,%%ymm5 \n"
4552 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
4553 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
4554 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
4555 "vpsrlw $6,%%ymm7,%%ymm7 \n"
4556
4557 LABELALIGN
4558 "1: \n"
4559 READP210_AVX2
4560 YUVTORGB16_AVX2(yuvconstants)
4561 STOREAR30_AVX2
4562 "sub $0x10,%[width] \n"
4563 "jg 1b \n"
4564
4565 "vzeroupper \n"
4566 : [y_buf]"+r"(y_buf), // %[y_buf]
4567 [uv_buf]"+r"(uv_buf), // %[uv_buf]
4568 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
4569 [width]"+rm"(width) // %[width]
4570 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4571 : "memory", "cc", YUVTORGB_REGS_AVX2
4572 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4573 );
4574 }
4575 #endif // HAS_P210TOAR30ROW_AVX2
4576
4577 #if defined(HAS_P410TOAR30ROW_AVX2)
4578 // 16 pixels
4579 // 16 UV values with 16 Y producing 16 AR30 (64 bytes).
4580 void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
4581 const uint16_t* uv_buf,
4582 uint8_t* dst_ar30,
4583 const struct YuvConstants* yuvconstants,
4584 int width) {
4585 asm volatile (
4586 YUVTORGB_SETUP_AVX2(yuvconstants)
4587 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
4588 "vpsrlw $14,%%ymm5,%%ymm5 \n"
4589 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
4590 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
4591 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
4592 "vpsrlw $6,%%ymm7,%%ymm7 \n"
4593
4594 LABELALIGN
4595 "1: \n"
4596 READP410_AVX2
4597 YUVTORGB16_AVX2(yuvconstants)
4598 STOREAR30_AVX2
4599 "sub $0x10,%[width] \n"
4600 "jg 1b \n"
4601
4602 "vzeroupper \n"
4603 : [y_buf]"+r"(y_buf), // %[y_buf]
4604 [uv_buf]"+r"(uv_buf), // %[uv_buf]
4605 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
4606 [width]"+rm"(width) // %[width]
4607 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
4608 : "memory", "cc", YUVTORGB_REGS_AVX2
4609 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4610 );
4611 }
4612 #endif // HAS_P410TOAR30ROW_AVX2
4613
4614 #ifdef HAS_I400TOARGBROW_SSE2
4615 void I400ToARGBRow_SSE2(const uint8_t* y_buf,
4616 uint8_t* dst_argb,
4617 const struct YuvConstants* yuvconstants,
4618 int width) {
4619 asm volatile(
4620 "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164
4621       "movdqa 128(%3),%%xmm3 \n" // ygb = -1160 = 1.164 * 16
4622 "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
4623 "pslld $0x18,%%xmm4 \n"
4624
4625 LABELALIGN
4626 "1: \n"
4627 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
4628 "movq (%0),%%xmm0 \n"
4629 "lea 0x8(%0),%0 \n"
4630 "punpcklbw %%xmm0,%%xmm0 \n"
4631 "pmulhuw %%xmm2,%%xmm0 \n"
4632 "paddsw %%xmm3,%%xmm0 \n"
4633 "psraw $6, %%xmm0 \n"
4634 "packuswb %%xmm0,%%xmm0 \n"
4635
4636 // Step 2: Weave into ARGB
4637 "punpcklbw %%xmm0,%%xmm0 \n"
4638 "movdqa %%xmm0,%%xmm1 \n"
4639 "punpcklwd %%xmm0,%%xmm0 \n"
4640 "punpckhwd %%xmm1,%%xmm1 \n"
4641 "por %%xmm4,%%xmm0 \n"
4642 "por %%xmm4,%%xmm1 \n"
4643 "movdqu %%xmm0,(%1) \n"
4644 "movdqu %%xmm1,0x10(%1) \n"
4645 "lea 0x20(%1),%1 \n"
4646
4647 "sub $0x8,%2 \n"
4648 "jg 1b \n"
4649 : "+r"(y_buf), // %0
4650 "+r"(dst_argb), // %1
4651 "+rm"(width) // %2
4652 : "r"(yuvconstants) // %3
4653 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
4654 }
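// Worked example of the scaling above with BT.601 constants: for y = 235,
// pmulhuw yields (235 * 0x0101 * 18997) >> 16 = 17506; adding ygb (-1160)
// gives 16346, and 16346 >> 6 = 255 (video white -> full scale). For y = 16,
// ((4112 * 18997) >> 16) - 1160 = 31, and 31 >> 6 = 0 (video black -> 0).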
4655 #endif // HAS_I400TOARGBROW_SSE2
4656
4657 #ifdef HAS_I400TOARGBROW_AVX2
4658 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
4659 // note: vpunpcklbw interleaves within 128-bit lanes ("mutates" byte order); vpackuswb restores ("unmutates") it.
4660 void I400ToARGBRow_AVX2(const uint8_t* y_buf,
4661 uint8_t* dst_argb,
4662 const struct YuvConstants* yuvconstants,
4663 int width) {
4664 asm volatile(
4665 "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164
4666 "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
4667 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
4668 "vpslld $0x18,%%ymm4,%%ymm4 \n"
4669
4670 LABELALIGN
4671 "1: \n"
4672 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
4673 "vmovdqu (%0),%%xmm0 \n"
4674 "lea 0x10(%0),%0 \n"
4675 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4676 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
4677 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
4678 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n"
4679 "vpsraw $0x6,%%ymm0,%%ymm0 \n"
4680 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
4681 "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
4682 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
4683 "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
4684 "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
4685 "vpor %%ymm4,%%ymm0,%%ymm0 \n"
4686 "vpor %%ymm4,%%ymm1,%%ymm1 \n"
4687 "vmovdqu %%ymm0,(%1) \n"
4688 "vmovdqu %%ymm1,0x20(%1) \n"
4689 "lea 0x40(%1),%1 \n"
4690 "sub $0x10,%2 \n"
4691 "jg 1b \n"
4692 "vzeroupper \n"
4693 : "+r"(y_buf), // %0
4694 "+r"(dst_argb), // %1
4695 "+rm"(width) // %2
4696 : "r"(yuvconstants) // %3
4697 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
4698 }
4699 #endif // HAS_I400TOARGBROW_AVX2
4700
4701 #ifdef HAS_MIRRORROW_SSSE3
4702 // Shuffle table for reversing the bytes.
4703 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
4704 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
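// MirrorRow walks the source backwards: each load starts at src + width - 16
// (the -0x10(%0,%2,1) addressing) and pshufb with kShuffleMirror reverses the
// 16 bytes, so the net effect is the scalar loop (sketch):
/*
  for (int i = 0; i < width; ++i) dst[i] = src[width - 1 - i];
*/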
4705
4706 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
4707 intptr_t temp_width = (intptr_t)(width);
4708 asm volatile(
4709
4710 "movdqa %3,%%xmm5 \n"
4711
4712 LABELALIGN
4713 "1: \n"
4714 "movdqu -0x10(%0,%2,1),%%xmm0 \n"
4715 "pshufb %%xmm5,%%xmm0 \n"
4716 "movdqu %%xmm0,(%1) \n"
4717 "lea 0x10(%1),%1 \n"
4718 "sub $0x10,%2 \n"
4719 "jg 1b \n"
4720 : "+r"(src), // %0
4721 "+r"(dst), // %1
4722 "+r"(temp_width) // %2
4723 : "m"(kShuffleMirror) // %3
4724 : "memory", "cc", "xmm0", "xmm5");
4725 }
4726 #endif // HAS_MIRRORROW_SSSE3
4727
4728 #ifdef HAS_MIRRORROW_AVX2
4729 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
4730 intptr_t temp_width = (intptr_t)(width);
4731 asm volatile(
4732
4733 "vbroadcastf128 %3,%%ymm5 \n"
4734
4735 LABELALIGN
4736 "1: \n"
4737 "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
4738 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
4739 "vpermq $0x4e,%%ymm0,%%ymm0 \n"
4740 "vmovdqu %%ymm0,(%1) \n"
4741 "lea 0x20(%1),%1 \n"
4742 "sub $0x20,%2 \n"
4743 "jg 1b \n"
4744 "vzeroupper \n"
4745 : "+r"(src), // %0
4746 "+r"(dst), // %1
4747 "+r"(temp_width) // %2
4748 : "m"(kShuffleMirror) // %3
4749 : "memory", "cc", "xmm0", "xmm5");
4750 }
4751 #endif // HAS_MIRRORROW_AVX2
4752
4753 #ifdef HAS_MIRRORUVROW_SSSE3
4754 // Shuffle table for reversing the UV.
4755 static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
4756 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
4757
4758 void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
4759 intptr_t temp_width = (intptr_t)(width);
4760 asm volatile(
4761
4762 "movdqa %3,%%xmm5 \n"
4763
4764 LABELALIGN
4765 "1: \n"
4766 "movdqu -0x10(%0,%2,2),%%xmm0 \n"
4767 "pshufb %%xmm5,%%xmm0 \n"
4768 "movdqu %%xmm0,(%1) \n"
4769 "lea 0x10(%1),%1 \n"
4770 "sub $0x8,%2 \n"
4771 "jg 1b \n"
4772 : "+r"(src_uv), // %0
4773 "+r"(dst_uv), // %1
4774 "+r"(temp_width) // %2
4775 : "m"(kShuffleMirrorUV) // %3
4776 : "memory", "cc", "xmm0", "xmm5");
4777 }
4778 #endif // HAS_MIRRORUVROW_SSSE3
4779
4780 #ifdef HAS_MIRRORUVROW_AVX2
4781 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
4782 intptr_t temp_width = (intptr_t)(width);
4783 asm volatile(
4784
4785 "vbroadcastf128 %3,%%ymm5 \n"
4786
4787 LABELALIGN
4788 "1: \n"
4789 "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
4790 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
4791 "vpermq $0x4e,%%ymm0,%%ymm0 \n"
4792 "vmovdqu %%ymm0,(%1) \n"
4793 "lea 0x20(%1),%1 \n"
4794 "sub $0x10,%2 \n"
4795 "jg 1b \n"
4796 "vzeroupper \n"
4797 : "+r"(src_uv), // %0
4798 "+r"(dst_uv), // %1
4799 "+r"(temp_width) // %2
4800 : "m"(kShuffleMirrorUV) // %3
4801 : "memory", "cc", "xmm0", "xmm5");
4802 }
4803 #endif // HAS_MIRRORUVROW_AVX2
4804
4805 #ifdef HAS_MIRRORSPLITUVROW_SSSE3
4806 // Shuffle table for reversing the bytes of UV channels.
4807 static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
4808 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
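// kShuffleMirrorSplitUV both mirrors and deinterleaves: 16 bytes
// U0 V0 ... U7 V7 become low qword U7..U0 and high qword V7..V0, which
// movlpd/movhpd then store to dst_u and dst_v respectively.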
4809 void MirrorSplitUVRow_SSSE3(const uint8_t* src,
4810 uint8_t* dst_u,
4811 uint8_t* dst_v,
4812 int width) {
4813 intptr_t temp_width = (intptr_t)(width);
4814 asm volatile(
4815 "movdqa %4,%%xmm1 \n"
4816 "lea -0x10(%0,%3,2),%0 \n"
4817 "sub %1,%2 \n"
4818
4819 LABELALIGN
4820 "1: \n"
4821 "movdqu (%0),%%xmm0 \n"
4822 "lea -0x10(%0),%0 \n"
4823 "pshufb %%xmm1,%%xmm0 \n"
4824 "movlpd %%xmm0,(%1) \n"
4825 "movhpd %%xmm0,0x00(%1,%2,1) \n"
4826 "lea 0x8(%1),%1 \n"
4827 "sub $8,%3 \n"
4828 "jg 1b \n"
4829 : "+r"(src), // %0
4830 "+r"(dst_u), // %1
4831 "+r"(dst_v), // %2
4832 "+r"(temp_width) // %3
4833 : "m"(kShuffleMirrorSplitUV) // %4
4834 : "memory", "cc", "xmm0", "xmm1");
4835 }
4836 #endif // HAS_MIRRORSPLITUVROW_SSSE3
4837
4838 #ifdef HAS_RGB24MIRRORROW_SSSE3
4839
4840 // Shuffle the first 5 pixels to the last 5, mirrored; first byte is zero.
4841 static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
4842 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
4843
4844 // Shuffle the last 5 pixels to the first 5, mirrored; last byte is zero.
4845 static const uvec8 kShuffleMirrorRGB1 = {
4846 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
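// RGB24 pixels are 3 bytes, so a 16-byte register covers 5 whole pixels
// (15 bytes) plus one spare byte; hence the loads below step by 15, with the
// last pixel of each 48-byte group fetched separately at offset 32 and
// handled by the second shuffle table.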
4847
4848 // Shuffle 5 pixels at a time (15 bytes)
4849 void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
4850 uint8_t* dst_rgb24,
4851 int width) {
4852 intptr_t temp_width = (intptr_t)(width);
4853 src_rgb24 += width * 3 - 48;
4854 asm volatile(
4855 "movdqa %3,%%xmm4 \n"
4856 "movdqa %4,%%xmm5 \n"
4857
4858 LABELALIGN
4859 "1: \n"
4860 "movdqu (%0),%%xmm0 \n" // first 5
4861 "movdqu 15(%0),%%xmm1 \n" // next 5
4862 "movdqu 30(%0),%%xmm2 \n" // next 5
4863 "movdqu 32(%0),%%xmm3 \n" // last 1 special
4864 "pshufb %%xmm4,%%xmm0 \n"
4865 "pshufb %%xmm4,%%xmm1 \n"
4866 "pshufb %%xmm4,%%xmm2 \n"
4867 "pshufb %%xmm5,%%xmm3 \n"
4868 "lea -0x30(%0),%0 \n"
4869 "movdqu %%xmm0,32(%1) \n" // last 5
4870 "movdqu %%xmm1,17(%1) \n" // next 5
4871 "movdqu %%xmm2,2(%1) \n" // next 5
4872 "movlpd %%xmm3,0(%1) \n" // first 1
4873 "lea 0x30(%1),%1 \n"
4874 "sub $0x10,%2 \n"
4875 "jg 1b \n"
4876 : "+r"(src_rgb24), // %0
4877 "+r"(dst_rgb24), // %1
4878 "+r"(temp_width) // %2
4879 : "m"(kShuffleMirrorRGB0), // %3
4880 "m"(kShuffleMirrorRGB1) // %4
4881 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4882 }
4883 #endif // HAS_RGB24MIRRORROW_SSSE3
4884
4885 #ifdef HAS_ARGBMIRRORROW_SSE2
4886
4887 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
4888 intptr_t temp_width = (intptr_t)(width);
4889 asm volatile(
4890
4891 "lea -0x10(%0,%2,4),%0 \n"
4892
4893 LABELALIGN
4894 "1: \n"
4895 "movdqu (%0),%%xmm0 \n"
4896 "pshufd $0x1b,%%xmm0,%%xmm0 \n"
4897 "lea -0x10(%0),%0 \n"
4898 "movdqu %%xmm0,(%1) \n"
4899 "lea 0x10(%1),%1 \n"
4900 "sub $0x4,%2 \n"
4901 "jg 1b \n"
4902 : "+r"(src), // %0
4903 "+r"(dst), // %1
4904 "+r"(temp_width) // %2
4905 :
4906 : "memory", "cc", "xmm0");
4907 }
4908 #endif // HAS_ARGBMIRRORROW_SSE2
4909
4910 #ifdef HAS_ARGBMIRRORROW_AVX2
4911 // Shuffle table for reversing the bytes.
4912 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
4913 void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
4914 intptr_t temp_width = (intptr_t)(width);
4915 asm volatile(
4916
4917 "vmovdqu %3,%%ymm5 \n"
4918
4919 LABELALIGN
4920 "1: \n"
4921 "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
4922 "vmovdqu %%ymm0,(%1) \n"
4923 "lea 0x20(%1),%1 \n"
4924 "sub $0x8,%2 \n"
4925 "jg 1b \n"
4926 "vzeroupper \n"
4927 : "+r"(src), // %0
4928 "+r"(dst), // %1
4929 "+r"(temp_width) // %2
4930 : "m"(kARGBShuffleMirror_AVX2) // %3
4931 : "memory", "cc", "xmm0", "xmm5");
4932 }
4933 #endif // HAS_ARGBMIRRORROW_AVX2
4934
4935 #ifdef HAS_SPLITUVROW_AVX2
4936 void SplitUVRow_AVX2(const uint8_t* src_uv,
4937 uint8_t* dst_u,
4938 uint8_t* dst_v,
4939 int width) {
4940 asm volatile(
4941 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4942 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
4943 "sub %1,%2 \n"
4944
4945 LABELALIGN
4946 "1: \n"
4947 "vmovdqu (%0),%%ymm0 \n"
4948 "vmovdqu 0x20(%0),%%ymm1 \n"
4949 "lea 0x40(%0),%0 \n"
4950 "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
4951 "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
4952 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
4953 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
4954 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4955 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
4956 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4957 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
4958 "vmovdqu %%ymm0,(%1) \n"
4959 "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
4960 "lea 0x20(%1),%1 \n"
4961 "sub $0x20,%3 \n"
4962 "jg 1b \n"
4963 "vzeroupper \n"
4964 : "+r"(src_uv), // %0
4965 "+r"(dst_u), // %1
4966 "+r"(dst_v), // %2
4967 "+r"(width) // %3
4968 :
4969 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4970 }
4971 #endif // HAS_SPLITUVROW_AVX2
4972
4973 #ifdef HAS_SPLITUVROW_SSE2
4974 void SplitUVRow_SSE2(const uint8_t* src_uv,
4975 uint8_t* dst_u,
4976 uint8_t* dst_v,
4977 int width) {
4978 asm volatile(
4979 "pcmpeqb %%xmm5,%%xmm5 \n"
4980 "psrlw $0x8,%%xmm5 \n"
4981 "sub %1,%2 \n"
4982
4983 LABELALIGN
4984 "1: \n"
4985 "movdqu (%0),%%xmm0 \n"
4986 "movdqu 0x10(%0),%%xmm1 \n"
4987 "lea 0x20(%0),%0 \n"
4988 "movdqa %%xmm0,%%xmm2 \n"
4989 "movdqa %%xmm1,%%xmm3 \n"
4990 "pand %%xmm5,%%xmm0 \n"
4991 "pand %%xmm5,%%xmm1 \n"
4992 "packuswb %%xmm1,%%xmm0 \n"
4993 "psrlw $0x8,%%xmm2 \n"
4994 "psrlw $0x8,%%xmm3 \n"
4995 "packuswb %%xmm3,%%xmm2 \n"
4996 "movdqu %%xmm0,(%1) \n"
4997 "movdqu %%xmm2,0x00(%1,%2,1) \n"
4998 "lea 0x10(%1),%1 \n"
4999 "sub $0x10,%3 \n"
5000 "jg 1b \n"
5001 : "+r"(src_uv), // %0
5002 "+r"(dst_u), // %1
5003 "+r"(dst_v), // %2
5004 "+r"(width) // %3
5005 :
5006 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
5007 }
5008 #endif // HAS_SPLITUVROW_SSE2
5009
5010 #ifdef HAS_DETILEROW_SSE2
5011 void DetileRow_SSE2(const uint8_t* src,
5012 ptrdiff_t src_tile_stride,
5013 uint8_t* dst,
5014 int width) {
5015 asm volatile(
5016 "1: \n"
5017 "movdqu (%0),%%xmm0 \n"
5018 "sub $0x10,%2 \n"
5019 "lea (%0,%3),%0 \n"
5020 "movdqu %%xmm0,(%1) \n"
5021 "lea 0x10(%1),%1 \n"
5022 "jg 1b \n"
5023 : "+r"(src), // %0
5024 "+r"(dst), // %1
5025 "+r"(width) // %2
5026 : "r"(src_tile_stride) // %3
5027 : "cc", "memory", "xmm0");
5028 }
5029 #endif // HAS_DETILEROW_SSE2
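// Detiling gathers one linear output row from a row of tiles: every 16-byte
// chunk of the destination comes from the next tile, src_tile_stride bytes
// further along in the tiled source, so one pass re-linearizes the layout
// 16 bytes at a time.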
5030
5031 #ifdef HAS_DETILEROW_16_SSE2
5032 void DetileRow_16_SSE2(const uint16_t* src,
5033 ptrdiff_t src_tile_stride,
5034 uint16_t* dst,
5035 int width) {
5036 asm volatile(
5037 "1: \n"
5038 "movdqu (%0),%%xmm0 \n"
5039 "movdqu 0x10(%0),%%xmm1 \n"
5040 "lea (%0,%3,2),%0 \n"
5041 "movdqu %%xmm0,(%1) \n"
5042 "movdqu %%xmm1,0x10(%1) \n"
5043 "lea 0x20(%1),%1 \n"
5044 "sub $0x10,%2 \n"
5045 "jg 1b \n"
5046 : "+r"(src), // %0
5047 "+r"(dst), // %1
5048 "+r"(width) // %2
5049 : "r"(src_tile_stride) // %3
5050 : "cc", "memory", "xmm0", "xmm1");
5051 }
5052 #endif // HAS_DETILEROW_16_SSE2
5053
5054 #ifdef HAS_DETILEROW_16_AVX
5055 void DetileRow_16_AVX(const uint16_t* src,
5056 ptrdiff_t src_tile_stride,
5057 uint16_t* dst,
5058 int width) {
5059 asm volatile(
5060 "1: \n"
5061 "vmovdqu (%0),%%ymm0 \n"
5062 "lea (%0,%3,2),%0 \n"
5063 "vmovdqu %%ymm0,(%1) \n"
5064 "lea 0x20(%1),%1 \n"
5065 "sub $0x10,%2 \n"
5066 "jg 1b \n"
5067 "vzeroupper \n"
5068 : "+r"(src), // %0
5069 "+r"(dst), // %1
5070 "+r"(width) // %2
5071 : "r"(src_tile_stride) // %3
5072 : "cc", "memory", "xmm0");
5073 }
#endif // HAS_DETILEROW_16_AVX
5075
5076 #ifdef HAS_DETILETOYUY2_SSE2
5077 // Read 16 Y, 8 UV, and write 8 YUYV.
void DetileToYUY2_SSE2(const uint8_t* src_y,
5079 ptrdiff_t src_y_tile_stride,
5080 const uint8_t* src_uv,
5081 ptrdiff_t src_uv_tile_stride,
5082 uint8_t* dst_yuy2,
5083 int width) {
5084 asm volatile(
5085 "1: \n"
5086 "movdqu (%0),%%xmm0 \n" // Load 16 Y
5087 "sub $0x10,%3 \n"
5088 "lea (%0,%4),%0 \n"
5089 "movdqu (%1),%%xmm1 \n" // Load 8 UV
5090 "lea (%1,%5),%1 \n"
5091 "movdqu %%xmm0,%%xmm2 \n"
5092 "punpcklbw %%xmm1,%%xmm0 \n"
5093 "punpckhbw %%xmm1,%%xmm2 \n"
5094 "movdqu %%xmm0,(%2) \n"
5095 "movdqu %%xmm2,0x10(%2) \n"
5096 "lea 0x20(%2),%2 \n"
5097 "jg 1b \n"
5098 : "+r"(src_y), // %0
5099 "+r"(src_uv), // %1
5100 "+r"(dst_yuy2), // %2
5101 "+r"(width) // %3
5102 : "r"(src_y_tile_stride), // %4
5103 "r"(src_uv_tile_stride) // %5
5104 : "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list
5105 );
5106 }
#endif // HAS_DETILETOYUY2_SSE2
5108
5109 #ifdef HAS_DETILESPLITUVROW_SSSE3
5110 // TODO(greenjustin): Look into generating these constants instead of loading
5111 // them since this can cause branch mispredicts for fPIC code on 32-bit
5112 // machines.
5113 static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14,
5114 1, 3, 5, 7, 9, 11, 13, 15};
5115
5116 // TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very
5117 // slow on older SSE2 processors.
void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
5119 ptrdiff_t src_tile_stride,
5120 uint8_t* dst_u,
5121 uint8_t* dst_v,
5122 int width) {
5123 asm volatile(
5124 "movdqu %4,%%xmm1 \n"
5125 "1: \n"
5126 "movdqu (%0),%%xmm0 \n"
5127 "lea (%0, %5),%0 \n"
5128 "pshufb %%xmm1,%%xmm0 \n"
5129 "movq %%xmm0,(%1) \n"
5130 "lea 0x8(%1),%1 \n"
5131 "movhps %%xmm0,(%2) \n"
5132 "lea 0x8(%2),%2 \n"
5133 "sub $0x10,%3 \n"
5134 "jg 1b \n"
5135 : "+r"(src_uv), // %0
5136 "+r"(dst_u), // %1
5137 "+r"(dst_v), // %2
5138 "+r"(width) // %3
5139 : "m"(kDeinterlaceUV), // %4
5140 "r"(src_tile_stride) // %5
5141 : "cc", "memory", "xmm0", "xmm1");
5142 }
5143 #endif // HAS_DETILESPLITUVROW_SSSE3
5144
5145 #ifdef HAS_MERGEUVROW_AVX512BW
void MergeUVRow_AVX512BW(const uint8_t* src_u,
5147 const uint8_t* src_v,
5148 uint8_t* dst_uv,
5149 int width) {
5150 asm volatile("sub %0,%1 \n"
5151
5152 LABELALIGN
5153 "1: \n"
5154 "vpmovzxbw (%0),%%zmm0 \n"
5155 "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n"
5156 "lea 0x20(%0),%0 \n"
5157 "vpsllw $0x8,%%zmm1,%%zmm1 \n"
5158 "vporq %%zmm0,%%zmm1,%%zmm2 \n"
5159 "vmovdqu64 %%zmm2,(%2) \n"
5160 "lea 0x40(%2),%2 \n"
5161 "sub $0x20,%3 \n"
5162 "jg 1b \n"
5163 "vzeroupper \n"
5164 : "+r"(src_u), // %0
5165 "+r"(src_v), // %1
5166 "+r"(dst_uv), // %2
5167 "+r"(width) // %3
5168 :
5169 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5170 }
5171 #endif // HAS_MERGEUVROW_AVX512BW
5172
5173 #ifdef HAS_MERGEUVROW_AVX2
void MergeUVRow_AVX2(const uint8_t* src_u,
5175 const uint8_t* src_v,
5176 uint8_t* dst_uv,
5177 int width) {
5178 asm volatile("sub %0,%1 \n"
5179
5180 LABELALIGN
5181 "1: \n"
5182 "vpmovzxbw (%0),%%ymm0 \n"
5183 "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n"
5184 "lea 0x10(%0),%0 \n"
5185 "vpsllw $0x8,%%ymm1,%%ymm1 \n"
5186 "vpor %%ymm0,%%ymm1,%%ymm2 \n"
5187 "vmovdqu %%ymm2,(%2) \n"
5188 "lea 0x20(%2),%2 \n"
5189 "sub $0x10,%3 \n"
5190 "jg 1b \n"
5191 "vzeroupper \n"
5192 : "+r"(src_u), // %0
5193 "+r"(src_v), // %1
5194 "+r"(dst_uv), // %2
5195 "+r"(width) // %3
5196 :
5197 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5198 }
5199 #endif // HAS_MERGEUVROW_AVX2
5200
5201 #ifdef HAS_MERGEUVROW_SSE2
void MergeUVRow_SSE2(const uint8_t* src_u,
5203 const uint8_t* src_v,
5204 uint8_t* dst_uv,
5205 int width) {
5206 asm volatile("sub %0,%1 \n"
5207
5208 LABELALIGN
5209 "1: \n"
5210 "movdqu (%0),%%xmm0 \n"
5211 "movdqu 0x00(%0,%1,1),%%xmm1 \n"
5212 "lea 0x10(%0),%0 \n"
5213 "movdqa %%xmm0,%%xmm2 \n"
5214 "punpcklbw %%xmm1,%%xmm0 \n"
5215 "punpckhbw %%xmm1,%%xmm2 \n"
5216 "movdqu %%xmm0,(%2) \n"
5217 "movdqu %%xmm2,0x10(%2) \n"
5218 "lea 0x20(%2),%2 \n"
5219 "sub $0x10,%3 \n"
5220 "jg 1b \n"
5221 : "+r"(src_u), // %0
5222 "+r"(src_v), // %1
5223 "+r"(dst_uv), // %2
5224 "+r"(width) // %3
5225 :
5226 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5227 }
5228 #endif // HAS_MERGEUVROW_SSE2
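
// Scalar model of the MergeUVRow kernels: re-interleave planar U and V into
// UV pairs (the inverse of SplitUVRow). Illustrative sketch only.
static void MergeUVRow_Sketch_C(const uint8_t* src_u,
                                const uint8_t* src_v,
                                uint8_t* dst_uv,
                                int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[2 * x] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}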
5229
5230 #ifdef HAS_MERGEUVROW_16_AVX2
void MergeUVRow_16_AVX2(const uint16_t* src_u,
5232 const uint16_t* src_v,
5233 uint16_t* dst_uv,
5234 int depth,
5235 int width) {
5236 // clang-format off
5237 asm volatile (
5238 "vmovd %4,%%xmm3 \n"
5239 "vmovd %5,%%xmm4 \n"
5240
5241
5242 "sub %0,%1 \n"
5243 // 8 pixels per loop.
5244
5245 LABELALIGN
5246 "1: \n"
5247 "vpmovzxwd (%0),%%ymm0 \n"
5248 "vpmovzxwd 0x00(%0,%1,1),%%ymm1 \n"
5249 "lea 0x10(%0),%0 \n"
5250 "vpsllw %%xmm3,%%ymm0,%%ymm0 \n"
5251 "vpslld %%xmm4,%%ymm1,%%ymm1 \n"
5252 "vpor %%ymm0,%%ymm1,%%ymm2 \n"
5253 "vmovdqu %%ymm2,(%2) \n"
5254 "lea 0x20(%2),%2 \n"
5255 "sub $0x8,%3 \n"
5256 "jg 1b \n"
5257 "vzeroupper \n"
5258 : "+r"(src_u), // %0
5259 "+r"(src_v), // %1
5260 "+r"(dst_uv), // %2
5261 "+r"(width) // %3
5262 : "r"(16 - depth), // %4
5263 "r"(32 - depth) // %5
5264 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
5265 // clang-format on
5266 }
#endif // HAS_MERGEUVROW_16_AVX2
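
// Worked example for the 16-bit merge above: each channel is widened to 32
// bits, U is shifted left by (16 - depth) and V by (32 - depth), so both end
// up msb-aligned in their half of the packed UV dword. With 10-bit input:
//   %4 = 16 - 10 = 6,  %5 = 32 - 10 = 22
//   dword = ((uint32_t)v << 22) | ((uint32_t)u << 6)
// which is the P010-style msb-aligned layout.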
5268
5269 #ifdef HAS_SPLITUVROW_16_AVX2
5270 const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13,
5271 2, 3, 6, 7, 10, 11, 14, 15};
void SplitUVRow_16_AVX2(const uint16_t* src_uv,
5273 uint16_t* dst_u,
5274 uint16_t* dst_v,
5275 int depth,
5276 int width) {
5277 depth = 16 - depth;
5278 // clang-format off
5279 asm volatile (
5280 "vmovd %4,%%xmm3 \n"
5281 "vbroadcastf128 %5,%%ymm4 \n"
5282 "sub %1,%2 \n"
5283
5284 // 16 pixels per loop.
5285 LABELALIGN
5286 "1: \n"
5287 "vmovdqu (%0),%%ymm0 \n"
5288 "vmovdqu 0x20(%0),%%ymm1 \n"
5289 "add $0x40,%0 \n"
5290
5291 "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n"
5292 "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n"
5293 "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
5294 "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
5295 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
5296 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
5297 "vextractf128 $0x0,%%ymm0,(%1) \n"
5298 "vextractf128 $0x0,%%ymm1,0x10(%1) \n"
5299 "vextractf128 $0x1,%%ymm0,(%1,%2) \n"
5300 "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n"
5301 "add $0x20,%1 \n"
5302 "sub $0x10,%3 \n"
5303 "jg 1b \n"
5304 "vzeroupper \n"
5305 : "+r"(src_uv), // %0
5306 "+r"(dst_u), // %1
5307 "+r"(dst_v), // %2
5308 "+r"(width) // %3
5309 : "r"(depth), // %4
5310 "m"(kSplitUVShuffle16) // %5
5311 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
5312 // clang-format on
5313 }
5314 #endif // HAS_SPLITUVROW_16_AVX2
5315
5316 // Use scale to convert lsb formats to msb, depending how many bits there are:
5317 // 128 = 9 bits
5318 // 64 = 10 bits
5319 // 16 = 12 bits
5320 // 1 = 16 bits
5321 #ifdef HAS_MULTIPLYROW_16_AVX2
void MultiplyRow_16_AVX2(const uint16_t* src_y,
5323 uint16_t* dst_y,
5324 int scale,
5325 int width) {
5326 // clang-format off
5327 asm volatile (
5328 "vmovd %3,%%xmm3 \n"
5329 "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
5330 "vbroadcastss %%xmm3,%%ymm3 \n"
5331 "sub %0,%1 \n"
5332
5333 // 32 pixels per loop.
5334 LABELALIGN
5335 "1: \n"
5336 "vmovdqu (%0),%%ymm0 \n"
5337 "vmovdqu 0x20(%0),%%ymm1 \n"
5338 "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
5339 "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
5340 "vmovdqu %%ymm0,(%0,%1) \n"
5341 "vmovdqu %%ymm1,0x20(%0,%1) \n"
5342 "add $0x40,%0 \n"
5343 "sub $0x20,%2 \n"
5344 "jg 1b \n"
5345 "vzeroupper \n"
5346 : "+r"(src_y), // %0
5347 "+r"(dst_y), // %1
5348 "+r"(width) // %2
5349 : "r"(scale) // %3
5350 : "memory", "cc", "xmm0", "xmm1", "xmm3");
5351 // clang-format on
5352 }
5353 #endif // HAS_MULTIPLYROW_16_AVX2
5354
5355 // Use scale to convert msb formats to lsb, depending how many bits there are:
5356 // 512 = 9 bits
5357 // 1024 = 10 bits
5358 // 4096 = 12 bits
5359 // 65536 = 16 bits
5360 #ifdef HAS_DIVIDEROW_16_AVX2
void DivideRow_16_AVX2(const uint16_t* src_y,
5362 uint16_t* dst_y,
5363 int scale,
5364 int width) {
5365 // clang-format off
5366 asm volatile (
5367 "vmovd %3,%%xmm3 \n"
5368 "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
5369 "vbroadcastss %%xmm3,%%ymm3 \n"
5370 "sub %0,%1 \n"
5371
5372 // 32 pixels per loop.
5373 LABELALIGN
5374 "1: \n"
5375 "vmovdqu (%0),%%ymm0 \n"
5376 "vmovdqu 0x20(%0),%%ymm1 \n"
5377 "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n"
5378 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
5379 "vmovdqu %%ymm0,(%0,%1) \n"
5380 "vmovdqu %%ymm1,0x20(%0,%1) \n"
5381 "add $0x40,%0 \n"
5382 "sub $0x20,%2 \n"
5383 "jg 1b \n"
5384 "vzeroupper \n"
5385 : "+r"(src_y), // %0
5386 "+r"(dst_y), // %1
5387 "+r"(width), // %2
5388 "+r"(scale) // %3
5389 :
5390 : "memory", "cc", "xmm0", "xmm1", "xmm3");
5391 // clang-format on
5392 }
#endif // HAS_DIVIDEROW_16_AVX2
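
// The two scale tables above are just powers of two of the bit depth. A
// hedged helper sketch (hypothetical names, not libyuv API):
static int MultiplyRow16Scale(int depth) {  // lsb -> msb, e.g. 10 bits -> 64
  return 1 << (16 - depth);
}
static int DivideRow16Scale(int depth) {  // msb -> lsb, e.g. 10 bits -> 1024
  return 1 << depth;
}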
5394
5395 // Use scale to convert lsb formats to msb, depending how many bits there are:
5396 // 32768 = 9 bits
5397 // 16384 = 10 bits
5398 // 4096 = 12 bits
5399 // 256 = 16 bits
void Convert16To8Row_SSSE3(const uint16_t* src_y,
5401 uint8_t* dst_y,
5402 int scale,
5403 int width) {
5404 // clang-format off
5405 asm volatile (
5406 "movd %3,%%xmm2 \n"
5407 "punpcklwd %%xmm2,%%xmm2 \n"
5408 "pshufd $0x0,%%xmm2,%%xmm2 \n"
5409
// 16 pixels per loop.
5411 LABELALIGN
5412 "1: \n"
5413 "movdqu (%0),%%xmm0 \n"
5414 "movdqu 0x10(%0),%%xmm1 \n"
5415 "add $0x20,%0 \n"
5416 "pmulhuw %%xmm2,%%xmm0 \n"
5417 "pmulhuw %%xmm2,%%xmm1 \n"
5418 "packuswb %%xmm1,%%xmm0 \n"
5419 "movdqu %%xmm0,(%1) \n"
5420 "add $0x10,%1 \n"
5421 "sub $0x10,%2 \n"
5422 "jg 1b \n"
5423 : "+r"(src_y), // %0
5424 "+r"(dst_y), // %1
5425 "+r"(width) // %2
5426 : "r"(scale) // %3
5427 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5428 // clang-format on
5429 }
5430
5431 #ifdef HAS_CONVERT16TO8ROW_AVX2
void Convert16To8Row_AVX2(const uint16_t* src_y,
5433 uint8_t* dst_y,
5434 int scale,
5435 int width) {
5436 // clang-format off
5437 asm volatile (
5438 "vmovd %3,%%xmm2 \n"
5439 "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
5440 "vbroadcastss %%xmm2,%%ymm2 \n"
5441
5442 // 32 pixels per loop.
5443 LABELALIGN
5444 "1: \n"
5445 "vmovdqu (%0),%%ymm0 \n"
5446 "vmovdqu 0x20(%0),%%ymm1 \n"
5447 "add $0x40,%0 \n"
5448 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
5449 "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
5450 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
5451 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
5452 "vmovdqu %%ymm0,(%1) \n"
5453 "add $0x20,%1 \n"
5454 "sub $0x20,%2 \n"
5455 "jg 1b \n"
5456 "vzeroupper \n"
5457 : "+r"(src_y), // %0
5458 "+r"(dst_y), // %1
5459 "+r"(width) // %2
5460 : "r"(scale) // %3
5461 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5462 // clang-format on
5463 }
5464 #endif // HAS_CONVERT16TO8ROW_AVX2
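
// Scalar model of the Convert16To8 kernels: pmulhuw computes
// (v * scale) >> 16 and the byte pack saturates. Sketch only:
static uint8_t Convert16To8_Sketch(uint16_t v, int scale) {
  uint32_t x = ((uint32_t)v * (uint32_t)scale) >> 16;  // pmulhuw
  return (uint8_t)(x > 255 ? 255 : x);                 // packuswb saturation
}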
5465
5466 // Use scale to convert to lsb formats depending how many bits there are:
5467 // 512 = 9 bits
5468 // 1024 = 10 bits
5469 // 4096 = 12 bits
void Convert8To16Row_SSE2(const uint8_t* src_y,
5471 uint16_t* dst_y,
5472 int scale,
5473 int width) {
5474 // clang-format off
5475 asm volatile (
5476 "movd %3,%%xmm2 \n"
5477 "punpcklwd %%xmm2,%%xmm2 \n"
5478 "pshufd $0x0,%%xmm2,%%xmm2 \n"
5479
// 16 pixels per loop.
5481 LABELALIGN
5482 "1: \n"
5483 "movdqu (%0),%%xmm0 \n"
5484 "movdqa %%xmm0,%%xmm1 \n"
5485 "punpcklbw %%xmm0,%%xmm0 \n"
5486 "punpckhbw %%xmm1,%%xmm1 \n"
5487 "add $0x10,%0 \n"
5488 "pmulhuw %%xmm2,%%xmm0 \n"
5489 "pmulhuw %%xmm2,%%xmm1 \n"
5490 "movdqu %%xmm0,(%1) \n"
5491 "movdqu %%xmm1,0x10(%1) \n"
5492 "add $0x20,%1 \n"
5493 "sub $0x10,%2 \n"
5494 "jg 1b \n"
5495 : "+r"(src_y), // %0
5496 "+r"(dst_y), // %1
5497 "+r"(width) // %2
5498 : "r"(scale) // %3
5499 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5500 // clang-format on
5501 }
5502
5503 #ifdef HAS_CONVERT8TO16ROW_AVX2
void Convert8To16Row_AVX2(const uint8_t* src_y,
5505 uint16_t* dst_y,
5506 int scale,
5507 int width) {
5508 // clang-format off
5509 asm volatile (
5510 "vmovd %3,%%xmm2 \n"
5511 "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
5512 "vbroadcastss %%xmm2,%%ymm2 \n"
5513
5514 // 32 pixels per loop.
5515 LABELALIGN
5516 "1: \n"
5517 "vmovdqu (%0),%%ymm0 \n"
5518 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
5519 "add $0x20,%0 \n"
5520 "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
5521 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
5522 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
5523 "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
5524 "vmovdqu %%ymm0,(%1) \n"
5525 "vmovdqu %%ymm1,0x20(%1) \n"
5526 "add $0x40,%1 \n"
5527 "sub $0x20,%2 \n"
5528 "jg 1b \n"
5529 "vzeroupper \n"
5530 : "+r"(src_y), // %0
5531 "+r"(dst_y), // %1
5532 "+r"(width) // %2
5533 : "r"(scale) // %3
5534 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5535 // clang-format on
5536 }
5537 #endif // HAS_CONVERT8TO16ROW_AVX2
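
// Scalar model of the Convert8To16 kernels: unpacking a byte with itself
// replicates it into both halves of a word (v * 0x0101, full-range 8->16),
// and pmulhuw by scale narrows that to the requested depth. Sketch only:
static uint16_t Convert8To16_Sketch(uint8_t v, int scale) {
  uint32_t wide = (uint32_t)v * 0x0101u;              // punpcklbw v,v
  return (uint16_t)((wide * (uint32_t)scale) >> 16);  // pmulhuw
}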
5538
5539 #ifdef HAS_SPLITRGBROW_SSSE3
5540 // Shuffle table for converting RGB to Planar.
5541 static const uvec8 kSplitRGBShuffle[9] = {
5542 {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
5543 128u, 128u},
5544 {128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u,
5545 128u, 128u},
5546 {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u,
5547 7u, 10u, 13u},
5548 {1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
5549 128u, 128u},
5550 {128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u,
5551 128u, 128u},
5552 {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u,
5553 8u, 11u, 14u},
5554 {2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
5555 128u, 128u},
5556 {128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u,
5557 128u, 128u},
5558 {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u,
5559 12u, 15u}};
5560
void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
5562 uint8_t* dst_r,
5563 uint8_t* dst_g,
5564 uint8_t* dst_b,
5565 int width) {
5566 asm volatile(
5567
5568 LABELALIGN
5569 "1: \n"
5570 "movdqu (%0),%%xmm0 \n"
5571 "movdqu 0x10(%0),%%xmm1 \n"
5572 "movdqu 0x20(%0),%%xmm2 \n"
5573 "pshufb 0(%5), %%xmm0 \n"
5574 "pshufb 16(%5), %%xmm1 \n"
5575 "pshufb 32(%5), %%xmm2 \n"
5576 "por %%xmm1,%%xmm0 \n"
5577 "por %%xmm2,%%xmm0 \n"
5578 "movdqu %%xmm0,(%1) \n"
5579 "lea 0x10(%1),%1 \n"
5580
5581 "movdqu (%0),%%xmm0 \n"
5582 "movdqu 0x10(%0),%%xmm1 \n"
5583 "movdqu 0x20(%0),%%xmm2 \n"
5584 "pshufb 48(%5),%%xmm0 \n"
5585 "pshufb 64(%5),%%xmm1 \n"
5586 "pshufb 80(%5), %%xmm2 \n"
5587 "por %%xmm1,%%xmm0 \n"
5588 "por %%xmm2,%%xmm0 \n"
5589 "movdqu %%xmm0,(%2) \n"
5590 "lea 0x10(%2),%2 \n"
5591
5592 "movdqu (%0),%%xmm0 \n"
5593 "movdqu 0x10(%0),%%xmm1 \n"
5594 "movdqu 0x20(%0),%%xmm2 \n"
5595 "pshufb 96(%5), %%xmm0 \n"
5596 "pshufb 112(%5), %%xmm1 \n"
5597 "pshufb 128(%5), %%xmm2 \n"
5598 "por %%xmm1,%%xmm0 \n"
5599 "por %%xmm2,%%xmm0 \n"
5600 "movdqu %%xmm0,(%3) \n"
5601 "lea 0x10(%3),%3 \n"
5602 "lea 0x30(%0),%0 \n"
5603 "sub $0x10,%4 \n"
5604 "jg 1b \n"
5605 : "+r"(src_rgb), // %0
5606 "+r"(dst_r), // %1
5607 "+r"(dst_g), // %2
5608 "+r"(dst_b), // %3
5609 "+r"(width) // %4
5610 : "r"(&kSplitRGBShuffle[0]) // %5
5611 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5612 }
5613 #endif // HAS_SPLITRGBROW_SSSE3
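
// Scalar model of SplitRGBRow: the nine pshufb masks above gather every
// third byte of the 48-byte triplet block into the R, G and B planes.
// Per-pixel equivalent (illustrative only):
static void SplitRGBRow_Sketch_C(const uint8_t* src_rgb,
                                 uint8_t* dst_r,
                                 uint8_t* dst_g,
                                 uint8_t* dst_b,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_r[x] = src_rgb[3 * x + 0];
    dst_g[x] = src_rgb[3 * x + 1];
    dst_b[x] = src_rgb[3 * x + 2];
  }
}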
5614
5615 #ifdef HAS_MERGERGBROW_SSSE3
5616 // Shuffle table for converting Planar to RGB.
5617 static const uvec8 kMergeRGBShuffle[9] = {
5618 {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u,
5619 128u, 5u},
5620 {128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u,
5621 128u, 128u},
5622 {128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u,
5623 4u, 128u},
5624 {128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u,
5625 10u, 128u},
5626 {5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u,
5627 128u, 10u},
5628 {128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u,
5629 128u, 128u},
5630 {128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u,
5631 15u, 128u, 128u},
5632 {128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u,
5633 128u, 15u, 128u},
5634 {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u,
5635 128u, 128u, 15u}};
5636
void MergeRGBRow_SSSE3(const uint8_t* src_r,
5638 const uint8_t* src_g,
5639 const uint8_t* src_b,
5640 uint8_t* dst_rgb,
5641 int width) {
5642 asm volatile(
5643
5644 LABELALIGN
5645 "1: \n"
5646 "movdqu (%0),%%xmm0 \n"
5647 "movdqu (%1),%%xmm1 \n"
5648 "movdqu (%2),%%xmm2 \n"
5649 "pshufb (%5), %%xmm0 \n"
5650 "pshufb 16(%5), %%xmm1 \n"
5651 "pshufb 32(%5), %%xmm2 \n"
5652 "por %%xmm1,%%xmm0 \n"
5653 "por %%xmm2,%%xmm0 \n"
5654 "movdqu %%xmm0,(%3) \n"
5655
5656 "movdqu (%0),%%xmm0 \n"
5657 "movdqu (%1),%%xmm1 \n"
5658 "movdqu (%2),%%xmm2 \n"
5659 "pshufb 48(%5), %%xmm0 \n"
5660 "pshufb 64(%5), %%xmm1 \n"
5661 "pshufb 80(%5), %%xmm2 \n"
5662 "por %%xmm1,%%xmm0 \n"
5663 "por %%xmm2,%%xmm0 \n"
5664 "movdqu %%xmm0,16(%3) \n"
5665
5666 "movdqu (%0),%%xmm0 \n"
5667 "movdqu (%1),%%xmm1 \n"
5668 "movdqu (%2),%%xmm2 \n"
5669 "pshufb 96(%5), %%xmm0 \n"
5670 "pshufb 112(%5), %%xmm1 \n"
5671 "pshufb 128(%5), %%xmm2 \n"
5672 "por %%xmm1,%%xmm0 \n"
5673 "por %%xmm2,%%xmm0 \n"
5674 "movdqu %%xmm0,32(%3) \n"
5675
5676 "lea 0x10(%0),%0 \n"
5677 "lea 0x10(%1),%1 \n"
5678 "lea 0x10(%2),%2 \n"
5679 "lea 0x30(%3),%3 \n"
5680 "sub $0x10,%4 \n"
5681 "jg 1b \n"
5682 : "+r"(src_r), // %0
5683 "+r"(src_g), // %1
5684 "+r"(src_b), // %2
5685 "+r"(dst_rgb), // %3
5686 "+r"(width) // %4
5687 : "r"(&kMergeRGBShuffle[0]) // %5
5688 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5689 }
5690 #endif // HAS_MERGERGBROW_SSSE3
5691
5692 #ifdef HAS_MERGEARGBROW_SSE2
void MergeARGBRow_SSE2(const uint8_t* src_r,
5694 const uint8_t* src_g,
5695 const uint8_t* src_b,
5696 const uint8_t* src_a,
5697 uint8_t* dst_argb,
5698 int width) {
5699 asm volatile(
5700
5701 "sub %0,%1 \n"
5702 "sub %0,%2 \n"
5703 "sub %0,%3 \n"
5704
5705 LABELALIGN
5706 "1: \n"
5707
5708 "movq (%0,%2),%%xmm0 \n" // B
5709 "movq (%0),%%xmm1 \n" // R
5710 "movq (%0,%1),%%xmm2 \n" // G
5711 "punpcklbw %%xmm1,%%xmm0 \n" // BR
5712 "movq (%0,%3),%%xmm1 \n" // A
5713 "punpcklbw %%xmm1,%%xmm2 \n" // GA
5714 "movdqa %%xmm0,%%xmm1 \n" // BR
5715 "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi)
5716 "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo)
5717 "movdqu %%xmm0,(%4) \n"
5718 "movdqu %%xmm1,16(%4) \n"
5719
5720 "lea 8(%0),%0 \n"
5721 "lea 32(%4),%4 \n"
5722 "sub $0x8,%5 \n"
5723 "jg 1b \n"
5724 : "+r"(src_r), // %0
5725 "+r"(src_g), // %1
5726 "+r"(src_b), // %2
5727 "+r"(src_a), // %3
5728 "+r"(dst_argb), // %4
5729 "+r"(width) // %5
5730 :
5731 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5732 }
#endif // HAS_MERGEARGBROW_SSE2
5734
5735 #ifdef HAS_MERGEXRGBROW_SSE2
void MergeXRGBRow_SSE2(const uint8_t* src_r,
5737 const uint8_t* src_g,
5738 const uint8_t* src_b,
5739 uint8_t* dst_argb,
5740 int width) {
5741 asm volatile(
5742
5743 LABELALIGN
5744 "1: \n"
5745
5746 "movq (%2),%%xmm0 \n" // B
5747 "movq (%0),%%xmm1 \n" // R
5748 "movq (%1),%%xmm2 \n" // G
5749 "punpcklbw %%xmm1,%%xmm0 \n" // BR
5750 "pcmpeqd %%xmm1,%%xmm1 \n" // A(255)
5751 "punpcklbw %%xmm1,%%xmm2 \n" // GA
5752 "movdqa %%xmm0,%%xmm1 \n" // BR
5753 "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi)
5754 "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo)
5755 "movdqu %%xmm0,(%3) \n"
5756 "movdqu %%xmm1,16(%3) \n"
5757
5758 "lea 8(%0),%0 \n"
5759 "lea 8(%1),%1 \n"
5760 "lea 8(%2),%2 \n"
5761 "lea 32(%3),%3 \n"
5762 "sub $0x8,%4 \n"
5763 "jg 1b \n"
5764 : "+r"(src_r), // %0
5765 "+r"(src_g), // %1
5766 "+r"(src_b), // %2
5767 "+r"(dst_argb), // %3
5768 "+r"(width) // %4
5769 :
5770 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5771 }
#endif // HAS_MERGEXRGBROW_SSE2
5773
5774 #ifdef HAS_MERGEARGBROW_AVX2
void MergeARGBRow_AVX2(const uint8_t* src_r,
5776 const uint8_t* src_g,
5777 const uint8_t* src_b,
5778 const uint8_t* src_a,
5779 uint8_t* dst_argb,
5780 int width) {
5781 asm volatile(
5782
5783 "sub %0,%1 \n"
5784 "sub %0,%2 \n"
5785 "sub %0,%3 \n"
5786
5787 LABELALIGN
5788 "1: \n"
5789
5790 "vmovdqu (%0,%2),%%xmm0 \n" // B
5791 "vmovdqu (%0,%1),%%xmm1 \n" // R
5792 "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G
5793 "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // A
5794 "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
5795 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
5796 "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
5797 "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
5798 "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
5799 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
5800 "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
5801 "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
5802 "vmovdqu %%ymm0,(%4) \n" // First 8
5803 "vmovdqu %%ymm1,32(%4) \n" // Next 8
5804
5805 "lea 16(%0),%0 \n"
5806 "lea 64(%4),%4 \n"
5807 "sub $0x10,%5 \n"
5808 "jg 1b \n"
5809 "vzeroupper \n"
5810 : "+r"(src_r), // %0
5811 "+r"(src_g), // %1
5812 "+r"(src_b), // %2
5813 "+r"(src_a), // %3
5814 "+r"(dst_argb), // %4
5815 "+r"(width) // %5
5816 :
5817 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5818 }
#endif // HAS_MERGEARGBROW_AVX2
5820
5821 #ifdef HAS_MERGEXRGBROW_AVX2
void MergeXRGBRow_AVX2(const uint8_t* src_r,
5823 const uint8_t* src_g,
5824 const uint8_t* src_b,
5825 uint8_t* dst_argb,
5826 int width) {
5827 asm volatile(
5828
5829 LABELALIGN
5830 "1: \n"
5831
5832 "vmovdqu (%2),%%xmm0 \n" // B
5833 "vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n" // A(255)
5834 "vinserti128 $0,(%1),%%ymm1,%%ymm1 \n" // R
5835 "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G
5836 "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
5837 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
5838 "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
5839 "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
5840 "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
5841 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
5842 "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
5843 "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
5844 "vmovdqu %%ymm0,(%3) \n" // First 8
5845 "vmovdqu %%ymm1,32(%3) \n" // Next 8
5846
5847 "lea 16(%0),%0 \n"
5848 "lea 16(%1),%1 \n"
5849 "lea 16(%2),%2 \n"
5850 "lea 64(%3),%3 \n"
5851 "sub $0x10,%4 \n"
5852 "jg 1b \n"
5853 "vzeroupper \n"
5854 : "+r"(src_r), // %0
5855 "+r"(src_g), // %1
5856 "+r"(src_b), // %2
5857 "+r"(dst_argb), // %3
5858 "+rm"(width) // %4
5859 :
5860 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5861 }
#endif // HAS_MERGEXRGBROW_AVX2
5863
5864 #ifdef HAS_SPLITARGBROW_SSE2
void SplitARGBRow_SSE2(const uint8_t* src_argb,
5866 uint8_t* dst_r,
5867 uint8_t* dst_g,
5868 uint8_t* dst_b,
5869 uint8_t* dst_a,
5870 int width) {
5871 asm volatile(
5872
5873 "sub %1,%2 \n"
5874 "sub %1,%3 \n"
5875 "sub %1,%4 \n"
5876
5877 LABELALIGN
5878 "1: \n"
5879
5880 "movdqu (%0),%%xmm0 \n" // 00-0F
5881 "movdqu 16(%0),%%xmm1 \n" // 10-1F
5882 "movdqa %%xmm0,%%xmm2 \n"
5883 "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17
5884 "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F
5885 "movdqa %%xmm0,%%xmm1 \n"
5886 "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo)
5887 "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi)
5888 "movdqa %%xmm0,%%xmm2 \n"
5889 "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B
5890 "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F
5891 "movdqa %%xmm0,%%xmm1 \n"
5892 "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo)
5893 "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi)
5894 "movdqa %%xmm0,%%xmm2 \n"
5895 "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
5896 "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
5897 "movlps %%xmm0,(%1,%3) \n" // B
5898 "movhps %%xmm0,(%1,%2) \n" // G
5899 "movlps %%xmm2,(%1) \n" // R
5900 "movhps %%xmm2,(%1,%4) \n" // A
5901
5902 "lea 32(%0),%0 \n"
5903 "lea 8(%1),%1 \n"
5904 "sub $0x8,%5 \n"
5905 "jg 1b \n"
5906 : "+r"(src_argb), // %0
5907 "+r"(dst_r), // %1
5908 "+r"(dst_g), // %2
5909 "+r"(dst_b), // %3
5910 "+r"(dst_a), // %4
5911 "+rm"(width) // %5
5912 :
5913 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5914 }
#endif // HAS_SPLITARGBROW_SSE2
5916
5917 #ifdef HAS_SPLITXRGBROW_SSE2
void SplitXRGBRow_SSE2(const uint8_t* src_argb,
5919 uint8_t* dst_r,
5920 uint8_t* dst_g,
5921 uint8_t* dst_b,
5922 int width) {
5923 asm volatile(
5924
5925 LABELALIGN
5926 "1: \n"
5927
5928 "movdqu (%0),%%xmm0 \n" // 00-0F
5929 "movdqu 16(%0),%%xmm1 \n" // 10-1F
5930 "movdqa %%xmm0,%%xmm2 \n"
5931 "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17
5932 "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F
5933 "movdqa %%xmm0,%%xmm1 \n"
5934 "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo)
5935 "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi)
5936 "movdqa %%xmm0,%%xmm2 \n"
5937 "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B
5938 "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F
5939 "movdqa %%xmm0,%%xmm1 \n"
5940 "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo)
5941 "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi)
5942 "movdqa %%xmm0,%%xmm2 \n"
5943 "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
5944 "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
5945 "movlps %%xmm0,(%3) \n" // B
5946 "movhps %%xmm0,(%2) \n" // G
5947 "movlps %%xmm2,(%1) \n" // R
5948
5949 "lea 32(%0),%0 \n"
5950 "lea 8(%1),%1 \n"
5951 "lea 8(%2),%2 \n"
5952 "lea 8(%3),%3 \n"
5953 "sub $0x8,%4 \n"
5954 "jg 1b \n"
5955 : "+r"(src_argb), // %0
5956 "+r"(dst_r), // %1
5957 "+r"(dst_g), // %2
5958 "+r"(dst_b), // %3
5959 "+rm"(width) // %4
5960 :
5961 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5962 }
#endif // HAS_SPLITXRGBROW_SSE2
5964
5965 static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13,
5966 2, 6, 10, 14, 3, 7, 11, 15};
5967 #ifdef HAS_SPLITARGBROW_SSSE3
void SplitARGBRow_SSSE3(const uint8_t* src_argb,
5969 uint8_t* dst_r,
5970 uint8_t* dst_g,
5971 uint8_t* dst_b,
5972 uint8_t* dst_a,
5973 int width) {
5974 asm volatile(
5975
5976 "movdqa %6,%%xmm3 \n"
5977 "sub %1,%2 \n"
5978 "sub %1,%3 \n"
5979 "sub %1,%4 \n"
5980
5981 LABELALIGN
5982 "1: \n"
5983
5984 "movdqu (%0),%%xmm0 \n" // 00-0F
5985 "movdqu 16(%0),%%xmm1 \n" // 10-1F
5986 "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo)
5987 "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi)
5988 "movdqa %%xmm0,%%xmm2 \n"
5989 "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
5990 "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
5991 "movlps %%xmm0,(%1,%3) \n" // B
5992 "movhps %%xmm0,(%1,%2) \n" // G
5993 "movlps %%xmm2,(%1) \n" // R
5994 "movhps %%xmm2,(%1,%4) \n" // A
5995
5996 "lea 32(%0),%0 \n"
5997 "lea 8(%1),%1 \n"
5998 "subl $0x8,%5 \n"
5999 "jg 1b \n"
6000 : "+r"(src_argb), // %0
6001 "+r"(dst_r), // %1
6002 "+r"(dst_g), // %2
6003 "+r"(dst_b), // %3
6004 "+r"(dst_a), // %4
6005 #if defined(__i386__)
6006 "+m"(width) // %5
6007 #else
6008 "+rm"(width) // %5
6009 #endif
6010 : "m"(kShuffleMaskARGBSplit) // %6
6011 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
6012 }
#endif // HAS_SPLITARGBROW_SSSE3
6014
6015 #ifdef HAS_SPLITXRGBROW_SSSE3
void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
6017 uint8_t* dst_r,
6018 uint8_t* dst_g,
6019 uint8_t* dst_b,
6020 int width) {
6021 asm volatile(
6022
6023 "movdqa %5,%%xmm3 \n"
6024
6025 LABELALIGN
6026 "1: \n"
6027
6028 "movdqu (%0),%%xmm0 \n" // 00-0F
6029 "movdqu 16(%0),%%xmm1 \n" // 10-1F
6030 "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo)
6031 "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi)
6032 "movdqa %%xmm0,%%xmm2 \n"
6033 "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
6034 "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
6035 "movlps %%xmm0,(%3) \n" // B
6036 "movhps %%xmm0,(%2) \n" // G
6037 "movlps %%xmm2,(%1) \n" // R
6038
6039 "lea 32(%0),%0 \n"
6040 "lea 8(%1),%1 \n"
6041 "lea 8(%2),%2 \n"
6042 "lea 8(%3),%3 \n"
6043 "sub $0x8,%4 \n"
6044 "jg 1b \n"
6045 : "+r"(src_argb), // %0
6046 "+r"(dst_r), // %1
6047 "+r"(dst_g), // %2
6048 "+r"(dst_b), // %3
6049 "+r"(width) // %4
6050 : "m"(kShuffleMaskARGBSplit) // %5
6051 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
6052 }
#endif // HAS_SPLITXRGBROW_SSSE3
6054
6055 #ifdef HAS_SPLITARGBROW_AVX2
6056 static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7};
void SplitARGBRow_AVX2(const uint8_t* src_argb,
6058 uint8_t* dst_r,
6059 uint8_t* dst_g,
6060 uint8_t* dst_b,
6061 uint8_t* dst_a,
6062 int width) {
6063 asm volatile(
6064
6065 "sub %1,%2 \n"
6066 "sub %1,%3 \n"
6067 "sub %1,%4 \n"
6068 "vmovdqa %7,%%ymm3 \n"
6069 "vbroadcastf128 %6,%%ymm4 \n"
6070
6071 LABELALIGN
6072 "1: \n"
6073
6074 "vmovdqu (%0),%%xmm0 \n" // 00-0F
6075 "vmovdqu 16(%0),%%xmm1 \n" // 10-1F
6076 "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
6077 "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
6078 "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
6079 "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
6080 "vpermd %%ymm0,%%ymm3,%%ymm0 \n"
6081 "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
6082 "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
6083 "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR
6084 "vmovdqu %%xmm0,(%1,%3) \n" // B
6085 "vextracti128 $1,%%ymm0,(%1) \n" // R
6086 "vmovdqu %%xmm2,(%1,%2) \n" // G
6087 "vextracti128 $1,%%ymm2,(%1,%4) \n" // A
6088 "lea 64(%0),%0 \n"
6089 "lea 16(%1),%1 \n"
6090 "subl $0x10,%5 \n"
6091 "jg 1b \n"
6092 "vzeroupper \n"
6093 : "+r"(src_argb), // %0
6094 "+r"(dst_r), // %1
6095 "+r"(dst_g), // %2
6096 "+r"(dst_b), // %3
6097 "+r"(dst_a), // %4
6098 #if defined(__i386__)
6099 "+m"(width) // %5
6100 #else
6101 "+rm"(width) // %5
6102 #endif
6103 : "m"(kShuffleMaskARGBSplit), // %6
6104 "m"(kShuffleMaskARGBPermute) // %7
6105 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
6106 }
#endif // HAS_SPLITARGBROW_AVX2
6108
6109 #ifdef HAS_SPLITXRGBROW_AVX2
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
6111 uint8_t* dst_r,
6112 uint8_t* dst_g,
6113 uint8_t* dst_b,
6114 int width) {
6115 asm volatile(
6116
6117 "vmovdqa %6,%%ymm3 \n"
6118 "vbroadcastf128 %5,%%ymm4 \n"
6119
6120 LABELALIGN
6121 "1: \n"
6122
6123 "vmovdqu (%0),%%xmm0 \n" // 00-0F
6124 "vmovdqu 16(%0),%%xmm1 \n" // 10-1F
6125 "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
6126 "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
6127 "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
6128 "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
6129 "vpermd %%ymm0,%%ymm3,%%ymm0 \n"
6130 "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
6131 "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
6132 "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR
6133 "vmovdqu %%xmm0,(%3) \n" // B
6134 "vextracti128 $1,%%ymm0,(%1) \n" // R
6135 "vmovdqu %%xmm2,(%2) \n" // G
6136
6137 "lea 64(%0),%0 \n"
6138 "lea 16(%1),%1 \n"
6139 "lea 16(%2),%2 \n"
6140 "lea 16(%3),%3 \n"
6141 "sub $0x10,%4 \n"
6142 "jg 1b \n"
6143 "vzeroupper \n"
6144 : "+r"(src_argb), // %0
6145 "+r"(dst_r), // %1
6146 "+r"(dst_g), // %2
6147 "+r"(dst_b), // %3
6148 "+r"(width) // %4
6149 : "m"(kShuffleMaskARGBSplit), // %5
6150 "m"(kShuffleMaskARGBPermute) // %6
6151 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
6152 }
#endif // HAS_SPLITXRGBROW_AVX2
6154
6155 #ifdef HAS_MERGEXR30ROW_AVX2
void MergeXR30Row_AVX2(const uint16_t* src_r,
6157 const uint16_t* src_g,
6158 const uint16_t* src_b,
6159 uint8_t* dst_ar30,
6160 int depth,
6161 int width) {
6162 int shift = depth - 10;
6163 asm volatile(
6164
6165 "sub %0,%1 \n"
6166 "sub %0,%2 \n"
6167 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
6168 "vpsrlw $14,%%ymm5,%%ymm5 \n"
6169 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
6170 "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"
6171 "vpsrlw $6,%%ymm6,%%ymm6 \n"
6172 "vmovd %5,%%xmm4 \n"
6173
6174 LABELALIGN
6175 "1: \n"
6176 "vmovdqu (%0),%%ymm0 \n"
6177 "vmovdqu (%0,%1),%%ymm1 \n"
6178 "vmovdqu (%0,%2),%%ymm2 \n"
6179 "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n"
6180 "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n"
6181 "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n"
6182 "vpminuw %%ymm0,%%ymm6,%%ymm0 \n"
6183 "vpminuw %%ymm1,%%ymm6,%%ymm1 \n"
6184 "vpminuw %%ymm2,%%ymm6,%%ymm2 \n"
6185 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
6186 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
6187 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
6188 "vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit
6189 "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB
6190 "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n"
6191 "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG
6192 "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n"
6193 "vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit
6194 "vpslld $0xa,%%ymm2,%%ymm2 \n"
6195 "vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine
6196 "vpor %%ymm2,%%ymm3,%%ymm3 \n"
6197 "vmovdqu %%ymm0,(%3) \n"
6198 "vmovdqu %%ymm3,0x20(%3) \n"
6199 "lea 0x20(%0),%0 \n"
6200 "lea 0x40(%3),%3 \n"
6201 "sub $0x10,%4 \n"
6202 "jg 1b \n"
6203 "vzeroupper \n"
6204 : "+r"(src_r), // %0
6205 "+r"(src_g), // %1
6206 "+r"(src_b), // %2
6207 "+r"(dst_ar30), // %3
6208 "+r"(width) // %4
6209 #if defined(__i386__)
6210 : "m"(shift) // %5
6211 #else
6212 : "rm"(shift) // %5
6213 #endif
6214 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
6215 }
#endif // HAS_MERGEXR30ROW_AVX2
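
// Packing model for the AR30 merge above: after each channel is normalized
// to 10 bits (shifted right by depth - 10, clamped to 1023), the dword the
// interleave/shift/or sequence assembles is
//   AR30 = (3u << 30) | (R << 20) | (G << 10) | B
// i.e. 2 bits of opaque alpha over three 10-bit channels.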
6217
6218 #ifdef HAS_MERGEAR64ROW_AVX2
6219 static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7};
void MergeAR64Row_AVX2(const uint16_t* src_r,
6221 const uint16_t* src_g,
6222 const uint16_t* src_b,
6223 const uint16_t* src_a,
6224 uint16_t* dst_ar64,
6225 int depth,
6226 int width) {
6227 int shift = 16 - depth;
6228 int mask = (1 << depth) - 1;
6229 mask = (mask << 16) + mask;
6230 asm volatile(
6231
6232 "sub %0,%1 \n"
6233 "sub %0,%2 \n"
6234 "sub %0,%3 \n"
6235 "vmovdqa %8,%%ymm5 \n"
6236 "vmovd %6,%%xmm6 \n"
6237 "vbroadcastss %7,%%ymm7 \n"
6238
6239 LABELALIGN
6240 "1: \n"
6241 "vmovdqu (%0),%%ymm0 \n" // R
6242 "vmovdqu (%0,%1),%%ymm1 \n" // G
6243 "vmovdqu (%0,%2),%%ymm2 \n" // B
6244 "vmovdqu (%0,%3),%%ymm3 \n" // A
6245 "vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
6246 "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
6247 "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
6248 "vpminuw %%ymm3,%%ymm7,%%ymm3 \n"
6249 "vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
6250 "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
6251 "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
6252 "vpsllw %%xmm6,%%ymm3,%%ymm3 \n"
6253 "vpermd %%ymm0,%%ymm5,%%ymm0 \n"
6254 "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
6255 "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
6256 "vpermd %%ymm3,%%ymm5,%%ymm3 \n"
6257 "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low)
6258 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi)
6259 "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low)
6260 "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi)
6261 "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1)
6262 "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3)
6263 "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2)
6264 "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4)
6265 "vmovdqu %%ymm3,(%4) \n"
6266 "vmovdqu %%ymm2,0x20(%4) \n"
6267 "vmovdqu %%ymm4,0x40(%4) \n"
6268 "vmovdqu %%ymm1,0x60(%4) \n"
6269 "lea 0x20(%0),%0 \n"
6270 "lea 0x80(%4),%4 \n"
6271 "subl $0x10,%5 \n"
6272 "jg 1b \n"
6273 "vzeroupper \n"
6274 : "+r"(src_r), // %0
6275 "+r"(src_g), // %1
6276 "+r"(src_b), // %2
6277 "+r"(src_a), // %3
6278 "+r"(dst_ar64), // %4
6279 #if defined(__i386__)
6280 "+m"(width) // %5
6281 #else
6282 "+rm"(width) // %5
6283 #endif
6284 : "m"(shift), // %6
6285 "m"(mask), // %7
6286 "m"(MergeAR64Permute) // %8
6287 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
6288 "xmm7");
6289 }
#endif // HAS_MERGEAR64ROW_AVX2
6291
6292 #ifdef HAS_MERGEXR64ROW_AVX2
void MergeXR64Row_AVX2(const uint16_t* src_r,
6294 const uint16_t* src_g,
6295 const uint16_t* src_b,
6296 uint16_t* dst_ar64,
6297 int depth,
6298 int width) {
6299 int shift = 16 - depth;
6300 int mask = (1 << depth) - 1;
6301 mask = (mask << 16) + mask;
6302 asm volatile(
6303
6304 "sub %0,%1 \n"
6305 "sub %0,%2 \n"
6306 "vmovdqa %7,%%ymm5 \n"
6307 "vmovd %5,%%xmm6 \n"
6308 "vbroadcastss %6,%%ymm7 \n"
6309
6310 LABELALIGN
6311 "1: \n"
6312 "vmovdqu (%0),%%ymm0 \n" // R
6313 "vmovdqu (%0,%1),%%ymm1 \n" // G
6314 "vmovdqu (%0,%2),%%ymm2 \n" // B
6315 "vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
6316 "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
6317 "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
6318 "vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
6319 "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
6320 "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
6321 "vpermd %%ymm0,%%ymm5,%%ymm0 \n"
6322 "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
6323 "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
6324 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" // A (0xffff)
6325 "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low)
6326 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi)
6327 "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low)
6328 "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi)
6329 "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1)
6330 "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3)
6331 "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2)
6332 "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4)
6333 "vmovdqu %%ymm3,(%3) \n"
6334 "vmovdqu %%ymm2,0x20(%3) \n"
6335 "vmovdqu %%ymm4,0x40(%3) \n"
6336 "vmovdqu %%ymm1,0x60(%3) \n"
6337 "lea 0x20(%0),%0 \n"
6338 "lea 0x80(%3),%3 \n"
6339 "subl $0x10,%4 \n"
6340 "jg 1b \n"
6341 "vzeroupper \n"
6342 : "+r"(src_r), // %0
6343 "+r"(src_g), // %1
6344 "+r"(src_b), // %2
6345 "+r"(dst_ar64), // %3
6346 "+r"(width) // %4
6347 : "m"(shift), // %5
6348 "m"(mask), // %6
6349 "m"(MergeAR64Permute) // %7
6350 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
6351 "xmm7");
6352 }
#endif // HAS_MERGEXR64ROW_AVX2
6354
6355 #ifdef HAS_MERGEARGB16TO8ROW_AVX2
6356 static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11,
6357 4, 12, 5, 13, 6, 14, 7, 15};
void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
6359 const uint16_t* src_g,
6360 const uint16_t* src_b,
6361 const uint16_t* src_a,
6362 uint8_t* dst_argb,
6363 int depth,
6364 int width) {
6365 int shift = depth - 8;
6366 asm volatile(
6367
6368 "sub %0,%1 \n"
6369 "sub %0,%2 \n"
6370 "sub %0,%3 \n"
6371 "vbroadcastf128 %7,%%ymm5 \n"
6372 "vmovd %6,%%xmm6 \n"
6373
6374 LABELALIGN
6375 "1: \n"
6376 "vmovdqu (%0),%%ymm0 \n" // R
6377 "vmovdqu (%0,%1),%%ymm1 \n" // G
6378 "vmovdqu (%0,%2),%%ymm2 \n" // B
6379 "vmovdqu (%0,%3),%%ymm3 \n" // A
6380 "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"
6381 "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
6382 "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
6383 "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n"
6384 "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar)
6385 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar)
6386 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave)
6387 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave)
6388 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
6389 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
6390 "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low)
6391 "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi)
6392 "vmovdqu %%ymm2,(%4) \n"
6393 "vmovdqu %%ymm0,0x20(%4) \n"
6394 "lea 0x20(%0),%0 \n"
6395 "lea 0x40(%4),%4 \n"
6396 "subl $0x10,%5 \n"
6397 "jg 1b \n"
6398 "vzeroupper \n"
6399 : "+r"(src_r), // %0
6400 "+r"(src_g), // %1
6401 "+r"(src_b), // %2
6402 "+r"(src_a), // %3
6403 "+r"(dst_argb), // %4
6404 #if defined(__i386__)
6405 "+m"(width) // %5
6406 #else
6407 "+rm"(width) // %5
6408 #endif
6409 : "m"(shift), // %6
6410 "m"(MergeARGB16To8Shuffle) // %7
6411 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
6412 }
#endif // HAS_MERGEARGB16TO8ROW_AVX2
6414
6415 #ifdef HAS_MERGEXRGB16TO8ROW_AVX2
void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
6417 const uint16_t* src_g,
6418 const uint16_t* src_b,
6419 uint8_t* dst_argb,
6420 int depth,
6421 int width) {
6422 int shift = depth - 8;
6423 asm volatile(
6424
6425 "sub %0,%1 \n"
6426 "sub %0,%2 \n"
6427 "vbroadcastf128 %6,%%ymm5 \n"
6428 "vmovd %5,%%xmm6 \n"
6429 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
6430 "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff)
6431
6432 LABELALIGN
6433 "1: \n"
6434 "vmovdqu (%0),%%ymm0 \n" // R
6435 "vmovdqu (%0,%1),%%ymm1 \n" // G
6436 "vmovdqu (%0,%2),%%ymm2 \n" // B
6437 "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"
6438 "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
6439 "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
6440 "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar)
6441 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar)
6442 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave)
6443 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave)
6444 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
6445 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
6446 "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low)
6447 "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi)
6448 "vmovdqu %%ymm2,(%3) \n"
6449 "vmovdqu %%ymm0,0x20(%3) \n"
6450 "lea 0x20(%0),%0 \n"
6451 "lea 0x40(%3),%3 \n"
6452 "subl $0x10,%4 \n"
6453 "jg 1b \n"
6454 "vzeroupper \n"
6455 : "+r"(src_r), // %0
6456 "+r"(src_g), // %1
6457 "+r"(src_b), // %2
6458 "+r"(dst_argb), // %3
6459 "+r"(width) // %4
6460 : "m"(shift), // %5
6461 "m"(MergeARGB16To8Shuffle) // %6
6462 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
6463 }
#endif // HAS_MERGEXRGB16TO8ROW_AVX2
6465
6466 #ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
6468 asm volatile(
6469 "test $0xf,%0 \n"
6470 "jne 2f \n"
6471 "test $0xf,%1 \n"
6472 "jne 2f \n"
6473
6474 LABELALIGN
6475 "1: \n"
6476 "movdqa (%0),%%xmm0 \n"
6477 "movdqa 0x10(%0),%%xmm1 \n"
6478 "lea 0x20(%0),%0 \n"
6479 "movdqa %%xmm0,(%1) \n"
6480 "movdqa %%xmm1,0x10(%1) \n"
6481 "lea 0x20(%1),%1 \n"
6482 "sub $0x20,%2 \n"
6483 "jg 1b \n"
6484 "jmp 9f \n"
6485
6486 LABELALIGN
6487 "2: \n"
6488 "movdqu (%0),%%xmm0 \n"
6489 "movdqu 0x10(%0),%%xmm1 \n"
6490 "lea 0x20(%0),%0 \n"
6491 "movdqu %%xmm0,(%1) \n"
6492 "movdqu %%xmm1,0x10(%1) \n"
6493 "lea 0x20(%1),%1 \n"
6494 "sub $0x20,%2 \n"
6495 "jg 2b \n"
6496
6497 LABELALIGN "9: \n"
6498 : "+r"(src), // %0
6499 "+r"(dst), // %1
6500 "+r"(width) // %2
6501 :
6502 : "memory", "cc", "xmm0", "xmm1");
6503 }
6504 #endif // HAS_COPYROW_SSE2
6505
6506 #ifdef HAS_COPYROW_AVX
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
6508 asm volatile(
6509
6510 LABELALIGN
6511 "1: \n"
6512 "vmovdqu (%0),%%ymm0 \n"
6513 "vmovdqu 0x20(%0),%%ymm1 \n"
6514 "lea 0x40(%0),%0 \n"
6515 "vmovdqu %%ymm0,(%1) \n"
6516 "vmovdqu %%ymm1,0x20(%1) \n"
6517 "lea 0x40(%1),%1 \n"
6518 "sub $0x40,%2 \n"
6519 "jg 1b \n"
6520 "vzeroupper \n"
6521 : "+r"(src), // %0
6522 "+r"(dst), // %1
6523 "+r"(width) // %2
6524 :
6525 : "memory", "cc", "xmm0", "xmm1");
6526 }
6527 #endif // HAS_COPYROW_AVX
6528
6529 #ifdef HAS_COPYROW_ERMS
6530 // Multiple of 1.
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
6532 size_t width_tmp = (size_t)(width);
6533 asm volatile(
6534
6535 "rep movsb \n"
6536 : "+S"(src), // %0
6537 "+D"(dst), // %1
6538 "+c"(width_tmp) // %2
6539 :
6540 : "memory", "cc");
6541 }
6542 #endif // HAS_COPYROW_ERMS
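
// CopyRow_ERMS is a byte-exact memcpy: "rep movsb" with RSI/RDI/RCX bound to
// src/dst/width copies width bytes, so it has no width-multiple requirement
// (hence "multiple of 1" above). On CPUs with Enhanced REP MOVSB this is
// typically competitive with the vector loops for long rows; equivalent
// scalar form:
//   memcpy(dst, src, (size_t)width);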
6543
6544 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
6545 // width in pixels
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
6547 asm volatile(
6548 "pcmpeqb %%xmm0,%%xmm0 \n"
6549 "pslld $0x18,%%xmm0 \n"
6550 "pcmpeqb %%xmm1,%%xmm1 \n"
6551 "psrld $0x8,%%xmm1 \n"
6552
6553 LABELALIGN
6554 "1: \n"
6555 "movdqu (%0),%%xmm2 \n"
6556 "movdqu 0x10(%0),%%xmm3 \n"
6557 "lea 0x20(%0),%0 \n"
6558 "movdqu (%1),%%xmm4 \n"
6559 "movdqu 0x10(%1),%%xmm5 \n"
6560 "pand %%xmm0,%%xmm2 \n"
6561 "pand %%xmm0,%%xmm3 \n"
6562 "pand %%xmm1,%%xmm4 \n"
6563 "pand %%xmm1,%%xmm5 \n"
6564 "por %%xmm4,%%xmm2 \n"
6565 "por %%xmm5,%%xmm3 \n"
6566 "movdqu %%xmm2,(%1) \n"
6567 "movdqu %%xmm3,0x10(%1) \n"
6568 "lea 0x20(%1),%1 \n"
6569 "sub $0x8,%2 \n"
6570 "jg 1b \n"
6571 : "+r"(src), // %0
6572 "+r"(dst), // %1
6573 "+r"(width) // %2
6574 :
6575 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
6576 }
6577 #endif // HAS_ARGBCOPYALPHAROW_SSE2
6578
6579 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
6580 // width in pixels
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
6582 asm volatile(
6583 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
6584 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
6585
6586 LABELALIGN
6587 "1: \n"
6588 "vmovdqu (%0),%%ymm1 \n"
6589 "vmovdqu 0x20(%0),%%ymm2 \n"
6590 "lea 0x40(%0),%0 \n"
6591 "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
6592 "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
6593 "vmovdqu %%ymm1,(%1) \n"
6594 "vmovdqu %%ymm2,0x20(%1) \n"
6595 "lea 0x40(%1),%1 \n"
6596 "sub $0x10,%2 \n"
6597 "jg 1b \n"
6598 "vzeroupper \n"
6599 : "+r"(src), // %0
6600 "+r"(dst), // %1
6601 "+r"(width) // %2
6602 :
6603 : "memory", "cc", "xmm0", "xmm1", "xmm2");
6604 }
6605 #endif // HAS_ARGBCOPYALPHAROW_AVX2
6606
6607 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
6608 // width in pixels
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
6610 uint8_t* dst_a,
6611 int width) {
6612 asm volatile(
6613
6614 LABELALIGN
6615 "1: \n"
6616 "movdqu (%0), %%xmm0 \n"
6617 "movdqu 0x10(%0), %%xmm1 \n"
6618 "lea 0x20(%0), %0 \n"
6619 "psrld $0x18, %%xmm0 \n"
6620 "psrld $0x18, %%xmm1 \n"
6621 "packssdw %%xmm1, %%xmm0 \n"
6622 "packuswb %%xmm0, %%xmm0 \n"
6623 "movq %%xmm0,(%1) \n"
6624 "lea 0x8(%1), %1 \n"
6625 "sub $0x8, %2 \n"
6626 "jg 1b \n"
6627 : "+r"(src_argb), // %0
6628 "+r"(dst_a), // %1
6629 "+rm"(width) // %2
6630 :
6631 : "memory", "cc", "xmm0", "xmm1");
6632 }
6633 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
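
// Scalar model of ARGBExtractAlphaRow: alpha is byte 3 of each 4-byte pixel,
// which the psrld $0x18 / pack sequence above isolates. Sketch only:
static void ARGBExtractAlphaRow_Sketch_C(const uint8_t* src_argb,
                                         uint8_t* dst_a,
                                         int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_a[x] = src_argb[4 * x + 3];
  }
}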
6634
6635 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
6636 static const uvec8 kShuffleAlphaShort_AVX2 = {
6637 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
6638 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
6639
void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
6641 uint8_t* dst_a,
6642 int width) {
6643 asm volatile(
6644 "vmovdqa %3,%%ymm4 \n"
6645 "vbroadcastf128 %4,%%ymm5 \n"
6646
6647 LABELALIGN
6648 "1: \n"
6649 "vmovdqu (%0), %%ymm0 \n"
6650 "vmovdqu 0x20(%0), %%ymm1 \n"
6651 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
6652 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
6653 "vmovdqu 0x40(%0), %%ymm2 \n"
6654 "vmovdqu 0x60(%0), %%ymm3 \n"
6655 "lea 0x80(%0), %0 \n"
6656 "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
6657 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
6658 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
6659 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
6660 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
6661 "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
6662 "vmovdqu %%ymm0,(%1) \n"
6663 "lea 0x20(%1),%1 \n"
6664 "sub $0x20, %2 \n"
6665 "jg 1b \n"
6666 "vzeroupper \n"
6667 : "+r"(src_argb), // %0
6668 "+r"(dst_a), // %1
6669 "+rm"(width) // %2
6670 : "m"(kPermdARGBToY_AVX), // %3
6671 "m"(kShuffleAlphaShort_AVX2) // %4
6672 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
6673 }
6674 #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
6675
6676 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
6677 // width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
6679 asm volatile(
6680 "pcmpeqb %%xmm0,%%xmm0 \n"
6681 "pslld $0x18,%%xmm0 \n"
6682 "pcmpeqb %%xmm1,%%xmm1 \n"
6683 "psrld $0x8,%%xmm1 \n"
6684
6685 LABELALIGN
6686 "1: \n"
6687 "movq (%0),%%xmm2 \n"
6688 "lea 0x8(%0),%0 \n"
6689 "punpcklbw %%xmm2,%%xmm2 \n"
6690 "punpckhwd %%xmm2,%%xmm3 \n"
6691 "punpcklwd %%xmm2,%%xmm2 \n"
6692 "movdqu (%1),%%xmm4 \n"
6693 "movdqu 0x10(%1),%%xmm5 \n"
6694 "pand %%xmm0,%%xmm2 \n"
6695 "pand %%xmm0,%%xmm3 \n"
6696 "pand %%xmm1,%%xmm4 \n"
6697 "pand %%xmm1,%%xmm5 \n"
6698 "por %%xmm4,%%xmm2 \n"
6699 "por %%xmm5,%%xmm3 \n"
6700 "movdqu %%xmm2,(%1) \n"
6701 "movdqu %%xmm3,0x10(%1) \n"
6702 "lea 0x20(%1),%1 \n"
6703 "sub $0x8,%2 \n"
6704 "jg 1b \n"
6705 : "+r"(src), // %0
6706 "+r"(dst), // %1
6707 "+r"(width) // %2
6708 :
6709 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
6710 }
6711 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
6712
6713 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
6714 // width in pixels
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
6716 asm volatile(
6717 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
6718 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
6719
6720 LABELALIGN
6721 "1: \n"
6722 "vpmovzxbd (%0),%%ymm1 \n"
6723 "vpmovzxbd 0x8(%0),%%ymm2 \n"
6724 "lea 0x10(%0),%0 \n"
6725 "vpslld $0x18,%%ymm1,%%ymm1 \n"
6726 "vpslld $0x18,%%ymm2,%%ymm2 \n"
6727 "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
6728 "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
6729 "vmovdqu %%ymm1,(%1) \n"
6730 "vmovdqu %%ymm2,0x20(%1) \n"
6731 "lea 0x40(%1),%1 \n"
6732 "sub $0x10,%2 \n"
6733 "jg 1b \n"
6734 "vzeroupper \n"
6735 : "+r"(src), // %0
6736 "+r"(dst), // %1
6737 "+r"(width) // %2
6738 :
6739 : "memory", "cc", "xmm0", "xmm1", "xmm2");
6740 }
6741 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
6742
6743 #ifdef HAS_SETROW_X86
void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
6745 size_t width_tmp = (size_t)(width >> 2);
6746 const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
6747 asm volatile(
6748
6749 "rep stosl \n"
6750 : "+D"(dst), // %0
6751 "+c"(width_tmp) // %1
6752 : "a"(v32) // %2
6753 : "memory", "cc");
6754 }
6755
void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
6757 size_t width_tmp = (size_t)(width);
6758 asm volatile(
6759
6760 "rep stosb \n"
6761 : "+D"(dst), // %0
6762 "+c"(width_tmp) // %1
6763 : "a"(v8) // %2
6764 : "memory", "cc");
6765 }
6766
void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
6768 size_t width_tmp = (size_t)(width);
6769 asm volatile(
6770
6771 "rep stosl \n"
6772 : "+D"(dst_argb), // %0
6773 "+c"(width_tmp) // %1
6774 : "a"(v32) // %2
6775 : "memory", "cc");
6776 }
6777 #endif // HAS_SETROW_X86
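
// The multiply in SetRow_X86 broadcasts one byte to all four bytes of a
// dword, e.g. v8 = 0x5A gives v32 = 0x5A5A5A5A, so "rep stosl" stores four
// bytes per iteration. Because the count is width >> 2, only a multiple of
// 4 bytes is written; SetRow_ERMS handles any width.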
6778
6779 #ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
6781 asm volatile(
6782 "pcmpeqb %%xmm5,%%xmm5 \n"
6783 "psrlw $0x8,%%xmm5 \n"
6784
6785 LABELALIGN
6786 "1: \n"
6787 "movdqu (%0),%%xmm0 \n"
6788 "movdqu 0x10(%0),%%xmm1 \n"
6789 "lea 0x20(%0),%0 \n"
6790 "pand %%xmm5,%%xmm0 \n"
6791 "pand %%xmm5,%%xmm1 \n"
6792 "packuswb %%xmm1,%%xmm0 \n"
6793 "movdqu %%xmm0,(%1) \n"
6794 "lea 0x10(%1),%1 \n"
6795 "sub $0x10,%2 \n"
6796 "jg 1b \n"
6797 : "+r"(src_yuy2), // %0
6798 "+r"(dst_y), // %1
6799 "+r"(width) // %2
6800 :
6801 : "memory", "cc", "xmm0", "xmm1", "xmm5");
6802 }
6803
void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
6805 int stride_yuy2,
6806 uint8_t* dst_uv,
6807 int width) {
6808 asm volatile(LABELALIGN
6809 "1: \n"
6810 "movdqu (%0),%%xmm0 \n"
6811 "movdqu 0x10(%0),%%xmm1 \n"
6812 "movdqu 0x00(%0,%3,1),%%xmm2 \n"
6813 "movdqu 0x10(%0,%3,1),%%xmm3 \n"
6814 "lea 0x20(%0),%0 \n"
6815 "pavgb %%xmm2,%%xmm0 \n"
6816 "pavgb %%xmm3,%%xmm1 \n"
6817 "psrlw $0x8,%%xmm0 \n"
6818 "psrlw $0x8,%%xmm1 \n"
6819 "packuswb %%xmm1,%%xmm0 \n"
6820 "movdqu %%xmm0,(%1) \n"
6821 "lea 0x10(%1),%1 \n"
6822 "sub $0x10,%2 \n"
6823 "jg 1b \n"
6824 : "+r"(src_yuy2), // %0
6825 "+r"(dst_uv), // %1
6826 "+r"(width) // %2
6827 : "r"((intptr_t)(stride_yuy2)) // %3
6828 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
6829 }
6830
void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
6832 int stride_yuy2,
6833 uint8_t* dst_u,
6834 uint8_t* dst_v,
6835 int width) {
6836 asm volatile(
6837 "pcmpeqb %%xmm5,%%xmm5 \n"
6838 "psrlw $0x8,%%xmm5 \n"
6839 "sub %1,%2 \n"
6840
6841 LABELALIGN
6842 "1: \n"
6843 "movdqu (%0),%%xmm0 \n"
6844 "movdqu 0x10(%0),%%xmm1 \n"
6845 "movdqu 0x00(%0,%4,1),%%xmm2 \n"
6846 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
6847 "lea 0x20(%0),%0 \n"
6848 "pavgb %%xmm2,%%xmm0 \n"
6849 "pavgb %%xmm3,%%xmm1 \n"
6850 "psrlw $0x8,%%xmm0 \n"
6851 "psrlw $0x8,%%xmm1 \n"
6852 "packuswb %%xmm1,%%xmm0 \n"
6853 "movdqa %%xmm0,%%xmm1 \n"
6854 "pand %%xmm5,%%xmm0 \n"
6855 "packuswb %%xmm0,%%xmm0 \n"
6856 "psrlw $0x8,%%xmm1 \n"
6857 "packuswb %%xmm1,%%xmm1 \n"
6858 "movq %%xmm0,(%1) \n"
6859 "movq %%xmm1,0x00(%1,%2,1) \n"
6860 "lea 0x8(%1),%1 \n"
6861 "sub $0x10,%3 \n"
6862 "jg 1b \n"
6863 : "+r"(src_yuy2), // %0
6864 "+r"(dst_u), // %1
6865 "+r"(dst_v), // %2
6866 "+r"(width) // %3
6867 : "r"((intptr_t)(stride_yuy2)) // %4
6868 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
6869 }
6870
void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
6872 uint8_t* dst_u,
6873 uint8_t* dst_v,
6874 int width) {
6875 asm volatile(
6876 "pcmpeqb %%xmm5,%%xmm5 \n"
6877 "psrlw $0x8,%%xmm5 \n"
6878 "sub %1,%2 \n"
6879
6880 LABELALIGN
6881 "1: \n"
6882 "movdqu (%0),%%xmm0 \n"
6883 "movdqu 0x10(%0),%%xmm1 \n"
6884 "lea 0x20(%0),%0 \n"
6885 "psrlw $0x8,%%xmm0 \n"
6886 "psrlw $0x8,%%xmm1 \n"
6887 "packuswb %%xmm1,%%xmm0 \n"
6888 "movdqa %%xmm0,%%xmm1 \n"
6889 "pand %%xmm5,%%xmm0 \n"
6890 "packuswb %%xmm0,%%xmm0 \n"
6891 "psrlw $0x8,%%xmm1 \n"
6892 "packuswb %%xmm1,%%xmm1 \n"
6893 "movq %%xmm0,(%1) \n"
6894 "movq %%xmm1,0x00(%1,%2,1) \n"
6895 "lea 0x8(%1),%1 \n"
6896 "sub $0x10,%3 \n"
6897 "jg 1b \n"
6898 : "+r"(src_yuy2), // %0
6899 "+r"(dst_u), // %1
6900 "+r"(dst_v), // %2
6901 "+r"(width) // %3
6902 :
6903 : "memory", "cc", "xmm0", "xmm1", "xmm5");
6904 }
6905
void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
6907 asm volatile(
6908
6909 LABELALIGN
6910 "1: \n"
6911 "movdqu (%0),%%xmm0 \n"
6912 "movdqu 0x10(%0),%%xmm1 \n"
6913 "lea 0x20(%0),%0 \n"
6914 "psrlw $0x8,%%xmm0 \n"
6915 "psrlw $0x8,%%xmm1 \n"
6916 "packuswb %%xmm1,%%xmm0 \n"
6917 "movdqu %%xmm0,(%1) \n"
6918 "lea 0x10(%1),%1 \n"
6919 "sub $0x10,%2 \n"
6920 "jg 1b \n"
6921 : "+r"(src_uyvy), // %0
6922 "+r"(dst_y), // %1
6923 "+r"(width) // %2
6924 :
6925 : "memory", "cc", "xmm0", "xmm1");
6926 }
6927
void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
6929 int stride_uyvy,
6930 uint8_t* dst_u,
6931 uint8_t* dst_v,
6932 int width) {
6933 asm volatile(
6934 "pcmpeqb %%xmm5,%%xmm5 \n"
6935 "psrlw $0x8,%%xmm5 \n"
6936 "sub %1,%2 \n"
6937
6938 LABELALIGN
6939 "1: \n"
6940 "movdqu (%0),%%xmm0 \n"
6941 "movdqu 0x10(%0),%%xmm1 \n"
6942 "movdqu 0x00(%0,%4,1),%%xmm2 \n"
6943 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
6944 "lea 0x20(%0),%0 \n"
6945 "pavgb %%xmm2,%%xmm0 \n"
6946 "pavgb %%xmm3,%%xmm1 \n"
6947 "pand %%xmm5,%%xmm0 \n"
6948 "pand %%xmm5,%%xmm1 \n"
6949 "packuswb %%xmm1,%%xmm0 \n"
6950 "movdqa %%xmm0,%%xmm1 \n"
6951 "pand %%xmm5,%%xmm0 \n"
6952 "packuswb %%xmm0,%%xmm0 \n"
6953 "psrlw $0x8,%%xmm1 \n"
6954 "packuswb %%xmm1,%%xmm1 \n"
6955 "movq %%xmm0,(%1) \n"
6956 "movq %%xmm1,0x00(%1,%2,1) \n"
6957 "lea 0x8(%1),%1 \n"
6958 "sub $0x10,%3 \n"
6959 "jg 1b \n"
6960 : "+r"(src_uyvy), // %0
6961 "+r"(dst_u), // %1
6962 "+r"(dst_v), // %2
6963 "+r"(width) // %3
6964 : "r"((intptr_t)(stride_uyvy)) // %4
6965 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
6966 }
6967
6968 void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
6969 uint8_t* dst_u,
6970 uint8_t* dst_v,
6971 int width) {
6972 asm volatile(
6973 "pcmpeqb %%xmm5,%%xmm5 \n"
6974 "psrlw $0x8,%%xmm5 \n"
6975 "sub %1,%2 \n"
6976
6977 LABELALIGN
6978 "1: \n"
6979 "movdqu (%0),%%xmm0 \n"
6980 "movdqu 0x10(%0),%%xmm1 \n"
6981 "lea 0x20(%0),%0 \n"
6982 "pand %%xmm5,%%xmm0 \n"
6983 "pand %%xmm5,%%xmm1 \n"
6984 "packuswb %%xmm1,%%xmm0 \n"
6985 "movdqa %%xmm0,%%xmm1 \n"
6986 "pand %%xmm5,%%xmm0 \n"
6987 "packuswb %%xmm0,%%xmm0 \n"
6988 "psrlw $0x8,%%xmm1 \n"
6989 "packuswb %%xmm1,%%xmm1 \n"
6990 "movq %%xmm0,(%1) \n"
6991 "movq %%xmm1,0x00(%1,%2,1) \n"
6992 "lea 0x8(%1),%1 \n"
6993 "sub $0x10,%3 \n"
6994 "jg 1b \n"
6995 : "+r"(src_uyvy), // %0
6996 "+r"(dst_u), // %1
6997 "+r"(dst_v), // %2
6998 "+r"(width) // %3
6999 :
7000 : "memory", "cc", "xmm0", "xmm1", "xmm5");
7001 }
7002 #endif // HAS_YUY2TOYROW_SSE2
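
// For reference, a scalar sketch of the YUY2 chroma extraction above:
// average the chroma bytes of two rows (stride apart, with pavgb-style
// rounding), then deinterleave U and V. The _C_Sketch name is illustrative,
// not part of the library API.
static void YUY2ToUVRow_C_Sketch(const uint8_t* src_yuy2,
                                 int stride_yuy2,
                                 uint8_t* dst_u,
                                 uint8_t* dst_v,
                                 int width) {
  // YUY2 is packed as Y0 U Y1 V, so each 4 bytes cover 2 pixels.
  const uint8_t* next = src_yuy2 + stride_yuy2;
  int x;
  for (x = 0; x < width; x += 2) {
    dst_u[0] = (uint8_t)((src_yuy2[1] + next[1] + 1) >> 1);
    dst_v[0] = (uint8_t)((src_yuy2[3] + next[3] + 1) >> 1);
    src_yuy2 += 4;
    next += 4;
    dst_u += 1;
    dst_v += 1;
  }
}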
7003
7004 #ifdef HAS_YUY2TOYROW_AVX2
7005 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
7006 asm volatile(
7007 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
7008 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
7009
7010 LABELALIGN
7011 "1: \n"
7012 "vmovdqu (%0),%%ymm0 \n"
7013 "vmovdqu 0x20(%0),%%ymm1 \n"
7014 "lea 0x40(%0),%0 \n"
7015 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
7016 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
7017 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
7018 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
7019 "vmovdqu %%ymm0,(%1) \n"
7020 "lea 0x20(%1),%1 \n"
7021 "sub $0x20,%2 \n"
7022 "jg 1b \n"
7023 "vzeroupper \n"
7024 : "+r"(src_yuy2), // %0
7025 "+r"(dst_y), // %1
7026 "+r"(width) // %2
7027 :
7028 : "memory", "cc", "xmm0", "xmm1", "xmm5");
7029 }
7030
7031 void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
7032 int stride_yuy2,
7033 uint8_t* dst_uv,
7034 int width) {
7035 asm volatile(
7036
7037 LABELALIGN
7038 "1: \n"
7039 "vmovdqu (%0),%%ymm0 \n"
7040 "vmovdqu 0x20(%0),%%ymm1 \n"
7041 "vpavgb 0x00(%0,%3,1),%%ymm0,%%ymm0 \n"
7042 "vpavgb 0x20(%0,%3,1),%%ymm1,%%ymm1 \n"
7043 "lea 0x40(%0),%0 \n"
7044 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
7045 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
7046 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
7047 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
7048 "vmovdqu %%ymm0,(%1) \n"
7049 "lea 0x20(%1),%1 \n"
7050 "sub $0x20,%2 \n"
7051 "jg 1b \n"
7052 "vzeroupper \n"
7053 : "+r"(src_yuy2), // %0
7054 "+r"(dst_uv), // %1
7055 "+r"(width) // %2
7056 : "r"((intptr_t)(stride_yuy2)) // %3
7057 : "memory", "cc", "xmm0", "xmm1");
7058 }
7059
7060 void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
7061 int stride_yuy2,
7062 uint8_t* dst_u,
7063 uint8_t* dst_v,
7064 int width) {
7065 asm volatile(
7066 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
7067 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
7068 "sub %1,%2 \n"
7069
7070 LABELALIGN
7071 "1: \n"
7072 "vmovdqu (%0),%%ymm0 \n"
7073 "vmovdqu 0x20(%0),%%ymm1 \n"
7074 "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
7075 "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
7076 "lea 0x40(%0),%0 \n"
7077 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
7078 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
7079 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
7080 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
7081 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
7082 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
7083 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
7084 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
7085 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
7086 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
7087 "vextractf128 $0x0,%%ymm1,(%1) \n"
7088 "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
7089 "lea 0x10(%1),%1 \n"
7090 "sub $0x20,%3 \n"
7091 "jg 1b \n"
7092 "vzeroupper \n"
7093 : "+r"(src_yuy2), // %0
7094 "+r"(dst_u), // %1
7095 "+r"(dst_v), // %2
7096 "+r"(width) // %3
7097 : "r"((intptr_t)(stride_yuy2)) // %4
7098 : "memory", "cc", "xmm0", "xmm1", "xmm5");
7099 }
7100
7101 void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
7102 uint8_t* dst_u,
7103 uint8_t* dst_v,
7104 int width) {
7105 asm volatile(
7106 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
7107 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
7108 "sub %1,%2 \n"
7109
7110 LABELALIGN
7111 "1: \n"
7112 "vmovdqu (%0),%%ymm0 \n"
7113 "vmovdqu 0x20(%0),%%ymm1 \n"
7114 "lea 0x40(%0),%0 \n"
7115 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
7116 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
7117 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
7118 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
7119 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
7120 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
7121 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
7122 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
7123 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
7124 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
7125 "vextractf128 $0x0,%%ymm1,(%1) \n"
7126 "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
7127 "lea 0x10(%1),%1 \n"
7128 "sub $0x20,%3 \n"
7129 "jg 1b \n"
7130 "vzeroupper \n"
7131 : "+r"(src_yuy2), // %0
7132 "+r"(dst_u), // %1
7133 "+r"(dst_v), // %2
7134 "+r"(width) // %3
7135 :
7136 : "memory", "cc", "xmm0", "xmm1", "xmm5");
7137 }
7138
7139 void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
7140 asm volatile(
7141
7142 LABELALIGN
7143 "1: \n"
7144 "vmovdqu (%0),%%ymm0 \n"
7145 "vmovdqu 0x20(%0),%%ymm1 \n"
7146 "lea 0x40(%0),%0 \n"
7147 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
7148 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
7149 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
7150 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
7151 "vmovdqu %%ymm0,(%1) \n"
7152 "lea 0x20(%1),%1 \n"
7153 "sub $0x20,%2 \n"
7154 "jg 1b \n"
7155 "vzeroupper \n"
7156 : "+r"(src_uyvy), // %0
7157 "+r"(dst_y), // %1
7158 "+r"(width) // %2
7159 :
7160 : "memory", "cc", "xmm0", "xmm1", "xmm5");
7161 }
7162 void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
7163 int stride_uyvy,
7164 uint8_t* dst_u,
7165 uint8_t* dst_v,
7166 int width) {
7167 asm volatile(
7168 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
7169 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
7170 "sub %1,%2 \n"
7171
7172 LABELALIGN
7173 "1: \n"
7174 "vmovdqu (%0),%%ymm0 \n"
7175 "vmovdqu 0x20(%0),%%ymm1 \n"
7176 "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
7177 "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
7178 "lea 0x40(%0),%0 \n"
7179 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
7180 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
7181 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
7182 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
7183 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
7184 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
7185 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
7186 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
7187 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
7188 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
7189 "vextractf128 $0x0,%%ymm1,(%1) \n"
7190 "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
7191 "lea 0x10(%1),%1 \n"
7192 "sub $0x20,%3 \n"
7193 "jg 1b \n"
7194 "vzeroupper \n"
7195 : "+r"(src_uyvy), // %0
7196 "+r"(dst_u), // %1
7197 "+r"(dst_v), // %2
7198 "+r"(width) // %3
7199 : "r"((intptr_t)(stride_uyvy)) // %4
7200 : "memory", "cc", "xmm0", "xmm1", "xmm5");
7201 }
7202
7203 void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
7204 uint8_t* dst_u,
7205 uint8_t* dst_v,
7206 int width) {
7207 asm volatile(
7208 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
7209 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
7210 "sub %1,%2 \n"
7211
7212 LABELALIGN
7213 "1: \n"
7214 "vmovdqu (%0),%%ymm0 \n"
7215 "vmovdqu 0x20(%0),%%ymm1 \n"
7216 "lea 0x40(%0),%0 \n"
7217 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
7218 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
7219 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
7220 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
7221 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
7222 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
7223 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
7224 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
7225 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
7226 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
7227 "vextractf128 $0x0,%%ymm1,(%1) \n"
7228 "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
7229 "lea 0x10(%1),%1 \n"
7230 "sub $0x20,%3 \n"
7231 "jg 1b \n"
7232 "vzeroupper \n"
7233 : "+r"(src_uyvy), // %0
7234 "+r"(dst_u), // %1
7235 "+r"(dst_v), // %2
7236 "+r"(width) // %3
7237 :
7238 : "memory", "cc", "xmm0", "xmm1", "xmm5");
7239 }
7240 #endif // HAS_YUY2TOYROW_AVX2
7241
7242 #ifdef HAS_ARGBBLENDROW_SSSE3
7243 // Shuffle table for isolating alpha.
7244 static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
7245 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
7246
7247 // Blend 4 pixels at a time, with a 1 pixel loop for the remainder.
7248 void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
7249 const uint8_t* src_argb1,
7250 uint8_t* dst_argb,
7251 int width) {
7252 asm volatile(
7253 "pcmpeqb %%xmm7,%%xmm7 \n"
7254 "psrlw $0xf,%%xmm7 \n"
7255 "pcmpeqb %%xmm6,%%xmm6 \n"
7256 "psrlw $0x8,%%xmm6 \n"
7257 "pcmpeqb %%xmm5,%%xmm5 \n"
7258 "psllw $0x8,%%xmm5 \n"
7259 "pcmpeqb %%xmm4,%%xmm4 \n"
7260 "pslld $0x18,%%xmm4 \n"
7261 "sub $0x4,%3 \n"
7262 "jl 49f \n"
7263
7264 // 4 pixel loop.
7265 LABELALIGN
7266 "40: \n"
7267 "movdqu (%0),%%xmm3 \n"
7268 "lea 0x10(%0),%0 \n"
7269 "movdqa %%xmm3,%%xmm0 \n"
7270 "pxor %%xmm4,%%xmm3 \n"
7271 "movdqu (%1),%%xmm2 \n"
7272 "pshufb %4,%%xmm3 \n"
7273 "pand %%xmm6,%%xmm2 \n"
7274 "paddw %%xmm7,%%xmm3 \n"
7275 "pmullw %%xmm3,%%xmm2 \n"
7276 "movdqu (%1),%%xmm1 \n"
7277 "lea 0x10(%1),%1 \n"
7278 "psrlw $0x8,%%xmm1 \n"
7279 "por %%xmm4,%%xmm0 \n"
7280 "pmullw %%xmm3,%%xmm1 \n"
7281 "psrlw $0x8,%%xmm2 \n"
7282 "paddusb %%xmm2,%%xmm0 \n"
7283 "pand %%xmm5,%%xmm1 \n"
7284 "paddusb %%xmm1,%%xmm0 \n"
7285 "movdqu %%xmm0,(%2) \n"
7286 "lea 0x10(%2),%2 \n"
7287 "sub $0x4,%3 \n"
7288 "jge 40b \n"
7289
7290 "49: \n"
7291 "add $0x3,%3 \n"
7292 "jl 99f \n"
7293
7294 // 1 pixel loop.
7295 "91: \n"
7296 "movd (%0),%%xmm3 \n"
7297 "lea 0x4(%0),%0 \n"
7298 "movdqa %%xmm3,%%xmm0 \n"
7299 "pxor %%xmm4,%%xmm3 \n"
7300 "movd (%1),%%xmm2 \n"
7301 "pshufb %4,%%xmm3 \n"
7302 "pand %%xmm6,%%xmm2 \n"
7303 "paddw %%xmm7,%%xmm3 \n"
7304 "pmullw %%xmm3,%%xmm2 \n"
7305 "movd (%1),%%xmm1 \n"
7306 "lea 0x4(%1),%1 \n"
7307 "psrlw $0x8,%%xmm1 \n"
7308 "por %%xmm4,%%xmm0 \n"
7309 "pmullw %%xmm3,%%xmm1 \n"
7310 "psrlw $0x8,%%xmm2 \n"
7311 "paddusb %%xmm2,%%xmm0 \n"
7312 "pand %%xmm5,%%xmm1 \n"
7313 "paddusb %%xmm1,%%xmm0 \n"
7314 "movd %%xmm0,(%2) \n"
7315 "lea 0x4(%2),%2 \n"
7316 "sub $0x1,%3 \n"
7317 "jge 91b \n"
7318 "99: \n"
7319 : "+r"(src_argb), // %0
7320 "+r"(src_argb1), // %1
7321 "+r"(dst_argb), // %2
7322 "+r"(width) // %3
7323 : "m"(kShuffleAlpha) // %4
7324 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7325 "xmm7");
7326 }
7327 #endif // HAS_ARGBBLENDROW_SSSE3
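
// A scalar sketch of the blend above, assuming premultiplied source alpha:
// each color channel is fg + bg * (256 - a) / 256, with the saturation that
// paddusb provides, and the result alpha is forced opaque. The _C_Sketch
// name is illustrative only.
static void ARGBBlendRow_C_Sketch(const uint8_t* src_argb,
                                  const uint8_t* src_argb1,
                                  uint8_t* dst_argb,
                                  int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    uint32_t a = src_argb[3];
    for (c = 0; c < 3; ++c) {  // B, G, R
      uint32_t v = src_argb[c] + ((src_argb1[c] * (256 - a)) >> 8);
      dst_argb[c] = (uint8_t)(v > 255 ? 255 : v);
    }
    dst_argb[3] = 255;  // alpha forced to opaque, as the por with xmm4 does.
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}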
7328
7329 #ifdef HAS_BLENDPLANEROW_SSSE3
7330 // Blend 8 pixels at a time.
7331 // unsigned version of math
7332 // =((A2*C2)+(B2*(255-C2))+255)/256
7333 // signed version of math
7334 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
7335 void BlendPlaneRow_SSSE3(const uint8_t* src0,
7336 const uint8_t* src1,
7337 const uint8_t* alpha,
7338 uint8_t* dst,
7339 int width) {
7340 asm volatile(
7341 "pcmpeqb %%xmm5,%%xmm5 \n"
7342 "psllw $0x8,%%xmm5 \n"
7343 "mov $0x80808080,%%eax \n"
7344 "movd %%eax,%%xmm6 \n"
7345 "pshufd $0x0,%%xmm6,%%xmm6 \n"
7346 "mov $0x807f807f,%%eax \n"
7347 "movd %%eax,%%xmm7 \n"
7348 "pshufd $0x0,%%xmm7,%%xmm7 \n"
7349 "sub %2,%0 \n"
7350 "sub %2,%1 \n"
7351 "sub %2,%3 \n"
7352
7353 // 8 pixel loop.
7354 LABELALIGN
7355 "1: \n"
7356 "movq (%2),%%xmm0 \n"
7357 "punpcklbw %%xmm0,%%xmm0 \n"
7358 "pxor %%xmm5,%%xmm0 \n"
7359 "movq (%0,%2,1),%%xmm1 \n"
7360 "movq (%1,%2,1),%%xmm2 \n"
7361 "punpcklbw %%xmm2,%%xmm1 \n"
7362 "psubb %%xmm6,%%xmm1 \n"
7363 "pmaddubsw %%xmm1,%%xmm0 \n"
7364 "paddw %%xmm7,%%xmm0 \n"
7365 "psrlw $0x8,%%xmm0 \n"
7366 "packuswb %%xmm0,%%xmm0 \n"
7367 "movq %%xmm0,(%3,%2,1) \n"
7368 "lea 0x8(%2),%2 \n"
7369 "sub $0x8,%4 \n"
7370 "jg 1b \n"
7371 : "+r"(src0), // %0
7372 "+r"(src1), // %1
7373 "+r"(alpha), // %2
7374 "+r"(dst), // %3
7375 "+rm"(width) // %4
7376 ::"memory",
7377 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
7378 }
7379 #endif // HAS_BLENDPLANEROW_SSSE3
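
// The unsigned formula above, written out as scalar C. The +255 bias rounds
// up so that alpha 255 yields src0 exactly. The _C_Sketch name is
// illustrative only.
static void BlendPlaneRow_C_Sketch(const uint8_t* src0,
                                   const uint8_t* src1,
                                   const uint8_t* alpha,
                                   uint8_t* dst,
                                   int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t a = alpha[x];
    dst[x] = (uint8_t)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}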
7380
7381 #ifdef HAS_BLENDPLANEROW_AVX2
7382 // Blend 32 pixels at a time.
7383 // unsigned version of math
7384 // =((A2*C2)+(B2*(255-C2))+255)/256
7385 // signed version of math
7386 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
7387 void BlendPlaneRow_AVX2(const uint8_t* src0,
7388 const uint8_t* src1,
7389 const uint8_t* alpha,
7390 uint8_t* dst,
7391 int width) {
7392 asm volatile(
7393 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
7394 "vpsllw $0x8,%%ymm5,%%ymm5 \n"
7395 "mov $0x80808080,%%eax \n"
7396 "vmovd %%eax,%%xmm6 \n"
7397 "vbroadcastss %%xmm6,%%ymm6 \n"
7398 "mov $0x807f807f,%%eax \n"
7399 "vmovd %%eax,%%xmm7 \n"
7400 "vbroadcastss %%xmm7,%%ymm7 \n"
7401 "sub %2,%0 \n"
7402 "sub %2,%1 \n"
7403 "sub %2,%3 \n"
7404
7405 // 32 pixel loop.
7406 LABELALIGN
7407 "1: \n"
7408 "vmovdqu (%2),%%ymm0 \n"
7409 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
7410 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
7411 "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
7412 "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
7413 "vmovdqu (%0,%2,1),%%ymm1 \n"
7414 "vmovdqu (%1,%2,1),%%ymm2 \n"
7415 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
7416 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
7417 "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
7418 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
7419 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
7420 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
7421 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
7422 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
7423 "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
7424 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
7425 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
7426 "vmovdqu %%ymm0,(%3,%2,1) \n"
7427 "lea 0x20(%2),%2 \n"
7428 "sub $0x20,%4 \n"
7429 "jg 1b \n"
7430 "vzeroupper \n"
7431 : "+r"(src0), // %0
7432 "+r"(src1), // %1
7433 "+r"(alpha), // %2
7434 "+r"(dst), // %3
7435 "+rm"(width) // %4
7436 ::"memory",
7437 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7438 "xmm7");
7439 }
7440 #endif // HAS_BLENDPLANEROW_AVX2
7441
7442 #ifdef HAS_ARGBATTENUATEROW_SSSE3
7443 // Shuffle table duplicating alpha.
7444 static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128,
7445 -128, -128, 14, -128, 14, -128,
7446 14, -128, -128, -128};
7447
7448 // Attenuate 4 pixels at a time.
7449 void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
7450 uint8_t* dst_argb,
7451 int width) {
7452 asm volatile(
7453 "movdqa %3,%%xmm4 \n"
7454 "pcmpeqb %%xmm5,%%xmm5 \n"
7455 "pslld $0x18,%%xmm5 \n"
7456 "pxor %%xmm6,%%xmm6 \n"
7457 "pcmpeqb %%xmm7,%%xmm7 \n"
7458 "punpcklbw %%xmm6,%%xmm7 \n"
7459 "sub %0,%1 \n"
7460
7461 // 4 pixel loop.
7462 LABELALIGN
7463 "1: \n"
7464 "movdqu (%0),%%xmm6 \n"
7465 "movdqa %%xmm6,%%xmm0 \n"
7466 "movdqa %%xmm6,%%xmm1 \n"
7467 "punpcklbw %%xmm5,%%xmm0 \n"
7468 "punpckhbw %%xmm5,%%xmm1 \n"
7469 "movdqa %%xmm0,%%xmm2 \n"
7470 "movdqa %%xmm1,%%xmm3 \n"
7471 "pshufb %%xmm4,%%xmm2 \n" // a,a,a,0
7472 "pshufb %%xmm4,%%xmm3 \n"
7473 "pmullw %%xmm2,%%xmm0 \n" // rgb * alpha
7474 "pmullw %%xmm3,%%xmm1 \n"
7475 "paddw %%xmm7,%%xmm0 \n" // + 255
7476 "paddw %%xmm7,%%xmm1 \n"
7477 "psrlw $0x8,%%xmm0 \n"
7478 "psrlw $0x8,%%xmm1 \n"
7479 "packuswb %%xmm1,%%xmm0 \n"
7480 "pand %%xmm5,%%xmm6 \n"
7481 "por %%xmm6,%%xmm0 \n"
7482 "movdqu %%xmm0,(%0,%1) \n"
7483 "lea 0x10(%0),%0 \n"
7484 "sub $0x4,%2 \n"
7485 "jg 1b \n"
7486 : "+r"(src_argb), // %0
7487 "+r"(dst_argb), // %1
7488 "+r"(width) // %2
7489 : "m"(kAttenuateShuffle) // %3
7490 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7491 "xmm7");
7492 }
7493 #endif // HAS_ARGBATTENUATEROW_SSSE3
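
// Scalar sketch of the attenuation math: rgb = (rgb * a + 255) >> 8, alpha
// unchanged. The +255 bias rounds up, which keeps a subsequent unattenuate
// closer to the original values. The _C_Sketch name is illustrative only.
static void ARGBAttenuateRow_C_Sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    uint32_t a = src_argb[3];
    for (c = 0; c < 3; ++c) {  // B, G, R
      dst_argb[c] = (uint8_t)((src_argb[c] * a + 255) >> 8);
    }
    dst_argb[3] = (uint8_t)a;
    src_argb += 4;
    dst_argb += 4;
  }
}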
7494
7495 #ifdef HAS_ARGBATTENUATEROW_AVX2
7496
7497 // Shuffle table duplicating alpha.
7498 static const lvec8 kAttenuateShuffle_AVX2 = {
7499 6, -128, 6, -128, 6, -128, -128, -128, 14, -128, 14,
7500 -128, 14, -128, -128, -128, 22, -128, 22, -128, 22, -128,
7501 -128, -128, 30, -128, 30, -128, 30, -128, -128, -128};
7502
7503 // Attenuate 8 pixels at a time.
7504 void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
7505 uint8_t* dst_argb,
7506 int width) {
7507 asm volatile(
7508 "vmovdqa %3,%%ymm4 \n"
7509 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
7510 "vpslld $0x18,%%ymm5,%%ymm5 \n"
7511 "vpxor %%ymm6,%%ymm6,%%ymm6 \n"
7512 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"
7513 "vpunpcklbw %%ymm6,%%ymm7,%%ymm7 \n"
7514 "sub %0,%1 \n"
7515
7516 // 8 pixel loop.
7517 LABELALIGN
7518 "1: \n"
7519 "vmovdqu (%0),%%ymm6 \n"
7520 "vpunpcklbw %%ymm5,%%ymm6,%%ymm0 \n"
7521 "vpunpckhbw %%ymm5,%%ymm6,%%ymm1 \n"
7522 "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
7523 "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
7524 "vpmullw %%ymm2,%%ymm0,%%ymm0 \n"
7525 "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
7526 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
7527 "vpaddw %%ymm7,%%ymm1,%%ymm1 \n"
7528 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
7529 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
7530 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
7531 "vpand %%ymm5,%%ymm6,%%ymm1 \n"
7532 "vpor %%ymm1,%%ymm0,%%ymm0 \n"
7533 "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
7534 "lea 0x20(%0),%0 \n"
7535 "sub $0x8,%2 \n"
7536 "jg 1b \n"
7537 "vzeroupper \n"
7538 : "+r"(src_argb), // %0
7539 "+r"(dst_argb), // %1
7540 "+r"(width) // %2
7541 : "m"(kAttenuateShuffle_AVX2) // %3
7542 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7543 "xmm7");
7544 }
7545 #endif // HAS_ARGBATTENUATEROW_AVX2
7546
7547 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
7548 // Unattenuate 4 pixels at a time.
7549 void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
7550 uint8_t* dst_argb,
7551 int width) {
7552 uintptr_t alpha;
7553 asm volatile(
7554 // 4 pixel loop.
7555 LABELALIGN
7556 "1: \n"
7557 "movdqu (%0),%%xmm0 \n"
7558 "movzb 0x03(%0),%3 \n"
7559 "punpcklbw %%xmm0,%%xmm0 \n"
7560 "movd 0x00(%4,%3,4),%%xmm2 \n"
7561 "movzb 0x07(%0),%3 \n"
7562 "movd 0x00(%4,%3,4),%%xmm3 \n"
7563 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
7564 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
7565 "movlhps %%xmm3,%%xmm2 \n"
7566 "pmulhuw %%xmm2,%%xmm0 \n"
7567 "movdqu (%0),%%xmm1 \n"
7568 "movzb 0x0b(%0),%3 \n"
7569 "punpckhbw %%xmm1,%%xmm1 \n"
7570 "movd 0x00(%4,%3,4),%%xmm2 \n"
7571 "movzb 0x0f(%0),%3 \n"
7572 "movd 0x00(%4,%3,4),%%xmm3 \n"
7573 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
7574 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
7575 "movlhps %%xmm3,%%xmm2 \n"
7576 "pmulhuw %%xmm2,%%xmm1 \n"
7577 "lea 0x10(%0),%0 \n"
7578 "packuswb %%xmm1,%%xmm0 \n"
7579 "movdqu %%xmm0,(%1) \n"
7580 "lea 0x10(%1),%1 \n"
7581 "sub $0x4,%2 \n"
7582 "jg 1b \n"
7583 : "+r"(src_argb), // %0
7584 "+r"(dst_argb), // %1
7585 "+r"(width), // %2
7586 "=&r"(alpha) // %3
7587 : "r"(fixed_invtbl8) // %4
7588 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
7589 }
7590 #endif // HAS_ARGBUNATTENUATEROW_SSE2
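
// Conceptually, unattenuate divides each color channel by alpha. The SIMD
// loop replaces the divide with the 8.8 fixed-point reciprocal table
// fixed_invtbl8 and pmulhuw; the plain-division sketch below is an
// assumption-level illustration, not the exact fixed-point result.
static void ARGBUnattenuateRow_C_Sketch(const uint8_t* src_argb,
                                        uint8_t* dst_argb,
                                        int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    uint32_t a = src_argb[3];
    for (c = 0; c < 3; ++c) {  // B, G, R
      uint32_t v = a ? (src_argb[c] * 255u) / a : src_argb[c];
      dst_argb[c] = (uint8_t)(v > 255 ? 255 : v);
    }
    dst_argb[3] = (uint8_t)a;
    src_argb += 4;
    dst_argb += 4;
  }
}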
7591
7592 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
7593 // Shuffle table duplicating alpha.
7594 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
7595 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
7596 // Unattenuate 8 pixels at a time.
7597 void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
7598 uint8_t* dst_argb,
7599 int width) {
7600 uintptr_t alpha;
7601 asm volatile(
7602 "sub %0,%1 \n"
7603 "vbroadcastf128 %5,%%ymm5 \n"
7604
7605 // 8 pixel loop.
7606 LABELALIGN
7607 "1: \n"
7608 // replace VPGATHER
7609 "movzb 0x03(%0),%3 \n"
7610 "vmovd 0x00(%4,%3,4),%%xmm0 \n"
7611 "movzb 0x07(%0),%3 \n"
7612 "vmovd 0x00(%4,%3,4),%%xmm1 \n"
7613 "movzb 0x0b(%0),%3 \n"
7614 "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
7615 "vmovd 0x00(%4,%3,4),%%xmm2 \n"
7616 "movzb 0x0f(%0),%3 \n"
7617 "vmovd 0x00(%4,%3,4),%%xmm3 \n"
7618 "movzb 0x13(%0),%3 \n"
7619 "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
7620 "vmovd 0x00(%4,%3,4),%%xmm0 \n"
7621 "movzb 0x17(%0),%3 \n"
7622 "vmovd 0x00(%4,%3,4),%%xmm1 \n"
7623 "movzb 0x1b(%0),%3 \n"
7624 "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
7625 "vmovd 0x00(%4,%3,4),%%xmm2 \n"
7626 "movzb 0x1f(%0),%3 \n"
7627 "vmovd 0x00(%4,%3,4),%%xmm3 \n"
7628 "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
7629 "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
7630 "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
7631 "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
7632 // end of VPGATHER
7633
7634 "vmovdqu (%0),%%ymm6 \n"
7635 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
7636 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
7637 "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
7638 "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
7639 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
7640 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
7641 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
7642 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
7643 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
7644 "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
7645 "lea 0x20(%0),%0 \n"
7646 "sub $0x8,%2 \n"
7647 "jg 1b \n"
7648 "vzeroupper \n"
7649 : "+r"(src_argb), // %0
7650 "+r"(dst_argb), // %1
7651 "+r"(width), // %2
7652 "=&r"(alpha) // %3
7653 : "r"(fixed_invtbl8), // %4
7654 "m"(kUnattenShuffleAlpha_AVX2) // %5
7655 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7656 "xmm7");
7657 }
7658 #endif // HAS_ARGBUNATTENUATEROW_AVX2
7659
7660 #ifdef HAS_ARGBGRAYROW_SSSE3
7661 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
7662 void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
7663 asm volatile(
7664 "movdqa %3,%%xmm4 \n"
7665 "movdqa %4,%%xmm5 \n"
7666
7667 // 8 pixel loop.
7668 LABELALIGN
7669 "1: \n"
7670 "movdqu (%0),%%xmm0 \n"
7671 "movdqu 0x10(%0),%%xmm1 \n"
7672 "psubb %%xmm5,%%xmm0 \n"
7673 "psubb %%xmm5,%%xmm1 \n"
7674 "movdqu %%xmm4,%%xmm6 \n"
7675 "pmaddubsw %%xmm0,%%xmm6 \n"
7676 "movdqu %%xmm4,%%xmm0 \n"
7677 "pmaddubsw %%xmm1,%%xmm0 \n"
7678 "phaddw %%xmm0,%%xmm6 \n"
7679 "paddw %%xmm5,%%xmm6 \n"
7680 "psrlw $0x8,%%xmm6 \n"
7681 "packuswb %%xmm6,%%xmm6 \n"
7682 "movdqu (%0),%%xmm2 \n"
7683 "movdqu 0x10(%0),%%xmm3 \n"
7684 "lea 0x20(%0),%0 \n"
7685 "psrld $0x18,%%xmm2 \n"
7686 "psrld $0x18,%%xmm3 \n"
7687 "packuswb %%xmm3,%%xmm2 \n"
7688 "packuswb %%xmm2,%%xmm2 \n"
7689 "movdqa %%xmm6,%%xmm3 \n"
7690 "punpcklbw %%xmm6,%%xmm6 \n"
7691 "punpcklbw %%xmm2,%%xmm3 \n"
7692 "movdqa %%xmm6,%%xmm1 \n"
7693 "punpcklwd %%xmm3,%%xmm6 \n"
7694 "punpckhwd %%xmm3,%%xmm1 \n"
7695 "movdqu %%xmm6,(%1) \n"
7696 "movdqu %%xmm1,0x10(%1) \n"
7697 "lea 0x20(%1),%1 \n"
7698 "sub $0x8,%2 \n"
7699 "jg 1b \n"
7700 : "+r"(src_argb), // %0
7701 "+r"(dst_argb), // %1
7702 "+r"(width) // %2
7703 : "m"(kARGBToYJ), // %3
7704 "m"(kSub128) // %4
7705 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
7706 }
7707 #endif // HAS_ARGBGRAYROW_SSSE3
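
// Scalar sketch of the gray conversion: full-range BT.601 luma using the
// kARGBToYJ weights, Y = (29*B + 150*G + 77*R + 128) >> 8, written to all
// three color channels with alpha preserved. The _C_Sketch name is
// illustrative only.
static void ARGBGrayRow_C_Sketch(const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t y = (uint8_t)((29 * src_argb[0] + 150 * src_argb[1] +
                           77 * src_argb[2] + 128) >> 8);
    dst_argb[0] = dst_argb[1] = dst_argb[2] = y;
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}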
7708
7709 #ifdef HAS_ARGBSEPIAROW_SSSE3
7710 // b = (r * 35 + g * 68 + b * 17) >> 7
7711 // g = (r * 45 + g * 88 + b * 22) >> 7
7712 // r = (r * 50 + g * 98 + b * 24) >> 7
7713 // Constant for ARGB color to sepia tone
7714 static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
7715 17, 68, 35, 0, 17, 68, 35, 0};
7716
7717 static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
7718 22, 88, 45, 0, 22, 88, 45, 0};
7719
7720 static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
7721 24, 98, 50, 0, 24, 98, 50, 0};
7722
7723 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
7724 void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
7725 asm volatile(
7726 "movdqa %2,%%xmm2 \n"
7727 "movdqa %3,%%xmm3 \n"
7728 "movdqa %4,%%xmm4 \n"
7729
7730 // 8 pixel loop.
7731 LABELALIGN
7732 "1: \n"
7733 "movdqu (%0),%%xmm0 \n"
7734 "movdqu 0x10(%0),%%xmm6 \n"
7735 "pmaddubsw %%xmm2,%%xmm0 \n"
7736 "pmaddubsw %%xmm2,%%xmm6 \n"
7737 "phaddw %%xmm6,%%xmm0 \n"
7738 "psrlw $0x7,%%xmm0 \n"
7739 "packuswb %%xmm0,%%xmm0 \n"
7740 "movdqu (%0),%%xmm5 \n"
7741 "movdqu 0x10(%0),%%xmm1 \n"
7742 "pmaddubsw %%xmm3,%%xmm5 \n"
7743 "pmaddubsw %%xmm3,%%xmm1 \n"
7744 "phaddw %%xmm1,%%xmm5 \n"
7745 "psrlw $0x7,%%xmm5 \n"
7746 "packuswb %%xmm5,%%xmm5 \n"
7747 "punpcklbw %%xmm5,%%xmm0 \n"
7748 "movdqu (%0),%%xmm5 \n"
7749 "movdqu 0x10(%0),%%xmm1 \n"
7750 "pmaddubsw %%xmm4,%%xmm5 \n"
7751 "pmaddubsw %%xmm4,%%xmm1 \n"
7752 "phaddw %%xmm1,%%xmm5 \n"
7753 "psrlw $0x7,%%xmm5 \n"
7754 "packuswb %%xmm5,%%xmm5 \n"
7755 "movdqu (%0),%%xmm6 \n"
7756 "movdqu 0x10(%0),%%xmm1 \n"
7757 "psrld $0x18,%%xmm6 \n"
7758 "psrld $0x18,%%xmm1 \n"
7759 "packuswb %%xmm1,%%xmm6 \n"
7760 "packuswb %%xmm6,%%xmm6 \n"
7761 "punpcklbw %%xmm6,%%xmm5 \n"
7762 "movdqa %%xmm0,%%xmm1 \n"
7763 "punpcklwd %%xmm5,%%xmm0 \n"
7764 "punpckhwd %%xmm5,%%xmm1 \n"
7765 "movdqu %%xmm0,(%0) \n"
7766 "movdqu %%xmm1,0x10(%0) \n"
7767 "lea 0x20(%0),%0 \n"
7768 "sub $0x8,%1 \n"
7769 "jg 1b \n"
7770 : "+r"(dst_argb), // %0
7771 "+r"(width) // %1
7772 : "m"(kARGBToSepiaB), // %2
7773 "m"(kARGBToSepiaG), // %3
7774 "m"(kARGBToSepiaR) // %4
7775 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
7776 }
7777 #endif // HAS_ARGBSEPIAROW_SSSE3
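
// The sepia matrix from the comment above as scalar C, with the saturation
// that packuswb provides. Operates in place, like the SSSE3 row. The
// _C_Sketch name is illustrative only.
static void ARGBSepiaRow_C_Sketch(uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int b = dst_argb[0];
    int g = dst_argb[1];
    int r = dst_argb[2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8_t)(sr > 255 ? 255 : sr);  // alpha byte is kept.
    dst_argb += 4;
  }
}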
7778
7779 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
7780 // Transform 8 ARGB pixels (32 bytes) with color matrix.
7781 // Same as Sepia except matrix is provided.
7782 void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
7783 uint8_t* dst_argb,
7784 const int8_t* matrix_argb,
7785 int width) {
7786 asm volatile(
7787 "movdqu (%3),%%xmm5 \n"
7788 "pshufd $0x00,%%xmm5,%%xmm2 \n"
7789 "pshufd $0x55,%%xmm5,%%xmm3 \n"
7790 "pshufd $0xaa,%%xmm5,%%xmm4 \n"
7791 "pshufd $0xff,%%xmm5,%%xmm5 \n"
7792
7793 // 8 pixel loop.
7794 LABELALIGN
7795 "1: \n"
7796 "movdqu (%0),%%xmm0 \n"
7797 "movdqu 0x10(%0),%%xmm7 \n"
7798 "pmaddubsw %%xmm2,%%xmm0 \n"
7799 "pmaddubsw %%xmm2,%%xmm7 \n"
7800 "movdqu (%0),%%xmm6 \n"
7801 "movdqu 0x10(%0),%%xmm1 \n"
7802 "pmaddubsw %%xmm3,%%xmm6 \n"
7803 "pmaddubsw %%xmm3,%%xmm1 \n"
7804 "phaddsw %%xmm7,%%xmm0 \n"
7805 "phaddsw %%xmm1,%%xmm6 \n"
7806 "psraw $0x6,%%xmm0 \n"
7807 "psraw $0x6,%%xmm6 \n"
7808 "packuswb %%xmm0,%%xmm0 \n"
7809 "packuswb %%xmm6,%%xmm6 \n"
7810 "punpcklbw %%xmm6,%%xmm0 \n"
7811 "movdqu (%0),%%xmm1 \n"
7812 "movdqu 0x10(%0),%%xmm7 \n"
7813 "pmaddubsw %%xmm4,%%xmm1 \n"
7814 "pmaddubsw %%xmm4,%%xmm7 \n"
7815 "phaddsw %%xmm7,%%xmm1 \n"
7816 "movdqu (%0),%%xmm6 \n"
7817 "movdqu 0x10(%0),%%xmm7 \n"
7818 "pmaddubsw %%xmm5,%%xmm6 \n"
7819 "pmaddubsw %%xmm5,%%xmm7 \n"
7820 "phaddsw %%xmm7,%%xmm6 \n"
7821 "psraw $0x6,%%xmm1 \n"
7822 "psraw $0x6,%%xmm6 \n"
7823 "packuswb %%xmm1,%%xmm1 \n"
7824 "packuswb %%xmm6,%%xmm6 \n"
7825 "punpcklbw %%xmm6,%%xmm1 \n"
7826 "movdqa %%xmm0,%%xmm6 \n"
7827 "punpcklwd %%xmm1,%%xmm0 \n"
7828 "punpckhwd %%xmm1,%%xmm6 \n"
7829 "movdqu %%xmm0,(%1) \n"
7830 "movdqu %%xmm6,0x10(%1) \n"
7831 "lea 0x20(%0),%0 \n"
7832 "lea 0x20(%1),%1 \n"
7833 "sub $0x8,%2 \n"
7834 "jg 1b \n"
7835 : "+r"(src_argb), // %0
7836 "+r"(dst_argb), // %1
7837 "+r"(width) // %2
7838 : "r"(matrix_argb) // %3
7839 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7840 "xmm7");
7841 }
7842 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
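
// Scalar sketch of the color matrix transform: each output channel is a
// signed dot product of the B,G,R,A input bytes with one row of the 4x4
// matrix, scaled by >> 6 and saturated. The _C_Sketch name is illustrative
// only.
static void ARGBColorMatrixRow_C_Sketch(const uint8_t* src_argb,
                                        uint8_t* dst_argb,
                                        const int8_t* matrix_argb,
                                        int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      const int8_t* m = matrix_argb + c * 4;
      int v = (src_argb[0] * m[0] + src_argb[1] * m[1] + src_argb[2] * m[2] +
               src_argb[3] * m[3]) >> 6;
      dst_argb[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}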
7843
7844 #ifdef HAS_ARGBQUANTIZEROW_SSE2
7845 // Quantize 4 ARGB pixels (16 bytes).
7846 void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
7847 int scale,
7848 int interval_size,
7849 int interval_offset,
7850 int width) {
7851 asm volatile(
7852 "movd %2,%%xmm2 \n"
7853 "movd %3,%%xmm3 \n"
7854 "movd %4,%%xmm4 \n"
7855 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
7856 "pshufd $0x44,%%xmm2,%%xmm2 \n"
7857 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
7858 "pshufd $0x44,%%xmm3,%%xmm3 \n"
7859 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
7860 "pshufd $0x44,%%xmm4,%%xmm4 \n"
7861 "pxor %%xmm5,%%xmm5 \n"
7862 "pcmpeqb %%xmm6,%%xmm6 \n"
7863 "pslld $0x18,%%xmm6 \n"
7864
7865 // 4 pixel loop.
7866 LABELALIGN
7867 "1: \n"
7868 "movdqu (%0),%%xmm0 \n"
7869 "punpcklbw %%xmm5,%%xmm0 \n"
7870 "pmulhuw %%xmm2,%%xmm0 \n"
7871 "movdqu (%0),%%xmm1 \n"
7872 "punpckhbw %%xmm5,%%xmm1 \n"
7873 "pmulhuw %%xmm2,%%xmm1 \n"
7874 "pmullw %%xmm3,%%xmm0 \n"
7875 "movdqu (%0),%%xmm7 \n"
7876 "pmullw %%xmm3,%%xmm1 \n"
7877 "pand %%xmm6,%%xmm7 \n"
7878 "paddw %%xmm4,%%xmm0 \n"
7879 "paddw %%xmm4,%%xmm1 \n"
7880 "packuswb %%xmm1,%%xmm0 \n"
7881 "por %%xmm7,%%xmm0 \n"
7882 "movdqu %%xmm0,(%0) \n"
7883 "lea 0x10(%0),%0 \n"
7884 "sub $0x4,%1 \n"
7885 "jg 1b \n"
7886 : "+r"(dst_argb), // %0
7887 "+r"(width) // %1
7888 : "r"(scale), // %2
7889 "r"(interval_size), // %3
7890 "r"(interval_offset) // %4
7891 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
7892 "xmm7");
7893 }
7894 #endif // HAS_ARGBQUANTIZEROW_SSE2
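
// Scalar sketch of the quantization: posterize each color channel into
// bands of interval_size starting at interval_offset; scale is typically a
// 16.16 fixed-point reciprocal of interval_size. Alpha is preserved. The
// _C_Sketch name is illustrative only.
static void ARGBQuantizeRow_C_Sketch(uint8_t* dst_argb,
                                     int scale,
                                     int interval_size,
                                     int interval_offset,
                                     int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 3; ++c) {  // B, G, R; alpha (byte 3) is untouched.
      int v = dst_argb[c];
      dst_argb[c] = (uint8_t)((v * scale >> 16) * interval_size +
                              interval_offset);
    }
    dst_argb += 4;
  }
}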
7895
7896 #ifdef HAS_ARGBSHADEROW_SSE2
7897 // Shade 4 pixels at a time by specified value.
7898 void ARGBShadeRow_SSE2(const uint8_t* src_argb,
7899 uint8_t* dst_argb,
7900 int width,
7901 uint32_t value) {
7902 asm volatile(
7903 "movd %3,%%xmm2 \n"
7904 "punpcklbw %%xmm2,%%xmm2 \n"
7905 "punpcklqdq %%xmm2,%%xmm2 \n"
7906
7907 // 4 pixel loop.
7908 LABELALIGN
7909 "1: \n"
7910 "movdqu (%0),%%xmm0 \n"
7911 "lea 0x10(%0),%0 \n"
7912 "movdqa %%xmm0,%%xmm1 \n"
7913 "punpcklbw %%xmm0,%%xmm0 \n"
7914 "punpckhbw %%xmm1,%%xmm1 \n"
7915 "pmulhuw %%xmm2,%%xmm0 \n"
7916 "pmulhuw %%xmm2,%%xmm1 \n"
7917 "psrlw $0x8,%%xmm0 \n"
7918 "psrlw $0x8,%%xmm1 \n"
7919 "packuswb %%xmm1,%%xmm0 \n"
7920 "movdqu %%xmm0,(%1) \n"
7921 "lea 0x10(%1),%1 \n"
7922 "sub $0x4,%2 \n"
7923 "jg 1b \n"
7924 : "+r"(src_argb), // %0
7925 "+r"(dst_argb), // %1
7926 "+r"(width) // %2
7927 : "r"(value) // %3
7928 : "memory", "cc", "xmm0", "xmm1", "xmm2");
7929 }
7930 #endif // HAS_ARGBSHADEROW_SSE2
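
// Scalar sketch of the shade math. punpcklbw of a value with itself turns a
// byte v into the 16-bit value v * 257, so the pmulhuw/psrlw pair computes
// (c * 257) * (v * 257) >> 24, roughly c * v / 255 per channel. The
// _C_Sketch name is illustrative only.
static void ARGBShadeRow_C_Sketch(const uint8_t* src_argb,
                                  uint8_t* dst_argb,
                                  int width,
                                  uint32_t value) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      uint32_t v = (value >> (8 * c)) & 0xff;
      dst_argb[c] = (uint8_t)((src_argb[c] * 257u * (v * 257u)) >> 24);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}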
7931
7932 #ifdef HAS_ARGBMULTIPLYROW_SSE2
7933 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
7934 void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
7935 const uint8_t* src_argb1,
7936 uint8_t* dst_argb,
7937 int width) {
7938 asm volatile(
7939
7940 "pxor %%xmm5,%%xmm5 \n"
7941
7942 // 4 pixel loop.
7943 LABELALIGN
7944 "1: \n"
7945 "movdqu (%0),%%xmm0 \n"
7946 "lea 0x10(%0),%0 \n"
7947 "movdqu (%1),%%xmm2 \n"
7948 "lea 0x10(%1),%1 \n"
7949 "movdqu %%xmm0,%%xmm1 \n"
7950 "movdqu %%xmm2,%%xmm3 \n"
7951 "punpcklbw %%xmm0,%%xmm0 \n"
7952 "punpckhbw %%xmm1,%%xmm1 \n"
7953 "punpcklbw %%xmm5,%%xmm2 \n"
7954 "punpckhbw %%xmm5,%%xmm3 \n"
7955 "pmulhuw %%xmm2,%%xmm0 \n"
7956 "pmulhuw %%xmm3,%%xmm1 \n"
7957 "packuswb %%xmm1,%%xmm0 \n"
7958 "movdqu %%xmm0,(%2) \n"
7959 "lea 0x10(%2),%2 \n"
7960 "sub $0x4,%3 \n"
7961 "jg 1b \n"
7962 : "+r"(src_argb), // %0
7963 "+r"(src_argb1), // %1
7964 "+r"(dst_argb), // %2
7965 "+r"(width) // %3
7966 :
7967 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
7968 }
7969 #endif // HAS_ARGBMULTIPLYROW_SSE2
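
// Scalar sketch of the multiply: the punpcklbw-with-self plus pmulhuw
// sequence is (a * 257) * b >> 16 per byte, an 8.8 fixed-point product
// close to a * b / 255. The _C_Sketch name is illustrative only.
static void ARGBMultiplyRow_C_Sketch(const uint8_t* src_argb,
                                     const uint8_t* src_argb1,
                                     uint8_t* dst_argb,
                                     int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    dst_argb[i] = (uint8_t)((src_argb[i] * 257u * src_argb1[i]) >> 16);
  }
}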
7970
7971 #ifdef HAS_ARGBMULTIPLYROW_AVX2
7972 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
7973 void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
7974 const uint8_t* src_argb1,
7975 uint8_t* dst_argb,
7976 int width) {
7977 asm volatile(
7978
7979 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
7980
7981 // 8 pixel loop.
7982 LABELALIGN
7983 "1: \n"
7984 "vmovdqu (%0),%%ymm1 \n"
7985 "lea 0x20(%0),%0 \n"
7986 "vmovdqu (%1),%%ymm3 \n"
7987 "lea 0x20(%1),%1 \n"
7988 "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
7989 "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
7990 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
7991 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
7992 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
7993 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
7994 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
7995 "vmovdqu %%ymm0,(%2) \n"
7996 "lea 0x20(%2),%2 \n"
7997 "sub $0x8,%3 \n"
7998 "jg 1b \n"
7999 "vzeroupper \n"
8000 : "+r"(src_argb), // %0
8001 "+r"(src_argb1), // %1
8002 "+r"(dst_argb), // %2
8003 "+r"(width) // %3
8004 :
8005 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
8006 }
8007 #endif // HAS_ARGBMULTIPLYROW_AVX2
8008
8009 #ifdef HAS_ARGBADDROW_SSE2
8010 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
8011 void ARGBAddRow_SSE2(const uint8_t* src_argb,
8012 const uint8_t* src_argb1,
8013 uint8_t* dst_argb,
8014 int width) {
8015 asm volatile(
8016 // 4 pixel loop.
8017 LABELALIGN
8018 "1: \n"
8019 "movdqu (%0),%%xmm0 \n"
8020 "lea 0x10(%0),%0 \n"
8021 "movdqu (%1),%%xmm1 \n"
8022 "lea 0x10(%1),%1 \n"
8023 "paddusb %%xmm1,%%xmm0 \n"
8024 "movdqu %%xmm0,(%2) \n"
8025 "lea 0x10(%2),%2 \n"
8026 "sub $0x4,%3 \n"
8027 "jg 1b \n"
8028 : "+r"(src_argb), // %0
8029 "+r"(src_argb1), // %1
8030 "+r"(dst_argb), // %2
8031 "+r"(width) // %3
8032 :
8033 : "memory", "cc", "xmm0", "xmm1");
8034 }
8035 #endif // HAS_ARGBADDROW_SSE2
8036
8037 #ifdef HAS_ARGBADDROW_AVX2
8038 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
8039 void ARGBAddRow_AVX2(const uint8_t* src_argb,
8040 const uint8_t* src_argb1,
8041 uint8_t* dst_argb,
8042 int width) {
8043 asm volatile(
8044 // 8 pixel loop.
8045 LABELALIGN
8046 "1: \n"
8047 "vmovdqu (%0),%%ymm0 \n"
8048 "lea 0x20(%0),%0 \n"
8049 "vpaddusb (%1),%%ymm0,%%ymm0 \n"
8050 "lea 0x20(%1),%1 \n"
8051 "vmovdqu %%ymm0,(%2) \n"
8052 "lea 0x20(%2),%2 \n"
8053 "sub $0x8,%3 \n"
8054 "jg 1b \n"
8055 "vzeroupper \n"
8056 : "+r"(src_argb), // %0
8057 "+r"(src_argb1), // %1
8058 "+r"(dst_argb), // %2
8059 "+r"(width) // %3
8060 :
8061 : "memory", "cc", "xmm0");
8062 }
8063 #endif // HAS_ARGBADDROW_AVX2
8064
8065 #ifdef HAS_ARGBSUBTRACTROW_SSE2
8066 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
8067 void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
8068 const uint8_t* src_argb1,
8069 uint8_t* dst_argb,
8070 int width) {
8071 asm volatile(
8072 // 4 pixel loop.
8073 LABELALIGN
8074 "1: \n"
8075 "movdqu (%0),%%xmm0 \n"
8076 "lea 0x10(%0),%0 \n"
8077 "movdqu (%1),%%xmm1 \n"
8078 "lea 0x10(%1),%1 \n"
8079 "psubusb %%xmm1,%%xmm0 \n"
8080 "movdqu %%xmm0,(%2) \n"
8081 "lea 0x10(%2),%2 \n"
8082 "sub $0x4,%3 \n"
8083 "jg 1b \n"
8084 : "+r"(src_argb), // %0
8085 "+r"(src_argb1), // %1
8086 "+r"(dst_argb), // %2
8087 "+r"(width) // %3
8088 :
8089 : "memory", "cc", "xmm0", "xmm1");
8090 }
8091 #endif // HAS_ARGBSUBTRACTROW_SSE2
8092
8093 #ifdef HAS_ARGBSUBTRACTROW_AVX2
8094 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
8095 void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
8096 const uint8_t* src_argb1,
8097 uint8_t* dst_argb,
8098 int width) {
8099 asm volatile(
8100 // 8 pixel loop.
8101 LABELALIGN
8102 "1: \n"
8103 "vmovdqu (%0),%%ymm0 \n"
8104 "lea 0x20(%0),%0 \n"
8105 "vpsubusb (%1),%%ymm0,%%ymm0 \n"
8106 "lea 0x20(%1),%1 \n"
8107 "vmovdqu %%ymm0,(%2) \n"
8108 "lea 0x20(%2),%2 \n"
8109 "sub $0x8,%3 \n"
8110 "jg 1b \n"
8111 "vzeroupper \n"
8112 : "+r"(src_argb), // %0
8113 "+r"(src_argb1), // %1
8114 "+r"(dst_argb), // %2
8115 "+r"(width) // %3
8116 :
8117 : "memory", "cc", "xmm0");
8118 }
8119 #endif // HAS_ARGBSUBTRACTROW_AVX2
8120
8121 #ifdef HAS_SOBELXROW_SSE2
8122 // SobelX as a matrix is
8123 // -1 0 1
8124 // -2 0 2
8125 // -1 0 1
8126 void SobelXRow_SSE2(const uint8_t* src_y0,
8127 const uint8_t* src_y1,
8128 const uint8_t* src_y2,
8129 uint8_t* dst_sobelx,
8130 int width) {
8131 asm volatile(
8132 "sub %0,%1 \n"
8133 "sub %0,%2 \n"
8134 "sub %0,%3 \n"
8135 "pxor %%xmm5,%%xmm5 \n"
8136
8137 // 8 pixel loop.
8138 LABELALIGN
8139 "1: \n"
8140 "movq (%0),%%xmm0 \n"
8141 "movq 0x2(%0),%%xmm1 \n"
8142 "punpcklbw %%xmm5,%%xmm0 \n"
8143 "punpcklbw %%xmm5,%%xmm1 \n"
8144 "psubw %%xmm1,%%xmm0 \n"
8145 "movq 0x00(%0,%1,1),%%xmm1 \n"
8146 "movq 0x02(%0,%1,1),%%xmm2 \n"
8147 "punpcklbw %%xmm5,%%xmm1 \n"
8148 "punpcklbw %%xmm5,%%xmm2 \n"
8149 "psubw %%xmm2,%%xmm1 \n"
8150 "movq 0x00(%0,%2,1),%%xmm2 \n"
8151 "movq 0x02(%0,%2,1),%%xmm3 \n"
8152 "punpcklbw %%xmm5,%%xmm2 \n"
8153 "punpcklbw %%xmm5,%%xmm3 \n"
8154 "psubw %%xmm3,%%xmm2 \n"
8155 "paddw %%xmm2,%%xmm0 \n"
8156 "paddw %%xmm1,%%xmm0 \n"
8157 "paddw %%xmm1,%%xmm0 \n"
8158 "pxor %%xmm1,%%xmm1 \n"
8159 "psubw %%xmm0,%%xmm1 \n"
8160 "pmaxsw %%xmm1,%%xmm0 \n"
8161 "packuswb %%xmm0,%%xmm0 \n"
8162 "movq %%xmm0,0x00(%0,%3,1) \n"
8163 "lea 0x8(%0),%0 \n"
8164 "sub $0x8,%4 \n"
8165 "jg 1b \n"
8166 : "+r"(src_y0), // %0
8167 "+r"(src_y1), // %1
8168 "+r"(src_y2), // %2
8169 "+r"(dst_sobelx), // %3
8170 "+r"(width) // %4
8171 :
8172 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
8173 }
8174 #endif // HAS_SOBELXROW_SSE2
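
// Scalar sketch of Sobel X: apply the kernel above across three rows, two
// columns apart, and store the saturated absolute value. SobelY below is
// the same computation with rows and columns exchanged. The _C_Sketch name
// is illustrative only.
static void SobelXRow_C_Sketch(const uint8_t* src_y0,
                               const uint8_t* src_y1,
                               const uint8_t* src_y2,
                               uint8_t* dst_sobelx,
                               int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int a = src_y0[x] - src_y0[x + 2];
    int b = src_y1[x] - src_y1[x + 2];
    int c = src_y2[x] - src_y2[x + 2];
    int s = a + b * 2 + c;
    if (s < 0) s = -s;
    dst_sobelx[x] = (uint8_t)(s > 255 ? 255 : s);
  }
}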
8175
8176 #ifdef HAS_SOBELYROW_SSE2
8177 // SobelY as a matrix is
8178 // -1 -2 -1
8179 // 0 0 0
8180 // 1 2 1
8181 void SobelYRow_SSE2(const uint8_t* src_y0,
8182 const uint8_t* src_y1,
8183 uint8_t* dst_sobely,
8184 int width) {
8185 asm volatile(
8186 "sub %0,%1 \n"
8187 "sub %0,%2 \n"
8188 "pxor %%xmm5,%%xmm5 \n"
8189
8190 // 8 pixel loop.
8191 LABELALIGN
8192 "1: \n"
8193 "movq (%0),%%xmm0 \n"
8194 "movq 0x00(%0,%1,1),%%xmm1 \n"
8195 "punpcklbw %%xmm5,%%xmm0 \n"
8196 "punpcklbw %%xmm5,%%xmm1 \n"
8197 "psubw %%xmm1,%%xmm0 \n"
8198 "movq 0x1(%0),%%xmm1 \n"
8199 "movq 0x01(%0,%1,1),%%xmm2 \n"
8200 "punpcklbw %%xmm5,%%xmm1 \n"
8201 "punpcklbw %%xmm5,%%xmm2 \n"
8202 "psubw %%xmm2,%%xmm1 \n"
8203 "movq 0x2(%0),%%xmm2 \n"
8204 "movq 0x02(%0,%1,1),%%xmm3 \n"
8205 "punpcklbw %%xmm5,%%xmm2 \n"
8206 "punpcklbw %%xmm5,%%xmm3 \n"
8207 "psubw %%xmm3,%%xmm2 \n"
8208 "paddw %%xmm2,%%xmm0 \n"
8209 "paddw %%xmm1,%%xmm0 \n"
8210 "paddw %%xmm1,%%xmm0 \n"
8211 "pxor %%xmm1,%%xmm1 \n"
8212 "psubw %%xmm0,%%xmm1 \n"
8213 "pmaxsw %%xmm1,%%xmm0 \n"
8214 "packuswb %%xmm0,%%xmm0 \n"
8215 "movq %%xmm0,0x00(%0,%2,1) \n"
8216 "lea 0x8(%0),%0 \n"
8217 "sub $0x8,%3 \n"
8218 "jg 1b \n"
8219 : "+r"(src_y0), // %0
8220 "+r"(src_y1), // %1
8221 "+r"(dst_sobely), // %2
8222 "+r"(width) // %3
8223 :
8224 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
8225 }
8226 #endif // HAS_SOBELYROW_SSE2
8227
8228 #ifdef HAS_SOBELROW_SSE2
8229 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
8230 // A = 255
8231 // R = Sobel
8232 // G = Sobel
8233 // B = Sobel
8234 void SobelRow_SSE2(const uint8_t* src_sobelx,
8235 const uint8_t* src_sobely,
8236 uint8_t* dst_argb,
8237 int width) {
8238 asm volatile(
8239 "sub %0,%1 \n"
8240 "pcmpeqb %%xmm5,%%xmm5 \n"
8241 "pslld $0x18,%%xmm5 \n"
8242
8243 // 16 pixel loop.
8244 LABELALIGN
8245 "1: \n"
8246 "movdqu (%0),%%xmm0 \n"
8247 "movdqu 0x00(%0,%1,1),%%xmm1 \n"
8248 "lea 0x10(%0),%0 \n"
8249 "paddusb %%xmm1,%%xmm0 \n"
8250 "movdqa %%xmm0,%%xmm2 \n"
8251 "punpcklbw %%xmm0,%%xmm2 \n"
8252 "punpckhbw %%xmm0,%%xmm0 \n"
8253 "movdqa %%xmm2,%%xmm1 \n"
8254 "punpcklwd %%xmm2,%%xmm1 \n"
8255 "punpckhwd %%xmm2,%%xmm2 \n"
8256 "por %%xmm5,%%xmm1 \n"
8257 "por %%xmm5,%%xmm2 \n"
8258 "movdqa %%xmm0,%%xmm3 \n"
8259 "punpcklwd %%xmm0,%%xmm3 \n"
8260 "punpckhwd %%xmm0,%%xmm0 \n"
8261 "por %%xmm5,%%xmm3 \n"
8262 "por %%xmm5,%%xmm0 \n"
8263 "movdqu %%xmm1,(%2) \n"
8264 "movdqu %%xmm2,0x10(%2) \n"
8265 "movdqu %%xmm3,0x20(%2) \n"
8266 "movdqu %%xmm0,0x30(%2) \n"
8267 "lea 0x40(%2),%2 \n"
8268 "sub $0x10,%3 \n"
8269 "jg 1b \n"
8270 : "+r"(src_sobelx), // %0
8271 "+r"(src_sobely), // %1
8272 "+r"(dst_argb), // %2
8273 "+r"(width) // %3
8274 :
8275 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
8276 }
8277 #endif // HAS_SOBELROW_SSE2
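
// Scalar sketch of the packing above: the saturated sum of the two Sobel
// planes is replicated into B, G and R with alpha forced opaque. The
// _C_Sketch name is illustrative only.
static void SobelRow_C_Sketch(const uint8_t* src_sobelx,
                              const uint8_t* src_sobely,
                              uint8_t* dst_argb,
                              int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int s = src_sobelx[x] + src_sobely[x];
    uint8_t v = (uint8_t)(s > 255 ? 255 : s);
    dst_argb[x * 4 + 0] = v;
    dst_argb[x * 4 + 1] = v;
    dst_argb[x * 4 + 2] = v;
    dst_argb[x * 4 + 3] = 255;
  }
}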
8278
8279 #ifdef HAS_SOBELTOPLANEROW_SSE2
8280 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
8281 void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
8282 const uint8_t* src_sobely,
8283 uint8_t* dst_y,
8284 int width) {
8285 asm volatile(
8286 "sub %0,%1 \n"
8287 "pcmpeqb %%xmm5,%%xmm5 \n"
8288 "pslld $0x18,%%xmm5 \n"
8289
8290 // 16 pixel loop.
8291 LABELALIGN
8292 "1: \n"
8293 "movdqu (%0),%%xmm0 \n"
8294 "movdqu 0x00(%0,%1,1),%%xmm1 \n"
8295 "lea 0x10(%0),%0 \n"
8296 "paddusb %%xmm1,%%xmm0 \n"
8297 "movdqu %%xmm0,(%2) \n"
8298 "lea 0x10(%2),%2 \n"
8299 "sub $0x10,%3 \n"
8300 "jg 1b \n"
8301 : "+r"(src_sobelx), // %0
8302 "+r"(src_sobely), // %1
8303 "+r"(dst_y), // %2
8304 "+r"(width) // %3
8305 :
8306 : "memory", "cc", "xmm0", "xmm1");
8307 }
8308 #endif // HAS_SOBELTOPLANEROW_SSE2
8309
8310 #ifdef HAS_SOBELXYROW_SSE2
8311 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
8312 // A = 255
8313 // R = Sobel X
8314 // G = Sobel
8315 // B = Sobel Y
8316 void SobelXYRow_SSE2(const uint8_t* src_sobelx,
8317 const uint8_t* src_sobely,
8318 uint8_t* dst_argb,
8319 int width) {
8320 asm volatile(
8321 "sub %0,%1 \n"
8322 "pcmpeqb %%xmm5,%%xmm5 \n"
8323
8324 // 16 pixel loop.
8325 LABELALIGN
8326 "1: \n"
8327 "movdqu (%0),%%xmm0 \n"
8328 "movdqu 0x00(%0,%1,1),%%xmm1 \n"
8329 "lea 0x10(%0),%0 \n"
8330 "movdqa %%xmm0,%%xmm2 \n"
8331 "paddusb %%xmm1,%%xmm2 \n"
8332 "movdqa %%xmm0,%%xmm3 \n"
8333 "punpcklbw %%xmm5,%%xmm3 \n"
8334 "punpckhbw %%xmm5,%%xmm0 \n"
8335 "movdqa %%xmm1,%%xmm4 \n"
8336 "punpcklbw %%xmm2,%%xmm4 \n"
8337 "punpckhbw %%xmm2,%%xmm1 \n"
8338 "movdqa %%xmm4,%%xmm6 \n"
8339 "punpcklwd %%xmm3,%%xmm6 \n"
8340 "punpckhwd %%xmm3,%%xmm4 \n"
8341 "movdqa %%xmm1,%%xmm7 \n"
8342 "punpcklwd %%xmm0,%%xmm7 \n"
8343 "punpckhwd %%xmm0,%%xmm1 \n"
8344 "movdqu %%xmm6,(%2) \n"
8345 "movdqu %%xmm4,0x10(%2) \n"
8346 "movdqu %%xmm7,0x20(%2) \n"
8347 "movdqu %%xmm1,0x30(%2) \n"
8348 "lea 0x40(%2),%2 \n"
8349 "sub $0x10,%3 \n"
8350 "jg 1b \n"
8351 : "+r"(src_sobelx), // %0
8352 "+r"(src_sobely), // %1
8353 "+r"(dst_argb), // %2
8354 "+r"(width) // %3
8355 :
8356 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
8357 "xmm7");
8358 }
8359 #endif // HAS_SOBELXYROW_SSE2
8360
8361 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
8362 // Creates a table of cumulative sums where each value is a sum of all values
8363 // above and to the left of the value, inclusive of the value.
8364 void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
8365 int32_t* cumsum,
8366 const int32_t* previous_cumsum,
8367 int width) {
8368 asm volatile(
8369 "pxor %%xmm0,%%xmm0 \n"
8370 "pxor %%xmm1,%%xmm1 \n"
8371 "sub $0x4,%3 \n"
8372 "jl 49f \n"
8373 "test $0xf,%1 \n"
8374 "jne 49f \n"
8375
8376 // 4 pixel loop.
8377 LABELALIGN
8378 "40: \n"
8379 "movdqu (%0),%%xmm2 \n"
8380 "lea 0x10(%0),%0 \n"
8381 "movdqa %%xmm2,%%xmm4 \n"
8382 "punpcklbw %%xmm1,%%xmm2 \n"
8383 "movdqa %%xmm2,%%xmm3 \n"
8384 "punpcklwd %%xmm1,%%xmm2 \n"
8385 "punpckhwd %%xmm1,%%xmm3 \n"
8386 "punpckhbw %%xmm1,%%xmm4 \n"
8387 "movdqa %%xmm4,%%xmm5 \n"
8388 "punpcklwd %%xmm1,%%xmm4 \n"
8389 "punpckhwd %%xmm1,%%xmm5 \n"
8390 "paddd %%xmm2,%%xmm0 \n"
8391 "movdqu (%2),%%xmm2 \n"
8392 "paddd %%xmm0,%%xmm2 \n"
8393 "paddd %%xmm3,%%xmm0 \n"
8394 "movdqu 0x10(%2),%%xmm3 \n"
8395 "paddd %%xmm0,%%xmm3 \n"
8396 "paddd %%xmm4,%%xmm0 \n"
8397 "movdqu 0x20(%2),%%xmm4 \n"
8398 "paddd %%xmm0,%%xmm4 \n"
8399 "paddd %%xmm5,%%xmm0 \n"
8400 "movdqu 0x30(%2),%%xmm5 \n"
8401 "lea 0x40(%2),%2 \n"
8402 "paddd %%xmm0,%%xmm5 \n"
8403 "movdqu %%xmm2,(%1) \n"
8404 "movdqu %%xmm3,0x10(%1) \n"
8405 "movdqu %%xmm4,0x20(%1) \n"
8406 "movdqu %%xmm5,0x30(%1) \n"
8407 "lea 0x40(%1),%1 \n"
8408 "sub $0x4,%3 \n"
8409 "jge 40b \n"
8410
8411 "49: \n"
8412 "add $0x3,%3 \n"
8413 "jl 19f \n"
8414
8415 // 1 pixel loop.
8416 LABELALIGN
8417 "10: \n"
8418 "movd (%0),%%xmm2 \n"
8419 "lea 0x4(%0),%0 \n"
8420 "punpcklbw %%xmm1,%%xmm2 \n"
8421 "punpcklwd %%xmm1,%%xmm2 \n"
8422 "paddd %%xmm2,%%xmm0 \n"
8423 "movdqu (%2),%%xmm2 \n"
8424 "lea 0x10(%2),%2 \n"
8425 "paddd %%xmm0,%%xmm2 \n"
8426 "movdqu %%xmm2,(%1) \n"
8427 "lea 0x10(%1),%1 \n"
8428 "sub $0x1,%3 \n"
8429 "jge 10b \n"
8430
8431 "19: \n"
8432 : "+r"(row), // %0
8433 "+r"(cumsum), // %1
8434 "+r"(previous_cumsum), // %2
8435 "+r"(width) // %3
8436 :
8437 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
8438 }
8439 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
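
// Scalar sketch of the recurrence: keep a running sum of each ARGB channel
// across the row and add the cumulative row above, so every output holds
// the sum of the rectangle back to the top-left corner. The _C_Sketch name
// is illustrative only.
static void ComputeCumulativeSumRow_C_Sketch(const uint8_t* row,
                                             int32_t* cumsum,
                                             const int32_t* previous_cumsum,
                                             int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}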
8440
8441 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
8442 void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
8443 const int32_t* botleft,
8444 int width,
8445 int area,
8446 uint8_t* dst,
8447 int count) {
8448 asm volatile(
8449 "movd %5,%%xmm5 \n"
8450 "cvtdq2ps %%xmm5,%%xmm5 \n"
8451 "rcpss %%xmm5,%%xmm4 \n"
8452 "pshufd $0x0,%%xmm4,%%xmm4 \n"
8453 "sub $0x4,%3 \n"
8454 "jl 49f \n"
8455 "cmpl $0x80,%5 \n"
8456 "ja 40f \n"
8457
8458 "pshufd $0x0,%%xmm5,%%xmm5 \n"
8459 "pcmpeqb %%xmm6,%%xmm6 \n"
8460 "psrld $0x10,%%xmm6 \n"
8461 "cvtdq2ps %%xmm6,%%xmm6 \n"
8462 "addps %%xmm6,%%xmm5 \n"
8463 "mulps %%xmm4,%%xmm5 \n"
8464 "cvtps2dq %%xmm5,%%xmm5 \n"
8465 "packssdw %%xmm5,%%xmm5 \n"
8466
8467 // 4 pixel small loop.
8468 LABELALIGN
8469 "4: \n"
8470 "movdqu (%0),%%xmm0 \n"
8471 "movdqu 0x10(%0),%%xmm1 \n"
8472 "movdqu 0x20(%0),%%xmm2 \n"
8473 "movdqu 0x30(%0),%%xmm3 \n"
8474 "psubd 0x00(%0,%4,4),%%xmm0 \n"
8475 "psubd 0x10(%0,%4,4),%%xmm1 \n"
8476 "psubd 0x20(%0,%4,4),%%xmm2 \n"
8477 "psubd 0x30(%0,%4,4),%%xmm3 \n"
8478 "lea 0x40(%0),%0 \n"
8479 "psubd (%1),%%xmm0 \n"
8480 "psubd 0x10(%1),%%xmm1 \n"
8481 "psubd 0x20(%1),%%xmm2 \n"
8482 "psubd 0x30(%1),%%xmm3 \n"
8483 "paddd 0x00(%1,%4,4),%%xmm0 \n"
8484 "paddd 0x10(%1,%4,4),%%xmm1 \n"
8485 "paddd 0x20(%1,%4,4),%%xmm2 \n"
8486 "paddd 0x30(%1,%4,4),%%xmm3 \n"
8487 "lea 0x40(%1),%1 \n"
8488 "packssdw %%xmm1,%%xmm0 \n"
8489 "packssdw %%xmm3,%%xmm2 \n"
8490 "pmulhuw %%xmm5,%%xmm0 \n"
8491 "pmulhuw %%xmm5,%%xmm2 \n"
8492 "packuswb %%xmm2,%%xmm0 \n"
8493 "movdqu %%xmm0,(%2) \n"
8494 "lea 0x10(%2),%2 \n"
8495 "sub $0x4,%3 \n"
8496 "jge 4b \n"
8497 "jmp 49f \n"
8498
8499 // 4 pixel loop
8500 LABELALIGN
8501 "40: \n"
8502 "movdqu (%0),%%xmm0 \n"
8503 "movdqu 0x10(%0),%%xmm1 \n"
8504 "movdqu 0x20(%0),%%xmm2 \n"
8505 "movdqu 0x30(%0),%%xmm3 \n"
8506 "psubd 0x00(%0,%4,4),%%xmm0 \n"
8507 "psubd 0x10(%0,%4,4),%%xmm1 \n"
8508 "psubd 0x20(%0,%4,4),%%xmm2 \n"
8509 "psubd 0x30(%0,%4,4),%%xmm3 \n"
8510 "lea 0x40(%0),%0 \n"
8511 "psubd (%1),%%xmm0 \n"
8512 "psubd 0x10(%1),%%xmm1 \n"
8513 "psubd 0x20(%1),%%xmm2 \n"
8514 "psubd 0x30(%1),%%xmm3 \n"
8515 "paddd 0x00(%1,%4,4),%%xmm0 \n"
8516 "paddd 0x10(%1,%4,4),%%xmm1 \n"
8517 "paddd 0x20(%1,%4,4),%%xmm2 \n"
8518 "paddd 0x30(%1,%4,4),%%xmm3 \n"
8519 "lea 0x40(%1),%1 \n"
8520 "cvtdq2ps %%xmm0,%%xmm0 \n"
8521 "cvtdq2ps %%xmm1,%%xmm1 \n"
8522 "mulps %%xmm4,%%xmm0 \n"
8523 "mulps %%xmm4,%%xmm1 \n"
8524 "cvtdq2ps %%xmm2,%%xmm2 \n"
8525 "cvtdq2ps %%xmm3,%%xmm3 \n"
8526 "mulps %%xmm4,%%xmm2 \n"
8527 "mulps %%xmm4,%%xmm3 \n"
8528 "cvtps2dq %%xmm0,%%xmm0 \n"
8529 "cvtps2dq %%xmm1,%%xmm1 \n"
8530 "cvtps2dq %%xmm2,%%xmm2 \n"
8531 "cvtps2dq %%xmm3,%%xmm3 \n"
8532 "packssdw %%xmm1,%%xmm0 \n"
8533 "packssdw %%xmm3,%%xmm2 \n"
8534 "packuswb %%xmm2,%%xmm0 \n"
8535 "movdqu %%xmm0,(%2) \n"
8536 "lea 0x10(%2),%2 \n"
8537 "sub $0x4,%3 \n"
8538 "jge 40b \n"
8539
8540 "49: \n"
8541 "add $0x3,%3 \n"
8542 "jl 19f \n"
8543
8544 // 1 pixel loop
8545 LABELALIGN
8546 "10: \n"
8547 "movdqu (%0),%%xmm0 \n"
8548 "psubd 0x00(%0,%4,4),%%xmm0 \n"
8549 "lea 0x10(%0),%0 \n"
8550 "psubd (%1),%%xmm0 \n"
8551 "paddd 0x00(%1,%4,4),%%xmm0 \n"
8552 "lea 0x10(%1),%1 \n"
8553 "cvtdq2ps %%xmm0,%%xmm0 \n"
8554 "mulps %%xmm4,%%xmm0 \n"
8555 "cvtps2dq %%xmm0,%%xmm0 \n"
8556 "packssdw %%xmm0,%%xmm0 \n"
8557 "packuswb %%xmm0,%%xmm0 \n"
8558 "movd %%xmm0,(%2) \n"
8559 "lea 0x4(%2),%2 \n"
8560 "sub $0x1,%3 \n"
8561 "jge 10b \n"
8562 "19: \n"
8563 : "+r"(topleft), // %0
8564 "+r"(botleft), // %1
8565 "+r"(dst), // %2
8566 "+rm"(count) // %3
8567 : "r"((intptr_t)(width)), // %4
8568 "rm"(area) // %5
8569 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
8570 }
8571 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
8572
8573 #ifdef HAS_ARGBAFFINEROW_SSE2
8574 // Copy ARGB pixels from source image with slope to a row of destination.
8575 LIBYUV_API
8576 void ARGBAffineRow_SSE2(const uint8_t* src_argb,
8577 int src_argb_stride,
8578 uint8_t* dst_argb,
8579 const float* src_dudv,
8580 int width) {
8581 intptr_t src_argb_stride_temp = src_argb_stride;
8582 intptr_t temp;
8583 asm volatile(
8584 "movq (%3),%%xmm2 \n"
8585 "movq 0x08(%3),%%xmm7 \n"
8586 "shl $0x10,%1 \n"
8587 "add $0x4,%1 \n"
8588 "movd %1,%%xmm5 \n"
8589 "sub $0x4,%4 \n"
8590 "jl 49f \n"
8591
8592 "pshufd $0x44,%%xmm7,%%xmm7 \n"
8593 "pshufd $0x0,%%xmm5,%%xmm5 \n"
8594 "movdqa %%xmm2,%%xmm0 \n"
8595 "addps %%xmm7,%%xmm0 \n"
8596 "movlhps %%xmm0,%%xmm2 \n"
8597 "movdqa %%xmm7,%%xmm4 \n"
8598 "addps %%xmm4,%%xmm4 \n"
8599 "movdqa %%xmm2,%%xmm3 \n"
8600 "addps %%xmm4,%%xmm3 \n"
8601 "addps %%xmm4,%%xmm4 \n"
8602
8603 // 4 pixel loop
8604 LABELALIGN
8605 "40: \n"
8606 "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
8607 "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
8608 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
8609 "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
8610 "movd %%xmm0,%k1 \n"
8611 "pshufd $0x39,%%xmm0,%%xmm0 \n"
8612 "movd %%xmm0,%k5 \n"
8613 "pshufd $0x39,%%xmm0,%%xmm0 \n"
8614 "movd 0x00(%0,%1,1),%%xmm1 \n"
8615 "movd 0x00(%0,%5,1),%%xmm6 \n"
8616 "punpckldq %%xmm6,%%xmm1 \n"
8617 "addps %%xmm4,%%xmm2 \n"
8618 "movq %%xmm1,(%2) \n"
8619 "movd %%xmm0,%k1 \n"
8620 "pshufd $0x39,%%xmm0,%%xmm0 \n"
8621 "movd %%xmm0,%k5 \n"
8622 "movd 0x00(%0,%1,1),%%xmm0 \n"
8623 "movd 0x00(%0,%5,1),%%xmm6 \n"
8624 "punpckldq %%xmm6,%%xmm0 \n"
8625 "addps %%xmm4,%%xmm3 \n"
8626 "movq %%xmm0,0x08(%2) \n"
8627 "lea 0x10(%2),%2 \n"
8628 "sub $0x4,%4 \n"
8629 "jge 40b \n"
8630
8631 "49: \n"
8632 "add $0x3,%4 \n"
8633 "jl 19f \n"
8634
8635 // 1 pixel loop
8636 LABELALIGN
8637 "10: \n"
8638 "cvttps2dq %%xmm2,%%xmm0 \n"
8639 "packssdw %%xmm0,%%xmm0 \n"
8640 "pmaddwd %%xmm5,%%xmm0 \n"
8641 "addps %%xmm7,%%xmm2 \n"
8642 "movd %%xmm0,%k1 \n"
8643 "movd 0x00(%0,%1,1),%%xmm0 \n"
8644 "movd %%xmm0,(%2) \n"
8645 "lea 0x04(%2),%2 \n"
8646 "sub $0x1,%4 \n"
8647 "jge 10b \n"
8648 "19: \n"
8649 : "+r"(src_argb), // %0
8650 "+r"(src_argb_stride_temp), // %1
8651 "+r"(dst_argb), // %2
8652 "+r"(src_dudv), // %3
8653 "+rm"(width), // %4
8654 "=&r"(temp) // %5
8655 :
8656 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
8657 "xmm7");
8658 }
8659 #endif // HAS_ARGBAFFINEROW_SSE2
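
// Scalar sketch of the affine walk: src_dudv holds {u, v, du, dv} as
// floats; each destination pixel copies the source pixel at the truncated
// (u, v), then steps by (du, dv). The _C_Sketch name is illustrative only.
static void ARGBAffineRow_C_Sketch(const uint8_t* src_argb,
                                   int src_argb_stride,
                                   uint8_t* dst_argb,
                                   const float* src_dudv,
                                   int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  int x;
  for (x = 0; x < width; ++x) {
    int ui = (int)u;  // cvttps2dq also truncates toward zero.
    int vi = (int)v;
    const uint8_t* src = src_argb + vi * src_argb_stride + ui * 4;
    dst_argb[x * 4 + 0] = src[0];
    dst_argb[x * 4 + 1] = src[1];
    dst_argb[x * 4 + 2] = src[2];
    dst_argb[x * 4 + 3] = src[3];
    u += src_dudv[2];
    v += src_dudv[3];
  }
}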
8660
8661 #ifdef HAS_INTERPOLATEROW_SSSE3
8662 // Bilinear filter 16x2 -> 16x1
8663 void InterpolateRow_SSSE3(uint8_t* dst_ptr,
8664 const uint8_t* src_ptr,
8665 ptrdiff_t src_stride,
8666 int width,
8667 int source_y_fraction) {
8668 asm volatile(
8669 "sub %1,%0 \n"
8670 "cmp $0x0,%3 \n"
8671 "je 100f \n"
8672 "cmp $0x80,%3 \n"
8673 "je 50f \n"
8674
8675 "movd %3,%%xmm0 \n"
8676 "neg %3 \n"
8677 "add $0x100,%3 \n"
8678 "movd %3,%%xmm5 \n"
8679 "punpcklbw %%xmm0,%%xmm5 \n"
8680 "punpcklwd %%xmm5,%%xmm5 \n"
8681 "pshufd $0x0,%%xmm5,%%xmm5 \n"
8682 "mov $0x80808080,%%eax \n"
8683 "movd %%eax,%%xmm4 \n"
8684 "pshufd $0x0,%%xmm4,%%xmm4 \n"
8685
8686 // General purpose row blend.
8687 LABELALIGN
8688 "1: \n"
8689 "movdqu (%1),%%xmm0 \n"
8690 "movdqu 0x00(%1,%4,1),%%xmm2 \n"
8691 "movdqa %%xmm0,%%xmm1 \n"
8692 "punpcklbw %%xmm2,%%xmm0 \n"
8693 "punpckhbw %%xmm2,%%xmm1 \n"
8694 "psubb %%xmm4,%%xmm0 \n"
8695 "psubb %%xmm4,%%xmm1 \n"
8696 "movdqa %%xmm5,%%xmm2 \n"
8697 "movdqa %%xmm5,%%xmm3 \n"
8698 "pmaddubsw %%xmm0,%%xmm2 \n"
8699 "pmaddubsw %%xmm1,%%xmm3 \n"
8700 "paddw %%xmm4,%%xmm2 \n"
8701 "paddw %%xmm4,%%xmm3 \n"
8702 "psrlw $0x8,%%xmm2 \n"
8703 "psrlw $0x8,%%xmm3 \n"
8704 "packuswb %%xmm3,%%xmm2 \n"
8705 "movdqu %%xmm2,0x00(%1,%0,1) \n"
8706 "lea 0x10(%1),%1 \n"
8707 "sub $0x10,%2 \n"
8708 "jg 1b \n"
8709 "jmp 99f \n"
8710
8711 // Blend 50 / 50.
8712 LABELALIGN
8713 "50: \n"
8714 "movdqu (%1),%%xmm0 \n"
8715 "movdqu 0x00(%1,%4,1),%%xmm1 \n"
8716 "pavgb %%xmm1,%%xmm0 \n"
8717 "movdqu %%xmm0,0x00(%1,%0,1) \n"
8718 "lea 0x10(%1),%1 \n"
8719 "sub $0x10,%2 \n"
8720 "jg 50b \n"
8721 "jmp 99f \n"
8722
8723 // Blend 100 / 0 - Copy row unchanged.
8724 LABELALIGN
8725 "100: \n"
8726 "movdqu (%1),%%xmm0 \n"
8727 "movdqu %%xmm0,0x00(%1,%0,1) \n"
8728 "lea 0x10(%1),%1 \n"
8729 "sub $0x10,%2 \n"
8730 "jg 100b \n"
8731
8732 "99: \n"
8733 : "+r"(dst_ptr), // %0
8734 "+r"(src_ptr), // %1
8735 "+rm"(width), // %2
8736 "+r"(source_y_fraction) // %3
8737 : "r"((intptr_t)(src_stride)) // %4
8738 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
8739 }
8740 #endif // HAS_INTERPOLATEROW_SSSE3
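
// Scalar sketch of the bilinear row blend: with source_y_fraction in
// [0, 256], dst = (src * (256 - f) + src_below * f + 128) >> 8, which the
// branches above shortcut for f == 0 (plain copy) and f == 128 (pavgb).
// The _C_Sketch name is illustrative only.
static void InterpolateRow_C_Sketch(uint8_t* dst_ptr,
                                    const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    int width,
                                    int source_y_fraction) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  int x;
  for (x = 0; x < width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[x] * y0_fraction +
                            src_ptr1[x] * y1_fraction + 128) >> 8);
  }
}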
8741
8742 #ifdef HAS_INTERPOLATEROW_AVX2
8743 // Bilinear filter 32x2 -> 32x1
8744 void InterpolateRow_AVX2(uint8_t* dst_ptr,
8745 const uint8_t* src_ptr,
8746 ptrdiff_t src_stride,
8747 int width,
8748 int source_y_fraction) {
8749 asm volatile(
8750 "sub %1,%0 \n"
8751 "cmp $0x0,%3 \n"
8752 "je 100f \n"
8753 "cmp $0x80,%3 \n"
8754 "je 50f \n"
8755
8756 "vmovd %3,%%xmm0 \n"
8757 "neg %3 \n"
8758 "add $0x100,%3 \n"
8759 "vmovd %3,%%xmm5 \n"
8760 "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
8761 "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
8762 "vbroadcastss %%xmm5,%%ymm5 \n"
8763 "mov $0x80808080,%%eax \n"
8764 "vmovd %%eax,%%xmm4 \n"
8765 "vbroadcastss %%xmm4,%%ymm4 \n"
8766
8767 // General purpose row blend.
8768 LABELALIGN
8769 "1: \n"
8770 "vmovdqu (%1),%%ymm0 \n"
8771 "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
8772 "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
8773 "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
8774 "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
8775 "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
8776 "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
8777 "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
8778 "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
8779 "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
8780 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
8781 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
8782 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
8783 "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
8784 "lea 0x20(%1),%1 \n"
8785 "sub $0x20,%2 \n"
8786 "jg 1b \n"
8787 "jmp 99f \n"
8788
8789 // Blend 50 / 50.
8790 LABELALIGN
8791 "50: \n"
8792 "vmovdqu (%1),%%ymm0 \n"
8793 "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
8794 "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
8795 "lea 0x20(%1),%1 \n"
8796 "sub $0x20,%2 \n"
8797 "jg 50b \n"
8798 "jmp 99f \n"
8799
8800 // Blend 100 / 0 - Copy row unchanged.
8801 LABELALIGN
8802 "100: \n"
8803 "vmovdqu (%1),%%ymm0 \n"
8804 "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
8805 "lea 0x20(%1),%1 \n"
8806 "sub $0x20,%2 \n"
8807 "jg 100b \n"
8808
8809 "99: \n"
8810 "vzeroupper \n"
8811 : "+r"(dst_ptr), // %0
8812 "+r"(src_ptr), // %1
8813 "+r"(width), // %2
8814 "+r"(source_y_fraction) // %3
8815 : "r"((intptr_t)(src_stride)) // %4
8816 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
8817 }
8818 #endif // HAS_INTERPOLATEROW_AVX2
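
// A minimal scalar sketch of what both interpolators above compute, kept out
// of the build with #if 0. The name is illustrative only; it assumes the same
// dst = (src * (256 - f) + src_next * f + 128) >> 8 blend, with f == 0
// meaning copy and f == 128 meaning a 50/50 average, matching the special
// cases in the SIMD paths.
#if 0
static void InterpolateRow_Sketch(uint8_t* dst_ptr,
                                  const uint8_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  int width,
                                  int source_y_fraction) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  int y1_fraction = source_y_fraction;  // weight of the second row
  int y0_fraction = 256 - y1_fraction;  // weight of the first row
  int x;
  for (x = 0; x < width; ++x) {
    dst_ptr[x] = (uint8_t)(
        (src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction + 128) >> 8);
  }
}
#endif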

#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
  asm volatile(

      "movdqu (%3),%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pshufb %%xmm5,%%xmm0 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_ARGBSHUFFLEROW_SSSE3

#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile(

      "vbroadcastf128 (%3),%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2
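
// A minimal scalar sketch of the shuffle both rows above perform, kept out of
// the build with #if 0. It assumes the first 4 bytes of the 16-byte shuffler
// give the source channel order for every pixel, which holds for the pshufb
// constants these functions are used with.
#if 0
static void ARGBShuffleRow_Sketch(const uint8_t* src_argb,
                                  uint8_t* dst_argb,
                                  const uint8_t* shuffler,
                                  int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif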

#ifdef HAS_I422TOYUY2ROW_SSE2
void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movq (%1),%%xmm2 \n"
      "movq 0x00(%1,%2,1),%%xmm1 \n"
      "add $0x8,%1 \n"
      "punpcklbw %%xmm1,%%xmm2 \n"
      "movdqu (%0),%%xmm0 \n"
      "add $0x10,%0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"
      "punpckhbw %%xmm2,%%xmm1 \n"
      "movdqu %%xmm0,(%3) \n"
      "movdqu %%xmm1,0x10(%3) \n"
      "lea 0x20(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOYUY2ROW_SSE2

#ifdef HAS_I422TOUYVYROW_SSE2
void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "movq (%1),%%xmm2 \n"
      "movq 0x00(%1,%2,1),%%xmm1 \n"
      "add $0x8,%1 \n"
      "punpcklbw %%xmm1,%%xmm2 \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "add $0x10,%0 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm2 \n"
      "movdqu %%xmm1,(%3) \n"
      "movdqu %%xmm2,0x10(%3) \n"
      "lea 0x20(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_SSE2

#ifdef HAS_I422TOYUY2ROW_AVX2
void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(

      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vpmovzxbw (%1),%%ymm1 \n"
      "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
      "add $0x10,%1 \n"
      "vpsllw $0x8,%%ymm2,%%ymm2 \n"
      "vpor %%ymm1,%%ymm2,%%ymm2 \n"
      "vmovdqu (%0),%%ymm0 \n"
      "add $0x20,%0 \n"
      "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
      "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
      "vextractf128 $0x0,%%ymm1,(%3) \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
      "lea 0x40(%3),%3 \n"
      "sub $0x20,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOYUY2ROW_AVX2

#ifdef HAS_I422TOUYVYROW_AVX2
void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(

      "sub %1,%2 \n"

      LABELALIGN
      "1: \n"
      "vpmovzxbw (%1),%%ymm1 \n"
      "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
      "add $0x10,%1 \n"
      "vpsllw $0x8,%%ymm2,%%ymm2 \n"
      "vpor %%ymm1,%%ymm2,%%ymm2 \n"
      "vmovdqu (%0),%%ymm0 \n"
      "add $0x20,%0 \n"
      "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
      "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
      "vextractf128 $0x0,%%ymm1,(%3) \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
      "lea 0x40(%3),%3 \n"
      "sub $0x20,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_AVX2
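
// A minimal scalar sketch of the 4:2:2 packing done by the four rows above,
// kept out of the build with #if 0. YUY2 stores Y0,U,Y1,V per pixel pair;
// UYVY stores U,Y0,V,Y1. Width is assumed even, matching the SIMD paths.
#if 0
static void I422ToYUY2Row_Sketch(const uint8_t* src_y,
                                 const uint8_t* src_u,
                                 const uint8_t* src_v,
                                 uint8_t* dst_yuy2,
                                 int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_yuy2[0] = src_y[0];  // Y0
    dst_yuy2[1] = src_u[0];  // U shared by the pair
    dst_yuy2[2] = src_y[1];  // Y1
    dst_yuy2[3] = src_v[0];  // V shared by the pair
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_yuy2 += 4;
  }
}
#endif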

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(

      "pxor %%xmm3,%%xmm3 \n"

      // 2 pixel loop.
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm3,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "punpcklwd %%xmm3,%%xmm0 \n"
      "punpckhwd %%xmm3,%%xmm4 \n"
      "cvtdq2ps %%xmm0,%%xmm0 \n"
      "cvtdq2ps %%xmm4,%%xmm4 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "mulps 0x10(%3),%%xmm0 \n"
      "mulps 0x10(%3),%%xmm4 \n"
      "addps (%3),%%xmm0 \n"
      "addps (%3),%%xmm4 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "movdqa %%xmm5,%%xmm6 \n"
      "mulps %%xmm1,%%xmm2 \n"
      "mulps %%xmm5,%%xmm6 \n"
      "mulps %%xmm2,%%xmm1 \n"
      "mulps %%xmm6,%%xmm5 \n"
      "mulps 0x20(%3),%%xmm2 \n"
      "mulps 0x20(%3),%%xmm6 \n"
      "mulps 0x30(%3),%%xmm1 \n"
      "mulps 0x30(%3),%%xmm5 \n"
      "addps %%xmm2,%%xmm0 \n"
      "addps %%xmm6,%%xmm4 \n"
      "addps %%xmm1,%%xmm0 \n"
      "addps %%xmm5,%%xmm4 \n"
      "cvttps2dq %%xmm0,%%xmm0 \n"
      "cvttps2dq %%xmm4,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x2,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(
      "vbroadcastf128 (%3),%%ymm4 \n"
      "vbroadcastf128 0x10(%3),%%ymm5 \n"
      "vbroadcastf128 0x20(%3),%%ymm6 \n"
      "vbroadcastf128 0x30(%3),%%ymm7 \n"

      // 2 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxbd (%0),%%ymm0 \n"  // 2 ARGB pixels
      "lea 0x8(%0),%0 \n"
      "vcvtdq2ps %%ymm0,%%ymm0 \n"  // X 8 floats
      "vmulps %%ymm0,%%ymm0,%%ymm2 \n"  // X * X
      "vmulps %%ymm7,%%ymm0,%%ymm3 \n"  // C3 * X
      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n"  // result = C0 + C1 * X
      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n"  // result += C2 * X * X
      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n"  // result += C3 * X * X * X
      "vcvttps2dq %%ymm0,%%ymm0 \n"
      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
      "vmovq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x2,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
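
// A minimal scalar sketch of the cubic polynomial applied above, kept out of
// the build with #if 0. poly is assumed to hold 16 floats: C0..C3, each as a
// 4-float vector in channel byte order, matching the (%3)..0x30(%3) loads.
#if 0
static void ARGBPolynomialRow_Sketch(const uint8_t* src_argb,
                                     uint8_t* dst_argb,
                                     const float* poly,
                                     int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      float v = (float)src_argb[c];
      float r = poly[c] + v * poly[c + 4] + v * v * poly[c + 8] +
                v * v * v * poly[c + 12];
      if (r < 0.f) r = 0.f;      // packs saturate to 0..255
      if (r > 255.f) r = 255.f;
      dst_argb[c] = (uint8_t)r;  // truncate like cvttps2dq
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif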

#ifdef HAS_HALFFLOATROW_SSE2
static const float kScaleBias = 1.9259299444e-34f;  // 2^-112
void HalfFloatRow_SSE2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "movd %3,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"
      "sub %0,%1 \n"

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm2 \n"  // 8 shorts
      "add $0x10,%0 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm5,%%xmm2 \n"  // 8 ints in xmm2/xmm3
      "cvtdq2ps %%xmm2,%%xmm2 \n"  // 8 floats
      "punpckhwd %%xmm5,%%xmm3 \n"
      "cvtdq2ps %%xmm3,%%xmm3 \n"
      "mulps %%xmm4,%%xmm2 \n"
      "mulps %%xmm4,%%xmm3 \n"
      "psrld $0xd,%%xmm2 \n"
      "psrld $0xd,%%xmm3 \n"
      "packssdw %%xmm3,%%xmm2 \n"
      "movdqu %%xmm2,-0x10(%0,%1,1) \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(scale)   // %3
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_SSE2

#ifdef HAS_HALFFLOATROW_AVX2
void HalfFloatRow_AVX2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "vbroadcastss %3, %%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
      "sub %0,%1 \n"

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm2 \n"  // 16 shorts
      "add $0x20,%0 \n"
      "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n"  // mutates
      "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
      "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
      "vpsrld $0xd,%%ymm3,%%ymm3 \n"
      "vpsrld $0xd,%%ymm2,%%ymm2 \n"
      "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n"  // unmutates
      "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"

      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_AVX2
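
// A minimal scalar sketch of the bias trick used by the SSE2/AVX2 paths
// above, kept out of the build with #if 0. Multiplying by scale * 2^-112
// lowers the float exponent by the difference between the float and half
// exponent biases (127 - 15), so bits 13 and up of the float are the IEEE
// half-float bit pattern that psrld $0xd extracts.
#if 0
#include <string.h>
static void HalfFloatRow_Sketch(const uint16_t* src,
                                uint16_t* dst,
                                float scale,
                                int width) {
  float mult = scale * 1.9259299444e-34f;  // 2^-112 bias
  int x;
  for (x = 0; x < width; ++x) {
    float f = (float)src[x] * mult;
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));
    dst[x] = (uint16_t)(bits >> 13);  // same shift as psrld $0xd
  }
}
#endif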

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloatRow_F16C(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "vbroadcastss %3, %%ymm4 \n"
      "sub %0,%1 \n"

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxwd (%0),%%ymm2 \n"  // 16 shorts -> 16 ints
      "vpmovzxwd 0x10(%0),%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
      "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
      "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
      "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
      "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
      "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
      "add $0x20,%0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
  asm volatile(
      "sub %0,%1 \n"
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxwd (%0),%%ymm2 \n"  // 16 shorts -> 16 ints
      "vpmovzxwd 0x10(%0),%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
      "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
      "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
      "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
      "add $0x20,%0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm2", "xmm3");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8_t* dst_argb,
                           const uint8_t* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"
      "movzb 0x00(%3,%1,4),%1 \n"
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"
      "mov %b1,-0x2(%0) \n"
      "movzb -0x1(%0),%1 \n"
      "movzb 0x03(%3,%1,4),%1 \n"
      "mov %b1,-0x1(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8_t* dst_argb,
                          const uint8_t* table_argb,
                          int width) {
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"
      "movzb 0x00(%3,%1,4),%1 \n"
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"
      "mov %b1,-0x2(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86
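
// A minimal scalar sketch of the table lookups above, kept out of the build
// with #if 0. Each channel indexes its own column of the interleaved 256x4
// table, matching the 0x00..0x03(%3,%1,4) addressing; the RGB variant does
// the same but leaves the fourth byte untouched.
#if 0
static void ARGBColorTableRow_Sketch(uint8_t* dst_argb,
                                     const uint8_t* table_argb,
                                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}
#endif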

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int width,
                                 const uint8_t* luma,
                                 uint32_t lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile(
      "movd %6,%%xmm3 \n"
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0x8,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"

      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%2),%%xmm0 \n"
      "pmaddubsw %%xmm3,%%xmm0 \n"
      "phaddw %%xmm0,%%xmm0 \n"
      "pand %%xmm4,%%xmm0 \n"
      "punpcklwd %%xmm5,%%xmm0 \n"
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"

      "movzb (%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,(%3) \n"
      "movzb 0x1(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x1(%3) \n"
      "movzb 0x2(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x2(%3) \n"
      "movzb 0x3(%2),%0 \n"
      "mov %b0,0x3(%3) \n"

      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"

      "movzb 0x4(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x4(%3) \n"
      "movzb 0x5(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x5(%3) \n"
      "movzb 0x6(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x6(%3) \n"
      "movzb 0x7(%2),%0 \n"
      "mov %b0,0x7(%3) \n"

      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"

      "movzb 0x8(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x8(%3) \n"
      "movzb 0x9(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x9(%3) \n"
      "movzb 0xa(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xa(%3) \n"
      "movzb 0xb(%2),%0 \n"
      "mov %b0,0xb(%3) \n"

      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"

      "movzb 0xc(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xc(%3) \n"
      "movzb 0xd(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xd(%3) \n"
      "movzb 0xe(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xe(%3) \n"
      "movzb 0xf(%2),%0 \n"
      "mov %b0,0xf(%3) \n"
      "lea 0x10(%2),%2 \n"
      "lea 0x10(%3),%3 \n"
      "sub $0x4,%4 \n"
      "jg 1b \n"
      : "=&d"(pixel_temp),  // %0
        "=&a"(table_temp),  // %1
        "+r"(src_argb),     // %2
        "+r"(dst_argb),     // %3
        "+rm"(width)        // %4
      : "r"(luma),          // %5
        "rm"(lumacoeff)     // %6
      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
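
// A rough scalar sketch of the luma-indexed lookup above, kept out of the
// build with #if 0. It assumes lumacoeff packs the B, G, R weights in its low
// three bytes, and that masking the weighted sum with 0xff00 selects a
// 256-byte row of the luma table, as the pmaddubsw/phaddw/pand sequence does
// (ignoring word saturation); alpha is copied through unchanged.
#if 0
static void ARGBLumaColorTableRow_Sketch(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width,
                                         const uint8_t* luma,
                                         uint32_t lumacoeff) {
  uint32_t bc = lumacoeff & 0xff;
  uint32_t gc = (lumacoeff >> 8) & 0xff;
  uint32_t rc = (lumacoeff >> 16) & 0xff;
  int x;
  for (x = 0; x < width; ++x) {
    // Weighted luma, truncated to a multiple of 256 to pick a table row.
    uint32_t row = (src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) &
                   0xff00u;
    dst_argb[0] = luma[row + src_argb[0]];
    dst_argb[1] = luma[row + src_argb[1]];
    dst_argb[2] = luma[row + src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif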

static const uvec8 kYUV24Shuffle[3] = {
    {8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12},
    {9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15},
    {2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7}};

// Convert biplanar NV21 to packed YUV24.
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory.
void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
                          const uint8_t* src_vu,
                          uint8_t* dst_yuv24,
                          int width) {
  asm volatile(
      "sub %0,%1 \n"
      "movdqa (%4),%%xmm4 \n"  // 3 shuffler constants
      "movdqa 16(%4),%%xmm5 \n"
      "movdqa 32(%4),%%xmm6 \n"
      "1: \n"
      "movdqu (%0),%%xmm2 \n"  // load 16 Y values
      "movdqu (%0,%1),%%xmm3 \n"  // load 8 VU values
      "lea 16(%0),%0 \n"
      "movdqa %%xmm2,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "shufps $0x44,%%xmm3,%%xmm0 \n"  // Y 0..7, UV 0..3
      "shufps $0x99,%%xmm3,%%xmm1 \n"  // Y 4..11, UV 2..5
      "shufps $0xee,%%xmm3,%%xmm2 \n"  // Y 8..15, UV 4..7
      "pshufb %%xmm4, %%xmm0 \n"  // weave into YUV24
      "pshufb %%xmm5, %%xmm1 \n"
      "pshufb %%xmm6, %%xmm2 \n"
      "movdqu %%xmm0,(%2) \n"
      "movdqu %%xmm1,16(%2) \n"
      "movdqu %%xmm2,32(%2) \n"
      "lea 48(%2),%2 \n"
      "sub $16,%3 \n"  // 16 pixels per loop
      "jg 1b \n"
      : "+r"(src_y),            // %0
        "+r"(src_vu),           // %1
        "+r"(dst_yuv24),        // %2
        "+r"(width)             // %3
      : "r"(&kYUV24Shuffle[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

// Convert biplanar NV21 to packed YUV24.
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory.
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
                         int width) {
  asm volatile(
      "sub %0,%1 \n"
      "vbroadcastf128 (%4),%%ymm4 \n"  // 3 shuffler constants
      "vbroadcastf128 16(%4),%%ymm5 \n"
      "vbroadcastf128 32(%4),%%ymm6 \n"

      "1: \n"
      "vmovdqu (%0),%%ymm2 \n"  // load 32 Y values
      "vmovdqu (%0,%1),%%ymm3 \n"  // load 16 VU values
      "lea 32(%0),%0 \n"
      "vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n"  // Y 0..7, UV 0..3
      "vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n"  // Y 4..11, UV 2..5
      "vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n"  // Y 8..15, UV 4..7
      "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"  // weave into YUV24
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n"
      "vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n"
      "vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm3,(%2) \n"
      "vmovdqu %%ymm0,32(%2) \n"
      "vmovdqu %%ymm1,64(%2) \n"
      "lea 96(%2),%2 \n"
      "sub $32,%3 \n"  // 32 pixels per loop
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),            // %0
        "+r"(src_vu),           // %1
        "+r"(dst_yuv24),        // %2
        "+r"(width)             // %3
      : "r"(&kYUV24Shuffle[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
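
// A minimal scalar sketch of the NV21 -> YUV24 repack above, kept out of the
// build with #if 0. Each output pixel is a V,U,Y triplet, with each VU pair
// shared by two horizontally adjacent Y samples.
#if 0
static void NV21ToYUV24Row_Sketch(const uint8_t* src_y,
                                  const uint8_t* src_vu,
                                  uint8_t* dst_yuv24,
                                  int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_yuv24[0] = src_vu[(x & ~1)];      // V
    dst_yuv24[1] = src_vu[(x & ~1) + 1];  // U
    dst_yuv24[2] = src_y[x];              // Y
    dst_yuv24 += 3;
  }
}
#endif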

#ifdef HAS_NV21ToYUV24ROW_AVX512
// The following AVX-512 VBMI VEX256 code tests okay with the Intel SDE
// emulator.
static const lvec8 kYUV24Perm[3] = {
    {32, 33, 0, 32, 33, 1, 34, 35, 2, 34, 35, 3, 36, 37, 4, 36,
     37, 5, 38, 39, 6, 38, 39, 7, 40, 41, 8, 40, 41, 9, 42, 43},
    {10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
     48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52},
    {53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
     26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31}};

void NV21ToYUV24Row_AVX512(const uint8_t* src_y,
                           const uint8_t* src_vu,
                           uint8_t* dst_yuv24,
                           int width) {
  asm volatile(
      "sub %0,%1 \n"
      "vmovdqa (%4),%%ymm4 \n"  // 3 shuffler constants
      "vmovdqa 32(%4),%%ymm5 \n"
      "vmovdqa 64(%4),%%ymm6 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm2 \n"  // load 32 Y values
      "vmovdqu (%0,%1),%%ymm3 \n"  // load 16 VU values
      "lea 32(%0),%0 \n"
      "vmovdqa %%ymm2, %%ymm0 \n"
      "vmovdqa %%ymm2, %%ymm1 \n"
      "vpermt2b %%ymm3,%%ymm4,%%ymm0 \n"
      "vpermt2b %%ymm3,%%ymm5,%%ymm1 \n"
      "vpermt2b %%ymm3,%%ymm6,%%ymm2 \n"
      "vmovdqu %%ymm0,(%2) \n"
      "vmovdqu %%ymm1,32(%2) \n"
      "vmovdqu %%ymm2,64(%2) \n"
      "lea 96(%2),%2 \n"
      "sub $32,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),         // %0
        "+r"(src_vu),        // %1
        "+r"(dst_yuv24),     // %2
        "+r"(width)          // %3
      : "r"(&kYUV24Perm[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#endif  // HAS_NV21ToYUV24ROW_AVX512

#ifdef HAS_SWAPUVROW_SSSE3

// Shuffle table for reversing the bytes.
static const uvec8 kShuffleUVToVU = {1u, 0u, 3u,  2u,  5u,  4u,  7u,  6u,
                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};

// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      "movdqu %3,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pshufb %%xmm5,%%xmm0 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_uv),        // %0
        "+r"(dst_vu),        // %1
        "+r"(width)          // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_SWAPUVROW_SSSE3

#ifdef HAS_SWAPUVROW_AVX2
void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  asm volatile(

      "vbroadcastf128 %3,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uv),        // %0
        "+r"(dst_vu),        // %1
        "+r"(width)          // %2
      : "m"(kShuffleUVToVU)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_SWAPUVROW_AVX2
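
// A minimal scalar sketch of the byte-pair swap above, kept out of the build
// with #if 0: each U,V pair becomes V,U, turning an NV12 chroma row into
// NV21 (and vice versa).
#if 0
static void SwapUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_vu,
                             int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_vu[0] = src_uv[1];  // V first
    dst_vu[1] = src_uv[0];  // then U
    src_uv += 2;
    dst_vu += 2;
  }
}
#endif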

void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
                          int src_stride_u,
                          const uint8_t* src_v,
                          int src_stride_v,
                          uint8_t* dst_uv,
                          int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrlw $0xf,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // load 16 U values
      "movdqu (%1),%%xmm1 \n"  // load 16 V values
      "movdqu 0(%0,%4,1),%%xmm2 \n"  // 16 from next row
      "movdqu 0(%1,%5,1),%%xmm3 \n"
      "lea 0x10(%0),%0 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"  // half size
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "lea 0x10(%1),%1 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm3,%%xmm1 \n"
      "psrlw $0x1,%%xmm0 \n"
      "psrlw $0x1,%%xmm1 \n"
      "pavgw %%xmm5,%%xmm0 \n"
      "pavgw %%xmm5,%%xmm1 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "packuswb %%xmm1,%%xmm1 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"  // store 8 UV pixels
      "lea 0x10(%2),%2 \n"
      "sub $0x10,%3 \n"  // 16 src pixels per loop
      "jg 1b \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void HalfMergeUVRow_AVX2(const uint8_t* src_u,
                         int src_stride_u,
                         const uint8_t* src_v,
                         int src_stride_v,
                         uint8_t* dst_uv,
                         int width) {
  asm volatile(
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // load 32 U values
      "vmovdqu (%1),%%ymm1 \n"  // load 32 V values
      "vmovdqu 0(%0,%4,1),%%ymm2 \n"  // 32 from next row
      "vmovdqu 0(%1,%5,1),%%ymm3 \n"
      "lea 0x20(%0),%0 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"  // half size
      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "lea 0x20(%1),%1 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%2) \n"  // store 16 UV pixels
      "lea 0x20(%2),%2 \n"
      "sub $0x20,%3 \n"  // 32 src pixels per loop
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_u),                    // %0
        "+r"(src_v),                    // %1
        "+r"(dst_uv),                   // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
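
// A minimal scalar sketch of the 2x2 box filter above, kept out of the build
// with #if 0. Each output UV pair averages a 2x2 block from the U and V
// planes with the same (sum + 2) >> 2 rounding the psrlw/pavgw pair produces.
#if 0
static void HalfMergeUVRow_Sketch(const uint8_t* src_u,
                                  int src_stride_u,
                                  const uint8_t* src_v,
                                  int src_stride_v,
                                  uint8_t* dst_uv,
                                  int width) {
  const uint8_t* src_u1 = src_u + src_stride_u;
  const uint8_t* src_v1 = src_v + src_stride_v;
  int x;
  for (x = 0; x < width; x += 2) {
    dst_uv[0] =
        (uint8_t)((src_u[0] + src_u[1] + src_u1[0] + src_u1[1] + 2) >> 2);
    dst_uv[1] =
        (uint8_t)((src_v[0] + src_v[1] + src_v1[0] + src_v1[1] + 2) >> 2);
    src_u += 2;
    src_u1 += 2;
    src_v += 2;
    src_v1 += 2;
    dst_uv += 2;
  }
}
#endif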

void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
  asm volatile(
      "pxor %%xmm1,%%xmm1 \n"

      LABELALIGN
      "1: \n"
      "movd (%0),%%xmm0 \n"  // load float
      "maxss %%xmm1, %%xmm0 \n"  // clamp to zero
      "add $0x4,%0 \n"
      "movd %%xmm0, (%1) \n"  // store float
      "add $0x4,%1 \n"
      "sub $0x4,%2 \n"  // 1 float per loop
      "jg 1b \n"
      : "+r"(src_x),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
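
// A minimal scalar sketch of the clamp above, kept out of the build with
// #if 0: negative floats become 0.0f, everything else passes through. Width
// is assumed to count floats here; note the SSE2 loop above decrements its
// counter by 4 per float.
#if 0
static void ClampFloatToZero_Sketch(const float* src_x, float* dst_y,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    float v = src_x[x];
    dst_y[x] = (v < 0.f) ? 0.f : v;  // same effect as maxss with zero
  }
}
#endif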

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif