1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC x86 and x64.
19 #if !defined(LIBYUV_DISABLE_X86) && \
20 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
21
22 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
23
24 // Constants for ARGB
25 static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
26 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
27
28 // JPeg full range.
29 static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
30 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
31
32 static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
33 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
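
// The coefficients above are stored in memory byte order (B, G, R, A for the
// ARGB tables; A, B, G, R for kRGBAToYJ) so they can be applied with
// pmaddubsw. Scalar equivalents (illustration only):
//   BT.601 studio range: Y  = (66*R + 129*G + 25*B + 0x1080) >> 8
//   JPeg full range:     YJ = (77*R + 150*G + 29*B + 0x80) >> 8
// The 0x1080 / 0x80 terms are the biases folded into kAddY16 and kSub128,
// defined further below.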
34 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
35
36 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
37
38 static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
39 112, -74, -38, 0, 112, -74, -38, 0};
40
41 static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
42 127, -84, -43, 0, 127, -84, -43, 0};
43
44 static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
45 -18, -94, 112, 0, -18, -94, 112, 0};
46
47 static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
48 -20, -107, 127, 0, -20, -107, 127, 0};
49
50 // Constants for BGRA
51 static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
52 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
53
54 static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
55 0, -38, -74, 112, 0, -38, -74, 112};
56
57 static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
58 0, 112, -94, -18, 0, 112, -94, -18};
59
60 // Constants for ABGR
61 static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
62 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
63
64 static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
65 -38, -74, 112, 0, -38, -74, 112, 0};
66
67 static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
68 112, -94, -18, 0, 112, -94, -18, 0};
69
70 // Constants for RGBA.
71 static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
72 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
73
74 static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
75 0, 112, -74, -38, 0, 112, -74, -38};
76
77 static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
78 0, -18, -94, 112, 0, -18, -94, 112};
79
80 static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
81 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
82
83 static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
84 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
85
86 static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
87 0x8080u, 0x8080u, 0x8080u, 0x8080u};
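
// Scalar equivalents of the U/V coefficients above (illustration only). The
// row functions apply them to 2x2 box-averaged pixels (per pixel for the 444
// variant):
//   U = ((112*B - 74*G - 38*R) >> 8) + 128
//   V = ((112*R - 94*G - 18*B) >> 8) + 128
// kAddUV128 supplies the +128 offset in the BT.601 paths; the J (full range)
// paths use the *UJ/*VJ weights and add kSub128 (0x8080) before the shift
// instead.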
88
89 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
90
91 #ifdef HAS_RGB24TOARGBROW_SSSE3
92
93 // Shuffle table for converting RGB24 to ARGB.
94 static const uvec8 kShuffleMaskRGB24ToARGB = {
95 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
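// The alpha output positions take don't-care source bytes (indices 12-15);
// the row code ORs the shuffled result with an alpha mask, so A comes out 255.
// The RAW to ARGB / RGBA masks below rely on the same trick.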
96
97 // Shuffle table for converting RAW to ARGB.
98 static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
99 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
100
101 // Shuffle table for converting RAW to RGBA.
102 static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
103 14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};
104
105 // Shuffle table for converting RAW to RGB24. First 8.
106 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
107 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
108 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
109
110 // Shuffle table for converting RAW to RGB24. Middle 8.
111 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
112 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
113 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
114
115 // Shuffle table for converting RAW to RGB24. Last 8.
116 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
117 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
118 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
119
120 // Shuffle table for converting ARGB to RGB24.
121 static const uvec8 kShuffleMaskARGBToRGB24 = {
122 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
123
124 // Shuffle table for converting ARGB to RAW.
125 static const uvec8 kShuffleMaskARGBToRAW = {
126 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
127
128 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
129 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
130 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
131
132 // YUY2 shuf 16 Y to 32 Y.
133 static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
134 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
135 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
136
137 // YUY2 shuf 8 UV to 16 UV.
138 static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
139 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
140 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
141
142 // UYVY shuf 16 Y to 32 Y.
143 static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
144 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
145 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
146
147 // UYVY shuf 8 UV to 16 UV.
148 static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
149 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
150 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
151
152 // NV21 shuf 8 VU to 16 UV.
153 static const lvec8 kShuffleNV21 = {
154 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
155 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
156 };
157 #endif // HAS_RGB24TOARGBROW_SSSE3
158
159 #ifdef HAS_J400TOARGBROW_SSE2
160 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
161 asm volatile(
162 "pcmpeqb %%xmm5,%%xmm5 \n"
163 "pslld $0x18,%%xmm5 \n"
164
165 LABELALIGN
166 "1: \n"
167 "movq (%0),%%xmm0 \n"
168 "lea 0x8(%0),%0 \n"
169 "punpcklbw %%xmm0,%%xmm0 \n"
170 "movdqa %%xmm0,%%xmm1 \n"
171 "punpcklwd %%xmm0,%%xmm0 \n"
172 "punpckhwd %%xmm1,%%xmm1 \n"
173 "por %%xmm5,%%xmm0 \n"
174 "por %%xmm5,%%xmm1 \n"
175 "movdqu %%xmm0,(%1) \n"
176 "movdqu %%xmm1,0x10(%1) \n"
177 "lea 0x20(%1),%1 \n"
178 "sub $0x8,%2 \n"
179 "jg 1b \n"
180 : "+r"(src_y), // %0
181 "+r"(dst_argb), // %1
182 "+r"(width) // %2
183 ::"memory",
184 "cc", "xmm0", "xmm1", "xmm5");
185 }
186 #endif // HAS_J400TOARGBROW_SSE2
187
188 #ifdef HAS_RGB24TOARGBROW_SSSE3
189 void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
190 uint8_t* dst_argb,
191 int width) {
192 asm volatile(
193 "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
194 "pslld $0x18,%%xmm5 \n"
195 "movdqa %3,%%xmm4 \n"
196
197 LABELALIGN
198 "1: \n"
199 "movdqu (%0),%%xmm0 \n"
200 "movdqu 0x10(%0),%%xmm1 \n"
201 "movdqu 0x20(%0),%%xmm3 \n"
202 "lea 0x30(%0),%0 \n"
203 "movdqa %%xmm3,%%xmm2 \n"
204 "palignr $0x8,%%xmm1,%%xmm2 \n"
205 "pshufb %%xmm4,%%xmm2 \n"
206 "por %%xmm5,%%xmm2 \n"
207 "palignr $0xc,%%xmm0,%%xmm1 \n"
208 "pshufb %%xmm4,%%xmm0 \n"
209 "movdqu %%xmm2,0x20(%1) \n"
210 "por %%xmm5,%%xmm0 \n"
211 "pshufb %%xmm4,%%xmm1 \n"
212 "movdqu %%xmm0,(%1) \n"
213 "por %%xmm5,%%xmm1 \n"
214 "palignr $0x4,%%xmm3,%%xmm3 \n"
215 "pshufb %%xmm4,%%xmm3 \n"
216 "movdqu %%xmm1,0x10(%1) \n"
217 "por %%xmm5,%%xmm3 \n"
218 "movdqu %%xmm3,0x30(%1) \n"
219 "lea 0x40(%1),%1 \n"
220 "sub $0x10,%2 \n"
221 "jg 1b \n"
222 : "+r"(src_rgb24), // %0
223 "+r"(dst_argb), // %1
224 "+r"(width) // %2
225 : "m"(kShuffleMaskRGB24ToARGB) // %3
226 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
227 }
228
229 void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
230 asm volatile(
231 "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
232 "pslld $0x18,%%xmm5 \n"
233 "movdqa %3,%%xmm4 \n"
234
235 LABELALIGN
236 "1: \n"
237 "movdqu (%0),%%xmm0 \n"
238 "movdqu 0x10(%0),%%xmm1 \n"
239 "movdqu 0x20(%0),%%xmm3 \n"
240 "lea 0x30(%0),%0 \n"
241 "movdqa %%xmm3,%%xmm2 \n"
242 "palignr $0x8,%%xmm1,%%xmm2 \n"
243 "pshufb %%xmm4,%%xmm2 \n"
244 "por %%xmm5,%%xmm2 \n"
245 "palignr $0xc,%%xmm0,%%xmm1 \n"
246 "pshufb %%xmm4,%%xmm0 \n"
247 "movdqu %%xmm2,0x20(%1) \n"
248 "por %%xmm5,%%xmm0 \n"
249 "pshufb %%xmm4,%%xmm1 \n"
250 "movdqu %%xmm0,(%1) \n"
251 "por %%xmm5,%%xmm1 \n"
252 "palignr $0x4,%%xmm3,%%xmm3 \n"
253 "pshufb %%xmm4,%%xmm3 \n"
254 "movdqu %%xmm1,0x10(%1) \n"
255 "por %%xmm5,%%xmm3 \n"
256 "movdqu %%xmm3,0x30(%1) \n"
257 "lea 0x40(%1),%1 \n"
258 "sub $0x10,%2 \n"
259 "jg 1b \n"
260 : "+r"(src_raw), // %0
261 "+r"(dst_argb), // %1
262 "+r"(width) // %2
263 : "m"(kShuffleMaskRAWToARGB) // %3
264 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
265 }
266
267 // Same code as RAWToARGB with different shuffler and A in low bits
268 void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
269 asm volatile(
270 "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff
271 "psrld $0x18,%%xmm5 \n"
272 "movdqa %3,%%xmm4 \n"
273
274 LABELALIGN
275 "1: \n"
276 "movdqu (%0),%%xmm0 \n"
277 "movdqu 0x10(%0),%%xmm1 \n"
278 "movdqu 0x20(%0),%%xmm3 \n"
279 "lea 0x30(%0),%0 \n"
280 "movdqa %%xmm3,%%xmm2 \n"
281 "palignr $0x8,%%xmm1,%%xmm2 \n"
282 "pshufb %%xmm4,%%xmm2 \n"
283 "por %%xmm5,%%xmm2 \n"
284 "palignr $0xc,%%xmm0,%%xmm1 \n"
285 "pshufb %%xmm4,%%xmm0 \n"
286 "movdqu %%xmm2,0x20(%1) \n"
287 "por %%xmm5,%%xmm0 \n"
288 "pshufb %%xmm4,%%xmm1 \n"
289 "movdqu %%xmm0,(%1) \n"
290 "por %%xmm5,%%xmm1 \n"
291 "palignr $0x4,%%xmm3,%%xmm3 \n"
292 "pshufb %%xmm4,%%xmm3 \n"
293 "movdqu %%xmm1,0x10(%1) \n"
294 "por %%xmm5,%%xmm3 \n"
295 "movdqu %%xmm3,0x30(%1) \n"
296 "lea 0x40(%1),%1 \n"
297 "sub $0x10,%2 \n"
298 "jg 1b \n"
299 : "+r"(src_raw), // %0
300 "+r"(dst_rgba), // %1
301 "+r"(width) // %2
302 : "m"(kShuffleMaskRAWToRGBA) // %3
303 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
304 }
305
306 void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
307 uint8_t* dst_rgb24,
308 int width) {
309 asm volatile(
310 "movdqa %3,%%xmm3 \n"
311 "movdqa %4,%%xmm4 \n"
312 "movdqa %5,%%xmm5 \n"
313
314 LABELALIGN
315 "1: \n"
316 "movdqu (%0),%%xmm0 \n"
317 "movdqu 0x4(%0),%%xmm1 \n"
318 "movdqu 0x8(%0),%%xmm2 \n"
319 "lea 0x18(%0),%0 \n"
320 "pshufb %%xmm3,%%xmm0 \n"
321 "pshufb %%xmm4,%%xmm1 \n"
322 "pshufb %%xmm5,%%xmm2 \n"
323 "movq %%xmm0,(%1) \n"
324 "movq %%xmm1,0x8(%1) \n"
325 "movq %%xmm2,0x10(%1) \n"
326 "lea 0x18(%1),%1 \n"
327 "sub $0x8,%2 \n"
328 "jg 1b \n"
329 : "+r"(src_raw), // %0
330 "+r"(dst_rgb24), // %1
331 "+r"(width) // %2
332 : "m"(kShuffleMaskRAWToRGB24_0), // %3
333 "m"(kShuffleMaskRAWToRGB24_1), // %4
334 "m"(kShuffleMaskRAWToRGB24_2) // %5
335 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
336 }
337
338 void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
339 asm volatile(
340 "mov $0x1080108,%%eax \n"
341 "movd %%eax,%%xmm5 \n"
342 "pshufd $0x0,%%xmm5,%%xmm5 \n"
343 "mov $0x20802080,%%eax \n"
344 "movd %%eax,%%xmm6 \n"
345 "pshufd $0x0,%%xmm6,%%xmm6 \n"
346 "pcmpeqb %%xmm3,%%xmm3 \n"
347 "psllw $0xb,%%xmm3 \n"
348 "pcmpeqb %%xmm4,%%xmm4 \n"
349 "psllw $0xa,%%xmm4 \n"
350 "psrlw $0x5,%%xmm4 \n"
351 "pcmpeqb %%xmm7,%%xmm7 \n"
352 "psllw $0x8,%%xmm7 \n"
353 "sub %0,%1 \n"
354 "sub %0,%1 \n"
355
356 LABELALIGN
357 "1: \n"
358 "movdqu (%0),%%xmm0 \n"
359 "movdqa %%xmm0,%%xmm1 \n"
360 "movdqa %%xmm0,%%xmm2 \n"
361 "pand %%xmm3,%%xmm1 \n"
362 "psllw $0xb,%%xmm2 \n"
363 "pmulhuw %%xmm5,%%xmm1 \n"
364 "pmulhuw %%xmm5,%%xmm2 \n"
365 "psllw $0x8,%%xmm1 \n"
366 "por %%xmm2,%%xmm1 \n"
367 "pand %%xmm4,%%xmm0 \n"
368 "pmulhuw %%xmm6,%%xmm0 \n"
369 "por %%xmm7,%%xmm0 \n"
370 "movdqa %%xmm1,%%xmm2 \n"
371 "punpcklbw %%xmm0,%%xmm1 \n"
372 "punpckhbw %%xmm0,%%xmm2 \n"
373 "movdqu %%xmm1,0x00(%1,%0,2) \n"
374 "movdqu %%xmm2,0x10(%1,%0,2) \n"
375 "lea 0x10(%0),%0 \n"
376 "sub $0x8,%2 \n"
377 "jg 1b \n"
378 : "+r"(src), // %0
379 "+r"(dst), // %1
380 "+r"(width) // %2
381 :
382 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
383 "xmm6", "xmm7");
384 }
385
386 void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
387 asm volatile(
388 "mov $0x1080108,%%eax \n"
389 "movd %%eax,%%xmm5 \n"
390 "pshufd $0x0,%%xmm5,%%xmm5 \n"
391 "mov $0x42004200,%%eax \n"
392 "movd %%eax,%%xmm6 \n"
393 "pshufd $0x0,%%xmm6,%%xmm6 \n"
394 "pcmpeqb %%xmm3,%%xmm3 \n"
395 "psllw $0xb,%%xmm3 \n"
396 "movdqa %%xmm3,%%xmm4 \n"
397 "psrlw $0x6,%%xmm4 \n"
398 "pcmpeqb %%xmm7,%%xmm7 \n"
399 "psllw $0x8,%%xmm7 \n"
400 "sub %0,%1 \n"
401 "sub %0,%1 \n"
402
403 LABELALIGN
404 "1: \n"
405 "movdqu (%0),%%xmm0 \n"
406 "movdqa %%xmm0,%%xmm1 \n"
407 "movdqa %%xmm0,%%xmm2 \n"
408 "psllw $0x1,%%xmm1 \n"
409 "psllw $0xb,%%xmm2 \n"
410 "pand %%xmm3,%%xmm1 \n"
411 "pmulhuw %%xmm5,%%xmm2 \n"
412 "pmulhuw %%xmm5,%%xmm1 \n"
413 "psllw $0x8,%%xmm1 \n"
414 "por %%xmm2,%%xmm1 \n"
415 "movdqa %%xmm0,%%xmm2 \n"
416 "pand %%xmm4,%%xmm0 \n"
417 "psraw $0x8,%%xmm2 \n"
418 "pmulhuw %%xmm6,%%xmm0 \n"
419 "pand %%xmm7,%%xmm2 \n"
420 "por %%xmm2,%%xmm0 \n"
421 "movdqa %%xmm1,%%xmm2 \n"
422 "punpcklbw %%xmm0,%%xmm1 \n"
423 "punpckhbw %%xmm0,%%xmm2 \n"
424 "movdqu %%xmm1,0x00(%1,%0,2) \n"
425 "movdqu %%xmm2,0x10(%1,%0,2) \n"
426 "lea 0x10(%0),%0 \n"
427 "sub $0x8,%2 \n"
428 "jg 1b \n"
429 : "+r"(src), // %0
430 "+r"(dst), // %1
431 "+r"(width) // %2
432 :
433 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
434 "xmm6", "xmm7");
435 }
436
437 void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
438 asm volatile(
439 "mov $0xf0f0f0f,%%eax \n"
440 "movd %%eax,%%xmm4 \n"
441 "pshufd $0x0,%%xmm4,%%xmm4 \n"
442 "movdqa %%xmm4,%%xmm5 \n"
443 "pslld $0x4,%%xmm5 \n"
444 "sub %0,%1 \n"
445 "sub %0,%1 \n"
446
447 LABELALIGN
448 "1: \n"
449 "movdqu (%0),%%xmm0 \n"
450 "movdqa %%xmm0,%%xmm2 \n"
451 "pand %%xmm4,%%xmm0 \n"
452 "pand %%xmm5,%%xmm2 \n"
453 "movdqa %%xmm0,%%xmm1 \n"
454 "movdqa %%xmm2,%%xmm3 \n"
455 "psllw $0x4,%%xmm1 \n"
456 "psrlw $0x4,%%xmm3 \n"
457 "por %%xmm1,%%xmm0 \n"
458 "por %%xmm3,%%xmm2 \n"
459 "movdqa %%xmm0,%%xmm1 \n"
460 "punpcklbw %%xmm2,%%xmm0 \n"
461 "punpckhbw %%xmm2,%%xmm1 \n"
462 "movdqu %%xmm0,0x00(%1,%0,2) \n"
463 "movdqu %%xmm1,0x10(%1,%0,2) \n"
464 "lea 0x10(%0),%0 \n"
465 "sub $0x8,%2 \n"
466 "jg 1b \n"
467 : "+r"(src), // %0
468 "+r"(dst), // %1
469 "+r"(width) // %2
470 :
471 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
472 }
473
474 void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
475 asm volatile(
476
477 "movdqa %3,%%xmm6 \n"
478
479 LABELALIGN
480 "1: \n"
481 "movdqu (%0),%%xmm0 \n"
482 "movdqu 0x10(%0),%%xmm1 \n"
483 "movdqu 0x20(%0),%%xmm2 \n"
484 "movdqu 0x30(%0),%%xmm3 \n"
485 "lea 0x40(%0),%0 \n"
486 "pshufb %%xmm6,%%xmm0 \n"
487 "pshufb %%xmm6,%%xmm1 \n"
488 "pshufb %%xmm6,%%xmm2 \n"
489 "pshufb %%xmm6,%%xmm3 \n"
490 "movdqa %%xmm1,%%xmm4 \n"
491 "psrldq $0x4,%%xmm1 \n"
492 "pslldq $0xc,%%xmm4 \n"
493 "movdqa %%xmm2,%%xmm5 \n"
494 "por %%xmm4,%%xmm0 \n"
495 "pslldq $0x8,%%xmm5 \n"
496 "movdqu %%xmm0,(%1) \n"
497 "por %%xmm5,%%xmm1 \n"
498 "psrldq $0x8,%%xmm2 \n"
499 "pslldq $0x4,%%xmm3 \n"
500 "por %%xmm3,%%xmm2 \n"
501 "movdqu %%xmm1,0x10(%1) \n"
502 "movdqu %%xmm2,0x20(%1) \n"
503 "lea 0x30(%1),%1 \n"
504 "sub $0x10,%2 \n"
505 "jg 1b \n"
506 : "+r"(src), // %0
507 "+r"(dst), // %1
508 "+r"(width) // %2
509 : "m"(kShuffleMaskARGBToRGB24) // %3
510 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
511 }
512
513 void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
514 asm volatile(
515
516 "movdqa %3,%%xmm6 \n"
517
518 LABELALIGN
519 "1: \n"
520 "movdqu (%0),%%xmm0 \n"
521 "movdqu 0x10(%0),%%xmm1 \n"
522 "movdqu 0x20(%0),%%xmm2 \n"
523 "movdqu 0x30(%0),%%xmm3 \n"
524 "lea 0x40(%0),%0 \n"
525 "pshufb %%xmm6,%%xmm0 \n"
526 "pshufb %%xmm6,%%xmm1 \n"
527 "pshufb %%xmm6,%%xmm2 \n"
528 "pshufb %%xmm6,%%xmm3 \n"
529 "movdqa %%xmm1,%%xmm4 \n"
530 "psrldq $0x4,%%xmm1 \n"
531 "pslldq $0xc,%%xmm4 \n"
532 "movdqa %%xmm2,%%xmm5 \n"
533 "por %%xmm4,%%xmm0 \n"
534 "pslldq $0x8,%%xmm5 \n"
535 "movdqu %%xmm0,(%1) \n"
536 "por %%xmm5,%%xmm1 \n"
537 "psrldq $0x8,%%xmm2 \n"
538 "pslldq $0x4,%%xmm3 \n"
539 "por %%xmm3,%%xmm2 \n"
540 "movdqu %%xmm1,0x10(%1) \n"
541 "movdqu %%xmm2,0x20(%1) \n"
542 "lea 0x30(%1),%1 \n"
543 "sub $0x10,%2 \n"
544 "jg 1b \n"
545 : "+r"(src), // %0
546 "+r"(dst), // %1
547 "+r"(width) // %2
548 : "m"(kShuffleMaskARGBToRAW) // %3
549 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
550 }
551
552 #ifdef HAS_ARGBTORGB24ROW_AVX2
553 // vpermd for 12+12 to 24
554 static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
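// After vpshufb each 128-bit lane holds 12 packed RGB bytes followed by 4
// zero bytes; this vpermd gathers the 3+3 valid dwords from both lanes and
// pushes the two zero dwords to the top of the register.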
555
556 void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
557 asm volatile(
558 "vbroadcastf128 %3,%%ymm6 \n"
559 "vmovdqa %4,%%ymm7 \n"
560
561 LABELALIGN
562 "1: \n"
563 "vmovdqu (%0),%%ymm0 \n"
564 "vmovdqu 0x20(%0),%%ymm1 \n"
565 "vmovdqu 0x40(%0),%%ymm2 \n"
566 "vmovdqu 0x60(%0),%%ymm3 \n"
567 "lea 0x80(%0),%0 \n"
568 "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
569 "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
570 "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
571 "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
572 "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
573 "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
574 "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
575 "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
576 "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
577 "vpor %%ymm4,%%ymm0,%%ymm0 \n"
578 "vmovdqu %%ymm0,(%1) \n"
579 "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
580 "vpermq $0x4f,%%ymm2,%%ymm4 \n"
581 "vpor %%ymm4,%%ymm1,%%ymm1 \n"
582 "vmovdqu %%ymm1,0x20(%1) \n"
583 "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
584 "vpermq $0x93,%%ymm3,%%ymm3 \n"
585 "vpor %%ymm3,%%ymm2,%%ymm2 \n"
586 "vmovdqu %%ymm2,0x40(%1) \n"
587 "lea 0x60(%1),%1 \n"
588 "sub $0x20,%2 \n"
589 "jg 1b \n"
590 "vzeroupper \n"
591 : "+r"(src), // %0
592 "+r"(dst), // %1
593 "+r"(width) // %2
594 : "m"(kShuffleMaskARGBToRGB24), // %3
595 "m"(kPermdRGB24_AVX) // %4
596 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
597 "xmm7");
598 }
599 #endif
600
601 #ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
602 // Shuffle table for converting ARGBToRGB24
603 static const ulvec8 kPermARGBToRGB24_0 = {
604 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u,
605 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
606 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
607 static const ulvec8 kPermARGBToRGB24_1 = {
608 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
609 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
610 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
611 static const ulvec8 kPermARGBToRGB24_2 = {
612 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
613 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
614 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
615
616 void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
617 asm volatile(
618 "vmovdqa %3,%%ymm5 \n"
619 "vmovdqa %4,%%ymm6 \n"
620 "vmovdqa %5,%%ymm7 \n"
621
622 LABELALIGN
623 "1: \n"
624 "vmovdqu (%0),%%ymm0 \n"
625 "vmovdqu 0x20(%0),%%ymm1 \n"
626 "vmovdqu 0x40(%0),%%ymm2 \n"
627 "vmovdqu 0x60(%0),%%ymm3 \n"
628 "lea 0x80(%0),%0 \n"
629 "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
630 "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
631 "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
632 "vmovdqu %%ymm0,(%1) \n"
633 "vmovdqu %%ymm1,0x20(%1) \n"
634 "vmovdqu %%ymm2,0x40(%1) \n"
635 "lea 0x60(%1),%1 \n"
636 "sub $0x20,%2 \n"
637 "jg 1b \n"
638 "vzeroupper \n"
639 : "+r"(src), // %0
640 "+r"(dst), // %1
641 "+r"(width) // %2
642 : "m"(kPermARGBToRGB24_0), // %3
643 "m"(kPermARGBToRGB24_1), // %4
644 "m"(kPermARGBToRGB24_2) // %5
645 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
646 }
647 #endif
648
649 #ifdef HAS_ARGBTORAWROW_AVX2
650 void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
651 asm volatile(
652 "vbroadcastf128 %3,%%ymm6 \n"
653 "vmovdqa %4,%%ymm7 \n"
654
655 LABELALIGN
656 "1: \n"
657 "vmovdqu (%0),%%ymm0 \n"
658 "vmovdqu 0x20(%0),%%ymm1 \n"
659 "vmovdqu 0x40(%0),%%ymm2 \n"
660 "vmovdqu 0x60(%0),%%ymm3 \n"
661 "lea 0x80(%0),%0 \n"
662 "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
663 "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
664 "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
665 "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
666 "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
667 "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
668 "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
669 "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
670 "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
671 "vpor %%ymm4,%%ymm0,%%ymm0 \n"
672 "vmovdqu %%ymm0,(%1) \n"
673 "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
674 "vpermq $0x4f,%%ymm2,%%ymm4 \n"
675 "vpor %%ymm4,%%ymm1,%%ymm1 \n"
676 "vmovdqu %%ymm1,0x20(%1) \n"
677 "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
678 "vpermq $0x93,%%ymm3,%%ymm3 \n"
679 "vpor %%ymm3,%%ymm2,%%ymm2 \n"
680 "vmovdqu %%ymm2,0x40(%1) \n"
681 "lea 0x60(%1),%1 \n"
682 "sub $0x20,%2 \n"
683 "jg 1b \n"
684 "vzeroupper \n"
685 : "+r"(src), // %0
686 "+r"(dst), // %1
687 "+r"(width) // %2
688 : "m"(kShuffleMaskARGBToRAW), // %3
689 "m"(kPermdRGB24_AVX) // %4
690 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
691 "xmm7");
692 }
693 #endif
694
695 void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
696 asm volatile(
697 "pcmpeqb %%xmm3,%%xmm3 \n"
698 "psrld $0x1b,%%xmm3 \n"
699 "pcmpeqb %%xmm4,%%xmm4 \n"
700 "psrld $0x1a,%%xmm4 \n"
701 "pslld $0x5,%%xmm4 \n"
702 "pcmpeqb %%xmm5,%%xmm5 \n"
703 "pslld $0xb,%%xmm5 \n"
704
705 LABELALIGN
706 "1: \n"
707 "movdqu (%0),%%xmm0 \n"
708 "movdqa %%xmm0,%%xmm1 \n"
709 "movdqa %%xmm0,%%xmm2 \n"
710 "pslld $0x8,%%xmm0 \n"
711 "psrld $0x3,%%xmm1 \n"
712 "psrld $0x5,%%xmm2 \n"
713 "psrad $0x10,%%xmm0 \n"
714 "pand %%xmm3,%%xmm1 \n"
715 "pand %%xmm4,%%xmm2 \n"
716 "pand %%xmm5,%%xmm0 \n"
717 "por %%xmm2,%%xmm1 \n"
718 "por %%xmm1,%%xmm0 \n"
719 "packssdw %%xmm0,%%xmm0 \n"
720 "lea 0x10(%0),%0 \n"
721 "movq %%xmm0,(%1) \n"
722 "lea 0x8(%1),%1 \n"
723 "sub $0x4,%2 \n"
724 "jg 1b \n"
725 : "+r"(src), // %0
726 "+r"(dst), // %1
727 "+r"(width) // %2
728 ::"memory",
729 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
730 }
731
732 void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
733 uint8_t* dst,
734 const uint32_t dither4,
735 int width) {
736 asm volatile(
737 "movd %3,%%xmm6 \n"
738 "punpcklbw %%xmm6,%%xmm6 \n"
739 "movdqa %%xmm6,%%xmm7 \n"
740 "punpcklwd %%xmm6,%%xmm6 \n"
741 "punpckhwd %%xmm7,%%xmm7 \n"
742 "pcmpeqb %%xmm3,%%xmm3 \n"
743 "psrld $0x1b,%%xmm3 \n"
744 "pcmpeqb %%xmm4,%%xmm4 \n"
745 "psrld $0x1a,%%xmm4 \n"
746 "pslld $0x5,%%xmm4 \n"
747 "pcmpeqb %%xmm5,%%xmm5 \n"
748 "pslld $0xb,%%xmm5 \n"
749
750 LABELALIGN
751 "1: \n"
752 "movdqu (%0),%%xmm0 \n"
753 "paddusb %%xmm6,%%xmm0 \n"
754 "movdqa %%xmm0,%%xmm1 \n"
755 "movdqa %%xmm0,%%xmm2 \n"
756 "pslld $0x8,%%xmm0 \n"
757 "psrld $0x3,%%xmm1 \n"
758 "psrld $0x5,%%xmm2 \n"
759 "psrad $0x10,%%xmm0 \n"
760 "pand %%xmm3,%%xmm1 \n"
761 "pand %%xmm4,%%xmm2 \n"
762 "pand %%xmm5,%%xmm0 \n"
763 "por %%xmm2,%%xmm1 \n"
764 "por %%xmm1,%%xmm0 \n"
765 "packssdw %%xmm0,%%xmm0 \n"
766 "lea 0x10(%0),%0 \n"
767 "movq %%xmm0,(%1) \n"
768 "lea 0x8(%1),%1 \n"
769 "sub $0x4,%2 \n"
770 "jg 1b \n"
771 : "+r"(src), // %0
772 "+r"(dst), // %1
773 "+r"(width) // %2
774 : "m"(dither4) // %3
775 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
776 "xmm7");
777 }
778
779 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
780 void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
781 uint8_t* dst,
782 const uint32_t dither4,
783 int width) {
784 asm volatile(
785 "vbroadcastss %3,%%xmm6 \n"
786 "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
787 "vpermq $0xd8,%%ymm6,%%ymm6 \n"
788 "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
789 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
790 "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
791 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
792 "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
793 "vpslld $0x5,%%ymm4,%%ymm4 \n"
794 "vpslld $0xb,%%ymm3,%%ymm5 \n"
795
796 LABELALIGN
797 "1: \n"
798 "vmovdqu (%0),%%ymm0 \n"
799 "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
800 "vpsrld $0x5,%%ymm0,%%ymm2 \n"
801 "vpsrld $0x3,%%ymm0,%%ymm1 \n"
802 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
803 "vpand %%ymm4,%%ymm2,%%ymm2 \n"
804 "vpand %%ymm3,%%ymm1,%%ymm1 \n"
805 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
806 "vpor %%ymm2,%%ymm1,%%ymm1 \n"
807 "vpor %%ymm1,%%ymm0,%%ymm0 \n"
808 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
809 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
810 "lea 0x20(%0),%0 \n"
811 "vmovdqu %%xmm0,(%1) \n"
812 "lea 0x10(%1),%1 \n"
813 "sub $0x8,%2 \n"
814 "jg 1b \n"
815 "vzeroupper \n"
816 : "+r"(src), // %0
817 "+r"(dst), // %1
818 "+r"(width) // %2
819 : "m"(dither4) // %3
820 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
821 "xmm7");
822 }
823 #endif // HAS_ARGBTORGB565DITHERROW_AVX2
824
825 void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
826 asm volatile(
827 "pcmpeqb %%xmm4,%%xmm4 \n"
828 "psrld $0x1b,%%xmm4 \n"
829 "movdqa %%xmm4,%%xmm5 \n"
830 "pslld $0x5,%%xmm5 \n"
831 "movdqa %%xmm4,%%xmm6 \n"
832 "pslld $0xa,%%xmm6 \n"
833 "pcmpeqb %%xmm7,%%xmm7 \n"
834 "pslld $0xf,%%xmm7 \n"
835
836 LABELALIGN
837 "1: \n"
838 "movdqu (%0),%%xmm0 \n"
839 "movdqa %%xmm0,%%xmm1 \n"
840 "movdqa %%xmm0,%%xmm2 \n"
841 "movdqa %%xmm0,%%xmm3 \n"
842 "psrad $0x10,%%xmm0 \n"
843 "psrld $0x3,%%xmm1 \n"
844 "psrld $0x6,%%xmm2 \n"
845 "psrld $0x9,%%xmm3 \n"
846 "pand %%xmm7,%%xmm0 \n"
847 "pand %%xmm4,%%xmm1 \n"
848 "pand %%xmm5,%%xmm2 \n"
849 "pand %%xmm6,%%xmm3 \n"
850 "por %%xmm1,%%xmm0 \n"
851 "por %%xmm3,%%xmm2 \n"
852 "por %%xmm2,%%xmm0 \n"
853 "packssdw %%xmm0,%%xmm0 \n"
854 "lea 0x10(%0),%0 \n"
855 "movq %%xmm0,(%1) \n"
856 "lea 0x8(%1),%1 \n"
857 "sub $0x4,%2 \n"
858 "jg 1b \n"
859 : "+r"(src), // %0
860 "+r"(dst), // %1
861 "+r"(width) // %2
862 ::"memory",
863 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
864 }
865
866 void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
867 asm volatile(
868 "pcmpeqb %%xmm4,%%xmm4 \n"
869 "psllw $0xc,%%xmm4 \n"
870 "movdqa %%xmm4,%%xmm3 \n"
871 "psrlw $0x8,%%xmm3 \n"
872
873 LABELALIGN
874 "1: \n"
875 "movdqu (%0),%%xmm0 \n"
876 "movdqa %%xmm0,%%xmm1 \n"
877 "pand %%xmm3,%%xmm0 \n"
878 "pand %%xmm4,%%xmm1 \n"
879 "psrlq $0x4,%%xmm0 \n"
880 "psrlq $0x8,%%xmm1 \n"
881 "por %%xmm1,%%xmm0 \n"
882 "packuswb %%xmm0,%%xmm0 \n"
883 "lea 0x10(%0),%0 \n"
884 "movq %%xmm0,(%1) \n"
885 "lea 0x8(%1),%1 \n"
886 "sub $0x4,%2 \n"
887 "jg 1b \n"
888 : "+r"(src), // %0
889 "+r"(dst), // %1
890 "+r"(width) // %2
891 ::"memory",
892 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
893 }
894 #endif // HAS_RGB24TOARGBROW_SSSE3
895
896 /*
897
898 ARGBToAR30Row:
899
900 Red Blue
901 With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
902 produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats
903 produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
904 (1024+4)*16 for red.
905
906 Alpha Green
907 Alpha and Green are already in the high bits so vpand can zero out the other
908 bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier
909 could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha
910 would be a simple multiplier to shift it into position. It wants a gap of 10
911 above the green. Green is 10 bits, so there are 6 bits in the low short. 4
912 more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits,
913 and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the
914 result left 10 to position the A and G channels.
915 */
916
917 // Shuffle tables for AR30: place B and R (ARGB) or R and B (ABGR) in the high byte of each 16 bit lane.
918 static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
919 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
920
921 static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
922 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
923
924 static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
925 static const uint32_t kMaskRB10 = 0x3ff003ff;
926 static const uint32_t kMaskAG10 = 0xc000ff00;
927 static const uint32_t kMulAG10 = 64 * 65536 + 1028;
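
/* Scalar sketch of the AR30 packing performed below (illustration only; the
   helper name is hypothetical and this code is not part of the build).
   Assumes a little-endian ARGB pixel with B in the low byte:

   static inline uint32_t ARGBPixelToAR30(uint32_t argb) {
     uint32_t b = ((argb & 0xff) * 1028) >> 8;          // 8 -> 10 bits, approx (b << 2) | (b >> 6)
     uint32_t g = (((argb >> 8) & 0xff) * 1028) >> 8;
     uint32_t r = (((argb >> 16) & 0xff) * 1028) >> 8;
     uint32_t a = argb >> 30;                           // keep the top 2 alpha bits
     return (a << 30) | (r << 20) | (g << 10) | b;
   }
*/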
928
929 void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
930 asm volatile(
931 "movdqa %3,%%xmm2 \n" // shuffler for RB
932       "movd        %4,%%xmm3                     \n"  // multiplier for RB
933 "movd %5,%%xmm4 \n" // mask for R10 B10
934 "movd %6,%%xmm5 \n" // mask for AG
935       "movd        %7,%%xmm6                     \n"  // multiplier for AG
936 "pshufd $0x0,%%xmm3,%%xmm3 \n"
937 "pshufd $0x0,%%xmm4,%%xmm4 \n"
938 "pshufd $0x0,%%xmm5,%%xmm5 \n"
939 "pshufd $0x0,%%xmm6,%%xmm6 \n"
940 "sub %0,%1 \n"
941
942 "1: \n"
943 "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
944 "movdqa %%xmm0,%%xmm1 \n"
945 "pshufb %%xmm2,%%xmm1 \n" // R0B0
946 "pand %%xmm5,%%xmm0 \n" // A0G0
947 "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
948 "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
949 "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
950 "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
951 "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
952 "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
953 "add $0x10,%0 \n"
954 "sub $0x4,%2 \n"
955 "jg 1b \n"
956
957 : "+r"(src), // %0
958 "+r"(dst), // %1
959 "+r"(width) // %2
960 : "m"(kShuffleRB30), // %3
961 "m"(kMulRB10), // %4
962 "m"(kMaskRB10), // %5
963 "m"(kMaskAG10), // %6
964 "m"(kMulAG10) // %7
965 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
966 }
967
968 void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
969 asm volatile(
970 "movdqa %3,%%xmm2 \n" // shuffler for RB
971       "movd        %4,%%xmm3                     \n"  // multiplier for RB
972 "movd %5,%%xmm4 \n" // mask for R10 B10
973 "movd %6,%%xmm5 \n" // mask for AG
974       "movd        %7,%%xmm6                     \n"  // multiplier for AG
975 "pshufd $0x0,%%xmm3,%%xmm3 \n"
976 "pshufd $0x0,%%xmm4,%%xmm4 \n"
977 "pshufd $0x0,%%xmm5,%%xmm5 \n"
978 "pshufd $0x0,%%xmm6,%%xmm6 \n"
979 "sub %0,%1 \n"
980
981 "1: \n"
982 "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
983 "movdqa %%xmm0,%%xmm1 \n"
984 "pshufb %%xmm2,%%xmm1 \n" // R0B0
985 "pand %%xmm5,%%xmm0 \n" // A0G0
986 "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
987 "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
988 "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
989 "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
990 "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
991 "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
992 "add $0x10,%0 \n"
993 "sub $0x4,%2 \n"
994 "jg 1b \n"
995
996 : "+r"(src), // %0
997 "+r"(dst), // %1
998 "+r"(width) // %2
999 : "m"(kShuffleBR30), // %3 reversed shuffler
1000 "m"(kMulRB10), // %4
1001 "m"(kMaskRB10), // %5
1002 "m"(kMaskAG10), // %6
1003 "m"(kMulAG10) // %7
1004 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1005 }
1006
1007 #ifdef HAS_ARGBTOAR30ROW_AVX2
1008 void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
1009 asm volatile(
1010 "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
1011       "vbroadcastss %4,%%ymm3                    \n"  // multiplier for RB
1012 "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
1013 "vbroadcastss %6,%%ymm5 \n" // mask for AG
1014       "vbroadcastss %7,%%ymm6                    \n"  // multiplier for AG
1015 "sub %0,%1 \n"
1016
1017 "1: \n"
1018 "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
1019 "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
1020 "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
1021 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
1022 "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
1023 "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
1024 "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
1025 "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
1026 "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
1027 "add $0x20,%0 \n"
1028 "sub $0x8,%2 \n"
1029 "jg 1b \n"
1030 "vzeroupper \n"
1031
1032 : "+r"(src), // %0
1033 "+r"(dst), // %1
1034 "+r"(width) // %2
1035 : "m"(kShuffleRB30), // %3
1036 "m"(kMulRB10), // %4
1037 "m"(kMaskRB10), // %5
1038 "m"(kMaskAG10), // %6
1039 "m"(kMulAG10) // %7
1040 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1041 }
1042 #endif
1043
1044 #ifdef HAS_ABGRTOAR30ROW_AVX2
1045 void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
1046 asm volatile(
1047 "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
1048       "vbroadcastss %4,%%ymm3                    \n"  // multiplier for RB
1049 "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
1050 "vbroadcastss %6,%%ymm5 \n" // mask for AG
1051       "vbroadcastss %7,%%ymm6                    \n"  // multiplier for AG
1052 "sub %0,%1 \n"
1053
1054 "1: \n"
1055 "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
1056 "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
1057 "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
1058 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
1059 "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
1060 "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
1061 "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
1062 "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
1063 "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
1064 "add $0x20,%0 \n"
1065 "sub $0x8,%2 \n"
1066 "jg 1b \n"
1067 "vzeroupper \n"
1068
1069 : "+r"(src), // %0
1070 "+r"(dst), // %1
1071 "+r"(width) // %2
1072 : "m"(kShuffleBR30), // %3 reversed shuffler
1073 "m"(kMulRB10), // %4
1074 "m"(kMaskRB10), // %5
1075 "m"(kMaskAG10), // %6
1076 "m"(kMulAG10) // %7
1077 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1078 }
1079 #endif
1080
1081 // clang-format off
1082
1083 // TODO(mraptis): Consider passing R, G, B multipliers as parameter.
1084 // round parameter is register containing value to add before shift.
1085 #define RGBTOY(round) \
1086 "1: \n" \
1087 "movdqu (%0),%%xmm0 \n" \
1088 "movdqu 0x10(%0),%%xmm1 \n" \
1089 "movdqu 0x20(%0),%%xmm2 \n" \
1090 "movdqu 0x30(%0),%%xmm3 \n" \
1091 "psubb %%xmm5,%%xmm0 \n" \
1092 "psubb %%xmm5,%%xmm1 \n" \
1093 "psubb %%xmm5,%%xmm2 \n" \
1094 "psubb %%xmm5,%%xmm3 \n" \
1095 "movdqu %%xmm4,%%xmm6 \n" \
1096 "pmaddubsw %%xmm0,%%xmm6 \n" \
1097 "movdqu %%xmm4,%%xmm0 \n" \
1098 "pmaddubsw %%xmm1,%%xmm0 \n" \
1099 "movdqu %%xmm4,%%xmm1 \n" \
1100 "pmaddubsw %%xmm2,%%xmm1 \n" \
1101 "movdqu %%xmm4,%%xmm2 \n" \
1102 "pmaddubsw %%xmm3,%%xmm2 \n" \
1103 "lea 0x40(%0),%0 \n" \
1104 "phaddw %%xmm0,%%xmm6 \n" \
1105 "phaddw %%xmm2,%%xmm1 \n" \
1106 "prefetcht0 1280(%0) \n" \
1107 "paddw %%" #round ",%%xmm6 \n" \
1108 "paddw %%" #round ",%%xmm1 \n" \
1109 "psrlw $0x8,%%xmm6 \n" \
1110 "psrlw $0x8,%%xmm1 \n" \
1111 "packuswb %%xmm1,%%xmm6 \n" \
1112 "movdqu %%xmm6,(%1) \n" \
1113 "lea 0x10(%1),%1 \n" \
1114 "sub $0x10,%2 \n" \
1115 "jg 1b \n"
1116
1117 #define RGBTOY_AVX2(round) \
1118 "1: \n" \
1119 "vmovdqu (%0),%%ymm0 \n" \
1120 "vmovdqu 0x20(%0),%%ymm1 \n" \
1121 "vmovdqu 0x40(%0),%%ymm2 \n" \
1122 "vmovdqu 0x60(%0),%%ymm3 \n" \
1123 "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
1124 "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
1125 "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
1126 "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
1127 "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
1128 "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
1129 "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
1130 "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
1131 "lea 0x80(%0),%0 \n" \
1132 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
1133 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
1134 "prefetcht0 1280(%0) \n" \
1135 "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
1136 "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
1137 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
1138 "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
1139 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
1140 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
1141 "vmovdqu %%ymm0,(%1) \n" \
1142 "lea 0x20(%1),%1 \n" \
1143 "sub $0x20,%2 \n" \
1144 "jg 1b \n" \
1145 "vzeroupper \n"
1146
1147 // clang-format on
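
/* Scalar sketch of the per-pixel math in RGBTOY / RGBTOY_AVX2 (illustration
   only; the helper name is hypothetical and this code is not part of the
   build). Biasing the bytes by -128 keeps the pmaddubsw partial sums within
   int16 range; the "round" register restores the bias and adds the rounding
   term (and the +16 offset for the non-J paths):

   static inline uint8_t ARGBPixelToY(uint8_t b, uint8_t g, uint8_t r) {
     int y = 25 * (b - 128) + 129 * (g - 128) + 66 * (r - 128);
     return (uint8_t)((y + 0x7e80) >> 8);  // == (25*b + 129*g + 66*r + 0x1080) >> 8
   }

   With kSub128 (0x8080) as the round value and the JPeg coefficients, the
   same pattern yields the full-range YJ formula (no +16 offset). */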
1148
1149 #ifdef HAS_ARGBTOYROW_SSSE3
1150 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
1151 void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1152 asm volatile(
1153 "movdqa %3,%%xmm4 \n"
1154 "movdqa %4,%%xmm5 \n"
1155 "movdqa %5,%%xmm7 \n"
1156
1157 LABELALIGN RGBTOY(xmm7)
1158 : "+r"(src_argb), // %0
1159 "+r"(dst_y), // %1
1160 "+r"(width) // %2
1161 : "m"(kARGBToY), // %3
1162 "m"(kSub128), // %4
1163 "m"(kAddY16) // %5
1164 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1165 "xmm7");
1166 }
1167 #endif // HAS_ARGBTOYROW_SSSE3
1168
1169 #ifdef HAS_ARGBTOYJROW_SSSE3
1170 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
1171 // Same as ARGBToYRow but different coefficients, no add 16.
1172 void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1173 asm volatile(
1174 "movdqa %3,%%xmm4 \n"
1175 "movdqa %4,%%xmm5 \n"
1176
1177 LABELALIGN RGBTOY(xmm5)
1178 : "+r"(src_argb), // %0
1179 "+r"(dst_y), // %1
1180 "+r"(width) // %2
1181 : "m"(kARGBToYJ), // %3
1182 "m"(kSub128) // %4
1183 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1184 }
1185 #endif // HAS_ARGBTOYJROW_SSSE3
1186
1187 #ifdef HAS_RGBATOYJROW_SSSE3
1188 // Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
1189 // Same as ARGBToYRow but different coefficients, no add 16.
1190 void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
1191 asm volatile(
1192 "movdqa %3,%%xmm4 \n"
1193 "movdqa %4,%%xmm5 \n"
1194
1195 LABELALIGN RGBTOY(xmm5)
1196 : "+r"(src_rgba), // %0
1197 "+r"(dst_y), // %1
1198 "+r"(width) // %2
1199 : "m"(kRGBAToYJ), // %3
1200 "m"(kSub128) // %4
1201 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1202 }
1203 #endif // HAS_RGBATOYJROW_SSSE3
1204
1205 #ifdef HAS_ARGBTOYROW_AVX2
1206 // vpermd to restore pixel order after the lane-interleaving vphaddw + vpackuswb.
1207 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
1208
1209 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1210 void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1211 asm volatile(
1212 "vbroadcastf128 %3,%%ymm4 \n"
1213 "vbroadcastf128 %4,%%ymm5 \n"
1214 "vbroadcastf128 %5,%%ymm7 \n"
1215 "vmovdqu %6,%%ymm6 \n"
1216
1217 LABELALIGN RGBTOY_AVX2(ymm7)
1218 : "+r"(src_argb), // %0
1219 "+r"(dst_y), // %1
1220 "+r"(width) // %2
1221 : "m"(kARGBToY), // %3
1222 "m"(kSub128), // %4
1223 "m"(kAddY16), // %5
1224 "m"(kPermdARGBToY_AVX) // %6
1225 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1226 "xmm7");
1227 }
1228 #endif // HAS_ARGBTOYROW_AVX2
1229
1230 #ifdef HAS_ABGRTOYROW_AVX2
1231 // Convert 32 ABGR pixels (128 bytes) to 32 Y values.
1232 void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
1233 asm volatile(
1234 "vbroadcastf128 %3,%%ymm4 \n"
1235 "vbroadcastf128 %4,%%ymm5 \n"
1236 "vbroadcastf128 %5,%%ymm7 \n"
1237 "vmovdqu %6,%%ymm6 \n"
1238
1239 LABELALIGN RGBTOY_AVX2(ymm7)
1240 : "+r"(src_abgr), // %0
1241 "+r"(dst_y), // %1
1242 "+r"(width) // %2
1243 : "m"(kABGRToY), // %3
1244 "m"(kSub128), // %4
1245 "m"(kAddY16), // %5
1246 "m"(kPermdARGBToY_AVX) // %6
1247 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1248 "xmm7");
1249 }
1250 #endif // HAS_ABGRTOYROW_AVX2
1251
1252 #ifdef HAS_ARGBTOYJROW_AVX2
1253 // Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
1254 void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1255 asm volatile(
1256 "vbroadcastf128 %3,%%ymm4 \n"
1257 "vbroadcastf128 %4,%%ymm5 \n"
1258 "vmovdqu %5,%%ymm6 \n"
1259
1260 LABELALIGN RGBTOY_AVX2(ymm5)
1261 : "+r"(src_argb), // %0
1262 "+r"(dst_y), // %1
1263 "+r"(width) // %2
1264 : "m"(kARGBToYJ), // %3
1265 "m"(kSub128), // %4
1266 "m"(kPermdARGBToY_AVX) // %5
1267 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1268 "xmm7");
1269 }
1270 #endif // HAS_ARGBTOYJROW_AVX2
1271
1272 #ifdef HAS_RGBATOYJROW_AVX2
1273 // Convert 32 RGBA pixels (128 bytes) to 32 YJ values.
1274 void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
1275 asm volatile(
1276 "vbroadcastf128 %3,%%ymm4 \n"
1277 "vbroadcastf128 %4,%%ymm5 \n"
1278 "vmovdqu %5,%%ymm6 \n"
1279
1280 LABELALIGN RGBTOY_AVX2(
1281 ymm5) "vzeroupper \n"
1282 : "+r"(src_rgba), // %0
1283 "+r"(dst_y), // %1
1284 "+r"(width) // %2
1285 : "m"(kRGBAToYJ), // %3
1286 "m"(kSub128), // %4
1287 "m"(kPermdARGBToY_AVX) // %5
1288 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1289 }
1290 #endif // HAS_RGBATOYJROW_AVX2
1291
1292 #ifdef HAS_ARGBTOUVROW_SSSE3
1293 void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
1294 int src_stride_argb,
1295 uint8_t* dst_u,
1296 uint8_t* dst_v,
1297 int width) {
1298 asm volatile(
1299 "movdqa %5,%%xmm3 \n"
1300 "movdqa %6,%%xmm4 \n"
1301 "movdqa %7,%%xmm5 \n"
1302 "sub %1,%2 \n"
1303
1304 LABELALIGN
1305 "1: \n"
1306 "movdqu (%0),%%xmm0 \n"
1307 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
1308 "pavgb %%xmm7,%%xmm0 \n"
1309 "movdqu 0x10(%0),%%xmm1 \n"
1310 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1311 "pavgb %%xmm7,%%xmm1 \n"
1312 "movdqu 0x20(%0),%%xmm2 \n"
1313 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1314 "pavgb %%xmm7,%%xmm2 \n"
1315 "movdqu 0x30(%0),%%xmm6 \n"
1316 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1317 "pavgb %%xmm7,%%xmm6 \n"
1318
1319 "lea 0x40(%0),%0 \n"
1320 "movdqa %%xmm0,%%xmm7 \n"
1321 "shufps $0x88,%%xmm1,%%xmm0 \n"
1322 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1323 "pavgb %%xmm7,%%xmm0 \n"
1324 "movdqa %%xmm2,%%xmm7 \n"
1325 "shufps $0x88,%%xmm6,%%xmm2 \n"
1326 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1327 "pavgb %%xmm7,%%xmm2 \n"
1328 "movdqa %%xmm0,%%xmm1 \n"
1329 "movdqa %%xmm2,%%xmm6 \n"
1330 "pmaddubsw %%xmm4,%%xmm0 \n"
1331 "pmaddubsw %%xmm4,%%xmm2 \n"
1332 "pmaddubsw %%xmm3,%%xmm1 \n"
1333 "pmaddubsw %%xmm3,%%xmm6 \n"
1334 "phaddw %%xmm2,%%xmm0 \n"
1335 "phaddw %%xmm6,%%xmm1 \n"
1336 "psraw $0x8,%%xmm0 \n"
1337 "psraw $0x8,%%xmm1 \n"
1338 "packsswb %%xmm1,%%xmm0 \n"
1339 "paddb %%xmm5,%%xmm0 \n"
1340 "movlps %%xmm0,(%1) \n"
1341 "movhps %%xmm0,0x00(%1,%2,1) \n"
1342 "lea 0x8(%1),%1 \n"
1343 "sub $0x10,%3 \n"
1344 "jg 1b \n"
1345 : "+r"(src_argb0), // %0
1346 "+r"(dst_u), // %1
1347 "+r"(dst_v), // %2
1348 "+rm"(width) // %3
1349 : "r"((intptr_t)(src_stride_argb)), // %4
1350 "m"(kARGBToV), // %5
1351 "m"(kARGBToU), // %6
1352 "m"(kAddUV128) // %7
1353 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1354 }
1355 #endif // HAS_ARGBTOUVROW_SSSE3
1356
1357 #ifdef HAS_ARGBTOUVROW_AVX2
1358 // vpshufb for vphaddw + vpackuswb packed to shorts.
1359 static const lvec8 kShufARGBToUV_AVX = {
1360 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
1361 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
1362 void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
1363 int src_stride_argb,
1364 uint8_t* dst_u,
1365 uint8_t* dst_v,
1366 int width) {
1367 asm volatile(
1368 "vbroadcastf128 %5,%%ymm5 \n"
1369 "vbroadcastf128 %6,%%ymm6 \n"
1370 "vbroadcastf128 %7,%%ymm7 \n"
1371 "sub %1,%2 \n"
1372
1373 LABELALIGN
1374 "1: \n"
1375 "vmovdqu (%0),%%ymm0 \n"
1376 "vmovdqu 0x20(%0),%%ymm1 \n"
1377 "vmovdqu 0x40(%0),%%ymm2 \n"
1378 "vmovdqu 0x60(%0),%%ymm3 \n"
1379 "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
1380 "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
1381 "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
1382 "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
1383 "lea 0x80(%0),%0 \n"
1384 "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
1385 "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
1386 "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
1387 "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
1388 "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
1389 "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
1390
1391 "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
1392 "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
1393 "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
1394 "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
1395 "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
1396 "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
1397 "vpsraw $0x8,%%ymm1,%%ymm1 \n"
1398 "vpsraw $0x8,%%ymm0,%%ymm0 \n"
1399 "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
1400 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
1401 "vpshufb %8,%%ymm0,%%ymm0 \n"
1402 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
1403
1404 "vextractf128 $0x0,%%ymm0,(%1) \n"
1405 "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
1406 "lea 0x10(%1),%1 \n"
1407 "sub $0x20,%3 \n"
1408 "jg 1b \n"
1409 "vzeroupper \n"
1410 : "+r"(src_argb0), // %0
1411 "+r"(dst_u), // %1
1412 "+r"(dst_v), // %2
1413 "+rm"(width) // %3
1414 : "r"((intptr_t)(src_stride_argb)), // %4
1415 "m"(kAddUV128), // %5
1416 "m"(kARGBToV), // %6
1417 "m"(kARGBToU), // %7
1418 "m"(kShufARGBToUV_AVX) // %8
1419 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1420 "xmm7");
1421 }
1422 #endif // HAS_ARGBTOUVROW_AVX2
1423
1424 #ifdef HAS_ABGRTOUVROW_AVX2
1425 void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
1426 int src_stride_abgr,
1427 uint8_t* dst_u,
1428 uint8_t* dst_v,
1429 int width) {
1430 asm volatile(
1431 "vbroadcastf128 %5,%%ymm5 \n"
1432 "vbroadcastf128 %6,%%ymm6 \n"
1433 "vbroadcastf128 %7,%%ymm7 \n"
1434 "sub %1,%2 \n"
1435
1436 LABELALIGN
1437 "1: \n"
1438 "vmovdqu (%0),%%ymm0 \n"
1439 "vmovdqu 0x20(%0),%%ymm1 \n"
1440 "vmovdqu 0x40(%0),%%ymm2 \n"
1441 "vmovdqu 0x60(%0),%%ymm3 \n"
1442 "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
1443 "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
1444 "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
1445 "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
1446 "lea 0x80(%0),%0 \n"
1447 "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
1448 "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
1449 "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
1450 "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
1451 "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
1452 "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
1453
1454 "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
1455 "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
1456 "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
1457 "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
1458 "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
1459 "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
1460 "vpsraw $0x8,%%ymm1,%%ymm1 \n"
1461 "vpsraw $0x8,%%ymm0,%%ymm0 \n"
1462 "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
1463 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
1464 "vpshufb %8,%%ymm0,%%ymm0 \n"
1465 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
1466
1467 "vextractf128 $0x0,%%ymm0,(%1) \n"
1468 "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
1469 "lea 0x10(%1),%1 \n"
1470 "sub $0x20,%3 \n"
1471 "jg 1b \n"
1472 "vzeroupper \n"
1473 : "+r"(src_abgr0), // %0
1474 "+r"(dst_u), // %1
1475 "+r"(dst_v), // %2
1476 "+rm"(width) // %3
1477 : "r"((intptr_t)(src_stride_abgr)), // %4
1478 "m"(kAddUV128), // %5
1479 "m"(kABGRToV), // %6
1480 "m"(kABGRToU), // %7
1481 "m"(kShufARGBToUV_AVX) // %8
1482 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1483 "xmm7");
1484 }
1485 #endif // HAS_ABGRTOUVROW_AVX2
1486
1487 #ifdef HAS_ARGBTOUVJROW_AVX2
1488 void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
1489 int src_stride_argb,
1490 uint8_t* dst_u,
1491 uint8_t* dst_v,
1492 int width) {
1493 asm volatile(
1494 "vbroadcastf128 %5,%%ymm5 \n"
1495 "vbroadcastf128 %6,%%ymm6 \n"
1496 "vbroadcastf128 %7,%%ymm7 \n"
1497 "sub %1,%2 \n"
1498
1499 LABELALIGN
1500 "1: \n"
1501 "vmovdqu (%0),%%ymm0 \n"
1502 "vmovdqu 0x20(%0),%%ymm1 \n"
1503 "vmovdqu 0x40(%0),%%ymm2 \n"
1504 "vmovdqu 0x60(%0),%%ymm3 \n"
1505 "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
1506 "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
1507 "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
1508 "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
1509 "lea 0x80(%0),%0 \n"
1510 "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
1511 "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
1512 "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
1513 "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
1514 "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
1515 "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
1516
1517 "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
1518 "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
1519 "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
1520 "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
1521 "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
1522 "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
1523 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
1524 "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
1525 "vpsraw $0x8,%%ymm1,%%ymm1 \n"
1526 "vpsraw $0x8,%%ymm0,%%ymm0 \n"
1527 "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
1528 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
1529 "vpshufb %8,%%ymm0,%%ymm0 \n"
1530
1531 "vextractf128 $0x0,%%ymm0,(%1) \n"
1532 "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
1533 "lea 0x10(%1),%1 \n"
1534 "sub $0x20,%3 \n"
1535 "jg 1b \n"
1536 "vzeroupper \n"
1537 : "+r"(src_argb0), // %0
1538 "+r"(dst_u), // %1
1539 "+r"(dst_v), // %2
1540 "+rm"(width) // %3
1541 : "r"((intptr_t)(src_stride_argb)), // %4
1542 "m"(kSub128), // %5
1543 "m"(kARGBToVJ), // %6
1544 "m"(kARGBToUJ), // %7
1545 "m"(kShufARGBToUV_AVX) // %8
1546 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1547 "xmm7");
1548 }
1549 #endif // HAS_ARGBTOUVJROW_AVX2
1550
1551 #ifdef HAS_ARGBTOUVJROW_SSSE3
1552 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
1553 int src_stride_argb,
1554 uint8_t* dst_u,
1555 uint8_t* dst_v,
1556 int width) {
1557 asm volatile(
1558 "movdqa %5,%%xmm3 \n"
1559 "movdqa %6,%%xmm4 \n"
1560 "movdqa %7,%%xmm5 \n"
1561 "sub %1,%2 \n"
1562
1563 LABELALIGN
1564 "1: \n"
1565 "movdqu (%0),%%xmm0 \n"
1566 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
1567 "pavgb %%xmm7,%%xmm0 \n"
1568 "movdqu 0x10(%0),%%xmm1 \n"
1569 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1570 "pavgb %%xmm7,%%xmm1 \n"
1571 "movdqu 0x20(%0),%%xmm2 \n"
1572 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1573 "pavgb %%xmm7,%%xmm2 \n"
1574 "movdqu 0x30(%0),%%xmm6 \n"
1575 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1576 "pavgb %%xmm7,%%xmm6 \n"
1577
1578 "lea 0x40(%0),%0 \n"
1579 "movdqa %%xmm0,%%xmm7 \n"
1580 "shufps $0x88,%%xmm1,%%xmm0 \n"
1581 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1582 "pavgb %%xmm7,%%xmm0 \n"
1583 "movdqa %%xmm2,%%xmm7 \n"
1584 "shufps $0x88,%%xmm6,%%xmm2 \n"
1585 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1586 "pavgb %%xmm7,%%xmm2 \n"
1587 "movdqa %%xmm0,%%xmm1 \n"
1588 "movdqa %%xmm2,%%xmm6 \n"
1589 "pmaddubsw %%xmm4,%%xmm0 \n"
1590 "pmaddubsw %%xmm4,%%xmm2 \n"
1591 "pmaddubsw %%xmm3,%%xmm1 \n"
1592 "pmaddubsw %%xmm3,%%xmm6 \n"
1593 "phaddw %%xmm2,%%xmm0 \n"
1594 "phaddw %%xmm6,%%xmm1 \n"
1595 "paddw %%xmm5,%%xmm0 \n"
1596 "paddw %%xmm5,%%xmm1 \n"
1597 "psraw $0x8,%%xmm0 \n"
1598 "psraw $0x8,%%xmm1 \n"
1599 "packsswb %%xmm1,%%xmm0 \n"
1600 "movlps %%xmm0,(%1) \n"
1601 "movhps %%xmm0,0x00(%1,%2,1) \n"
1602 "lea 0x8(%1),%1 \n"
1603 "sub $0x10,%3 \n"
1604 "jg 1b \n"
1605 : "+r"(src_argb0), // %0
1606 "+r"(dst_u), // %1
1607 "+r"(dst_v), // %2
1608 "+rm"(width) // %3
1609 : "r"((intptr_t)(src_stride_argb)), // %4
1610 "m"(kARGBToVJ), // %5
1611 "m"(kARGBToUJ), // %6
1612 "m"(kSub128) // %7
1613 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1614 }
1615 #endif // HAS_ARGBTOUVJROW_SSSE3
1616
1617 #ifdef HAS_ARGBTOUV444ROW_SSSE3
1618 void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
1619 uint8_t* dst_u,
1620 uint8_t* dst_v,
1621 int width) {
1622 asm volatile(
1623 "movdqa %4,%%xmm3 \n"
1624 "movdqa %5,%%xmm4 \n"
1625 "movdqa %6,%%xmm5 \n"
1626 "sub %1,%2 \n"
1627
1628 LABELALIGN
1629 "1: \n"
1630 "movdqu (%0),%%xmm0 \n"
1631 "movdqu 0x10(%0),%%xmm1 \n"
1632 "movdqu 0x20(%0),%%xmm2 \n"
1633 "movdqu 0x30(%0),%%xmm6 \n"
1634 "pmaddubsw %%xmm4,%%xmm0 \n"
1635 "pmaddubsw %%xmm4,%%xmm1 \n"
1636 "pmaddubsw %%xmm4,%%xmm2 \n"
1637 "pmaddubsw %%xmm4,%%xmm6 \n"
1638 "phaddw %%xmm1,%%xmm0 \n"
1639 "phaddw %%xmm6,%%xmm2 \n"
1640 "psraw $0x8,%%xmm0 \n"
1641 "psraw $0x8,%%xmm2 \n"
1642 "packsswb %%xmm2,%%xmm0 \n"
1643 "paddb %%xmm5,%%xmm0 \n"
1644 "movdqu %%xmm0,(%1) \n"
1645 "movdqu (%0),%%xmm0 \n"
1646 "movdqu 0x10(%0),%%xmm1 \n"
1647 "movdqu 0x20(%0),%%xmm2 \n"
1648 "movdqu 0x30(%0),%%xmm6 \n"
1649 "pmaddubsw %%xmm3,%%xmm0 \n"
1650 "pmaddubsw %%xmm3,%%xmm1 \n"
1651 "pmaddubsw %%xmm3,%%xmm2 \n"
1652 "pmaddubsw %%xmm3,%%xmm6 \n"
1653 "phaddw %%xmm1,%%xmm0 \n"
1654 "phaddw %%xmm6,%%xmm2 \n"
1655 "psraw $0x8,%%xmm0 \n"
1656 "psraw $0x8,%%xmm2 \n"
1657 "packsswb %%xmm2,%%xmm0 \n"
1658 "paddb %%xmm5,%%xmm0 \n"
1659 "lea 0x40(%0),%0 \n"
1660 "movdqu %%xmm0,0x00(%1,%2,1) \n"
1661 "lea 0x10(%1),%1 \n"
1662 "sub $0x10,%3 \n"
1663 "jg 1b \n"
1664 : "+r"(src_argb), // %0
1665 "+r"(dst_u), // %1
1666 "+r"(dst_v), // %2
1667 "+rm"(width) // %3
1668 : "m"(kARGBToV), // %4
1669 "m"(kARGBToU), // %5
1670 "m"(kAddUV128) // %6
1671 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
1672 }
1673 #endif // HAS_ARGBTOUV444ROW_SSSE3
1674
1675 void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
1676 asm volatile(
1677 "movdqa %3,%%xmm4 \n"
1678 "movdqa %4,%%xmm5 \n"
1679 "movdqa %5,%%xmm7 \n"
1680
1681 LABELALIGN RGBTOY(xmm7)
1682 : "+r"(src_bgra), // %0
1683 "+r"(dst_y), // %1
1684 "+r"(width) // %2
1685 : "m"(kBGRAToY), // %3
1686 "m"(kSub128), // %4
1687 "m"(kAddY16) // %5
1688 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1689 "xmm7");
1690 }
1691
1692 void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
1693 int src_stride_bgra,
1694 uint8_t* dst_u,
1695 uint8_t* dst_v,
1696 int width) {
1697 asm volatile(
1698 "movdqa %5,%%xmm3 \n"
1699 "movdqa %6,%%xmm4 \n"
1700 "movdqa %7,%%xmm5 \n"
1701 "sub %1,%2 \n"
1702
1703 LABELALIGN
1704 "1: \n"
1705 "movdqu (%0),%%xmm0 \n"
1706 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
1707 "pavgb %%xmm7,%%xmm0 \n"
1708 "movdqu 0x10(%0),%%xmm1 \n"
1709 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1710 "pavgb %%xmm7,%%xmm1 \n"
1711 "movdqu 0x20(%0),%%xmm2 \n"
1712 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1713 "pavgb %%xmm7,%%xmm2 \n"
1714 "movdqu 0x30(%0),%%xmm6 \n"
1715 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1716 "pavgb %%xmm7,%%xmm6 \n"
1717
1718 "lea 0x40(%0),%0 \n"
1719 "movdqa %%xmm0,%%xmm7 \n"
1720 "shufps $0x88,%%xmm1,%%xmm0 \n"
1721 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1722 "pavgb %%xmm7,%%xmm0 \n"
1723 "movdqa %%xmm2,%%xmm7 \n"
1724 "shufps $0x88,%%xmm6,%%xmm2 \n"
1725 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1726 "pavgb %%xmm7,%%xmm2 \n"
1727 "movdqa %%xmm0,%%xmm1 \n"
1728 "movdqa %%xmm2,%%xmm6 \n"
1729 "pmaddubsw %%xmm4,%%xmm0 \n"
1730 "pmaddubsw %%xmm4,%%xmm2 \n"
1731 "pmaddubsw %%xmm3,%%xmm1 \n"
1732 "pmaddubsw %%xmm3,%%xmm6 \n"
1733 "phaddw %%xmm2,%%xmm0 \n"
1734 "phaddw %%xmm6,%%xmm1 \n"
1735 "psraw $0x8,%%xmm0 \n"
1736 "psraw $0x8,%%xmm1 \n"
1737 "packsswb %%xmm1,%%xmm0 \n"
1738 "paddb %%xmm5,%%xmm0 \n"
1739 "movlps %%xmm0,(%1) \n"
1740 "movhps %%xmm0,0x00(%1,%2,1) \n"
1741 "lea 0x8(%1),%1 \n"
1742 "sub $0x10,%3 \n"
1743 "jg 1b \n"
1744 : "+r"(src_bgra0), // %0
1745 "+r"(dst_u), // %1
1746 "+r"(dst_v), // %2
1747 "+rm"(width) // %3
1748 : "r"((intptr_t)(src_stride_bgra)), // %4
1749 "m"(kBGRAToV), // %5
1750 "m"(kBGRAToU), // %6
1751 "m"(kAddUV128) // %7
1752 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1753 }
1754
1755 void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
1756 asm volatile(
1757 "movdqa %3,%%xmm4 \n"
1758 "movdqa %4,%%xmm5 \n"
1759 "movdqa %5,%%xmm7 \n"
1760
1761 LABELALIGN RGBTOY(xmm7)
1762 : "+r"(src_abgr), // %0
1763 "+r"(dst_y), // %1
1764 "+r"(width) // %2
1765 : "m"(kABGRToY), // %3
1766 "m"(kSub128), // %4
1767 "m"(kAddY16) // %5
1768 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1769 "xmm7");
1770 }
1771
1772 void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
1773 asm volatile(
1774 "movdqa %3,%%xmm4 \n"
1775 "movdqa %4,%%xmm5 \n"
1776 "movdqa %5,%%xmm7 \n"
1777
1778 LABELALIGN RGBTOY(xmm7)
1779 : "+r"(src_rgba), // %0
1780 "+r"(dst_y), // %1
1781 "+r"(width) // %2
1782 : "m"(kRGBAToY), // %3
1783 "m"(kSub128), // %4
1784 "m"(kAddY16) // %5
1785 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1786 "xmm7");
1787 }
1788
1789 void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
1790 int src_stride_abgr,
1791 uint8_t* dst_u,
1792 uint8_t* dst_v,
1793 int width) {
1794 asm volatile(
1795 "movdqa %5,%%xmm3 \n"
1796 "movdqa %6,%%xmm4 \n"
1797 "movdqa %7,%%xmm5 \n"
1798 "sub %1,%2 \n"
1799
1800 LABELALIGN
1801 "1: \n"
1802 "movdqu (%0),%%xmm0 \n"
1803 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
1804 "pavgb %%xmm7,%%xmm0 \n"
1805 "movdqu 0x10(%0),%%xmm1 \n"
1806 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1807 "pavgb %%xmm7,%%xmm1 \n"
1808 "movdqu 0x20(%0),%%xmm2 \n"
1809 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1810 "pavgb %%xmm7,%%xmm2 \n"
1811 "movdqu 0x30(%0),%%xmm6 \n"
1812 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1813 "pavgb %%xmm7,%%xmm6 \n"
1814
1815 "lea 0x40(%0),%0 \n"
1816 "movdqa %%xmm0,%%xmm7 \n"
1817 "shufps $0x88,%%xmm1,%%xmm0 \n"
1818 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1819 "pavgb %%xmm7,%%xmm0 \n"
1820 "movdqa %%xmm2,%%xmm7 \n"
1821 "shufps $0x88,%%xmm6,%%xmm2 \n"
1822 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1823 "pavgb %%xmm7,%%xmm2 \n"
1824 "movdqa %%xmm0,%%xmm1 \n"
1825 "movdqa %%xmm2,%%xmm6 \n"
1826 "pmaddubsw %%xmm4,%%xmm0 \n"
1827 "pmaddubsw %%xmm4,%%xmm2 \n"
1828 "pmaddubsw %%xmm3,%%xmm1 \n"
1829 "pmaddubsw %%xmm3,%%xmm6 \n"
1830 "phaddw %%xmm2,%%xmm0 \n"
1831 "phaddw %%xmm6,%%xmm1 \n"
1832 "psraw $0x8,%%xmm0 \n"
1833 "psraw $0x8,%%xmm1 \n"
1834 "packsswb %%xmm1,%%xmm0 \n"
1835 "paddb %%xmm5,%%xmm0 \n"
1836 "movlps %%xmm0,(%1) \n"
1837 "movhps %%xmm0,0x00(%1,%2,1) \n"
1838 "lea 0x8(%1),%1 \n"
1839 "sub $0x10,%3 \n"
1840 "jg 1b \n"
1841 : "+r"(src_abgr0), // %0
1842 "+r"(dst_u), // %1
1843 "+r"(dst_v), // %2
1844 "+rm"(width) // %3
1845 : "r"((intptr_t)(src_stride_abgr)), // %4
1846 "m"(kABGRToV), // %5
1847 "m"(kABGRToU), // %6
1848 "m"(kAddUV128) // %7
1849 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1850 }
1851
1852 void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
1853 int src_stride_rgba,
1854 uint8_t* dst_u,
1855 uint8_t* dst_v,
1856 int width) {
1857 asm volatile(
1858 "movdqa %5,%%xmm3 \n"
1859 "movdqa %6,%%xmm4 \n"
1860 "movdqa %7,%%xmm5 \n"
1861 "sub %1,%2 \n"
1862
1863 LABELALIGN
1864 "1: \n"
1865 "movdqu (%0),%%xmm0 \n"
1866 "movdqu 0x00(%0,%4,1),%%xmm7 \n"
1867 "pavgb %%xmm7,%%xmm0 \n"
1868 "movdqu 0x10(%0),%%xmm1 \n"
1869 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1870 "pavgb %%xmm7,%%xmm1 \n"
1871 "movdqu 0x20(%0),%%xmm2 \n"
1872 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1873 "pavgb %%xmm7,%%xmm2 \n"
1874 "movdqu 0x30(%0),%%xmm6 \n"
1875 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1876 "pavgb %%xmm7,%%xmm6 \n"
1877
1878 "lea 0x40(%0),%0 \n"
1879 "movdqa %%xmm0,%%xmm7 \n"
1880 "shufps $0x88,%%xmm1,%%xmm0 \n"
1881 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1882 "pavgb %%xmm7,%%xmm0 \n"
1883 "movdqa %%xmm2,%%xmm7 \n"
1884 "shufps $0x88,%%xmm6,%%xmm2 \n"
1885 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1886 "pavgb %%xmm7,%%xmm2 \n"
1887 "movdqa %%xmm0,%%xmm1 \n"
1888 "movdqa %%xmm2,%%xmm6 \n"
1889 "pmaddubsw %%xmm4,%%xmm0 \n"
1890 "pmaddubsw %%xmm4,%%xmm2 \n"
1891 "pmaddubsw %%xmm3,%%xmm1 \n"
1892 "pmaddubsw %%xmm3,%%xmm6 \n"
1893 "phaddw %%xmm2,%%xmm0 \n"
1894 "phaddw %%xmm6,%%xmm1 \n"
1895 "psraw $0x8,%%xmm0 \n"
1896 "psraw $0x8,%%xmm1 \n"
1897 "packsswb %%xmm1,%%xmm0 \n"
1898 "paddb %%xmm5,%%xmm0 \n"
1899 "movlps %%xmm0,(%1) \n"
1900 "movhps %%xmm0,0x00(%1,%2,1) \n"
1901 "lea 0x8(%1),%1 \n"
1902 "sub $0x10,%3 \n"
1903 "jg 1b \n"
1904 : "+r"(src_rgba0), // %0
1905 "+r"(dst_u), // %1
1906 "+r"(dst_v), // %2
1907 "+rm"(width) // %3
1908 : "r"((intptr_t)(src_stride_rgba)), // %4
1909 "m"(kRGBAToV), // %5
1910 "m"(kRGBAToU), // %6
1911 "m"(kAddUV128) // %7
1912 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
1913 }
1914
1915 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
1916
1917 // Read 8 UV from 444
1918 #define READYUV444 \
1919 "movq (%[u_buf]),%%xmm0 \n" \
1920 "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
1921 "lea 0x8(%[u_buf]),%[u_buf] \n" \
1922 "punpcklbw %%xmm1,%%xmm0 \n" \
1923 "movq (%[y_buf]),%%xmm4 \n" \
1924 "punpcklbw %%xmm4,%%xmm4 \n" \
1925 "lea 0x8(%[y_buf]),%[y_buf] \n"
1926
1927 // Read 4 UV from 422, upsample to 8 UV
1928 #define READYUV422 \
1929 "movd (%[u_buf]),%%xmm0 \n" \
1930 "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
1931 "lea 0x4(%[u_buf]),%[u_buf] \n" \
1932 "punpcklbw %%xmm1,%%xmm0 \n" \
1933 "punpcklwd %%xmm0,%%xmm0 \n" \
1934 "movq (%[y_buf]),%%xmm4 \n" \
1935 "punpcklbw %%xmm4,%%xmm4 \n" \
1936 "lea 0x8(%[y_buf]),%[y_buf] \n"
1937
1938 // Read 4 UV from 422 10 bit, upsample to 8 UV
1939 // TODO(fbarchard): Consider shufb to replace pack/unpack
1940 // TODO(fbarchard): Consider pmulhuw to replace psraw
1941 // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
1942 #define READYUV210 \
1943 "movq (%[u_buf]),%%xmm0 \n" \
1944 "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
1945 "lea 0x8(%[u_buf]),%[u_buf] \n" \
1946 "punpcklwd %%xmm1,%%xmm0 \n" \
1947 "psraw $0x2,%%xmm0 \n" \
1948 "packuswb %%xmm0,%%xmm0 \n" \
1949 "punpcklwd %%xmm0,%%xmm0 \n" \
1950 "movdqu (%[y_buf]),%%xmm4 \n" \
1951 "psllw $0x6,%%xmm4 \n" \
1952 "lea 0x10(%[y_buf]),%[y_buf] \n"
1953
1954 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
1955 #define READYUVA422 \
1956 "movd (%[u_buf]),%%xmm0 \n" \
1957 "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
1958 "lea 0x4(%[u_buf]),%[u_buf] \n" \
1959 "punpcklbw %%xmm1,%%xmm0 \n" \
1960 "punpcklwd %%xmm0,%%xmm0 \n" \
1961 "movq (%[y_buf]),%%xmm4 \n" \
1962 "punpcklbw %%xmm4,%%xmm4 \n" \
1963 "lea 0x8(%[y_buf]),%[y_buf] \n" \
1964 "movq (%[a_buf]),%%xmm5 \n" \
1965 "lea 0x8(%[a_buf]),%[a_buf] \n"
1966
1967 // Read 4 UV from NV12, upsample to 8 UV
1968 #define READNV12 \
1969 "movq (%[uv_buf]),%%xmm0 \n" \
1970 "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
1971 "punpcklwd %%xmm0,%%xmm0 \n" \
1972 "movq (%[y_buf]),%%xmm4 \n" \
1973 "punpcklbw %%xmm4,%%xmm4 \n" \
1974 "lea 0x8(%[y_buf]),%[y_buf] \n"
1975
1976 // Read 4 VU from NV21, upsample to 8 UV
1977 #define READNV21 \
1978 "movq (%[vu_buf]),%%xmm0 \n" \
1979 "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
1980 "pshufb %[kShuffleNV21], %%xmm0 \n" \
1981 "movq (%[y_buf]),%%xmm4 \n" \
1982 "punpcklbw %%xmm4,%%xmm4 \n" \
1983 "lea 0x8(%[y_buf]),%[y_buf] \n"
1984
1985 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
1986 #define READYUY2 \
1987 "movdqu (%[yuy2_buf]),%%xmm4 \n" \
1988 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
1989 "movdqu (%[yuy2_buf]),%%xmm0 \n" \
1990 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
1991 "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
1992
1993 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
1994 #define READUYVY \
1995 "movdqu (%[uyvy_buf]),%%xmm4 \n" \
1996 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
1997 "movdqu (%[uyvy_buf]),%%xmm0 \n" \
1998 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
1999 "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
2000
2001 #if defined(__x86_64__)
2002 #define YUVTORGB_SETUP(yuvconstants) \
2003 "movdqa (%[yuvconstants]),%%xmm8 \n" \
2004 "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
2005 "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
2006 "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
2007 "movdqa 128(%[yuvconstants]),%%xmm12 \n" \
2008 "movdqa 160(%[yuvconstants]),%%xmm13 \n" \
2009 "movdqa 192(%[yuvconstants]),%%xmm14 \n"
2010 // Convert 8 pixels: 8 UV and 8 Y
2011 #define YUVTORGB16(yuvconstants) \
2012 "movdqa %%xmm0,%%xmm1 \n" \
2013 "movdqa %%xmm0,%%xmm2 \n" \
2014 "movdqa %%xmm0,%%xmm3 \n" \
2015 "movdqa %%xmm11,%%xmm0 \n" \
2016 "pmaddubsw %%xmm8,%%xmm1 \n" \
2017 "psubw %%xmm1,%%xmm0 \n" \
2018 "movdqa %%xmm12,%%xmm1 \n" \
2019 "pmaddubsw %%xmm9,%%xmm2 \n" \
2020 "psubw %%xmm2,%%xmm1 \n" \
2021 "movdqa %%xmm13,%%xmm2 \n" \
2022 "pmaddubsw %%xmm10,%%xmm3 \n" \
2023 "psubw %%xmm3,%%xmm2 \n" \
2024 "pmulhuw %%xmm14,%%xmm4 \n" \
2025 "paddsw %%xmm4,%%xmm0 \n" \
2026 "paddsw %%xmm4,%%xmm1 \n" \
2027 "paddsw %%xmm4,%%xmm2 \n"
2028 #define YUVTORGB_REGS \
2029 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
2030
2031 #else
2032 #define YUVTORGB_SETUP(yuvconstants)
2033 // Convert 8 pixels: 8 UV and 8 Y
2034 #define YUVTORGB16(yuvconstants) \
2035 "movdqa %%xmm0,%%xmm1 \n" \
2036 "movdqa %%xmm0,%%xmm2 \n" \
2037 "movdqa %%xmm0,%%xmm3 \n" \
2038 "movdqa 96(%[yuvconstants]),%%xmm0 \n" \
2039 "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \
2040 "psubw %%xmm1,%%xmm0 \n" \
2041 "movdqa 128(%[yuvconstants]),%%xmm1 \n" \
2042 "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \
2043 "psubw %%xmm2,%%xmm1 \n" \
2044 "movdqa 160(%[yuvconstants]),%%xmm2 \n" \
2045 "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \
2046 "psubw %%xmm3,%%xmm2 \n" \
2047 "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
2048 "paddsw %%xmm4,%%xmm0 \n" \
2049 "paddsw %%xmm4,%%xmm1 \n" \
2050 "paddsw %%xmm4,%%xmm2 \n"
2051 #define YUVTORGB_REGS
2052 #endif
2053
2054 #define YUVTORGB(yuvconstants) \
2055 YUVTORGB16(yuvconstants) \
2056 "psraw $0x6,%%xmm0 \n" \
2057 "psraw $0x6,%%xmm1 \n" \
2058 "psraw $0x6,%%xmm2 \n" \
2059 "packuswb %%xmm0,%%xmm0 \n" \
2060 "packuswb %%xmm1,%%xmm1 \n" \
2061 "packuswb %%xmm2,%%xmm2 \n"
2062
2063 // Store 8 ARGB values.
2064 #define STOREARGB \
2065 "punpcklbw %%xmm1,%%xmm0 \n" \
2066 "punpcklbw %%xmm5,%%xmm2 \n" \
2067 "movdqa %%xmm0,%%xmm1 \n" \
2068 "punpcklwd %%xmm2,%%xmm0 \n" \
2069 "punpckhwd %%xmm2,%%xmm1 \n" \
2070 "movdqu %%xmm0,(%[dst_argb]) \n" \
2071 "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
2072 "lea 0x20(%[dst_argb]), %[dst_argb] \n"
2073
2074 // Store 8 RGBA values.
2075 #define STORERGBA \
2076 "pcmpeqb %%xmm5,%%xmm5 \n" \
2077 "punpcklbw %%xmm2,%%xmm1 \n" \
2078 "punpcklbw %%xmm0,%%xmm5 \n" \
2079 "movdqa %%xmm5,%%xmm0 \n" \
2080 "punpcklwd %%xmm1,%%xmm5 \n" \
2081 "punpckhwd %%xmm1,%%xmm0 \n" \
2082 "movdqu %%xmm5,(%[dst_rgba]) \n" \
2083 "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
2084 "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
2085
2086 // Store 8 AR30 values.
2087 #define STOREAR30 \
2088 "psraw $0x4,%%xmm0 \n" \
2089 "psraw $0x4,%%xmm1 \n" \
2090 "psraw $0x4,%%xmm2 \n" \
2091 "pminsw %%xmm7,%%xmm0 \n" \
2092 "pminsw %%xmm7,%%xmm1 \n" \
2093 "pminsw %%xmm7,%%xmm2 \n" \
2094 "pmaxsw %%xmm6,%%xmm0 \n" \
2095 "pmaxsw %%xmm6,%%xmm1 \n" \
2096 "pmaxsw %%xmm6,%%xmm2 \n" \
2097 "psllw $0x4,%%xmm2 \n" \
2098 "movdqa %%xmm0,%%xmm3 \n" \
2099 "punpcklwd %%xmm2,%%xmm0 \n" \
2100 "punpckhwd %%xmm2,%%xmm3 \n" \
2101 "movdqa %%xmm1,%%xmm2 \n" \
2102 "punpcklwd %%xmm5,%%xmm1 \n" \
2103 "punpckhwd %%xmm5,%%xmm2 \n" \
2104 "pslld $0xa,%%xmm1 \n" \
2105 "pslld $0xa,%%xmm2 \n" \
2106 "por %%xmm1,%%xmm0 \n" \
2107 "por %%xmm2,%%xmm3 \n" \
2108 "movdqu %%xmm0,(%[dst_ar30]) \n" \
2109 "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
2110 "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
2111
2112 void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
2113 const uint8_t* u_buf,
2114 const uint8_t* v_buf,
2115 uint8_t* dst_argb,
2116 const struct YuvConstants* yuvconstants,
2117 int width) {
2118 asm volatile (
2119 YUVTORGB_SETUP(yuvconstants)
2120 "sub %[u_buf],%[v_buf] \n"
2121 "pcmpeqb %%xmm5,%%xmm5 \n"
2122
2123 LABELALIGN
2124 "1: \n"
2125 READYUV444
2126 YUVTORGB(yuvconstants)
2127 STOREARGB
2128 "sub $0x8,%[width] \n"
2129 "jg 1b \n"
2130 : [y_buf]"+r"(y_buf), // %[y_buf]
2131 [u_buf]"+r"(u_buf), // %[u_buf]
2132 [v_buf]"+r"(v_buf), // %[v_buf]
2133 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2134 [width]"+rm"(width) // %[width]
2135 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2136 : "memory", "cc", YUVTORGB_REGS
2137 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2138 );
2139 }
2140
2141 void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
2142 const uint8_t* u_buf,
2143 const uint8_t* v_buf,
2144 uint8_t* dst_rgb24,
2145 const struct YuvConstants* yuvconstants,
2146 int width) {
2147 asm volatile (
2148 YUVTORGB_SETUP(yuvconstants)
2149 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
2150 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
2151 "sub %[u_buf],%[v_buf] \n"
2152
2153 LABELALIGN
2154 "1: \n"
2155 READYUV422
2156 YUVTORGB(yuvconstants)
2157 "punpcklbw %%xmm1,%%xmm0 \n"
2158 "punpcklbw %%xmm2,%%xmm2 \n"
2159 "movdqa %%xmm0,%%xmm1 \n"
2160 "punpcklwd %%xmm2,%%xmm0 \n"
2161 "punpckhwd %%xmm2,%%xmm1 \n"
2162 "pshufb %%xmm5,%%xmm0 \n"
2163 "pshufb %%xmm6,%%xmm1 \n"
2164 "palignr $0xc,%%xmm0,%%xmm1 \n"
2165 "movq %%xmm0,(%[dst_rgb24]) \n"
2166 "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
2167 "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
2168 "subl $0x8,%[width] \n"
2169 "jg 1b \n"
2170 : [y_buf]"+r"(y_buf), // %[y_buf]
2171 [u_buf]"+r"(u_buf), // %[u_buf]
2172 [v_buf]"+r"(v_buf), // %[v_buf]
2173 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
2174 #if defined(__i386__)
2175 [width]"+m"(width) // %[width]
2176 #else
2177 [width]"+rm"(width) // %[width]
2178 #endif
2179 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2180 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
2181 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
2182 : "memory", "cc", YUVTORGB_REGS
2183 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2184 );
2185 }
2186
2187 void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
2188 const uint8_t* u_buf,
2189 const uint8_t* v_buf,
2190 uint8_t* dst_argb,
2191 const struct YuvConstants* yuvconstants,
2192 int width) {
2193 asm volatile (
2194 YUVTORGB_SETUP(yuvconstants)
2195 "sub %[u_buf],%[v_buf] \n"
2196 "pcmpeqb %%xmm5,%%xmm5 \n"
2197
2198 LABELALIGN
2199 "1: \n"
2200 READYUV422
2201 YUVTORGB(yuvconstants)
2202 STOREARGB
2203 "sub $0x8,%[width] \n"
2204 "jg 1b \n"
2205 : [y_buf]"+r"(y_buf), // %[y_buf]
2206 [u_buf]"+r"(u_buf), // %[u_buf]
2207 [v_buf]"+r"(v_buf), // %[v_buf]
2208 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2209 [width]"+rm"(width) // %[width]
2210 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2211 : "memory", "cc", YUVTORGB_REGS
2212 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2213 );
2214 }
2215
2216 void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
2217 const uint8_t* u_buf,
2218 const uint8_t* v_buf,
2219 uint8_t* dst_ar30,
2220 const struct YuvConstants* yuvconstants,
2221 int width) {
2222 asm volatile (
2223 YUVTORGB_SETUP(yuvconstants)
2224 "sub %[u_buf],%[v_buf] \n"
2225 "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
2226 "psrlw $14,%%xmm5 \n"
2227 "psllw $4,%%xmm5 \n" // 2 alpha bits
2228 "pxor %%xmm6,%%xmm6 \n"
2229 "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
2230 "psrlw $6,%%xmm7 \n" // 1023 for max
2231
2232 LABELALIGN
2233 "1: \n"
2234 READYUV422
2235 YUVTORGB16(yuvconstants)
2236 STOREAR30
2237 "sub $0x8,%[width] \n"
2238 "jg 1b \n"
2239 : [y_buf]"+r"(y_buf), // %[y_buf]
2240 [u_buf]"+r"(u_buf), // %[u_buf]
2241 [v_buf]"+r"(v_buf), // %[v_buf]
2242 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
2243 [width]"+rm"(width) // %[width]
2244 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2245 : "memory", "cc", YUVTORGB_REGS
2246 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2247 );
2248 }
2249
2250 // 10 bit YUV to ARGB
2251 void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
2252 const uint16_t* u_buf,
2253 const uint16_t* v_buf,
2254 uint8_t* dst_argb,
2255 const struct YuvConstants* yuvconstants,
2256 int width) {
2257 asm volatile (
2258 YUVTORGB_SETUP(yuvconstants)
2259 "sub %[u_buf],%[v_buf] \n"
2260 "pcmpeqb %%xmm5,%%xmm5 \n"
2261
2262 LABELALIGN
2263 "1: \n"
2264 READYUV210
2265 YUVTORGB(yuvconstants)
2266 STOREARGB
2267 "sub $0x8,%[width] \n"
2268 "jg 1b \n"
2269 : [y_buf]"+r"(y_buf), // %[y_buf]
2270 [u_buf]"+r"(u_buf), // %[u_buf]
2271 [v_buf]"+r"(v_buf), // %[v_buf]
2272 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2273 [width]"+rm"(width) // %[width]
2274 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2275 : "memory", "cc", YUVTORGB_REGS
2276 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2277 );
2278 }
2279
2280 // 10 bit YUV to AR30
2281 void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
2282 const uint16_t* u_buf,
2283 const uint16_t* v_buf,
2284 uint8_t* dst_ar30,
2285 const struct YuvConstants* yuvconstants,
2286 int width) {
2287 asm volatile (
2288 YUVTORGB_SETUP(yuvconstants)
2289 "sub %[u_buf],%[v_buf] \n"
2290 "pcmpeqb %%xmm5,%%xmm5 \n"
2291 "psrlw $14,%%xmm5 \n"
2292 "psllw $4,%%xmm5 \n" // 2 alpha bits
2293 "pxor %%xmm6,%%xmm6 \n"
2294 "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
2295 "psrlw $6,%%xmm7 \n" // 1023 for max
2296
2297 LABELALIGN
2298 "1: \n"
2299 READYUV210
2300 YUVTORGB16(yuvconstants)
2301 STOREAR30
2302 "sub $0x8,%[width] \n"
2303 "jg 1b \n"
2304 : [y_buf]"+r"(y_buf), // %[y_buf]
2305 [u_buf]"+r"(u_buf), // %[u_buf]
2306 [v_buf]"+r"(v_buf), // %[v_buf]
2307 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
2308 [width]"+rm"(width) // %[width]
2309 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2310 : "memory", "cc", YUVTORGB_REGS
2311 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2312 );
2313 }
2314
2315 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
2316 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
2317 const uint8_t* u_buf,
2318 const uint8_t* v_buf,
2319 const uint8_t* a_buf,
2320 uint8_t* dst_argb,
2321 const struct YuvConstants* yuvconstants,
2322 int width) {
2323 // clang-format off
2324 asm volatile (
2325 YUVTORGB_SETUP(yuvconstants)
2326 "sub %[u_buf],%[v_buf] \n"
2327
2328 LABELALIGN
2329 "1: \n"
2330 READYUVA422
2331 YUVTORGB(yuvconstants)
2332 STOREARGB
2333 "subl $0x8,%[width] \n"
2334 "jg 1b \n"
2335 : [y_buf]"+r"(y_buf), // %[y_buf]
2336 [u_buf]"+r"(u_buf), // %[u_buf]
2337 [v_buf]"+r"(v_buf), // %[v_buf]
2338 [a_buf]"+r"(a_buf), // %[a_buf]
2339 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2340 #if defined(__i386__)
2341 [width]"+m"(width) // %[width]
2342 #else
2343 [width]"+rm"(width) // %[width]
2344 #endif
2345 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2346 : "memory", "cc", YUVTORGB_REGS
2347 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2348 );
2349 // clang-format on
2350 }
2351 #endif // HAS_I422ALPHATOARGBROW_SSSE3
2352
2353 void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
2354 const uint8_t* uv_buf,
2355 uint8_t* dst_argb,
2356 const struct YuvConstants* yuvconstants,
2357 int width) {
2358 // clang-format off
2359 asm volatile (
2360 YUVTORGB_SETUP(yuvconstants)
2361 "pcmpeqb %%xmm5,%%xmm5 \n"
2362
2363 LABELALIGN
2364 "1: \n"
2365 READNV12
2366 YUVTORGB(yuvconstants)
2367 STOREARGB
2368 "sub $0x8,%[width] \n"
2369 "jg 1b \n"
2370 : [y_buf]"+r"(y_buf), // %[y_buf]
2371 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2372 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2373 [width]"+rm"(width) // %[width]
2374 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2375 : "memory", "cc", YUVTORGB_REGS
2376 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2377 );
2378 // clang-format on
2379 }
2380
2381 void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
2382 const uint8_t* vu_buf,
2383 uint8_t* dst_argb,
2384 const struct YuvConstants* yuvconstants,
2385 int width) {
2386 // clang-format off
2387 asm volatile (
2388 YUVTORGB_SETUP(yuvconstants)
2389 "pcmpeqb %%xmm5,%%xmm5 \n"
2390
2391 LABELALIGN
2392 "1: \n"
2393 READNV21
2394 YUVTORGB(yuvconstants)
2395 STOREARGB
2396 "sub $0x8,%[width] \n"
2397 "jg 1b \n"
2398 : [y_buf]"+r"(y_buf), // %[y_buf]
2399 [vu_buf]"+r"(vu_buf), // %[vu_buf]
2400 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2401 [width]"+rm"(width) // %[width]
2402 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2403 [kShuffleNV21]"m"(kShuffleNV21)
2404 : "memory", "cc", YUVTORGB_REGS
2405 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2406 );
2407 // clang-format on
2408 }
2409
2410 void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
2411 uint8_t* dst_argb,
2412 const struct YuvConstants* yuvconstants,
2413 int width) {
2414 // clang-format off
2415 asm volatile (
2416 YUVTORGB_SETUP(yuvconstants)
2417 "pcmpeqb %%xmm5,%%xmm5 \n"
2418
2419 LABELALIGN
2420 "1: \n"
2421 READYUY2
2422 YUVTORGB(yuvconstants)
2423 STOREARGB
2424 "sub $0x8,%[width] \n"
2425 "jg 1b \n"
2426 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
2427 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2428 [width]"+rm"(width) // %[width]
2429 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2430 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2431 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2432 : "memory", "cc", YUVTORGB_REGS
2433 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2434 );
2435 // clang-format on
2436 }
2437
2438 void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
2439 uint8_t* dst_argb,
2440 const struct YuvConstants* yuvconstants,
2441 int width) {
2442 // clang-format off
2443 asm volatile (
2444 YUVTORGB_SETUP(yuvconstants)
2445 "pcmpeqb %%xmm5,%%xmm5 \n"
2446
2447 LABELALIGN
2448 "1: \n"
2449 READUYVY
2450 YUVTORGB(yuvconstants)
2451 STOREARGB
2452 "sub $0x8,%[width] \n"
2453 "jg 1b \n"
2454 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
2455 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2456 [width]"+rm"(width) // %[width]
2457 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2458 [kShuffleUYVYY]"m"(kShuffleUYVYY),
2459 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
2460 : "memory", "cc", YUVTORGB_REGS
2461 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2462 );
2463 // clang-format on
2464 }
2465
2466 void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
2467 const uint8_t* u_buf,
2468 const uint8_t* v_buf,
2469 uint8_t* dst_rgba,
2470 const struct YuvConstants* yuvconstants,
2471 int width) {
2472 asm volatile (
2473 YUVTORGB_SETUP(yuvconstants)
2474 "sub %[u_buf],%[v_buf] \n"
2475 "pcmpeqb %%xmm5,%%xmm5 \n"
2476
2477 LABELALIGN
2478 "1: \n"
2479 READYUV422
2480 YUVTORGB(yuvconstants)
2481 STORERGBA
2482 "sub $0x8,%[width] \n"
2483 "jg 1b \n"
2484 : [y_buf]"+r"(y_buf), // %[y_buf]
2485 [u_buf]"+r"(u_buf), // %[u_buf]
2486 [v_buf]"+r"(v_buf), // %[v_buf]
2487 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
2488 [width]"+rm"(width) // %[width]
2489 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2490 : "memory", "cc", YUVTORGB_REGS
2491 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2492 );
2493 }
2494
2495 #endif // HAS_I422TOARGBROW_SSSE3
2496
2497 // Read 16 UV from 444
2498 #define READYUV444_AVX2 \
2499 "vmovdqu (%[u_buf]),%%xmm0 \n" \
2500 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2501 "lea 0x10(%[u_buf]),%[u_buf] \n" \
2502 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2503 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
2504 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
2505 "vmovdqu (%[y_buf]),%%xmm4 \n" \
2506 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2507 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2508 "lea 0x10(%[y_buf]),%[y_buf] \n"
2509
2510 // Read 8 UV from 422, upsample to 16 UV.
2511 #define READYUV422_AVX2 \
2512 "vmovq (%[u_buf]),%%xmm0 \n" \
2513 "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2514 "lea 0x8(%[u_buf]),%[u_buf] \n" \
2515 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
2516 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2517 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
2518 "vmovdqu (%[y_buf]),%%xmm4 \n" \
2519 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2520 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2521 "lea 0x10(%[y_buf]),%[y_buf] \n"
2522
2523 // Read 8 UV from 210 10 bit, upsample to 16 UV
2524 // TODO(fbarchard): Consider vshufb to replace pack/unpack
2525 // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
2526 #define READYUV210_AVX2 \
2527 "vmovdqu (%[u_buf]),%%xmm0 \n" \
2528 "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2529 "lea 0x10(%[u_buf]),%[u_buf] \n" \
2530 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2531 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
2532 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \
2533 "vpsraw $0x2,%%ymm0,%%ymm0 \n" \
2534 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
2535 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
2536 "vmovdqu (%[y_buf]),%%ymm4 \n" \
2537 "vpsllw $0x6,%%ymm4,%%ymm4 \n" \
2538 "lea 0x20(%[y_buf]),%[y_buf] \n"
2539
2540 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
2541 #define READYUVA422_AVX2 \
2542 "vmovq (%[u_buf]),%%xmm0 \n" \
2543 "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
2544 "lea 0x8(%[u_buf]),%[u_buf] \n" \
2545 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
2546 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2547 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
2548 "vmovdqu (%[y_buf]),%%xmm4 \n" \
2549 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2550 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2551 "lea 0x10(%[y_buf]),%[y_buf] \n" \
2552 "vmovdqu (%[a_buf]),%%xmm5 \n" \
2553 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
2554 "lea 0x10(%[a_buf]),%[a_buf] \n"
2555
2556 // Read 8 UV from NV12, upsample to 16 UV.
2557 #define READNV12_AVX2 \
2558 "vmovdqu (%[uv_buf]),%%xmm0 \n" \
2559 "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
2560 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2561 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
2562 "vmovdqu (%[y_buf]),%%xmm4 \n" \
2563 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2564 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2565 "lea 0x10(%[y_buf]),%[y_buf] \n"
2566
2567 // Read 8 VU from NV21, upsample to 16 UV.
2568 #define READNV21_AVX2 \
2569 "vmovdqu (%[vu_buf]),%%xmm0 \n" \
2570 "lea 0x10(%[vu_buf]),%[vu_buf] \n" \
2571 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2572 "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
2573 "vmovdqu (%[y_buf]),%%xmm4 \n" \
2574 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2575 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2576 "lea 0x10(%[y_buf]),%[y_buf] \n"
2577
2578 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
2579 #define READYUY2_AVX2 \
2580 "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
2581 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
2582 "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \
2583 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
2584 "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
2585
2586 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2587 #define READUYVY_AVX2 \
2588 "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
2589 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
2590 "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \
2591 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
2592 "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
2593
2594 #if defined(__x86_64__)
2595 #define YUVTORGB_SETUP_AVX2(yuvconstants) \
2596 "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
2597 "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
2598 "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
2599 "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
2600 "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
2601 "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
2602 "vmovdqa 192(%[yuvconstants]),%%ymm14 \n"
2603
2604 #define YUVTORGB16_AVX2(yuvconstants) \
2605 "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
2606 "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
2607 "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
2608 "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
2609 "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
2610 "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
2611 "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
2612 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
2613 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
2614 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
2615
2616 #define YUVTORGB_REGS_AVX2 \
2617 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
2618
2619 #else // Convert 16 pixels: 16 UV and 16 Y.
2620
2621 #define YUVTORGB_SETUP_AVX2(yuvconstants)
2622 #define YUVTORGB16_AVX2(yuvconstants) \
2623 "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \
2624 "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \
2625 "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \
2626 "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \
2627 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
2628 "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \
2629 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
2630 "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \
2631 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
2632 "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
2633 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
2634 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
2635 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
2636 #define YUVTORGB_REGS_AVX2
2637 #endif
2638
2639 #define YUVTORGB_AVX2(yuvconstants) \
2640 YUVTORGB16_AVX2(yuvconstants) \
2641 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
2642 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
2643 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
2644 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
2645 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
2646 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
2647
2648 // Store 16 ARGB values.
2649 #define STOREARGB_AVX2 \
2650 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
2651 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2652 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
2653 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
2654 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
2655 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
2656 "vmovdqu %%ymm1,(%[dst_argb]) \n" \
2657 "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
2658 "lea 0x40(%[dst_argb]), %[dst_argb] \n"
2659
2660 // Store 16 AR30 values.
2661 #define STOREAR30_AVX2 \
2662 "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
2663 "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
2664 "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
2665 "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
2666 "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
2667 "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
2668 "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
2669 "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
2670 "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
2671 "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
2672 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2673 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
2674 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
2675 "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
2676 "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
2677 "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
2678 "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
2679 "vpslld $0xa,%%ymm1,%%ymm1 \n" \
2680 "vpslld $0xa,%%ymm2,%%ymm2 \n" \
2681 "vpor %%ymm1,%%ymm0,%%ymm0 \n" \
2682 "vpor %%ymm2,%%ymm3,%%ymm3 \n" \
2683 "vmovdqu %%ymm0,(%[dst_ar30]) \n" \
2684 "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
2685 "lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
2686
2687 #ifdef HAS_I444TOARGBROW_AVX2
2688 // 16 pixels
2689 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2690 void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
2691 const uint8_t* u_buf,
2692 const uint8_t* v_buf,
2693 uint8_t* dst_argb,
2694 const struct YuvConstants* yuvconstants,
2695 int width) {
2696 asm volatile (
2697 YUVTORGB_SETUP_AVX2(yuvconstants)
2698 "sub %[u_buf],%[v_buf] \n"
2699 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2700
2701 LABELALIGN
2702 "1: \n"
2703 READYUV444_AVX2
2704 YUVTORGB_AVX2(yuvconstants)
2705 STOREARGB_AVX2
2706 "sub $0x10,%[width] \n"
2707 "jg 1b \n"
2708 "vzeroupper \n"
2709 : [y_buf]"+r"(y_buf), // %[y_buf]
2710 [u_buf]"+r"(u_buf), // %[u_buf]
2711 [v_buf]"+r"(v_buf), // %[v_buf]
2712 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2713 [width]"+rm"(width) // %[width]
2714 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2715 : "memory", "cc", YUVTORGB_REGS_AVX2
2716 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2717 );
2718 }
2719 #endif // HAS_I444TOARGBROW_AVX2
2720
2721 #if defined(HAS_I422TOARGBROW_AVX2)
2722 // 16 pixels
2723 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2724 void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
2725 const uint8_t* u_buf,
2726 const uint8_t* v_buf,
2727 uint8_t* dst_argb,
2728 const struct YuvConstants* yuvconstants,
2729 int width) {
2730 asm volatile (
2731 YUVTORGB_SETUP_AVX2(yuvconstants)
2732 "sub %[u_buf],%[v_buf] \n"
2733 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2734
2735 LABELALIGN
2736 "1: \n"
2737 READYUV422_AVX2
2738 YUVTORGB_AVX2(yuvconstants)
2739 STOREARGB_AVX2
2740 "sub $0x10,%[width] \n"
2741 "jg 1b \n"
2742
2743 "vzeroupper \n"
2744 : [y_buf]"+r"(y_buf), // %[y_buf]
2745 [u_buf]"+r"(u_buf), // %[u_buf]
2746 [v_buf]"+r"(v_buf), // %[v_buf]
2747 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2748 [width]"+rm"(width) // %[width]
2749 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2750 : "memory", "cc", YUVTORGB_REGS_AVX2
2751 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2752 );
2753 }
2754 #endif // HAS_I422TOARGBROW_AVX2
2755
2756 #if defined(HAS_I422TOAR30ROW_AVX2)
2757 // 16 pixels
2758 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
2759 void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
2760 const uint8_t* u_buf,
2761 const uint8_t* v_buf,
2762 uint8_t* dst_ar30,
2763 const struct YuvConstants* yuvconstants,
2764 int width) {
2765 asm volatile (
2766 YUVTORGB_SETUP_AVX2(yuvconstants)
2767 "sub %[u_buf],%[v_buf] \n"
2768 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
2769 "vpsrlw $14,%%ymm5,%%ymm5 \n"
2770 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
2771 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
2772 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
2773 "vpsrlw $6,%%ymm7,%%ymm7 \n"
2774
2775 LABELALIGN
2776 "1: \n"
2777 READYUV422_AVX2
2778 YUVTORGB16_AVX2(yuvconstants)
2779 STOREAR30_AVX2
2780 "sub $0x10,%[width] \n"
2781 "jg 1b \n"
2782
2783 "vzeroupper \n"
2784 : [y_buf]"+r"(y_buf), // %[y_buf]
2785 [u_buf]"+r"(u_buf), // %[u_buf]
2786 [v_buf]"+r"(v_buf), // %[v_buf]
2787 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
2788 [width]"+rm"(width) // %[width]
2789 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2790 : "memory", "cc", YUVTORGB_REGS_AVX2
2791 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2792 );
2793 }
2794 #endif // HAS_I422TOAR30ROW_AVX2
2795
2796 #if defined(HAS_I210TOARGBROW_AVX2)
2797 // 16 pixels
2798 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2799 void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
2800 const uint16_t* u_buf,
2801 const uint16_t* v_buf,
2802 uint8_t* dst_argb,
2803 const struct YuvConstants* yuvconstants,
2804 int width) {
2805 asm volatile (
2806 YUVTORGB_SETUP_AVX2(yuvconstants)
2807 "sub %[u_buf],%[v_buf] \n"
2808 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2809
2810 LABELALIGN
2811 "1: \n"
2812 READYUV210_AVX2
2813 YUVTORGB_AVX2(yuvconstants)
2814 STOREARGB_AVX2
2815 "sub $0x10,%[width] \n"
2816 "jg 1b \n"
2817
2818 "vzeroupper \n"
2819 : [y_buf]"+r"(y_buf), // %[y_buf]
2820 [u_buf]"+r"(u_buf), // %[u_buf]
2821 [v_buf]"+r"(v_buf), // %[v_buf]
2822 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2823 [width]"+rm"(width) // %[width]
2824 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2825 : "memory", "cc", YUVTORGB_REGS_AVX2
2826 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2827 );
2828 }
2829 #endif // HAS_I210TOARGBROW_AVX2
2830
2831 #if defined(HAS_I210TOAR30ROW_AVX2)
2832 // 16 pixels
2833 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
2834 void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
2835 const uint16_t* u_buf,
2836 const uint16_t* v_buf,
2837 uint8_t* dst_ar30,
2838 const struct YuvConstants* yuvconstants,
2839 int width) {
2840 asm volatile (
2841 YUVTORGB_SETUP_AVX2(yuvconstants)
2842 "sub %[u_buf],%[v_buf] \n"
2843 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
2844 "vpsrlw $14,%%ymm5,%%ymm5 \n"
2845 "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
2846 "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
2847 "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
2848 "vpsrlw $6,%%ymm7,%%ymm7 \n"
2849
2850 LABELALIGN
2851 "1: \n"
2852 READYUV210_AVX2
2853 YUVTORGB16_AVX2(yuvconstants)
2854 STOREAR30_AVX2
2855 "sub $0x10,%[width] \n"
2856 "jg 1b \n"
2857
2858 "vzeroupper \n"
2859 : [y_buf]"+r"(y_buf), // %[y_buf]
2860 [u_buf]"+r"(u_buf), // %[u_buf]
2861 [v_buf]"+r"(v_buf), // %[v_buf]
2862 [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
2863 [width]"+rm"(width) // %[width]
2864 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2865 : "memory", "cc", YUVTORGB_REGS_AVX2
2866 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2867 );
2868 }
2869 #endif // HAS_I210TOAR30ROW_AVX2
2870
2871 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
2872 // 16 pixels
2873 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2874 void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
2875 const uint8_t* u_buf,
2876 const uint8_t* v_buf,
2877 const uint8_t* a_buf,
2878 uint8_t* dst_argb,
2879 const struct YuvConstants* yuvconstants,
2880 int width) {
2881 // clang-format off
2882 asm volatile (
2883 YUVTORGB_SETUP_AVX2(yuvconstants)
2884 "sub %[u_buf],%[v_buf] \n"
2885
2886 LABELALIGN
2887 "1: \n"
2888 READYUVA422_AVX2
2889 YUVTORGB_AVX2(yuvconstants)
2890 STOREARGB_AVX2
2891 "subl $0x10,%[width] \n"
2892 "jg 1b \n"
2893 "vzeroupper \n"
2894 : [y_buf]"+r"(y_buf), // %[y_buf]
2895 [u_buf]"+r"(u_buf), // %[u_buf]
2896 [v_buf]"+r"(v_buf), // %[v_buf]
2897 [a_buf]"+r"(a_buf), // %[a_buf]
2898 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2899 #if defined(__i386__)
2900 [width]"+m"(width) // %[width]
2901 #else
2902 [width]"+rm"(width) // %[width]
2903 #endif
2904 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2905 : "memory", "cc", YUVTORGB_REGS_AVX2
2906 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2907 );
2908 // clang-format on
2909 }
2910 #endif // HAS_I422ALPHATOARGBROW_AVX2
2911
2912 #if defined(HAS_I422TORGBAROW_AVX2)
2913 // 16 pixels
2914 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2915 void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
2916 const uint8_t* u_buf,
2917 const uint8_t* v_buf,
2918 uint8_t* dst_argb,
2919 const struct YuvConstants* yuvconstants,
2920 int width) {
2921 asm volatile (
2922 YUVTORGB_SETUP_AVX2(yuvconstants)
2923 "sub %[u_buf],%[v_buf] \n"
2924 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2925
2926 LABELALIGN
2927 "1: \n"
2928 READYUV422_AVX2
2929 YUVTORGB_AVX2(yuvconstants)
2930
2931 // Step 3: Weave into RGBA
2932 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
2933 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2934 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
2935 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2936 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
2937 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
2938 "vmovdqu %%ymm0,(%[dst_argb]) \n"
2939 "vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
2940 "lea 0x40(%[dst_argb]),%[dst_argb] \n"
2941 "sub $0x10,%[width] \n"
2942 "jg 1b \n"
2943 "vzeroupper \n"
2944 : [y_buf]"+r"(y_buf), // %[y_buf]
2945 [u_buf]"+r"(u_buf), // %[u_buf]
2946 [v_buf]"+r"(v_buf), // %[v_buf]
2947 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2948 [width]"+rm"(width) // %[width]
2949 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2950 : "memory", "cc", YUVTORGB_REGS_AVX2
2951 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2952 );
2953 }
2954 #endif // HAS_I422TORGBAROW_AVX2
2955
2956 #if defined(HAS_NV12TOARGBROW_AVX2)
2957 // 16 pixels.
2958 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2959 void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
2960 const uint8_t* uv_buf,
2961 uint8_t* dst_argb,
2962 const struct YuvConstants* yuvconstants,
2963 int width) {
2964 // clang-format off
2965 asm volatile (
2966 YUVTORGB_SETUP_AVX2(yuvconstants)
2967 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2968
2969 LABELALIGN
2970 "1: \n"
2971 READNV12_AVX2
2972 YUVTORGB_AVX2(yuvconstants)
2973 STOREARGB_AVX2
2974 "sub $0x10,%[width] \n"
2975 "jg 1b \n"
2976 "vzeroupper \n"
2977 : [y_buf]"+r"(y_buf), // %[y_buf]
2978 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2979 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2980 [width]"+rm"(width) // %[width]
2981 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2982 : "memory", "cc", YUVTORGB_REGS_AVX2
2983 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2984 );
2985 // clang-format on
2986 }
2987 #endif // HAS_NV12TOARGBROW_AVX2
2988
2989 #if defined(HAS_NV21TOARGBROW_AVX2)
2990 // 16 pixels.
2991 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2992 void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
2993 const uint8_t* vu_buf,
2994 uint8_t* dst_argb,
2995 const struct YuvConstants* yuvconstants,
2996 int width) {
2997 // clang-format off
2998 asm volatile (
2999 YUVTORGB_SETUP_AVX2(yuvconstants)
3000 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3001
3002 LABELALIGN
3003 "1: \n"
3004 READNV21_AVX2
3005 YUVTORGB_AVX2(yuvconstants)
3006 STOREARGB_AVX2
3007 "sub $0x10,%[width] \n"
3008 "jg 1b \n"
3009 "vzeroupper \n"
3010 : [y_buf]"+r"(y_buf), // %[y_buf]
3011 [vu_buf]"+r"(vu_buf), // %[vu_buf]
3012 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3013 [width]"+rm"(width) // %[width]
3014 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
3015 [kShuffleNV21]"m"(kShuffleNV21)
3016 : "memory", "cc", YUVTORGB_REGS_AVX2
3017 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3018 );
3019 // clang-format on
3020 }
3021 #endif // HAS_NV21TOARGBROW_AVX2
3022
3023 #if defined(HAS_YUY2TOARGBROW_AVX2)
3024 // 16 pixels.
3025 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
3026 void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
3027 uint8_t* dst_argb,
3028 const struct YuvConstants* yuvconstants,
3029 int width) {
3030 // clang-format off
3031 asm volatile (
3032 YUVTORGB_SETUP_AVX2(yuvconstants)
3033 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3034
3035 LABELALIGN
3036 "1: \n"
3037 READYUY2_AVX2
3038 YUVTORGB_AVX2(yuvconstants)
3039 STOREARGB_AVX2
3040 "sub $0x10,%[width] \n"
3041 "jg 1b \n"
3042 "vzeroupper \n"
3043 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
3044 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3045 [width]"+rm"(width) // %[width]
3046 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
3047 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
3048 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
3049 : "memory", "cc", YUVTORGB_REGS_AVX2
3050 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3051 );
3052 // clang-format on
3053 }
3054 #endif // HAS_YUY2TOARGBROW_AVX2
3055
3056 #if defined(HAS_UYVYTOARGBROW_AVX2)
3057 // 16 pixels.
3058 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
3059 void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
3060 uint8_t* dst_argb,
3061 const struct YuvConstants* yuvconstants,
3062 int width) {
3063 // clang-format off
3064 asm volatile (
3065 YUVTORGB_SETUP_AVX2(yuvconstants)
3066 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3067
3068 LABELALIGN
3069 "1: \n"
3070 READUYVY_AVX2
3071 YUVTORGB_AVX2(yuvconstants)
3072 STOREARGB_AVX2
3073 "sub $0x10,%[width] \n"
3074 "jg 1b \n"
3075 "vzeroupper \n"
3076 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
3077 [dst_argb]"+r"(dst_argb), // %[dst_argb]
3078 [width]"+rm"(width) // %[width]
3079 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
3080 [kShuffleUYVYY]"m"(kShuffleUYVYY),
3081 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
3082 : "memory", "cc", YUVTORGB_REGS_AVX2
3083 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3084 );
3085 // clang-format on
3086 }
3087 #endif // HAS_UYVYTOARGBROW_AVX2
3088
3089 #ifdef HAS_I400TOARGBROW_SSE2
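// I400ToARGBRow expands grey Y to opaque ARGB using the same fixed-point
// scaling as the YUV paths. A rough worked example with the BT.601 constants
// noted below (yg = 18997, ygb = -1160):
//   y = 16:  (16 * 0x101 * 18997) >> 16 = 1191;  (1191 - 1160) >> 6 = 0
//   y = 235: (235 * 0x101 * 18997) >> 16 = 17506; (17506 - 1160) >> 6 = 255
// so the 16..235 studio range maps to full-range 0..255 grey.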
3090 void I400ToARGBRow_SSE2(const uint8_t* y_buf,
3091 uint8_t* dst_argb,
3092 const struct YuvConstants* yuvconstants,
3093 int width) {
3094 asm volatile(
3095 "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164
3096 "movdqa 224(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16
3097 "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
3098 "pslld $0x18,%%xmm4 \n"
3099
3100 LABELALIGN
3101 "1: \n"
3102 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
3103 "movq (%0),%%xmm0 \n"
3104 "lea 0x8(%0),%0 \n"
3105 "punpcklbw %%xmm0,%%xmm0 \n"
3106 "pmulhuw %%xmm2,%%xmm0 \n"
3107 "paddsw %%xmm3,%%xmm0 \n"
3108 "psraw $6, %%xmm0 \n"
3109 "packuswb %%xmm0,%%xmm0 \n"
3110
3111 // Step 2: Weave into ARGB
3112 "punpcklbw %%xmm0,%%xmm0 \n"
3113 "movdqa %%xmm0,%%xmm1 \n"
3114 "punpcklwd %%xmm0,%%xmm0 \n"
3115 "punpckhwd %%xmm1,%%xmm1 \n"
3116 "por %%xmm4,%%xmm0 \n"
3117 "por %%xmm4,%%xmm1 \n"
3118 "movdqu %%xmm0,(%1) \n"
3119 "movdqu %%xmm1,0x10(%1) \n"
3120 "lea 0x20(%1),%1 \n"
3121
3122 "sub $0x8,%2 \n"
3123 "jg 1b \n"
3124 : "+r"(y_buf), // %0
3125 "+r"(dst_argb), // %1
3126 "+rm"(width) // %2
3127 : "r"(yuvconstants) // %3
3128 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
3129 }
3130 #endif // HAS_I400TOARGBROW_SSE2
3131
3132 #ifdef HAS_I400TOARGBROW_AVX2
3133 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
3134 // note: vpunpcklbw mutates and vpackuswb unmutates.
3135 void I400ToARGBRow_AVX2(const uint8_t* y_buf,
3136 uint8_t* dst_argb,
3137 const struct YuvConstants* yuvconstants,
3138 int width) {
3139 asm volatile(
3140 "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164
3141 "vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
3142 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
3143 "vpslld $0x18,%%ymm4,%%ymm4 \n"
3144
3145 LABELALIGN
3146 "1: \n"
3147 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
3148 "vmovdqu (%0),%%xmm0 \n"
3149 "lea 0x10(%0),%0 \n"
3150 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3151 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
3152 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3153 "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n"
3154 "vpsraw $0x6,%%ymm0,%%ymm0 \n"
3155 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3156 "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
3157 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3158 "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
3159 "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
3160 "vpor %%ymm4,%%ymm0,%%ymm0 \n"
3161 "vpor %%ymm4,%%ymm1,%%ymm1 \n"
3162 "vmovdqu %%ymm0,(%1) \n"
3163 "vmovdqu %%ymm1,0x20(%1) \n"
3164 "lea 0x40(%1),%1 \n"
3165 "sub $0x10,%2 \n"
3166 "jg 1b \n"
3167 "vzeroupper \n"
3168 : "+r"(y_buf), // %0
3169 "+r"(dst_argb), // %1
3170 "+rm"(width) // %2
3171 : "r"(yuvconstants) // %3
3172 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
3173 }
3174 #endif // HAS_I400TOARGBROW_AVX2
3175
3176 #ifdef HAS_MIRRORROW_SSSE3
3177 // Shuffle table for reversing the bytes.
3178 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
3179 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3180
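// MirrorRow_SSSE3 reads 16 bytes at src + width - 16 (the width register is
// counted down; the src pointer itself is not advanced), reverses them with
// pshufb and the descending-index table above, and writes forward into dst.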
3181 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
3182 intptr_t temp_width = (intptr_t)(width);
3183 asm volatile(
3184
3185 "movdqa %3,%%xmm5 \n"
3186
3187 LABELALIGN
3188 "1: \n"
3189 "movdqu -0x10(%0,%2,1),%%xmm0 \n"
3190 "pshufb %%xmm5,%%xmm0 \n"
3191 "movdqu %%xmm0,(%1) \n"
3192 "lea 0x10(%1),%1 \n"
3193 "sub $0x10,%2 \n"
3194 "jg 1b \n"
3195 : "+r"(src), // %0
3196 "+r"(dst), // %1
3197 "+r"(temp_width) // %2
3198 : "m"(kShuffleMirror) // %3
3199 : "memory", "cc", "xmm0", "xmm5");
3200 }
3201 #endif // HAS_MIRRORROW_SSSE3
3202
3203 #ifdef HAS_MIRRORROW_AVX2
3204 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
3205 intptr_t temp_width = (intptr_t)(width);
3206 asm volatile(
3207
3208 "vbroadcastf128 %3,%%ymm5 \n"
3209
3210 LABELALIGN
3211 "1: \n"
3212 "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
3213 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
3214 "vpermq $0x4e,%%ymm0,%%ymm0 \n"
3215 "vmovdqu %%ymm0,(%1) \n"
3216 "lea 0x20(%1),%1 \n"
3217 "sub $0x20,%2 \n"
3218 "jg 1b \n"
3219 "vzeroupper \n"
3220 : "+r"(src), // %0
3221 "+r"(dst), // %1
3222 "+r"(temp_width) // %2
3223 : "m"(kShuffleMirror) // %3
3224 : "memory", "cc", "xmm0", "xmm5");
3225 }
3226 #endif // HAS_MIRRORROW_AVX2
3227
3228 #ifdef HAS_MIRRORUVROW_SSSE3
3229 // Shuffle table for reversing the UV.
3230 static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
3231 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
3232
3233 void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
3234 intptr_t temp_width = (intptr_t)(width);
3235 asm volatile(
3236
3237 "movdqa %3,%%xmm5 \n"
3238
3239 LABELALIGN
3240 "1: \n"
3241 "movdqu -0x10(%0,%2,2),%%xmm0 \n"
3242 "pshufb %%xmm5,%%xmm0 \n"
3243 "movdqu %%xmm0,(%1) \n"
3244 "lea 0x10(%1),%1 \n"
3245 "sub $0x8,%2 \n"
3246 "jg 1b \n"
3247 : "+r"(src_uv), // %0
3248 "+r"(dst_uv), // %1
3249 "+r"(temp_width) // %2
3250 : "m"(kShuffleMirrorUV) // %3
3251 : "memory", "cc", "xmm0", "xmm5");
3252 }
3253 #endif // HAS_MIRRORUVROW_SSSE3
3254
3255 #ifdef HAS_MIRRORUVROW_AVX2
3256 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
3257 intptr_t temp_width = (intptr_t)(width);
3258 asm volatile(
3259
3260 "vbroadcastf128 %3,%%ymm5 \n"
3261
3262 LABELALIGN
3263 "1: \n"
3264 "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
3265 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
3266 "vpermq $0x4e,%%ymm0,%%ymm0 \n"
3267 "vmovdqu %%ymm0,(%1) \n"
3268 "lea 0x20(%1),%1 \n"
3269 "sub $0x10,%2 \n"
3270 "jg 1b \n"
3271 "vzeroupper \n"
3272 : "+r"(src_uv), // %0
3273 "+r"(dst_uv), // %1
3274 "+r"(temp_width) // %2
3275 : "m"(kShuffleMirrorUV) // %3
3276 : "memory", "cc", "xmm0", "xmm5");
3277 }
3278 #endif // HAS_MIRRORUVROW_AVX2
3279
3280 #ifdef HAS_MIRRORSPLITUVROW_SSSE3
3281 // Shuffle table for reversing the bytes of UV channels.
3282 static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
3283 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
3284 void MirrorSplitUVRow_SSSE3(const uint8_t* src,
3285 uint8_t* dst_u,
3286 uint8_t* dst_v,
3287 int width) {
3288 intptr_t temp_width = (intptr_t)(width);
3289 asm volatile(
3290 "movdqa %4,%%xmm1 \n"
3291 "lea -0x10(%0,%3,2),%0 \n"
3292 "sub %1,%2 \n"
3293
3294 LABELALIGN
3295 "1: \n"
3296 "movdqu (%0),%%xmm0 \n"
3297 "lea -0x10(%0),%0 \n"
3298 "pshufb %%xmm1,%%xmm0 \n"
3299 "movlpd %%xmm0,(%1) \n"
3300 "movhpd %%xmm0,0x00(%1,%2,1) \n"
3301 "lea 0x8(%1),%1 \n"
3302 "sub $8,%3 \n"
3303 "jg 1b \n"
3304 : "+r"(src), // %0
3305 "+r"(dst_u), // %1
3306 "+r"(dst_v), // %2
3307 "+r"(temp_width) // %3
3308 : "m"(kShuffleMirrorSplitUV) // %4
3309 : "memory", "cc", "xmm0", "xmm1");
3310 }
3311 #endif // HAS_MIRRORSPLITUVROW_SSSE3
3312
3313 #ifdef HAS_RGB24MIRRORROW_SSSE3
3314
3315 // Shuffle the first 5 pixels to the last 5, mirrored. The first byte is zero.
3316 static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
3317 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
3318
3319 // Shuffle the last 5 pixels to the first 5, mirrored. The last byte is zero.
3320 static const uvec8 kShuffleMirrorRGB1 = {
3321 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
3322
3323 // Shuffle 5 pixels at a time (15 bytes)
3324 void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
3325 uint8_t* dst_rgb24,
3326 int width) {
3327 intptr_t temp_width = (intptr_t)(width);
3328 src_rgb24 += width * 3 - 48;
3329 asm volatile(
3330 "movdqa %3,%%xmm4 \n"
3331 "movdqa %4,%%xmm5 \n"
3332
3333 LABELALIGN
3334 "1: \n"
3335 "movdqu (%0),%%xmm0 \n" // first 5
3336 "movdqu 15(%0),%%xmm1 \n" // next 5
3337 "movdqu 30(%0),%%xmm2 \n" // next 5
3338 "movdqu 32(%0),%%xmm3 \n" // last 1 special
3339 "pshufb %%xmm4,%%xmm0 \n"
3340 "pshufb %%xmm4,%%xmm1 \n"
3341 "pshufb %%xmm4,%%xmm2 \n"
3342 "pshufb %%xmm5,%%xmm3 \n"
3343 "lea -0x30(%0),%0 \n"
3344 "movdqu %%xmm0,32(%1) \n" // last 5
3345 "movdqu %%xmm1,17(%1) \n" // next 5
3346 "movdqu %%xmm2,2(%1) \n" // next 5
3347 "movlpd %%xmm3,0(%1) \n" // first 1
3348 "lea 0x30(%1),%1 \n"
3349 "sub $0x10,%2 \n"
3350 "jg 1b \n"
3351 : "+r"(src_rgb24), // %0
3352 "+r"(dst_rgb24), // %1
3353 "+r"(temp_width) // %2
3354 : "m"(kShuffleMirrorRGB0), // %3
3355 "m"(kShuffleMirrorRGB1) // %4
3356 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
3357 }
3358 #endif // HAS_RGB24MIRRORROW_SSSE3
3359
3360 #ifdef HAS_ARGBMIRRORROW_SSE2
3361
3362 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
3363 intptr_t temp_width = (intptr_t)(width);
3364 asm volatile(
3365
3366 "lea -0x10(%0,%2,4),%0 \n"
3367
3368 LABELALIGN
3369 "1: \n"
3370 "movdqu (%0),%%xmm0 \n"
3371 "pshufd $0x1b,%%xmm0,%%xmm0 \n"
3372 "lea -0x10(%0),%0 \n"
3373 "movdqu %%xmm0,(%1) \n"
3374 "lea 0x10(%1),%1 \n"
3375 "sub $0x4,%2 \n"
3376 "jg 1b \n"
3377 : "+r"(src), // %0
3378 "+r"(dst), // %1
3379 "+r"(temp_width) // %2
3380 :
3381 : "memory", "cc", "xmm0");
3382 }
3383 #endif // HAS_ARGBMIRRORROW_SSE2
3384
3385 #ifdef HAS_ARGBMIRRORROW_AVX2
3386 // Shuffle table for reversing the bytes.
3387 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3388 void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
3389 intptr_t temp_width = (intptr_t)(width);
3390 asm volatile(
3391
3392 "vmovdqu %3,%%ymm5 \n"
3393
3394 LABELALIGN
3395 "1: \n"
3396 "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
3397 "vmovdqu %%ymm0,(%1) \n"
3398 "lea 0x20(%1),%1 \n"
3399 "sub $0x8,%2 \n"
3400 "jg 1b \n"
3401 "vzeroupper \n"
3402 : "+r"(src), // %0
3403 "+r"(dst), // %1
3404 "+r"(temp_width) // %2
3405 : "m"(kARGBShuffleMirror_AVX2) // %3
3406 : "memory", "cc", "xmm0", "xmm5");
3407 }
3408 #endif // HAS_ARGBMIRRORROW_AVX2
3409
3410 #ifdef HAS_SPLITUVROW_AVX2
void SplitUVRow_AVX2(const uint8_t* src_uv,
3412 uint8_t* dst_u,
3413 uint8_t* dst_v,
3414 int width) {
3415 asm volatile(
3416 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3417 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3418 "sub %1,%2 \n"
3419
3420 LABELALIGN
3421 "1: \n"
3422 "vmovdqu (%0),%%ymm0 \n"
3423 "vmovdqu 0x20(%0),%%ymm1 \n"
3424 "lea 0x40(%0),%0 \n"
3425 "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
3426 "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
3427 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3428 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3429 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3430 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
3431 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3432 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
3433 "vmovdqu %%ymm0,(%1) \n"
3434 "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
3435 "lea 0x20(%1),%1 \n"
3436 "sub $0x20,%3 \n"
3437 "jg 1b \n"
3438 "vzeroupper \n"
3439 : "+r"(src_uv), // %0
3440 "+r"(dst_u), // %1
3441 "+r"(dst_v), // %2
3442 "+r"(width) // %3
3443 :
3444 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
3445 }
3446 #endif // HAS_SPLITUVROW_AVX2
3447
3448 #ifdef HAS_SPLITUVROW_SSE2
void SplitUVRow_SSE2(const uint8_t* src_uv,
3450 uint8_t* dst_u,
3451 uint8_t* dst_v,
3452 int width) {
3453 asm volatile(
3454 "pcmpeqb %%xmm5,%%xmm5 \n"
3455 "psrlw $0x8,%%xmm5 \n"
3456 "sub %1,%2 \n"
3457
3458 LABELALIGN
3459 "1: \n"
3460 "movdqu (%0),%%xmm0 \n"
3461 "movdqu 0x10(%0),%%xmm1 \n"
3462 "lea 0x20(%0),%0 \n"
3463 "movdqa %%xmm0,%%xmm2 \n"
3464 "movdqa %%xmm1,%%xmm3 \n"
3465 "pand %%xmm5,%%xmm0 \n"
3466 "pand %%xmm5,%%xmm1 \n"
3467 "packuswb %%xmm1,%%xmm0 \n"
3468 "psrlw $0x8,%%xmm2 \n"
3469 "psrlw $0x8,%%xmm3 \n"
3470 "packuswb %%xmm3,%%xmm2 \n"
3471 "movdqu %%xmm0,(%1) \n"
3472 "movdqu %%xmm2,0x00(%1,%2,1) \n"
3473 "lea 0x10(%1),%1 \n"
3474 "sub $0x10,%3 \n"
3475 "jg 1b \n"
3476 : "+r"(src_uv), // %0
3477 "+r"(dst_u), // %1
3478 "+r"(dst_v), // %2
3479 "+r"(width) // %3
3480 :
3481 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
3482 }
3483 #endif // HAS_SPLITUVROW_SSE2
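
// Scalar reference sketch of SplitUV: deinterleave packed UV (as in the
// NV12/NV21 chroma plane) into separate U and V planes; width is the number
// of UV pairs. Hypothetical helper for illustration only.
static __attribute__((unused)) void SplitUVRow_Sketch(const uint8_t* src_uv,
                                                      uint8_t* dst_u,
                                                      uint8_t* dst_v,
                                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[0];
    dst_v[x] = src_uv[1];
    src_uv += 2;
  }
}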
3484
3485 #ifdef HAS_MERGEUVROW_AVX2
void MergeUVRow_AVX2(const uint8_t* src_u,
3487 const uint8_t* src_v,
3488 uint8_t* dst_uv,
3489 int width) {
3490 asm volatile(
3491
3492 "sub %0,%1 \n"
3493
3494 LABELALIGN
3495 "1: \n"
3496 "vmovdqu (%0),%%ymm0 \n"
3497 "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
3498 "lea 0x20(%0),%0 \n"
3499 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
3500 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
3501 "vextractf128 $0x0,%%ymm2,(%2) \n"
3502 "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
3503 "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
3504 "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
3505 "lea 0x40(%2),%2 \n"
3506 "sub $0x20,%3 \n"
3507 "jg 1b \n"
3508 "vzeroupper \n"
3509 : "+r"(src_u), // %0
3510 "+r"(src_v), // %1
3511 "+r"(dst_uv), // %2
3512 "+r"(width) // %3
3513 :
3514 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3515 }
3516 #endif // HAS_MERGEUVROW_AVX2
3517
3518 #ifdef HAS_MERGEUVROW_SSE2
void MergeUVRow_SSE2(const uint8_t* src_u,
3520 const uint8_t* src_v,
3521 uint8_t* dst_uv,
3522 int width) {
3523 asm volatile(
3524
3525 "sub %0,%1 \n"
3526
3527 LABELALIGN
3528 "1: \n"
3529 "movdqu (%0),%%xmm0 \n"
3530 "movdqu 0x00(%0,%1,1),%%xmm1 \n"
3531 "lea 0x10(%0),%0 \n"
3532 "movdqa %%xmm0,%%xmm2 \n"
3533 "punpcklbw %%xmm1,%%xmm0 \n"
3534 "punpckhbw %%xmm1,%%xmm2 \n"
3535 "movdqu %%xmm0,(%2) \n"
3536 "movdqu %%xmm2,0x10(%2) \n"
3537 "lea 0x20(%2),%2 \n"
3538 "sub $0x10,%3 \n"
3539 "jg 1b \n"
3540 : "+r"(src_u), // %0
3541 "+r"(src_v), // %1
3542 "+r"(dst_uv), // %2
3543 "+r"(width) // %3
3544 :
3545 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3546 }
3547 #endif // HAS_MERGEUVROW_SSE2
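
// Scalar reference sketch of MergeUV: interleave separate U and V planes back
// into packed UV, the inverse of SplitUV above. Hypothetical helper for
// illustration only.
static __attribute__((unused)) void MergeUVRow_Sketch(const uint8_t* src_u,
                                                      const uint8_t* src_v,
                                                      uint8_t* dst_uv,
                                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[0] = src_u[x];
    dst_uv[1] = src_v[x];
    dst_uv += 2;
  }
}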
3548
// Use scale to convert lsb-justified formats to msb, depending on how many
// bits there are:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits
3554 #ifdef HAS_MERGEUVROW_16_AVX2
void MergeUVRow_16_AVX2(const uint16_t* src_u,
3556 const uint16_t* src_v,
3557 uint16_t* dst_uv,
3558 int scale,
3559 int width) {
3560 // clang-format off
3561 asm volatile (
3562 "vmovd %4,%%xmm3 \n"
3563 "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
3564 "vbroadcastss %%xmm3,%%ymm3 \n"
3565 "sub %0,%1 \n"
3566
3567 // 16 pixels per loop.
3568 LABELALIGN
3569 "1: \n"
3570 "vmovdqu (%0),%%ymm0 \n"
3571 "vmovdqu (%0,%1,1),%%ymm1 \n"
3572 "add $0x20,%0 \n"
3573
3574 "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
3575 "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
3576 "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
3577 "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
3578 "vextractf128 $0x0,%%ymm2,(%2) \n"
3579 "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
3580 "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
3581 "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
3582 "add $0x40,%2 \n"
3583 "sub $0x10,%3 \n"
3584 "jg 1b \n"
3585 "vzeroupper \n"
3586 : "+r"(src_u), // %0
3587 "+r"(src_v), // %1
3588 "+r"(dst_uv), // %2
3589 "+r"(width) // %3
3590 : "r"(scale) // %4
3591 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
3592 // clang-format on
3593 }
#endif  // HAS_MERGEUVROW_16_AVX2
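
// Scalar reference sketch of the 16-bit merge above: vpmullw keeps only the
// low 16 bits of each product, so multiplying an lsb-justified sample by the
// scale from the table (e.g. 64 for 10-bit data) left-justifies it in 16 bits
// (1023 * 64 = 65472 = 0xFFC0). Hypothetical helper for illustration only.
static __attribute__((unused)) void MergeUVRow_16_Sketch(const uint16_t* src_u,
                                                         const uint16_t* src_v,
                                                         uint16_t* dst_uv,
                                                         int scale,
                                                         int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[0] = (uint16_t)(src_u[x] * scale);  // low 16 bits, like vpmullw
    dst_uv[1] = (uint16_t)(src_v[x] * scale);
    dst_uv += 2;
  }
}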
3595
// Use scale to convert lsb-justified formats to msb, depending on how many
// bits there are:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits
3601 #ifdef HAS_MULTIPLYROW_16_AVX2
void MultiplyRow_16_AVX2(const uint16_t* src_y,
3603 uint16_t* dst_y,
3604 int scale,
3605 int width) {
3606 // clang-format off
3607 asm volatile (
3608 "vmovd %3,%%xmm3 \n"
3609 "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
3610 "vbroadcastss %%xmm3,%%ymm3 \n"
3611 "sub %0,%1 \n"
3612
3613 // 16 pixels per loop.
3614 LABELALIGN
3615 "1: \n"
3616 "vmovdqu (%0),%%ymm0 \n"
3617 "vmovdqu 0x20(%0),%%ymm1 \n"
3618 "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
3619 "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
3620 "vmovdqu %%ymm0,(%0,%1) \n"
3621 "vmovdqu %%ymm1,0x20(%0,%1) \n"
3622 "add $0x40,%0 \n"
3623 "sub $0x20,%2 \n"
3624 "jg 1b \n"
3625 "vzeroupper \n"
3626 : "+r"(src_y), // %0
3627 "+r"(dst_y), // %1
3628 "+r"(width) // %2
3629 : "r"(scale) // %3
3630 : "memory", "cc", "xmm0", "xmm1", "xmm3");
3631 // clang-format on
3632 }
3633 #endif // HAS_MULTIPLYROW_16_AVX2
3634
// Use scale to convert lsb-justified formats down to 8 bits, depending on how
// many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
void Convert16To8Row_SSSE3(const uint16_t* src_y,
3641 uint8_t* dst_y,
3642 int scale,
3643 int width) {
3644 // clang-format off
3645 asm volatile (
3646 "movd %3,%%xmm2 \n"
3647 "punpcklwd %%xmm2,%%xmm2 \n"
3648 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3649
3650 // 32 pixels per loop.
3651 LABELALIGN
3652 "1: \n"
3653 "movdqu (%0),%%xmm0 \n"
3654 "movdqu 0x10(%0),%%xmm1 \n"
3655 "add $0x20,%0 \n"
3656 "pmulhuw %%xmm2,%%xmm0 \n"
3657 "pmulhuw %%xmm2,%%xmm1 \n"
3658 "packuswb %%xmm1,%%xmm0 \n"
3659 "movdqu %%xmm0,(%1) \n"
3660 "add $0x10,%1 \n"
3661 "sub $0x10,%2 \n"
3662 "jg 1b \n"
3663 : "+r"(src_y), // %0
3664 "+r"(dst_y), // %1
3665 "+r"(width) // %2
3666 : "r"(scale) // %3
3667 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3668 // clang-format on
3669 }
3670
3671 #ifdef HAS_CONVERT16TO8ROW_AVX2
void Convert16To8Row_AVX2(const uint16_t* src_y,
3673 uint8_t* dst_y,
3674 int scale,
3675 int width) {
3676 // clang-format off
3677 asm volatile (
3678 "vmovd %3,%%xmm2 \n"
3679 "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
3680 "vbroadcastss %%xmm2,%%ymm2 \n"
3681
3682 // 32 pixels per loop.
3683 LABELALIGN
3684 "1: \n"
3685 "vmovdqu (%0),%%ymm0 \n"
3686 "vmovdqu 0x20(%0),%%ymm1 \n"
3687 "add $0x40,%0 \n"
3688 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3689 "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
3690 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
3691 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3692 "vmovdqu %%ymm0,(%1) \n"
3693 "add $0x20,%1 \n"
3694 "sub $0x20,%2 \n"
3695 "jg 1b \n"
3696 "vzeroupper \n"
3697 : "+r"(src_y), // %0
3698 "+r"(dst_y), // %1
3699 "+r"(width) // %2
3700 : "r"(scale) // %3
3701 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3702 // clang-format on
3703 }
3704 #endif // HAS_CONVERT16TO8ROW_AVX2
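
// Scalar reference sketch of Convert16To8: pmulhuw keeps the high 16 bits of
// the unsigned product, so with scale = 16384 a 10-bit sample v becomes
// (v * 16384) >> 16 = v >> 2, and packuswb clamps the result to 0..255.
// Hypothetical helper for illustration only.
static __attribute__((unused)) void Convert16To8Row_Sketch(
    const uint16_t* src_y, uint8_t* dst_y, int scale, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t v = ((uint32_t)src_y[x] * (uint32_t)scale) >> 16;
    dst_y[x] = (uint8_t)(v > 255 ? 255 : v);
  }
}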
3705
// Use scale to convert 8 bits up to lsb-justified formats, depending on how
// many bits there are:
// 512 = 9 bits
// 1024 = 10 bits
// 4096 = 12 bits
// TODO(fbarchard): reduce to SSE2
void Convert8To16Row_SSE2(const uint8_t* src_y,
3712 uint16_t* dst_y,
3713 int scale,
3714 int width) {
3715 // clang-format off
3716 asm volatile (
3717 "movd %3,%%xmm2 \n"
3718 "punpcklwd %%xmm2,%%xmm2 \n"
3719 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3720
3721 // 32 pixels per loop.
3722 LABELALIGN
3723 "1: \n"
3724 "movdqu (%0),%%xmm0 \n"
3725 "movdqa %%xmm0,%%xmm1 \n"
3726 "punpcklbw %%xmm0,%%xmm0 \n"
3727 "punpckhbw %%xmm1,%%xmm1 \n"
3728 "add $0x10,%0 \n"
3729 "pmulhuw %%xmm2,%%xmm0 \n"
3730 "pmulhuw %%xmm2,%%xmm1 \n"
3731 "movdqu %%xmm0,(%1) \n"
3732 "movdqu %%xmm1,0x10(%1) \n"
3733 "add $0x20,%1 \n"
3734 "sub $0x10,%2 \n"
3735 "jg 1b \n"
3736 : "+r"(src_y), // %0
3737 "+r"(dst_y), // %1
3738 "+r"(width) // %2
3739 : "r"(scale) // %3
3740 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3741 // clang-format on
3742 }
3743
3744 #ifdef HAS_CONVERT8TO16ROW_AVX2
void Convert8To16Row_AVX2(const uint8_t* src_y,
3746 uint16_t* dst_y,
3747 int scale,
3748 int width) {
3749 // clang-format off
3750 asm volatile (
3751 "vmovd %3,%%xmm2 \n"
3752 "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
3753 "vbroadcastss %%xmm2,%%ymm2 \n"
3754
3755 // 32 pixels per loop.
3756 LABELALIGN
3757 "1: \n"
3758 "vmovdqu (%0),%%ymm0 \n"
3759 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3760 "add $0x20,%0 \n"
3761 "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
3762 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
3763 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3764 "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
3765 "vmovdqu %%ymm0,(%1) \n"
3766 "vmovdqu %%ymm1,0x20(%1) \n"
3767 "add $0x40,%1 \n"
3768 "sub $0x20,%2 \n"
3769 "jg 1b \n"
3770 "vzeroupper \n"
3771 : "+r"(src_y), // %0
3772 "+r"(dst_y), // %1
3773 "+r"(width) // %2
3774 : "r"(scale) // %3
3775 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3776 // clang-format on
3777 }
3778 #endif // HAS_CONVERT8TO16ROW_AVX2
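
// Scalar reference sketch of Convert8To16: punpcklbw with itself replicates
// each byte into both halves of a 16-bit lane (v * 0x0101), and pmulhuw then
// keeps the high 16 bits of the product with scale, e.g. scale = 1024 maps
// 255 to 1023. Hypothetical helper for illustration only.
static __attribute__((unused)) void Convert8To16Row_Sketch(
    const uint8_t* src_y, uint16_t* dst_y, int scale, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t v = (uint32_t)src_y[x] * 0x0101u;  // replicate byte into 16 bits
    dst_y[x] = (uint16_t)((v * (uint32_t)scale) >> 16);  // like pmulhuw
  }
}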
3779
3780 #ifdef HAS_SPLITRGBROW_SSSE3
3781
3782 // Shuffle table for converting RGB to Planar.
3783 static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
3784 128u, 128u, 128u, 128u, 128u, 128u,
3785 128u, 128u, 128u, 128u};
3786 static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
3787 2u, 5u, 8u, 11u, 14u, 128u,
3788 128u, 128u, 128u, 128u};
3789 static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
3790 128u, 128u, 128u, 128u, 128u, 1u,
3791 4u, 7u, 10u, 13u};
3792
3793 static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
3794 128u, 128u, 128u, 128u, 128u, 128u,
3795 128u, 128u, 128u, 128u};
3796 static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
3797 3u, 6u, 9u, 12u, 15u, 128u,
3798 128u, 128u, 128u, 128u};
3799 static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
3800 128u, 128u, 128u, 128u, 128u, 2u,
3801 5u, 8u, 11u, 14u};
3802
3803 static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
3804 128u, 128u, 128u, 128u, 128u, 128u,
3805 128u, 128u, 128u, 128u};
3806 static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
3807 4u, 7u, 10u, 13u, 128u, 128u,
3808 128u, 128u, 128u, 128u};
3809 static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
3810 128u, 128u, 128u, 128u, 0u, 3u,
3811 6u, 9u, 12u, 15u};
3812
void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
3814 uint8_t* dst_r,
3815 uint8_t* dst_g,
3816 uint8_t* dst_b,
3817 int width) {
3818 asm volatile(
3819
3820 LABELALIGN
3821 "1: \n"
3822 "movdqu (%0),%%xmm0 \n"
3823 "movdqu 0x10(%0),%%xmm1 \n"
3824 "movdqu 0x20(%0),%%xmm2 \n"
3825 "pshufb %5, %%xmm0 \n"
3826 "pshufb %6, %%xmm1 \n"
3827 "pshufb %7, %%xmm2 \n"
3828 "por %%xmm1,%%xmm0 \n"
3829 "por %%xmm2,%%xmm0 \n"
3830 "movdqu %%xmm0,(%1) \n"
3831 "lea 0x10(%1),%1 \n"
3832
3833 "movdqu (%0),%%xmm0 \n"
3834 "movdqu 0x10(%0),%%xmm1 \n"
3835 "movdqu 0x20(%0),%%xmm2 \n"
3836 "pshufb %8, %%xmm0 \n"
3837 "pshufb %9, %%xmm1 \n"
3838 "pshufb %10, %%xmm2 \n"
3839 "por %%xmm1,%%xmm0 \n"
3840 "por %%xmm2,%%xmm0 \n"
3841 "movdqu %%xmm0,(%2) \n"
3842 "lea 0x10(%2),%2 \n"
3843
3844 "movdqu (%0),%%xmm0 \n"
3845 "movdqu 0x10(%0),%%xmm1 \n"
3846 "movdqu 0x20(%0),%%xmm2 \n"
3847 "pshufb %11, %%xmm0 \n"
3848 "pshufb %12, %%xmm1 \n"
3849 "pshufb %13, %%xmm2 \n"
3850 "por %%xmm1,%%xmm0 \n"
3851 "por %%xmm2,%%xmm0 \n"
3852 "movdqu %%xmm0,(%3) \n"
3853 "lea 0x10(%3),%3 \n"
3854 "lea 0x30(%0),%0 \n"
3855 "sub $0x10,%4 \n"
3856 "jg 1b \n"
3857 : "+r"(src_rgb), // %0
3858 "+r"(dst_r), // %1
3859 "+r"(dst_g), // %2
3860 "+r"(dst_b), // %3
3861 "+r"(width) // %4
3862 : "m"(kShuffleMaskRGBToR0), // %5
3863 "m"(kShuffleMaskRGBToR1), // %6
3864 "m"(kShuffleMaskRGBToR2), // %7
3865 "m"(kShuffleMaskRGBToG0), // %8
3866 "m"(kShuffleMaskRGBToG1), // %9
3867 "m"(kShuffleMaskRGBToG2), // %10
3868 "m"(kShuffleMaskRGBToB0), // %11
3869 "m"(kShuffleMaskRGBToB1), // %12
3870 "m"(kShuffleMaskRGBToB2) // %13
3871 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3872 }
3873 #endif // HAS_SPLITRGBROW_SSSE3
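
// Scalar reference sketch of SplitRGB: scatter packed 3-byte pixels into
// separate R, G and B planes; the shuffle tables above do the same 16 pixels
// at a time by OR-ing three shuffled loads together. Hypothetical helper for
// illustration only.
static __attribute__((unused)) void SplitRGBRow_Sketch(const uint8_t* src_rgb,
                                                       uint8_t* dst_r,
                                                       uint8_t* dst_g,
                                                       uint8_t* dst_b,
                                                       int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_r[x] = src_rgb[0];
    dst_g[x] = src_rgb[1];
    dst_b[x] = src_rgb[2];
    src_rgb += 3;
  }
}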
3874
3875 #ifdef HAS_MERGERGBROW_SSSE3
3876
// Shuffle tables for converting planar R, G and B back to packed RGB.
3878 static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
3879 2u, 128u, 128u, 3u, 128u, 128u,
3880 4u, 128u, 128u, 5u};
3881 static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
3882 128u, 2u, 128u, 128u, 3u, 128u,
3883 128u, 4u, 128u, 128u};
3884 static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
3885 128u, 128u, 2u, 128u, 128u, 3u,
3886 128u, 128u, 4u, 128u};
3887
3888 static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
3889 7u, 128u, 128u, 8u, 128u, 128u,
3890 9u, 128u, 128u, 10u};
3891 static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
3892 128u, 7u, 128u, 128u, 8u, 128u,
3893 128u, 9u, 128u, 128u};
3894 static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
3895 128u, 128u, 8u, 128u, 128u, 9u,
3896 128u, 128u, 10u, 128u};
3897
3898 static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
3899 12u, 128u, 128u, 13u, 128u, 128u,
3900 14u, 128u, 128u, 15u};
3901 static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
3902 128u, 13u, 128u, 128u, 14u, 128u,
3903 128u, 15u, 128u, 128u};
3904 static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
3905 128u, 128u, 13u, 128u, 128u, 14u,
3906 128u, 128u, 15u, 128u};
3907
void MergeRGBRow_SSSE3(const uint8_t* src_r,
3909 const uint8_t* src_g,
3910 const uint8_t* src_b,
3911 uint8_t* dst_rgb,
3912 int width) {
3913 asm volatile(
3914
3915 LABELALIGN
3916 "1: \n"
3917 "movdqu (%0),%%xmm0 \n"
3918 "movdqu (%1),%%xmm1 \n"
3919 "movdqu (%2),%%xmm2 \n"
3920 "pshufb %5, %%xmm0 \n"
3921 "pshufb %6, %%xmm1 \n"
3922 "pshufb %7, %%xmm2 \n"
3923 "por %%xmm1,%%xmm0 \n"
3924 "por %%xmm2,%%xmm0 \n"
3925 "movdqu %%xmm0,(%3) \n"
3926
3927 "movdqu (%0),%%xmm0 \n"
3928 "movdqu (%1),%%xmm1 \n"
3929 "movdqu (%2),%%xmm2 \n"
3930 "pshufb %8, %%xmm0 \n"
3931 "pshufb %9, %%xmm1 \n"
3932 "pshufb %10, %%xmm2 \n"
3933 "por %%xmm1,%%xmm0 \n"
3934 "por %%xmm2,%%xmm0 \n"
3935 "movdqu %%xmm0,16(%3) \n"
3936
3937 "movdqu (%0),%%xmm0 \n"
3938 "movdqu (%1),%%xmm1 \n"
3939 "movdqu (%2),%%xmm2 \n"
3940 "pshufb %11, %%xmm0 \n"
3941 "pshufb %12, %%xmm1 \n"
3942 "pshufb %13, %%xmm2 \n"
3943 "por %%xmm1,%%xmm0 \n"
3944 "por %%xmm2,%%xmm0 \n"
3945 "movdqu %%xmm0,32(%3) \n"
3946
3947 "lea 0x10(%0),%0 \n"
3948 "lea 0x10(%1),%1 \n"
3949 "lea 0x10(%2),%2 \n"
3950 "lea 0x30(%3),%3 \n"
3951 "sub $0x10,%4 \n"
3952 "jg 1b \n"
3953 : "+r"(src_r), // %0
3954 "+r"(src_g), // %1
3955 "+r"(src_b), // %2
3956 "+r"(dst_rgb), // %3
3957 "+r"(width) // %4
3958 : "m"(kShuffleMaskRToRGB0), // %5
3959 "m"(kShuffleMaskGToRGB0), // %6
3960 "m"(kShuffleMaskBToRGB0), // %7
3961 "m"(kShuffleMaskRToRGB1), // %8
3962 "m"(kShuffleMaskGToRGB1), // %9
3963 "m"(kShuffleMaskBToRGB1), // %10
3964 "m"(kShuffleMaskRToRGB2), // %11
3965 "m"(kShuffleMaskGToRGB2), // %12
3966 "m"(kShuffleMaskBToRGB2) // %13
3967 : "memory", "cc", "xmm0", "xmm1", "xmm2");
3968 }
3969 #endif // HAS_MERGERGBROW_SSSE3
3970
3971 #ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
3973 asm volatile(
3974 "test $0xf,%0 \n"
3975 "jne 2f \n"
3976 "test $0xf,%1 \n"
3977 "jne 2f \n"
3978
3979 LABELALIGN
3980 "1: \n"
3981 "movdqa (%0),%%xmm0 \n"
3982 "movdqa 0x10(%0),%%xmm1 \n"
3983 "lea 0x20(%0),%0 \n"
3984 "movdqa %%xmm0,(%1) \n"
3985 "movdqa %%xmm1,0x10(%1) \n"
3986 "lea 0x20(%1),%1 \n"
3987 "sub $0x20,%2 \n"
3988 "jg 1b \n"
3989 "jmp 9f \n"
3990
3991 LABELALIGN
3992 "2: \n"
3993 "movdqu (%0),%%xmm0 \n"
3994 "movdqu 0x10(%0),%%xmm1 \n"
3995 "lea 0x20(%0),%0 \n"
3996 "movdqu %%xmm0,(%1) \n"
3997 "movdqu %%xmm1,0x10(%1) \n"
3998 "lea 0x20(%1),%1 \n"
3999 "sub $0x20,%2 \n"
4000 "jg 2b \n"
4001
4002 LABELALIGN "9: \n"
4003 : "+r"(src), // %0
4004 "+r"(dst), // %1
4005 "+r"(width) // %2
4006 :
4007 : "memory", "cc", "xmm0", "xmm1");
4008 }
4009 #endif // HAS_COPYROW_SSE2
4010
4011 #ifdef HAS_COPYROW_AVX
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
4013 asm volatile(
4014
4015 LABELALIGN
4016 "1: \n"
4017 "vmovdqu (%0),%%ymm0 \n"
4018 "vmovdqu 0x20(%0),%%ymm1 \n"
4019 "lea 0x40(%0),%0 \n"
4020 "vmovdqu %%ymm0,(%1) \n"
4021 "vmovdqu %%ymm1,0x20(%1) \n"
4022 "lea 0x40(%1),%1 \n"
4023 "sub $0x40,%2 \n"
4024 "jg 1b \n"
4025 : "+r"(src), // %0
4026 "+r"(dst), // %1
4027 "+r"(width) // %2
4028 :
4029 : "memory", "cc", "xmm0", "xmm1");
4030 }
4031 #endif // HAS_COPYROW_AVX
4032
4033 #ifdef HAS_COPYROW_ERMS
// Width can be any multiple of 1 (no alignment requirement).
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
4036 size_t width_tmp = (size_t)(width);
4037 asm volatile(
4038
4039 "rep movsb \n"
4040 : "+S"(src), // %0
4041 "+D"(dst), // %1
4042 "+c"(width_tmp) // %2
4043 :
4044 : "memory", "cc");
4045 }
4046 #endif // HAS_COPYROW_ERMS
4047
4048 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
4049 // width in pixels
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
4051 asm volatile(
4052 "pcmpeqb %%xmm0,%%xmm0 \n"
4053 "pslld $0x18,%%xmm0 \n"
4054 "pcmpeqb %%xmm1,%%xmm1 \n"
4055 "psrld $0x8,%%xmm1 \n"
4056
4057 LABELALIGN
4058 "1: \n"
4059 "movdqu (%0),%%xmm2 \n"
4060 "movdqu 0x10(%0),%%xmm3 \n"
4061 "lea 0x20(%0),%0 \n"
4062 "movdqu (%1),%%xmm4 \n"
4063 "movdqu 0x10(%1),%%xmm5 \n"
4064 "pand %%xmm0,%%xmm2 \n"
4065 "pand %%xmm0,%%xmm3 \n"
4066 "pand %%xmm1,%%xmm4 \n"
4067 "pand %%xmm1,%%xmm5 \n"
4068 "por %%xmm4,%%xmm2 \n"
4069 "por %%xmm5,%%xmm3 \n"
4070 "movdqu %%xmm2,(%1) \n"
4071 "movdqu %%xmm3,0x10(%1) \n"
4072 "lea 0x20(%1),%1 \n"
4073 "sub $0x8,%2 \n"
4074 "jg 1b \n"
4075 : "+r"(src), // %0
4076 "+r"(dst), // %1
4077 "+r"(width) // %2
4078 :
4079 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4080 }
4081 #endif // HAS_ARGBCOPYALPHAROW_SSE2
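
// Scalar reference sketch of ARGBCopyAlpha: copy only the alpha byte of each
// source pixel over the destination, leaving the destination B, G and R bytes
// untouched (the SSE2 path does this with complementary masks and por).
// Hypothetical helper for illustration only.
static __attribute__((unused)) void ARGBCopyAlphaRow_Sketch(const uint8_t* src,
                                                            uint8_t* dst,
                                                            int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x * 4 + 3] = src[x * 4 + 3];  // alpha is the high byte of each pixel
  }
}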
4082
4083 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
4084 // width in pixels
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
4086 asm volatile(
4087 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
4088 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
4089
4090 LABELALIGN
4091 "1: \n"
4092 "vmovdqu (%0),%%ymm1 \n"
4093 "vmovdqu 0x20(%0),%%ymm2 \n"
4094 "lea 0x40(%0),%0 \n"
4095 "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
4096 "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
4097 "vmovdqu %%ymm1,(%1) \n"
4098 "vmovdqu %%ymm2,0x20(%1) \n"
4099 "lea 0x40(%1),%1 \n"
4100 "sub $0x10,%2 \n"
4101 "jg 1b \n"
4102 "vzeroupper \n"
4103 : "+r"(src), // %0
4104 "+r"(dst), // %1
4105 "+r"(width) // %2
4106 :
4107 : "memory", "cc", "xmm0", "xmm1", "xmm2");
4108 }
4109 #endif // HAS_ARGBCOPYALPHAROW_AVX2
4110
4111 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
4112 // width in pixels
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
4114 uint8_t* dst_a,
4115 int width) {
4116 asm volatile(
4117
4118 LABELALIGN
4119 "1: \n"
4120 "movdqu (%0), %%xmm0 \n"
4121 "movdqu 0x10(%0), %%xmm1 \n"
4122 "lea 0x20(%0), %0 \n"
4123 "psrld $0x18, %%xmm0 \n"
4124 "psrld $0x18, %%xmm1 \n"
4125 "packssdw %%xmm1, %%xmm0 \n"
4126 "packuswb %%xmm0, %%xmm0 \n"
4127 "movq %%xmm0,(%1) \n"
4128 "lea 0x8(%1), %1 \n"
4129 "sub $0x8, %2 \n"
4130 "jg 1b \n"
4131 : "+r"(src_argb), // %0
4132 "+r"(dst_a), // %1
4133 "+rm"(width) // %2
4134 :
4135 : "memory", "cc", "xmm0", "xmm1");
4136 }
4137 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
4138
4139 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
4140 static const uvec8 kShuffleAlphaShort_AVX2 = {
4141 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
4142 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
4143
void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
4145 uint8_t* dst_a,
4146 int width) {
4147 asm volatile(
4148 "vmovdqa %3,%%ymm4 \n"
4149 "vbroadcastf128 %4,%%ymm5 \n"
4150
4151 LABELALIGN
4152 "1: \n"
4153 "vmovdqu (%0), %%ymm0 \n"
4154 "vmovdqu 0x20(%0), %%ymm1 \n"
4155 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
4156 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
4157 "vmovdqu 0x40(%0), %%ymm2 \n"
4158 "vmovdqu 0x60(%0), %%ymm3 \n"
4159 "lea 0x80(%0), %0 \n"
4160 "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
4161 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
4162 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
4163 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
4164 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
4165 "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
4166 "vmovdqu %%ymm0,(%1) \n"
4167 "lea 0x20(%1),%1 \n"
4168 "sub $0x20, %2 \n"
4169 "jg 1b \n"
4170 "vzeroupper \n"
4171 : "+r"(src_argb), // %0
4172 "+r"(dst_a), // %1
4173 "+rm"(width) // %2
4174 : "m"(kPermdARGBToY_AVX), // %3
4175 "m"(kShuffleAlphaShort_AVX2) // %4
4176 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4177 }
4178 #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
4179
4180 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
4181 // width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
4183 asm volatile(
4184 "pcmpeqb %%xmm0,%%xmm0 \n"
4185 "pslld $0x18,%%xmm0 \n"
4186 "pcmpeqb %%xmm1,%%xmm1 \n"
4187 "psrld $0x8,%%xmm1 \n"
4188
4189 LABELALIGN
4190 "1: \n"
4191 "movq (%0),%%xmm2 \n"
4192 "lea 0x8(%0),%0 \n"
4193 "punpcklbw %%xmm2,%%xmm2 \n"
4194 "punpckhwd %%xmm2,%%xmm3 \n"
4195 "punpcklwd %%xmm2,%%xmm2 \n"
4196 "movdqu (%1),%%xmm4 \n"
4197 "movdqu 0x10(%1),%%xmm5 \n"
4198 "pand %%xmm0,%%xmm2 \n"
4199 "pand %%xmm0,%%xmm3 \n"
4200 "pand %%xmm1,%%xmm4 \n"
4201 "pand %%xmm1,%%xmm5 \n"
4202 "por %%xmm4,%%xmm2 \n"
4203 "por %%xmm5,%%xmm3 \n"
4204 "movdqu %%xmm2,(%1) \n"
4205 "movdqu %%xmm3,0x10(%1) \n"
4206 "lea 0x20(%1),%1 \n"
4207 "sub $0x8,%2 \n"
4208 "jg 1b \n"
4209 : "+r"(src), // %0
4210 "+r"(dst), // %1
4211 "+r"(width) // %2
4212 :
4213 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4214 }
4215 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
4216
4217 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
4218 // width in pixels
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
4220 asm volatile(
4221 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
4222 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
4223
4224 LABELALIGN
4225 "1: \n"
4226 "vpmovzxbd (%0),%%ymm1 \n"
4227 "vpmovzxbd 0x8(%0),%%ymm2 \n"
4228 "lea 0x10(%0),%0 \n"
4229 "vpslld $0x18,%%ymm1,%%ymm1 \n"
4230 "vpslld $0x18,%%ymm2,%%ymm2 \n"
4231 "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
4232 "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
4233 "vmovdqu %%ymm1,(%1) \n"
4234 "vmovdqu %%ymm2,0x20(%1) \n"
4235 "lea 0x40(%1),%1 \n"
4236 "sub $0x10,%2 \n"
4237 "jg 1b \n"
4238 "vzeroupper \n"
4239 : "+r"(src), // %0
4240 "+r"(dst), // %1
4241 "+r"(width) // %2
4242 :
4243 : "memory", "cc", "xmm0", "xmm1", "xmm2");
4244 }
4245 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
4246
4247 #ifdef HAS_SETROW_X86
void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
4249 size_t width_tmp = (size_t)(width >> 2);
4250 const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
4251 asm volatile(
4252
4253 "rep stosl \n"
4254 : "+D"(dst), // %0
4255 "+c"(width_tmp) // %1
4256 : "a"(v32) // %2
4257 : "memory", "cc");
4258 }
4259
void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
4261 size_t width_tmp = (size_t)(width);
4262 asm volatile(
4263
4264 "rep stosb \n"
4265 : "+D"(dst), // %0
4266 "+c"(width_tmp) // %1
4267 : "a"(v8) // %2
4268 : "memory", "cc");
4269 }
4270
void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
4272 size_t width_tmp = (size_t)(width);
4273 asm volatile(
4274
4275 "rep stosl \n"
4276 : "+D"(dst_argb), // %0
4277 "+c"(width_tmp) // %1
4278 : "a"(v32) // %2
4279 : "memory", "cc");
4280 }
4281 #endif // HAS_SETROW_X86
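
// Scalar reference sketch of ARGBSetRow: rep stosl stores the 32-bit value
// once per pixel, low byte first. For SetRow_X86 the byte is first replicated
// to 32 bits, e.g. v8 = 0x7f gives v32 = 0x7f * 0x01010101 = 0x7f7f7f7f.
// Hypothetical helper for illustration only.
static __attribute__((unused)) void ARGBSetRow_Sketch(uint8_t* dst_argb,
                                                      uint32_t v32,
                                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = (uint8_t)(v32 & 0xff);          // B
    dst_argb[1] = (uint8_t)((v32 >> 8) & 0xff);   // G
    dst_argb[2] = (uint8_t)((v32 >> 16) & 0xff);  // R
    dst_argb[3] = (uint8_t)(v32 >> 24);           // A
    dst_argb += 4;
  }
}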
4282
4283 #ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
4285 asm volatile(
4286 "pcmpeqb %%xmm5,%%xmm5 \n"
4287 "psrlw $0x8,%%xmm5 \n"
4288
4289 LABELALIGN
4290 "1: \n"
4291 "movdqu (%0),%%xmm0 \n"
4292 "movdqu 0x10(%0),%%xmm1 \n"
4293 "lea 0x20(%0),%0 \n"
4294 "pand %%xmm5,%%xmm0 \n"
4295 "pand %%xmm5,%%xmm1 \n"
4296 "packuswb %%xmm1,%%xmm0 \n"
4297 "movdqu %%xmm0,(%1) \n"
4298 "lea 0x10(%1),%1 \n"
4299 "sub $0x10,%2 \n"
4300 "jg 1b \n"
4301 : "+r"(src_yuy2), // %0
4302 "+r"(dst_y), // %1
4303 "+r"(width) // %2
4304 :
4305 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4306 }
4307
void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
4309 int stride_yuy2,
4310 uint8_t* dst_u,
4311 uint8_t* dst_v,
4312 int width) {
4313 asm volatile(
4314 "pcmpeqb %%xmm5,%%xmm5 \n"
4315 "psrlw $0x8,%%xmm5 \n"
4316 "sub %1,%2 \n"
4317
4318 LABELALIGN
4319 "1: \n"
4320 "movdqu (%0),%%xmm0 \n"
4321 "movdqu 0x10(%0),%%xmm1 \n"
4322 "movdqu 0x00(%0,%4,1),%%xmm2 \n"
4323 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
4324 "lea 0x20(%0),%0 \n"
4325 "pavgb %%xmm2,%%xmm0 \n"
4326 "pavgb %%xmm3,%%xmm1 \n"
4327 "psrlw $0x8,%%xmm0 \n"
4328 "psrlw $0x8,%%xmm1 \n"
4329 "packuswb %%xmm1,%%xmm0 \n"
4330 "movdqa %%xmm0,%%xmm1 \n"
4331 "pand %%xmm5,%%xmm0 \n"
4332 "packuswb %%xmm0,%%xmm0 \n"
4333 "psrlw $0x8,%%xmm1 \n"
4334 "packuswb %%xmm1,%%xmm1 \n"
4335 "movq %%xmm0,(%1) \n"
4336 "movq %%xmm1,0x00(%1,%2,1) \n"
4337 "lea 0x8(%1),%1 \n"
4338 "sub $0x10,%3 \n"
4339 "jg 1b \n"
4340 : "+r"(src_yuy2), // %0
4341 "+r"(dst_u), // %1
4342 "+r"(dst_v), // %2
4343 "+r"(width) // %3
4344 : "r"((intptr_t)(stride_yuy2)) // %4
4345 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4346 }
4347
void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
4349 uint8_t* dst_u,
4350 uint8_t* dst_v,
4351 int width) {
4352 asm volatile(
4353 "pcmpeqb %%xmm5,%%xmm5 \n"
4354 "psrlw $0x8,%%xmm5 \n"
4355 "sub %1,%2 \n"
4356
4357 LABELALIGN
4358 "1: \n"
4359 "movdqu (%0),%%xmm0 \n"
4360 "movdqu 0x10(%0),%%xmm1 \n"
4361 "lea 0x20(%0),%0 \n"
4362 "psrlw $0x8,%%xmm0 \n"
4363 "psrlw $0x8,%%xmm1 \n"
4364 "packuswb %%xmm1,%%xmm0 \n"
4365 "movdqa %%xmm0,%%xmm1 \n"
4366 "pand %%xmm5,%%xmm0 \n"
4367 "packuswb %%xmm0,%%xmm0 \n"
4368 "psrlw $0x8,%%xmm1 \n"
4369 "packuswb %%xmm1,%%xmm1 \n"
4370 "movq %%xmm0,(%1) \n"
4371 "movq %%xmm1,0x00(%1,%2,1) \n"
4372 "lea 0x8(%1),%1 \n"
4373 "sub $0x10,%3 \n"
4374 "jg 1b \n"
4375 : "+r"(src_yuy2), // %0
4376 "+r"(dst_u), // %1
4377 "+r"(dst_v), // %2
4378 "+r"(width) // %3
4379 :
4380 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4381 }
4382
void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
4384 asm volatile(
4385
4386 LABELALIGN
4387 "1: \n"
4388 "movdqu (%0),%%xmm0 \n"
4389 "movdqu 0x10(%0),%%xmm1 \n"
4390 "lea 0x20(%0),%0 \n"
4391 "psrlw $0x8,%%xmm0 \n"
4392 "psrlw $0x8,%%xmm1 \n"
4393 "packuswb %%xmm1,%%xmm0 \n"
4394 "movdqu %%xmm0,(%1) \n"
4395 "lea 0x10(%1),%1 \n"
4396 "sub $0x10,%2 \n"
4397 "jg 1b \n"
4398 : "+r"(src_uyvy), // %0
4399 "+r"(dst_y), // %1
4400 "+r"(width) // %2
4401 :
4402 : "memory", "cc", "xmm0", "xmm1");
4403 }
4404
void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
4406 int stride_uyvy,
4407 uint8_t* dst_u,
4408 uint8_t* dst_v,
4409 int width) {
4410 asm volatile(
4411 "pcmpeqb %%xmm5,%%xmm5 \n"
4412 "psrlw $0x8,%%xmm5 \n"
4413 "sub %1,%2 \n"
4414
4415 LABELALIGN
4416 "1: \n"
4417 "movdqu (%0),%%xmm0 \n"
4418 "movdqu 0x10(%0),%%xmm1 \n"
4419 "movdqu 0x00(%0,%4,1),%%xmm2 \n"
4420 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
4421 "lea 0x20(%0),%0 \n"
4422 "pavgb %%xmm2,%%xmm0 \n"
4423 "pavgb %%xmm3,%%xmm1 \n"
4424 "pand %%xmm5,%%xmm0 \n"
4425 "pand %%xmm5,%%xmm1 \n"
4426 "packuswb %%xmm1,%%xmm0 \n"
4427 "movdqa %%xmm0,%%xmm1 \n"
4428 "pand %%xmm5,%%xmm0 \n"
4429 "packuswb %%xmm0,%%xmm0 \n"
4430 "psrlw $0x8,%%xmm1 \n"
4431 "packuswb %%xmm1,%%xmm1 \n"
4432 "movq %%xmm0,(%1) \n"
4433 "movq %%xmm1,0x00(%1,%2,1) \n"
4434 "lea 0x8(%1),%1 \n"
4435 "sub $0x10,%3 \n"
4436 "jg 1b \n"
4437 : "+r"(src_uyvy), // %0
4438 "+r"(dst_u), // %1
4439 "+r"(dst_v), // %2
4440 "+r"(width) // %3
4441 : "r"((intptr_t)(stride_uyvy)) // %4
4442 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
4443 }
4444
void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
4446 uint8_t* dst_u,
4447 uint8_t* dst_v,
4448 int width) {
4449 asm volatile(
4450 "pcmpeqb %%xmm5,%%xmm5 \n"
4451 "psrlw $0x8,%%xmm5 \n"
4452 "sub %1,%2 \n"
4453
4454 LABELALIGN
4455 "1: \n"
4456 "movdqu (%0),%%xmm0 \n"
4457 "movdqu 0x10(%0),%%xmm1 \n"
4458 "lea 0x20(%0),%0 \n"
4459 "pand %%xmm5,%%xmm0 \n"
4460 "pand %%xmm5,%%xmm1 \n"
4461 "packuswb %%xmm1,%%xmm0 \n"
4462 "movdqa %%xmm0,%%xmm1 \n"
4463 "pand %%xmm5,%%xmm0 \n"
4464 "packuswb %%xmm0,%%xmm0 \n"
4465 "psrlw $0x8,%%xmm1 \n"
4466 "packuswb %%xmm1,%%xmm1 \n"
4467 "movq %%xmm0,(%1) \n"
4468 "movq %%xmm1,0x00(%1,%2,1) \n"
4469 "lea 0x8(%1),%1 \n"
4470 "sub $0x10,%3 \n"
4471 "jg 1b \n"
4472 : "+r"(src_uyvy), // %0
4473 "+r"(dst_u), // %1
4474 "+r"(dst_v), // %2
4475 "+r"(width) // %3
4476 :
4477 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4478 }
4479 #endif // HAS_YUY2TOYROW_SSE2
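
// Scalar reference sketch of the YUY2 extractions above: YUY2 stores pixels
// as Y0 U Y1 V, so luma is every even byte and chroma alternates U,V on the
// odd bytes (one U,V pair per 2 pixels). The full ToUV variants additionally
// average chroma with the next row (pavgb) for 4:2:0 output. Hypothetical
// helper for illustration only.
static __attribute__((unused)) void YUY2ToUV422Row_Sketch(
    const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_u[x / 2] = src_yuy2[1];  // U shared by this pixel pair
    dst_v[x / 2] = src_yuy2[3];  // V shared by this pixel pair
    src_yuy2 += 4;
  }
}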
4480
4481 #ifdef HAS_YUY2TOYROW_AVX2
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
4483 asm volatile(
4484 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4485 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
4486
4487 LABELALIGN
4488 "1: \n"
4489 "vmovdqu (%0),%%ymm0 \n"
4490 "vmovdqu 0x20(%0),%%ymm1 \n"
4491 "lea 0x40(%0),%0 \n"
4492 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
4493 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
4494 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4495 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4496 "vmovdqu %%ymm0,(%1) \n"
4497 "lea 0x20(%1),%1 \n"
4498 "sub $0x20,%2 \n"
4499 "jg 1b \n"
4500 "vzeroupper \n"
4501 : "+r"(src_yuy2), // %0
4502 "+r"(dst_y), // %1
4503 "+r"(width) // %2
4504 :
4505 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4506 }
4507
void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
4509 int stride_yuy2,
4510 uint8_t* dst_u,
4511 uint8_t* dst_v,
4512 int width) {
4513 asm volatile(
4514 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4515 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
4516 "sub %1,%2 \n"
4517
4518 LABELALIGN
4519 "1: \n"
4520 "vmovdqu (%0),%%ymm0 \n"
4521 "vmovdqu 0x20(%0),%%ymm1 \n"
4522 "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
4523 "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
4524 "lea 0x40(%0),%0 \n"
4525 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4526 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
4527 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4528 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4529 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
4530 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4531 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
4532 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
4533 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
4534 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4535 "vextractf128 $0x0,%%ymm1,(%1) \n"
4536 "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
4537 "lea 0x10(%1),%1 \n"
4538 "sub $0x20,%3 \n"
4539 "jg 1b \n"
4540 "vzeroupper \n"
4541 : "+r"(src_yuy2), // %0
4542 "+r"(dst_u), // %1
4543 "+r"(dst_v), // %2
4544 "+r"(width) // %3
4545 : "r"((intptr_t)(stride_yuy2)) // %4
4546 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4547 }
4548
void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
4550 uint8_t* dst_u,
4551 uint8_t* dst_v,
4552 int width) {
4553 asm volatile(
4554 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4555 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
4556 "sub %1,%2 \n"
4557
4558 LABELALIGN
4559 "1: \n"
4560 "vmovdqu (%0),%%ymm0 \n"
4561 "vmovdqu 0x20(%0),%%ymm1 \n"
4562 "lea 0x40(%0),%0 \n"
4563 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4564 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
4565 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4566 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4567 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
4568 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4569 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
4570 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
4571 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
4572 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4573 "vextractf128 $0x0,%%ymm1,(%1) \n"
4574 "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
4575 "lea 0x10(%1),%1 \n"
4576 "sub $0x20,%3 \n"
4577 "jg 1b \n"
4578 "vzeroupper \n"
4579 : "+r"(src_yuy2), // %0
4580 "+r"(dst_u), // %1
4581 "+r"(dst_v), // %2
4582 "+r"(width) // %3
4583 :
4584 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4585 }
4586
void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
4588 asm volatile(
4589
4590 LABELALIGN
4591 "1: \n"
4592 "vmovdqu (%0),%%ymm0 \n"
4593 "vmovdqu 0x20(%0),%%ymm1 \n"
4594 "lea 0x40(%0),%0 \n"
4595 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4596 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
4597 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4598 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4599 "vmovdqu %%ymm0,(%1) \n"
4600 "lea 0x20(%1),%1 \n"
4601 "sub $0x20,%2 \n"
4602 "jg 1b \n"
4603 "vzeroupper \n"
4604 : "+r"(src_uyvy), // %0
4605 "+r"(dst_y), // %1
4606 "+r"(width) // %2
4607 :
4608 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4609 }
void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
4611 int stride_uyvy,
4612 uint8_t* dst_u,
4613 uint8_t* dst_v,
4614 int width) {
4615 asm volatile(
4616 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4617 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
4618 "sub %1,%2 \n"
4619
4620 LABELALIGN
4621 "1: \n"
4622 "vmovdqu (%0),%%ymm0 \n"
4623 "vmovdqu 0x20(%0),%%ymm1 \n"
4624 "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
4625 "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
4626 "lea 0x40(%0),%0 \n"
4627 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
4628 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
4629 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4630 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4631 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
4632 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4633 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
4634 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
4635 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
4636 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4637 "vextractf128 $0x0,%%ymm1,(%1) \n"
4638 "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
4639 "lea 0x10(%1),%1 \n"
4640 "sub $0x20,%3 \n"
4641 "jg 1b \n"
4642 "vzeroupper \n"
4643 : "+r"(src_uyvy), // %0
4644 "+r"(dst_u), // %1
4645 "+r"(dst_v), // %2
4646 "+r"(width) // %3
4647 : "r"((intptr_t)(stride_uyvy)) // %4
4648 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4649 }
4650
void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
4652 uint8_t* dst_u,
4653 uint8_t* dst_v,
4654 int width) {
4655 asm volatile(
4656 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4657 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
4658 "sub %1,%2 \n"
4659
4660 LABELALIGN
4661 "1: \n"
4662 "vmovdqu (%0),%%ymm0 \n"
4663 "vmovdqu 0x20(%0),%%ymm1 \n"
4664 "lea 0x40(%0),%0 \n"
4665 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
4666 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
4667 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4668 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4669 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
4670 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4671 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
4672 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
4673 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
4674 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
4675 "vextractf128 $0x0,%%ymm1,(%1) \n"
4676 "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
4677 "lea 0x10(%1),%1 \n"
4678 "sub $0x20,%3 \n"
4679 "jg 1b \n"
4680 "vzeroupper \n"
4681 : "+r"(src_uyvy), // %0
4682 "+r"(dst_u), // %1
4683 "+r"(dst_v), // %2
4684 "+r"(width) // %3
4685 :
4686 : "memory", "cc", "xmm0", "xmm1", "xmm5");
4687 }
4688 #endif // HAS_YUY2TOYROW_AVX2
4689
4690 #ifdef HAS_ARGBBLENDROW_SSSE3
4691 // Shuffle table for isolating alpha.
4692 static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4693 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
4694
// Blend 4 pixels at a time; remaining pixels are handled 1 at a time.
void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
4697 const uint8_t* src_argb1,
4698 uint8_t* dst_argb,
4699 int width) {
4700 asm volatile(
4701 "pcmpeqb %%xmm7,%%xmm7 \n"
4702 "psrlw $0xf,%%xmm7 \n"
4703 "pcmpeqb %%xmm6,%%xmm6 \n"
4704 "psrlw $0x8,%%xmm6 \n"
4705 "pcmpeqb %%xmm5,%%xmm5 \n"
4706 "psllw $0x8,%%xmm5 \n"
4707 "pcmpeqb %%xmm4,%%xmm4 \n"
4708 "pslld $0x18,%%xmm4 \n"
4709 "sub $0x4,%3 \n"
4710 "jl 49f \n"
4711
4712 // 4 pixel loop.
4713 LABELALIGN
4714 "40: \n"
4715 "movdqu (%0),%%xmm3 \n"
4716 "lea 0x10(%0),%0 \n"
4717 "movdqa %%xmm3,%%xmm0 \n"
4718 "pxor %%xmm4,%%xmm3 \n"
4719 "movdqu (%1),%%xmm2 \n"
4720 "pshufb %4,%%xmm3 \n"
4721 "pand %%xmm6,%%xmm2 \n"
4722 "paddw %%xmm7,%%xmm3 \n"
4723 "pmullw %%xmm3,%%xmm2 \n"
4724 "movdqu (%1),%%xmm1 \n"
4725 "lea 0x10(%1),%1 \n"
4726 "psrlw $0x8,%%xmm1 \n"
4727 "por %%xmm4,%%xmm0 \n"
4728 "pmullw %%xmm3,%%xmm1 \n"
4729 "psrlw $0x8,%%xmm2 \n"
4730 "paddusb %%xmm2,%%xmm0 \n"
4731 "pand %%xmm5,%%xmm1 \n"
4732 "paddusb %%xmm1,%%xmm0 \n"
4733 "movdqu %%xmm0,(%2) \n"
4734 "lea 0x10(%2),%2 \n"
4735 "sub $0x4,%3 \n"
4736 "jge 40b \n"
4737
4738 "49: \n"
4739 "add $0x3,%3 \n"
4740 "jl 99f \n"
4741
4742 // 1 pixel loop.
4743 "91: \n"
4744 "movd (%0),%%xmm3 \n"
4745 "lea 0x4(%0),%0 \n"
4746 "movdqa %%xmm3,%%xmm0 \n"
4747 "pxor %%xmm4,%%xmm3 \n"
4748 "movd (%1),%%xmm2 \n"
4749 "pshufb %4,%%xmm3 \n"
4750 "pand %%xmm6,%%xmm2 \n"
4751 "paddw %%xmm7,%%xmm3 \n"
4752 "pmullw %%xmm3,%%xmm2 \n"
4753 "movd (%1),%%xmm1 \n"
4754 "lea 0x4(%1),%1 \n"
4755 "psrlw $0x8,%%xmm1 \n"
4756 "por %%xmm4,%%xmm0 \n"
4757 "pmullw %%xmm3,%%xmm1 \n"
4758 "psrlw $0x8,%%xmm2 \n"
4759 "paddusb %%xmm2,%%xmm0 \n"
4760 "pand %%xmm5,%%xmm1 \n"
4761 "paddusb %%xmm1,%%xmm0 \n"
4762 "movd %%xmm0,(%2) \n"
4763 "lea 0x4(%2),%2 \n"
4764 "sub $0x1,%3 \n"
4765 "jge 91b \n"
4766 "99: \n"
4767 : "+r"(src_argb0), // %0
4768 "+r"(src_argb1), // %1
4769 "+r"(dst_argb), // %2
4770 "+r"(width) // %3
4771 : "m"(kShuffleAlpha) // %4
4772 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
4773 "xmm7");
4774 }
4775 #endif // HAS_ARGBBLENDROW_SSSE3
4776
4777 #ifdef HAS_BLENDPLANEROW_SSSE3
4778 // Blend 8 pixels at a time.
4779 // unsigned version of math
4780 // =((A2*C2)+(B2*(255-C2))+255)/256
4781 // signed version of math
4782 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
void BlendPlaneRow_SSSE3(const uint8_t* src0,
4784 const uint8_t* src1,
4785 const uint8_t* alpha,
4786 uint8_t* dst,
4787 int width) {
4788 asm volatile(
4789 "pcmpeqb %%xmm5,%%xmm5 \n"
4790 "psllw $0x8,%%xmm5 \n"
4791 "mov $0x80808080,%%eax \n"
4792 "movd %%eax,%%xmm6 \n"
4793 "pshufd $0x0,%%xmm6,%%xmm6 \n"
4794 "mov $0x807f807f,%%eax \n"
4795 "movd %%eax,%%xmm7 \n"
4796 "pshufd $0x0,%%xmm7,%%xmm7 \n"
4797 "sub %2,%0 \n"
4798 "sub %2,%1 \n"
4799 "sub %2,%3 \n"
4800
4801 // 8 pixel loop.
4802 LABELALIGN
4803 "1: \n"
4804 "movq (%2),%%xmm0 \n"
4805 "punpcklbw %%xmm0,%%xmm0 \n"
4806 "pxor %%xmm5,%%xmm0 \n"
4807 "movq (%0,%2,1),%%xmm1 \n"
4808 "movq (%1,%2,1),%%xmm2 \n"
4809 "punpcklbw %%xmm2,%%xmm1 \n"
4810 "psubb %%xmm6,%%xmm1 \n"
4811 "pmaddubsw %%xmm1,%%xmm0 \n"
4812 "paddw %%xmm7,%%xmm0 \n"
4813 "psrlw $0x8,%%xmm0 \n"
4814 "packuswb %%xmm0,%%xmm0 \n"
4815 "movq %%xmm0,(%3,%2,1) \n"
4816 "lea 0x8(%2),%2 \n"
4817 "sub $0x8,%4 \n"
4818 "jg 1b \n"
4819 : "+r"(src0), // %0
4820 "+r"(src1), // %1
4821 "+r"(alpha), // %2
4822 "+r"(dst), // %3
4823 "+rm"(width) // %4
4824 ::"memory",
4825 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
4826 }
4827 #endif // HAS_BLENDPLANEROW_SSSE3
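
// Scalar reference sketch of the unsigned blend formula quoted above; the
// SIMD paths use the signed variant so that pmaddubsw can form
// a*alpha + b*(255-alpha) in a single instruction. Hypothetical helper for
// illustration only.
static __attribute__((unused)) void BlendPlaneRow_Sketch(const uint8_t* src0,
                                                         const uint8_t* src1,
                                                         const uint8_t* alpha,
                                                         uint8_t* dst,
                                                         int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t a = alpha[x];
    dst[x] = (uint8_t)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}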
4828
4829 #ifdef HAS_BLENDPLANEROW_AVX2
4830 // Blend 32 pixels at a time.
4831 // unsigned version of math
4832 // =((A2*C2)+(B2*(255-C2))+255)/256
4833 // signed version of math
4834 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
void BlendPlaneRow_AVX2(const uint8_t* src0,
4836 const uint8_t* src1,
4837 const uint8_t* alpha,
4838 uint8_t* dst,
4839 int width) {
4840 asm volatile(
4841 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4842 "vpsllw $0x8,%%ymm5,%%ymm5 \n"
4843 "mov $0x80808080,%%eax \n"
4844 "vmovd %%eax,%%xmm6 \n"
4845 "vbroadcastss %%xmm6,%%ymm6 \n"
4846 "mov $0x807f807f,%%eax \n"
4847 "vmovd %%eax,%%xmm7 \n"
4848 "vbroadcastss %%xmm7,%%ymm7 \n"
4849 "sub %2,%0 \n"
4850 "sub %2,%1 \n"
4851 "sub %2,%3 \n"
4852
4853 // 32 pixel loop.
4854 LABELALIGN
4855 "1: \n"
4856 "vmovdqu (%2),%%ymm0 \n"
4857 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
4858 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
4859 "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
4860 "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
4861 "vmovdqu (%0,%2,1),%%ymm1 \n"
4862 "vmovdqu (%1,%2,1),%%ymm2 \n"
4863 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
4864 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
4865 "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
4866 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
4867 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
4868 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
4869 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
4870 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
4871 "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
4872 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4873 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
4874 "vmovdqu %%ymm0,(%3,%2,1) \n"
4875 "lea 0x20(%2),%2 \n"
4876 "sub $0x20,%4 \n"
4877 "jg 1b \n"
4878 "vzeroupper \n"
4879 : "+r"(src0), // %0
4880 "+r"(src1), // %1
4881 "+r"(alpha), // %2
4882 "+r"(dst), // %3
4883 "+rm"(width) // %4
4884 ::"memory",
4885 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
4886 "xmm7");
4887 }
4888 #endif // HAS_BLENDPLANEROW_AVX2
4889
4890 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4891 // Shuffle table duplicating alpha.
4892 static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
4893 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
4894 static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4895 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
4896 // Attenuate 4 pixels at a time.
void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
4898 uint8_t* dst_argb,
4899 int width) {
4900 asm volatile(
4901 "pcmpeqb %%xmm3,%%xmm3 \n"
4902 "pslld $0x18,%%xmm3 \n"
4903 "movdqa %3,%%xmm4 \n"
4904 "movdqa %4,%%xmm5 \n"
4905
4906 // 4 pixel loop.
4907 LABELALIGN
4908 "1: \n"
4909 "movdqu (%0),%%xmm0 \n"
4910 "pshufb %%xmm4,%%xmm0 \n"
4911 "movdqu (%0),%%xmm1 \n"
4912 "punpcklbw %%xmm1,%%xmm1 \n"
4913 "pmulhuw %%xmm1,%%xmm0 \n"
4914 "movdqu (%0),%%xmm1 \n"
4915 "pshufb %%xmm5,%%xmm1 \n"
4916 "movdqu (%0),%%xmm2 \n"
4917 "punpckhbw %%xmm2,%%xmm2 \n"
4918 "pmulhuw %%xmm2,%%xmm1 \n"
4919 "movdqu (%0),%%xmm2 \n"
4920 "lea 0x10(%0),%0 \n"
4921 "pand %%xmm3,%%xmm2 \n"
4922 "psrlw $0x8,%%xmm0 \n"
4923 "psrlw $0x8,%%xmm1 \n"
4924 "packuswb %%xmm1,%%xmm0 \n"
4925 "por %%xmm2,%%xmm0 \n"
4926 "movdqu %%xmm0,(%1) \n"
4927 "lea 0x10(%1),%1 \n"
4928 "sub $0x4,%2 \n"
4929 "jg 1b \n"
4930 : "+r"(src_argb), // %0
4931 "+r"(dst_argb), // %1
4932 "+r"(width) // %2
4933 : "m"(kShuffleAlpha0), // %3
4934 "m"(kShuffleAlpha1) // %4
4935 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
4936 }
4937 #endif // HAS_ARGBATTENUATEROW_SSSE3
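
// Scalar reference sketch of attenuation (premultiplying each color channel
// by alpha). The SSSE3/AVX2 paths work in fixed point via pmulhuw, so their
// rounding differs slightly; this is a conceptual reference, not bit-exact.
// Hypothetical helper for illustration only.
static __attribute__((unused)) void ARGBAttenuateRow_Sketch(
    const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t a = src_argb[3];
    dst_argb[0] = (uint8_t)((src_argb[0] * a + 255) >> 8);  // B
    dst_argb[1] = (uint8_t)((src_argb[1] * a + 255) >> 8);  // G
    dst_argb[2] = (uint8_t)((src_argb[2] * a + 255) >> 8);  // R
    dst_argb[3] = (uint8_t)a;                               // A unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}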
4938
4939 #ifdef HAS_ARGBATTENUATEROW_AVX2
4940 // Shuffle table duplicating alpha.
4941 static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
4942 128u, 128u, 14u, 15u, 14u, 15u,
4943 14u, 15u, 128u, 128u};
4944 // Attenuate 8 pixels at a time.
void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
4946 uint8_t* dst_argb,
4947 int width) {
4948 asm volatile(
4949 "vbroadcastf128 %3,%%ymm4 \n"
4950 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
4951 "vpslld $0x18,%%ymm5,%%ymm5 \n"
4952 "sub %0,%1 \n"
4953
4954 // 8 pixel loop.
4955 LABELALIGN
4956 "1: \n"
4957 "vmovdqu (%0),%%ymm6 \n"
4958 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
4959 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
4960 "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
4961 "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
4962 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
4963 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
4964 "vpand %%ymm5,%%ymm6,%%ymm6 \n"
4965 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
4966 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
4967 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
4968 "vpor %%ymm6,%%ymm0,%%ymm0 \n"
4969 "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
4970 "lea 0x20(%0),%0 \n"
4971 "sub $0x8,%2 \n"
4972 "jg 1b \n"
4973 "vzeroupper \n"
4974 : "+r"(src_argb), // %0
4975 "+r"(dst_argb), // %1
4976 "+r"(width) // %2
4977 : "m"(kShuffleAlpha_AVX2) // %3
4978 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
4979 }
4980 #endif // HAS_ARGBATTENUATEROW_AVX2
4981
4982 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4983 // Unattenuate 4 pixels at a time.
void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
4985 uint8_t* dst_argb,
4986 int width) {
4987 uintptr_t alpha;
4988 asm volatile(
4989 // 4 pixel loop.
4990 LABELALIGN
4991 "1: \n"
4992 "movdqu (%0),%%xmm0 \n"
4993 "movzb 0x03(%0),%3 \n"
4994 "punpcklbw %%xmm0,%%xmm0 \n"
4995 "movd 0x00(%4,%3,4),%%xmm2 \n"
4996 "movzb 0x07(%0),%3 \n"
4997 "movd 0x00(%4,%3,4),%%xmm3 \n"
4998 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
4999 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
5000 "movlhps %%xmm3,%%xmm2 \n"
5001 "pmulhuw %%xmm2,%%xmm0 \n"
5002 "movdqu (%0),%%xmm1 \n"
5003 "movzb 0x0b(%0),%3 \n"
5004 "punpckhbw %%xmm1,%%xmm1 \n"
5005 "movd 0x00(%4,%3,4),%%xmm2 \n"
5006 "movzb 0x0f(%0),%3 \n"
5007 "movd 0x00(%4,%3,4),%%xmm3 \n"
5008 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
5009 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
5010 "movlhps %%xmm3,%%xmm2 \n"
5011 "pmulhuw %%xmm2,%%xmm1 \n"
5012 "lea 0x10(%0),%0 \n"
5013 "packuswb %%xmm1,%%xmm0 \n"
5014 "movdqu %%xmm0,(%1) \n"
5015 "lea 0x10(%1),%1 \n"
5016 "sub $0x4,%2 \n"
5017 "jg 1b \n"
5018 : "+r"(src_argb), // %0
5019 "+r"(dst_argb), // %1
5020 "+r"(width), // %2
5021 "=&r"(alpha) // %3
5022 : "r"(fixed_invtbl8) // %4
5023 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
5024 }
5025 #endif // HAS_ARGBUNATTENUATEROW_SSE2
5026
5027 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
5028 // Shuffle table duplicating alpha.
5029 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
5030 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
5031 // Unattenuate 8 pixels at a time.
void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
5033 uint8_t* dst_argb,
5034 int width) {
5035 uintptr_t alpha;
5036 asm volatile(
5037 "sub %0,%1 \n"
5038 "vbroadcastf128 %5,%%ymm5 \n"
5039
5040 // 8 pixel loop.
5041 LABELALIGN
5042 "1: \n"
5043 // replace VPGATHER
5044 "movzb 0x03(%0),%3 \n"
5045 "vmovd 0x00(%4,%3,4),%%xmm0 \n"
5046 "movzb 0x07(%0),%3 \n"
5047 "vmovd 0x00(%4,%3,4),%%xmm1 \n"
5048 "movzb 0x0b(%0),%3 \n"
5049 "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
5050 "vmovd 0x00(%4,%3,4),%%xmm2 \n"
5051 "movzb 0x0f(%0),%3 \n"
5052 "vmovd 0x00(%4,%3,4),%%xmm3 \n"
5053 "movzb 0x13(%0),%3 \n"
5054 "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
5055 "vmovd 0x00(%4,%3,4),%%xmm0 \n"
5056 "movzb 0x17(%0),%3 \n"
5057 "vmovd 0x00(%4,%3,4),%%xmm1 \n"
5058 "movzb 0x1b(%0),%3 \n"
5059 "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
5060 "vmovd 0x00(%4,%3,4),%%xmm2 \n"
5061 "movzb 0x1f(%0),%3 \n"
5062 "vmovd 0x00(%4,%3,4),%%xmm3 \n"
5063 "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
5064 "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
5065 "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
5066 "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
5067 // end of VPGATHER
5068
5069 "vmovdqu (%0),%%ymm6 \n"
5070 "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
5071 "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
5072 "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
5073 "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
5074 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
5075 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
5076 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
5077 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
5078 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
5079 "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
5080 "lea 0x20(%0),%0 \n"
5081 "sub $0x8,%2 \n"
5082 "jg 1b \n"
5083 "vzeroupper \n"
5084 : "+r"(src_argb), // %0
5085 "+r"(dst_argb), // %1
5086 "+r"(width), // %2
5087 "=&r"(alpha) // %3
5088 : "r"(fixed_invtbl8), // %4
5089 "m"(kUnattenShuffleAlpha_AVX2) // %5
5090 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
5091 "xmm7");
5092 }
5093 #endif // HAS_ARGBUNATTENUATEROW_AVX2
5094
5095 #ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
5098 asm volatile(
5099 "movdqa %3,%%xmm4 \n"
5100 "movdqa %4,%%xmm5 \n"
5101
5102 // 8 pixel loop.
5103 LABELALIGN
5104 "1: \n"
5105 "movdqu (%0),%%xmm0 \n"
5106 "movdqu 0x10(%0),%%xmm1 \n"
5107 "psubb %%xmm5,%%xmm0 \n"
5108 "psubb %%xmm5,%%xmm1 \n"
5109 "movdqu %%xmm4,%%xmm6 \n"
5110 "pmaddubsw %%xmm0,%%xmm6 \n"
5111 "movdqu %%xmm4,%%xmm0 \n"
5112 "pmaddubsw %%xmm1,%%xmm0 \n"
5113 "phaddw %%xmm0,%%xmm6 \n"
5114 "paddw %%xmm5,%%xmm6 \n"
5115 "psrlw $0x8,%%xmm6 \n"
5116 "packuswb %%xmm6,%%xmm6 \n"
5117 "movdqu (%0),%%xmm2 \n"
5118 "movdqu 0x10(%0),%%xmm3 \n"
5119 "lea 0x20(%0),%0 \n"
5120 "psrld $0x18,%%xmm2 \n"
5121 "psrld $0x18,%%xmm3 \n"
5122 "packuswb %%xmm3,%%xmm2 \n"
5123 "packuswb %%xmm2,%%xmm2 \n"
5124 "movdqa %%xmm6,%%xmm3 \n"
5125 "punpcklbw %%xmm6,%%xmm6 \n"
5126 "punpcklbw %%xmm2,%%xmm3 \n"
5127 "movdqa %%xmm6,%%xmm1 \n"
5128 "punpcklwd %%xmm3,%%xmm6 \n"
5129 "punpckhwd %%xmm3,%%xmm1 \n"
5130 "movdqu %%xmm6,(%1) \n"
5131 "movdqu %%xmm1,0x10(%1) \n"
5132 "lea 0x20(%1),%1 \n"
5133 "sub $0x8,%2 \n"
5134 "jg 1b \n"
5135 : "+r"(src_argb), // %0
5136 "+r"(dst_argb), // %1
5137 "+r"(width) // %2
5138 : "m"(kARGBToYJ), // %3
5139 "m"(kSub128) // %4
5140 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
5141 }
5142 #endif // HAS_ARGBGRAYROW_SSSE3
5143
5144 #ifdef HAS_ARGBSEPIAROW_SSSE3
5145 // b = (r * 35 + g * 68 + b * 17) >> 7
5146 // g = (r * 45 + g * 88 + b * 22) >> 7
5147 // r = (r * 50 + g * 98 + b * 24) >> 7
5148 // Constant for ARGB color to sepia tone
5149 static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
5150 17, 68, 35, 0, 17, 68, 35, 0};
5151
5152 static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
5153 22, 88, 45, 0, 22, 88, 45, 0};
5154
5155 static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
5156 24, 98, 50, 0, 24, 98, 50, 0};
5157
5158 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
5160 asm volatile(
5161 "movdqa %2,%%xmm2 \n"
5162 "movdqa %3,%%xmm3 \n"
5163 "movdqa %4,%%xmm4 \n"
5164
5165 // 8 pixel loop.
5166 LABELALIGN
5167 "1: \n"
5168 "movdqu (%0),%%xmm0 \n"
5169 "movdqu 0x10(%0),%%xmm6 \n"
5170 "pmaddubsw %%xmm2,%%xmm0 \n"
5171 "pmaddubsw %%xmm2,%%xmm6 \n"
5172 "phaddw %%xmm6,%%xmm0 \n"
5173 "psrlw $0x7,%%xmm0 \n"
5174 "packuswb %%xmm0,%%xmm0 \n"
5175 "movdqu (%0),%%xmm5 \n"
5176 "movdqu 0x10(%0),%%xmm1 \n"
5177 "pmaddubsw %%xmm3,%%xmm5 \n"
5178 "pmaddubsw %%xmm3,%%xmm1 \n"
5179 "phaddw %%xmm1,%%xmm5 \n"
5180 "psrlw $0x7,%%xmm5 \n"
5181 "packuswb %%xmm5,%%xmm5 \n"
5182 "punpcklbw %%xmm5,%%xmm0 \n"
5183 "movdqu (%0),%%xmm5 \n"
5184 "movdqu 0x10(%0),%%xmm1 \n"
5185 "pmaddubsw %%xmm4,%%xmm5 \n"
5186 "pmaddubsw %%xmm4,%%xmm1 \n"
5187 "phaddw %%xmm1,%%xmm5 \n"
5188 "psrlw $0x7,%%xmm5 \n"
5189 "packuswb %%xmm5,%%xmm5 \n"
5190 "movdqu (%0),%%xmm6 \n"
5191 "movdqu 0x10(%0),%%xmm1 \n"
5192 "psrld $0x18,%%xmm6 \n"
5193 "psrld $0x18,%%xmm1 \n"
5194 "packuswb %%xmm1,%%xmm6 \n"
5195 "packuswb %%xmm6,%%xmm6 \n"
5196 "punpcklbw %%xmm6,%%xmm5 \n"
5197 "movdqa %%xmm0,%%xmm1 \n"
5198 "punpcklwd %%xmm5,%%xmm0 \n"
5199 "punpckhwd %%xmm5,%%xmm1 \n"
5200 "movdqu %%xmm0,(%0) \n"
5201 "movdqu %%xmm1,0x10(%0) \n"
5202 "lea 0x20(%0),%0 \n"
5203 "sub $0x8,%1 \n"
5204 "jg 1b \n"
5205 : "+r"(dst_argb), // %0
5206 "+r"(width) // %1
5207 : "m"(kARGBToSepiaB), // %2
5208 "m"(kARGBToSepiaG), // %3
5209 "m"(kARGBToSepiaR) // %4
5210 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
5211 }
5212 #endif // HAS_ARGBSEPIAROW_SSSE3
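
// Scalar reference sketch of the sepia transform using the same coefficients
// as the constants above; packuswb in the SIMD path clamps results above 255.
// Hypothetical helper for illustration only.
static __attribute__((unused)) void ARGBSepiaRow_Sketch(uint8_t* dst_argb,
                                                        int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int b = dst_argb[0];
    const int g = dst_argb[1];
    const int r = dst_argb[2];
    const int sb = (r * 35 + g * 68 + b * 17) >> 7;
    const int sg = (r * 45 + g * 88 + b * 22) >> 7;
    const int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8_t)(sr > 255 ? 255 : sr);
    // Alpha (dst_argb[3]) is preserved.
    dst_argb += 4;
  }
}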
5213
5214 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
5216 // Same as Sepia except matrix is provided.
void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
5218 uint8_t* dst_argb,
5219 const int8_t* matrix_argb,
5220 int width) {
5221 asm volatile(
5222 "movdqu (%3),%%xmm5 \n"
5223 "pshufd $0x00,%%xmm5,%%xmm2 \n"
5224 "pshufd $0x55,%%xmm5,%%xmm3 \n"
5225 "pshufd $0xaa,%%xmm5,%%xmm4 \n"
5226 "pshufd $0xff,%%xmm5,%%xmm5 \n"
5227
5228 // 8 pixel loop.
5229 LABELALIGN
5230 "1: \n"
5231 "movdqu (%0),%%xmm0 \n"
5232 "movdqu 0x10(%0),%%xmm7 \n"
5233 "pmaddubsw %%xmm2,%%xmm0 \n"
5234 "pmaddubsw %%xmm2,%%xmm7 \n"
5235 "movdqu (%0),%%xmm6 \n"
5236 "movdqu 0x10(%0),%%xmm1 \n"
5237 "pmaddubsw %%xmm3,%%xmm6 \n"
5238 "pmaddubsw %%xmm3,%%xmm1 \n"
5239 "phaddsw %%xmm7,%%xmm0 \n"
5240 "phaddsw %%xmm1,%%xmm6 \n"
5241 "psraw $0x6,%%xmm0 \n"
5242 "psraw $0x6,%%xmm6 \n"
5243 "packuswb %%xmm0,%%xmm0 \n"
5244 "packuswb %%xmm6,%%xmm6 \n"
5245 "punpcklbw %%xmm6,%%xmm0 \n"
5246 "movdqu (%0),%%xmm1 \n"
5247 "movdqu 0x10(%0),%%xmm7 \n"
5248 "pmaddubsw %%xmm4,%%xmm1 \n"
5249 "pmaddubsw %%xmm4,%%xmm7 \n"
5250 "phaddsw %%xmm7,%%xmm1 \n"
5251 "movdqu (%0),%%xmm6 \n"
5252 "movdqu 0x10(%0),%%xmm7 \n"
5253 "pmaddubsw %%xmm5,%%xmm6 \n"
5254 "pmaddubsw %%xmm5,%%xmm7 \n"
5255 "phaddsw %%xmm7,%%xmm6 \n"
5256 "psraw $0x6,%%xmm1 \n"
5257 "psraw $0x6,%%xmm6 \n"
5258 "packuswb %%xmm1,%%xmm1 \n"
5259 "packuswb %%xmm6,%%xmm6 \n"
5260 "punpcklbw %%xmm6,%%xmm1 \n"
5261 "movdqa %%xmm0,%%xmm6 \n"
5262 "punpcklwd %%xmm1,%%xmm0 \n"
5263 "punpckhwd %%xmm1,%%xmm6 \n"
5264 "movdqu %%xmm0,(%1) \n"
5265 "movdqu %%xmm6,0x10(%1) \n"
5266 "lea 0x20(%0),%0 \n"
5267 "lea 0x20(%1),%1 \n"
5268 "sub $0x8,%2 \n"
5269 "jg 1b \n"
5270 : "+r"(src_argb), // %0
5271 "+r"(dst_argb), // %1
5272 "+r"(width) // %2
5273 : "r"(matrix_argb) // %3
5274 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
5275 "xmm7");
5276 }
5277 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
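
// Plain-C sketch of the color-matrix transform above (illustrative only;
// the name ARGBColorMatrixRow_C_Sketch is hypothetical). Each output channel
// is a signed dot product of the source B,G,R,A bytes with one 4-byte row of
// matrix_argb, scaled by >> 6 and clamped to [0, 255], matching the
// pmaddubsw/phaddsw/psraw/packuswb sequence without the intermediate
// 16-bit saturation.
static void ARGBColorMatrixRow_C_Sketch(const uint8_t* src_argb,
                                        uint8_t* dst_argb,
                                        const int8_t* matrix_argb,
                                        int width) {
  int x, j;
  for (x = 0; x < width; ++x) {
    for (j = 0; j < 4; ++j) {
      int v = (src_argb[0] * matrix_argb[j * 4 + 0] +
               src_argb[1] * matrix_argb[j * 4 + 1] +
               src_argb[2] * matrix_argb[j * 4 + 2] +
               src_argb[3] * matrix_argb[j * 4 + 3]) >> 6;
      dst_argb[j] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}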
5278
5279 #ifdef HAS_ARGBQUANTIZEROW_SSE2
5280 // Quantize 4 ARGB pixels (16 bytes).
5281 void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
5282 int scale,
5283 int interval_size,
5284 int interval_offset,
5285 int width) {
5286 asm volatile(
5287 "movd %2,%%xmm2 \n"
5288 "movd %3,%%xmm3 \n"
5289 "movd %4,%%xmm4 \n"
5290 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
5291 "pshufd $0x44,%%xmm2,%%xmm2 \n"
5292 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
5293 "pshufd $0x44,%%xmm3,%%xmm3 \n"
5294 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
5295 "pshufd $0x44,%%xmm4,%%xmm4 \n"
5296 "pxor %%xmm5,%%xmm5 \n"
5297 "pcmpeqb %%xmm6,%%xmm6 \n"
5298 "pslld $0x18,%%xmm6 \n"
5299
5300 // 4 pixel loop.
5301 LABELALIGN
5302 "1: \n"
5303 "movdqu (%0),%%xmm0 \n"
5304 "punpcklbw %%xmm5,%%xmm0 \n"
5305 "pmulhuw %%xmm2,%%xmm0 \n"
5306 "movdqu (%0),%%xmm1 \n"
5307 "punpckhbw %%xmm5,%%xmm1 \n"
5308 "pmulhuw %%xmm2,%%xmm1 \n"
5309 "pmullw %%xmm3,%%xmm0 \n"
5310 "movdqu (%0),%%xmm7 \n"
5311 "pmullw %%xmm3,%%xmm1 \n"
5312 "pand %%xmm6,%%xmm7 \n"
5313 "paddw %%xmm4,%%xmm0 \n"
5314 "paddw %%xmm4,%%xmm1 \n"
5315 "packuswb %%xmm1,%%xmm0 \n"
5316 "por %%xmm7,%%xmm0 \n"
5317 "movdqu %%xmm0,(%0) \n"
5318 "lea 0x10(%0),%0 \n"
5319 "sub $0x4,%1 \n"
5320 "jg 1b \n"
5321 : "+r"(dst_argb), // %0
5322 "+r"(width) // %1
5323 : "r"(scale), // %2
5324 "r"(interval_size), // %3
5325 "r"(interval_offset) // %4
5326 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
5327 "xmm7");
5328 }
5329 #endif // HAS_ARGBQUANTIZEROW_SSE2
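
// Scalar sketch of the quantize kernel above (illustrative; the name is
// hypothetical). scale is used as a fixed-point factor through the high half
// of the 32-bit product (pmulhuw), and the alpha byte is passed through
// unchanged, as the SSE2 code does with its 0xff000000 mask.
static void ARGBQuantizeRow_C_Sketch(uint8_t* dst_argb,
                                     int scale,
                                     int interval_size,
                                     int interval_offset,
                                     int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 3; ++c) {  // B, G, R only; alpha is untouched.
      int v = dst_argb[c];
      dst_argb[c] =
          (uint8_t)(((v * scale) >> 16) * interval_size + interval_offset);
    }
    dst_argb += 4;
  }
}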
5330
5331 #ifdef HAS_ARGBSHADEROW_SSE2
5332 // Shade 4 pixels at a time by specified value.
5333 void ARGBShadeRow_SSE2(const uint8_t* src_argb,
5334 uint8_t* dst_argb,
5335 int width,
5336 uint32_t value) {
5337 asm volatile(
5338 "movd %3,%%xmm2 \n"
5339 "punpcklbw %%xmm2,%%xmm2 \n"
5340 "punpcklqdq %%xmm2,%%xmm2 \n"
5341
5342 // 4 pixel loop.
5343 LABELALIGN
5344 "1: \n"
5345 "movdqu (%0),%%xmm0 \n"
5346 "lea 0x10(%0),%0 \n"
5347 "movdqa %%xmm0,%%xmm1 \n"
5348 "punpcklbw %%xmm0,%%xmm0 \n"
5349 "punpckhbw %%xmm1,%%xmm1 \n"
5350 "pmulhuw %%xmm2,%%xmm0 \n"
5351 "pmulhuw %%xmm2,%%xmm1 \n"
5352 "psrlw $0x8,%%xmm0 \n"
5353 "psrlw $0x8,%%xmm1 \n"
5354 "packuswb %%xmm1,%%xmm0 \n"
5355 "movdqu %%xmm0,(%1) \n"
5356 "lea 0x10(%1),%1 \n"
5357 "sub $0x4,%2 \n"
5358 "jg 1b \n"
5359 : "+r"(src_argb), // %0
5360 "+r"(dst_argb), // %1
5361 "+r"(width) // %2
5362 : "r"(value) // %3
5363 : "memory", "cc", "xmm0", "xmm1", "xmm2");
5364 }
5365 #endif // HAS_ARGBSHADEROW_SSE2
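
// Scalar sketch of the shade kernel (illustrative, hypothetical name).
// Both the pixel byte and the matching byte of 'value' are widened to 16
// bits by byte duplication (x -> x * 0x101), multiplied keeping the high
// half (pmulhuw), then shifted right by 8, i.e.
// out = (p * 0x101 * s * 0x101) >> 24, which approximates p * s / 255.
static void ARGBShadeRow_C_Sketch(const uint8_t* src_argb,
                                  uint8_t* dst_argb,
                                  int width,
                                  uint32_t value) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      uint32_t p = src_argb[x * 4 + c] * 0x101u;
      uint32_t s = ((value >> (c * 8)) & 0xffu) * 0x101u;
      dst_argb[x * 4 + c] = (uint8_t)((p * s) >> 24);
    }
  }
}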
5366
5367 #ifdef HAS_ARGBMULTIPLYROW_SSE2
5368 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
5369 void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
5370 const uint8_t* src_argb1,
5371 uint8_t* dst_argb,
5372 int width) {
5373 asm volatile(
5374
5375 "pxor %%xmm5,%%xmm5 \n"
5376
5377 // 4 pixel loop.
5378 LABELALIGN
5379 "1: \n"
5380 "movdqu (%0),%%xmm0 \n"
5381 "lea 0x10(%0),%0 \n"
5382 "movdqu (%1),%%xmm2 \n"
5383 "lea 0x10(%1),%1 \n"
5384 "movdqu %%xmm0,%%xmm1 \n"
5385 "movdqu %%xmm2,%%xmm3 \n"
5386 "punpcklbw %%xmm0,%%xmm0 \n"
5387 "punpckhbw %%xmm1,%%xmm1 \n"
5388 "punpcklbw %%xmm5,%%xmm2 \n"
5389 "punpckhbw %%xmm5,%%xmm3 \n"
5390 "pmulhuw %%xmm2,%%xmm0 \n"
5391 "pmulhuw %%xmm3,%%xmm1 \n"
5392 "packuswb %%xmm1,%%xmm0 \n"
5393 "movdqu %%xmm0,(%2) \n"
5394 "lea 0x10(%2),%2 \n"
5395 "sub $0x4,%3 \n"
5396 "jg 1b \n"
5397 : "+r"(src_argb0), // %0
5398 "+r"(src_argb1), // %1
5399 "+r"(dst_argb), // %2
5400 "+r"(width) // %3
5401 :
5402 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
5403 }
5404 #endif // HAS_ARGBMULTIPLYROW_SSE2
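
// Scalar sketch of the multiply kernel (illustrative, hypothetical name).
// One operand is widened by byte duplication (a -> a * 0x101), the other is
// zero extended, and pmulhuw keeps the high 16 bits of the product, so each
// channel becomes (a * 0x101 * b) >> 16, a close approximation of a*b/255.
static void ARGBMultiplyRow_C_Sketch(const uint8_t* src_argb0,
                                     const uint8_t* src_argb1,
                                     uint8_t* dst_argb,
                                     int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    uint32_t a = src_argb0[i];
    uint32_t b = src_argb1[i];
    dst_argb[i] = (uint8_t)((a * 0x101u * b) >> 16);
  }
}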
5405
5406 #ifdef HAS_ARGBMULTIPLYROW_AVX2
5407 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
5408 void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
5409 const uint8_t* src_argb1,
5410 uint8_t* dst_argb,
5411 int width) {
5412 asm volatile(
5413
5414 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
5415
5416 // 8 pixel loop.
5417 LABELALIGN
5418 "1: \n"
5419 "vmovdqu (%0),%%ymm1 \n"
5420 "lea 0x20(%0),%0 \n"
5421 "vmovdqu (%1),%%ymm3 \n"
5422 "lea 0x20(%1),%1 \n"
5423 "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
5424 "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
5425 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
5426 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
5427 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
5428 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
5429 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
5430 "vmovdqu %%ymm0,(%2) \n"
5431 "lea 0x20(%2),%2 \n"
5432 "sub $0x8,%3 \n"
5433 "jg 1b \n"
5434 "vzeroupper \n"
5435 : "+r"(src_argb0), // %0
5436 "+r"(src_argb1), // %1
5437 "+r"(dst_argb), // %2
5438 "+r"(width) // %3
5439 :
5440 : "memory", "cc"
5441 #if defined(__AVX2__)
5442 ,
5443 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
5444 #endif
5445 );
5446 }
5447 #endif // HAS_ARGBMULTIPLYROW_AVX2
5448
5449 #ifdef HAS_ARGBADDROW_SSE2
5450 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
5451 void ARGBAddRow_SSE2(const uint8_t* src_argb0,
5452 const uint8_t* src_argb1,
5453 uint8_t* dst_argb,
5454 int width) {
5455 asm volatile(
5456 // 4 pixel loop.
5457 LABELALIGN
5458 "1: \n"
5459 "movdqu (%0),%%xmm0 \n"
5460 "lea 0x10(%0),%0 \n"
5461 "movdqu (%1),%%xmm1 \n"
5462 "lea 0x10(%1),%1 \n"
5463 "paddusb %%xmm1,%%xmm0 \n"
5464 "movdqu %%xmm0,(%2) \n"
5465 "lea 0x10(%2),%2 \n"
5466 "sub $0x4,%3 \n"
5467 "jg 1b \n"
5468 : "+r"(src_argb0), // %0
5469 "+r"(src_argb1), // %1
5470 "+r"(dst_argb), // %2
5471 "+r"(width) // %3
5472 :
5473 : "memory", "cc", "xmm0", "xmm1");
5474 }
5475 #endif // HAS_ARGBADDROW_SSE2
5476
5477 #ifdef HAS_ARGBADDROW_AVX2
5478 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
5479 void ARGBAddRow_AVX2(const uint8_t* src_argb0,
5480 const uint8_t* src_argb1,
5481 uint8_t* dst_argb,
5482 int width) {
5483 asm volatile(
5484 // 8 pixel loop.
5485 LABELALIGN
5486 "1: \n"
5487 "vmovdqu (%0),%%ymm0 \n"
5488 "lea 0x20(%0),%0 \n"
5489 "vpaddusb (%1),%%ymm0,%%ymm0 \n"
5490 "lea 0x20(%1),%1 \n"
5491 "vmovdqu %%ymm0,(%2) \n"
5492 "lea 0x20(%2),%2 \n"
5493 "sub $0x8,%3 \n"
5494 "jg 1b \n"
5495 "vzeroupper \n"
5496 : "+r"(src_argb0), // %0
5497 "+r"(src_argb1), // %1
5498 "+r"(dst_argb), // %2
5499 "+r"(width) // %3
5500 :
5501 : "memory", "cc", "xmm0");
5502 }
5503 #endif // HAS_ARGBADDROW_AVX2
5504
5505 #ifdef HAS_ARGBSUBTRACTROW_SSE2
5506 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
5507 void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
5508 const uint8_t* src_argb1,
5509 uint8_t* dst_argb,
5510 int width) {
5511 asm volatile(
5512 // 4 pixel loop.
5513 LABELALIGN
5514 "1: \n"
5515 "movdqu (%0),%%xmm0 \n"
5516 "lea 0x10(%0),%0 \n"
5517 "movdqu (%1),%%xmm1 \n"
5518 "lea 0x10(%1),%1 \n"
5519 "psubusb %%xmm1,%%xmm0 \n"
5520 "movdqu %%xmm0,(%2) \n"
5521 "lea 0x10(%2),%2 \n"
5522 "sub $0x4,%3 \n"
5523 "jg 1b \n"
5524 : "+r"(src_argb0), // %0
5525 "+r"(src_argb1), // %1
5526 "+r"(dst_argb), // %2
5527 "+r"(width) // %3
5528 :
5529 : "memory", "cc", "xmm0", "xmm1");
5530 }
5531 #endif // HAS_ARGBSUBTRACTROW_SSE2
5532
5533 #ifdef HAS_ARGBSUBTRACTROW_AVX2
5534 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
5535 void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
5536 const uint8_t* src_argb1,
5537 uint8_t* dst_argb,
5538 int width) {
5539 asm volatile(
5540 // 8 pixel loop.
5541 LABELALIGN
5542 "1: \n"
5543 "vmovdqu (%0),%%ymm0 \n"
5544 "lea 0x20(%0),%0 \n"
5545 "vpsubusb (%1),%%ymm0,%%ymm0 \n"
5546 "lea 0x20(%1),%1 \n"
5547 "vmovdqu %%ymm0,(%2) \n"
5548 "lea 0x20(%2),%2 \n"
5549 "sub $0x8,%3 \n"
5550 "jg 1b \n"
5551 "vzeroupper \n"
5552 : "+r"(src_argb0), // %0
5553 "+r"(src_argb1), // %1
5554 "+r"(dst_argb), // %2
5555 "+r"(width) // %3
5556 :
5557 : "memory", "cc", "xmm0");
5558 }
5559 #endif // HAS_ARGBSUBTRACTROW_AVX2
5560
5561 #ifdef HAS_SOBELXROW_SSE2
5562 // SobelX as a matrix is
5563 // -1 0 1
5564 // -2 0 2
5565 // -1 0 1
5566 void SobelXRow_SSE2(const uint8_t* src_y0,
5567 const uint8_t* src_y1,
5568 const uint8_t* src_y2,
5569 uint8_t* dst_sobelx,
5570 int width) {
5571 asm volatile(
5572 "sub %0,%1 \n"
5573 "sub %0,%2 \n"
5574 "sub %0,%3 \n"
5575 "pxor %%xmm5,%%xmm5 \n"
5576
5577 // 8 pixel loop.
5578 LABELALIGN
5579 "1: \n"
5580 "movq (%0),%%xmm0 \n"
5581 "movq 0x2(%0),%%xmm1 \n"
5582 "punpcklbw %%xmm5,%%xmm0 \n"
5583 "punpcklbw %%xmm5,%%xmm1 \n"
5584 "psubw %%xmm1,%%xmm0 \n"
5585 "movq 0x00(%0,%1,1),%%xmm1 \n"
5586 "movq 0x02(%0,%1,1),%%xmm2 \n"
5587 "punpcklbw %%xmm5,%%xmm1 \n"
5588 "punpcklbw %%xmm5,%%xmm2 \n"
5589 "psubw %%xmm2,%%xmm1 \n"
5590 "movq 0x00(%0,%2,1),%%xmm2 \n"
5591 "movq 0x02(%0,%2,1),%%xmm3 \n"
5592 "punpcklbw %%xmm5,%%xmm2 \n"
5593 "punpcklbw %%xmm5,%%xmm3 \n"
5594 "psubw %%xmm3,%%xmm2 \n"
5595 "paddw %%xmm2,%%xmm0 \n"
5596 "paddw %%xmm1,%%xmm0 \n"
5597 "paddw %%xmm1,%%xmm0 \n"
5598 "pxor %%xmm1,%%xmm1 \n"
5599 "psubw %%xmm0,%%xmm1 \n"
5600 "pmaxsw %%xmm1,%%xmm0 \n"
5601 "packuswb %%xmm0,%%xmm0 \n"
5602 "movq %%xmm0,0x00(%0,%3,1) \n"
5603 "lea 0x8(%0),%0 \n"
5604 "sub $0x8,%4 \n"
5605 "jg 1b \n"
5606 : "+r"(src_y0), // %0
5607 "+r"(src_y1), // %1
5608 "+r"(src_y2), // %2
5609 "+r"(dst_sobelx), // %3
5610 "+r"(width) // %4
5611 :
5612 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
5613 }
5614 #endif // HAS_SOBELXROW_SSE2
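
// Scalar sketch of the SobelX kernel (illustrative, hypothetical name).
// For each pixel it forms the horizontal difference on the three rows,
// weights the middle row by 2, and stores the absolute value saturated to
// 255 (the SSE2 code does the abs with pmaxsw against the negation and the
// clamp with packuswb).
static void SobelXRow_C_Sketch(const uint8_t* src_y0,
                               const uint8_t* src_y1,
                               const uint8_t* src_y2,
                               uint8_t* dst_sobelx,
                               int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + b * 2 + c;
    if (sobel < 0) sobel = -sobel;
    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
  }
}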
5615
5616 #ifdef HAS_SOBELYROW_SSE2
5617 // SobelY as a matrix is
5618 // -1 -2 -1
5619 // 0 0 0
5620 // 1 2 1
5621 void SobelYRow_SSE2(const uint8_t* src_y0,
5622 const uint8_t* src_y1,
5623 uint8_t* dst_sobely,
5624 int width) {
5625 asm volatile(
5626 "sub %0,%1 \n"
5627 "sub %0,%2 \n"
5628 "pxor %%xmm5,%%xmm5 \n"
5629
5630 // 8 pixel loop.
5631 LABELALIGN
5632 "1: \n"
5633 "movq (%0),%%xmm0 \n"
5634 "movq 0x00(%0,%1,1),%%xmm1 \n"
5635 "punpcklbw %%xmm5,%%xmm0 \n"
5636 "punpcklbw %%xmm5,%%xmm1 \n"
5637 "psubw %%xmm1,%%xmm0 \n"
5638 "movq 0x1(%0),%%xmm1 \n"
5639 "movq 0x01(%0,%1,1),%%xmm2 \n"
5640 "punpcklbw %%xmm5,%%xmm1 \n"
5641 "punpcklbw %%xmm5,%%xmm2 \n"
5642 "psubw %%xmm2,%%xmm1 \n"
5643 "movq 0x2(%0),%%xmm2 \n"
5644 "movq 0x02(%0,%1,1),%%xmm3 \n"
5645 "punpcklbw %%xmm5,%%xmm2 \n"
5646 "punpcklbw %%xmm5,%%xmm3 \n"
5647 "psubw %%xmm3,%%xmm2 \n"
5648 "paddw %%xmm2,%%xmm0 \n"
5649 "paddw %%xmm1,%%xmm0 \n"
5650 "paddw %%xmm1,%%xmm0 \n"
5651 "pxor %%xmm1,%%xmm1 \n"
5652 "psubw %%xmm0,%%xmm1 \n"
5653 "pmaxsw %%xmm1,%%xmm0 \n"
5654 "packuswb %%xmm0,%%xmm0 \n"
5655 "movq %%xmm0,0x00(%0,%2,1) \n"
5656 "lea 0x8(%0),%0 \n"
5657 "sub $0x8,%3 \n"
5658 "jg 1b \n"
5659 : "+r"(src_y0), // %0
5660 "+r"(src_y1), // %1
5661 "+r"(dst_sobely), // %2
5662 "+r"(width) // %3
5663 :
5664 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
5665 }
5666 #endif // HAS_SOBELYROW_SSE2
5667
5668 #ifdef HAS_SOBELROW_SSE2
5669 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5670 // A = 255
5671 // R = Sobel
5672 // G = Sobel
5673 // B = Sobel
5674 void SobelRow_SSE2(const uint8_t* src_sobelx,
5675 const uint8_t* src_sobely,
5676 uint8_t* dst_argb,
5677 int width) {
5678 asm volatile(
5679 "sub %0,%1 \n"
5680 "pcmpeqb %%xmm5,%%xmm5 \n"
5681 "pslld $0x18,%%xmm5 \n"
5682
5683 // 16 pixel loop.
5684 LABELALIGN
5685 "1: \n"
5686 "movdqu (%0),%%xmm0 \n"
5687 "movdqu 0x00(%0,%1,1),%%xmm1 \n"
5688 "lea 0x10(%0),%0 \n"
5689 "paddusb %%xmm1,%%xmm0 \n"
5690 "movdqa %%xmm0,%%xmm2 \n"
5691 "punpcklbw %%xmm0,%%xmm2 \n"
5692 "punpckhbw %%xmm0,%%xmm0 \n"
5693 "movdqa %%xmm2,%%xmm1 \n"
5694 "punpcklwd %%xmm2,%%xmm1 \n"
5695 "punpckhwd %%xmm2,%%xmm2 \n"
5696 "por %%xmm5,%%xmm1 \n"
5697 "por %%xmm5,%%xmm2 \n"
5698 "movdqa %%xmm0,%%xmm3 \n"
5699 "punpcklwd %%xmm0,%%xmm3 \n"
5700 "punpckhwd %%xmm0,%%xmm0 \n"
5701 "por %%xmm5,%%xmm3 \n"
5702 "por %%xmm5,%%xmm0 \n"
5703 "movdqu %%xmm1,(%2) \n"
5704 "movdqu %%xmm2,0x10(%2) \n"
5705 "movdqu %%xmm3,0x20(%2) \n"
5706 "movdqu %%xmm0,0x30(%2) \n"
5707 "lea 0x40(%2),%2 \n"
5708 "sub $0x10,%3 \n"
5709 "jg 1b \n"
5710 : "+r"(src_sobelx), // %0
5711 "+r"(src_sobely), // %1
5712 "+r"(dst_argb), // %2
5713 "+r"(width) // %3
5714 :
5715 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
5716 }
5717 #endif // HAS_SOBELROW_SSE2
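
// Scalar sketch of the Sobel-to-ARGB kernel (illustrative, hypothetical
// name): the X and Y magnitudes are added with unsigned saturation and the
// result is replicated into B, G and R with alpha forced to 255, matching
// the paddusb plus unpack/por sequence above.
static void SobelRow_C_Sketch(const uint8_t* src_sobelx,
                              const uint8_t* src_sobely,
                              uint8_t* dst_argb,
                              int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    if (s > 255) s = 255;
    dst_argb[0] = (uint8_t)s;
    dst_argb[1] = (uint8_t)s;
    dst_argb[2] = (uint8_t)s;
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}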
5718
5719 #ifdef HAS_SOBELTOPLANEROW_SSE2
5720 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
5721 void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
5722 const uint8_t* src_sobely,
5723 uint8_t* dst_y,
5724 int width) {
5725 asm volatile(
5726 "sub %0,%1 \n"
5727 "pcmpeqb %%xmm5,%%xmm5 \n"
5728 "pslld $0x18,%%xmm5 \n"
5729
5730 // 16 pixel loop.
5731 LABELALIGN
5732 "1: \n"
5733 "movdqu (%0),%%xmm0 \n"
5734 "movdqu 0x00(%0,%1,1),%%xmm1 \n"
5735 "lea 0x10(%0),%0 \n"
5736 "paddusb %%xmm1,%%xmm0 \n"
5737 "movdqu %%xmm0,(%2) \n"
5738 "lea 0x10(%2),%2 \n"
5739 "sub $0x10,%3 \n"
5740 "jg 1b \n"
5741 : "+r"(src_sobelx), // %0
5742 "+r"(src_sobely), // %1
5743 "+r"(dst_y), // %2
5744 "+r"(width) // %3
5745 :
5746 : "memory", "cc", "xmm0", "xmm1");
5747 }
5748 #endif // HAS_SOBELTOPLANEROW_SSE2
5749
5750 #ifdef HAS_SOBELXYROW_SSE2
5751 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5752 // A = 255
5753 // R = Sobel X
5754 // G = Sobel
5755 // B = Sobel Y
5756 void SobelXYRow_SSE2(const uint8_t* src_sobelx,
5757 const uint8_t* src_sobely,
5758 uint8_t* dst_argb,
5759 int width) {
5760 asm volatile(
5761 "sub %0,%1 \n"
5762 "pcmpeqb %%xmm5,%%xmm5 \n"
5763
5764 // 16 pixel loop.
5765 LABELALIGN
5766 "1: \n"
5767 "movdqu (%0),%%xmm0 \n"
5768 "movdqu 0x00(%0,%1,1),%%xmm1 \n"
5769 "lea 0x10(%0),%0 \n"
5770 "movdqa %%xmm0,%%xmm2 \n"
5771 "paddusb %%xmm1,%%xmm2 \n"
5772 "movdqa %%xmm0,%%xmm3 \n"
5773 "punpcklbw %%xmm5,%%xmm3 \n"
5774 "punpckhbw %%xmm5,%%xmm0 \n"
5775 "movdqa %%xmm1,%%xmm4 \n"
5776 "punpcklbw %%xmm2,%%xmm4 \n"
5777 "punpckhbw %%xmm2,%%xmm1 \n"
5778 "movdqa %%xmm4,%%xmm6 \n"
5779 "punpcklwd %%xmm3,%%xmm6 \n"
5780 "punpckhwd %%xmm3,%%xmm4 \n"
5781 "movdqa %%xmm1,%%xmm7 \n"
5782 "punpcklwd %%xmm0,%%xmm7 \n"
5783 "punpckhwd %%xmm0,%%xmm1 \n"
5784 "movdqu %%xmm6,(%2) \n"
5785 "movdqu %%xmm4,0x10(%2) \n"
5786 "movdqu %%xmm7,0x20(%2) \n"
5787 "movdqu %%xmm1,0x30(%2) \n"
5788 "lea 0x40(%2),%2 \n"
5789 "sub $0x10,%3 \n"
5790 "jg 1b \n"
5791 : "+r"(src_sobelx), // %0
5792 "+r"(src_sobely), // %1
5793 "+r"(dst_argb), // %2
5794 "+r"(width) // %3
5795 :
5796 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
5797 "xmm7");
5798 }
5799 #endif // HAS_SOBELXYROW_SSE2
5800
5801 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5802 // Creates a table of cumulative sums where each value is a sum of all values
5803 // above and to the left of the value, inclusive of the value.
5804 void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
5805 int32_t* cumsum,
5806 const int32_t* previous_cumsum,
5807 int width) {
5808 asm volatile(
5809 "pxor %%xmm0,%%xmm0 \n"
5810 "pxor %%xmm1,%%xmm1 \n"
5811 "sub $0x4,%3 \n"
5812 "jl 49f \n"
5813 "test $0xf,%1 \n"
5814 "jne 49f \n"
5815
5816 // 4 pixel loop.
5817 LABELALIGN
5818 "40: \n"
5819 "movdqu (%0),%%xmm2 \n"
5820 "lea 0x10(%0),%0 \n"
5821 "movdqa %%xmm2,%%xmm4 \n"
5822 "punpcklbw %%xmm1,%%xmm2 \n"
5823 "movdqa %%xmm2,%%xmm3 \n"
5824 "punpcklwd %%xmm1,%%xmm2 \n"
5825 "punpckhwd %%xmm1,%%xmm3 \n"
5826 "punpckhbw %%xmm1,%%xmm4 \n"
5827 "movdqa %%xmm4,%%xmm5 \n"
5828 "punpcklwd %%xmm1,%%xmm4 \n"
5829 "punpckhwd %%xmm1,%%xmm5 \n"
5830 "paddd %%xmm2,%%xmm0 \n"
5831 "movdqu (%2),%%xmm2 \n"
5832 "paddd %%xmm0,%%xmm2 \n"
5833 "paddd %%xmm3,%%xmm0 \n"
5834 "movdqu 0x10(%2),%%xmm3 \n"
5835 "paddd %%xmm0,%%xmm3 \n"
5836 "paddd %%xmm4,%%xmm0 \n"
5837 "movdqu 0x20(%2),%%xmm4 \n"
5838 "paddd %%xmm0,%%xmm4 \n"
5839 "paddd %%xmm5,%%xmm0 \n"
5840 "movdqu 0x30(%2),%%xmm5 \n"
5841 "lea 0x40(%2),%2 \n"
5842 "paddd %%xmm0,%%xmm5 \n"
5843 "movdqu %%xmm2,(%1) \n"
5844 "movdqu %%xmm3,0x10(%1) \n"
5845 "movdqu %%xmm4,0x20(%1) \n"
5846 "movdqu %%xmm5,0x30(%1) \n"
5847 "lea 0x40(%1),%1 \n"
5848 "sub $0x4,%3 \n"
5849 "jge 40b \n"
5850
5851 "49: \n"
5852 "add $0x3,%3 \n"
5853 "jl 19f \n"
5854
5855 // 1 pixel loop.
5856 LABELALIGN
5857 "10: \n"
5858 "movd (%0),%%xmm2 \n"
5859 "lea 0x4(%0),%0 \n"
5860 "punpcklbw %%xmm1,%%xmm2 \n"
5861 "punpcklwd %%xmm1,%%xmm2 \n"
5862 "paddd %%xmm2,%%xmm0 \n"
5863 "movdqu (%2),%%xmm2 \n"
5864 "lea 0x10(%2),%2 \n"
5865 "paddd %%xmm0,%%xmm2 \n"
5866 "movdqu %%xmm2,(%1) \n"
5867 "lea 0x10(%1),%1 \n"
5868 "sub $0x1,%3 \n"
5869 "jge 10b \n"
5870
5871 "19: \n"
5872 : "+r"(row), // %0
5873 "+r"(cumsum), // %1
5874 "+r"(previous_cumsum), // %2
5875 "+r"(width) // %3
5876 :
5877 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
5878 }
5879 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
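
// Scalar sketch of the cumulative-sum row (illustrative, hypothetical name).
// Each output element is the running sum of the current row's channel plus
// the value directly above it from previous_cumsum, which is what the 4-wide
// SSE2 loop above computes with 32-bit lanes.
static void ComputeCumulativeSumRow_C_Sketch(const uint8_t* row,
                                             int32_t* cumsum,
                                             const int32_t* previous_cumsum,
                                             int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}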
5880
5881 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5882 void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
5883 const int32_t* botleft,
5884 int width,
5885 int area,
5886 uint8_t* dst,
5887 int count) {
5888 asm volatile(
5889 "movd %5,%%xmm5 \n"
5890 "cvtdq2ps %%xmm5,%%xmm5 \n"
5891 "rcpss %%xmm5,%%xmm4 \n"
5892 "pshufd $0x0,%%xmm4,%%xmm4 \n"
5893 "sub $0x4,%3 \n"
5894 "jl 49f \n"
5895 "cmpl $0x80,%5 \n"
5896 "ja 40f \n"
5897
5898 "pshufd $0x0,%%xmm5,%%xmm5 \n"
5899 "pcmpeqb %%xmm6,%%xmm6 \n"
5900 "psrld $0x10,%%xmm6 \n"
5901 "cvtdq2ps %%xmm6,%%xmm6 \n"
5902 "addps %%xmm6,%%xmm5 \n"
5903 "mulps %%xmm4,%%xmm5 \n"
5904 "cvtps2dq %%xmm5,%%xmm5 \n"
5905 "packssdw %%xmm5,%%xmm5 \n"
5906
5907 // 4 pixel small loop.
5908 LABELALIGN
5909 "4: \n"
5910 "movdqu (%0),%%xmm0 \n"
5911 "movdqu 0x10(%0),%%xmm1 \n"
5912 "movdqu 0x20(%0),%%xmm2 \n"
5913 "movdqu 0x30(%0),%%xmm3 \n"
5914 "psubd 0x00(%0,%4,4),%%xmm0 \n"
5915 "psubd 0x10(%0,%4,4),%%xmm1 \n"
5916 "psubd 0x20(%0,%4,4),%%xmm2 \n"
5917 "psubd 0x30(%0,%4,4),%%xmm3 \n"
5918 "lea 0x40(%0),%0 \n"
5919 "psubd (%1),%%xmm0 \n"
5920 "psubd 0x10(%1),%%xmm1 \n"
5921 "psubd 0x20(%1),%%xmm2 \n"
5922 "psubd 0x30(%1),%%xmm3 \n"
5923 "paddd 0x00(%1,%4,4),%%xmm0 \n"
5924 "paddd 0x10(%1,%4,4),%%xmm1 \n"
5925 "paddd 0x20(%1,%4,4),%%xmm2 \n"
5926 "paddd 0x30(%1,%4,4),%%xmm3 \n"
5927 "lea 0x40(%1),%1 \n"
5928 "packssdw %%xmm1,%%xmm0 \n"
5929 "packssdw %%xmm3,%%xmm2 \n"
5930 "pmulhuw %%xmm5,%%xmm0 \n"
5931 "pmulhuw %%xmm5,%%xmm2 \n"
5932 "packuswb %%xmm2,%%xmm0 \n"
5933 "movdqu %%xmm0,(%2) \n"
5934 "lea 0x10(%2),%2 \n"
5935 "sub $0x4,%3 \n"
5936 "jge 4b \n"
5937 "jmp 49f \n"
5938
5939 // 4 pixel loop
5940 LABELALIGN
5941 "40: \n"
5942 "movdqu (%0),%%xmm0 \n"
5943 "movdqu 0x10(%0),%%xmm1 \n"
5944 "movdqu 0x20(%0),%%xmm2 \n"
5945 "movdqu 0x30(%0),%%xmm3 \n"
5946 "psubd 0x00(%0,%4,4),%%xmm0 \n"
5947 "psubd 0x10(%0,%4,4),%%xmm1 \n"
5948 "psubd 0x20(%0,%4,4),%%xmm2 \n"
5949 "psubd 0x30(%0,%4,4),%%xmm3 \n"
5950 "lea 0x40(%0),%0 \n"
5951 "psubd (%1),%%xmm0 \n"
5952 "psubd 0x10(%1),%%xmm1 \n"
5953 "psubd 0x20(%1),%%xmm2 \n"
5954 "psubd 0x30(%1),%%xmm3 \n"
5955 "paddd 0x00(%1,%4,4),%%xmm0 \n"
5956 "paddd 0x10(%1,%4,4),%%xmm1 \n"
5957 "paddd 0x20(%1,%4,4),%%xmm2 \n"
5958 "paddd 0x30(%1,%4,4),%%xmm3 \n"
5959 "lea 0x40(%1),%1 \n"
5960 "cvtdq2ps %%xmm0,%%xmm0 \n"
5961 "cvtdq2ps %%xmm1,%%xmm1 \n"
5962 "mulps %%xmm4,%%xmm0 \n"
5963 "mulps %%xmm4,%%xmm1 \n"
5964 "cvtdq2ps %%xmm2,%%xmm2 \n"
5965 "cvtdq2ps %%xmm3,%%xmm3 \n"
5966 "mulps %%xmm4,%%xmm2 \n"
5967 "mulps %%xmm4,%%xmm3 \n"
5968 "cvtps2dq %%xmm0,%%xmm0 \n"
5969 "cvtps2dq %%xmm1,%%xmm1 \n"
5970 "cvtps2dq %%xmm2,%%xmm2 \n"
5971 "cvtps2dq %%xmm3,%%xmm3 \n"
5972 "packssdw %%xmm1,%%xmm0 \n"
5973 "packssdw %%xmm3,%%xmm2 \n"
5974 "packuswb %%xmm2,%%xmm0 \n"
5975 "movdqu %%xmm0,(%2) \n"
5976 "lea 0x10(%2),%2 \n"
5977 "sub $0x4,%3 \n"
5978 "jge 40b \n"
5979
5980 "49: \n"
5981 "add $0x3,%3 \n"
5982 "jl 19f \n"
5983
5984 // 1 pixel loop
5985 LABELALIGN
5986 "10: \n"
5987 "movdqu (%0),%%xmm0 \n"
5988 "psubd 0x00(%0,%4,4),%%xmm0 \n"
5989 "lea 0x10(%0),%0 \n"
5990 "psubd (%1),%%xmm0 \n"
5991 "paddd 0x00(%1,%4,4),%%xmm0 \n"
5992 "lea 0x10(%1),%1 \n"
5993 "cvtdq2ps %%xmm0,%%xmm0 \n"
5994 "mulps %%xmm4,%%xmm0 \n"
5995 "cvtps2dq %%xmm0,%%xmm0 \n"
5996 "packssdw %%xmm0,%%xmm0 \n"
5997 "packuswb %%xmm0,%%xmm0 \n"
5998 "movd %%xmm0,(%2) \n"
5999 "lea 0x4(%2),%2 \n"
6000 "sub $0x1,%3 \n"
6001 "jge 10b \n"
6002 "19: \n"
6003 : "+r"(topleft), // %0
6004 "+r"(botleft), // %1
6005 "+r"(dst), // %2
6006 "+rm"(count) // %3
6007 : "r"((intptr_t)(width)), // %4
6008 "rm"(area) // %5
6009 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
6010 }
6011 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
6012
6013 #ifdef HAS_ARGBAFFINEROW_SSE2
6014 // Copy ARGB pixels from source image with slope to a row of destination.
6015 LIBYUV_API
6016 void ARGBAffineRow_SSE2(const uint8_t* src_argb,
6017 int src_argb_stride,
6018 uint8_t* dst_argb,
6019 const float* src_dudv,
6020 int width) {
6021 intptr_t src_argb_stride_temp = src_argb_stride;
6022 intptr_t temp;
6023 asm volatile(
6024 "movq (%3),%%xmm2 \n"
6025 "movq 0x08(%3),%%xmm7 \n"
6026 "shl $0x10,%1 \n"
6027 "add $0x4,%1 \n"
6028 "movd %1,%%xmm5 \n"
6029 "sub $0x4,%4 \n"
6030 "jl 49f \n"
6031
6032 "pshufd $0x44,%%xmm7,%%xmm7 \n"
6033 "pshufd $0x0,%%xmm5,%%xmm5 \n"
6034 "movdqa %%xmm2,%%xmm0 \n"
6035 "addps %%xmm7,%%xmm0 \n"
6036 "movlhps %%xmm0,%%xmm2 \n"
6037 "movdqa %%xmm7,%%xmm4 \n"
6038 "addps %%xmm4,%%xmm4 \n"
6039 "movdqa %%xmm2,%%xmm3 \n"
6040 "addps %%xmm4,%%xmm3 \n"
6041 "addps %%xmm4,%%xmm4 \n"
6042
6043 // 4 pixel loop
6044 LABELALIGN
6045 "40: \n"
6046 "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
6047 "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
6048 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
6049 "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
6050 "movd %%xmm0,%k1 \n"
6051 "pshufd $0x39,%%xmm0,%%xmm0 \n"
6052 "movd %%xmm0,%k5 \n"
6053 "pshufd $0x39,%%xmm0,%%xmm0 \n"
6054 "movd 0x00(%0,%1,1),%%xmm1 \n"
6055 "movd 0x00(%0,%5,1),%%xmm6 \n"
6056 "punpckldq %%xmm6,%%xmm1 \n"
6057 "addps %%xmm4,%%xmm2 \n"
6058 "movq %%xmm1,(%2) \n"
6059 "movd %%xmm0,%k1 \n"
6060 "pshufd $0x39,%%xmm0,%%xmm0 \n"
6061 "movd %%xmm0,%k5 \n"
6062 "movd 0x00(%0,%1,1),%%xmm0 \n"
6063 "movd 0x00(%0,%5,1),%%xmm6 \n"
6064 "punpckldq %%xmm6,%%xmm0 \n"
6065 "addps %%xmm4,%%xmm3 \n"
6066 "movq %%xmm0,0x08(%2) \n"
6067 "lea 0x10(%2),%2 \n"
6068 "sub $0x4,%4 \n"
6069 "jge 40b \n"
6070
6071 "49: \n"
6072 "add $0x3,%4 \n"
6073 "jl 19f \n"
6074
6075 // 1 pixel loop
6076 LABELALIGN
6077 "10: \n"
6078 "cvttps2dq %%xmm2,%%xmm0 \n"
6079 "packssdw %%xmm0,%%xmm0 \n"
6080 "pmaddwd %%xmm5,%%xmm0 \n"
6081 "addps %%xmm7,%%xmm2 \n"
6082 "movd %%xmm0,%k1 \n"
6083 "movd 0x00(%0,%1,1),%%xmm0 \n"
6084 "movd %%xmm0,(%2) \n"
6085 "lea 0x04(%2),%2 \n"
6086 "sub $0x1,%4 \n"
6087 "jge 10b \n"
6088 "19: \n"
6089 : "+r"(src_argb), // %0
6090 "+r"(src_argb_stride_temp), // %1
6091 "+r"(dst_argb), // %2
6092 "+r"(src_dudv), // %3
6093 "+rm"(width), // %4
6094 "=&r"(temp) // %5
6095 :
6096 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
6097 "xmm7");
6098 }
6099 #endif // HAS_ARGBAFFINEROW_SSE2
6100
6101 #ifdef HAS_INTERPOLATEROW_SSSE3
6102 // Bilinear filter 16x2 -> 16x1
6103 void InterpolateRow_SSSE3(uint8_t* dst_ptr,
6104 const uint8_t* src_ptr,
6105 ptrdiff_t src_stride,
6106 int dst_width,
6107 int source_y_fraction) {
6108 asm volatile(
6109 "sub %1,%0 \n"
6110 "cmp $0x0,%3 \n"
6111 "je 100f \n"
6112 "cmp $0x80,%3 \n"
6113 "je 50f \n"
6114
6115 "movd %3,%%xmm0 \n"
6116 "neg %3 \n"
6117 "add $0x100,%3 \n"
6118 "movd %3,%%xmm5 \n"
6119 "punpcklbw %%xmm0,%%xmm5 \n"
6120 "punpcklwd %%xmm5,%%xmm5 \n"
6121 "pshufd $0x0,%%xmm5,%%xmm5 \n"
6122 "mov $0x80808080,%%eax \n"
6123 "movd %%eax,%%xmm4 \n"
6124 "pshufd $0x0,%%xmm4,%%xmm4 \n"
6125
6126 // General purpose row blend.
6127 LABELALIGN
6128 "1: \n"
6129 "movdqu (%1),%%xmm0 \n"
6130 "movdqu 0x00(%1,%4,1),%%xmm2 \n"
6131 "movdqa %%xmm0,%%xmm1 \n"
6132 "punpcklbw %%xmm2,%%xmm0 \n"
6133 "punpckhbw %%xmm2,%%xmm1 \n"
6134 "psubb %%xmm4,%%xmm0 \n"
6135 "psubb %%xmm4,%%xmm1 \n"
6136 "movdqa %%xmm5,%%xmm2 \n"
6137 "movdqa %%xmm5,%%xmm3 \n"
6138 "pmaddubsw %%xmm0,%%xmm2 \n"
6139 "pmaddubsw %%xmm1,%%xmm3 \n"
6140 "paddw %%xmm4,%%xmm2 \n"
6141 "paddw %%xmm4,%%xmm3 \n"
6142 "psrlw $0x8,%%xmm2 \n"
6143 "psrlw $0x8,%%xmm3 \n"
6144 "packuswb %%xmm3,%%xmm2 \n"
6145 "movdqu %%xmm2,0x00(%1,%0,1) \n"
6146 "lea 0x10(%1),%1 \n"
6147 "sub $0x10,%2 \n"
6148 "jg 1b \n"
6149 "jmp 99f \n"
6150
6151 // Blend 50 / 50.
6152 LABELALIGN
6153 "50: \n"
6154 "movdqu (%1),%%xmm0 \n"
6155 "movdqu 0x00(%1,%4,1),%%xmm1 \n"
6156 "pavgb %%xmm1,%%xmm0 \n"
6157 "movdqu %%xmm0,0x00(%1,%0,1) \n"
6158 "lea 0x10(%1),%1 \n"
6159 "sub $0x10,%2 \n"
6160 "jg 50b \n"
6161 "jmp 99f \n"
6162
6163 // Blend 100 / 0 - Copy row unchanged.
6164 LABELALIGN
6165 "100: \n"
6166 "movdqu (%1),%%xmm0 \n"
6167 "movdqu %%xmm0,0x00(%1,%0,1) \n"
6168 "lea 0x10(%1),%1 \n"
6169 "sub $0x10,%2 \n"
6170 "jg 100b \n"
6171
6172 "99: \n"
6173 : "+r"(dst_ptr), // %0
6174 "+r"(src_ptr), // %1
6175 "+rm"(dst_width), // %2
6176 "+r"(source_y_fraction) // %3
6177 : "r"((intptr_t)(src_stride)) // %4
6178 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
6179 }
6180 #endif // HAS_INTERPOLATEROW_SSSE3
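
// Scalar sketch of the bilinear row filter (illustrative, hypothetical
// name). source_y_fraction selects the blend between the current row and the
// row src_stride below it; 0 copies the row, 128 averages, and the general
// case is a rounded 8-bit fixed-point blend like the pmaddubsw path above.
static void InterpolateRow_C_Sketch(uint8_t* dst_ptr,
                                    const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    int width,
                                    int source_y_fraction) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  int i;
  for (i = 0; i < width; ++i) {
    dst_ptr[i] = (uint8_t)(
        (src_ptr[i] * y0_fraction + src_ptr1[i] * y1_fraction + 128) >> 8);
  }
}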
6181
6182 #ifdef HAS_INTERPOLATEROW_AVX2
6183 // Bilinear filter 32x2 -> 32x1
6184 void InterpolateRow_AVX2(uint8_t* dst_ptr,
6185 const uint8_t* src_ptr,
6186 ptrdiff_t src_stride,
6187 int dst_width,
6188 int source_y_fraction) {
6189 asm volatile(
6190 "cmp $0x0,%3 \n"
6191 "je 100f \n"
6192 "sub %1,%0 \n"
6193 "cmp $0x80,%3 \n"
6194 "je 50f \n"
6195
6196 "vmovd %3,%%xmm0 \n"
6197 "neg %3 \n"
6198 "add $0x100,%3 \n"
6199 "vmovd %3,%%xmm5 \n"
6200 "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
6201 "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
6202 "vbroadcastss %%xmm5,%%ymm5 \n"
6203 "mov $0x80808080,%%eax \n"
6204 "vmovd %%eax,%%xmm4 \n"
6205 "vbroadcastss %%xmm4,%%ymm4 \n"
6206
6207 // General purpose row blend.
6208 LABELALIGN
6209 "1: \n"
6210 "vmovdqu (%1),%%ymm0 \n"
6211 "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
6212 "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
6213 "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
6214 "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
6215 "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
6216 "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
6217 "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
6218 "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
6219 "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
6220 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
6221 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
6222 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
6223 "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
6224 "lea 0x20(%1),%1 \n"
6225 "sub $0x20,%2 \n"
6226 "jg 1b \n"
6227 "jmp 99f \n"
6228
6229 // Blend 50 / 50.
6230 LABELALIGN
6231 "50: \n"
6232 "vmovdqu (%1),%%ymm0 \n"
6233 "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
6234 "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
6235 "lea 0x20(%1),%1 \n"
6236 "sub $0x20,%2 \n"
6237 "jg 50b \n"
6238 "jmp 99f \n"
6239
6240 // Blend 100 / 0 - Copy row unchanged.
6241 LABELALIGN
6242 "100: \n"
6243 "rep movsb \n"
6244 "jmp 999f \n"
6245
6246 "99: \n"
6247 "vzeroupper \n"
6248 "999: \n"
6249 : "+D"(dst_ptr), // %0
6250 "+S"(src_ptr), // %1
6251 "+cm"(dst_width), // %2
6252 "+r"(source_y_fraction) // %3
6253 : "r"((intptr_t)(src_stride)) // %4
6254 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
6255 }
6256 #endif // HAS_INTERPOLATEROW_AVX2
6257
6258 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
6259 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
6260 void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
6261 uint8_t* dst_argb,
6262 const uint8_t* shuffler,
6263 int width) {
6264 asm volatile(
6265
6266 "movdqu (%3),%%xmm5 \n"
6267
6268 LABELALIGN
6269 "1: \n"
6270 "movdqu (%0),%%xmm0 \n"
6271 "movdqu 0x10(%0),%%xmm1 \n"
6272 "lea 0x20(%0),%0 \n"
6273 "pshufb %%xmm5,%%xmm0 \n"
6274 "pshufb %%xmm5,%%xmm1 \n"
6275 "movdqu %%xmm0,(%1) \n"
6276 "movdqu %%xmm1,0x10(%1) \n"
6277 "lea 0x20(%1),%1 \n"
6278 "sub $0x8,%2 \n"
6279 "jg 1b \n"
6280 : "+r"(src_argb), // %0
6281 "+r"(dst_argb), // %1
6282 "+r"(width) // %2
6283 : "r"(shuffler) // %3
6284 : "memory", "cc", "xmm0", "xmm1", "xmm5");
6285 }
6286 #endif // HAS_ARGBSHUFFLEROW_SSSE3
6287
6288 #ifdef HAS_ARGBSHUFFLEROW_AVX2
6289 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
6290 void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
6291 uint8_t* dst_argb,
6292 const uint8_t* shuffler,
6293 int width) {
6294 asm volatile(
6295
6296 "vbroadcastf128 (%3),%%ymm5 \n"
6297
6298 LABELALIGN
6299 "1: \n"
6300 "vmovdqu (%0),%%ymm0 \n"
6301 "vmovdqu 0x20(%0),%%ymm1 \n"
6302 "lea 0x40(%0),%0 \n"
6303 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
6304 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
6305 "vmovdqu %%ymm0,(%1) \n"
6306 "vmovdqu %%ymm1,0x20(%1) \n"
6307 "lea 0x40(%1),%1 \n"
6308 "sub $0x10,%2 \n"
6309 "jg 1b \n"
6310 "vzeroupper \n"
6311 : "+r"(src_argb), // %0
6312 "+r"(dst_argb), // %1
6313 "+r"(width) // %2
6314 : "r"(shuffler) // %3
6315 : "memory", "cc", "xmm0", "xmm1", "xmm5");
6316 }
6317 #endif // HAS_ARGBSHUFFLEROW_AVX2
6318
6319 #ifdef HAS_I422TOYUY2ROW_SSE2
6320 void I422ToYUY2Row_SSE2(const uint8_t* src_y,
6321 const uint8_t* src_u,
6322 const uint8_t* src_v,
6323 uint8_t* dst_yuy2,
6324 int width) {
6325 asm volatile(
6326
6327 "sub %1,%2 \n"
6328
6329 LABELALIGN
6330 "1: \n"
6331 "movq (%1),%%xmm2 \n"
6332 "movq 0x00(%1,%2,1),%%xmm1 \n"
6333 "add $0x8,%1 \n"
6334 "punpcklbw %%xmm1,%%xmm2 \n"
6335 "movdqu (%0),%%xmm0 \n"
6336 "add $0x10,%0 \n"
6337 "movdqa %%xmm0,%%xmm1 \n"
6338 "punpcklbw %%xmm2,%%xmm0 \n"
6339 "punpckhbw %%xmm2,%%xmm1 \n"
6340 "movdqu %%xmm0,(%3) \n"
6341 "movdqu %%xmm1,0x10(%3) \n"
6342 "lea 0x20(%3),%3 \n"
6343 "sub $0x10,%4 \n"
6344 "jg 1b \n"
6345 : "+r"(src_y), // %0
6346 "+r"(src_u), // %1
6347 "+r"(src_v), // %2
6348 "+r"(dst_yuy2), // %3
6349 "+rm"(width) // %4
6350 :
6351 : "memory", "cc", "xmm0", "xmm1", "xmm2");
6352 }
6353 #endif // HAS_I422TOYUY2ROW_SSE2
6354
6355 #ifdef HAS_I422TOUYVYROW_SSE2
6356 void I422ToUYVYRow_SSE2(const uint8_t* src_y,
6357 const uint8_t* src_u,
6358 const uint8_t* src_v,
6359 uint8_t* dst_uyvy,
6360 int width) {
6361 asm volatile(
6362
6363 "sub %1,%2 \n"
6364
6365 LABELALIGN
6366 "1: \n"
6367 "movq (%1),%%xmm2 \n"
6368 "movq 0x00(%1,%2,1),%%xmm1 \n"
6369 "add $0x8,%1 \n"
6370 "punpcklbw %%xmm1,%%xmm2 \n"
6371 "movdqu (%0),%%xmm0 \n"
6372 "movdqa %%xmm2,%%xmm1 \n"
6373 "add $0x10,%0 \n"
6374 "punpcklbw %%xmm0,%%xmm1 \n"
6375 "punpckhbw %%xmm0,%%xmm2 \n"
6376 "movdqu %%xmm1,(%3) \n"
6377 "movdqu %%xmm2,0x10(%3) \n"
6378 "lea 0x20(%3),%3 \n"
6379 "sub $0x10,%4 \n"
6380 "jg 1b \n"
6381 : "+r"(src_y), // %0
6382 "+r"(src_u), // %1
6383 "+r"(src_v), // %2
6384 "+r"(dst_uyvy), // %3
6385 "+rm"(width) // %4
6386 :
6387 : "memory", "cc", "xmm0", "xmm1", "xmm2");
6388 }
6389 #endif // HAS_I422TOUYVYROW_SSE2
6390
6391 #ifdef HAS_I422TOYUY2ROW_AVX2
6392 void I422ToYUY2Row_AVX2(const uint8_t* src_y,
6393 const uint8_t* src_u,
6394 const uint8_t* src_v,
6395 uint8_t* dst_yuy2,
6396 int width) {
6397 asm volatile(
6398
6399 "sub %1,%2 \n"
6400
6401 LABELALIGN
6402 "1: \n"
6403 "vpmovzxbw (%1),%%ymm1 \n"
6404 "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
6405 "add $0x10,%1 \n"
6406 "vpsllw $0x8,%%ymm2,%%ymm2 \n"
6407 "vpor %%ymm1,%%ymm2,%%ymm2 \n"
6408 "vmovdqu (%0),%%ymm0 \n"
6409 "add $0x20,%0 \n"
6410 "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
6411 "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
6412 "vextractf128 $0x0,%%ymm1,(%3) \n"
6413 "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
6414 "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
6415 "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
6416 "lea 0x40(%3),%3 \n"
6417 "sub $0x20,%4 \n"
6418 "jg 1b \n"
6419 "vzeroupper \n"
6420 : "+r"(src_y), // %0
6421 "+r"(src_u), // %1
6422 "+r"(src_v), // %2
6423 "+r"(dst_yuy2), // %3
6424 "+rm"(width) // %4
6425 :
6426 : "memory", "cc", "xmm0", "xmm1", "xmm2");
6427 }
6428 #endif // HAS_I422TOYUY2ROW_AVX2
6429
6430 #ifdef HAS_I422TOUYVYROW_AVX2
6431 void I422ToUYVYRow_AVX2(const uint8_t* src_y,
6432 const uint8_t* src_u,
6433 const uint8_t* src_v,
6434 uint8_t* dst_uyvy,
6435 int width) {
6436 asm volatile(
6437
6438 "sub %1,%2 \n"
6439
6440 LABELALIGN
6441 "1: \n"
6442 "vpmovzxbw (%1),%%ymm1 \n"
6443 "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
6444 "add $0x10,%1 \n"
6445 "vpsllw $0x8,%%ymm2,%%ymm2 \n"
6446 "vpor %%ymm1,%%ymm2,%%ymm2 \n"
6447 "vmovdqu (%0),%%ymm0 \n"
6448 "add $0x20,%0 \n"
6449 "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
6450 "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
6451 "vextractf128 $0x0,%%ymm1,(%3) \n"
6452 "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
6453 "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
6454 "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
6455 "lea 0x40(%3),%3 \n"
6456 "sub $0x20,%4 \n"
6457 "jg 1b \n"
6458 "vzeroupper \n"
6459 : "+r"(src_y), // %0
6460 "+r"(src_u), // %1
6461 "+r"(src_v), // %2
6462 "+r"(dst_uyvy), // %3
6463 "+rm"(width) // %4
6464 :
6465 : "memory", "cc", "xmm0", "xmm1", "xmm2");
6466 }
6467 #endif // HAS_I422TOUYVYROW_AVX2
6468
6469 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
6470 void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
6471 uint8_t* dst_argb,
6472 const float* poly,
6473 int width) {
6474 asm volatile(
6475
6476 "pxor %%xmm3,%%xmm3 \n"
6477
6478 // 2 pixel loop.
6479 LABELALIGN
6480 "1: \n"
6481 "movq (%0),%%xmm0 \n"
6482 "lea 0x8(%0),%0 \n"
6483 "punpcklbw %%xmm3,%%xmm0 \n"
6484 "movdqa %%xmm0,%%xmm4 \n"
6485 "punpcklwd %%xmm3,%%xmm0 \n"
6486 "punpckhwd %%xmm3,%%xmm4 \n"
6487 "cvtdq2ps %%xmm0,%%xmm0 \n"
6488 "cvtdq2ps %%xmm4,%%xmm4 \n"
6489 "movdqa %%xmm0,%%xmm1 \n"
6490 "movdqa %%xmm4,%%xmm5 \n"
6491 "mulps 0x10(%3),%%xmm0 \n"
6492 "mulps 0x10(%3),%%xmm4 \n"
6493 "addps (%3),%%xmm0 \n"
6494 "addps (%3),%%xmm4 \n"
6495 "movdqa %%xmm1,%%xmm2 \n"
6496 "movdqa %%xmm5,%%xmm6 \n"
6497 "mulps %%xmm1,%%xmm2 \n"
6498 "mulps %%xmm5,%%xmm6 \n"
6499 "mulps %%xmm2,%%xmm1 \n"
6500 "mulps %%xmm6,%%xmm5 \n"
6501 "mulps 0x20(%3),%%xmm2 \n"
6502 "mulps 0x20(%3),%%xmm6 \n"
6503 "mulps 0x30(%3),%%xmm1 \n"
6504 "mulps 0x30(%3),%%xmm5 \n"
6505 "addps %%xmm2,%%xmm0 \n"
6506 "addps %%xmm6,%%xmm4 \n"
6507 "addps %%xmm1,%%xmm0 \n"
6508 "addps %%xmm5,%%xmm4 \n"
6509 "cvttps2dq %%xmm0,%%xmm0 \n"
6510 "cvttps2dq %%xmm4,%%xmm4 \n"
6511 "packuswb %%xmm4,%%xmm0 \n"
6512 "packuswb %%xmm0,%%xmm0 \n"
6513 "movq %%xmm0,(%1) \n"
6514 "lea 0x8(%1),%1 \n"
6515 "sub $0x2,%2 \n"
6516 "jg 1b \n"
6517 : "+r"(src_argb), // %0
6518 "+r"(dst_argb), // %1
6519 "+r"(width) // %2
6520 : "r"(poly) // %3
6521 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
6522 }
6523 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
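
// Scalar sketch of the polynomial kernel (illustrative, hypothetical name).
// poly holds four 4-float coefficient vectors (C0..C3, one float per B,G,R,A
// channel); each channel value x is evaluated as C0 + C1*x + C2*x^2 + C3*x^3
// in float and then truncated and saturated back to a byte.
static void ARGBPolynomialRow_C_Sketch(const uint8_t* src_argb,
                                       uint8_t* dst_argb,
                                       const float* poly,
                                       int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      float v = (float)src_argb[x * 4 + c];
      float r = poly[c] + poly[c + 4] * v + poly[c + 8] * v * v +
                poly[c + 12] * v * v * v;
      int i = (int)r;
      dst_argb[x * 4 + c] = (uint8_t)(i < 0 ? 0 : i > 255 ? 255 : i);
    }
  }
}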
6524
6525 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
6526 void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
6527 uint8_t* dst_argb,
6528 const float* poly,
6529 int width) {
6530 asm volatile(
6531 "vbroadcastf128 (%3),%%ymm4 \n"
6532 "vbroadcastf128 0x10(%3),%%ymm5 \n"
6533 "vbroadcastf128 0x20(%3),%%ymm6 \n"
6534 "vbroadcastf128 0x30(%3),%%ymm7 \n"
6535
6536 // 2 pixel loop.
6537 LABELALIGN
6538 "1: \n"
6539 "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels
6540 "lea 0x8(%0),%0 \n"
6541 "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
6542 "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
6543 "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
6544 "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
6545 "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
6546 "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X *
6547 // X
6548 "vcvttps2dq %%ymm0,%%ymm0 \n"
6549 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
6550 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
6551 "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
6552 "vmovq %%xmm0,(%1) \n"
6553 "lea 0x8(%1),%1 \n"
6554 "sub $0x2,%2 \n"
6555 "jg 1b \n"
6556 "vzeroupper \n"
6557 : "+r"(src_argb), // %0
6558 "+r"(dst_argb), // %1
6559 "+r"(width) // %2
6560 : "r"(poly) // %3
6561 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
6562 "xmm7");
6563 }
6564 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
6565
6566 #ifdef HAS_HALFFLOATROW_SSE2
6567 static float kScaleBias = 1.9259299444e-34f;
6568 void HalfFloatRow_SSE2(const uint16_t* src,
6569 uint16_t* dst,
6570 float scale,
6571 int width) {
6572 scale *= kScaleBias;
6573 asm volatile(
6574 "movd %3,%%xmm4 \n"
6575 "pshufd $0x0,%%xmm4,%%xmm4 \n"
6576 "pxor %%xmm5,%%xmm5 \n"
6577 "sub %0,%1 \n"
6578
6579 // 8 pixel loop.
6580 LABELALIGN
6581 "1: \n"
6582 "movdqu (%0),%%xmm2 \n" // 8 shorts
6583 "add $0x10,%0 \n"
6584 "movdqa %%xmm2,%%xmm3 \n"
6585 "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1
6586 "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
6587 "punpckhwd %%xmm5,%%xmm3 \n"
6588 "cvtdq2ps %%xmm3,%%xmm3 \n"
6589 "mulps %%xmm4,%%xmm2 \n"
6590 "mulps %%xmm4,%%xmm3 \n"
6591 "psrld $0xd,%%xmm2 \n"
6592 "psrld $0xd,%%xmm3 \n"
6593 "packssdw %%xmm3,%%xmm2 \n"
6594 "movdqu %%xmm2,-0x10(%0,%1,1) \n"
6595 "sub $0x8,%2 \n"
6596 "jg 1b \n"
6597 : "+r"(src), // %0
6598 "+r"(dst), // %1
6599 "+r"(width) // %2
6600 : "m"(scale) // %3
6601 : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
6602 }
6603 #endif // HAS_HALFFLOATROW_SSE2
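
// Scalar sketch of the half-float trick used above (illustrative,
// hypothetical name). kScaleBias is 2^-112, so after multiplying by
// scale * 2^-112 the exponent and mantissa bits of the single-precision
// value line up with the IEEE half-float encoding, and a plain >> 13 of the
// float's bit pattern yields the 16-bit result, exactly like mulps followed
// by psrld $0xd. The union type pun is assumed to behave as it does for the
// compilers libyuv targets.
static void HalfFloatRow_C_Sketch(const uint16_t* src,
                                  uint16_t* dst,
                                  float scale,
                                  int width) {
  int i;
  for (i = 0; i < width; ++i) {
    union {
      float f;
      uint32_t u;
    } bits;
    bits.f = (float)src[i] * scale * 1.9259299444e-34f;  // scale * 2^-112
    dst[i] = (uint16_t)(bits.u >> 13);
  }
}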
6604
6605 #ifdef HAS_HALFFLOATROW_AVX2
6606 void HalfFloatRow_AVX2(const uint16_t* src,
6607 uint16_t* dst,
6608 float scale,
6609 int width) {
6610 scale *= kScaleBias;
6611 asm volatile(
6612 "vbroadcastss %3, %%ymm4 \n"
6613 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
6614 "sub %0,%1 \n"
6615
6616 // 16 pixel loop.
6617 LABELALIGN
6618 "1: \n"
6619 "vmovdqu (%0),%%ymm2 \n" // 16 shorts
6620 "add $0x20,%0 \n"
6621 "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
6622 "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
6623 "vcvtdq2ps %%ymm3,%%ymm3 \n"
6624 "vcvtdq2ps %%ymm2,%%ymm2 \n"
6625 "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
6626 "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
6627 "vpsrld $0xd,%%ymm3,%%ymm3 \n"
6628 "vpsrld $0xd,%%ymm2,%%ymm2 \n"
6629 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
6630 "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
6631 "sub $0x10,%2 \n"
6632 "jg 1b \n"
6633
6634 "vzeroupper \n"
6635 : "+r"(src), // %0
6636 "+r"(dst), // %1
6637 "+r"(width) // %2
6638 #if defined(__x86_64__)
6639 : "x"(scale) // %3
6640 #else
6641 : "m"(scale) // %3
6642 #endif
6643 : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
6644 }
6645 #endif // HAS_HALFFLOATROW_AVX2
6646
6647 #ifdef HAS_HALFFLOATROW_F16C
6648 void HalfFloatRow_F16C(const uint16_t* src,
6649 uint16_t* dst,
6650 float scale,
6651 int width) {
6652 asm volatile(
6653 "vbroadcastss %3, %%ymm4 \n"
6654 "sub %0,%1 \n"
6655
6656 // 16 pixel loop.
6657 LABELALIGN
6658 "1: \n"
6659 "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
6660 "vpmovzxwd 0x10(%0),%%ymm3 \n"
6661 "vcvtdq2ps %%ymm2,%%ymm2 \n"
6662 "vcvtdq2ps %%ymm3,%%ymm3 \n"
6663 "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
6664 "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
6665 "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
6666 "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
6667 "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
6668 "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
6669 "add $0x20,%0 \n"
6670 "sub $0x10,%2 \n"
6671 "jg 1b \n"
6672 "vzeroupper \n"
6673 : "+r"(src), // %0
6674 "+r"(dst), // %1
6675 "+r"(width) // %2
6676 #if defined(__x86_64__)
6677 : "x"(scale) // %3
6678 #else
6679 : "m"(scale) // %3
6680 #endif
6681 : "memory", "cc", "xmm2", "xmm3", "xmm4");
6682 }
6683 #endif // HAS_HALFFLOATROW_F16C
6684
6685 #ifdef HAS_HALFFLOATROW_F16C
6686 void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
6687 asm volatile(
6688 "sub %0,%1 \n"
6689 // 16 pixel loop.
6690 LABELALIGN
6691 "1: \n"
6692 "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
6693 "vpmovzxwd 0x10(%0),%%ymm3 \n"
6694 "vcvtdq2ps %%ymm2,%%ymm2 \n"
6695 "vcvtdq2ps %%ymm3,%%ymm3 \n"
6696 "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
6697 "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
6698 "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
6699 "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
6700 "add $0x20,%0 \n"
6701 "sub $0x10,%2 \n"
6702 "jg 1b \n"
6703 "vzeroupper \n"
6704 : "+r"(src), // %0
6705 "+r"(dst), // %1
6706 "+r"(width) // %2
6707 :
6708 : "memory", "cc", "xmm2", "xmm3");
6709 }
6710 #endif // HAS_HALFFLOATROW_F16C
6711
6712 #ifdef HAS_ARGBCOLORTABLEROW_X86
6713 // Transform ARGB pixels with color table.
6714 void ARGBColorTableRow_X86(uint8_t* dst_argb,
6715 const uint8_t* table_argb,
6716 int width) {
6717 uintptr_t pixel_temp;
6718 asm volatile(
6719 // 1 pixel loop.
6720 LABELALIGN
6721 "1: \n"
6722 "movzb (%0),%1 \n"
6723 "lea 0x4(%0),%0 \n"
6724 "movzb 0x00(%3,%1,4),%1 \n"
6725 "mov %b1,-0x4(%0) \n"
6726 "movzb -0x3(%0),%1 \n"
6727 "movzb 0x01(%3,%1,4),%1 \n"
6728 "mov %b1,-0x3(%0) \n"
6729 "movzb -0x2(%0),%1 \n"
6730 "movzb 0x02(%3,%1,4),%1 \n"
6731 "mov %b1,-0x2(%0) \n"
6732 "movzb -0x1(%0),%1 \n"
6733 "movzb 0x03(%3,%1,4),%1 \n"
6734 "mov %b1,-0x1(%0) \n"
6735 "dec %2 \n"
6736 "jg 1b \n"
6737 : "+r"(dst_argb), // %0
6738 "=&d"(pixel_temp), // %1
6739 "+r"(width) // %2
6740 : "r"(table_argb) // %3
6741 : "memory", "cc");
6742 }
6743 #endif // HAS_ARGBCOLORTABLEROW_X86
6744
6745 #ifdef HAS_RGBCOLORTABLEROW_X86
6746 // Transform RGB pixels with color table.
6747 void RGBColorTableRow_X86(uint8_t* dst_argb,
6748 const uint8_t* table_argb,
6749 int width) {
6750 uintptr_t pixel_temp;
6751 asm volatile(
6752 // 1 pixel loop.
6753 LABELALIGN
6754 "1: \n"
6755 "movzb (%0),%1 \n"
6756 "lea 0x4(%0),%0 \n"
6757 "movzb 0x00(%3,%1,4),%1 \n"
6758 "mov %b1,-0x4(%0) \n"
6759 "movzb -0x3(%0),%1 \n"
6760 "movzb 0x01(%3,%1,4),%1 \n"
6761 "mov %b1,-0x3(%0) \n"
6762 "movzb -0x2(%0),%1 \n"
6763 "movzb 0x02(%3,%1,4),%1 \n"
6764 "mov %b1,-0x2(%0) \n"
6765 "dec %2 \n"
6766 "jg 1b \n"
6767 : "+r"(dst_argb), // %0
6768 "=&d"(pixel_temp), // %1
6769 "+r"(width) // %2
6770 : "r"(table_argb) // %3
6771 : "memory", "cc");
6772 }
6773 #endif // HAS_RGBCOLORTABLEROW_X86
6774
6775 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6776 // Transform RGB pixels with luma table.
6777 void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
6778 uint8_t* dst_argb,
6779 int width,
6780 const uint8_t* luma,
6781 uint32_t lumacoeff) {
6782 uintptr_t pixel_temp;
6783 uintptr_t table_temp;
6784 asm volatile(
6785 "movd %6,%%xmm3 \n"
6786 "pshufd $0x0,%%xmm3,%%xmm3 \n"
6787 "pcmpeqb %%xmm4,%%xmm4 \n"
6788 "psllw $0x8,%%xmm4 \n"
6789 "pxor %%xmm5,%%xmm5 \n"
6790
6791 // 4 pixel loop.
6792 LABELALIGN
6793 "1: \n"
6794 "movdqu (%2),%%xmm0 \n"
6795 "pmaddubsw %%xmm3,%%xmm0 \n"
6796 "phaddw %%xmm0,%%xmm0 \n"
6797 "pand %%xmm4,%%xmm0 \n"
6798 "punpcklwd %%xmm5,%%xmm0 \n"
6799 "movd %%xmm0,%k1 \n" // 32 bit offset
6800 "add %5,%1 \n"
6801 "pshufd $0x39,%%xmm0,%%xmm0 \n"
6802
6803 "movzb (%2),%0 \n"
6804 "movzb 0x00(%1,%0,1),%0 \n"
6805 "mov %b0,(%3) \n"
6806 "movzb 0x1(%2),%0 \n"
6807 "movzb 0x00(%1,%0,1),%0 \n"
6808 "mov %b0,0x1(%3) \n"
6809 "movzb 0x2(%2),%0 \n"
6810 "movzb 0x00(%1,%0,1),%0 \n"
6811 "mov %b0,0x2(%3) \n"
6812 "movzb 0x3(%2),%0 \n"
6813 "mov %b0,0x3(%3) \n"
6814
6815 "movd %%xmm0,%k1 \n" // 32 bit offset
6816 "add %5,%1 \n"
6817 "pshufd $0x39,%%xmm0,%%xmm0 \n"
6818
6819 "movzb 0x4(%2),%0 \n"
6820 "movzb 0x00(%1,%0,1),%0 \n"
6821 "mov %b0,0x4(%3) \n"
6822 "movzb 0x5(%2),%0 \n"
6823 "movzb 0x00(%1,%0,1),%0 \n"
6824 "mov %b0,0x5(%3) \n"
6825 "movzb 0x6(%2),%0 \n"
6826 "movzb 0x00(%1,%0,1),%0 \n"
6827 "mov %b0,0x6(%3) \n"
6828 "movzb 0x7(%2),%0 \n"
6829 "mov %b0,0x7(%3) \n"
6830
6831 "movd %%xmm0,%k1 \n" // 32 bit offset
6832 "add %5,%1 \n"
6833 "pshufd $0x39,%%xmm0,%%xmm0 \n"
6834
6835 "movzb 0x8(%2),%0 \n"
6836 "movzb 0x00(%1,%0,1),%0 \n"
6837 "mov %b0,0x8(%3) \n"
6838 "movzb 0x9(%2),%0 \n"
6839 "movzb 0x00(%1,%0,1),%0 \n"
6840 "mov %b0,0x9(%3) \n"
6841 "movzb 0xa(%2),%0 \n"
6842 "movzb 0x00(%1,%0,1),%0 \n"
6843 "mov %b0,0xa(%3) \n"
6844 "movzb 0xb(%2),%0 \n"
6845 "mov %b0,0xb(%3) \n"
6846
6847 "movd %%xmm0,%k1 \n" // 32 bit offset
6848 "add %5,%1 \n"
6849
6850 "movzb 0xc(%2),%0 \n"
6851 "movzb 0x00(%1,%0,1),%0 \n"
6852 "mov %b0,0xc(%3) \n"
6853 "movzb 0xd(%2),%0 \n"
6854 "movzb 0x00(%1,%0,1),%0 \n"
6855 "mov %b0,0xd(%3) \n"
6856 "movzb 0xe(%2),%0 \n"
6857 "movzb 0x00(%1,%0,1),%0 \n"
6858 "mov %b0,0xe(%3) \n"
6859 "movzb 0xf(%2),%0 \n"
6860 "mov %b0,0xf(%3) \n"
6861 "lea 0x10(%2),%2 \n"
6862 "lea 0x10(%3),%3 \n"
6863 "sub $0x4,%4 \n"
6864 "jg 1b \n"
6865 : "=&d"(pixel_temp), // %0
6866 "=&a"(table_temp), // %1
6867 "+r"(src_argb), // %2
6868 "+r"(dst_argb), // %3
6869 "+rm"(width) // %4
6870 : "r"(luma), // %5
6871 "rm"(lumacoeff) // %6
6872 : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
6873 }
6874 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6875
6876 #ifdef HAS_NV21TOYUV24ROW_AVX2
6877
6878 // begin NV21ToYUV24Row_C avx2 constants
6879 static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
6880 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
6881 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
6882 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
6883
6884 static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
6885 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
6886 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
6887 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
6888
6889 static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
6890 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
6891 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
6892 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
6893
6894 static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
6895 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
6896 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
6897 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
6898
6899 static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
6900 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
6901 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
6902 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
6903
6904 static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
6905 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
6906 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
6907 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
6908
6909 static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
6910 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
6911 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
6912 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
6913
6914 static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
6915 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
6916 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
6917 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
6918
6919 static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
6920 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
6921 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
6922 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
6923
6924 // NV21ToYUV24Row_AVX2
6925 void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
6926 const uint8_t* src_vu,
6927 uint8_t* dst_yuv24,
6928 int width) {
6929 uint8_t* src_y_ptr;
6930 uint64_t src_offset = 0;
6931 uint64_t width64;
6932
6933 width64 = width;
6934 src_y_ptr = (uint8_t*)src_y;
6935
6936 asm volatile(
6937 "vmovdqu %5, %%ymm0 \n" // init blend value
6938 "vmovdqu %6, %%ymm1 \n" // init blend value
6939 "vmovdqu %7, %%ymm2 \n" // init blend value
6940 // "sub $0x20, %3 \n" //sub 32 from
6941 // width for final loop
6942
6943 LABELALIGN
6944 "1: \n" // label 1
6945 "vmovdqu (%0,%4), %%ymm3 \n" // src_y
6946 "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
6947 "vmovdqu (%1), %%ymm5 \n" // src_uv
6948 "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
6949 "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
6950 // shuf
6951 "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
6952 // shuf
6953 "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
6954 "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
6955 // shuf
6956 "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
6957 "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
6958 "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
6959 "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
6960 "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
6961 "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
6962 "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
6963 "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
6964 "add $0x20, %4 \n" // add to src buffer
6965 // ptr
6966 "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
6967 "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
6968 "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
6969 "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
6970 "add $0x60,%2 \n" // add to dst buffer
6971 // ptr
6972 // "cmp %3, %4 \n" //(width64 -
6973 // 32 bytes) and src_offset
6974 "sub $0x20,%3 \n" // 32 pixels per loop
6975 "jg 1b \n"
6976 "vzeroupper \n" // sse-avx2
6977 // transitions
6978
6979 : "+r"(src_y), //%0
6980 "+r"(src_vu), //%1
6981 "+r"(dst_yuv24), //%2
6982 "+r"(width64), //%3
6983 "+r"(src_offset) //%4
6984 : "m"(kBLEND0), //%5
6985 "m"(kBLEND1), //%6
6986 "m"(kBLEND2), //%7
6987 "m"(kSHUF0), //%8
6988 "m"(kSHUF1), //%9
6989 "m"(kSHUF2), //%10
6990 "m"(kSHUF3), //%11
6991 "m"(kSHUF4), //%12
6992 "m"(kSHUF5) //%13
6993 : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
6994 "xmm13", "xmm14", "xmm15");
6995 }
6996 #endif // HAS_NV21TOYUV24ROW_AVX2
6997
6998 #ifdef HAS_SWAPUVROW_SSSE3
6999
7000 // Shuffle table for reversing the bytes.
7001 static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
7002 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
7003
7004 // Convert UV plane of NV12 to VU of NV21.
7005 void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
7006 asm volatile(
7007
7008 "movdqu %3,%%xmm5 \n"
7009
7010 LABELALIGN
7011 "1: \n"
7012 "movdqu (%0),%%xmm0 \n"
7013 "movdqu 0x10(%0),%%xmm1 \n"
7014 "lea 0x20(%0),%0 \n"
7015 "pshufb %%xmm5,%%xmm0 \n"
7016 "pshufb %%xmm5,%%xmm1 \n"
7017 "movdqu %%xmm0,(%1) \n"
7018 "movdqu %%xmm1,0x10(%1) \n"
7019 "lea 0x20(%1),%1 \n"
7020 "sub $0x10,%2 \n"
7021 "jg 1b \n"
7022 : "+r"(src_uv), // %0
7023 "+r"(dst_vu), // %1
7024 "+r"(width) // %2
7025 : "m"(kShuffleUVToVU) // %3
7026 : "memory", "cc", "xmm0", "xmm1", "xmm5");
7027 }
7028 #endif // HAS_SWAPUVROW_SSSE3
7029
7030 #ifdef HAS_SWAPUVROW_AVX2
7031 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
7032 asm volatile(
7033
7034 "vbroadcastf128 %3,%%ymm5 \n"
7035
7036 LABELALIGN
7037 "1: \n"
7038 "vmovdqu (%0),%%ymm0 \n"
7039 "vmovdqu 0x20(%0),%%ymm1 \n"
7040 "lea 0x40(%0),%0 \n"
7041 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
7042 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
7043 "vmovdqu %%ymm0,(%1) \n"
7044 "vmovdqu %%ymm1,0x20(%1) \n"
7045 "lea 0x40(%1),%1 \n"
7046 "sub $0x20,%2 \n"
7047 "jg 1b \n"
7048 "vzeroupper \n"
7049 : "+r"(src_uv), // %0
7050 "+r"(dst_vu), // %1
7051 "+r"(width) // %2
7052 : "m"(kShuffleUVToVU) // %3
7053 : "memory", "cc", "xmm0", "xmm1", "xmm5");
7054 }
7055 #endif // HAS_SWAPUVROW_AVX2
7056
7057 void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
7058 int src_stride_u,
7059 const uint8_t* src_v,
7060 int src_stride_v,
7061 uint8_t* dst_uv,
7062 int width) {
7063 asm volatile(
7064 "pcmpeqb %%xmm4,%%xmm4 \n"
7065 "psrlw $0xf,%%xmm4 \n"
7066 "packuswb %%xmm4,%%xmm4 \n"
7067 "pxor %%xmm5,%%xmm5 \n"
7068
7069 LABELALIGN
7070 "1: \n"
7071 "movdqu (%0),%%xmm0 \n" // load 16 U values
7072 "movdqu (%1),%%xmm1 \n" // load 16 V values
7073 "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row
7074 "movdqu 0(%1,%5,1),%%xmm3 \n"
7075 "lea 0x10(%0),%0 \n"
7076 "pmaddubsw %%xmm4,%%xmm0 \n" // half size
7077 "pmaddubsw %%xmm4,%%xmm1 \n"
7078 "pmaddubsw %%xmm4,%%xmm2 \n"
7079 "pmaddubsw %%xmm4,%%xmm3 \n"
7080 "lea 0x10(%1),%1 \n"
7081 "paddw %%xmm2,%%xmm0 \n"
7082 "paddw %%xmm3,%%xmm1 \n"
7083 "psrlw $0x1,%%xmm0 \n"
7084 "psrlw $0x1,%%xmm1 \n"
7085 "pavgw %%xmm5,%%xmm0 \n"
7086 "pavgw %%xmm5,%%xmm1 \n"
7087 "packuswb %%xmm0,%%xmm0 \n"
7088 "packuswb %%xmm1,%%xmm1 \n"
7089 "punpcklbw %%xmm1,%%xmm0 \n"
7090 "movdqu %%xmm0,(%2) \n" // store 8 UV pixels
7091 "lea 0x10(%2),%2 \n"
7092 "sub $0x10,%3 \n" // 16 src pixels per loop
7093 "jg 1b \n"
7094 : "+r"(src_u), // %0
7095 "+r"(src_v), // %1
7096 "+r"(dst_uv), // %2
7097 "+r"(width) // %3
7098 : "r"((intptr_t)(src_stride_u)), // %4
7099 "r"((intptr_t)(src_stride_v)) // %5
7100 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
7101 }
7102
7103 void HalfMergeUVRow_AVX2(const uint8_t* src_u,
7104 int src_stride_u,
7105 const uint8_t* src_v,
7106 int src_stride_v,
7107 uint8_t* dst_uv,
7108 int width) {
7109 asm volatile(
7110 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
7111 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
7112 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
7113 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
7114
7115 LABELALIGN
7116 "1: \n"
7117 "vmovdqu (%0),%%ymm0 \n" // load 32 U values
7118 "vmovdqu (%1),%%ymm1 \n" // load 32 V values
7119 "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
7120 "vmovdqu 0(%1,%5,1),%%ymm3 \n"
7121 "lea 0x20(%0),%0 \n"
7122 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
7123 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
7124 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
7125 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
7126 "lea 0x20(%1),%1 \n"
7127 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
7128 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
7129 "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
7130 "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
7131 "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
7132 "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
7133 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
7134 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
7135 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
7136 "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
7137 "lea 0x20(%2),%2 \n"
7138 "sub $0x20,%3 \n" // 32 src pixels per loop
7139 "jg 1b \n"
7140 "vzeroupper \n"
7141 : "+r"(src_u), // %0
7142 "+r"(src_v), // %1
7143 "+r"(dst_uv), // %2
7144 "+r"(width) // %3
7145 : "r"((intptr_t)(src_stride_u)), // %4
7146 "r"((intptr_t)(src_stride_v)) // %5
7147 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
7148 }
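
// Scalar sketch of the half-size UV merge implemented by the two SIMD
// variants above (illustrative, hypothetical name; width assumed even).
// Each output U and V is the rounded average of a 2x2 block from the
// full-resolution planes, matching the pmaddubsw/psrlw/pavgw sequence.
static void HalfMergeUVRow_C_Sketch(const uint8_t* src_u,
                                    int src_stride_u,
                                    const uint8_t* src_v,
                                    int src_stride_v,
                                    uint8_t* dst_uv,
                                    int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_uv[0] = (uint8_t)((src_u[0] + src_u[1] + src_u[src_stride_u] +
                           src_u[src_stride_u + 1] + 2) >> 2);
    dst_uv[1] = (uint8_t)((src_v[0] + src_v[1] + src_v[src_stride_v] +
                           src_v[src_stride_v + 1] + 2) >> 2);
    src_u += 2;
    src_v += 2;
    dst_uv += 2;
  }
}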
7149
7150 void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
7151 asm volatile(
7152 "pxor %%xmm1,%%xmm1 \n"
7153
7154 LABELALIGN
7155 "1: \n"
7156 "movd (%0),%%xmm0 \n" // load float
7157 "maxss %%xmm1, %%xmm0 \n" // clamp to zero
7158 "add 4, %0 \n"
7159 "movd %%xmm0, (%1) \n" // store float
7160 "add 4, %1 \n"
7161 "sub $0x4,%2 \n" // 1 float per loop
7162 "jg 1b \n"
7163 : "+r"(src_x), // %0
7164 "+r"(dst_y), // %1
7165 "+r"(width) // %2
7166 :
7167 : "memory", "cc", "xmm0", "xmm1");
7168 }
7169
7170 #endif // defined(__x86_64__) || defined(__i386__)
7171
7172 #ifdef __cplusplus
7173 } // extern "C"
7174 } // namespace libyuv
7175 #endif
7176