1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12 #include "libyuv/scale_row.h"
13
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18
19 // This module is for GCC x86 and x64.
20 #if !defined(LIBYUV_DISABLE_X86) && \
21 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
22
23 // Offsets for source bytes 0 to 9
24 static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
25 128, 128, 128, 128, 128, 128, 128, 128};
26
27 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
28 static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
29 128, 128, 128, 128, 128, 128, 128, 128};
30
31 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
32 static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
33 128, 128, 128, 128, 128, 128, 128, 128};
34
35 // Offsets for source bytes 0 to 10
36 static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
37
38 // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
39 static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
40 8, 9, 9, 10, 10, 11, 12, 13};
41
42 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
43 static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
44 10, 11, 12, 13, 13, 14, 14, 15};
45
46 // Coefficients for source bytes 0 to 10
47 static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
48
49 // Coefficients for source bytes 10 to 21
50 static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
51
52 // Coefficients for source bytes 21 to 31
53 static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
54
55 // Rounding constant for the 3/4 box filters.
56 static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
57
58 static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
59 128, 128, 128, 128, 128, 128, 128, 128};
60
61 static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
62 6, 8, 11, 14, 128, 128, 128, 128};
63
64 // Arrange words 0,3,6 into 0,1,2
65 static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
66 128, 128, 128, 128, 128, 128, 128, 128};
67
68 // Arrange words 0,3,6 into 3,4,5
69 static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
70 6, 7, 12, 13, 128, 128, 128, 128};
71
72 // Scaling values for boxes of 3x3 and 2x3
73 static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
74 65536 / 9, 65536 / 6, 0, 0};
75
76 // Arrange first value for pixels 0,1,2,3,4,5
77 static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
78 11, 128, 14, 128, 128, 128, 128, 128};
79
80 // Arrange second value for pixels 0,1,2,3,4,5
81 static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
82 12, 128, 15, 128, 128, 128, 128, 128};
83
84 // Arrange third value for pixels 0,1,2,3,4,5
85 static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
86 13, 128, 128, 128, 128, 128, 128, 128};
87
88 // Scaling values for boxes of 3x2 and 2x2
89 static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
90 65536 / 3, 65536 / 2, 0, 0};
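// kScaleAc33 / kScaleAb2 hold 16-bit fixed-point reciprocals of the box
// sizes; pmulhuw keeps the high 16 bits of an unsigned 16x16 multiply, so a
// box sum s is divided as (illustrative sketch; truncates slightly low):
//   avg = (uint16_t)(((uint32_t)s * (65536 / 9)) >> 16);  // ~= s / 9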
91
92 // GCC versions of row functions are verbatim conversions from Visual C.
93 // Generated using gcc disassembly on Visual C object file:
94 // objdump -D yuvscaler.obj >yuvscaler.txt
95
96 void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
97 ptrdiff_t src_stride,
98 uint8_t* dst_ptr,
99 int dst_width) {
100 (void)src_stride;
101 asm volatile(
102 // 16 pixel loop.
103 LABELALIGN
104 "1: \n"
105 "movdqu (%0),%%xmm0 \n"
106 "movdqu 0x10(%0),%%xmm1 \n"
107 "lea 0x20(%0),%0 \n"
108 "psrlw $0x8,%%xmm0 \n"
109 "psrlw $0x8,%%xmm1 \n"
110 "packuswb %%xmm1,%%xmm0 \n"
111 "movdqu %%xmm0,(%1) \n"
112 "lea 0x10(%1),%1 \n"
113 "sub $0x10,%2 \n"
114 "jg 1b \n"
115 : "+r"(src_ptr), // %0
116 "+r"(dst_ptr), // %1
117 "+r"(dst_width) // %2
118 ::"memory",
119 "cc", "xmm0", "xmm1");
120 }
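// Scalar sketch of the point-sample loop above; psrlw/packuswb keep the odd
// byte of each source pair (illustrative only, not compiled):
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[x * 2 + 1];
//   }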
121
122 void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
123 ptrdiff_t src_stride,
124 uint8_t* dst_ptr,
125 int dst_width) {
126 (void)src_stride;
127 asm volatile(
128 "pcmpeqb %%xmm4,%%xmm4 \n"
129 "psrlw $0xf,%%xmm4 \n"
130 "packuswb %%xmm4,%%xmm4 \n"
131 "pxor %%xmm5,%%xmm5 \n"
132
133 LABELALIGN
134 "1: \n"
135 "movdqu (%0),%%xmm0 \n"
136 "movdqu 0x10(%0),%%xmm1 \n"
137 "lea 0x20(%0),%0 \n"
138 "pmaddubsw %%xmm4,%%xmm0 \n"
139 "pmaddubsw %%xmm4,%%xmm1 \n"
140 "pavgw %%xmm5,%%xmm0 \n"
141 "pavgw %%xmm5,%%xmm1 \n"
142 "packuswb %%xmm1,%%xmm0 \n"
143 "movdqu %%xmm0,(%1) \n"
144 "lea 0x10(%1),%1 \n"
145 "sub $0x10,%2 \n"
146 "jg 1b \n"
147 : "+r"(src_ptr), // %0
148 "+r"(dst_ptr), // %1
149 "+r"(dst_width) // %2
150 ::"memory",
151 "cc", "xmm0", "xmm1", "xmm4", "xmm5");
152 }
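// Scalar sketch of the loop above: pmaddubsw with the 0x01 words adds each
// horizontal pair and pavgw with zero rounds the halving (illustrative only):
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = (uint8_t)((src_ptr[x * 2] + src_ptr[x * 2 + 1] + 1) >> 1);
//   }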
153
154 void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
155 ptrdiff_t src_stride,
156 uint8_t* dst_ptr,
157 int dst_width) {
158 asm volatile(
159 "pcmpeqb %%xmm4,%%xmm4 \n"
160 "psrlw $0xf,%%xmm4 \n"
161 "packuswb %%xmm4,%%xmm4 \n"
162 "pxor %%xmm5,%%xmm5 \n"
163
164 LABELALIGN
165 "1: \n"
166 "movdqu (%0),%%xmm0 \n"
167 "movdqu 0x10(%0),%%xmm1 \n"
168 "movdqu 0x00(%0,%3,1),%%xmm2 \n"
169 "movdqu 0x10(%0,%3,1),%%xmm3 \n"
170 "lea 0x20(%0),%0 \n"
171 "pmaddubsw %%xmm4,%%xmm0 \n"
172 "pmaddubsw %%xmm4,%%xmm1 \n"
173 "pmaddubsw %%xmm4,%%xmm2 \n"
174 "pmaddubsw %%xmm4,%%xmm3 \n"
175 "paddw %%xmm2,%%xmm0 \n"
176 "paddw %%xmm3,%%xmm1 \n"
177 "psrlw $0x1,%%xmm0 \n"
178 "psrlw $0x1,%%xmm1 \n"
179 "pavgw %%xmm5,%%xmm0 \n"
180 "pavgw %%xmm5,%%xmm1 \n"
181 "packuswb %%xmm1,%%xmm0 \n"
182 "movdqu %%xmm0,(%1) \n"
183 "lea 0x10(%1),%1 \n"
184 "sub $0x10,%2 \n"
185 "jg 1b \n"
186 : "+r"(src_ptr), // %0
187 "+r"(dst_ptr), // %1
188 "+r"(dst_width) // %2
189 : "r"((intptr_t)(src_stride)) // %3
190 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
191 }
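// Scalar sketch of the 2x2 box filter above; the second row is read at
// src_ptr + src_stride, and psrlw $1 followed by pavgw 0 gives (sum + 2) >> 2
// (illustrative only):
//   const uint8_t* s = src_ptr;
//   const uint8_t* t = src_ptr + src_stride;
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = (uint8_t)(
//         (s[x * 2] + s[x * 2 + 1] + t[x * 2] + t[x * 2 + 1] + 2) >> 2);
//   }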
192
193 #ifdef HAS_SCALEROWDOWN2_AVX2
194 void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
195 ptrdiff_t src_stride,
196 uint8_t* dst_ptr,
197 int dst_width) {
198 (void)src_stride;
199 asm volatile(
200
201 LABELALIGN
202 "1: \n"
203 "vmovdqu (%0),%%ymm0 \n"
204 "vmovdqu 0x20(%0),%%ymm1 \n"
205 "lea 0x40(%0),%0 \n"
206 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
207 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
208 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
209 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
210 "vmovdqu %%ymm0,(%1) \n"
211 "lea 0x20(%1),%1 \n"
212 "sub $0x20,%2 \n"
213 "jg 1b \n"
214 "vzeroupper \n"
215 : "+r"(src_ptr), // %0
216 "+r"(dst_ptr), // %1
217 "+r"(dst_width) // %2
218 ::"memory",
219 "cc", "xmm0", "xmm1");
220 }
221
222 void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
223 ptrdiff_t src_stride,
224 uint8_t* dst_ptr,
225 int dst_width) {
226 (void)src_stride;
227 asm volatile(
228 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
229 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
230 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
231 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
232
233 LABELALIGN
234 "1: \n"
235 "vmovdqu (%0),%%ymm0 \n"
236 "vmovdqu 0x20(%0),%%ymm1 \n"
237 "lea 0x40(%0),%0 \n"
238 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
239 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
240 "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
241 "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
242 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
243 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
244 "vmovdqu %%ymm0,(%1) \n"
245 "lea 0x20(%1),%1 \n"
246 "sub $0x20,%2 \n"
247 "jg 1b \n"
248 "vzeroupper \n"
249 : "+r"(src_ptr), // %0
250 "+r"(dst_ptr), // %1
251 "+r"(dst_width) // %2
252 ::"memory",
253 "cc", "xmm0", "xmm1", "xmm4", "xmm5");
254 }
255
256 void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
257 ptrdiff_t src_stride,
258 uint8_t* dst_ptr,
259 int dst_width) {
260 asm volatile(
261 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
262 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
263 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
264 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
265
266 LABELALIGN
267 "1: \n"
268 "vmovdqu (%0),%%ymm0 \n"
269 "vmovdqu 0x20(%0),%%ymm1 \n"
270 "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
271 "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
272 "lea 0x40(%0),%0 \n"
273 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
274 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
275 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
276 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
277 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
278 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
279 "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
280 "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
281 "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
282 "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
283 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
284 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
285 "vmovdqu %%ymm0,(%1) \n"
286 "lea 0x20(%1),%1 \n"
287 "sub $0x20,%2 \n"
288 "jg 1b \n"
289 "vzeroupper \n"
290 : "+r"(src_ptr), // %0
291 "+r"(dst_ptr), // %1
292 "+r"(dst_width) // %2
293 : "r"((intptr_t)(src_stride)) // %3
294 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
295 }
296 #endif // HAS_SCALEROWDOWN2_AVX2
297
298 void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
299 ptrdiff_t src_stride,
300 uint8_t* dst_ptr,
301 int dst_width) {
302 (void)src_stride;
303 asm volatile(
304 "pcmpeqb %%xmm5,%%xmm5 \n"
305 "psrld $0x18,%%xmm5 \n"
306 "pslld $0x10,%%xmm5 \n"
307
308 LABELALIGN
309 "1: \n"
310 "movdqu (%0),%%xmm0 \n"
311 "movdqu 0x10(%0),%%xmm1 \n"
312 "lea 0x20(%0),%0 \n"
313 "pand %%xmm5,%%xmm0 \n"
314 "pand %%xmm5,%%xmm1 \n"
315 "packuswb %%xmm1,%%xmm0 \n"
316 "psrlw $0x8,%%xmm0 \n"
317 "packuswb %%xmm0,%%xmm0 \n"
318 "movq %%xmm0,(%1) \n"
319 "lea 0x8(%1),%1 \n"
320 "sub $0x8,%2 \n"
321 "jg 1b \n"
322 : "+r"(src_ptr), // %0
323 "+r"(dst_ptr), // %1
324 "+r"(dst_width) // %2
325 ::"memory",
326 "cc", "xmm0", "xmm1", "xmm5");
327 }
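// Scalar sketch of the point-sampled 1/4 scaler above; the mask in xmm5
// keeps byte 2 of every 4 source bytes (illustrative only):
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[x * 4 + 2];
//   }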
328
329 void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
330 ptrdiff_t src_stride,
331 uint8_t* dst_ptr,
332 int dst_width) {
333 intptr_t stridex3;
334 asm volatile(
335 "pcmpeqb %%xmm4,%%xmm4 \n"
336 "psrlw $0xf,%%xmm4 \n"
337 "movdqa %%xmm4,%%xmm5 \n"
338 "packuswb %%xmm4,%%xmm4 \n"
339 "psllw $0x3,%%xmm5 \n"
340 "lea 0x00(%4,%4,2),%3 \n"
341
342 LABELALIGN
343 "1: \n"
344 "movdqu (%0),%%xmm0 \n"
345 "movdqu 0x10(%0),%%xmm1 \n"
346 "movdqu 0x00(%0,%4,1),%%xmm2 \n"
347 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
348 "pmaddubsw %%xmm4,%%xmm0 \n"
349 "pmaddubsw %%xmm4,%%xmm1 \n"
350 "pmaddubsw %%xmm4,%%xmm2 \n"
351 "pmaddubsw %%xmm4,%%xmm3 \n"
352 "paddw %%xmm2,%%xmm0 \n"
353 "paddw %%xmm3,%%xmm1 \n"
354 "movdqu 0x00(%0,%4,2),%%xmm2 \n"
355 "movdqu 0x10(%0,%4,2),%%xmm3 \n"
356 "pmaddubsw %%xmm4,%%xmm2 \n"
357 "pmaddubsw %%xmm4,%%xmm3 \n"
358 "paddw %%xmm2,%%xmm0 \n"
359 "paddw %%xmm3,%%xmm1 \n"
360 "movdqu 0x00(%0,%3,1),%%xmm2 \n"
361 "movdqu 0x10(%0,%3,1),%%xmm3 \n"
362 "lea 0x20(%0),%0 \n"
363 "pmaddubsw %%xmm4,%%xmm2 \n"
364 "pmaddubsw %%xmm4,%%xmm3 \n"
365 "paddw %%xmm2,%%xmm0 \n"
366 "paddw %%xmm3,%%xmm1 \n"
367 "phaddw %%xmm1,%%xmm0 \n"
368 "paddw %%xmm5,%%xmm0 \n"
369 "psrlw $0x4,%%xmm0 \n"
370 "packuswb %%xmm0,%%xmm0 \n"
371 "movq %%xmm0,(%1) \n"
372 "lea 0x8(%1),%1 \n"
373 "sub $0x8,%2 \n"
374 "jg 1b \n"
375 : "+r"(src_ptr), // %0
376 "+r"(dst_ptr), // %1
377 "+r"(dst_width), // %2
378 "=&r"(stridex3) // %3
379 : "r"((intptr_t)(src_stride)) // %4
380 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
381 }
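// Scalar sketch of the 4x4 box filter above; each output byte is the rounded
// average of a 4x4 block of source pixels (illustrative only):
//   for (int x = 0; x < dst_width; ++x) {
//     int sum = 0;
//     for (int r = 0; r < 4; ++r)
//       for (int c = 0; c < 4; ++c)
//         sum += src_ptr[r * src_stride + x * 4 + c];
//     dst_ptr[x] = (uint8_t)((sum + 8) >> 4);
//   }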
382
383 #ifdef HAS_SCALEROWDOWN4_AVX2
384 void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
385 ptrdiff_t src_stride,
386 uint8_t* dst_ptr,
387 int dst_width) {
388 (void)src_stride;
389 asm volatile(
390 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
391 "vpsrld $0x18,%%ymm5,%%ymm5 \n"
392 "vpslld $0x10,%%ymm5,%%ymm5 \n"
393
394 LABELALIGN
395 "1: \n"
396 "vmovdqu (%0),%%ymm0 \n"
397 "vmovdqu 0x20(%0),%%ymm1 \n"
398 "lea 0x40(%0),%0 \n"
399 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
400 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
401 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
402 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
403 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
404 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
405 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
406 "vmovdqu %%xmm0,(%1) \n"
407 "lea 0x10(%1),%1 \n"
408 "sub $0x10,%2 \n"
409 "jg 1b \n"
410 "vzeroupper \n"
411 : "+r"(src_ptr), // %0
412 "+r"(dst_ptr), // %1
413 "+r"(dst_width) // %2
414 ::"memory",
415 "cc", "xmm0", "xmm1", "xmm5");
416 }
417
418 void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
419 ptrdiff_t src_stride,
420 uint8_t* dst_ptr,
421 int dst_width) {
422 asm volatile(
423 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
424 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
425 "vpsllw $0x3,%%ymm4,%%ymm5 \n"
426 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
427
428 LABELALIGN
429 "1: \n"
430 "vmovdqu (%0),%%ymm0 \n"
431 "vmovdqu 0x20(%0),%%ymm1 \n"
432 "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
433 "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
434 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
435 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
436 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
437 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
438 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
439 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
440 "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
441 "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
442 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
443 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
444 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
445 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
446 "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
447 "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
448 "lea 0x40(%0),%0 \n"
449 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
450 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
451 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
452 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
453 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
454 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
455 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
456 "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
457 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
458 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
459 "vmovdqu %%xmm0,(%1) \n"
460 "lea 0x10(%1),%1 \n"
461 "sub $0x10,%2 \n"
462 "jg 1b \n"
463 "vzeroupper \n"
464 : "+r"(src_ptr), // %0
465 "+r"(dst_ptr), // %1
466 "+r"(dst_width) // %2
467 : "r"((intptr_t)(src_stride)), // %3
468 "r"((intptr_t)(src_stride * 3)) // %4
469 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
470 }
471 #endif // HAS_SCALEROWDOWN4_AVX2
472
473 void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
474 ptrdiff_t src_stride,
475 uint8_t* dst_ptr,
476 int dst_width) {
477 (void)src_stride;
478 asm volatile(
479 "movdqa %0,%%xmm3 \n"
480 "movdqa %1,%%xmm4 \n"
481 "movdqa %2,%%xmm5 \n"
482 :
483 : "m"(kShuf0), // %0
484 "m"(kShuf1), // %1
485 "m"(kShuf2) // %2
486 );
487 asm volatile(
488
489 LABELALIGN
490 "1: \n"
491 "movdqu (%0),%%xmm0 \n"
492 "movdqu 0x10(%0),%%xmm2 \n"
493 "lea 0x20(%0),%0 \n"
494 "movdqa %%xmm2,%%xmm1 \n"
495 "palignr $0x8,%%xmm0,%%xmm1 \n"
496 "pshufb %%xmm3,%%xmm0 \n"
497 "pshufb %%xmm4,%%xmm1 \n"
498 "pshufb %%xmm5,%%xmm2 \n"
499 "movq %%xmm0,(%1) \n"
500 "movq %%xmm1,0x8(%1) \n"
501 "movq %%xmm2,0x10(%1) \n"
502 "lea 0x18(%1),%1 \n"
503 "sub $0x18,%2 \n"
504 "jg 1b \n"
505 : "+r"(src_ptr), // %0
506 "+r"(dst_ptr), // %1
507 "+r"(dst_width) // %2
508 ::"memory",
509 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
510 }
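// Scalar sketch of the point-sampled 3/4 scaler above: every 4 source pixels
// map to 3 output pixels by dropping pixel 2 (illustrative only):
//   for (int x = 0; x < dst_width; x += 3) {
//     const uint8_t* s = src_ptr + (x / 3) * 4;
//     dst_ptr[x + 0] = s[0];
//     dst_ptr[x + 1] = s[1];
//     dst_ptr[x + 2] = s[3];
//   }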
511
512 void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
513 ptrdiff_t src_stride,
514 uint8_t* dst_ptr,
515 int dst_width) {
516 asm volatile(
517 "movdqa %0,%%xmm2 \n" // kShuf01
518 "movdqa %1,%%xmm3 \n" // kShuf11
519 "movdqa %2,%%xmm4 \n" // kShuf21
520 :
521 : "m"(kShuf01), // %0
522 "m"(kShuf11), // %1
523 "m"(kShuf21) // %2
524 );
525 asm volatile(
526 "movdqa %0,%%xmm5 \n" // kMadd01
527 "movdqa %1,%%xmm0 \n" // kMadd11
528 "movdqa %2,%%xmm1 \n" // kRound34
529 :
530 : "m"(kMadd01), // %0
531 "m"(kMadd11), // %1
532 "m"(kRound34) // %2
533 );
534 asm volatile(
535
536 LABELALIGN
537 "1: \n"
538 "movdqu (%0),%%xmm6 \n"
539 "movdqu 0x00(%0,%3,1),%%xmm7 \n"
540 "pavgb %%xmm7,%%xmm6 \n"
541 "pshufb %%xmm2,%%xmm6 \n"
542 "pmaddubsw %%xmm5,%%xmm6 \n"
543 "paddsw %%xmm1,%%xmm6 \n"
544 "psrlw $0x2,%%xmm6 \n"
545 "packuswb %%xmm6,%%xmm6 \n"
546 "movq %%xmm6,(%1) \n"
547 "movdqu 0x8(%0),%%xmm6 \n"
548 "movdqu 0x8(%0,%3,1),%%xmm7 \n"
549 "pavgb %%xmm7,%%xmm6 \n"
550 "pshufb %%xmm3,%%xmm6 \n"
551 "pmaddubsw %%xmm0,%%xmm6 \n"
552 "paddsw %%xmm1,%%xmm6 \n"
553 "psrlw $0x2,%%xmm6 \n"
554 "packuswb %%xmm6,%%xmm6 \n"
555 "movq %%xmm6,0x8(%1) \n"
556 "movdqu 0x10(%0),%%xmm6 \n"
557 "movdqu 0x10(%0,%3,1),%%xmm7 \n"
558 "lea 0x20(%0),%0 \n"
559 "pavgb %%xmm7,%%xmm6 \n"
560 "pshufb %%xmm4,%%xmm6 \n"
561 "pmaddubsw %4,%%xmm6 \n"
562 "paddsw %%xmm1,%%xmm6 \n"
563 "psrlw $0x2,%%xmm6 \n"
564 "packuswb %%xmm6,%%xmm6 \n"
565 "movq %%xmm6,0x10(%1) \n"
566 "lea 0x18(%1),%1 \n"
567 "sub $0x18,%2 \n"
568 "jg 1b \n"
569 : "+r"(src_ptr), // %0
570 "+r"(dst_ptr), // %1
571 "+r"(dst_width) // %2
572 : "r"((intptr_t)(src_stride)), // %3
573 "m"(kMadd21) // %4
574 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
575 "xmm7");
576 }
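// The _1_Box variant above first blends the two source rows 1:1 with pavgb,
// then forms 3 output pixels from every 4 blended pixels with pmaddubsw
// weights (3,1), (2,2) and (1,3), rounded by kRound34, e.g. (illustrative):
//   dst[0] = (3 * avg[0] + 1 * avg[1] + 2) >> 2;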
577
578 void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
579 ptrdiff_t src_stride,
580 uint8_t* dst_ptr,
581 int dst_width) {
582 asm volatile(
583 "movdqa %0,%%xmm2 \n" // kShuf01
584 "movdqa %1,%%xmm3 \n" // kShuf11
585 "movdqa %2,%%xmm4 \n" // kShuf21
586 :
587 : "m"(kShuf01), // %0
588 "m"(kShuf11), // %1
589 "m"(kShuf21) // %2
590 );
591 asm volatile(
592 "movdqa %0,%%xmm5 \n" // kMadd01
593 "movdqa %1,%%xmm0 \n" // kMadd11
594 "movdqa %2,%%xmm1 \n" // kRound34
595 :
596 : "m"(kMadd01), // %0
597 "m"(kMadd11), // %1
598 "m"(kRound34) // %2
599 );
600
601 asm volatile(
602
603 LABELALIGN
604 "1: \n"
605 "movdqu (%0),%%xmm6 \n"
606 "movdqu 0x00(%0,%3,1),%%xmm7 \n"
607 "pavgb %%xmm6,%%xmm7 \n"
608 "pavgb %%xmm7,%%xmm6 \n"
609 "pshufb %%xmm2,%%xmm6 \n"
610 "pmaddubsw %%xmm5,%%xmm6 \n"
611 "paddsw %%xmm1,%%xmm6 \n"
612 "psrlw $0x2,%%xmm6 \n"
613 "packuswb %%xmm6,%%xmm6 \n"
614 "movq %%xmm6,(%1) \n"
615 "movdqu 0x8(%0),%%xmm6 \n"
616 "movdqu 0x8(%0,%3,1),%%xmm7 \n"
617 "pavgb %%xmm6,%%xmm7 \n"
618 "pavgb %%xmm7,%%xmm6 \n"
619 "pshufb %%xmm3,%%xmm6 \n"
620 "pmaddubsw %%xmm0,%%xmm6 \n"
621 "paddsw %%xmm1,%%xmm6 \n"
622 "psrlw $0x2,%%xmm6 \n"
623 "packuswb %%xmm6,%%xmm6 \n"
624 "movq %%xmm6,0x8(%1) \n"
625 "movdqu 0x10(%0),%%xmm6 \n"
626 "movdqu 0x10(%0,%3,1),%%xmm7 \n"
627 "lea 0x20(%0),%0 \n"
628 "pavgb %%xmm6,%%xmm7 \n"
629 "pavgb %%xmm7,%%xmm6 \n"
630 "pshufb %%xmm4,%%xmm6 \n"
631 "pmaddubsw %4,%%xmm6 \n"
632 "paddsw %%xmm1,%%xmm6 \n"
633 "psrlw $0x2,%%xmm6 \n"
634 "packuswb %%xmm6,%%xmm6 \n"
635 "movq %%xmm6,0x10(%1) \n"
636 "lea 0x18(%1),%1 \n"
637 "sub $0x18,%2 \n"
638 "jg 1b \n"
639 : "+r"(src_ptr), // %0
640 "+r"(dst_ptr), // %1
641 "+r"(dst_width) // %2
642 : "r"((intptr_t)(src_stride)), // %3
643 "m"(kMadd21) // %4
644 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
645 "xmm7");
646 }
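// The _0_Box variant above weights the two source rows roughly 3:1 by
// applying pavgb twice, avg(row0, avg(row0, row1)) ~= (3 * row0 + row1) / 4,
// before the same (3,1)/(2,2)/(1,3) horizontal filtering as the _1_Box case.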
647
648 void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
649 ptrdiff_t src_stride,
650 uint8_t* dst_ptr,
651 int dst_width) {
652 (void)src_stride;
653 asm volatile(
654 "movdqa %3,%%xmm4 \n"
655 "movdqa %4,%%xmm5 \n"
656
657 LABELALIGN
658 "1: \n"
659 "movdqu (%0),%%xmm0 \n"
660 "movdqu 0x10(%0),%%xmm1 \n"
661 "lea 0x20(%0),%0 \n"
662 "pshufb %%xmm4,%%xmm0 \n"
663 "pshufb %%xmm5,%%xmm1 \n"
664 "paddusb %%xmm1,%%xmm0 \n"
665 "movq %%xmm0,(%1) \n"
666 "movhlps %%xmm0,%%xmm1 \n"
667 "movd %%xmm1,0x8(%1) \n"
668 "lea 0xc(%1),%1 \n"
669 "sub $0xc,%2 \n"
670 "jg 1b \n"
671 : "+r"(src_ptr), // %0
672 "+r"(dst_ptr), // %1
673 "+r"(dst_width) // %2
674 : "m"(kShuf38a), // %3
675 "m"(kShuf38b) // %4
676 : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
677 }
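// Scalar sketch of the point-sampled 3/8 scaler above: each 16 source pixels
// yield 6 output pixels sampled at offsets 0,3,6,8,11,14 (illustrative only):
//   static const int kOfs[6] = {0, 3, 6, 8, 11, 14};
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[(x / 6) * 16 + kOfs[x % 6]];
//   }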
678
679 void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
680 ptrdiff_t src_stride,
681 uint8_t* dst_ptr,
682 int dst_width) {
683 asm volatile(
684 "movdqa %0,%%xmm2 \n"
685 "movdqa %1,%%xmm3 \n"
686 "movdqa %2,%%xmm4 \n"
687 "movdqa %3,%%xmm5 \n"
688 :
689 : "m"(kShufAb0), // %0
690 "m"(kShufAb1), // %1
691 "m"(kShufAb2), // %2
692 "m"(kScaleAb2) // %3
693 );
694 asm volatile(
695
696 LABELALIGN
697 "1: \n"
698 "movdqu (%0),%%xmm0 \n"
699 "movdqu 0x00(%0,%3,1),%%xmm1 \n"
700 "lea 0x10(%0),%0 \n"
701 "pavgb %%xmm1,%%xmm0 \n"
702 "movdqa %%xmm0,%%xmm1 \n"
703 "pshufb %%xmm2,%%xmm1 \n"
704 "movdqa %%xmm0,%%xmm6 \n"
705 "pshufb %%xmm3,%%xmm6 \n"
706 "paddusw %%xmm6,%%xmm1 \n"
707 "pshufb %%xmm4,%%xmm0 \n"
708 "paddusw %%xmm0,%%xmm1 \n"
709 "pmulhuw %%xmm5,%%xmm1 \n"
710 "packuswb %%xmm1,%%xmm1 \n"
711 "movd %%xmm1,(%1) \n"
712 "psrlq $0x10,%%xmm1 \n"
713 "movd %%xmm1,0x2(%1) \n"
714 "lea 0x6(%1),%1 \n"
715 "sub $0x6,%2 \n"
716 "jg 1b \n"
717 : "+r"(src_ptr), // %0
718 "+r"(dst_ptr), // %1
719 "+r"(dst_width) // %2
720 : "r"((intptr_t)(src_stride)) // %3
721 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
722 }
723
724 void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
725 ptrdiff_t src_stride,
726 uint8_t* dst_ptr,
727 int dst_width) {
728 asm volatile(
729 "movdqa %0,%%xmm2 \n"
730 "movdqa %1,%%xmm3 \n"
731 "movdqa %2,%%xmm4 \n"
732 "pxor %%xmm5,%%xmm5 \n"
733 :
734 : "m"(kShufAc), // %0
735 "m"(kShufAc3), // %1
736 "m"(kScaleAc33) // %2
737 );
738 asm volatile(
739
740 LABELALIGN
741 "1: \n"
742 "movdqu (%0),%%xmm0 \n"
743 "movdqu 0x00(%0,%3,1),%%xmm6 \n"
744 "movhlps %%xmm0,%%xmm1 \n"
745 "movhlps %%xmm6,%%xmm7 \n"
746 "punpcklbw %%xmm5,%%xmm0 \n"
747 "punpcklbw %%xmm5,%%xmm1 \n"
748 "punpcklbw %%xmm5,%%xmm6 \n"
749 "punpcklbw %%xmm5,%%xmm7 \n"
750 "paddusw %%xmm6,%%xmm0 \n"
751 "paddusw %%xmm7,%%xmm1 \n"
752 "movdqu 0x00(%0,%3,2),%%xmm6 \n"
753 "lea 0x10(%0),%0 \n"
754 "movhlps %%xmm6,%%xmm7 \n"
755 "punpcklbw %%xmm5,%%xmm6 \n"
756 "punpcklbw %%xmm5,%%xmm7 \n"
757 "paddusw %%xmm6,%%xmm0 \n"
758 "paddusw %%xmm7,%%xmm1 \n"
759 "movdqa %%xmm0,%%xmm6 \n"
760 "psrldq $0x2,%%xmm0 \n"
761 "paddusw %%xmm0,%%xmm6 \n"
762 "psrldq $0x2,%%xmm0 \n"
763 "paddusw %%xmm0,%%xmm6 \n"
764 "pshufb %%xmm2,%%xmm6 \n"
765 "movdqa %%xmm1,%%xmm7 \n"
766 "psrldq $0x2,%%xmm1 \n"
767 "paddusw %%xmm1,%%xmm7 \n"
768 "psrldq $0x2,%%xmm1 \n"
769 "paddusw %%xmm1,%%xmm7 \n"
770 "pshufb %%xmm3,%%xmm7 \n"
771 "paddusw %%xmm7,%%xmm6 \n"
772 "pmulhuw %%xmm4,%%xmm6 \n"
773 "packuswb %%xmm6,%%xmm6 \n"
774 "movd %%xmm6,(%1) \n"
775 "psrlq $0x10,%%xmm6 \n"
776 "movd %%xmm6,0x2(%1) \n"
777 "lea 0x6(%1),%1 \n"
778 "sub $0x6,%2 \n"
779 "jg 1b \n"
780 : "+r"(src_ptr), // %0
781 "+r"(dst_ptr), // %1
782 "+r"(dst_width) // %2
783 : "r"((intptr_t)(src_stride)) // %3
784 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
785 "xmm7");
786 }
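// The loop above sums three source rows, then every 8 summed pixels produce
// 3 outputs: two 3x3 box averages and one 2x3 box average (the last column
// of each block is only 2 pixels wide), divided by the box size via pmulhuw
// with the 65536/9 and 65536/6 reciprocals in kScaleAc33.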
787
788 // Reads 16xN bytes and produces 16 shorts at a time.
789 void ScaleAddRow_SSE2(const uint8_t* src_ptr,
790 uint16_t* dst_ptr,
791 int src_width) {
792 asm volatile(
793
794 "pxor %%xmm5,%%xmm5 \n"
795
796 // 16 pixel loop.
797 LABELALIGN
798 "1: \n"
799 "movdqu (%0),%%xmm3 \n"
800 "lea 0x10(%0),%0 \n" // src_ptr += 16
801 "movdqu (%1),%%xmm0 \n"
802 "movdqu 0x10(%1),%%xmm1 \n"
803 "movdqa %%xmm3,%%xmm2 \n"
804 "punpcklbw %%xmm5,%%xmm2 \n"
805 "punpckhbw %%xmm5,%%xmm3 \n"
806 "paddusw %%xmm2,%%xmm0 \n"
807 "paddusw %%xmm3,%%xmm1 \n"
808 "movdqu %%xmm0,(%1) \n"
809 "movdqu %%xmm1,0x10(%1) \n"
810 "lea 0x20(%1),%1 \n"
811 "sub $0x10,%2 \n"
812 "jg 1b \n"
813 : "+r"(src_ptr), // %0
814 "+r"(dst_ptr), // %1
815 "+r"(src_width) // %2
816 :
817 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
818 }
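// Scalar sketch of the accumulating add above (illustrative only); paddusw
// saturates at 65535 instead of wrapping:
//   for (int x = 0; x < src_width; ++x) {
//     dst_ptr[x] = dst_ptr[x] + src_ptr[x];  // with unsigned saturation
//   }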
819
820 #ifdef HAS_SCALEADDROW_AVX2
821 // Reads 32 bytes and accumulates to 32 shorts at a time.
822 void ScaleAddRow_AVX2(const uint8_t* src_ptr,
823 uint16_t* dst_ptr,
824 int src_width) {
825 asm volatile(
826
827 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
828
829 LABELALIGN
830 "1: \n"
831 "vmovdqu (%0),%%ymm3 \n"
832 "lea 0x20(%0),%0 \n" // src_ptr += 32
833 "vpermq $0xd8,%%ymm3,%%ymm3 \n"
834 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
835 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
836 "vpaddusw (%1),%%ymm2,%%ymm0 \n"
837 "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
838 "vmovdqu %%ymm0,(%1) \n"
839 "vmovdqu %%ymm1,0x20(%1) \n"
840 "lea 0x40(%1),%1 \n"
841 "sub $0x20,%2 \n"
842 "jg 1b \n"
843 "vzeroupper \n"
844 : "+r"(src_ptr), // %0
845 "+r"(dst_ptr), // %1
846 "+r"(src_width) // %2
847 :
848 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
849 }
850 #endif // HAS_SCALEADDROW_AVX2
851
852 // Constant for making pixels signed to avoid pmaddubsw
853 // saturation.
854 static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
855 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
856
857 // Constant for making pixels unsigned and adding .5 for rounding.
858 static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
859 0x4040, 0x4040, 0x4040, 0x4040};
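// ScaleFilterCols_SSSE3 below biases the pixels by -128 (kFsub80) so the
// signed pmaddubsw cannot saturate, then kFadd40 removes the bias and rounds.
// With f = the 7-bit source fraction (illustrative derivation):
//   word = (128 - f) * (p0 - 128) + f * (p1 - 128)
//        = (128 - f) * p0 + f * p1 - 128 * 128
//   dst  = (word + 0x4040) >> 7     // 0x4040 = 128 * 128 + 64 (bias + round)
//        = ((128 - f) * p0 + f * p1 + 64) >> 7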
860
861 // Bilinear column filtering. SSSE3 version.
862 void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
863 const uint8_t* src_ptr,
864 int dst_width,
865 int x,
866 int dx) {
867 intptr_t x0, x1, temp_pixel;
868 asm volatile(
869 "movd %6,%%xmm2 \n"
870 "movd %7,%%xmm3 \n"
871 "movl $0x04040000,%k2 \n"
872 "movd %k2,%%xmm5 \n"
873 "pcmpeqb %%xmm6,%%xmm6 \n"
874 "psrlw $0x9,%%xmm6 \n" // 0x007f007f
875 "pcmpeqb %%xmm7,%%xmm7 \n"
876 "psrlw $15,%%xmm7 \n" // 0x00010001
877
878 "pextrw $0x1,%%xmm2,%k3 \n"
879 "subl $0x2,%5 \n"
880 "jl 29f \n"
881 "movdqa %%xmm2,%%xmm0 \n"
882 "paddd %%xmm3,%%xmm0 \n"
883 "punpckldq %%xmm0,%%xmm2 \n"
884 "punpckldq %%xmm3,%%xmm3 \n"
885 "paddd %%xmm3,%%xmm3 \n"
886 "pextrw $0x3,%%xmm2,%k4 \n"
887
888 LABELALIGN
889 "2: \n"
890 "movdqa %%xmm2,%%xmm1 \n"
891 "paddd %%xmm3,%%xmm2 \n"
892 "movzwl 0x00(%1,%3,1),%k2 \n"
893 "movd %k2,%%xmm0 \n"
894 "psrlw $0x9,%%xmm1 \n"
895 "movzwl 0x00(%1,%4,1),%k2 \n"
896 "movd %k2,%%xmm4 \n"
897 "pshufb %%xmm5,%%xmm1 \n"
898 "punpcklwd %%xmm4,%%xmm0 \n"
899 "psubb %8,%%xmm0 \n" // make pixels signed.
900 "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
901 // 1
902 "paddusb %%xmm7,%%xmm1 \n"
903 "pmaddubsw %%xmm0,%%xmm1 \n"
904 "pextrw $0x1,%%xmm2,%k3 \n"
905 "pextrw $0x3,%%xmm2,%k4 \n"
906 "paddw %9,%%xmm1 \n" // make pixels unsigned.
907 "psrlw $0x7,%%xmm1 \n"
908 "packuswb %%xmm1,%%xmm1 \n"
909 "movd %%xmm1,%k2 \n"
910 "mov %w2,(%0) \n"
911 "lea 0x2(%0),%0 \n"
912 "subl $0x2,%5 \n"
913 "jge 2b \n"
914
915 LABELALIGN
916 "29: \n"
917 "addl $0x1,%5 \n"
918 "jl 99f \n"
919 "movzwl 0x00(%1,%3,1),%k2 \n"
920 "movd %k2,%%xmm0 \n"
921 "psrlw $0x9,%%xmm2 \n"
922 "pshufb %%xmm5,%%xmm2 \n"
923 "psubb %8,%%xmm0 \n" // make pixels signed.
924 "pxor %%xmm6,%%xmm2 \n"
925 "paddusb %%xmm7,%%xmm2 \n"
926 "pmaddubsw %%xmm0,%%xmm2 \n"
927 "paddw %9,%%xmm2 \n" // make pixels unsigned.
928 "psrlw $0x7,%%xmm2 \n"
929 "packuswb %%xmm2,%%xmm2 \n"
930 "movd %%xmm2,%k2 \n"
931 "mov %b2,(%0) \n"
932 "99: \n"
933 : "+r"(dst_ptr), // %0
934 "+r"(src_ptr), // %1
935 "=&a"(temp_pixel), // %2
936 "=&r"(x0), // %3
937 "=&r"(x1), // %4
938 #if defined(__x86_64__)
939 "+rm"(dst_width) // %5
940 #else
941 "+m"(dst_width) // %5
942 #endif
943 : "rm"(x), // %6
944 "rm"(dx), // %7
945 #if defined(__x86_64__)
946 "x"(kFsub80), // %8
947 "x"(kFadd40) // %9
948 #else
949 "m"(kFsub80), // %8
950 "m"(kFadd40) // %9
951 #endif
952 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
953 "xmm7");
954 }
955
956 // Reads 16 pixels, duplicates them and writes 32 pixels.
957 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
958 void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
959 const uint8_t* src_ptr,
960 int dst_width,
961 int x,
962 int dx) {
963 (void)x;
964 (void)dx;
965 asm volatile(
966
967 LABELALIGN
968 "1: \n"
969 "movdqu (%1),%%xmm0 \n"
970 "lea 0x10(%1),%1 \n"
971 "movdqa %%xmm0,%%xmm1 \n"
972 "punpcklbw %%xmm0,%%xmm0 \n"
973 "punpckhbw %%xmm1,%%xmm1 \n"
974 "movdqu %%xmm0,(%0) \n"
975 "movdqu %%xmm1,0x10(%0) \n"
976 "lea 0x20(%0),%0 \n"
977 "sub $0x20,%2 \n"
978 "jg 1b \n"
979
980 : "+r"(dst_ptr), // %0
981 "+r"(src_ptr), // %1
982 "+r"(dst_width) // %2
983 ::"memory",
984 "cc", "xmm0", "xmm1");
985 }
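// Scalar sketch of the 2x column upsample above (illustrative only):
//   for (int x = 0; x < dst_width; x += 2) {
//     dst_ptr[x] = dst_ptr[x + 1] = src_ptr[x / 2];
//   }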
986
987 void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
988 ptrdiff_t src_stride,
989 uint8_t* dst_argb,
990 int dst_width) {
991 (void)src_stride;
992 asm volatile(
993
994 LABELALIGN
995 "1: \n"
996 "movdqu (%0),%%xmm0 \n"
997 "movdqu 0x10(%0),%%xmm1 \n"
998 "lea 0x20(%0),%0 \n"
999 "shufps $0xdd,%%xmm1,%%xmm0 \n"
1000 "movdqu %%xmm0,(%1) \n"
1001 "lea 0x10(%1),%1 \n"
1002 "sub $0x4,%2 \n"
1003 "jg 1b \n"
1004 : "+r"(src_argb), // %0
1005 "+r"(dst_argb), // %1
1006 "+r"(dst_width) // %2
1007 ::"memory",
1008 "cc", "xmm0", "xmm1");
1009 }
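// Scalar sketch of the ARGB point-sample above; each pixel is 4 bytes and
// shufps $0xdd keeps the odd pixel of every pair (illustrative only):
//   const uint32_t* src = (const uint32_t*)src_argb;
//   uint32_t* dst = (uint32_t*)dst_argb;
//   for (int x = 0; x < dst_width; ++x) {
//     dst[x] = src[x * 2 + 1];
//   }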
1010
1011 void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
1012 ptrdiff_t src_stride,
1013 uint8_t* dst_argb,
1014 int dst_width) {
1015 (void)src_stride;
1016 asm volatile(
1017
1018 LABELALIGN
1019 "1: \n"
1020 "movdqu (%0),%%xmm0 \n"
1021 "movdqu 0x10(%0),%%xmm1 \n"
1022 "lea 0x20(%0),%0 \n"
1023 "movdqa %%xmm0,%%xmm2 \n"
1024 "shufps $0x88,%%xmm1,%%xmm0 \n"
1025 "shufps $0xdd,%%xmm1,%%xmm2 \n"
1026 "pavgb %%xmm2,%%xmm0 \n"
1027 "movdqu %%xmm0,(%1) \n"
1028 "lea 0x10(%1),%1 \n"
1029 "sub $0x4,%2 \n"
1030 "jg 1b \n"
1031 : "+r"(src_argb), // %0
1032 "+r"(dst_argb), // %1
1033 "+r"(dst_width) // %2
1034 ::"memory",
1035 "cc", "xmm0", "xmm1", "xmm2");
1036 }
1037
1038 void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
1039 ptrdiff_t src_stride,
1040 uint8_t* dst_argb,
1041 int dst_width) {
1042 asm volatile(
1043
1044 LABELALIGN
1045 "1: \n"
1046 "movdqu (%0),%%xmm0 \n"
1047 "movdqu 0x10(%0),%%xmm1 \n"
1048 "movdqu 0x00(%0,%3,1),%%xmm2 \n"
1049 "movdqu 0x10(%0,%3,1),%%xmm3 \n"
1050 "lea 0x20(%0),%0 \n"
1051 "pavgb %%xmm2,%%xmm0 \n"
1052 "pavgb %%xmm3,%%xmm1 \n"
1053 "movdqa %%xmm0,%%xmm2 \n"
1054 "shufps $0x88,%%xmm1,%%xmm0 \n"
1055 "shufps $0xdd,%%xmm1,%%xmm2 \n"
1056 "pavgb %%xmm2,%%xmm0 \n"
1057 "movdqu %%xmm0,(%1) \n"
1058 "lea 0x10(%1),%1 \n"
1059 "sub $0x4,%2 \n"
1060 "jg 1b \n"
1061 : "+r"(src_argb), // %0
1062 "+r"(dst_argb), // %1
1063 "+r"(dst_width) // %2
1064 : "r"((intptr_t)(src_stride)) // %3
1065 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
1066 }
1067
1068 // Reads 4 pixels at a time.
1069 // Alignment requirement: dst_argb 16 byte aligned.
1070 void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
1071 ptrdiff_t src_stride,
1072 int src_stepx,
1073 uint8_t* dst_argb,
1074 int dst_width) {
1075 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
1076 intptr_t src_stepx_x12;
1077 (void)src_stride;
1078 asm volatile(
1079 "lea 0x00(,%1,4),%1 \n"
1080 "lea 0x00(%1,%1,2),%4 \n"
1081
1082 LABELALIGN
1083 "1: \n"
1084 "movd (%0),%%xmm0 \n"
1085 "movd 0x00(%0,%1,1),%%xmm1 \n"
1086 "punpckldq %%xmm1,%%xmm0 \n"
1087 "movd 0x00(%0,%1,2),%%xmm2 \n"
1088 "movd 0x00(%0,%4,1),%%xmm3 \n"
1089 "lea 0x00(%0,%1,4),%0 \n"
1090 "punpckldq %%xmm3,%%xmm2 \n"
1091 "punpcklqdq %%xmm2,%%xmm0 \n"
1092 "movdqu %%xmm0,(%2) \n"
1093 "lea 0x10(%2),%2 \n"
1094 "sub $0x4,%3 \n"
1095 "jg 1b \n"
1096 : "+r"(src_argb), // %0
1097 "+r"(src_stepx_x4), // %1
1098 "+r"(dst_argb), // %2
1099 "+r"(dst_width), // %3
1100 "=&r"(src_stepx_x12) // %4
1101 ::"memory",
1102 "cc", "xmm0", "xmm1", "xmm2", "xmm3");
1103 }
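// Scalar sketch of the even-pixel ARGB sampler above; src_stepx is a pixel
// step, converted to a byte step by the lea instructions (illustrative only):
//   const uint32_t* src = (const uint32_t*)src_argb;
//   uint32_t* dst = (uint32_t*)dst_argb;
//   for (int x = 0; x < dst_width; ++x) {
//     dst[x] = src[x * src_stepx];
//   }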
1104
1105 // Blends four 2x2 to 4x1.
1106 // Alignment requirement: dst_argb 16 byte aligned.
1107 void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
1108 ptrdiff_t src_stride,
1109 int src_stepx,
1110 uint8_t* dst_argb,
1111 int dst_width) {
1112 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
1113 intptr_t src_stepx_x12;
1114 intptr_t row1 = (intptr_t)(src_stride);
1115 asm volatile(
1116 "lea 0x00(,%1,4),%1 \n"
1117 "lea 0x00(%1,%1,2),%4 \n"
1118 "lea 0x00(%0,%5,1),%5 \n"
1119
1120 LABELALIGN
1121 "1: \n"
1122 "movq (%0),%%xmm0 \n"
1123 "movhps 0x00(%0,%1,1),%%xmm0 \n"
1124 "movq 0x00(%0,%1,2),%%xmm1 \n"
1125 "movhps 0x00(%0,%4,1),%%xmm1 \n"
1126 "lea 0x00(%0,%1,4),%0 \n"
1127 "movq (%5),%%xmm2 \n"
1128 "movhps 0x00(%5,%1,1),%%xmm2 \n"
1129 "movq 0x00(%5,%1,2),%%xmm3 \n"
1130 "movhps 0x00(%5,%4,1),%%xmm3 \n"
1131 "lea 0x00(%5,%1,4),%5 \n"
1132 "pavgb %%xmm2,%%xmm0 \n"
1133 "pavgb %%xmm3,%%xmm1 \n"
1134 "movdqa %%xmm0,%%xmm2 \n"
1135 "shufps $0x88,%%xmm1,%%xmm0 \n"
1136 "shufps $0xdd,%%xmm1,%%xmm2 \n"
1137 "pavgb %%xmm2,%%xmm0 \n"
1138 "movdqu %%xmm0,(%2) \n"
1139 "lea 0x10(%2),%2 \n"
1140 "sub $0x4,%3 \n"
1141 "jg 1b \n"
1142 : "+r"(src_argb), // %0
1143 "+r"(src_stepx_x4), // %1
1144 "+r"(dst_argb), // %2
1145 "+rm"(dst_width), // %3
1146 "=&r"(src_stepx_x12), // %4
1147 "+r"(row1) // %5
1148 ::"memory",
1149 "cc", "xmm0", "xmm1", "xmm2", "xmm3");
1150 }
1151
1152 void ScaleARGBCols_SSE2(uint8_t* dst_argb,
1153 const uint8_t* src_argb,
1154 int dst_width,
1155 int x,
1156 int dx) {
1157 intptr_t x0, x1;
1158 asm volatile(
1159 "movd %5,%%xmm2 \n"
1160 "movd %6,%%xmm3 \n"
1161 "pshufd $0x0,%%xmm2,%%xmm2 \n"
1162 "pshufd $0x11,%%xmm3,%%xmm0 \n"
1163 "paddd %%xmm0,%%xmm2 \n"
1164 "paddd %%xmm3,%%xmm3 \n"
1165 "pshufd $0x5,%%xmm3,%%xmm0 \n"
1166 "paddd %%xmm0,%%xmm2 \n"
1167 "paddd %%xmm3,%%xmm3 \n"
1168 "pshufd $0x0,%%xmm3,%%xmm3 \n"
1169 "pextrw $0x1,%%xmm2,%k0 \n"
1170 "pextrw $0x3,%%xmm2,%k1 \n"
1171 "cmp $0x0,%4 \n"
1172 "jl 99f \n"
1173 "sub $0x4,%4 \n"
1174 "jl 49f \n"
1175
1176 LABELALIGN
1177 "40: \n"
1178 "movd 0x00(%3,%0,4),%%xmm0 \n"
1179 "movd 0x00(%3,%1,4),%%xmm1 \n"
1180 "pextrw $0x5,%%xmm2,%k0 \n"
1181 "pextrw $0x7,%%xmm2,%k1 \n"
1182 "paddd %%xmm3,%%xmm2 \n"
1183 "punpckldq %%xmm1,%%xmm0 \n"
1184 "movd 0x00(%3,%0,4),%%xmm1 \n"
1185 "movd 0x00(%3,%1,4),%%xmm4 \n"
1186 "pextrw $0x1,%%xmm2,%k0 \n"
1187 "pextrw $0x3,%%xmm2,%k1 \n"
1188 "punpckldq %%xmm4,%%xmm1 \n"
1189 "punpcklqdq %%xmm1,%%xmm0 \n"
1190 "movdqu %%xmm0,(%2) \n"
1191 "lea 0x10(%2),%2 \n"
1192 "sub $0x4,%4 \n"
1193 "jge 40b \n"
1194
1195 "49: \n"
1196 "test $0x2,%4 \n"
1197 "je 29f \n"
1198 "movd 0x00(%3,%0,4),%%xmm0 \n"
1199 "movd 0x00(%3,%1,4),%%xmm1 \n"
1200 "pextrw $0x5,%%xmm2,%k0 \n"
1201 "punpckldq %%xmm1,%%xmm0 \n"
1202 "movq %%xmm0,(%2) \n"
1203 "lea 0x8(%2),%2 \n"
1204 "29: \n"
1205 "test $0x1,%4 \n"
1206 "je 99f \n"
1207 "movd 0x00(%3,%0,4),%%xmm0 \n"
1208 "movd %%xmm0,(%2) \n"
1209 "99: \n"
1210 : "=&a"(x0), // %0
1211 "=&d"(x1), // %1
1212 "+r"(dst_argb), // %2
1213 "+r"(src_argb), // %3
1214 "+r"(dst_width) // %4
1215 : "rm"(x), // %5
1216 "rm"(dx) // %6
1217 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1218 }
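// Scalar sketch of the ARGB column sampler above; x and dx are 16.16 fixed
// point source positions (illustrative only):
//   const uint32_t* src = (const uint32_t*)src_argb;
//   uint32_t* dst = (uint32_t*)dst_argb;
//   for (int i = 0; i < dst_width; ++i) {
//     dst[i] = src[x >> 16];
//     x += dx;
//   }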
1219
1220 // Reads 4 pixels, duplicates them and writes 8 pixels.
1221 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
1222 void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
1223 const uint8_t* src_argb,
1224 int dst_width,
1225 int x,
1226 int dx) {
1227 (void)x;
1228 (void)dx;
1229 asm volatile(
1230
1231 LABELALIGN
1232 "1: \n"
1233 "movdqu (%1),%%xmm0 \n"
1234 "lea 0x10(%1),%1 \n"
1235 "movdqa %%xmm0,%%xmm1 \n"
1236 "punpckldq %%xmm0,%%xmm0 \n"
1237 "punpckhdq %%xmm1,%%xmm1 \n"
1238 "movdqu %%xmm0,(%0) \n"
1239 "movdqu %%xmm1,0x10(%0) \n"
1240 "lea 0x20(%0),%0 \n"
1241 "sub $0x8,%2 \n"
1242 "jg 1b \n"
1243
1244 : "+r"(dst_argb), // %0
1245 "+r"(src_argb), // %1
1246 "+r"(dst_width) // %2
1247 ::"memory",
1248 "cc", "xmm0", "xmm1");
1249 }
1250
1251 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
1252 static const uvec8 kShuffleColARGB = {
1253 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
1254 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
1255 };
1256
1257 // Shuffle table for duplicating 2 fractions into 8 bytes each
1258 static const uvec8 kShuffleFractions = {
1259 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
1260 };
1261
1262 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
1263 void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
1264 const uint8_t* src_argb,
1265 int dst_width,
1266 int x,
1267 int dx) {
1268 intptr_t x0, x1;
1269 asm volatile(
1270 "movdqa %0,%%xmm4 \n"
1271 "movdqa %1,%%xmm5 \n"
1272 :
1273 : "m"(kShuffleColARGB), // %0
1274 "m"(kShuffleFractions) // %1
1275 );
1276
1277 asm volatile(
1278 "movd %5,%%xmm2 \n"
1279 "movd %6,%%xmm3 \n"
1280 "pcmpeqb %%xmm6,%%xmm6 \n"
1281 "psrlw $0x9,%%xmm6 \n"
1282 "pextrw $0x1,%%xmm2,%k3 \n"
1283 "sub $0x2,%2 \n"
1284 "jl 29f \n"
1285 "movdqa %%xmm2,%%xmm0 \n"
1286 "paddd %%xmm3,%%xmm0 \n"
1287 "punpckldq %%xmm0,%%xmm2 \n"
1288 "punpckldq %%xmm3,%%xmm3 \n"
1289 "paddd %%xmm3,%%xmm3 \n"
1290 "pextrw $0x3,%%xmm2,%k4 \n"
1291
1292 LABELALIGN
1293 "2: \n"
1294 "movdqa %%xmm2,%%xmm1 \n"
1295 "paddd %%xmm3,%%xmm2 \n"
1296 "movq 0x00(%1,%3,4),%%xmm0 \n"
1297 "psrlw $0x9,%%xmm1 \n"
1298 "movhps 0x00(%1,%4,4),%%xmm0 \n"
1299 "pshufb %%xmm5,%%xmm1 \n"
1300 "pshufb %%xmm4,%%xmm0 \n"
1301 "pxor %%xmm6,%%xmm1 \n"
1302 "pmaddubsw %%xmm1,%%xmm0 \n"
1303 "psrlw $0x7,%%xmm0 \n"
1304 "pextrw $0x1,%%xmm2,%k3 \n"
1305 "pextrw $0x3,%%xmm2,%k4 \n"
1306 "packuswb %%xmm0,%%xmm0 \n"
1307 "movq %%xmm0,(%0) \n"
1308 "lea 0x8(%0),%0 \n"
1309 "sub $0x2,%2 \n"
1310 "jge 2b \n"
1311
1312 LABELALIGN
1313 "29: \n"
1314 "add $0x1,%2 \n"
1315 "jl 99f \n"
1316 "psrlw $0x9,%%xmm2 \n"
1317 "movq 0x00(%1,%3,4),%%xmm0 \n"
1318 "pshufb %%xmm5,%%xmm2 \n"
1319 "pshufb %%xmm4,%%xmm0 \n"
1320 "pxor %%xmm6,%%xmm2 \n"
1321 "pmaddubsw %%xmm2,%%xmm0 \n"
1322 "psrlw $0x7,%%xmm0 \n"
1323 "packuswb %%xmm0,%%xmm0 \n"
1324 "movd %%xmm0,(%0) \n"
1325
1326 LABELALIGN
1327 "99: \n" // clang-format error.
1328
1329 : "+r"(dst_argb), // %0
1330 "+r"(src_argb), // %1
1331 "+rm"(dst_width), // %2
1332 "=&r"(x0), // %3
1333 "=&r"(x1) // %4
1334 : "rm"(x), // %5
1335 "rm"(dx) // %6
1336 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1337 }
1338
1339 // Divide num by div and return as 16.16 fixed point result.
1340 int FixedDiv_X86(int num, int div) {
1341 asm volatile(
1342 "cdq \n"
1343 "shld $0x10,%%eax,%%edx \n"
1344 "shl $0x10,%%eax \n"
1345 "idiv %1 \n"
1346 "mov %0, %%eax \n"
1347 : "+a"(num) // %0
1348 : "c"(div) // %1
1349 : "memory", "cc", "edx");
1350 return num;
1351 }
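// Equivalent C expression (illustrative): a 64-bit 16.16 division, e.g.
// FixedDiv_X86(640, 1280) == 0x8000 (0.5 in 16.16 fixed point):
//   int fixed = (int)(((int64_t)num << 16) / div);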
1352
1353 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
1354 int FixedDiv1_X86(int num, int div) {
1355 asm volatile(
1356 "cdq \n"
1357 "shld $0x10,%%eax,%%edx \n"
1358 "shl $0x10,%%eax \n"
1359 "sub $0x10001,%%eax \n"
1360 "sbb $0x0,%%edx \n"
1361 "sub $0x1,%1 \n"
1362 "idiv %1 \n"
1363 "mov %0, %%eax \n"
1364 : "+a"(num) // %0
1365 : "c"(div) // %1
1366 : "memory", "cc", "edx");
1367 return num;
1368 }
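// Equivalent C expression (illustrative), matching the subtraction and the
// div - 1 divisor in the asm above:
//   int fixed = (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));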
1369
1370 #ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
1371 // Shuffle table for splitting UV into upper and lower part of register.
1372 static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
1373 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
1374 static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u,
1375 6u, 14u, 0x80, 0x80, 0x80, 0x80,
1376 0x80, 0x80, 0x80, 0x80};
1377
1378 void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
1379 ptrdiff_t src_stride,
1380 uint8_t* dst_ptr,
1381 int dst_width) {
1382 asm volatile(
1383 "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
1384 "psrlw $0xf,%%xmm4 \n"
1385 "packuswb %%xmm4,%%xmm4 \n"
1386 "pxor %%xmm5, %%xmm5 \n" // zero
1387 "movdqa %4,%%xmm1 \n" // split shuffler
1388 "movdqa %5,%%xmm3 \n" // merge shuffler
1389
1390 LABELALIGN
1391 "1: \n"
1392 "movdqu (%0),%%xmm0 \n" // 8 UV row 0
1393 "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
1394 "lea 0x10(%0),%0 \n"
1395 "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
1396 "pshufb %%xmm1,%%xmm2 \n"
1397 "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
1398 "pmaddubsw %%xmm4,%%xmm2 \n"
1399 "paddw %%xmm2,%%xmm0 \n" // vertical add
1400 "psrlw $0x1,%%xmm0 \n" // round
1401 "pavgw %%xmm5,%%xmm0 \n"
1402 "pshufb %%xmm3,%%xmm0 \n" // merge uv
1403 "movq %%xmm0,(%1) \n"
1404 "lea 0x8(%1),%1 \n" // 4 UV
1405 "sub $0x4,%2 \n"
1406 "jg 1b \n"
1407 : "+r"(src_ptr), // %0
1408 "+r"(dst_ptr), // %1
1409 "+r"(dst_width) // %2
1410 : "r"((intptr_t)(src_stride)), // %3
1411 "m"(kShuffleSplitUV), // %4
1412 "m"(kShuffleMergeUV) // %5
1413 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1414 }
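// Scalar sketch of the UV 2x2 box above; U and V bytes are interleaved, so
// each output UV pair averages a 2x2 block per channel (illustrative only):
//   const uint8_t* s = src_ptr;
//   const uint8_t* t = src_ptr + src_stride;
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x * 2 + 0] = (uint8_t)(
//         (s[x * 4 + 0] + s[x * 4 + 2] + t[x * 4 + 0] + t[x * 4 + 2] + 2) >> 2);
//     dst_ptr[x * 2 + 1] = (uint8_t)(
//         (s[x * 4 + 1] + s[x * 4 + 3] + t[x * 4 + 1] + t[x * 4 + 3] + 2) >> 2);
//   }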
1415 #endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
1416
1417 #ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
1418 void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
1419 ptrdiff_t src_stride,
1420 uint8_t* dst_ptr,
1421 int dst_width) {
1422 asm volatile(
1423 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
1424 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
1425 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
1426 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
1427 "vbroadcastf128 %4,%%ymm1 \n" // split shuffler
1428 "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
1429
1430 LABELALIGN
1431 "1: \n"
1432 "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
1433 "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
1434 "lea 0x20(%0),%0 \n"
1435 "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
1436 "vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
1437 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
1438 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
1439 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add
1440 "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round
1441 "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
1442 "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
1443 "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
1444 "vmovdqu %%xmm0,(%1) \n"
1445 "lea 0x10(%1),%1 \n" // 8 UV
1446 "sub $0x8,%2 \n"
1447 "jg 1b \n"
1448 "vzeroupper \n"
1449 : "+r"(src_ptr), // %0
1450 "+r"(dst_ptr), // %1
1451 "+r"(dst_width) // %2
1452 : "r"((intptr_t)(src_stride)), // %3
1453 "m"(kShuffleSplitUV), // %4
1454 "m"(kShuffleMergeUV) // %5
1455 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1456 }
1457 #endif // HAS_SCALEUVROWDOWN2BOX_AVX2
1458
1459 #endif // defined(__x86_64__) || defined(__i386__)
1460
1461 #ifdef __cplusplus
1462 } // extern "C"
1463 } // namespace libyuv
1464 #endif
1465