1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12 #include "libyuv/scale_row.h"
13
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18
19 // This module is for GCC x86 and x64.
20 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
21
22 // Offsets for source bytes 0 to 9
23 static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
24 128, 128, 128, 128, 128, 128, 128, 128};
25
26 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
27 static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
28 128, 128, 128, 128, 128, 128, 128, 128};
29
30 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
31 static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
32 128, 128, 128, 128, 128, 128, 128, 128};
33
34 // Offsets for source bytes 0 to 10
35 static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
36
37 // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
38 static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
39 8, 9, 9, 10, 10, 11, 12, 13};
40
41 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
42 static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
43 10, 11, 12, 13, 13, 14, 14, 15};
44
45 // Coefficients for source bytes 0 to 10
46 static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
47
48 // Coefficients for source bytes 10 to 21
49 static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
50
51 // Coefficients for source bytes 21 to 31
52 static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
53
54 // Rounding constant added before the final >>2 in the 3/4 filters.
55 static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
56
57 static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
58 128, 128, 128, 128, 128, 128, 128, 128};
59
60 static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
61 6, 8, 11, 14, 128, 128, 128, 128};
62
63 // Arrange words 0,3,6 into 0,1,2
64 static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
65 128, 128, 128, 128, 128, 128, 128, 128};
66
67 // Arrange words 0,3,6 into 3,4,5
68 static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
69 6, 7, 12, 13, 128, 128, 128, 128};
70
71 // Scaling values for boxes of 3x3 and 2x3
72 static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
73 65536 / 9, 65536 / 6, 0, 0};
74
75 // Arrange first value for pixels 0,1,2,3,4,5
76 static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
77 11, 128, 14, 128, 128, 128, 128, 128};
78
79 // Arrange second value for pixels 0,1,2,3,4,5
80 static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
81 12, 128, 15, 128, 128, 128, 128, 128};
82
83 // Arrange third value for pixels 0,1,2,3,4,5
84 static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
85 13, 128, 128, 128, 128, 128, 128, 128};
86
87 // Scaling values for boxes of 3x2 and 2x2
88 static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
89 65536 / 3, 65536 / 2, 0, 0};
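// kScaleAc33 and kScaleAb2 are 16-bit fixed-point reciprocals: pmulhuw
// computes (sum * (65536 / n)) >> 16, which approximates dividing the box
// sum by 9, 6, 3 or 2 without an integer divide.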
90
91 // GCC versions of row functions are verbatim conversions from Visual C.
92 // Generated using gcc disassembly on Visual C object file:
93 // objdump -D yuvscaler.obj >yuvscaler.txt
94
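// ScaleRowDown2 keeps the odd source bytes (psrlw $8 selects the high byte
// of each word, packuswb repacks).  The Linear and Box variants build a
// vector of 0x01 bytes with pcmpeqb/psrlw/packuswb so that pmaddubsw sums
// adjacent byte pairs; pavgw against zero then rounds the halving.  The Box
// variant also adds the second row and halves twice for a rounded /4.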
95 void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
96 ptrdiff_t src_stride,
97 uint8_t* dst_ptr,
98 int dst_width) {
99 (void)src_stride;
100 asm volatile(
101 // 16 pixel loop.
102 LABELALIGN
103 "1: \n"
104 "movdqu (%0),%%xmm0 \n"
105 "movdqu 0x10(%0),%%xmm1 \n"
106 "lea 0x20(%0),%0 \n"
107 "psrlw $0x8,%%xmm0 \n"
108 "psrlw $0x8,%%xmm1 \n"
109 "packuswb %%xmm1,%%xmm0 \n"
110 "movdqu %%xmm0,(%1) \n"
111 "lea 0x10(%1),%1 \n"
112 "sub $0x10,%2 \n"
113 "jg 1b \n"
114 : "+r"(src_ptr), // %0
115 "+r"(dst_ptr), // %1
116 "+r"(dst_width) // %2
117 ::"memory",
118 "cc", "xmm0", "xmm1");
119 }
120
121 void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
122 ptrdiff_t src_stride,
123 uint8_t* dst_ptr,
124 int dst_width) {
125 (void)src_stride;
126 asm volatile(
127 "pcmpeqb %%xmm4,%%xmm4 \n"
128 "psrlw $0xf,%%xmm4 \n"
129 "packuswb %%xmm4,%%xmm4 \n"
130 "pxor %%xmm5,%%xmm5 \n"
131
132 LABELALIGN
133 "1: \n"
134 "movdqu (%0),%%xmm0 \n"
135 "movdqu 0x10(%0),%%xmm1 \n"
136 "lea 0x20(%0),%0 \n"
137 "pmaddubsw %%xmm4,%%xmm0 \n"
138 "pmaddubsw %%xmm4,%%xmm1 \n"
139 "pavgw %%xmm5,%%xmm0 \n"
140 "pavgw %%xmm5,%%xmm1 \n"
141 "packuswb %%xmm1,%%xmm0 \n"
142 "movdqu %%xmm0,(%1) \n"
143 "lea 0x10(%1),%1 \n"
144 "sub $0x10,%2 \n"
145 "jg 1b \n"
146 : "+r"(src_ptr), // %0
147 "+r"(dst_ptr), // %1
148 "+r"(dst_width) // %2
149 ::"memory",
150 "cc", "xmm0", "xmm1", "xmm4", "xmm5");
151 }
152
153 void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
154 ptrdiff_t src_stride,
155 uint8_t* dst_ptr,
156 int dst_width) {
157 asm volatile(
158 "pcmpeqb %%xmm4,%%xmm4 \n"
159 "psrlw $0xf,%%xmm4 \n"
160 "packuswb %%xmm4,%%xmm4 \n"
161 "pxor %%xmm5,%%xmm5 \n"
162
163 LABELALIGN
164 "1: \n"
165 "movdqu (%0),%%xmm0 \n"
166 "movdqu 0x10(%0),%%xmm1 \n"
167 "movdqu 0x00(%0,%3,1),%%xmm2 \n"
168 "movdqu 0x10(%0,%3,1),%%xmm3 \n"
169 "lea 0x20(%0),%0 \n"
170 "pmaddubsw %%xmm4,%%xmm0 \n"
171 "pmaddubsw %%xmm4,%%xmm1 \n"
172 "pmaddubsw %%xmm4,%%xmm2 \n"
173 "pmaddubsw %%xmm4,%%xmm3 \n"
174 "paddw %%xmm2,%%xmm0 \n"
175 "paddw %%xmm3,%%xmm1 \n"
176 "psrlw $0x1,%%xmm0 \n"
177 "psrlw $0x1,%%xmm1 \n"
178 "pavgw %%xmm5,%%xmm0 \n"
179 "pavgw %%xmm5,%%xmm1 \n"
180 "packuswb %%xmm1,%%xmm0 \n"
181 "movdqu %%xmm0,(%1) \n"
182 "lea 0x10(%1),%1 \n"
183 "sub $0x10,%2 \n"
184 "jg 1b \n"
185 : "+r"(src_ptr), // %0
186 "+r"(dst_ptr), // %1
187 "+r"(dst_width) // %2
188 : "r"((intptr_t)(src_stride)) // %3
189 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
190 }
191
192 #ifdef HAS_SCALEROWDOWN2_AVX2
193 void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
194 ptrdiff_t src_stride,
195 uint8_t* dst_ptr,
196 int dst_width) {
197 (void)src_stride;
198 asm volatile(LABELALIGN
199 "1: \n"
200 "vmovdqu (%0),%%ymm0 \n"
201 "vmovdqu 0x20(%0),%%ymm1 \n"
202 "lea 0x40(%0),%0 \n"
203 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
204 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
205 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
206 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
207 "vmovdqu %%ymm0,(%1) \n"
208 "lea 0x20(%1),%1 \n"
209 "sub $0x20,%2 \n"
210 "jg 1b \n"
211 "vzeroupper \n"
212 : "+r"(src_ptr), // %0
213 "+r"(dst_ptr), // %1
214 "+r"(dst_width) // %2
215 ::"memory",
216 "cc", "xmm0", "xmm1");
217 }
218
219 void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
220 ptrdiff_t src_stride,
221 uint8_t* dst_ptr,
222 int dst_width) {
223 (void)src_stride;
224 asm volatile(
225 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
226 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
227 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
228 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
229
230 LABELALIGN
231 "1: \n"
232 "vmovdqu (%0),%%ymm0 \n"
233 "vmovdqu 0x20(%0),%%ymm1 \n"
234 "lea 0x40(%0),%0 \n"
235 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
236 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
237 "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
238 "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
239 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
240 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
241 "vmovdqu %%ymm0,(%1) \n"
242 "lea 0x20(%1),%1 \n"
243 "sub $0x20,%2 \n"
244 "jg 1b \n"
245 "vzeroupper \n"
246 : "+r"(src_ptr), // %0
247 "+r"(dst_ptr), // %1
248 "+r"(dst_width) // %2
249 ::"memory",
250 "cc", "xmm0", "xmm1", "xmm4", "xmm5");
251 }
252
253 void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
254 ptrdiff_t src_stride,
255 uint8_t* dst_ptr,
256 int dst_width) {
257 asm volatile(
258 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
259 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
260 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
261 "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
262
263 LABELALIGN
264 "1: \n"
265 "vmovdqu (%0),%%ymm0 \n"
266 "vmovdqu 0x20(%0),%%ymm1 \n"
267 "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
268 "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
269 "lea 0x40(%0),%0 \n"
270 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
271 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
272 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
273 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
274 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
275 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
276 "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
277 "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
278 "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
279 "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
280 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
281 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
282 "vmovdqu %%ymm0,(%1) \n"
283 "lea 0x20(%1),%1 \n"
284 "sub $0x20,%2 \n"
285 "jg 1b \n"
286 "vzeroupper \n"
287 : "+r"(src_ptr), // %0
288 "+r"(dst_ptr), // %1
289 "+r"(dst_width) // %2
290 : "r"((intptr_t)(src_stride)) // %3
291 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
292 }
293 #endif // HAS_SCALEROWDOWN2_AVX2
294
295 void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
296 ptrdiff_t src_stride,
297 uint8_t* dst_ptr,
298 int dst_width) {
299 (void)src_stride;
300 asm volatile(
301 "pcmpeqb %%xmm5,%%xmm5 \n"
302 "psrld $0x18,%%xmm5 \n"
303 "pslld $0x10,%%xmm5 \n"
304
305 LABELALIGN
306 "1: \n"
307 "movdqu (%0),%%xmm0 \n"
308 "movdqu 0x10(%0),%%xmm1 \n"
309 "lea 0x20(%0),%0 \n"
310 "pand %%xmm5,%%xmm0 \n"
311 "pand %%xmm5,%%xmm1 \n"
312 "packuswb %%xmm1,%%xmm0 \n"
313 "psrlw $0x8,%%xmm0 \n"
314 "packuswb %%xmm0,%%xmm0 \n"
315 "movq %%xmm0,(%1) \n"
316 "lea 0x8(%1),%1 \n"
317 "sub $0x8,%2 \n"
318 "jg 1b \n"
319 : "+r"(src_ptr), // %0
320 "+r"(dst_ptr), // %1
321 "+r"(dst_width) // %2
322 ::"memory",
323 "cc", "xmm0", "xmm1", "xmm5");
324 }
325
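// ScaleRowDown4Box averages 4x4 blocks: pmaddubsw folds each row's byte
// pairs, four rows are accumulated, phaddw folds the remaining column pairs,
// and the +8, >>4 at the end produces a rounded average of 16 source pixels.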
326 void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
327 ptrdiff_t src_stride,
328 uint8_t* dst_ptr,
329 int dst_width) {
330 intptr_t stridex3;
331 asm volatile(
332 "pcmpeqb %%xmm4,%%xmm4 \n"
333 "psrlw $0xf,%%xmm4 \n"
334 "movdqa %%xmm4,%%xmm5 \n"
335 "packuswb %%xmm4,%%xmm4 \n"
336 "psllw $0x3,%%xmm5 \n"
337 "lea 0x00(%4,%4,2),%3 \n"
338
339 LABELALIGN
340 "1: \n"
341 "movdqu (%0),%%xmm0 \n"
342 "movdqu 0x10(%0),%%xmm1 \n"
343 "movdqu 0x00(%0,%4,1),%%xmm2 \n"
344 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
345 "pmaddubsw %%xmm4,%%xmm0 \n"
346 "pmaddubsw %%xmm4,%%xmm1 \n"
347 "pmaddubsw %%xmm4,%%xmm2 \n"
348 "pmaddubsw %%xmm4,%%xmm3 \n"
349 "paddw %%xmm2,%%xmm0 \n"
350 "paddw %%xmm3,%%xmm1 \n"
351 "movdqu 0x00(%0,%4,2),%%xmm2 \n"
352 "movdqu 0x10(%0,%4,2),%%xmm3 \n"
353 "pmaddubsw %%xmm4,%%xmm2 \n"
354 "pmaddubsw %%xmm4,%%xmm3 \n"
355 "paddw %%xmm2,%%xmm0 \n"
356 "paddw %%xmm3,%%xmm1 \n"
357 "movdqu 0x00(%0,%3,1),%%xmm2 \n"
358 "movdqu 0x10(%0,%3,1),%%xmm3 \n"
359 "lea 0x20(%0),%0 \n"
360 "pmaddubsw %%xmm4,%%xmm2 \n"
361 "pmaddubsw %%xmm4,%%xmm3 \n"
362 "paddw %%xmm2,%%xmm0 \n"
363 "paddw %%xmm3,%%xmm1 \n"
364 "phaddw %%xmm1,%%xmm0 \n"
365 "paddw %%xmm5,%%xmm0 \n"
366 "psrlw $0x4,%%xmm0 \n"
367 "packuswb %%xmm0,%%xmm0 \n"
368 "movq %%xmm0,(%1) \n"
369 "lea 0x8(%1),%1 \n"
370 "sub $0x8,%2 \n"
371 "jg 1b \n"
372 : "+r"(src_ptr), // %0
373 "+r"(dst_ptr), // %1
374 "+r"(dst_width), // %2
375 "=&r"(stridex3) // %3
376 : "r"((intptr_t)(src_stride)) // %4
377 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
378 }
379
380 #ifdef HAS_SCALEROWDOWN4_AVX2
381 void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
382 ptrdiff_t src_stride,
383 uint8_t* dst_ptr,
384 int dst_width) {
385 (void)src_stride;
386 asm volatile(
387 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
388 "vpsrld $0x18,%%ymm5,%%ymm5 \n"
389 "vpslld $0x10,%%ymm5,%%ymm5 \n"
390
391 LABELALIGN
392 "1: \n"
393 "vmovdqu (%0),%%ymm0 \n"
394 "vmovdqu 0x20(%0),%%ymm1 \n"
395 "lea 0x40(%0),%0 \n"
396 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
397 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
398 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
399 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
400 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
401 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
402 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
403 "vmovdqu %%xmm0,(%1) \n"
404 "lea 0x10(%1),%1 \n"
405 "sub $0x10,%2 \n"
406 "jg 1b \n"
407 "vzeroupper \n"
408 : "+r"(src_ptr), // %0
409 "+r"(dst_ptr), // %1
410 "+r"(dst_width) // %2
411 ::"memory",
412 "cc", "xmm0", "xmm1", "xmm5");
413 }
414
415 void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
416 ptrdiff_t src_stride,
417 uint8_t* dst_ptr,
418 int dst_width) {
419 asm volatile(
420 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
421 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
422 "vpsllw $0x3,%%ymm4,%%ymm5 \n"
423 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
424
425 LABELALIGN
426 "1: \n"
427 "vmovdqu (%0),%%ymm0 \n"
428 "vmovdqu 0x20(%0),%%ymm1 \n"
429 "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
430 "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
431 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
432 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
433 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
434 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
435 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
436 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
437 "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
438 "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
439 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
440 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
441 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
442 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
443 "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
444 "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
445 "lea 0x40(%0),%0 \n"
446 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
447 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
448 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
449 "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
450 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
451 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
452 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
453 "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
454 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
455 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
456 "vmovdqu %%xmm0,(%1) \n"
457 "lea 0x10(%1),%1 \n"
458 "sub $0x10,%2 \n"
459 "jg 1b \n"
460 "vzeroupper \n"
461 : "+r"(src_ptr), // %0
462 "+r"(dst_ptr), // %1
463 "+r"(dst_width) // %2
464 : "r"((intptr_t)(src_stride)), // %3
465 "r"((intptr_t)(src_stride * 3)) // %4
466 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
467 }
468 #endif // HAS_SCALEROWDOWN4_AVX2
469
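// The 3/4 filters turn every 32 source bytes into 24 output bytes.
// ScaleRowDown34 simply picks bytes via kShuf0/1/2.  The Box variants weight
// horizontal neighbours 3:1 or 2:2 with kShuf*1/kMadd*1, add kRound34 and
// shift right by 2; _1_Box averages the two source rows equally, while
// _0_Box weights them 3:1 by applying pavgb twice.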
470 void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
471 ptrdiff_t src_stride,
472 uint8_t* dst_ptr,
473 int dst_width) {
474 (void)src_stride;
475 asm volatile(
476 "movdqa %0,%%xmm3 \n"
477 "movdqa %1,%%xmm4 \n"
478 "movdqa %2,%%xmm5 \n"
479 :
480 : "m"(kShuf0), // %0
481 "m"(kShuf1), // %1
482 "m"(kShuf2) // %2
483 );
484 asm volatile(LABELALIGN
485 "1: \n"
486 "movdqu (%0),%%xmm0 \n"
487 "movdqu 0x10(%0),%%xmm2 \n"
488 "lea 0x20(%0),%0 \n"
489 "movdqa %%xmm2,%%xmm1 \n"
490 "palignr $0x8,%%xmm0,%%xmm1 \n"
491 "pshufb %%xmm3,%%xmm0 \n"
492 "pshufb %%xmm4,%%xmm1 \n"
493 "pshufb %%xmm5,%%xmm2 \n"
494 "movq %%xmm0,(%1) \n"
495 "movq %%xmm1,0x8(%1) \n"
496 "movq %%xmm2,0x10(%1) \n"
497 "lea 0x18(%1),%1 \n"
498 "sub $0x18,%2 \n"
499 "jg 1b \n"
500 : "+r"(src_ptr), // %0
501 "+r"(dst_ptr), // %1
502 "+r"(dst_width) // %2
503 ::"memory",
504 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
505 }
506
507 void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
508 ptrdiff_t src_stride,
509 uint8_t* dst_ptr,
510 int dst_width) {
511 asm volatile(
512 "movdqa %0,%%xmm2 \n" // kShuf01
513 "movdqa %1,%%xmm3 \n" // kShuf11
514 "movdqa %2,%%xmm4 \n" // kShuf21
515 :
516 : "m"(kShuf01), // %0
517 "m"(kShuf11), // %1
518 "m"(kShuf21) // %2
519 );
520 asm volatile(
521 "movdqa %0,%%xmm5 \n" // kMadd01
522 "movdqa %1,%%xmm0 \n" // kMadd11
523 "movdqa %2,%%xmm1 \n" // kRound34
524 :
525 : "m"(kMadd01), // %0
526 "m"(kMadd11), // %1
527 "m"(kRound34) // %2
528 );
529 asm volatile(LABELALIGN
530 "1: \n"
531 "movdqu (%0),%%xmm6 \n"
532 "movdqu 0x00(%0,%3,1),%%xmm7 \n"
533 "pavgb %%xmm7,%%xmm6 \n"
534 "pshufb %%xmm2,%%xmm6 \n"
535 "pmaddubsw %%xmm5,%%xmm6 \n"
536 "paddsw %%xmm1,%%xmm6 \n"
537 "psrlw $0x2,%%xmm6 \n"
538 "packuswb %%xmm6,%%xmm6 \n"
539 "movq %%xmm6,(%1) \n"
540 "movdqu 0x8(%0),%%xmm6 \n"
541 "movdqu 0x8(%0,%3,1),%%xmm7 \n"
542 "pavgb %%xmm7,%%xmm6 \n"
543 "pshufb %%xmm3,%%xmm6 \n"
544 "pmaddubsw %%xmm0,%%xmm6 \n"
545 "paddsw %%xmm1,%%xmm6 \n"
546 "psrlw $0x2,%%xmm6 \n"
547 "packuswb %%xmm6,%%xmm6 \n"
548 "movq %%xmm6,0x8(%1) \n"
549 "movdqu 0x10(%0),%%xmm6 \n"
550 "movdqu 0x10(%0,%3,1),%%xmm7 \n"
551 "lea 0x20(%0),%0 \n"
552 "pavgb %%xmm7,%%xmm6 \n"
553 "pshufb %%xmm4,%%xmm6 \n"
554 "pmaddubsw %4,%%xmm6 \n"
555 "paddsw %%xmm1,%%xmm6 \n"
556 "psrlw $0x2,%%xmm6 \n"
557 "packuswb %%xmm6,%%xmm6 \n"
558 "movq %%xmm6,0x10(%1) \n"
559 "lea 0x18(%1),%1 \n"
560 "sub $0x18,%2 \n"
561 "jg 1b \n"
562 : "+r"(src_ptr), // %0
563 "+r"(dst_ptr), // %1
564 "+r"(dst_width) // %2
565 : "r"((intptr_t)(src_stride)), // %3
566 "m"(kMadd21) // %4
567 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
568 "xmm6", "xmm7");
569 }
570
571 void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
572 ptrdiff_t src_stride,
573 uint8_t* dst_ptr,
574 int dst_width) {
575 asm volatile(
576 "movdqa %0,%%xmm2 \n" // kShuf01
577 "movdqa %1,%%xmm3 \n" // kShuf11
578 "movdqa %2,%%xmm4 \n" // kShuf21
579 :
580 : "m"(kShuf01), // %0
581 "m"(kShuf11), // %1
582 "m"(kShuf21) // %2
583 );
584 asm volatile(
585 "movdqa %0,%%xmm5 \n" // kMadd01
586 "movdqa %1,%%xmm0 \n" // kMadd11
587 "movdqa %2,%%xmm1 \n" // kRound34
588 :
589 : "m"(kMadd01), // %0
590 "m"(kMadd11), // %1
591 "m"(kRound34) // %2
592 );
593
594 asm volatile(LABELALIGN
595 "1: \n"
596 "movdqu (%0),%%xmm6 \n"
597 "movdqu 0x00(%0,%3,1),%%xmm7 \n"
598 "pavgb %%xmm6,%%xmm7 \n"
599 "pavgb %%xmm7,%%xmm6 \n"
600 "pshufb %%xmm2,%%xmm6 \n"
601 "pmaddubsw %%xmm5,%%xmm6 \n"
602 "paddsw %%xmm1,%%xmm6 \n"
603 "psrlw $0x2,%%xmm6 \n"
604 "packuswb %%xmm6,%%xmm6 \n"
605 "movq %%xmm6,(%1) \n"
606 "movdqu 0x8(%0),%%xmm6 \n"
607 "movdqu 0x8(%0,%3,1),%%xmm7 \n"
608 "pavgb %%xmm6,%%xmm7 \n"
609 "pavgb %%xmm7,%%xmm6 \n"
610 "pshufb %%xmm3,%%xmm6 \n"
611 "pmaddubsw %%xmm0,%%xmm6 \n"
612 "paddsw %%xmm1,%%xmm6 \n"
613 "psrlw $0x2,%%xmm6 \n"
614 "packuswb %%xmm6,%%xmm6 \n"
615 "movq %%xmm6,0x8(%1) \n"
616 "movdqu 0x10(%0),%%xmm6 \n"
617 "movdqu 0x10(%0,%3,1),%%xmm7 \n"
618 "lea 0x20(%0),%0 \n"
619 "pavgb %%xmm6,%%xmm7 \n"
620 "pavgb %%xmm7,%%xmm6 \n"
621 "pshufb %%xmm4,%%xmm6 \n"
622 "pmaddubsw %4,%%xmm6 \n"
623 "paddsw %%xmm1,%%xmm6 \n"
624 "psrlw $0x2,%%xmm6 \n"
625 "packuswb %%xmm6,%%xmm6 \n"
626 "movq %%xmm6,0x10(%1) \n"
627 "lea 0x18(%1),%1 \n"
628 "sub $0x18,%2 \n"
629 "jg 1b \n"
630 : "+r"(src_ptr), // %0
631 "+r"(dst_ptr), // %1
632 "+r"(dst_width) // %2
633 : "r"((intptr_t)(src_stride)), // %3
634 "m"(kMadd21) // %4
635 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
636 "xmm6", "xmm7");
637 }
638
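// The 3/8 filters write 12 bytes (plain) or 6 bytes (box) per iteration.
// ScaleRowDown38 picks bytes with kShuf38a/b.  _3_Box sums three rows and
// three columns and scales with kScaleAc33; _2_Box averages two rows, sums
// three columns and scales with kScaleAb2; both divides use the pmulhuw
// reciprocals described above.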
639 void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
640 ptrdiff_t src_stride,
641 uint8_t* dst_ptr,
642 int dst_width) {
643 (void)src_stride;
644 asm volatile(
645 "movdqa %3,%%xmm4 \n"
646 "movdqa %4,%%xmm5 \n"
647
648 LABELALIGN
649 "1: \n"
650 "movdqu (%0),%%xmm0 \n"
651 "movdqu 0x10(%0),%%xmm1 \n"
652 "lea 0x20(%0),%0 \n"
653 "pshufb %%xmm4,%%xmm0 \n"
654 "pshufb %%xmm5,%%xmm1 \n"
655 "paddusb %%xmm1,%%xmm0 \n"
656 "movq %%xmm0,(%1) \n"
657 "movhlps %%xmm0,%%xmm1 \n"
658 "movd %%xmm1,0x8(%1) \n"
659 "lea 0xc(%1),%1 \n"
660 "sub $0xc,%2 \n"
661 "jg 1b \n"
662 : "+r"(src_ptr), // %0
663 "+r"(dst_ptr), // %1
664 "+r"(dst_width) // %2
665 : "m"(kShuf38a), // %3
666 "m"(kShuf38b) // %4
667 : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
668 }
669
670 void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
671 ptrdiff_t src_stride,
672 uint8_t* dst_ptr,
673 int dst_width) {
674 asm volatile(
675 "movdqa %0,%%xmm2 \n"
676 "movdqa %1,%%xmm3 \n"
677 "movdqa %2,%%xmm4 \n"
678 "movdqa %3,%%xmm5 \n"
679 :
680 : "m"(kShufAb0), // %0
681 "m"(kShufAb1), // %1
682 "m"(kShufAb2), // %2
683 "m"(kScaleAb2) // %3
684 );
685 asm volatile(LABELALIGN
686 "1: \n"
687 "movdqu (%0),%%xmm0 \n"
688 "movdqu 0x00(%0,%3,1),%%xmm1 \n"
689 "lea 0x10(%0),%0 \n"
690 "pavgb %%xmm1,%%xmm0 \n"
691 "movdqa %%xmm0,%%xmm1 \n"
692 "pshufb %%xmm2,%%xmm1 \n"
693 "movdqa %%xmm0,%%xmm6 \n"
694 "pshufb %%xmm3,%%xmm6 \n"
695 "paddusw %%xmm6,%%xmm1 \n"
696 "pshufb %%xmm4,%%xmm0 \n"
697 "paddusw %%xmm0,%%xmm1 \n"
698 "pmulhuw %%xmm5,%%xmm1 \n"
699 "packuswb %%xmm1,%%xmm1 \n"
700 "movd %%xmm1,(%1) \n"
701 "psrlq $0x10,%%xmm1 \n"
702 "movd %%xmm1,0x2(%1) \n"
703 "lea 0x6(%1),%1 \n"
704 "sub $0x6,%2 \n"
705 "jg 1b \n"
706 : "+r"(src_ptr), // %0
707 "+r"(dst_ptr), // %1
708 "+r"(dst_width) // %2
709 : "r"((intptr_t)(src_stride)) // %3
710 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
711 "xmm6");
712 }
713
714 void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
715 ptrdiff_t src_stride,
716 uint8_t* dst_ptr,
717 int dst_width) {
718 asm volatile(
719 "movdqa %0,%%xmm2 \n"
720 "movdqa %1,%%xmm3 \n"
721 "movdqa %2,%%xmm4 \n"
722 "pxor %%xmm5,%%xmm5 \n"
723 :
724 : "m"(kShufAc), // %0
725 "m"(kShufAc3), // %1
726 "m"(kScaleAc33) // %2
727 );
728 asm volatile(LABELALIGN
729 "1: \n"
730 "movdqu (%0),%%xmm0 \n"
731 "movdqu 0x00(%0,%3,1),%%xmm6 \n"
732 "movhlps %%xmm0,%%xmm1 \n"
733 "movhlps %%xmm6,%%xmm7 \n"
734 "punpcklbw %%xmm5,%%xmm0 \n"
735 "punpcklbw %%xmm5,%%xmm1 \n"
736 "punpcklbw %%xmm5,%%xmm6 \n"
737 "punpcklbw %%xmm5,%%xmm7 \n"
738 "paddusw %%xmm6,%%xmm0 \n"
739 "paddusw %%xmm7,%%xmm1 \n"
740 "movdqu 0x00(%0,%3,2),%%xmm6 \n"
741 "lea 0x10(%0),%0 \n"
742 "movhlps %%xmm6,%%xmm7 \n"
743 "punpcklbw %%xmm5,%%xmm6 \n"
744 "punpcklbw %%xmm5,%%xmm7 \n"
745 "paddusw %%xmm6,%%xmm0 \n"
746 "paddusw %%xmm7,%%xmm1 \n"
747 "movdqa %%xmm0,%%xmm6 \n"
748 "psrldq $0x2,%%xmm0 \n"
749 "paddusw %%xmm0,%%xmm6 \n"
750 "psrldq $0x2,%%xmm0 \n"
751 "paddusw %%xmm0,%%xmm6 \n"
752 "pshufb %%xmm2,%%xmm6 \n"
753 "movdqa %%xmm1,%%xmm7 \n"
754 "psrldq $0x2,%%xmm1 \n"
755 "paddusw %%xmm1,%%xmm7 \n"
756 "psrldq $0x2,%%xmm1 \n"
757 "paddusw %%xmm1,%%xmm7 \n"
758 "pshufb %%xmm3,%%xmm7 \n"
759 "paddusw %%xmm7,%%xmm6 \n"
760 "pmulhuw %%xmm4,%%xmm6 \n"
761 "packuswb %%xmm6,%%xmm6 \n"
762 "movd %%xmm6,(%1) \n"
763 "psrlq $0x10,%%xmm6 \n"
764 "movd %%xmm6,0x2(%1) \n"
765 "lea 0x6(%1),%1 \n"
766 "sub $0x6,%2 \n"
767 "jg 1b \n"
768 : "+r"(src_ptr), // %0
769 "+r"(dst_ptr), // %1
770 "+r"(dst_width) // %2
771 : "r"((intptr_t)(src_stride)) // %3
772 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
773 "xmm6", "xmm7");
774 }
775
776 static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5,
777 10, 11, 8, 9, 14, 15, 12, 13};
778
779 static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
780 3, 1, 1, 3, 3, 1, 1, 3};
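// 2x upsampling kernels.  Horizontally each output pixel is
// (3 * near + far + 2) >> 2; kLinearMadd31 provides the 3:1 weights for
// pmaddubsw, and kLinearShuffleFar swaps adjacent words so the interleaved
// near pairs become the matching far pairs.  The Bilinear variants combine
// two such rows with 9:3:3:1 weights and round with +8, >>4.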
781
782 #ifdef HAS_SCALEROWUP2LINEAR_SSE2
783 void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
784 uint8_t* dst_ptr,
785 int dst_width) {
786 asm volatile(
787 "pxor %%xmm0,%%xmm0 \n" // 0
788 "pcmpeqw %%xmm6,%%xmm6 \n"
789 "psrlw $15,%%xmm6 \n"
790 "psllw $1,%%xmm6 \n" // all 2
791
792 LABELALIGN
793 "1: \n"
794 "movq (%0),%%xmm1 \n" // 01234567
795 "movq 1(%0),%%xmm2 \n" // 12345678
796 "movdqa %%xmm1,%%xmm3 \n"
797 "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
798 "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
799 "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
800 "movdqa %%xmm1,%%xmm4 \n"
801 "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
802 "movdqa %%xmm2,%%xmm5 \n"
803 "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
804 "paddw %%xmm5,%%xmm4 \n"
805 "movdqa %%xmm3,%%xmm5 \n"
806 "paddw %%xmm6,%%xmm4 \n"
807 "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
808 "paddw %%xmm5,%%xmm5 \n"
809 "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
810 "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)
811
812 "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
813 "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
814 "paddw %%xmm2,%%xmm1 \n"
815 "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
816 "paddw %%xmm6,%%xmm1 \n"
817 "paddw %%xmm3,%%xmm3 \n"
818 "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
819 "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
820
821 "packuswb %%xmm1,%%xmm5 \n"
822 "movdqu %%xmm5,(%1) \n"
823
824 "lea 0x8(%0),%0 \n"
825 "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
826 "sub $0x10,%2 \n"
827 "jg 1b \n"
828 : "+r"(src_ptr), // %0
829 "+r"(dst_ptr), // %1
830 "+r"(dst_width) // %2
831 :
832 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
833 }
834 #endif
835
836 #ifdef HAS_SCALEROWUP2BILINEAR_SSE2
837 void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
838 ptrdiff_t src_stride,
839 uint8_t* dst_ptr,
840 ptrdiff_t dst_stride,
841 int dst_width) {
842 asm volatile(
843 LABELALIGN
844 "1: \n"
845 "pxor %%xmm0,%%xmm0 \n" // 0
846 // above line
847 "movq (%0),%%xmm1 \n" // 01234567
848 "movq 1(%0),%%xmm2 \n" // 12345678
849 "movdqa %%xmm1,%%xmm3 \n"
850 "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
851 "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
852 "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
853
854 "movdqa %%xmm1,%%xmm4 \n"
855 "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
856 "movdqa %%xmm2,%%xmm5 \n"
857 "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
858 "paddw %%xmm5,%%xmm4 \n" // near+far
859 "movdqa %%xmm3,%%xmm5 \n"
860 "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
861 "paddw %%xmm5,%%xmm5 \n" // 2*near
862 "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)
863
864 "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
865 "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
866 "paddw %%xmm2,%%xmm1 \n"
867 "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
868 "paddw %%xmm3,%%xmm3 \n" // 2*near
869 "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
870
871 // below line
872 "movq (%0,%3),%%xmm6 \n" // 01234567
873 "movq 1(%0,%3),%%xmm2 \n" // 12345678
874 "movdqa %%xmm6,%%xmm3 \n"
875 "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
876 "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677
877 "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
878
879 "movdqa %%xmm6,%%xmm5 \n"
880 "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16)
881 "movdqa %%xmm2,%%xmm7 \n"
882 "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16)
883 "paddw %%xmm7,%%xmm5 \n" // near+far
884 "movdqa %%xmm3,%%xmm7 \n"
885 "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16)
886 "paddw %%xmm7,%%xmm7 \n" // 2*near
887 "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo)
888
889 "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16)
890 "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
891 "paddw %%xmm6,%%xmm2 \n" // near+far
892 "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
893 "paddw %%xmm3,%%xmm3 \n" // 2*near
894 "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)
895
896 // xmm4 xmm1
897 // xmm5 xmm2
898 "pcmpeqw %%xmm0,%%xmm0 \n"
899 "psrlw $15,%%xmm0 \n"
900 "psllw $3,%%xmm0 \n" // all 8
901
902 "movdqa %%xmm4,%%xmm3 \n"
903 "movdqa %%xmm5,%%xmm6 \n"
904 "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo)
905 "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo)
906 "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
907 "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
908 "psrlw $4,%%xmm3 \n" // ^ div by 16
909
910 "movdqa %%xmm1,%%xmm7 \n"
911 "movdqa %%xmm2,%%xmm6 \n"
912 "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi)
913 "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi)
914 "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi)
915 "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi)
916 "psrlw $4,%%xmm7 \n" // ^ div by 16
917
918 "packuswb %%xmm7,%%xmm3 \n"
919 "movdqu %%xmm3,(%1) \n" // save above line
920
921 "movdqa %%xmm5,%%xmm3 \n"
922 "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo)
923 "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo)
924 "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
925 "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo)
926 "psrlw $4,%%xmm5 \n" // ^ div by 16
927
928 "movdqa %%xmm2,%%xmm3 \n"
929 "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi)
930 "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi)
931 "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
932 "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi)
933 "psrlw $4,%%xmm2 \n" // ^ div by 16
934
935 "packuswb %%xmm2,%%xmm5 \n"
936 "movdqu %%xmm5,(%1,%4) \n" // save below line
937
938 "lea 0x8(%0),%0 \n"
939 "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
940 "sub $0x10,%2 \n"
941 "jg 1b \n"
942 : "+r"(src_ptr), // %0
943 "+r"(dst_ptr), // %1
944 "+r"(dst_width) // %2
945 : "r"((intptr_t)(src_stride)), // %3
946 "r"((intptr_t)(dst_stride)) // %4
947 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
948 "xmm7");
949 }
950 #endif
951
952 #ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3
953 void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
954 uint16_t* dst_ptr,
955 int dst_width) {
956 asm volatile(
957 "movdqa %3,%%xmm5 \n"
958 "pcmpeqw %%xmm4,%%xmm4 \n"
959 "psrlw $15,%%xmm4 \n"
960 "psllw $1,%%xmm4 \n" // all 2
961
962 LABELALIGN
963 "1: \n"
964 "movdqu (%0),%%xmm0 \n" // 01234567 (16)
965 "movdqu 2(%0),%%xmm1 \n" // 12345678 (16)
966
967 "movdqa %%xmm0,%%xmm2 \n"
968 "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16)
969 "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16)
970
971 "movdqa %%xmm2,%%xmm3 \n"
972 "movdqa %%xmm0,%%xmm1 \n"
973 "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far)
974 "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far)
975
976 "paddw %%xmm4,%%xmm1 \n" // far+2
977 "paddw %%xmm4,%%xmm3 \n" // far+2
978 "paddw %%xmm0,%%xmm1 \n" // near+far+2
979 "paddw %%xmm2,%%xmm3 \n" // near+far+2
980 "paddw %%xmm0,%%xmm0 \n" // 2*near
981 "paddw %%xmm2,%%xmm2 \n" // 2*near
982 "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo)
983 "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi)
984
985 "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far
986 "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far
987 "movdqu %%xmm0,(%1) \n"
988 "movdqu %%xmm2,16(%1) \n"
989
990 "lea 0x10(%0),%0 \n"
991 "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
992 "sub $0x10,%2 \n"
993 "jg 1b \n"
994 : "+r"(src_ptr), // %0
995 "+r"(dst_ptr), // %1
996 "+r"(dst_width) // %2
997 : "m"(kLinearShuffleFar) // %3
998 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
999 }
1000 #endif
1001
1002 #ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3
1003 void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
1004 ptrdiff_t src_stride,
1005 uint16_t* dst_ptr,
1006 ptrdiff_t dst_stride,
1007 int dst_width) {
1008 asm volatile(
1009 "pcmpeqw %%xmm7,%%xmm7 \n"
1010 "psrlw $15,%%xmm7 \n"
1011 "psllw $3,%%xmm7 \n" // all 8
1012 "movdqa %5,%%xmm6 \n"
1013
1014 LABELALIGN
1015 "1: \n"
1016 // above line
1017 "movdqu (%0),%%xmm0 \n" // 01234567 (16)
1018 "movdqu 2(%0),%%xmm1 \n" // 12345678 (16)
1019 "movdqa %%xmm0,%%xmm2 \n"
1020 "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16)
1021 "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16)
1022 "movdqa %%xmm2,%%xmm3 \n"
1023 "movdqa %%xmm0,%%xmm1 \n"
1024 "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far)
1025 "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far)
1026 "paddw %%xmm0,%%xmm1 \n" // near+far
1027 "paddw %%xmm2,%%xmm3 \n" // near+far
1028 "paddw %%xmm0,%%xmm0 \n" // 2*near
1029 "paddw %%xmm2,%%xmm2 \n" // 2*near
1030 "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo)
1031 "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi)
1032
1033 // below line
1034 "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16)
1035 "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16)
1036 "movdqa %%xmm1,%%xmm3 \n"
1037 "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16)
1038 "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16)
1039 "movdqa %%xmm3,%%xmm5 \n"
1040 "movdqa %%xmm1,%%xmm4 \n"
1041 "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far)
1042 "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far)
1043 "paddw %%xmm1,%%xmm4 \n" // near+far
1044 "paddw %%xmm3,%%xmm5 \n" // near+far
1045 "paddw %%xmm1,%%xmm1 \n" // 2*near
1046 "paddw %%xmm3,%%xmm3 \n" // 2*near
1047 "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo)
1048 "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
1049
1050 // xmm0 xmm2
1051 // xmm1 xmm3
1052
1053 "movdqa %%xmm0,%%xmm4 \n"
1054 "movdqa %%xmm1,%%xmm5 \n"
1055 "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo)
1056 "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo)
1057 "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
1058 "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
1059 "psrlw $4,%%xmm4 \n" // ^ div by 16
1060 "movdqu %%xmm4,(%1) \n"
1061
1062 "movdqa %%xmm2,%%xmm4 \n"
1063 "movdqa %%xmm3,%%xmm5 \n"
1064 "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi)
1065 "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi)
1066 "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi)
1067 "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi)
1068 "psrlw $4,%%xmm4 \n" // ^ div by 16
1069 "movdqu %%xmm4,0x10(%1) \n"
1070
1071 "movdqa %%xmm1,%%xmm4 \n"
1072 "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo)
1073 "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo)
1074 "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo)
1075 "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo)
1076 "psrlw $4,%%xmm1 \n" // ^ div by 16
1077 "movdqu %%xmm1,(%1,%4,2) \n"
1078
1079 "movdqa %%xmm3,%%xmm4 \n"
1080 "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi)
1081 "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi)
1082 "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi)
1083 "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi)
1084 "psrlw $4,%%xmm3 \n" // ^ div by 16
1085 "movdqu %%xmm3,0x10(%1,%4,2) \n"
1086
1087 "lea 0x10(%0),%0 \n"
1088 "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
1089 "sub $0x10,%2 \n"
1090 "jg 1b \n"
1091 : "+r"(src_ptr), // %0
1092 "+r"(dst_ptr), // %1
1093 "+r"(dst_width) // %2
1094 : "r"((intptr_t)(src_stride)), // %3
1095 "r"((intptr_t)(dst_stride)), // %4
1096 "m"(kLinearShuffleFar) // %5
1097 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1098 }
1099 #endif
1100
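// The _12_ kernels above keep everything in 16-bit lanes: with 12-bit input
// the largest weighted sum (16 * sample + 8) still fits in a word.  Full
// 16-bit input would overflow, so the _16_ kernels below widen to 32-bit
// lanes (punpcklwd with zero, or vpmovzxwd) and run the same arithmetic on
// dwords before packing back down.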
1101 #ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
1102 void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
1103 uint16_t* dst_ptr,
1104 int dst_width) {
1105 asm volatile(
1106 "pxor %%xmm5,%%xmm5 \n"
1107 "pcmpeqd %%xmm4,%%xmm4 \n"
1108 "psrld $31,%%xmm4 \n"
1109 "pslld $1,%%xmm4 \n" // all 2
1110
1111 LABELALIGN
1112 "1: \n"
1113 "movq (%0),%%xmm0 \n" // 0123 (16b)
1114 "movq 2(%0),%%xmm1 \n" // 1234 (16b)
1115
1116 "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b)
1117 "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b)
1118
1119 "movdqa %%xmm0,%%xmm2 \n"
1120 "movdqa %%xmm1,%%xmm3 \n"
1121
1122 "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far)
1123 "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far)
1124
1125 "paddd %%xmm4,%%xmm2 \n" // far+2 (lo)
1126 "paddd %%xmm4,%%xmm3 \n" // far+2 (hi)
1127 "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo)
1128 "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi)
1129 "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
1130 "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
1131 "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo)
1132 "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
1133
1134 "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
1135 "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
1136 "packssdw %%xmm1,%%xmm0 \n"
1137 "pshufd $0b11011000,%%xmm0,%%xmm0 \n"
1138 "movdqu %%xmm0,(%1) \n"
1139
1140 "lea 0x8(%0),%0 \n"
1141 "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel
1142 "sub $0x8,%2 \n"
1143 "jg 1b \n"
1144 : "+r"(src_ptr), // %0
1145 "+r"(dst_ptr), // %1
1146 "+r"(dst_width) // %2
1147 :
1148 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1149 }
1150 #endif
1151
1152 #ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
1153 void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
1154 ptrdiff_t src_stride,
1155 uint16_t* dst_ptr,
1156 ptrdiff_t dst_stride,
1157 int dst_width) {
1158 asm volatile(
1159 "pxor %%xmm7,%%xmm7 \n"
1160 "pcmpeqd %%xmm6,%%xmm6 \n"
1161 "psrld $31,%%xmm6 \n"
1162 "pslld $3,%%xmm6 \n" // all 8
1163
1164 LABELALIGN
1165 "1: \n"
1166 "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
1167 "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
1168 "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v)
1169 "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v)
1170 "movdqa %%xmm0,%%xmm2 \n"
1171 "movdqa %%xmm1,%%xmm3 \n"
1172 "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo)
1173 "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi)
1174 "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo)
1175 "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi)
1176 "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo)
1177 "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi)
1178 "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
1179 "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
1180
1181 "movq (%0),%%xmm0 \n" // 0123 (16b)
1182 "movq 2(%0),%%xmm1 \n" // 1234 (16b)
1183 "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b)
1184 "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b)
1185 "movdqa %%xmm0,%%xmm2 \n"
1186 "movdqa %%xmm1,%%xmm3 \n"
1187 "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far)
1188 "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far)
1189 "paddd %%xmm0,%%xmm2 \n" // near+far (lo)
1190 "paddd %%xmm1,%%xmm3 \n" // near+far (hi)
1191 "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
1192 "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
1193 "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
1194 "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
1195
1196 "movq (%0,%3,2),%%xmm2 \n"
1197 "movq 2(%0,%3,2),%%xmm3 \n"
1198 "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b)
1199 "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b)
1200 "movdqa %%xmm2,%%xmm4 \n"
1201 "movdqa %%xmm3,%%xmm5 \n"
1202 "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far)
1203 "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far)
1204 "paddd %%xmm2,%%xmm4 \n" // near+far (lo)
1205 "paddd %%xmm3,%%xmm5 \n" // near+far (hi)
1206 "paddd %%xmm2,%%xmm2 \n" // 2*near (lo)
1207 "paddd %%xmm3,%%xmm3 \n" // 2*near (hi)
1208 "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo)
1209 "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
1210
1211 "movdqa %%xmm0,%%xmm4 \n"
1212 "movdqa %%xmm2,%%xmm5 \n"
1213 "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
1214 "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
1215 "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
1216 "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
1217 "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo)
1218
1219 "movdqa %%xmm2,%%xmm5 \n"
1220 "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo)
1221 "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
1222 "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo)
1223 "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
1224 "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo)
1225
1226 "movdqa %%xmm1,%%xmm0 \n"
1227 "movdqa %%xmm3,%%xmm2 \n"
1228 "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi)
1229 "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi)
1230 "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi)
1231 "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
1232 "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi)
1233
1234 "movdqa %%xmm3,%%xmm2 \n"
1235 "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi)
1236 "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi)
1237 "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
1238 "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
1239 "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi)
1240
1241 "packssdw %%xmm0,%%xmm4 \n"
1242 "pshufd $0b11011000,%%xmm4,%%xmm4 \n"
1243 "movdqu %%xmm4,(%1) \n" // store above
1244 "packssdw %%xmm2,%%xmm5 \n"
1245 "pshufd $0b11011000,%%xmm4,%%xmm4 \n"
1246 "movdqu %%xmm5,(%1,%4,2) \n" // store below
1247
1248 "lea 0x8(%0),%0 \n"
1249 "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel
1250 "sub $0x8,%2 \n"
1251 "jg 1b \n"
1252 : "+r"(src_ptr), // %0
1253 "+r"(dst_ptr), // %1
1254 "+r"(dst_width) // %2
1255 : "r"((intptr_t)(src_stride)), // %3
1256 "r"((intptr_t)(dst_stride)) // %4
1257 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1258 }
1259 #endif
1260
1261 #ifdef HAS_SCALEROWUP2LINEAR_SSSE3
1262 void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
1263 uint8_t* dst_ptr,
1264 int dst_width) {
1265 asm volatile(
1266 "pcmpeqw %%xmm4,%%xmm4 \n"
1267 "psrlw $15,%%xmm4 \n"
1268 "psllw $1,%%xmm4 \n" // all 2
1269 "movdqa %3,%%xmm3 \n"
1270
1271 LABELALIGN
1272 "1: \n"
1273 "movq (%0),%%xmm0 \n" // 01234567
1274 "movq 1(%0),%%xmm1 \n" // 12345678
1275 "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
1276 "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
1277 "movdqa %%xmm0,%%xmm2 \n"
1278 "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
1279 "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
1280 "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
1281 "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
1282 "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
1283 "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
1284 "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
1285 "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
1286 "packuswb %%xmm2,%%xmm0 \n"
1287 "movdqu %%xmm0,(%1) \n"
1288 "lea 0x8(%0),%0 \n"
1289 "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
1290 "sub $0x10,%2 \n"
1291 "jg 1b \n"
1292 : "+r"(src_ptr), // %0
1293 "+r"(dst_ptr), // %1
1294 "+r"(dst_width) // %2
1295 : "m"(kLinearMadd31) // %3
1296 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1297 }
1298 #endif
1299
1300 #ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
1301 void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
1302 ptrdiff_t src_stride,
1303 uint8_t* dst_ptr,
1304 ptrdiff_t dst_stride,
1305 int dst_width) {
1306 asm volatile(
1307 "pcmpeqw %%xmm6,%%xmm6 \n"
1308 "psrlw $15,%%xmm6 \n"
1309 "psllw $3,%%xmm6 \n" // all 8
1310 "movdqa %5,%%xmm7 \n"
1311
1312 LABELALIGN
1313 "1: \n"
1314 "movq (%0),%%xmm0 \n" // 01234567
1315 "movq 1(%0),%%xmm1 \n" // 12345678
1316 "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
1317 "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
1318 "movdqa %%xmm0,%%xmm2 \n"
1319 "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
1320 "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
1321 "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
1322 "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
1323
1324 "movq (%0,%3),%%xmm1 \n"
1325 "movq 1(%0,%3),%%xmm4 \n"
1326 "punpcklwd %%xmm1,%%xmm1 \n"
1327 "punpcklwd %%xmm4,%%xmm4 \n"
1328 "movdqa %%xmm1,%%xmm3 \n"
1329 "punpckhdq %%xmm4,%%xmm3 \n"
1330 "punpckldq %%xmm4,%%xmm1 \n"
1331 "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
1332 "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
1333
1334 // xmm0 xmm2
1335 // xmm1 xmm3
1336
1337 "movdqa %%xmm0,%%xmm4 \n"
1338 "movdqa %%xmm1,%%xmm5 \n"
1339 "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
1340 "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
1341 "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
1342 "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
1343 "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
1344
1345 "movdqa %%xmm1,%%xmm5 \n"
1346 "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
1347 "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
1348 "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
1349 "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
1350 "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
1351
1352 "movdqa %%xmm2,%%xmm0 \n"
1353 "movdqa %%xmm3,%%xmm1 \n"
1354 "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
1355 "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
1356 "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
1357 "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
1358 "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
1359
1360 "movdqa %%xmm3,%%xmm1 \n"
1361 "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
1362 "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
1363 "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
1364 "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
1365 "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
1366
1367 "packuswb %%xmm0,%%xmm4 \n"
1368 "movdqu %%xmm4,(%1) \n" // store above
1369 "packuswb %%xmm1,%%xmm5 \n"
1370 "movdqu %%xmm5,(%1,%4) \n" // store below
1371
1372 "lea 0x8(%0),%0 \n"
1373 "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
1374 "sub $0x10,%2 \n"
1375 "jg 1b \n"
1376 : "+r"(src_ptr), // %0
1377 "+r"(dst_ptr), // %1
1378 "+r"(dst_width) // %2
1379 : "r"((intptr_t)(src_stride)), // %3
1380 "r"((intptr_t)(dst_stride)), // %4
1381 "m"(kLinearMadd31) // %5
1382 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1383 "xmm7");
1384 }
1385 #endif
1386
1387 #ifdef HAS_SCALEROWUP2LINEAR_AVX2
1388 void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
1389 uint8_t* dst_ptr,
1390 int dst_width) {
1391 asm volatile(
1392 "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
1393 "vpsrlw $15,%%ymm4,%%ymm4 \n"
1394 "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
1395 "vbroadcastf128 %3,%%ymm3 \n"
1396
1397 LABELALIGN
1398 "1: \n"
1399 "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
1400 "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
1401 "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
1402 "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
1403 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
1404 "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
1405 "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
1406 "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
1407 "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
1408 "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
1409 "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
1410 "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
1411 "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
1412 "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
1413 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
1414 "vmovdqu %%ymm0,(%1) \n"
1415
1416 "lea 0x10(%0),%0 \n"
1417 "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
1418 "sub $0x20,%2 \n"
1419 "jg 1b \n"
1420 "vzeroupper \n"
1421 : "+r"(src_ptr), // %0
1422 "+r"(dst_ptr), // %1
1423 "+r"(dst_width) // %2
1424 : "m"(kLinearMadd31) // %3
1425 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1426 }
1427 #endif
1428
1429 #ifdef HAS_SCALEROWUP2BILINEAR_AVX2
1430 void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
1431 ptrdiff_t src_stride,
1432 uint8_t* dst_ptr,
1433 ptrdiff_t dst_stride,
1434 int dst_width) {
1435 asm volatile(
1436 "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
1437 "vpsrlw $15,%%ymm6,%%ymm6 \n"
1438 "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
1439 "vbroadcastf128 %5,%%ymm7 \n"
1440
1441 LABELALIGN
1442 "1: \n"
1443 "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
1444 "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
1445 "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
1446 "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
1447 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
1448 "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
1449 "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
1450 "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
1451 "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
1452 "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
1453
1454 "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
1455 "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
1456 "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
1457 "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
1458 "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
1459 "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
1460 "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
1461 "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
1462 "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
1463 "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
1464
1465 // ymm0 ymm1
1466 // ymm2 ymm3
1467
1468 "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
1469 "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
1470 "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
1471 "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
1472 "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
1473
1474 "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
1475 "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
1476 "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
1477 "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
1478 "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
1479
1480 "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
1481 "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
1482 "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
1483 "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
1484 "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
1485
1486 "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
1487 "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
1488 "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
1489 "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
1490 "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
1491
1492 "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
1493 "vmovdqu %%ymm4,(%1) \n" // store above
1494 "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
1495 "vmovdqu %%ymm5,(%1,%4) \n" // store below
1496
1497 "lea 0x10(%0),%0 \n"
1498 "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
1499 "sub $0x20,%2 \n"
1500 "jg 1b \n"
1501 "vzeroupper \n"
1502 : "+r"(src_ptr), // %0
1503 "+r"(dst_ptr), // %1
1504 "+r"(dst_width) // %2
1505 : "r"((intptr_t)(src_stride)), // %3
1506 "r"((intptr_t)(dst_stride)), // %4
1507 "m"(kLinearMadd31) // %5
1508 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1509 "xmm7");
1510 }
1511 #endif
1512
1513 #ifdef HAS_SCALEROWUP2LINEAR_12_AVX2
1514 void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
1515 uint16_t* dst_ptr,
1516 int dst_width) {
1517 asm volatile(
1518 "vbroadcastf128 %3,%%ymm5 \n"
1519 "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
1520 "vpsrlw $15,%%ymm4,%%ymm4 \n"
1521 "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
1522
1523 LABELALIGN
1524 "1: \n"
1525 "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b)
1526 "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b)
1527
1528 "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF
1529 "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0
1530
1531 "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near)
1532 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
1533 "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far)
1534 "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
1535
1536 "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2
1537 "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2
1538 "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2
1539 "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2
1540 "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
1541 "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near
1542 "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2
1543 "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2
1544
1545 "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
1546 "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far
1547 "vmovdqu %%ymm0,(%1) \n"
1548 "vmovdqu %%ymm2,32(%1) \n"
1549
1550 "lea 0x20(%0),%0 \n"
1551 "lea 0x40(%1),%1 \n" // 16 sample to 32 sample
1552 "sub $0x20,%2 \n"
1553 "jg 1b \n"
1554 "vzeroupper \n"
1555 : "+r"(src_ptr), // %0
1556 "+r"(dst_ptr), // %1
1557 "+r"(dst_width) // %2
1558 : "m"(kLinearShuffleFar) // %3
1559 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1560 }
1561 #endif
1562
1563 #ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2
1564 void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
1565 ptrdiff_t src_stride,
1566 uint16_t* dst_ptr,
1567 ptrdiff_t dst_stride,
1568 int dst_width) {
1569 asm volatile(
1570 "vbroadcastf128 %5,%%ymm5 \n"
1571 "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
1572 "vpsrlw $15,%%ymm4,%%ymm4 \n"
1573 "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
1574
1575 LABELALIGN
1576 "1: \n"
1577
1578 "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
1579 "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
1580 "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
1581 "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
1582 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
1583 "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
1584 "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
1585 "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
1586 "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1)
1587
1588 "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
1589 "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
1590 "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
1591 "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
1592 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
1593 "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
1594 "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
1595 "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
1596 "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2)
1597
1598 "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
1599 "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
1600 "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
1601 "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
1602 "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
1603 "vmovdqu %%ymm0,(%1) \n" // store above
1604
1605 "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
1606 "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
1607 "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
1608 "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
1609 "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
1610 "vmovdqu %%ymm0,(%1,%4,2) \n" // store below
1611
1612 "lea 0x10(%0),%0 \n"
1613 "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
1614 "sub $0x10,%2 \n"
1615 "jg 1b \n"
1616 "vzeroupper \n"
1617 : "+r"(src_ptr), // %0
1618 "+r"(dst_ptr), // %1
1619 "+r"(dst_width) // %2
1620 : "r"((intptr_t)(src_stride)), // %3
1621 "r"((intptr_t)(dst_stride)), // %4
1622 "m"(kLinearShuffleFar) // %5
1623 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1624 }
1625 #endif
1626
1627 #ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
1628 void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
1629 uint16_t* dst_ptr,
1630 int dst_width) {
1631 asm volatile(
1632 "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
1633 "vpsrld $31,%%ymm4,%%ymm4 \n"
1634 "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
1635
1636 LABELALIGN
1637 "1: \n"
1638 "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
1639 "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)
1640
1641 "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
1642 "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
1643
1644 "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
1645 "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)
1646
1647 "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo)
1648 "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi)
1649 "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo)
1650 "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi)
1651 "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
1652 "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
1653 "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo)
1654 "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi)
1655
1656 "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
1657 "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
1658 "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n"
1659 "vpshufd $0b11011000,%%ymm0,%%ymm0 \n"
1660 "vmovdqu %%ymm0,(%1) \n"
1661
1662 "lea 0x10(%0),%0 \n"
1663 "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
1664 "sub $0x10,%2 \n"
1665 "jg 1b \n"
1666 "vzeroupper \n"
1667 : "+r"(src_ptr), // %0
1668 "+r"(dst_ptr), // %1
1669 "+r"(dst_width) // %2
1670 :
1671 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1672 }
1673 #endif
1674
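// A scalar sketch (hypothetical illustration, not a libyuv API) of the AVX2
// linear kernel above.  Each pair of adjacent source samples produces two
// output samples weighted 3:1 and 1:3 with +2 rounding and a divide by 4,
// i.e. the (3*near + far + 2) / 4 noted in the comments.  Edge replication
// is omitted here.
static inline void ScaleRowUp2Linear16SketchC(const uint16_t* src,
                                              uint16_t* dst,
                                              int src_width) {
  int i;
  for (i = 0; i < src_width - 1; ++i) {
    dst[2 * i + 0] = (uint16_t)((3 * src[i] + src[i + 1] + 2) >> 2);
    dst[2 * i + 1] = (uint16_t)((src[i] + 3 * src[i + 1] + 2) >> 2);
  }
}
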
1675 #ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
1676 void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
1677 ptrdiff_t src_stride,
1678 uint16_t* dst_ptr,
1679 ptrdiff_t dst_stride,
1680 int dst_width) {
1681 asm volatile(
1682 "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
1683 "vpsrld $31,%%ymm6,%%ymm6 \n"
1684 "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
1685
1686 LABELALIGN
1687 "1: \n"
1688
1689 "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
1690 "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)
1691 "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
1692 "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
1693 "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
1694 "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)
1695 "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo)
1696 "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi)
1697 "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
1698 "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
1699 "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo)
1700 "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi)
1701
1702 "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v)
1703 "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v)
1704 "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v)
1705 "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v)
1706 "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far)
1707 "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far)
1708 "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo)
1709 "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi)
1710 "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo)
1711 "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi)
1712 "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo)
1713 "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi)
1714
1715 "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
1716 "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
1717 "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
1718 "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
1719 "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
1720
1721 "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
1722 "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
1723 "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
1724 "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
1725 "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
1726
1727 "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
1728 "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
1729 "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
1730 "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
1731 "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
1732
1733 "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
1734 "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
1735 "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
1736 "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
1737 "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
1738
1739 "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n"
1740 "vpshufd $0b11011000,%%ymm4,%%ymm4 \n"
1741 "vmovdqu %%ymm4,(%1) \n" // store above
1742 "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n"
1743 "vpshufd $0b11011000,%%ymm5,%%ymm5 \n"
1744 "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
1745
1746 "lea 0x10(%0),%0 \n"
1747 "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
1748 "sub $0x10,%2 \n"
1749 "jg 1b \n"
1750 "vzeroupper \n"
1751 : "+r"(src_ptr), // %0
1752 "+r"(dst_ptr), // %1
1753 "+r"(dst_width) // %2
1754 : "r"((intptr_t)(src_stride)), // %3
1755 "r"((intptr_t)(dst_stride)) // %4
1756 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1757 }
1758 #endif
1759
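// A scalar sketch (hypothetical illustration, not a libyuv API) of the AVX2
// bilinear kernel above.  Each pair of adjacent source samples is filtered
// 3:1 horizontally per row, and the two filtered rows are then blended 3:1
// vertically, giving the 9/3/3/1 weighting with +8 rounding and a divide by
// 16 noted in the comments.  Edge replication is omitted here.
static inline void ScaleRowUp2Bilinear16SketchC(const uint16_t* s0,  // row y
                                                const uint16_t* s1,  // row y+1
                                                uint16_t* d0,  // dest row 2y
                                                uint16_t* d1,  // dest row 2y+1
                                                int src_width) {
  int i;
  for (i = 0; i < src_width - 1; ++i) {
    uint32_t a0 = 3 * s0[i] + s0[i + 1];  // 3*near+far, row y
    uint32_t a1 = s0[i] + 3 * s0[i + 1];
    uint32_t b0 = 3 * s1[i] + s1[i + 1];  // 3*near+far, row y+1
    uint32_t b1 = s1[i] + 3 * s1[i + 1];
    d0[2 * i + 0] = (uint16_t)((3 * a0 + b0 + 8) >> 4);  // 9 3 3 1, above
    d0[2 * i + 1] = (uint16_t)((3 * a1 + b1 + 8) >> 4);
    d1[2 * i + 0] = (uint16_t)((a0 + 3 * b0 + 8) >> 4);  // 9 3 3 1, below
    d1[2 * i + 1] = (uint16_t)((a1 + 3 * b1 + 8) >> 4);
  }
}
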
1760 // Reads 16xN bytes and produces 16 shorts at a time.
1761 void ScaleAddRow_SSE2(const uint8_t* src_ptr,
1762 uint16_t* dst_ptr,
1763 int src_width) {
1764 asm volatile("pxor %%xmm5,%%xmm5 \n"
1765
1766 // 16 pixel loop.
1767 LABELALIGN
1768 "1: \n"
1769 "movdqu (%0),%%xmm3 \n"
1770 "lea 0x10(%0),%0 \n" // src_ptr += 16
1771 "movdqu (%1),%%xmm0 \n"
1772 "movdqu 0x10(%1),%%xmm1 \n"
1773 "movdqa %%xmm3,%%xmm2 \n"
1774 "punpcklbw %%xmm5,%%xmm2 \n"
1775 "punpckhbw %%xmm5,%%xmm3 \n"
1776 "paddusw %%xmm2,%%xmm0 \n"
1777 "paddusw %%xmm3,%%xmm1 \n"
1778 "movdqu %%xmm0,(%1) \n"
1779 "movdqu %%xmm1,0x10(%1) \n"
1780 "lea 0x20(%1),%1 \n"
1781 "sub $0x10,%2 \n"
1782 "jg 1b \n"
1783 : "+r"(src_ptr), // %0
1784 "+r"(dst_ptr), // %1
1785 "+r"(src_width) // %2
1786 :
1787 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1788 }
1789
1790 #ifdef HAS_SCALEADDROW_AVX2
1791 // Reads 32 bytes and accumulates to 32 shorts at a time.
1792 void ScaleAddRow_AVX2(const uint8_t* src_ptr,
1793 uint16_t* dst_ptr,
1794 int src_width) {
1795 asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
1796
1797 LABELALIGN
1798 "1: \n"
1799 "vmovdqu (%0),%%ymm3 \n"
1800 "lea 0x20(%0),%0 \n" // src_ptr += 32
1801 "vpermq $0xd8,%%ymm3,%%ymm3 \n"
1802 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
1803 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
1804 "vpaddusw (%1),%%ymm2,%%ymm0 \n"
1805 "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
1806 "vmovdqu %%ymm0,(%1) \n"
1807 "vmovdqu %%ymm1,0x20(%1) \n"
1808 "lea 0x40(%1),%1 \n"
1809 "sub $0x20,%2 \n"
1810 "jg 1b \n"
1811 "vzeroupper \n"
1812 : "+r"(src_ptr), // %0
1813 "+r"(dst_ptr), // %1
1814 "+r"(src_width) // %2
1815 :
1816 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1817 }
1818 #endif // HAS_SCALEADDROW_AVX2
1819
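// A scalar sketch (hypothetical, for illustration) of the row accumulators
// above: each source byte is widened to 16 bits and added into the running
// sum with unsigned saturation, matching the paddusw/vpaddusw used by the
// SIMD code.
static inline void ScaleAddRowSketchC(const uint8_t* src,
                                      uint16_t* dst,
                                      int src_width) {
  int i;
  for (i = 0; i < src_width; ++i) {
    uint32_t sum = (uint32_t)dst[i] + src[i];
    dst[i] = (uint16_t)(sum > 65535u ? 65535u : sum);
  }
}
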
1820 // Constant for making pixels signed to avoid pmaddubsw
1821 // saturation.
1822 static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1823 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
1824
1825 // Constant for making pixels unsigned and adding .5 for rounding.
1826 static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
1827 0x4040, 0x4040, 0x4040, 0x4040};
1828
1829 // Bilinear column filtering. SSSE3 version.
1830 void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
1831 const uint8_t* src_ptr,
1832 int dst_width,
1833 int x,
1834 int dx) {
1835 intptr_t x0, x1, temp_pixel;
1836 asm volatile(
1837 "movd %6,%%xmm2 \n"
1838 "movd %7,%%xmm3 \n"
1839 "movl $0x04040000,%k2 \n"
1840 "movd %k2,%%xmm5 \n"
1841 "pcmpeqb %%xmm6,%%xmm6 \n"
1842 "psrlw $0x9,%%xmm6 \n" // 0x007f007f
1843 "pcmpeqb %%xmm7,%%xmm7 \n"
1844 "psrlw $15,%%xmm7 \n" // 0x00010001
1845
1846 "pextrw $0x1,%%xmm2,%k3 \n"
1847 "subl $0x2,%5 \n"
1848 "jl 29f \n"
1849 "movdqa %%xmm2,%%xmm0 \n"
1850 "paddd %%xmm3,%%xmm0 \n"
1851 "punpckldq %%xmm0,%%xmm2 \n"
1852 "punpckldq %%xmm3,%%xmm3 \n"
1853 "paddd %%xmm3,%%xmm3 \n"
1854 "pextrw $0x3,%%xmm2,%k4 \n"
1855
1856 LABELALIGN
1857 "2: \n"
1858 "movdqa %%xmm2,%%xmm1 \n"
1859 "paddd %%xmm3,%%xmm2 \n"
1860 "movzwl 0x00(%1,%3,1),%k2 \n"
1861 "movd %k2,%%xmm0 \n"
1862 "psrlw $0x9,%%xmm1 \n"
1863 "movzwl 0x00(%1,%4,1),%k2 \n"
1864 "movd %k2,%%xmm4 \n"
1865 "pshufb %%xmm5,%%xmm1 \n"
1866 "punpcklwd %%xmm4,%%xmm0 \n"
1867 "psubb %8,%%xmm0 \n" // make pixels signed.
1868 "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
1869 // 1
1870 "paddusb %%xmm7,%%xmm1 \n"
1871 "pmaddubsw %%xmm0,%%xmm1 \n"
1872 "pextrw $0x1,%%xmm2,%k3 \n"
1873 "pextrw $0x3,%%xmm2,%k4 \n"
1874 "paddw %9,%%xmm1 \n" // make pixels unsigned.
1875 "psrlw $0x7,%%xmm1 \n"
1876 "packuswb %%xmm1,%%xmm1 \n"
1877 "movd %%xmm1,%k2 \n"
1878 "mov %w2,(%0) \n"
1879 "lea 0x2(%0),%0 \n"
1880 "subl $0x2,%5 \n"
1881 "jge 2b \n"
1882
1883 LABELALIGN
1884 "29: \n"
1885 "addl $0x1,%5 \n"
1886 "jl 99f \n"
1887 "movzwl 0x00(%1,%3,1),%k2 \n"
1888 "movd %k2,%%xmm0 \n"
1889 "psrlw $0x9,%%xmm2 \n"
1890 "pshufb %%xmm5,%%xmm2 \n"
1891 "psubb %8,%%xmm0 \n" // make pixels signed.
1892 "pxor %%xmm6,%%xmm2 \n"
1893 "paddusb %%xmm7,%%xmm2 \n"
1894 "pmaddubsw %%xmm0,%%xmm2 \n"
1895 "paddw %9,%%xmm2 \n" // make pixels unsigned.
1896 "psrlw $0x7,%%xmm2 \n"
1897 "packuswb %%xmm2,%%xmm2 \n"
1898 "movd %%xmm2,%k2 \n"
1899 "mov %b2,(%0) \n"
1900 "99: \n"
1901 : "+r"(dst_ptr), // %0
1902 "+r"(src_ptr), // %1
1903 "=&a"(temp_pixel), // %2
1904 "=&r"(x0), // %3
1905 "=&r"(x1), // %4
1906 #if defined(__x86_64__)
1907 "+rm"(dst_width) // %5
1908 #else
1909 "+m"(dst_width) // %5
1910 #endif
1911 : "rm"(x), // %6
1912 "rm"(dx), // %7
1913 #if defined(__x86_64__)
1914 "x"(kFsub80), // %8
1915 "x"(kFadd40) // %9
1916 #else
1917 "m"(kFsub80), // %8
1918 "m"(kFadd40) // %9
1919 #endif
1920 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1921 "xmm7");
1922 }
1923
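// A worked sketch (hypothetical helper, not a libyuv API) of the blend
// ScaleFilterCols_SSSE3 performs per output byte.  Biasing the pixels by
// -128 (kFsub80) keeps pmaddubsw from saturating; the later +0x4040
// (kFadd40) cancels the -128*128 offset and adds 64 for rounding, so the net
// result is a 7-bit bilinear blend with f = (x >> 9) & 0x7f:
static inline uint8_t FilterColsBlendSketchC(uint8_t a, uint8_t b, int f) {
  return (uint8_t)((a * (128 - f) + b * f + 64) >> 7);
}
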
1924 // Reads 4 pixels, duplicates them and writes 8 pixels.
1925 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
1926 void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
1927 const uint8_t* src_ptr,
1928 int dst_width,
1929 int x,
1930 int dx) {
1931 (void)x;
1932 (void)dx;
1933 asm volatile(LABELALIGN
1934 "1: \n"
1935 "movdqu (%1),%%xmm0 \n"
1936 "lea 0x10(%1),%1 \n"
1937 "movdqa %%xmm0,%%xmm1 \n"
1938 "punpcklbw %%xmm0,%%xmm0 \n"
1939 "punpckhbw %%xmm1,%%xmm1 \n"
1940 "movdqu %%xmm0,(%0) \n"
1941 "movdqu %%xmm1,0x10(%0) \n"
1942 "lea 0x20(%0),%0 \n"
1943 "sub $0x20,%2 \n"
1944 "jg 1b \n"
1945
1946 : "+r"(dst_ptr), // %0
1947 "+r"(src_ptr), // %1
1948 "+r"(dst_width) // %2
1949 ::"memory",
1950 "cc", "xmm0", "xmm1");
1951 }
1952
1953 void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
1954 ptrdiff_t src_stride,
1955 uint8_t* dst_argb,
1956 int dst_width) {
1957 (void)src_stride;
1958 asm volatile(LABELALIGN
1959 "1: \n"
1960 "movdqu (%0),%%xmm0 \n"
1961 "movdqu 0x10(%0),%%xmm1 \n"
1962 "lea 0x20(%0),%0 \n"
1963 "shufps $0xdd,%%xmm1,%%xmm0 \n"
1964 "movdqu %%xmm0,(%1) \n"
1965 "lea 0x10(%1),%1 \n"
1966 "sub $0x4,%2 \n"
1967 "jg 1b \n"
1968 : "+r"(src_argb), // %0
1969 "+r"(dst_argb), // %1
1970 "+r"(dst_width) // %2
1971 ::"memory",
1972 "cc", "xmm0", "xmm1");
1973 }
1974
1975 void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
1976 ptrdiff_t src_stride,
1977 uint8_t* dst_argb,
1978 int dst_width) {
1979 (void)src_stride;
1980 asm volatile(LABELALIGN
1981 "1: \n"
1982 "movdqu (%0),%%xmm0 \n"
1983 "movdqu 0x10(%0),%%xmm1 \n"
1984 "lea 0x20(%0),%0 \n"
1985 "movdqa %%xmm0,%%xmm2 \n"
1986 "shufps $0x88,%%xmm1,%%xmm0 \n"
1987 "shufps $0xdd,%%xmm1,%%xmm2 \n"
1988 "pavgb %%xmm2,%%xmm0 \n"
1989 "movdqu %%xmm0,(%1) \n"
1990 "lea 0x10(%1),%1 \n"
1991 "sub $0x4,%2 \n"
1992 "jg 1b \n"
1993 : "+r"(src_argb), // %0
1994 "+r"(dst_argb), // %1
1995 "+r"(dst_width) // %2
1996 ::"memory",
1997 "cc", "xmm0", "xmm1");
1998 }
1999
2000 void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
2001 ptrdiff_t src_stride,
2002 uint8_t* dst_argb,
2003 int dst_width) {
2004 asm volatile(LABELALIGN
2005 "1: \n"
2006 "movdqu (%0),%%xmm0 \n"
2007 "movdqu 0x10(%0),%%xmm1 \n"
2008 "movdqu 0x00(%0,%3,1),%%xmm2 \n"
2009 "movdqu 0x10(%0,%3,1),%%xmm3 \n"
2010 "lea 0x20(%0),%0 \n"
2011 "pavgb %%xmm2,%%xmm0 \n"
2012 "pavgb %%xmm3,%%xmm1 \n"
2013 "movdqa %%xmm0,%%xmm2 \n"
2014 "shufps $0x88,%%xmm1,%%xmm0 \n"
2015 "shufps $0xdd,%%xmm1,%%xmm2 \n"
2016 "pavgb %%xmm2,%%xmm0 \n"
2017 "movdqu %%xmm0,(%1) \n"
2018 "lea 0x10(%1),%1 \n"
2019 "sub $0x4,%2 \n"
2020 "jg 1b \n"
2021 : "+r"(src_argb), // %0
2022 "+r"(dst_argb), // %1
2023 "+r"(dst_width) // %2
2024 : "r"((intptr_t)(src_stride)) // %3
2025 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2026 }
2027
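// A scalar sketch (hypothetical, per channel) of the 2x2 ARGB box filter
// above.  pavgb rounds up, so the kernel averages the two rows first and
// then the two columns, each step computing (a + b + 1) / 2:
static inline uint8_t ARGBBoxAvg2x2SketchC(uint8_t tl, uint8_t tr,
                                           uint8_t bl, uint8_t br) {
  uint8_t left = (uint8_t)((tl + bl + 1) >> 1);   // vertical pavgb
  uint8_t right = (uint8_t)((tr + br + 1) >> 1);
  return (uint8_t)((left + right + 1) >> 1);      // horizontal pavgb
}
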
2028 // Reads 4 pixels at a time.
2029 // Alignment requirement: dst_argb 16 byte aligned.
2030 void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
2031 ptrdiff_t src_stride,
2032 int src_stepx,
2033 uint8_t* dst_argb,
2034 int dst_width) {
2035 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2036 intptr_t src_stepx_x12;
2037 (void)src_stride;
2038 asm volatile(
2039 "lea 0x00(,%1,4),%1 \n"
2040 "lea 0x00(%1,%1,2),%4 \n"
2041
2042 LABELALIGN
2043 "1: \n"
2044 "movd (%0),%%xmm0 \n"
2045 "movd 0x00(%0,%1,1),%%xmm1 \n"
2046 "punpckldq %%xmm1,%%xmm0 \n"
2047 "movd 0x00(%0,%1,2),%%xmm2 \n"
2048 "movd 0x00(%0,%4,1),%%xmm3 \n"
2049 "lea 0x00(%0,%1,4),%0 \n"
2050 "punpckldq %%xmm3,%%xmm2 \n"
2051 "punpcklqdq %%xmm2,%%xmm0 \n"
2052 "movdqu %%xmm0,(%2) \n"
2053 "lea 0x10(%2),%2 \n"
2054 "sub $0x4,%3 \n"
2055 "jg 1b \n"
2056 : "+r"(src_argb), // %0
2057 "+r"(src_stepx_x4), // %1
2058 "+r"(dst_argb), // %2
2059 "+r"(dst_width), // %3
2060 "=&r"(src_stepx_x12) // %4
2061 ::"memory",
2062 "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2063 }
2064
2065 // Blends four 2x2 to 4x1.
2066 // Alignment requirement: dst_argb 16 byte aligned.
2067 void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
2068 ptrdiff_t src_stride,
2069 int src_stepx,
2070 uint8_t* dst_argb,
2071 int dst_width) {
2072 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2073 intptr_t src_stepx_x12;
2074 intptr_t row1 = (intptr_t)(src_stride);
2075 asm volatile(
2076 "lea 0x00(,%1,4),%1 \n"
2077 "lea 0x00(%1,%1,2),%4 \n"
2078 "lea 0x00(%0,%5,1),%5 \n"
2079
2080 LABELALIGN
2081 "1: \n"
2082 "movq (%0),%%xmm0 \n"
2083 "movhps 0x00(%0,%1,1),%%xmm0 \n"
2084 "movq 0x00(%0,%1,2),%%xmm1 \n"
2085 "movhps 0x00(%0,%4,1),%%xmm1 \n"
2086 "lea 0x00(%0,%1,4),%0 \n"
2087 "movq (%5),%%xmm2 \n"
2088 "movhps 0x00(%5,%1,1),%%xmm2 \n"
2089 "movq 0x00(%5,%1,2),%%xmm3 \n"
2090 "movhps 0x00(%5,%4,1),%%xmm3 \n"
2091 "lea 0x00(%5,%1,4),%5 \n"
2092 "pavgb %%xmm2,%%xmm0 \n"
2093 "pavgb %%xmm3,%%xmm1 \n"
2094 "movdqa %%xmm0,%%xmm2 \n"
2095 "shufps $0x88,%%xmm1,%%xmm0 \n"
2096 "shufps $0xdd,%%xmm1,%%xmm2 \n"
2097 "pavgb %%xmm2,%%xmm0 \n"
2098 "movdqu %%xmm0,(%2) \n"
2099 "lea 0x10(%2),%2 \n"
2100 "sub $0x4,%3 \n"
2101 "jg 1b \n"
2102 : "+r"(src_argb), // %0
2103 "+r"(src_stepx_x4), // %1
2104 "+r"(dst_argb), // %2
2105 "+rm"(dst_width), // %3
2106 "=&r"(src_stepx_x12), // %4
2107 "+r"(row1) // %5
2108 ::"memory",
2109 "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2110 }
2111
2112 void ScaleARGBCols_SSE2(uint8_t* dst_argb,
2113 const uint8_t* src_argb,
2114 int dst_width,
2115 int x,
2116 int dx) {
2117 intptr_t x0, x1;
2118 asm volatile(
2119 "movd %5,%%xmm2 \n"
2120 "movd %6,%%xmm3 \n"
2121 "pshufd $0x0,%%xmm2,%%xmm2 \n"
2122 "pshufd $0x11,%%xmm3,%%xmm0 \n"
2123 "paddd %%xmm0,%%xmm2 \n"
2124 "paddd %%xmm3,%%xmm3 \n"
2125 "pshufd $0x5,%%xmm3,%%xmm0 \n"
2126 "paddd %%xmm0,%%xmm2 \n"
2127 "paddd %%xmm3,%%xmm3 \n"
2128 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2129 "pextrw $0x1,%%xmm2,%k0 \n"
2130 "pextrw $0x3,%%xmm2,%k1 \n"
2131 "cmp $0x0,%4 \n"
2132 "jl 99f \n"
2133 "sub $0x4,%4 \n"
2134 "jl 49f \n"
2135
2136 LABELALIGN
2137 "40: \n"
2138 "movd 0x00(%3,%0,4),%%xmm0 \n"
2139 "movd 0x00(%3,%1,4),%%xmm1 \n"
2140 "pextrw $0x5,%%xmm2,%k0 \n"
2141 "pextrw $0x7,%%xmm2,%k1 \n"
2142 "paddd %%xmm3,%%xmm2 \n"
2143 "punpckldq %%xmm1,%%xmm0 \n"
2144 "movd 0x00(%3,%0,4),%%xmm1 \n"
2145 "movd 0x00(%3,%1,4),%%xmm4 \n"
2146 "pextrw $0x1,%%xmm2,%k0 \n"
2147 "pextrw $0x3,%%xmm2,%k1 \n"
2148 "punpckldq %%xmm4,%%xmm1 \n"
2149 "punpcklqdq %%xmm1,%%xmm0 \n"
2150 "movdqu %%xmm0,(%2) \n"
2151 "lea 0x10(%2),%2 \n"
2152 "sub $0x4,%4 \n"
2153 "jge 40b \n"
2154
2155 "49: \n"
2156 "test $0x2,%4 \n"
2157 "je 29f \n"
2158 "movd 0x00(%3,%0,4),%%xmm0 \n"
2159 "movd 0x00(%3,%1,4),%%xmm1 \n"
2160 "pextrw $0x5,%%xmm2,%k0 \n"
2161 "punpckldq %%xmm1,%%xmm0 \n"
2162 "movq %%xmm0,(%2) \n"
2163 "lea 0x8(%2),%2 \n"
2164 "29: \n"
2165 "test $0x1,%4 \n"
2166 "je 99f \n"
2167 "movd 0x00(%3,%0,4),%%xmm0 \n"
2168 "movd %%xmm0,(%2) \n"
2169 "99: \n"
2170 : "=&a"(x0), // %0
2171 "=&d"(x1), // %1
2172 "+r"(dst_argb), // %2
2173 "+r"(src_argb), // %3
2174 "+r"(dst_width) // %4
2175 : "rm"(x), // %5
2176 "rm"(dx) // %6
2177 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2178 }
2179
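// A scalar sketch (hypothetical) of ScaleARGBCols_SSE2 above: x and dx are
// 16.16 fixed point, and each output pixel copies the source pixel indexed
// by the integer part of x.
static inline void ScaleARGBColsSketchC(uint8_t* dst_argb,
                                        const uint8_t* src_argb,
                                        int dst_width,
                                        int x,
                                        int dx) {
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}
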
2180 // Reads 4 pixels, duplicates them and writes 8 pixels.
2181 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
2182 void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
2183 const uint8_t* src_argb,
2184 int dst_width,
2185 int x,
2186 int dx) {
2187 (void)x;
2188 (void)dx;
2189 asm volatile(LABELALIGN
2190 "1: \n"
2191 "movdqu (%1),%%xmm0 \n"
2192 "lea 0x10(%1),%1 \n"
2193 "movdqa %%xmm0,%%xmm1 \n"
2194 "punpckldq %%xmm0,%%xmm0 \n"
2195 "punpckhdq %%xmm1,%%xmm1 \n"
2196 "movdqu %%xmm0,(%0) \n"
2197 "movdqu %%xmm1,0x10(%0) \n"
2198 "lea 0x20(%0),%0 \n"
2199 "sub $0x8,%2 \n"
2200 "jg 1b \n"
2201
2202 : "+r"(dst_argb), // %0
2203 "+r"(src_argb), // %1
2204 "+r"(dst_width) // %2
2205 ::"memory",
2206 "cc", "xmm0", "xmm1");
2207 }
2208
2209 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
2210 static const uvec8 kShuffleColARGB = {
2211 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
2212 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
2213 };
2214
2215 // Shuffle table for duplicating 2 fractions into 8 bytes each
2216 static const uvec8 kShuffleFractions = {
2217 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
2218 };
2219
2220 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
2221 void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
2222 const uint8_t* src_argb,
2223 int dst_width,
2224 int x,
2225 int dx) {
2226 intptr_t x0, x1;
2227 asm volatile(
2228 "movdqa %0,%%xmm4 \n"
2229 "movdqa %1,%%xmm5 \n"
2230 :
2231 : "m"(kShuffleColARGB), // %0
2232 "m"(kShuffleFractions) // %1
2233 );
2234
2235 asm volatile(
2236 "movd %5,%%xmm2 \n"
2237 "movd %6,%%xmm3 \n"
2238 "pcmpeqb %%xmm6,%%xmm6 \n"
2239 "psrlw $0x9,%%xmm6 \n"
2240 "pextrw $0x1,%%xmm2,%k3 \n"
2241 "sub $0x2,%2 \n"
2242 "jl 29f \n"
2243 "movdqa %%xmm2,%%xmm0 \n"
2244 "paddd %%xmm3,%%xmm0 \n"
2245 "punpckldq %%xmm0,%%xmm2 \n"
2246 "punpckldq %%xmm3,%%xmm3 \n"
2247 "paddd %%xmm3,%%xmm3 \n"
2248 "pextrw $0x3,%%xmm2,%k4 \n"
2249
2250 LABELALIGN
2251 "2: \n"
2252 "movdqa %%xmm2,%%xmm1 \n"
2253 "paddd %%xmm3,%%xmm2 \n"
2254 "movq 0x00(%1,%3,4),%%xmm0 \n"
2255 "psrlw $0x9,%%xmm1 \n"
2256 "movhps 0x00(%1,%4,4),%%xmm0 \n"
2257 "pshufb %%xmm5,%%xmm1 \n"
2258 "pshufb %%xmm4,%%xmm0 \n"
2259 "pxor %%xmm6,%%xmm1 \n"
2260 "pmaddubsw %%xmm1,%%xmm0 \n"
2261 "psrlw $0x7,%%xmm0 \n"
2262 "pextrw $0x1,%%xmm2,%k3 \n"
2263 "pextrw $0x3,%%xmm2,%k4 \n"
2264 "packuswb %%xmm0,%%xmm0 \n"
2265 "movq %%xmm0,(%0) \n"
2266 "lea 0x8(%0),%0 \n"
2267 "sub $0x2,%2 \n"
2268 "jge 2b \n"
2269
2270 LABELALIGN
2271 "29: \n"
2272 "add $0x1,%2 \n"
2273 "jl 99f \n"
2274 "psrlw $0x9,%%xmm2 \n"
2275 "movq 0x00(%1,%3,4),%%xmm0 \n"
2276 "pshufb %%xmm5,%%xmm2 \n"
2277 "pshufb %%xmm4,%%xmm0 \n"
2278 "pxor %%xmm6,%%xmm2 \n"
2279 "pmaddubsw %%xmm2,%%xmm0 \n"
2280 "psrlw $0x7,%%xmm0 \n"
2281 "packuswb %%xmm0,%%xmm0 \n"
2282 "movd %%xmm0,(%0) \n"
2283
2284 LABELALIGN
2285 "99: \n" // clang-format error.
2286
2287 : "+r"(dst_argb), // %0
2288 "+r"(src_argb), // %1
2289 "+rm"(dst_width), // %2
2290 "=&r"(x0), // %3
2291 "=&r"(x1) // %4
2292 : "rm"(x), // %5
2293 "rm"(dx) // %6
2294 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2295 }
2296
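// A scalar sketch (hypothetical, per channel) of the ARGB bilinear column
// filter above: the 7-bit fraction f = (x >> 9) & 0x7f selects between the
// two neighbouring pixels with weights (127 - f) and f before the shift by 7.
static inline uint8_t ARGBFilterColsBlendSketchC(uint8_t a, uint8_t b, int f) {
  return (uint8_t)((a * (127 - f) + b * f) >> 7);
}
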
2297 // Divide num by div and return as 16.16 fixed point result.
2298 int FixedDiv_X86(int num, int div) {
2299 asm volatile(
2300 "cdq \n"
2301 "shld $0x10,%%eax,%%edx \n"
2302 "shl $0x10,%%eax \n"
2303 "idiv %1 \n"
2304 "mov %0, %%eax \n"
2305 : "+a"(num) // %0
2306 : "c"(div) // %1
2307 : "memory", "cc", "edx");
2308 return num;
2309 }
2310
2311 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
2312 int FixedDiv1_X86(int num, int div) {
2313 asm volatile(
2314 "cdq \n"
2315 "shld $0x10,%%eax,%%edx \n"
2316 "shl $0x10,%%eax \n"
2317 "sub $0x10001,%%eax \n"
2318 "sbb $0x0,%%edx \n"
2319 "sub $0x1,%1 \n"
2320 "idiv %1 \n"
2321 "mov %0, %%eax \n"
2322 : "+a"(num) // %0
2323 : "c"(div) // %1
2324 : "memory", "cc", "edx");
2325 return num;
2326 }
2327
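// Portable sketches (hypothetical, for illustration) of the two fixed point
// helpers above.  Both form num << 16 across a 64-bit register pair before
// the divide; the "1" variant effectively divides num - 1 by div - 1, the
// step used when the first and last samples should coincide.
static inline int FixedDivSketchC(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}
static inline int FixedDiv1SketchC(int num, int div) {
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}
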
2328 #ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
2329 // Shuffle table for splitting UV into upper and lower part of register.
2330 static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
2331 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
2332 static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u,
2333 6u, 14u, 0x80, 0x80, 0x80, 0x80,
2334 0x80, 0x80, 0x80, 0x80};
2335
2336 void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
2337 ptrdiff_t src_stride,
2338 uint8_t* dst_ptr,
2339 int dst_width) {
2340 asm volatile(
2341 "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
2342 "psrlw $0xf,%%xmm4 \n"
2343 "packuswb %%xmm4,%%xmm4 \n"
2344 "pxor %%xmm5, %%xmm5 \n" // zero
2345 "movdqa %4,%%xmm1 \n" // split shuffler
2346 "movdqa %5,%%xmm3 \n" // merge shuffler
2347
2348 LABELALIGN
2349 "1: \n"
2350 "movdqu (%0),%%xmm0 \n" // 8 UV row 0
2351 "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
2352 "lea 0x10(%0),%0 \n"
2353 "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
2354 "pshufb %%xmm1,%%xmm2 \n"
2355 "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
2356 "pmaddubsw %%xmm4,%%xmm2 \n"
2357 "paddw %%xmm2,%%xmm0 \n" // vertical add
2358 "psrlw $0x1,%%xmm0 \n" // round
2359 "pavgw %%xmm5,%%xmm0 \n"
2360 "pshufb %%xmm3,%%xmm0 \n" // merge uv
2361 "movq %%xmm0,(%1) \n"
2362 "lea 0x8(%1),%1 \n" // 4 UV
2363 "sub $0x4,%2 \n"
2364 "jg 1b \n"
2365 : "+r"(src_ptr), // %0
2366 "+r"(dst_ptr), // %1
2367 "+r"(dst_width) // %2
2368 : "r"((intptr_t)(src_stride)), // %3
2369 "m"(kShuffleSplitUV), // %4
2370 "m"(kShuffleMergeUV) // %5
2371 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2372 }
2373 #endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
2374
2375 #ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
2376 void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
2377 ptrdiff_t src_stride,
2378 uint8_t* dst_ptr,
2379 int dst_width) {
2380 asm volatile(
2381 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
2382 "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
2383 "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
2384 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
2385 "vbroadcastf128 %4,%%ymm1 \n" // split shuffler
2386 "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
2387
2388 LABELALIGN
2389 "1: \n"
2390 "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
2391 "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
2392 "lea 0x20(%0),%0 \n"
2393 "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
2394 "vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
2395 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
2396 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
2397 "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add
2398 "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round
2399 "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
2400 "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
2401 "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
2402 "vmovdqu %%xmm0,(%1) \n"
2403 "lea 0x10(%1),%1 \n" // 8 UV
2404 "sub $0x8,%2 \n"
2405 "jg 1b \n"
2406 "vzeroupper \n"
2407 : "+r"(src_ptr), // %0
2408 "+r"(dst_ptr), // %1
2409 "+r"(dst_width) // %2
2410 : "r"((intptr_t)(src_stride)), // %3
2411 "m"(kShuffleSplitUV), // %4
2412 "m"(kShuffleMergeUV) // %5
2413 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2414 }
2415 #endif // HAS_SCALEUVROWDOWN2BOX_AVX2
2416
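// A scalar sketch (hypothetical) of the UV 2x2 box kernels above: U and V
// are averaged independently over each 2x2 block with rounding, i.e.
// (sum + 2) / 4, which the SIMD code obtains with pmaddubsw (horizontal
// add), paddw (vertical add), then psrlw and pavgw.
static inline uint8_t UVBoxAvg2x2SketchC(uint8_t a, uint8_t b,
                                         uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);
}
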
2417 static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
2418 3, 1, 3, 1, 1, 3, 1, 3};
2419
2420 #ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
2421 void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
2422 uint8_t* dst_ptr,
2423 int dst_width) {
2424 asm volatile(
2425 "pcmpeqw %%xmm4,%%xmm4 \n"
2426 "psrlw $15,%%xmm4 \n"
2427 "psllw $1,%%xmm4 \n" // all 2
2428 "movdqa %3,%%xmm3 \n"
2429
2430 LABELALIGN
2431 "1: \n"
2432 "movq (%0),%%xmm0 \n" // 00112233 (1u1v)
2433 "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
2434 "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
2435 "movdqa %%xmm0,%%xmm2 \n"
2436 "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
2437 "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
2438 "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi)
2439 "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo)
2440 "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
2441 "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
2442 "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
2443 "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
2444 "packuswb %%xmm2,%%xmm0 \n"
2445 "movdqu %%xmm0,(%1) \n"
2446
2447 "lea 0x8(%0),%0 \n"
2448 "lea 0x10(%1),%1 \n" // 4 uv to 8 uv
2449 "sub $0x8,%2 \n"
2450 "jg 1b \n"
2451 : "+r"(src_ptr), // %0
2452 "+r"(dst_ptr), // %1
2453 "+r"(dst_width) // %2
2454 : "m"(kUVLinearMadd31) // %3
2455 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2456 }
2457 #endif
2458
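// A scalar sketch (hypothetical illustration, not a libyuv API) of the UV
// linear upsample above.  U and V are filtered independently; kUVLinearMadd31
// feeds pmaddubsw with 3:1 weights so each output UV pair is
// (3*near + far + 2) / 4 of the neighbouring source pairs.  Edge replication
// is omitted here.
static inline void ScaleUVRowUp2LinearSketchC(const uint8_t* src_uv,
                                              uint8_t* dst_uv,
                                              int src_width) {  // in UV pairs
  int i, c;
  for (i = 0; i < src_width - 1; ++i) {
    for (c = 0; c < 2; ++c) {  // 0 = U, 1 = V
      uint8_t s0 = src_uv[2 * i + c];
      uint8_t s1 = src_uv[2 * (i + 1) + c];
      dst_uv[4 * i + c] = (uint8_t)((3 * s0 + s1 + 2) >> 2);
      dst_uv[4 * i + 2 + c] = (uint8_t)((s0 + 3 * s1 + 2) >> 2);
    }
  }
}
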
2459 #ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
2460 void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
2461 ptrdiff_t src_stride,
2462 uint8_t* dst_ptr,
2463 ptrdiff_t dst_stride,
2464 int dst_width) {
2465 asm volatile(
2466 "pcmpeqw %%xmm6,%%xmm6 \n"
2467 "psrlw $15,%%xmm6 \n"
2468 "psllw $3,%%xmm6 \n" // all 8
2469 "movdqa %5,%%xmm7 \n"
2470
2471 LABELALIGN
2472 "1: \n"
2473 "movq (%0),%%xmm0 \n" // 00112233 (1u1v)
2474 "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
2475 "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
2476 "movdqa %%xmm0,%%xmm2 \n"
2477 "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
2478 "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
2479 "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi)
2480 "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo)
2481
2482 "movq (%0,%3),%%xmm1 \n"
2483 "movq 2(%0,%3),%%xmm4 \n"
2484 "punpcklbw %%xmm4,%%xmm1 \n"
2485 "movdqa %%xmm1,%%xmm3 \n"
2486 "punpckhdq %%xmm1,%%xmm3 \n"
2487 "punpckldq %%xmm1,%%xmm1 \n"
2488 "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
2489 "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
2490
2491 // xmm0 xmm2
2492 // xmm1 xmm3
2493
2494 "movdqa %%xmm0,%%xmm4 \n"
2495 "movdqa %%xmm1,%%xmm5 \n"
2496 "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
2497 "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
2498 "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
2499 "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
2500 "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
2501
2502 "movdqa %%xmm1,%%xmm5 \n"
2503 "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
2504 "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
2505 "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
2506 "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
2507 "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
2508
2509 "movdqa %%xmm2,%%xmm0 \n"
2510 "movdqa %%xmm3,%%xmm1 \n"
2511 "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
2512 "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
2513 "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
2514 "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
2515 "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
2516
2517 "movdqa %%xmm3,%%xmm1 \n"
2518 "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
2519 "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
2520 "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
2521 "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
2522 "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
2523
2524 "packuswb %%xmm0,%%xmm4 \n"
2525 "movdqu %%xmm4,(%1) \n" // store above
2526 "packuswb %%xmm1,%%xmm5 \n"
2527 "movdqu %%xmm5,(%1,%4) \n" // store below
2528
2529 "lea 0x8(%0),%0 \n"
2530 "lea 0x10(%1),%1 \n" // 4 uv to 8 uv
2531 "sub $0x8,%2 \n"
2532 "jg 1b \n"
2533 : "+r"(src_ptr), // %0
2534 "+r"(dst_ptr), // %1
2535 "+r"(dst_width) // %2
2536 : "r"((intptr_t)(src_stride)), // %3
2537 "r"((intptr_t)(dst_stride)), // %4
2538 "m"(kUVLinearMadd31) // %5
2539 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2540 "xmm7");
2541 }
2542 #endif
2543
2544 #ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
2545
2546 void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
2547 uint8_t* dst_ptr,
2548 int dst_width) {
2549 asm volatile(
2550 "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
2551 "vpsrlw $15,%%ymm4,%%ymm4 \n"
2552 "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
2553 "vbroadcastf128 %3,%%ymm3 \n"
2554
2555 LABELALIGN
2556 "1: \n"
2557 "vmovdqu (%0),%%xmm0 \n"
2558 "vmovdqu 2(%0),%%xmm1 \n"
2559 "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
2560 "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
2561 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
2562 "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
2563 "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
2564 "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
2565 "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
2566 "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
2567 "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
2568 "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
2569 "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
2570 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2571 "vmovdqu %%ymm0,(%1) \n"
2572
2573 "lea 0x10(%0),%0 \n"
2574 "lea 0x20(%1),%1 \n" // 8 uv to 16 uv
2575 "sub $0x10,%2 \n"
2576 "jg 1b \n"
2577 "vzeroupper \n"
2578 : "+r"(src_ptr), // %0
2579 "+r"(dst_ptr), // %1
2580 "+r"(dst_width) // %2
2581 : "m"(kUVLinearMadd31) // %3
2582 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2583 }
2584 #endif
2585
2586 #ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
2587 void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
2588 ptrdiff_t src_stride,
2589 uint8_t* dst_ptr,
2590 ptrdiff_t dst_stride,
2591 int dst_width) {
2592 asm volatile(
2593 "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
2594 "vpsrlw $15,%%ymm6,%%ymm6 \n"
2595 "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
2596 "vbroadcastf128 %5,%%ymm7 \n"
2597
2598 LABELALIGN
2599 "1: \n"
2600 "vmovdqu (%0),%%xmm0 \n"
2601 "vmovdqu 2(%0),%%xmm1 \n"
2602 "vpermq $0b11011000,%%ymm0,%%ymm0 \n"
2603 "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
2604 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
2605 "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
2606 "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
2607 "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
2608 "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
2609
2610 "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
2611 "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
2612 "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
2613 "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
2614 "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
2615 "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n"
2616 "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n"
2617 "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
2618 "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
2619
2620 // ymm0 ymm1
2621 // ymm2 ymm3
2622
2623 "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
2624 "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
2625 "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
2626 "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
2627 "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
2628
2629 "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
2630 "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
2631 "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
2632 "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
2633 "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
2634
2635 "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
2636 "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
2637 "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
2638 "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
2639 "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
2640
2641 "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
2642 "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
2643 "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
2644 "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
2645 "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
2646
2647 "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
2648 "vmovdqu %%ymm4,(%1) \n" // store above
2649 "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
2650 "vmovdqu %%ymm5,(%1,%4) \n" // store below
2651
2652 "lea 0x10(%0),%0 \n"
2653 "lea 0x20(%1),%1 \n" // 8 uv to 16 uv
2654 "sub $0x10,%2 \n"
2655 "jg 1b \n"
2656 "vzeroupper \n"
2657 : "+r"(src_ptr), // %0
2658 "+r"(dst_ptr), // %1
2659 "+r"(dst_width) // %2
2660 : "r"((intptr_t)(src_stride)), // %3
2661 "r"((intptr_t)(dst_stride)), // %4
2662 "m"(kUVLinearMadd31) // %5
2663 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2664 "xmm7");
2665 }
2666 #endif
2667
2668 #ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41
2669 void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
2670 uint16_t* dst_ptr,
2671 int dst_width) {
2672 asm volatile(
2673 "pxor %%xmm5,%%xmm5 \n"
2674 "pcmpeqd %%xmm4,%%xmm4 \n"
2675 "psrld $31,%%xmm4 \n"
2676 "pslld $1,%%xmm4 \n" // all 2
2677
2678 LABELALIGN
2679 "1: \n"
2680 "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
2681 "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
2682
2683 "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v)
2684 "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v)
2685
2686 "movdqa %%xmm0,%%xmm2 \n"
2687 "movdqa %%xmm1,%%xmm3 \n"
2688
2689 "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far)
2690 "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far)
2691
2692 "paddd %%xmm4,%%xmm2 \n" // far+2 (lo)
2693 "paddd %%xmm4,%%xmm3 \n" // far+2 (hi)
2694 "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo)
2695 "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi)
2696 "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
2697 "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
2698 "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo)
2699 "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
2700
2701 "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
2702 "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
2703 "packusdw %%xmm1,%%xmm0 \n"
2704 "movdqu %%xmm0,(%1) \n"
2705
2706 "lea 0x8(%0),%0 \n"
2707 "lea 0x10(%1),%1 \n" // 2 uv to 4 uv
2708 "sub $0x4,%2 \n"
2709 "jg 1b \n"
2710 : "+r"(src_ptr), // %0
2711 "+r"(dst_ptr), // %1
2712 "+r"(dst_width) // %2
2713 :
2714 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2715 }
2716 #endif
2717
2718 #ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41
2719 void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
2720 ptrdiff_t src_stride,
2721 uint16_t* dst_ptr,
2722 ptrdiff_t dst_stride,
2723 int dst_width) {
2724 asm volatile(
2725 "pxor %%xmm7,%%xmm7 \n"
2726 "pcmpeqd %%xmm6,%%xmm6 \n"
2727 "psrld $31,%%xmm6 \n"
2728 "pslld $3,%%xmm6 \n" // all 8
2729
2730 LABELALIGN
2731 "1: \n"
2732 "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
2733 "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
2734 "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v)
2735 "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v)
2736 "movdqa %%xmm0,%%xmm2 \n"
2737 "movdqa %%xmm1,%%xmm3 \n"
2738 "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo)
2739 "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi)
2740 "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo)
2741 "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi)
2742 "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo)
2743 "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi)
2744 "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
2745 "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
2746
2747 "movq (%0,%3,2),%%xmm2 \n"
2748 "movq 4(%0,%3,2),%%xmm3 \n"
2749 "punpcklwd %%xmm7,%%xmm2 \n"
2750 "punpcklwd %%xmm7,%%xmm3 \n"
2751 "movdqa %%xmm2,%%xmm4 \n"
2752 "movdqa %%xmm3,%%xmm5 \n"
2753 "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo)
2754 "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi)
2755 "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo)
2756 "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi)
2757 "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo)
2758 "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi)
2759 "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo)
2760 "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
2761
2762 "movdqa %%xmm0,%%xmm4 \n"
2763 "movdqa %%xmm2,%%xmm5 \n"
2764 "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
2765 "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
2766 "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
2767 "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
2768 "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo)
2769
2770 "movdqa %%xmm2,%%xmm5 \n"
2771 "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo)
2772 "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
2773 "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo)
2774 "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
2775 "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo)
2776
2777 "movdqa %%xmm1,%%xmm0 \n"
2778 "movdqa %%xmm3,%%xmm2 \n"
2779 "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi)
2780 "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi)
2781 "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi)
2782 "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
2783 "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi)
2784
2785 "movdqa %%xmm3,%%xmm2 \n"
2786 "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi)
2787 "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi)
2788 "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
2789 "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
2790 "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi)
2791
2792 "packusdw %%xmm0,%%xmm4 \n"
2793 "movdqu %%xmm4,(%1) \n" // store above
2794 "packusdw %%xmm2,%%xmm5 \n"
2795 "movdqu %%xmm5,(%1,%4,2) \n" // store below
2796
2797 "lea 0x8(%0),%0 \n"
2798 "lea 0x10(%1),%1 \n" // 2 uv to 4 uv
2799 "sub $0x4,%2 \n"
2800 "jg 1b \n"
2801 : "+r"(src_ptr), // %0
2802 "+r"(dst_ptr), // %1
2803 "+r"(dst_width) // %2
2804 : "r"((intptr_t)(src_stride)), // %3
2805 "r"((intptr_t)(dst_stride)) // %4
2806 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
2807 "xmm7");
2808 }
2809 #endif
2810
2811 #ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2
2812 void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
2813 uint16_t* dst_ptr,
2814 int dst_width) {
2815 asm volatile(
2816 "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
2817 "vpsrld $31,%%ymm4,%%ymm4 \n"
2818 "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
2819
2820 LABELALIGN
2821 "1: \n"
2822 "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
2823 "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
2824
2825 "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
2826 "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
2827
2828 "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far)
2829 "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far)
2830
2831 "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo)
2832 "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi)
2833 "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo)
2834 "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi)
2835 "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
2836 "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
2837 "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo)
2838 "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi)
2839
2840 "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
2841 "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
2842 "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n"
2843 "vmovdqu %%ymm0,(%1) \n"
2844
2845 "lea 0x10(%0),%0 \n"
2846 "lea 0x20(%1),%1 \n" // 4 uv to 8 uv
2847 "sub $0x8,%2 \n"
2848 "jg 1b \n"
2849 "vzeroupper \n"
2850 : "+r"(src_ptr), // %0
2851 "+r"(dst_ptr), // %1
2852 "+r"(dst_width) // %2
2853 :
2854 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2855 }
2856 #endif
2857
2858 #ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2
2859 void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
2860 ptrdiff_t src_stride,
2861 uint16_t* dst_ptr,
2862 ptrdiff_t dst_stride,
2863 int dst_width) {
2864 asm volatile(
2865 "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
2866 "vpsrld $31,%%ymm6,%%ymm6 \n"
2867 "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
2868
2869 LABELALIGN
2870 "1: \n"
2871
2872 "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
2873 "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
2874 "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
2875 "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
2876 "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far)
2877 "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far)
2878 "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo)
2879 "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi)
2880 "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
2881 "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
2882 "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo)
2883 "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi)
2884
2885 "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v)
2886 "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v)
2887 "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v)
2888 "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v)
2889 "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far)
2890 "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far)
2891 "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo)
2892 "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi)
2893 "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo)
2894 "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi)
2895 "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo)
2896 "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi)
2897
2898 "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
2899 "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
2900 "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
2901 "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
2902 "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
2903
2904 "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
2905 "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
2906 "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
2907 "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
2908 "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
2909
2910 "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
2911 "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
2912 "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
2913 "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
2914 "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
2915
2916 "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
2917 "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
2918 "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
2919 "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
2920 "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
2921
2922 "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n"
2923 "vmovdqu %%ymm4,(%1) \n" // store above
2924 "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n"
2925 "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
2926
2927 "lea 0x10(%0),%0 \n"
2928 "lea 0x20(%1),%1 \n" // 4 uv to 8 uv
2929 "sub $0x8,%2 \n"
2930 "jg 1b \n"
2931 "vzeroupper \n"
2932 : "+r"(src_ptr), // %0
2933 "+r"(dst_ptr), // %1
2934 "+r"(dst_width) // %2
2935 : "r"((intptr_t)(src_stride)), // %3
2936 "r"((intptr_t)(dst_stride)) // %4
2937 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2938 }
2939 #endif
2940
2941 #endif // defined(__x86_64__) || defined(__i386__)
2942
2943 #ifdef __cplusplus
2944 } // extern "C"
2945 } // namespace libyuv
2946 #endif
2947