/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/scale.h"

#include <assert.h>
#include <string.h>
#include <stdlib.h>  // For getenv()

#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"  // For CopyARGB
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// Bilinear SSE2 is disabled.
#define SSE2_DISABLED 1

// ARGB scaling uses bilinear or point, but not box filter.
/**
 * SSE2 downscalers with bilinear interpolation.
 */

#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)

#define HAS_SCALEARGBROWDOWN2_SSE2
// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6).
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
                                   ptrdiff_t /* src_stride */,
                                   uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]   // src_ptr
                         // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

    align 16
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0x88
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]   // src_ptr
    mov esi, [esp + 4 + 8]   // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width

    align 16
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}

#define HAS_SCALEARGBROWDOWNEVEN_SSE2
// Reads 4 pixels at a time.
// Alignment requirement: dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]   // src_ptr
                             // src_stride ignored
    mov ebx, [esp + 8 + 12]  // src_stepx
    mov edx, [esp + 8 + 16]  // dst_ptr
    mov ecx, [esp + 8 + 20]  // dst_width
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 16
  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
                                         ptrdiff_t src_stride,
                                         int src_stepx,
                                         uint8* dst_ptr, int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]   // src_ptr
    mov esi, [esp + 12 + 8]   // src_stride
    mov ebx, [esp + 12 + 12]  // src_stepx
    mov edx, [esp + 12 + 16]  // dst_ptr
    mov ecx, [esp + 12 + 20]  // dst_width
    lea esi, [eax + esi]  // row1 pointer
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 16
  wloop:
    movq xmm0, qword ptr [eax]  // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]  // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Bilinear row filtering combines 4x2 -> 4x1. SSE2 version.
#ifndef SSE2_DISABLED
#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
__declspec(naked) __declspec(align(16))
void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]   // dst_ptr
    mov esi, [esp + 8 + 8]   // src_ptr
    mov edx, [esp + 8 + 12]  // src_stride
    mov ecx, [esp + 8 + 16]  // dst_width
    mov eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub edi, esi
    cmp eax, 0
    je xloop1
    cmp eax, 128
    je xloop2

    movd xmm5, eax  // xmm5 = y fraction
    punpcklbw xmm5, xmm5
    punpcklwd xmm5, xmm5
    pshufd xmm5, xmm5, 0
    pxor xmm4, xmm4

    // f * row1 + (1 - f) * row0
    //   = f * (row1 - row0) + row0
    align 16
  xloop:
    movdqa xmm0, [esi]  // row0
    movdqa xmm2, [esi + edx]  // row1
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    punpcklbw xmm2, xmm4
    punpckhbw xmm3, xmm4
    punpcklbw xmm0, xmm4
    punpckhbw xmm1, xmm4
    psubw xmm2, xmm0  // row1 - row0
    psubw xmm3, xmm1
    pmulhw xmm2, xmm5  // scale diff
    pmulhw xmm3, xmm5
    paddw xmm0, xmm2  // sum rows
    paddw xmm1, xmm3
    packuswb xmm0, xmm1
    sub ecx, 4
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop

    shufps xmm0, xmm0, 0xff
    movdqa [esi + edi], xmm0  // duplicate last pixel for filtering
    pop edi
    pop esi
    ret

    align 16
  xloop1:
    movdqa xmm0, [esi]
    sub ecx, 4
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop1

    shufps xmm0, xmm0, 0xff
    movdqa [esi + edi], xmm0
    pop edi
    pop esi
    ret

    align 16
  xloop2:
    movdqa xmm0, [esi]
    pavgb xmm0, [esi + edx]
    sub ecx, 4
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop2

    shufps xmm0, xmm0, 0xff
    movdqa [esi + edi], xmm0
    pop edi
    pop esi
    ret
  }
}
#endif  // SSE2_DISABLED

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
#define HAS_SCALEARGBFILTERROWS_SSSE3
__declspec(naked) __declspec(align(16))
void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                               ptrdiff_t src_stride, int dst_width,
                               int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]   // dst_ptr
    mov esi, [esp + 8 + 8]   // src_ptr
    mov edx, [esp + 8 + 12]  // src_stride
    mov ecx, [esp + 8 + 16]  // dst_width
    mov eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub edi, esi
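    // The y fraction is halved to 0..127 so that, with its complement
    // (128 - f) built below, the (128 - f, f) byte weights fit pmaddubsw's
    // signed-byte coefficients; psrlw 7 scales the weighted sum back down.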
    shr eax, 1
    cmp eax, 0
    je xloop1
    cmp eax, 64
    je xloop2
    movd xmm0, eax  // high fraction 0..127
    neg eax
    add eax, 128
    movd xmm5, eax  // low fraction 128..1
    punpcklbw xmm5, xmm0
    punpcklwd xmm5, xmm5
    pshufd xmm5, xmm5, 0

    align 16
  xloop:
    movdqa xmm0, [esi]
    movdqa xmm2, [esi + edx]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    pmaddubsw xmm0, xmm5
    pmaddubsw xmm1, xmm5
    psrlw xmm0, 7
    psrlw xmm1, 7
    packuswb xmm0, xmm1
    sub ecx, 4
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop

    shufps xmm0, xmm0, 0xff
    movdqa [esi + edi], xmm0  // duplicate last pixel for filtering
    pop edi
    pop esi
    ret

    align 16
  xloop1:
    movdqa xmm0, [esi]
    sub ecx, 4
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop1

    shufps xmm0, xmm0, 0xff
    movdqa [esi + edi], xmm0
    pop edi
    pop esi
    ret

    align 16
  xloop2:
    movdqa xmm0, [esi]
    pavgb xmm0, [esi + edx]
    sub ecx, 4
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop2

    shufps xmm0, xmm0, 0xff
    movdqa [esi + edi], xmm0
    pop edi
    pop esi
    ret
  }
}

#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
#define HAS_SCALEARGBROWDOWN2_SSE2
static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
                                   ptrdiff_t /* src_stride */,
                                   uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa (%0,%3,1),%%xmm2 \n"
    "movdqa 0x10(%0,%3,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"(static_cast<intptr_t>(src_stride))  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

#define HAS_SCALEARGBROWDOWNEVEN_SSE2
// Reads 4 pixels at a time.
// Alignment requirement: dst_ptr 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_ptr, int dst_width) {
  intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
  intptr_t src_stepx_x12 = 0;
  asm volatile (
    "lea 0x0(,%1,4),%1 \n"
    "lea (%1,%1,2),%4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movd (%0),%%xmm0 \n"
    "movd (%0,%1,1),%%xmm1 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    "movd (%0,%1,2),%%xmm2 \n"
    "movd (%0,%4,1),%%xmm3 \n"
    "lea (%0,%1,4),%0 \n"
    "punpckldq %%xmm3,%%xmm2 \n"
    "punpcklqdq %%xmm2,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%2) \n"
    "lea 0x10(%2),%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),       // %0
    "+r"(src_stepx_x4),  // %1
    "+r"(dst_ptr),       // %2
    "+r"(dst_width),     // %3
    "+r"(src_stepx_x12)  // %4
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_ptr 16 byte aligned.
static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
                                         ptrdiff_t src_stride, int src_stepx,
                                         uint8* dst_ptr, int dst_width) {
  intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
  intptr_t src_stepx_x12 = 0;
  intptr_t row1 = static_cast<intptr_t>(src_stride);
  asm volatile (
    "lea 0x0(,%1,4),%1 \n"
    "lea (%1,%1,2),%4 \n"
    "lea (%0,%5,1),%5 \n"
    ".p2align 4 \n"
    "1: \n"
    "movq (%0),%%xmm0 \n"
    "movhps (%0,%1,1),%%xmm0 \n"
    "movq (%0,%1,2),%%xmm1 \n"
    "movhps (%0,%4,1),%%xmm1 \n"
    "lea (%0,%1,4),%0 \n"
    "movq (%5),%%xmm2 \n"
    "movhps (%5,%1,1),%%xmm2 \n"
    "movq (%5,%1,2),%%xmm3 \n"
    "movhps (%5,%4,1),%%xmm3 \n"
    "lea (%5,%1,4),%5 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0,(%2) \n"
    "lea 0x10(%2),%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),        // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_ptr),        // %2
    "+rm"(dst_width),     // %3
    "+r"(src_stepx_x12),  // %4
    "+r"(row1)            // %5
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

#ifndef SSE2_DISABLED
// Bilinear row filtering combines 4x2 -> 4x1. SSE2 version.
#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) {
  asm volatile (
    "sub %1,%0 \n"
    "cmp $0x0,%3 \n"
    "je 2f \n"
    "cmp $0x80,%3 \n"
    "je 3f \n"
    "movd %3,%%xmm5 \n"
    "punpcklbw %%xmm5,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%1),%%xmm0 \n"
    "movdqa (%1,%4,1),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklbw %%xmm4,%%xmm2 \n"
    "punpckhbw %%xmm4,%%xmm3 \n"
    "punpcklbw %%xmm4,%%xmm0 \n"
    "punpckhbw %%xmm4,%%xmm1 \n"
    "psubw %%xmm0,%%xmm2 \n"
    "psubw %%xmm1,%%xmm3 \n"
    "pmulhw %%xmm5,%%xmm2 \n"
    "pmulhw %%xmm5,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%1,%0,1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
    "jmp 4f \n"
    ".p2align 4 \n"
    "2: \n"
    "movdqa (%1),%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%1,%0,1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 2b \n"
    "jmp 4f \n"
    ".p2align 4 \n"
    "3: \n"
    "movdqa (%1),%%xmm0 \n"
    "pavgb (%1,%4,1),%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%1,%0,1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 3b \n"
    ".p2align 4 \n"
    "4: \n"
    "shufps $0xff,%%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,(%1,%0,1) \n"
  : "+r"(dst_ptr),            // %0
    "+r"(src_ptr),            // %1
    "+r"(dst_width),          // %2
    "+r"(source_y_fraction)   // %3
  : "r"(static_cast<intptr_t>(src_stride))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // SSE2_DISABLED

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
#define HAS_SCALEARGBFILTERROWS_SSSE3
void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                               ptrdiff_t src_stride, int dst_width,
                               int source_y_fraction) {
  asm volatile (
    "sub %1,%0 \n"
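    // Halve the fraction to 0..127 so the (128 - f, f) byte weights built
    // below fit pmaddubsw's signed-byte coefficients; psrlw 7 scales back.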
    "shr %3 \n"
    "cmp $0x0,%3 \n"
    "je 2f \n"
    "cmp $0x40,%3 \n"
    "je 3f \n"
    "movd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x80,%3 \n"
    "movd %3,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%1),%%xmm0 \n"
    "movdqa (%1,%4,1),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "pmaddubsw %%xmm5,%%xmm0 \n"
    "pmaddubsw %%xmm5,%%xmm1 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%1,%0,1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 1b \n"
    "jmp 4f \n"
    ".p2align 4 \n"
    "2: \n"
    "movdqa (%1),%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%1,%0,1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 2b \n"
    "jmp 4f \n"
    ".p2align 4 \n"
    "3: \n"
    "movdqa (%1),%%xmm0 \n"
    "pavgb (%1,%4,1),%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0,(%1,%0,1) \n"
    "lea 0x10(%1),%1 \n"
    "jg 3b \n"
    ".p2align 4 \n"
    "4: \n"
    "shufps $0xff,%%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,(%1,%0,1) \n"
  : "+r"(dst_ptr),            // %0
    "+r"(src_ptr),            // %1
    "+r"(dst_width),          // %2
    "+r"(source_y_fraction)   // %3
  : "r"(static_cast<intptr_t>(src_stride))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm5"
#endif
  );
}
#endif  // defined(__x86_64__) || defined(__i386__)

static void ScaleARGBRowDown2_C(const uint8* src_ptr,
                                ptrdiff_t /* src_stride */,
                                uint8* dst_ptr, int dst_width) {
  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);

  for (int x = 0; x < dst_width - 1; x += 2) {
    dst[0] = src[0];
    dst[1] = src[2];
    src += 4;
    dst += 2;
  }
  if (dst_width & 1) {
    dst[0] = src[0];
  }
}

static void ScaleARGBRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
                  src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
    dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
                  src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
    dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
                  src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
    dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
                  src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
    src_ptr += 8;
    dst_ptr += 4;
  }
}

void ScaleARGBRowDownEven_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                            int src_stepx,
                            uint8* dst_ptr, int dst_width) {
  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);

  for (int x = 0; x < dst_width - 1; x += 2) {
    dst[0] = src[0];
    dst[1] = src[src_stepx];
    src += src_stepx * 2;
    dst += 2;
  }
  if (dst_width & 1) {
    dst[0] = src[0];
  }
}

static void ScaleARGBRowDownEvenInt_C(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      int src_stepx,
                                      uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
                  src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
    dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
                  src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
    dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
                  src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
    dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
                  src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
    src_ptr += src_stepx * 4;
    dst_ptr += 4;
  }
}

// Bilinear blend: (1-f)a + fb can be replaced with a + f(b-a),
// since (1-f)a + fb = a - fa + fb.

#define BLENDER1(a, b, f) (static_cast<int>(a) + \
    ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))

#define BLENDERC(a, b, f, s) static_cast<uint32>( \
    BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)

#define BLENDER(a, b, f) \
    BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
    BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
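
// For example, with f = 0x8000 (0.5 in 16 bit fixed point), blending the
// channel values a = 255 and b = 0 gives
//   255 + ((0x8000 * (0 - 255)) >> 16) = 255 - 128 = 127,
// so BLENDER(0xff00ff00, 0x00000000, 0x8000) is 0x7f007f00.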

static void ScaleARGBFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
                                  int dst_width, int x, int dx) {
  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
  for (int j = 0; j < dst_width - 1; j += 2) {
    int xi = x >> 16;
    uint32 a = src[xi];
    uint32 b = src[xi + 1];
    dst[0] = BLENDER(a, b, x & 0xffff);
    x += dx;
    xi = x >> 16;
    a = src[xi];
    b = src[xi + 1];
    dst[1] = BLENDER(a, b, x & 0xffff);
    x += dx;
    dst += 2;
  }
  if (dst_width & 1) {
    int xi = x >> 16;
    uint32 a = src[xi];
    uint32 b = src[xi + 1];
    dst[0] = BLENDER(a, b, x & 0xffff);
  }
}

static const int kMaxInputWidth = 2560;

// C version 2x2 -> 2x1
void ScaleARGBFilterRows_C(uint8* dst_ptr, const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           int dst_width, int source_y_fraction) {
  assert(dst_width > 0);
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  uint8* end = dst_ptr + (dst_width << 2);
  do {
    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
    dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
    dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
    dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
    dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
    dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
    dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
    src_ptr += 8;
    src_ptr1 += 8;
    dst_ptr += 8;
  } while (dst_ptr < end);
  // Duplicate the last pixel (4 bytes) for filtering.
  dst_ptr[0] = dst_ptr[-4];
  dst_ptr[1] = dst_ptr[-3];
  dst_ptr[2] = dst_ptr[-2];
  dst_ptr[3] = dst_ptr[-1];
}

/**
 * ScaleARGB ARGB, 1/2
 *
 * This is an optimized version for scaling down an ARGB image to
 * 1/2 of its original size.
 */
static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
                           int dst_width, int dst_height,
                           int src_stride, int dst_stride,
                           const uint8* src_ptr, uint8* dst_ptr,
                           FilterMode filtering) {
  void (*ScaleARGBRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) =
      filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(dst_width, 4) &&
      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
    ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
        ScaleARGBRowDown2_SSE2;
  }
#endif

  // TODO(fbarchard): Loop through source height to allow odd height.
  for (int y = 0; y < dst_height; ++y) {
    ScaleARGBRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
    src_ptr += (src_stride << 1);
    dst_ptr += dst_stride;
  }
}

/**
 * ScaleARGB ARGB Even
 *
 * This is an optimized version for scaling down an ARGB image to an
 * even multiple of its original size.
 */
static void ScaleARGBDownEven(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint8* src_ptr, uint8* dst_ptr,
                              FilterMode filtering) {
  assert(IS_ALIGNED(src_width, 2));
  assert(IS_ALIGNED(src_height, 2));
  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, ptrdiff_t src_stride,
                               int src_step, uint8* dst_ptr, int dst_width) =
      filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(dst_width, 4) &&
      IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_SSE2 :
        ScaleARGBRowDownEven_SSE2;
  }
#endif
  int src_step = src_width / dst_width;
  // Adjust to point to center of box.
  int row_step = src_height / dst_height;
  int row_stride = row_step * src_stride;
  src_ptr += ((row_step >> 1) - 1) * src_stride + ((src_step >> 1) - 1) * 4;
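  // For example, a 4x downscale has row_step = src_step = 4, so this
  // advances one row and one pixel (4 bytes), sampling near the center
  // of each 4x4 box instead of its top-left corner.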
  for (int y = 0; y < dst_height; ++y) {
    ScaleARGBRowDownEven(src_ptr, src_stride, src_step, dst_ptr, dst_width);
    src_ptr += row_stride;
    dst_ptr += dst_stride;
  }
}

/**
 * ScaleARGB ARGB to/from any dimensions, with bilinear
 * interpolation.
 */

static void ScaleARGBBilinear(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint8* src_ptr, uint8* dst_ptr) {
  assert(dst_width > 0);
  assert(dst_height > 0);
  assert(src_width <= kMaxInputWidth);
  SIMD_ALIGNED(uint8 row[kMaxInputWidth * 4 + 16]);
  void (*ScaleARGBFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride,
                              int dst_width, int source_y_fraction) =
      ScaleARGBFilterRows_C;
#if defined(HAS_SCALEARGBFILTERROWS_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
    ScaleARGBFilterRows = ScaleARGBFilterRows_SSE2;
  }
#endif
#if defined(HAS_SCALEARGBFILTERROWS_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
    ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
  }
#endif
  int dx = (src_width << 16) / dst_width;
  int dy = (src_height << 16) / dst_height;
  int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
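  // For example, a 2x downscale has dx = 0x20000, so x starts at
  // 0x20000 / 2 - 0x8000 = 0x8000 (source position 0.5), centering the
  // first sample between the first two source pixels.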
  int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
  for (int j = 0; j < dst_height; ++j) {
    int yi = y >> 16;
    int yf = (y >> 8) & 255;
    const uint8* src = src_ptr + yi * src_stride;
    ScaleARGBFilterRows(row, src, src_stride, src_width, yf);
    ScaleARGBFilterCols_C(dst_ptr, row, dst_width, x, dx);
    dst_ptr += dst_stride;
    y += dy;
    if (y > maxy) {
      y = maxy;
    }
  }
}

// Scales a single row of pixels using point sampling.
// Code is adapted from libyuv bilinear YUV scaling, but with bilinear
// interpolation off, and ARGB pixels instead of YUV.
static void ScaleARGBCols(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
  for (int j = 0; j < dst_width - 1; j += 2) {
    dst[0] = src[x >> 16];
    x += dx;
    dst[1] = src[x >> 16];
    x += dx;
    dst += 2;
  }
  if (dst_width & 1) {
    dst[0] = src[x >> 16];
  }
}

/**
 * ScaleARGB ARGB to/from any dimensions, without interpolation.
 * Fixed point math is used for performance: The upper 16 bits
 * of x and dx are the integer part of the source position and
 * the lower 16 bits are the fixed decimal part.
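 *
 * For example, scaling 640 source pixels to 320 gives
 * dx = (640 << 16) / 320 = 0x20000, so each output pixel advances the
 * source position by exactly two pixels.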
 */

static void ScaleARGBSimple(int src_width, int src_height,
                            int dst_width, int dst_height,
                            int src_stride, int dst_stride,
                            const uint8* src_ptr, uint8* dst_ptr) {
  int dx = (src_width << 16) / dst_width;
  int dy = (src_height << 16) / dst_height;
  int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
  for (int i = 0; i < dst_height; ++i) {
    ScaleARGBCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
    dst_ptr += dst_stride;
    y += dy;
  }
}

/**
 * ScaleARGB ARGB to/from any dimensions.
 */
static void ScaleARGBAnySize(int src_width, int src_height,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
                             const uint8* src_ptr, uint8* dst_ptr,
                             FilterMode filtering) {
  if (!filtering || (src_width > kMaxInputWidth)) {
    ScaleARGBSimple(src_width, src_height, dst_width, dst_height,
                    src_stride, dst_stride, src_ptr, dst_ptr);
  } else {
    ScaleARGBBilinear(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src_ptr, dst_ptr);
  }
}

// Scale an ARGB image.
//
// This function in turn calls a scaling function
// suitable for handling the desired resolutions.

static void ScaleARGB(const uint8* src, int src_stride,
                      int src_width, int src_height,
                      uint8* dst, int dst_stride,
                      int dst_width, int dst_height,
                      FilterMode filtering) {
#ifdef CPU_X86
  // environment variable overrides for testing.
  char* filter_override = getenv("LIBYUV_FILTER");
  if (filter_override) {
    filtering = (FilterMode)atoi(filter_override);  // NOLINT
  }
#endif
  if (dst_width == src_width && dst_height == src_height) {
    // Straight copy.
    ARGBCopy(src, src_stride, dst, dst_stride, dst_width, dst_height);
    return;
  }
  if (2 * dst_width == src_width && 2 * dst_height == src_height) {
    // Optimized 1/2.
    ScaleARGBDown2(src_width, src_height, dst_width, dst_height,
                   src_stride, dst_stride, src, dst, filtering);
    return;
  }
  int scale_down_x = src_width / dst_width;
  int scale_down_y = src_height / dst_height;
  if (dst_width * scale_down_x == src_width &&
      dst_height * scale_down_y == src_height) {
    if (!(scale_down_x & 1) && !(scale_down_y & 1)) {
      // Optimized even scale down, i.e. by a factor of 4, 6, 8 or 10.
      ScaleARGBDownEven(src_width, src_height, dst_width, dst_height,
                        src_stride, dst_stride, src, dst, filtering);
      return;
    }
    if ((scale_down_x & 1) && (scale_down_y & 1)) {
      filtering = kFilterNone;
    }
  }
  // Arbitrary scale up and/or down.
  ScaleARGBAnySize(src_width, src_height, dst_width, dst_height,
                   src_stride, dst_stride, src, dst, filtering);
}

// Scale an ARGB image.
LIBYUV_API
int ARGBScale(const uint8* src_argb, int src_stride_argb,
              int src_width, int src_height,
              uint8* dst_argb, int dst_stride_argb,
              int dst_width, int dst_height,
              FilterMode filtering) {
  if (!src_argb || src_width <= 0 || src_height == 0 ||
      !dst_argb || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
    src_argb = src_argb + (src_height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
            dst_argb, dst_stride_argb, dst_width, dst_height,
            filtering);
  return 0;
}
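
// A minimal usage sketch (hypothetical dimensions; buffers are
// caller-owned and tightly packed, so stride = width * 4):
//   uint8 src[640 * 360 * 4];  // 640x360 ARGB source
//   uint8 dst[320 * 180 * 4];  // 320x180 ARGB destination
//   ARGBScale(src, 640 * 4, 640, 360,
//             dst, 320 * 4, 320, 180, kFilterBilinear);
// This 2x downscale dispatches to the optimized ScaleARGBDown2 path.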

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
