1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/scale.h"
12
13 #include <assert.h>
14 #include <string.h>
15 #include <stdlib.h> // For getenv()
16
17 #include "libyuv/cpu_id.h"
18 #include "libyuv/planar_functions.h" // For CopyPlane
19 #include "libyuv/row.h"
20
21 #ifdef __cplusplus
22 namespace libyuv {
23 extern "C" {
24 #endif
25
26 // Bilinear SSE2 is disabled.
27 #define SSE2_DISABLED 1
28
29 // Note: Some SSE2 reference manuals
30 // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
31
32 // Set the following flag to true to revert to only
33 // using the reference implementation ScalePlaneBox(), and
34 // NOT the optimized versions. Useful for debugging and
35 // when comparing the quality of the resulting YUV planes
36 // as produced by the optimized and non-optimized versions.
37 static bool use_reference_impl_ = false;
38
39 LIBYUV_API
40 void SetUseReferenceImpl(bool use) {
41 use_reference_impl_ = use;
42 }
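
// For example, a test might force the reference path before comparing its
// output against the optimized path (illustrative usage only):
//   SetUseReferenceImpl(true);   // scale with the ScalePlaneBox() reference
//   ... run the scale and capture the planes ...
//   SetUseReferenceImpl(false);  // restore the optimized code paths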
43
44 // ScaleRowDown2Int is also used by planar functions.
45
46 /**
47 * NEON downscalers with interpolation.
48 *
49 * Provided by Fritz Koenig
50 *
51 */
52
53 #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
54 #define HAS_SCALEROWDOWN2_NEON
55 // Note: not static because it is reused by the 444-to-420 conversion.
56 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
57 uint8* dst, int dst_width);
58
59 void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
60 uint8* dst, int dst_width);
61
62 #define HAS_SCALEROWDOWN4_NEON
63 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
64 uint8* dst_ptr, int dst_width);
65 void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
66 uint8* dst_ptr, int dst_width);
67
68 #define HAS_SCALEROWDOWN34_NEON
69 // Down scale from 4 to 3 pixels. Uses the NEON multilane read/write
70 // to load every 4th pixel into one of 4 different registers.
71 // Point samples 32 pixels to 24 pixels.
72 void ScaleRowDown34_NEON(const uint8* src_ptr,
73 ptrdiff_t /* src_stride */,
74 uint8* dst_ptr, int dst_width);
75 void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
76 ptrdiff_t src_stride,
77 uint8* dst_ptr, int dst_width);
78 void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
79 ptrdiff_t src_stride,
80 uint8* dst_ptr, int dst_width);
81
82 #define HAS_SCALEROWDOWN38_NEON
83 // 32 -> 12
84 void ScaleRowDown38_NEON(const uint8* src_ptr,
85 ptrdiff_t /* src_stride */,
86 uint8* dst_ptr, int dst_width);
87 // 32x3 -> 12x1
88 void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
89 ptrdiff_t src_stride,
90 uint8* dst_ptr, int dst_width);
91 // 32x2 -> 12x1
92 void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
93 ptrdiff_t src_stride,
94 uint8* dst_ptr, int dst_width);
95 // 16x2 -> 16x1
96 #define HAS_SCALEFILTERROWS_NEON
97 void ScaleFilterRows_NEON(uint8* dst_ptr,
98 const uint8* src_ptr, ptrdiff_t src_stride,
99 int dst_width, int source_y_fraction);
100
101 /**
102 * SSE2 downscalers with interpolation.
103 *
104 * Provided by Frank Barchard (fbarchard@google.com)
105 *
106 */
107
108
109 // Constants for SSSE3 code
110 #elif !defined(YUV_DISABLE_ASM) && \
111 (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
112
113 // GCC 4.2 on OSX has a link error when passing static or const to inline.
114 // TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
115 #ifdef __APPLE__
116 #define CONST
117 #else
118 #define CONST static const
119 #endif
120
121 // Offsets for source bytes 0 to 9
122 CONST uvec8 kShuf0 =
123 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
124
125 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
126 CONST uvec8 kShuf1 =
127 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
128
129 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
130 CONST uvec8 kShuf2 =
131 { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
132
133 // Offsets for source bytes 0 to 10
134 CONST uvec8 kShuf01 =
135 { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
136
137 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
138 CONST uvec8 kShuf11 =
139 { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
140
141 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
142 CONST uvec8 kShuf21 =
143 { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
144
145 // Coefficients for source bytes 0 to 10
146 CONST uvec8 kMadd01 =
147 { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
148
149 // Coefficients for source bytes 10 to 21
150 CONST uvec8 kMadd11 =
151 { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
152
153 // Coefficients for source bytes 21 to 31
154 CONST uvec8 kMadd21 =
155 { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
156
157 // Rounding constant for the >> 2 in the 3/4 box filters.
158 CONST vec16 kRound34 =
159 { 2, 2, 2, 2, 2, 2, 2, 2 };
160
161 CONST uvec8 kShuf38a =
162 { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
163
164 CONST uvec8 kShuf38b =
165 { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
166
167 // Arrange words 0,3,6 into 0,1,2
168 CONST uvec8 kShufAc =
169 { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
170
171 // Arrange words 0,3,6 into 3,4,5
172 CONST uvec8 kShufAc3 =
173 { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
174
175 // Scaling values for boxes of 3x3 and 2x3
176 CONST uvec16 kScaleAc33 =
177 { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
178
179 // Arrange first value for pixels 0,1,2,3,4,5
180 CONST uvec8 kShufAb0 =
181 { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
182
183 // Arrange second value for pixels 0,1,2,3,4,5
184 CONST uvec8 kShufAb1 =
185 { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
186
187 // Arrange third value for pixels 0,1,2,3,4,5
188 CONST uvec8 kShufAb2 =
189 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
190
191 // Scaling values for boxes of 3x2 and 2x2
192 CONST uvec16 kScaleAb2 =
193 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
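
// The kScaleAc33/kScaleAb2 values above implement division by a small box
// size using pmulhuw (multiply, keep the high 16 bits). A scalar sketch of
// the idea, assuming the sum fits in 16 bits:
//   uint16 sum = ...;                                        // e.g. 3x3 box sum
//   uint8 avg = static_cast<uint8>((sum * (65536 / 9)) >> 16);  // ~ sum / 9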
194 #endif
195
196 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
197
198 #define HAS_SCALEROWDOWN2_SSE2
199 // Reads 32 pixels, throws half away and writes 16 pixels.
200 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
201 __declspec(naked) __declspec(align(16))
202 static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
203 uint8* dst_ptr, int dst_width) {
204 __asm {
205 mov eax, [esp + 4] // src_ptr
206 // src_stride ignored
207 mov edx, [esp + 12] // dst_ptr
208 mov ecx, [esp + 16] // dst_width
209 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
210 psrlw xmm5, 8
211
212 align 16
213 wloop:
214 movdqa xmm0, [eax]
215 movdqa xmm1, [eax + 16]
216 lea eax, [eax + 32]
217 pand xmm0, xmm5
218 pand xmm1, xmm5
219 packuswb xmm0, xmm1
220 sub ecx, 16
221 movdqa [edx], xmm0
222 lea edx, [edx + 16]
223 jg wloop
224
225 ret
226 }
227 }
228 // Blends 32x2 rectangle to 16x1.
229 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
230 __declspec(naked) __declspec(align(16))
231 void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
232 uint8* dst_ptr, int dst_width) {
233 __asm {
234 push esi
235 mov eax, [esp + 4 + 4] // src_ptr
236 mov esi, [esp + 4 + 8] // src_stride
237 mov edx, [esp + 4 + 12] // dst_ptr
238 mov ecx, [esp + 4 + 16] // dst_width
239 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
240 psrlw xmm5, 8
241
242 align 16
243 wloop:
244 movdqa xmm0, [eax]
245 movdqa xmm1, [eax + 16]
246 movdqa xmm2, [eax + esi]
247 movdqa xmm3, [eax + esi + 16]
248 lea eax, [eax + 32]
249 pavgb xmm0, xmm2 // average rows
250 pavgb xmm1, xmm3
251
252 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
253 psrlw xmm0, 8
254 movdqa xmm3, xmm1
255 psrlw xmm1, 8
256 pand xmm2, xmm5
257 pand xmm3, xmm5
258 pavgw xmm0, xmm2
259 pavgw xmm1, xmm3
260 packuswb xmm0, xmm1
261
262 sub ecx, 16
263 movdqa [edx], xmm0
264 lea edx, [edx + 16]
265 jg wloop
266
267 pop esi
268 ret
269 }
270 }
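
// Note on the column step above: the row averages stay packed as bytes; the
// even bytes are isolated with the 0x00ff mask in xmm5, the odd bytes with the
// 8-bit word shift, and pavgw rounds each pair. Roughly, per output pixel
// (a sketch of the intent, not the exact rounding of the C reference):
//   row = (above + below + 1) >> 1;          // pavgb of the two source rows
//   out = (row_left + row_right + 1) >> 1;   // pavgw of masked/shifted bytes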
271
272 // Reads 32 pixels, throws half away and writes 16 pixels.
273 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
274 __declspec(naked) __declspec(align(16))
275 static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
276 ptrdiff_t src_stride,
277 uint8* dst_ptr, int dst_width) {
278 __asm {
279 mov eax, [esp + 4] // src_ptr
280 // src_stride ignored
281 mov edx, [esp + 12] // dst_ptr
282 mov ecx, [esp + 16] // dst_width
283 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
284 psrlw xmm5, 8
285
286 align 16
287 wloop:
288 movdqu xmm0, [eax]
289 movdqu xmm1, [eax + 16]
290 lea eax, [eax + 32]
291 pand xmm0, xmm5
292 pand xmm1, xmm5
293 packuswb xmm0, xmm1
294 sub ecx, 16
295 movdqu [edx], xmm0
296 lea edx, [edx + 16]
297 jg wloop
298
299 ret
300 }
301 }
302 // Blends 32x2 rectangle to 16x1.
303 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
304 __declspec(naked) __declspec(align(16))
305 static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
306 ptrdiff_t src_stride,
307 uint8* dst_ptr, int dst_width) {
308 __asm {
309 push esi
310 mov eax, [esp + 4 + 4] // src_ptr
311 mov esi, [esp + 4 + 8] // src_stride
312 mov edx, [esp + 4 + 12] // dst_ptr
313 mov ecx, [esp + 4 + 16] // dst_width
314 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
315 psrlw xmm5, 8
316
317 align 16
318 wloop:
319 movdqu xmm0, [eax]
320 movdqu xmm1, [eax + 16]
321 movdqu xmm2, [eax + esi]
322 movdqu xmm3, [eax + esi + 16]
323 lea eax, [eax + 32]
324 pavgb xmm0, xmm2 // average rows
325 pavgb xmm1, xmm3
326
327 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
328 psrlw xmm0, 8
329 movdqa xmm3, xmm1
330 psrlw xmm1, 8
331 pand xmm2, xmm5
332 pand xmm3, xmm5
333 pavgw xmm0, xmm2
334 pavgw xmm1, xmm3
335 packuswb xmm0, xmm1
336
337 sub ecx, 16
338 movdqu [edx], xmm0
339 lea edx, [edx + 16]
340 jg wloop
341
342 pop esi
343 ret
344 }
345 }
346
347 #define HAS_SCALEROWDOWN4_SSE2
348 // Point samples 32 pixels to 8 pixels.
349 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
350 __declspec(naked) __declspec(align(16))
351 static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
352 uint8* dst_ptr, int dst_width) {
353 __asm {
354 mov eax, [esp + 4] // src_ptr
355 // src_stride ignored
356 mov edx, [esp + 12] // dst_ptr
357 mov ecx, [esp + 16] // dst_width
358 pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
359 psrld xmm5, 24
360
361 align 16
362 wloop:
363 movdqa xmm0, [eax]
364 movdqa xmm1, [eax + 16]
365 lea eax, [eax + 32]
366 pand xmm0, xmm5
367 pand xmm1, xmm5
368 packuswb xmm0, xmm1
369 packuswb xmm0, xmm0
370 sub ecx, 8
371 movq qword ptr [edx], xmm0
372 lea edx, [edx + 8]
373 jg wloop
374
375 ret
376 }
377 }
378
379 // Blends 32x4 rectangle to 8x1.
380 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
381 __declspec(naked) __declspec(align(16))
382 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
383 uint8* dst_ptr, int dst_width) {
384 __asm {
385 push esi
386 push edi
387 mov eax, [esp + 8 + 4] // src_ptr
388 mov esi, [esp + 8 + 8] // src_stride
389 mov edx, [esp + 8 + 12] // dst_ptr
390 mov ecx, [esp + 8 + 16] // dst_width
391 lea edi, [esi + esi * 2] // src_stride * 3
392 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
393 psrlw xmm7, 8
394
395 align 16
396 wloop:
397 movdqa xmm0, [eax]
398 movdqa xmm1, [eax + 16]
399 movdqa xmm2, [eax + esi]
400 movdqa xmm3, [eax + esi + 16]
401 pavgb xmm0, xmm2 // average rows
402 pavgb xmm1, xmm3
403 movdqa xmm2, [eax + esi * 2]
404 movdqa xmm3, [eax + esi * 2 + 16]
405 movdqa xmm4, [eax + edi]
406 movdqa xmm5, [eax + edi + 16]
407 lea eax, [eax + 32]
408 pavgb xmm2, xmm4
409 pavgb xmm3, xmm5
410 pavgb xmm0, xmm2
411 pavgb xmm1, xmm3
412
413 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
414 psrlw xmm0, 8
415 movdqa xmm3, xmm1
416 psrlw xmm1, 8
417 pand xmm2, xmm7
418 pand xmm3, xmm7
419 pavgw xmm0, xmm2
420 pavgw xmm1, xmm3
421 packuswb xmm0, xmm1
422
423 movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
424 psrlw xmm0, 8
425 pand xmm2, xmm7
426 pavgw xmm0, xmm2
427 packuswb xmm0, xmm0
428
429 sub ecx, 8
430 movq qword ptr [edx], xmm0
431 lea edx, [edx + 8]
432 jg wloop
433
434 pop edi
435 pop esi
436 ret
437 }
438 }
439
440 #define HAS_SCALEROWDOWN8_SSE2
441 // Point samples 32 pixels to 4 pixels.
442 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
443 __declspec(naked) __declspec(align(16))
444 static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
445 uint8* dst_ptr, int dst_width) {
446 __asm {
447 mov eax, [esp + 4] // src_ptr
448 // src_stride ignored
449 mov edx, [esp + 12] // dst_ptr
450 mov ecx, [esp + 16] // dst_width
451 pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes
452 psrlq xmm5, 56
453
454 align 16
455 wloop:
456 movdqa xmm0, [eax]
457 movdqa xmm1, [eax + 16]
458 lea eax, [eax + 32]
459 pand xmm0, xmm5
460 pand xmm1, xmm5
461 packuswb xmm0, xmm1 // 32->16
462 packuswb xmm0, xmm0 // 16->8
463 packuswb xmm0, xmm0 // 8->4
464 sub ecx, 4
465 movd dword ptr [edx], xmm0
466 lea edx, [edx + 4]
467 jg wloop
468
469 ret
470 }
471 }
472
473 // Blends 32x8 rectangle to 4x1.
474 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
475 __declspec(naked) __declspec(align(16))
476 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
477 uint8* dst_ptr, int dst_width) {
478 __asm {
479 push esi
480 push edi
481 push ebp
482 mov eax, [esp + 12 + 4] // src_ptr
483 mov esi, [esp + 12 + 8] // src_stride
484 mov edx, [esp + 12 + 12] // dst_ptr
485 mov ecx, [esp + 12 + 16] // dst_width
486 lea edi, [esi + esi * 2] // src_stride * 3
487 pxor xmm7, xmm7
488
489 align 16
490 wloop:
491 movdqa xmm0, [eax] // average 8 rows to 1
492 movdqa xmm1, [eax + 16]
493 movdqa xmm2, [eax + esi]
494 movdqa xmm3, [eax + esi + 16]
495 pavgb xmm0, xmm2
496 pavgb xmm1, xmm3
497 movdqa xmm2, [eax + esi * 2]
498 movdqa xmm3, [eax + esi * 2 + 16]
499 movdqa xmm4, [eax + edi]
500 movdqa xmm5, [eax + edi + 16]
501 lea ebp, [eax + esi * 4]
502 lea eax, [eax + 32]
503 pavgb xmm2, xmm4
504 pavgb xmm3, xmm5
505 pavgb xmm0, xmm2
506 pavgb xmm1, xmm3
507
508 movdqa xmm2, [ebp]
509 movdqa xmm3, [ebp + 16]
510 movdqa xmm4, [ebp + esi]
511 movdqa xmm5, [ebp + esi + 16]
512 pavgb xmm2, xmm4
513 pavgb xmm3, xmm5
514 movdqa xmm4, [ebp + esi * 2]
515 movdqa xmm5, [ebp + esi * 2 + 16]
516 movdqa xmm6, [ebp + edi]
517 pavgb xmm4, xmm6
518 movdqa xmm6, [ebp + edi + 16]
519 pavgb xmm5, xmm6
520 pavgb xmm2, xmm4
521 pavgb xmm3, xmm5
522 pavgb xmm0, xmm2
523 pavgb xmm1, xmm3
524
525 psadbw xmm0, xmm7 // average 32 pixels to 4
526 psadbw xmm1, xmm7
527 pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01
528 pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx
529 por xmm0, xmm1 // -> 3201
530 psrlw xmm0, 3
531 packuswb xmm0, xmm0
532 packuswb xmm0, xmm0
533
534 sub ecx, 4
535 movd dword ptr [edx], xmm0
536 lea edx, [edx + 4]
537 jg wloop
538
539 pop ebp
540 pop edi
541 pop esi
542 ret
543 }
544 }
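
// Note on the psadbw step above: with xmm7 zeroed, psadbw sums each group of
// 8 bytes, so each 64-bit lane ends up holding the horizontal sum of 8
// row-averaged pixels; the following >> 3 divides by 8, completing an
// approximate 8x8 box average (a sketch of the intent; the rounding differs
// slightly from the C reference).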
545
546 #define HAS_SCALEROWDOWN34_SSSE3
547 // Point samples 32 pixels to 24 pixels.
548 // Produces three 8 byte values. For each 8 output bytes, 16 source bytes
549 // are read and then shuffled to do the scaling.
550
551 // Note that movdqa+palign may be better than movdqu.
552 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
553 __declspec(naked) __declspec(align(16))
554 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
555 uint8* dst_ptr, int dst_width) {
556 __asm {
557 mov eax, [esp + 4] // src_ptr
558 // src_stride ignored
559 mov edx, [esp + 12] // dst_ptr
560 mov ecx, [esp + 16] // dst_width
561 movdqa xmm3, kShuf0
562 movdqa xmm4, kShuf1
563 movdqa xmm5, kShuf2
564
565 align 16
566 wloop:
567 movdqa xmm0, [eax]
568 movdqa xmm1, [eax + 16]
569 lea eax, [eax + 32]
570 movdqa xmm2, xmm1
571 palignr xmm1, xmm0, 8
572 pshufb xmm0, xmm3
573 pshufb xmm1, xmm4
574 pshufb xmm2, xmm5
575 movq qword ptr [edx], xmm0
576 movq qword ptr [edx + 8], xmm1
577 movq qword ptr [edx + 16], xmm2
578 lea edx, [edx + 24]
579 sub ecx, 24
580 jg wloop
581
582 ret
583 }
584 }
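
// The shuffles above implement the 3/4 point-sample pattern: of every 4
// source bytes {0, 1, 2, 3}, bytes 0, 1 and 3 are kept (see kShuf0/kShuf1/
// kShuf2), so each iteration turns 32 source pixels into 24 output pixels.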
585
586 // Blends 32x2 rectangle to 24x1
587 // Produces three 8 byte values. For each 8 output bytes, 16 source bytes
588 // are read and then shuffled to do the scaling.
589
590 // Register usage:
591 // xmm0 src_row 0
592 // xmm1 src_row 1
593 // xmm2 shuf 0
594 // xmm3 shuf 1
595 // xmm4 shuf 2
596 // xmm5 madd 0
597 // xmm6 madd 1
598 // xmm7 kRound34
599
600 // Note that movdqa+palign may be better than movdqu.
601 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
602 __declspec(naked) __declspec(align(16))
603 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
604 ptrdiff_t src_stride,
605 uint8* dst_ptr, int dst_width) {
606 __asm {
607 push esi
608 mov eax, [esp + 4 + 4] // src_ptr
609 mov esi, [esp + 4 + 8] // src_stride
610 mov edx, [esp + 4 + 12] // dst_ptr
611 mov ecx, [esp + 4 + 16] // dst_width
612 movdqa xmm2, kShuf01
613 movdqa xmm3, kShuf11
614 movdqa xmm4, kShuf21
615 movdqa xmm5, kMadd01
616 movdqa xmm6, kMadd11
617 movdqa xmm7, kRound34
618
619 align 16
620 wloop:
621 movdqa xmm0, [eax] // pixels 0..7
622 movdqa xmm1, [eax + esi]
623 pavgb xmm0, xmm1
624 pshufb xmm0, xmm2
625 pmaddubsw xmm0, xmm5
626 paddsw xmm0, xmm7
627 psrlw xmm0, 2
628 packuswb xmm0, xmm0
629 movq qword ptr [edx], xmm0
630 movdqu xmm0, [eax + 8] // pixels 8..15
631 movdqu xmm1, [eax + esi + 8]
632 pavgb xmm0, xmm1
633 pshufb xmm0, xmm3
634 pmaddubsw xmm0, xmm6
635 paddsw xmm0, xmm7
636 psrlw xmm0, 2
637 packuswb xmm0, xmm0
638 movq qword ptr [edx + 8], xmm0
639 movdqa xmm0, [eax + 16] // pixels 16..23
640 movdqa xmm1, [eax + esi + 16]
641 lea eax, [eax + 32]
642 pavgb xmm0, xmm1
643 pshufb xmm0, xmm4
644 movdqa xmm1, kMadd21
645 pmaddubsw xmm0, xmm1
646 paddsw xmm0, xmm7
647 psrlw xmm0, 2
648 packuswb xmm0, xmm0
649 sub ecx, 24
650 movq qword ptr [edx + 16], xmm0
651 lea edx, [edx + 24]
652 jg wloop
653
654 pop esi
655 ret
656 }
657 }
658
659 // Note that movdqa+palign may be better than movdqu.
660 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
661 __declspec(naked) __declspec(align(16))
662 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
663 ptrdiff_t src_stride,
664 uint8* dst_ptr, int dst_width) {
665 __asm {
666 push esi
667 mov eax, [esp + 4 + 4] // src_ptr
668 mov esi, [esp + 4 + 8] // src_stride
669 mov edx, [esp + 4 + 12] // dst_ptr
670 mov ecx, [esp + 4 + 16] // dst_width
671 movdqa xmm2, kShuf01
672 movdqa xmm3, kShuf11
673 movdqa xmm4, kShuf21
674 movdqa xmm5, kMadd01
675 movdqa xmm6, kMadd11
676 movdqa xmm7, kRound34
677
678 align 16
679 wloop:
680 movdqa xmm0, [eax] // pixels 0..7
681 movdqa xmm1, [eax + esi]
682 pavgb xmm1, xmm0
683 pavgb xmm0, xmm1
684 pshufb xmm0, xmm2
685 pmaddubsw xmm0, xmm5
686 paddsw xmm0, xmm7
687 psrlw xmm0, 2
688 packuswb xmm0, xmm0
689 movq qword ptr [edx], xmm0
690 movdqu xmm0, [eax + 8] // pixels 8..15
691 movdqu xmm1, [eax + esi + 8]
692 pavgb xmm1, xmm0
693 pavgb xmm0, xmm1
694 pshufb xmm0, xmm3
695 pmaddubsw xmm0, xmm6
696 paddsw xmm0, xmm7
697 psrlw xmm0, 2
698 packuswb xmm0, xmm0
699 movq qword ptr [edx + 8], xmm0
700 movdqa xmm0, [eax + 16] // pixels 16..23
701 movdqa xmm1, [eax + esi + 16]
702 lea eax, [eax + 32]
703 pavgb xmm1, xmm0
704 pavgb xmm0, xmm1
705 pshufb xmm0, xmm4
706 movdqa xmm1, kMadd21
707 pmaddubsw xmm0, xmm1
708 paddsw xmm0, xmm7
709 psrlw xmm0, 2
710 packuswb xmm0, xmm0
711 sub ecx, 24
712 movq qword ptr [edx + 16], xmm0
713 lea edx, [edx+24]
714 jg wloop
715
716 pop esi
717 ret
718 }
719 }
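
// The two variants above blend the source rows with different weights:
// ScaleRowDown34_1_Int_SSSE3 uses a single pavgb per block, i.e. roughly
//   (row0 + row1 + 1) >> 1
// while ScaleRowDown34_0_Int_SSSE3 applies pavgb twice, weighting row 0 about
// 3:1 over row 1 (a rough characterization; pavgb rounds at each step).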
720
721 #define HAS_SCALEROWDOWN38_SSSE3
722 // 3/8 point sampler
723
724 // Scale 32 pixels to 12
725 __declspec(naked) __declspec(align(16))
726 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
727 uint8* dst_ptr, int dst_width) {
728 __asm {
729 mov eax, [esp + 4] // src_ptr
730 // src_stride ignored
731 mov edx, [esp + 12] // dst_ptr
732 mov ecx, [esp + 16] // dst_width
733 movdqa xmm4, kShuf38a
734 movdqa xmm5, kShuf38b
735
736 align 16
737 xloop:
738 movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
739 movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
740 lea eax, [eax + 32]
741 pshufb xmm0, xmm4
742 pshufb xmm1, xmm5
743 paddusb xmm0, xmm1
744
745 sub ecx, 12
746 movq qword ptr [edx], xmm0 // write 12 pixels
747 movhlps xmm1, xmm0
748 movd [edx + 8], xmm1
749 lea edx, [edx + 12]
750 jg xloop
751
752 ret
753 }
754 }
755
756 // Scale 16x3 pixels to 6x1 with interpolation
757 __declspec(naked) __declspec(align(16))
758 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
759 ptrdiff_t src_stride,
760 uint8* dst_ptr, int dst_width) {
761 __asm {
762 push esi
763 mov eax, [esp + 4 + 4] // src_ptr
764 mov esi, [esp + 4 + 8] // src_stride
765 mov edx, [esp + 4 + 12] // dst_ptr
766 mov ecx, [esp + 4 + 16] // dst_width
767 movdqa xmm2, kShufAc
768 movdqa xmm3, kShufAc3
769 movdqa xmm4, kScaleAc33
770 pxor xmm5, xmm5
771
772 align 16
773 xloop:
774 movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
775 movdqa xmm6, [eax + esi]
776 movhlps xmm1, xmm0
777 movhlps xmm7, xmm6
778 punpcklbw xmm0, xmm5
779 punpcklbw xmm1, xmm5
780 punpcklbw xmm6, xmm5
781 punpcklbw xmm7, xmm5
782 paddusw xmm0, xmm6
783 paddusw xmm1, xmm7
784 movdqa xmm6, [eax + esi * 2]
785 lea eax, [eax + 16]
786 movhlps xmm7, xmm6
787 punpcklbw xmm6, xmm5
788 punpcklbw xmm7, xmm5
789 paddusw xmm0, xmm6
790 paddusw xmm1, xmm7
791
792 movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
793 psrldq xmm0, 2
794 paddusw xmm6, xmm0
795 psrldq xmm0, 2
796 paddusw xmm6, xmm0
797 pshufb xmm6, xmm2
798
799 movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
800 psrldq xmm1, 2
801 paddusw xmm7, xmm1
802 psrldq xmm1, 2
803 paddusw xmm7, xmm1
804 pshufb xmm7, xmm3
805 paddusw xmm6, xmm7
806
807 pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
808 packuswb xmm6, xmm6
809
810 sub ecx, 6
811 movd [edx], xmm6 // write 6 pixels
812 psrlq xmm6, 16
813 movd [edx + 2], xmm6
814 lea edx, [edx + 6]
815 jg xloop
816
817 pop esi
818 ret
819 }
820 }
821
822 // Scale 16x2 pixels to 6x1 with interpolation
823 __declspec(naked) __declspec(align(16))
824 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
825 ptrdiff_t src_stride,
826 uint8* dst_ptr, int dst_width) {
827 __asm {
828 push esi
829 mov eax, [esp + 4 + 4] // src_ptr
830 mov esi, [esp + 4 + 8] // src_stride
831 mov edx, [esp + 4 + 12] // dst_ptr
832 mov ecx, [esp + 4 + 16] // dst_width
833 movdqa xmm2, kShufAb0
834 movdqa xmm3, kShufAb1
835 movdqa xmm4, kShufAb2
836 movdqa xmm5, kScaleAb2
837
838 align 16
839 xloop:
840 movdqa xmm0, [eax] // average 2 rows into xmm0
841 pavgb xmm0, [eax + esi]
842 lea eax, [eax + 16]
843
844 movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
845 pshufb xmm1, xmm2
846 movdqa xmm6, xmm0
847 pshufb xmm6, xmm3
848 paddusw xmm1, xmm6
849 pshufb xmm0, xmm4
850 paddusw xmm1, xmm0
851
852 pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
853 packuswb xmm1, xmm1
854
855 sub ecx, 6
856 movd [edx], xmm1 // write 6 pixels
857 psrlq xmm1, 16
858 movd [edx + 2], xmm1
859 lea edx, [edx + 6]
860 jg xloop
861
862 pop esi
863 ret
864 }
865 }
866
867 #define HAS_SCALEADDROWS_SSE2
868
869 // Reads 16xN bytes and produces 16 shorts at a time.
870 __declspec(naked) __declspec(align(16))
871 static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
872 uint16* dst_ptr, int src_width,
873 int src_height) {
874 __asm {
875 push esi
876 push edi
877 push ebx
878 push ebp
879 mov esi, [esp + 16 + 4] // src_ptr
880 mov edx, [esp + 16 + 8] // src_stride
881 mov edi, [esp + 16 + 12] // dst_ptr
882 mov ecx, [esp + 16 + 16] // src_width
883 mov ebx, [esp + 16 + 20] // src_height
884 pxor xmm4, xmm4
885 dec ebx
886
887 align 16
888 xloop:
889 // first row
890 movdqa xmm0, [esi]
891 lea eax, [esi + edx]
892 movdqa xmm1, xmm0
893 punpcklbw xmm0, xmm4
894 punpckhbw xmm1, xmm4
895 lea esi, [esi + 16]
896 mov ebp, ebx
897 test ebp, ebp
898 je ydone
899
900 // sum remaining rows
901 align 16
902 yloop:
903 movdqa xmm2, [eax] // read 16 pixels
904 lea eax, [eax + edx] // advance to next row
905 movdqa xmm3, xmm2
906 punpcklbw xmm2, xmm4
907 punpckhbw xmm3, xmm4
908 paddusw xmm0, xmm2 // sum 16 words
909 paddusw xmm1, xmm3
910 sub ebp, 1
911 jg yloop
912 ydone:
913 movdqa [edi], xmm0
914 movdqa [edi + 16], xmm1
915 lea edi, [edi + 32]
916
917 sub ecx, 16
918 jg xloop
919
920 pop ebp
921 pop ebx
922 pop edi
923 pop esi
924 ret
925 }
926 }
927
928 #ifndef SSE2_DISABLED
929 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
930 // The normal formula for bilinear interpolation is:
931 //   source_y_fraction * row1 + (1 - source_y_fraction) * row0
932 // The SSE2 version uses a single multiply of the difference:
933 //   source_y_fraction * (row1 - row0) + row0
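// A scalar sketch of the per-pixel math below (illustrative only; the asm
// processes 16 pixels at a time and uses pmulhw for the multiply):
//   int f = source_y_fraction;                            // 0..255
//   dst[x] = row0[x] + ((f * (row1[x] - row0[x])) >> 8);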
934 #define HAS_SCALEFILTERROWS_SSE2_DISABLED
935 __declspec(naked) __declspec(align(16))
936 static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
937 ptrdiff_t src_stride, int dst_width,
938 int source_y_fraction) {
939 __asm {
940 push esi
941 push edi
942 mov edi, [esp + 8 + 4] // dst_ptr
943 mov esi, [esp + 8 + 8] // src_ptr
944 mov edx, [esp + 8 + 12] // src_stride
945 mov ecx, [esp + 8 + 16] // dst_width
946 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
947 sub edi, esi
948 cmp eax, 0
949 je xloop1
950 cmp eax, 128
951 je xloop2
952
953 movd xmm5, eax // xmm5 = y fraction
954 punpcklbw xmm5, xmm5
955 punpcklwd xmm5, xmm5
956 pshufd xmm5, xmm5, 0
957 pxor xmm4, xmm4
958
959 align 16
960 xloop:
961 movdqa xmm0, [esi] // row0
962 movdqa xmm2, [esi + edx] // row1
963 movdqa xmm1, xmm0
964 movdqa xmm3, xmm2
965 punpcklbw xmm2, xmm4
966 punpckhbw xmm3, xmm4
967 punpcklbw xmm0, xmm4
968 punpckhbw xmm1, xmm4
969 psubw xmm2, xmm0 // row1 - row0
970 psubw xmm3, xmm1
971 pmulhw xmm2, xmm5 // scale diff
972 pmulhw xmm3, xmm5
973 paddw xmm0, xmm2 // sum rows
974 paddw xmm1, xmm3
975 packuswb xmm0, xmm1
976 sub ecx, 16
977 movdqa [esi + edi], xmm0
978 lea esi, [esi + 16]
979 jg xloop
980
981 punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
982 pshufhw xmm0, xmm0, 0xff
983 punpckhqdq xmm0, xmm0
984 movdqa [esi + edi], xmm0
985 pop edi
986 pop esi
987 ret
988
989 align 16
990 xloop1:
991 movdqa xmm0, [esi]
992 sub ecx, 16
993 movdqa [esi + edi], xmm0
994 lea esi, [esi + 16]
995 jg xloop1
996
997 punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
998 pshufhw xmm0, xmm0, 0xff
999 punpckhqdq xmm0, xmm0
1000 movdqa [esi + edi], xmm0
1001 pop edi
1002 pop esi
1003 ret
1004
1005 align 16
1006 xloop2:
1007 movdqa xmm0, [esi]
1008 pavgb xmm0, [esi + edx]
1009 sub ecx, 16
1010 movdqa [esi + edi], xmm0
1011 lea esi, [esi + 16]
1012 jg xloop2
1013
1014 punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
1015 pshufhw xmm0, xmm0, 0xff
1016 punpckhqdq xmm0, xmm0
1017 movdqa [esi + edi], xmm0
1018 pop edi
1019 pop esi
1020 ret
1021 }
1022 }
1023 #endif // SSE2_DISABLED
1024 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
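// The fraction is halved to 7 bits and xmm5 is filled with interleaved
// (128 - f, f) byte pairs, so a single pmaddubsw computes, per pixel, roughly
//   (row0[x] * (128 - f) + row1[x] * f) >> 7
// together with the following psrlw (a sketch of the intent, not a spec).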
1025 #define HAS_SCALEFILTERROWS_SSSE3
1026 __declspec(naked) __declspec(align(16))
1027 static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
1028 ptrdiff_t src_stride, int dst_width,
1029 int source_y_fraction) {
1030 __asm {
1031 push esi
1032 push edi
1033 mov edi, [esp + 8 + 4] // dst_ptr
1034 mov esi, [esp + 8 + 8] // src_ptr
1035 mov edx, [esp + 8 + 12] // src_stride
1036 mov ecx, [esp + 8 + 16] // dst_width
1037 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
1038 sub edi, esi
1039 shr eax, 1
1040 cmp eax, 0
1041 je xloop1
1042 cmp eax, 64
1043 je xloop2
1044 movd xmm0, eax // high fraction 0..127
1045 neg eax
1046 add eax, 128
1047 movd xmm5, eax // low fraction 128..1
1048 punpcklbw xmm5, xmm0
1049 punpcklwd xmm5, xmm5
1050 pshufd xmm5, xmm5, 0
1051
1052 align 16
1053 xloop:
1054 movdqa xmm0, [esi]
1055 movdqa xmm2, [esi + edx]
1056 movdqa xmm1, xmm0
1057 punpcklbw xmm0, xmm2
1058 punpckhbw xmm1, xmm2
1059 pmaddubsw xmm0, xmm5
1060 pmaddubsw xmm1, xmm5
1061 psrlw xmm0, 7
1062 psrlw xmm1, 7
1063 packuswb xmm0, xmm1
1064 sub ecx, 16
1065 movdqa [esi + edi], xmm0
1066 lea esi, [esi + 16]
1067 jg xloop
1068
1069 punpckhbw xmm0, xmm0 // duplicate last pixel for filtering
1070 pshufhw xmm0, xmm0, 0xff
1071 punpckhqdq xmm0, xmm0
1072 movdqa [esi + edi], xmm0
1073
1074 pop edi
1075 pop esi
1076 ret
1077
1078 align 16
1079 xloop1:
1080 movdqa xmm0, [esi]
1081 sub ecx, 16
1082 movdqa [esi + edi], xmm0
1083 lea esi, [esi + 16]
1084 jg xloop1
1085
1086 punpckhbw xmm0, xmm0
1087 pshufhw xmm0, xmm0, 0xff
1088 punpckhqdq xmm0, xmm0
1089 movdqa [esi + edi], xmm0
1090 pop edi
1091 pop esi
1092 ret
1093
1094 align 16
1095 xloop2:
1096 movdqa xmm0, [esi]
1097 pavgb xmm0, [esi + edx]
1098 sub ecx, 16
1099 movdqa [esi + edi], xmm0
1100 lea esi, [esi + 16]
1101 jg xloop2
1102
1103 punpckhbw xmm0, xmm0
1104 pshufhw xmm0, xmm0, 0xff
1105 punpckhqdq xmm0, xmm0
1106 movdqa [esi + edi], xmm0
1107 pop edi
1108 pop esi
1109 ret
1110 }
1111 }
1112
1113 #elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
1114
1115 // GCC versions of row functions are verbatim conversions from Visual C.
1116 // Generated using gcc disassembly on Visual C object file:
1117 // objdump -D yuvscaler.obj >yuvscaler.txt
1118 #define HAS_SCALEROWDOWN2_SSE2
1119 static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1120 uint8* dst_ptr, int dst_width) {
1121 asm volatile (
1122 "pcmpeqb %%xmm5,%%xmm5 \n"
1123 "psrlw $0x8,%%xmm5 \n"
1124 ".p2align 4 \n"
1125 "1: \n"
1126 "movdqa (%0),%%xmm0 \n"
1127 "movdqa 0x10(%0),%%xmm1 \n"
1128 "lea 0x20(%0),%0 \n"
1129 "pand %%xmm5,%%xmm0 \n"
1130 "pand %%xmm5,%%xmm1 \n"
1131 "packuswb %%xmm1,%%xmm0 \n"
1132 "movdqa %%xmm0,(%1) \n"
1133 "lea 0x10(%1),%1 \n"
1134 "sub $0x10,%2 \n"
1135 "jg 1b \n"
1136 : "+r"(src_ptr), // %0
1137 "+r"(dst_ptr), // %1
1138 "+r"(dst_width) // %2
1139 :
1140 : "memory", "cc"
1141 #if defined(__SSE2__)
1142 , "xmm0", "xmm1", "xmm5"
1143 #endif
1144 );
1145 }
1146
1147 void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1148 uint8* dst_ptr, int dst_width) {
1149 asm volatile (
1150 "pcmpeqb %%xmm5,%%xmm5 \n"
1151 "psrlw $0x8,%%xmm5 \n"
1152 ".p2align 4 \n"
1153 "1: \n"
1154 "movdqa (%0),%%xmm0 \n"
1155 "movdqa 0x10(%0),%%xmm1 \n"
1156 "movdqa (%0,%3,1),%%xmm2 \n"
1157 "movdqa 0x10(%0,%3,1),%%xmm3 \n"
1158 "lea 0x20(%0),%0 \n"
1159 "pavgb %%xmm2,%%xmm0 \n"
1160 "pavgb %%xmm3,%%xmm1 \n"
1161 "movdqa %%xmm0,%%xmm2 \n"
1162 "psrlw $0x8,%%xmm0 \n"
1163 "movdqa %%xmm1,%%xmm3 \n"
1164 "psrlw $0x8,%%xmm1 \n"
1165 "pand %%xmm5,%%xmm2 \n"
1166 "pand %%xmm5,%%xmm3 \n"
1167 "pavgw %%xmm2,%%xmm0 \n"
1168 "pavgw %%xmm3,%%xmm1 \n"
1169 "packuswb %%xmm1,%%xmm0 \n"
1170 "movdqa %%xmm0,(%1) \n"
1171 "lea 0x10(%1),%1 \n"
1172 "sub $0x10,%2 \n"
1173 "jg 1b \n"
1174 : "+r"(src_ptr), // %0
1175 "+r"(dst_ptr), // %1
1176 "+r"(dst_width) // %2
1177 : "r"(static_cast<intptr_t>(src_stride)) // %3
1178 : "memory", "cc"
1179 #if defined(__SSE2__)
1180 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1181 #endif
1182 );
1183 }
1184 static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
1185 ptrdiff_t src_stride,
1186 uint8* dst_ptr, int dst_width) {
1187 asm volatile (
1188 "pcmpeqb %%xmm5,%%xmm5 \n"
1189 "psrlw $0x8,%%xmm5 \n"
1190 ".p2align 4 \n"
1191 "1: \n"
1192 "movdqu (%0),%%xmm0 \n"
1193 "movdqu 0x10(%0),%%xmm1 \n"
1194 "lea 0x20(%0),%0 \n"
1195 "pand %%xmm5,%%xmm0 \n"
1196 "pand %%xmm5,%%xmm1 \n"
1197 "packuswb %%xmm1,%%xmm0 \n"
1198 "movdqu %%xmm0,(%1) \n"
1199 "lea 0x10(%1),%1 \n"
1200 "sub $0x10,%2 \n"
1201 "jg 1b \n"
1202 : "+r"(src_ptr), // %0
1203 "+r"(dst_ptr), // %1
1204 "+r"(dst_width) // %2
1205 :
1206 : "memory", "cc"
1207 #if defined(__SSE2__)
1208 , "xmm0", "xmm1", "xmm5"
1209 #endif
1210 );
1211 }
1212
1213 static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
1214 ptrdiff_t src_stride,
1215 uint8* dst_ptr, int dst_width) {
1216 asm volatile (
1217 "pcmpeqb %%xmm5,%%xmm5 \n"
1218 "psrlw $0x8,%%xmm5 \n"
1219 ".p2align 4 \n"
1220 "1: \n"
1221 "movdqu (%0),%%xmm0 \n"
1222 "movdqu 0x10(%0),%%xmm1 \n"
1223 "movdqu (%0,%3,1),%%xmm2 \n"
1224 "movdqu 0x10(%0,%3,1),%%xmm3 \n"
1225 "lea 0x20(%0),%0 \n"
1226 "pavgb %%xmm2,%%xmm0 \n"
1227 "pavgb %%xmm3,%%xmm1 \n"
1228 "movdqa %%xmm0,%%xmm2 \n"
1229 "psrlw $0x8,%%xmm0 \n"
1230 "movdqa %%xmm1,%%xmm3 \n"
1231 "psrlw $0x8,%%xmm1 \n"
1232 "pand %%xmm5,%%xmm2 \n"
1233 "pand %%xmm5,%%xmm3 \n"
1234 "pavgw %%xmm2,%%xmm0 \n"
1235 "pavgw %%xmm3,%%xmm1 \n"
1236 "packuswb %%xmm1,%%xmm0 \n"
1237 "movdqu %%xmm0,(%1) \n"
1238 "lea 0x10(%1),%1 \n"
1239 "sub $0x10,%2 \n"
1240 "jg 1b \n"
1241 : "+r"(src_ptr), // %0
1242 "+r"(dst_ptr), // %1
1243 "+r"(dst_width) // %2
1244 : "r"(static_cast<intptr_t>(src_stride)) // %3
1245 : "memory", "cc"
1246 #if defined(__SSE2__)
1247 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1248 #endif
1249 );
1250 }
1251
1252 #define HAS_SCALEROWDOWN4_SSE2
1253 static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1254 uint8* dst_ptr, int dst_width) {
1255 asm volatile (
1256 "pcmpeqb %%xmm5,%%xmm5 \n"
1257 "psrld $0x18,%%xmm5 \n"
1258 ".p2align 4 \n"
1259 "1: \n"
1260 "movdqa (%0),%%xmm0 \n"
1261 "movdqa 0x10(%0),%%xmm1 \n"
1262 "lea 0x20(%0),%0 \n"
1263 "pand %%xmm5,%%xmm0 \n"
1264 "pand %%xmm5,%%xmm1 \n"
1265 "packuswb %%xmm1,%%xmm0 \n"
1266 "packuswb %%xmm0,%%xmm0 \n"
1267 "movq %%xmm0,(%1) \n"
1268 "lea 0x8(%1),%1 \n"
1269 "sub $0x8,%2 \n"
1270 "jg 1b \n"
1271 : "+r"(src_ptr), // %0
1272 "+r"(dst_ptr), // %1
1273 "+r"(dst_width) // %2
1274 :
1275 : "memory", "cc"
1276 #if defined(__SSE2__)
1277 , "xmm0", "xmm1", "xmm5"
1278 #endif
1279 );
1280 }
1281
1282 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1283 uint8* dst_ptr, int dst_width) {
1284 intptr_t stridex3 = 0;
1285 asm volatile (
1286 "pcmpeqb %%xmm7,%%xmm7 \n"
1287 "psrlw $0x8,%%xmm7 \n"
1288 "lea (%4,%4,2),%3 \n"
1289 ".p2align 4 \n"
1290 "1: \n"
1291 "movdqa (%0),%%xmm0 \n"
1292 "movdqa 0x10(%0),%%xmm1 \n"
1293 "movdqa (%0,%4,1),%%xmm2 \n"
1294 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1295 "pavgb %%xmm2,%%xmm0 \n"
1296 "pavgb %%xmm3,%%xmm1 \n"
1297 "movdqa (%0,%4,2),%%xmm2 \n"
1298 "movdqa 0x10(%0,%4,2),%%xmm3 \n"
1299 "movdqa (%0,%3,1),%%xmm4 \n"
1300 "movdqa 0x10(%0,%3,1),%%xmm5 \n"
1301 "lea 0x20(%0),%0 \n"
1302 "pavgb %%xmm4,%%xmm2 \n"
1303 "pavgb %%xmm2,%%xmm0 \n"
1304 "pavgb %%xmm5,%%xmm3 \n"
1305 "pavgb %%xmm3,%%xmm1 \n"
1306 "movdqa %%xmm0,%%xmm2 \n"
1307 "psrlw $0x8,%%xmm0 \n"
1308 "movdqa %%xmm1,%%xmm3 \n"
1309 "psrlw $0x8,%%xmm1 \n"
1310 "pand %%xmm7,%%xmm2 \n"
1311 "pand %%xmm7,%%xmm3 \n"
1312 "pavgw %%xmm2,%%xmm0 \n"
1313 "pavgw %%xmm3,%%xmm1 \n"
1314 "packuswb %%xmm1,%%xmm0 \n"
1315 "movdqa %%xmm0,%%xmm2 \n"
1316 "psrlw $0x8,%%xmm0 \n"
1317 "pand %%xmm7,%%xmm2 \n"
1318 "pavgw %%xmm2,%%xmm0 \n"
1319 "packuswb %%xmm0,%%xmm0 \n"
1320 "movq %%xmm0,(%1) \n"
1321 "lea 0x8(%1),%1 \n"
1322 "sub $0x8,%2 \n"
1323 "jg 1b \n"
1324 : "+r"(src_ptr), // %0
1325 "+r"(dst_ptr), // %1
1326 "+r"(dst_width), // %2
1327 "+r"(stridex3) // %3
1328 : "r"(static_cast<intptr_t>(src_stride)) // %4
1329 : "memory", "cc"
1330 #if defined(__SSE2__)
1331 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
1332 #endif
1333 );
1334 }
1335
1336 #define HAS_SCALEROWDOWN8_SSE2
1337 static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1338 uint8* dst_ptr, int dst_width) {
1339 asm volatile (
1340 "pcmpeqb %%xmm5,%%xmm5 \n"
1341 "psrlq $0x38,%%xmm5 \n"
1342 ".p2align 4 \n"
1343 "1: \n"
1344 "movdqa (%0),%%xmm0 \n"
1345 "movdqa 0x10(%0),%%xmm1 \n"
1346 "lea 0x20(%0),%0 \n"
1347 "pand %%xmm5,%%xmm0 \n"
1348 "pand %%xmm5,%%xmm1 \n"
1349 "packuswb %%xmm1,%%xmm0 \n"
1350 "packuswb %%xmm0,%%xmm0 \n"
1351 "packuswb %%xmm0,%%xmm0 \n"
1352 "movd %%xmm0,(%1) \n"
1353 "lea 0x4(%1),%1 \n"
1354 "sub $0x4,%2 \n"
1355 "jg 1b \n"
1356 : "+r"(src_ptr), // %0
1357 "+r"(dst_ptr), // %1
1358 "+r"(dst_width) // %2
1359 :
1360 : "memory", "cc"
1361 #if defined(__SSE2__)
1362 , "xmm0", "xmm1", "xmm5"
1363 #endif
1364 );
1365 }
1366
1367 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1368 uint8* dst_ptr, int dst_width) {
1369 intptr_t stridex3 = 0;
1370 intptr_t row4 = 0;
1371 asm volatile (
1372 "lea (%5,%5,2),%3 \n"
1373 "pxor %%xmm7,%%xmm7 \n"
1374 ".p2align 4 \n"
1375 "1: \n"
1376 "movdqa (%0),%%xmm0 \n"
1377 "movdqa 0x10(%0),%%xmm1 \n"
1378 "movdqa (%0,%5,1),%%xmm2 \n"
1379 "movdqa 0x10(%0,%5,1),%%xmm3 \n"
1380 "pavgb %%xmm2,%%xmm0 \n"
1381 "pavgb %%xmm3,%%xmm1 \n"
1382 "movdqa (%0,%5,2),%%xmm2 \n"
1383 "movdqa 0x10(%0,%5,2),%%xmm3 \n"
1384 "movdqa (%0,%3,1),%%xmm4 \n"
1385 "movdqa 0x10(%0,%3,1),%%xmm5 \n"
1386 "lea (%0,%5,4),%4 \n"
1387 "lea 0x20(%0),%0 \n"
1388 "pavgb %%xmm4,%%xmm2 \n"
1389 "pavgb %%xmm5,%%xmm3 \n"
1390 "pavgb %%xmm2,%%xmm0 \n"
1391 "pavgb %%xmm3,%%xmm1 \n"
1392 "movdqa 0x0(%4),%%xmm2 \n"
1393 "movdqa 0x10(%4),%%xmm3 \n"
1394 "movdqa 0x0(%4,%5,1),%%xmm4 \n"
1395 "movdqa 0x10(%4,%5,1),%%xmm5 \n"
1396 "pavgb %%xmm4,%%xmm2 \n"
1397 "pavgb %%xmm5,%%xmm3 \n"
1398 "movdqa 0x0(%4,%5,2),%%xmm4 \n"
1399 "movdqa 0x10(%4,%5,2),%%xmm5 \n"
1400 "movdqa 0x0(%4,%3,1),%%xmm6 \n"
1401 "pavgb %%xmm6,%%xmm4 \n"
1402 "movdqa 0x10(%4,%3,1),%%xmm6 \n"
1403 "pavgb %%xmm6,%%xmm5 \n"
1404 "pavgb %%xmm4,%%xmm2 \n"
1405 "pavgb %%xmm5,%%xmm3 \n"
1406 "pavgb %%xmm2,%%xmm0 \n"
1407 "pavgb %%xmm3,%%xmm1 \n"
1408 "psadbw %%xmm7,%%xmm0 \n"
1409 "psadbw %%xmm7,%%xmm1 \n"
1410 "pshufd $0xd8,%%xmm0,%%xmm0 \n"
1411 "pshufd $0x8d,%%xmm1,%%xmm1 \n"
1412 "por %%xmm1,%%xmm0 \n"
1413 "psrlw $0x3,%%xmm0 \n"
1414 "packuswb %%xmm0,%%xmm0 \n"
1415 "packuswb %%xmm0,%%xmm0 \n"
1416 "movd %%xmm0,(%1) \n"
1417 "lea 0x4(%1),%1 \n"
1418 "sub $0x4,%2 \n"
1419 "jg 1b \n"
1420 : "+r"(src_ptr), // %0
1421 "+r"(dst_ptr), // %1
1422 "+rm"(dst_width), // %2
1423 "+r"(stridex3), // %3
1424 "+r"(row4) // %4
1425 : "r"(static_cast<intptr_t>(src_stride)) // %5
1426 : "memory", "cc"
1427 #if defined(__SSE2__)
1428 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1429 #endif
1430 );
1431 }
1432
1433 #define HAS_SCALEROWDOWN34_SSSE3
1434 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
1435 uint8* dst_ptr, int dst_width) {
1436 asm volatile (
1437 "movdqa %0,%%xmm3 \n"
1438 "movdqa %1,%%xmm4 \n"
1439 "movdqa %2,%%xmm5 \n"
1440 :
1441 : "m"(kShuf0), // %0
1442 "m"(kShuf1), // %1
1443 "m"(kShuf2) // %2
1444 );
1445 asm volatile (
1446 ".p2align 4 \n"
1447 "1: \n"
1448 "movdqa (%0),%%xmm0 \n"
1449 "movdqa 0x10(%0),%%xmm2 \n"
1450 "lea 0x20(%0),%0 \n"
1451 "movdqa %%xmm2,%%xmm1 \n"
1452 "palignr $0x8,%%xmm0,%%xmm1 \n"
1453 "pshufb %%xmm3,%%xmm0 \n"
1454 "pshufb %%xmm4,%%xmm1 \n"
1455 "pshufb %%xmm5,%%xmm2 \n"
1456 "movq %%xmm0,(%1) \n"
1457 "movq %%xmm1,0x8(%1) \n"
1458 "movq %%xmm2,0x10(%1) \n"
1459 "lea 0x18(%1),%1 \n"
1460 "sub $0x18,%2 \n"
1461 "jg 1b \n"
1462 : "+r"(src_ptr), // %0
1463 "+r"(dst_ptr), // %1
1464 "+r"(dst_width) // %2
1465 :
1466 : "memory", "cc"
1467 #if defined(__SSE2__)
1468 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1469 #endif
1470 );
1471 }
1472
1473 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
1474 ptrdiff_t src_stride,
1475 uint8* dst_ptr, int dst_width) {
1476 asm volatile (
1477 "movdqa %0,%%xmm2 \n" // kShuf01
1478 "movdqa %1,%%xmm3 \n" // kShuf11
1479 "movdqa %2,%%xmm4 \n" // kShuf21
1480 :
1481 : "m"(kShuf01), // %0
1482 "m"(kShuf11), // %1
1483 "m"(kShuf21) // %2
1484 );
1485 asm volatile (
1486 "movdqa %0,%%xmm5 \n" // kMadd01
1487 "movdqa %1,%%xmm0 \n" // kMadd11
1488 "movdqa %2,%%xmm1 \n" // kRound34
1489 :
1490 : "m"(kMadd01), // %0
1491 "m"(kMadd11), // %1
1492 "m"(kRound34) // %2
1493 );
1494 asm volatile (
1495 ".p2align 4 \n"
1496 "1: \n"
1497 "movdqa (%0),%%xmm6 \n"
1498 "movdqa (%0,%3),%%xmm7 \n"
1499 "pavgb %%xmm7,%%xmm6 \n"
1500 "pshufb %%xmm2,%%xmm6 \n"
1501 "pmaddubsw %%xmm5,%%xmm6 \n"
1502 "paddsw %%xmm1,%%xmm6 \n"
1503 "psrlw $0x2,%%xmm6 \n"
1504 "packuswb %%xmm6,%%xmm6 \n"
1505 "movq %%xmm6,(%1) \n"
1506 "movdqu 0x8(%0),%%xmm6 \n"
1507 "movdqu 0x8(%0,%3),%%xmm7 \n"
1508 "pavgb %%xmm7,%%xmm6 \n"
1509 "pshufb %%xmm3,%%xmm6 \n"
1510 "pmaddubsw %%xmm0,%%xmm6 \n"
1511 "paddsw %%xmm1,%%xmm6 \n"
1512 "psrlw $0x2,%%xmm6 \n"
1513 "packuswb %%xmm6,%%xmm6 \n"
1514 "movq %%xmm6,0x8(%1) \n"
1515 "movdqa 0x10(%0),%%xmm6 \n"
1516 "movdqa 0x10(%0,%3),%%xmm7 \n"
1517 "lea 0x20(%0),%0 \n"
1518 "pavgb %%xmm7,%%xmm6 \n"
1519 "pshufb %%xmm4,%%xmm6 \n"
1520 "pmaddubsw %4,%%xmm6 \n"
1521 "paddsw %%xmm1,%%xmm6 \n"
1522 "psrlw $0x2,%%xmm6 \n"
1523 "packuswb %%xmm6,%%xmm6 \n"
1524 "movq %%xmm6,0x10(%1) \n"
1525 "lea 0x18(%1),%1 \n"
1526 "sub $0x18,%2 \n"
1527 "jg 1b \n"
1528 : "+r"(src_ptr), // %0
1529 "+r"(dst_ptr), // %1
1530 "+r"(dst_width) // %2
1531 : "r"(static_cast<intptr_t>(src_stride)), // %3
1532 "m"(kMadd21) // %4
1533 : "memory", "cc"
1534 #if defined(__SSE2__)
1535 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1536 #endif
1537 );
1538 }
1539
1540 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
1541 ptrdiff_t src_stride,
1542 uint8* dst_ptr, int dst_width) {
1543 asm volatile (
1544 "movdqa %0,%%xmm2 \n" // kShuf01
1545 "movdqa %1,%%xmm3 \n" // kShuf11
1546 "movdqa %2,%%xmm4 \n" // kShuf21
1547 :
1548 : "m"(kShuf01), // %0
1549 "m"(kShuf11), // %1
1550 "m"(kShuf21) // %2
1551 );
1552 asm volatile (
1553 "movdqa %0,%%xmm5 \n" // kMadd01
1554 "movdqa %1,%%xmm0 \n" // kMadd11
1555 "movdqa %2,%%xmm1 \n" // kRound34
1556 :
1557 : "m"(kMadd01), // %0
1558 "m"(kMadd11), // %1
1559 "m"(kRound34) // %2
1560 );
1561
1562 asm volatile (
1563 ".p2align 4 \n"
1564 "1: \n"
1565 "movdqa (%0),%%xmm6 \n"
1566 "movdqa (%0,%3,1),%%xmm7 \n"
1567 "pavgb %%xmm6,%%xmm7 \n"
1568 "pavgb %%xmm7,%%xmm6 \n"
1569 "pshufb %%xmm2,%%xmm6 \n"
1570 "pmaddubsw %%xmm5,%%xmm6 \n"
1571 "paddsw %%xmm1,%%xmm6 \n"
1572 "psrlw $0x2,%%xmm6 \n"
1573 "packuswb %%xmm6,%%xmm6 \n"
1574 "movq %%xmm6,(%1) \n"
1575 "movdqu 0x8(%0),%%xmm6 \n"
1576 "movdqu 0x8(%0,%3,1),%%xmm7 \n"
1577 "pavgb %%xmm6,%%xmm7 \n"
1578 "pavgb %%xmm7,%%xmm6 \n"
1579 "pshufb %%xmm3,%%xmm6 \n"
1580 "pmaddubsw %%xmm0,%%xmm6 \n"
1581 "paddsw %%xmm1,%%xmm6 \n"
1582 "psrlw $0x2,%%xmm6 \n"
1583 "packuswb %%xmm6,%%xmm6 \n"
1584 "movq %%xmm6,0x8(%1) \n"
1585 "movdqa 0x10(%0),%%xmm6 \n"
1586 "movdqa 0x10(%0,%3,1),%%xmm7 \n"
1587 "lea 0x20(%0),%0 \n"
1588 "pavgb %%xmm6,%%xmm7 \n"
1589 "pavgb %%xmm7,%%xmm6 \n"
1590 "pshufb %%xmm4,%%xmm6 \n"
1591 "pmaddubsw %4,%%xmm6 \n"
1592 "paddsw %%xmm1,%%xmm6 \n"
1593 "psrlw $0x2,%%xmm6 \n"
1594 "packuswb %%xmm6,%%xmm6 \n"
1595 "movq %%xmm6,0x10(%1) \n"
1596 "lea 0x18(%1),%1 \n"
1597 "sub $0x18,%2 \n"
1598 "jg 1b \n"
1599 : "+r"(src_ptr), // %0
1600 "+r"(dst_ptr), // %1
1601 "+r"(dst_width) // %2
1602 : "r"(static_cast<intptr_t>(src_stride)), // %3
1603 "m"(kMadd21) // %4
1604 : "memory", "cc"
1605 #if defined(__SSE2__)
1606 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1607 #endif
1608 );
1609 }
1610
1611 #define HAS_SCALEROWDOWN38_SSSE3
1612 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
1613 uint8* dst_ptr, int dst_width) {
1614 asm volatile (
1615 "movdqa %3,%%xmm4 \n"
1616 "movdqa %4,%%xmm5 \n"
1617 ".p2align 4 \n"
1618 "1: \n"
1619 "movdqa (%0),%%xmm0 \n"
1620 "movdqa 0x10(%0),%%xmm1 \n"
1621 "lea 0x20(%0),%0 \n"
1622 "pshufb %%xmm4,%%xmm0 \n"
1623 "pshufb %%xmm5,%%xmm1 \n"
1624 "paddusb %%xmm1,%%xmm0 \n"
1625 "movq %%xmm0,(%1) \n"
1626 "movhlps %%xmm0,%%xmm1 \n"
1627 "movd %%xmm1,0x8(%1) \n"
1628 "lea 0xc(%1),%1 \n"
1629 "sub $0xc,%2 \n"
1630 "jg 1b \n"
1631 : "+r"(src_ptr), // %0
1632 "+r"(dst_ptr), // %1
1633 "+r"(dst_width) // %2
1634 : "m"(kShuf38a), // %3
1635 "m"(kShuf38b) // %4
1636 : "memory", "cc"
1637 #if defined(__SSE2__)
1638 , "xmm0", "xmm1", "xmm4", "xmm5"
1639 #endif
1640 );
1641 }
1642
1643 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
1644 ptrdiff_t src_stride,
1645 uint8* dst_ptr, int dst_width) {
1646 asm volatile (
1647 "movdqa %0,%%xmm2 \n"
1648 "movdqa %1,%%xmm3 \n"
1649 "movdqa %2,%%xmm4 \n"
1650 "movdqa %3,%%xmm5 \n"
1651 :
1652 : "m"(kShufAb0), // %0
1653 "m"(kShufAb1), // %1
1654 "m"(kShufAb2), // %2
1655 "m"(kScaleAb2) // %3
1656 );
1657 asm volatile (
1658 ".p2align 4 \n"
1659 "1: \n"
1660 "movdqa (%0),%%xmm0 \n"
1661 "pavgb (%0,%3,1),%%xmm0 \n"
1662 "lea 0x10(%0),%0 \n"
1663 "movdqa %%xmm0,%%xmm1 \n"
1664 "pshufb %%xmm2,%%xmm1 \n"
1665 "movdqa %%xmm0,%%xmm6 \n"
1666 "pshufb %%xmm3,%%xmm6 \n"
1667 "paddusw %%xmm6,%%xmm1 \n"
1668 "pshufb %%xmm4,%%xmm0 \n"
1669 "paddusw %%xmm0,%%xmm1 \n"
1670 "pmulhuw %%xmm5,%%xmm1 \n"
1671 "packuswb %%xmm1,%%xmm1 \n"
1672 "sub $0x6,%2 \n"
1673 "movd %%xmm1,(%1) \n"
1674 "psrlq $0x10,%%xmm1 \n"
1675 "movd %%xmm1,0x2(%1) \n"
1676 "lea 0x6(%1),%1 \n"
1677 "jg 1b \n"
1678 : "+r"(src_ptr), // %0
1679 "+r"(dst_ptr), // %1
1680 "+r"(dst_width) // %2
1681 : "r"(static_cast<intptr_t>(src_stride)) // %3
1682 : "memory", "cc"
1683 #if defined(__SSE2__)
1684 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1685 #endif
1686 );
1687 }
1688
1689 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
1690 ptrdiff_t src_stride,
1691 uint8* dst_ptr, int dst_width) {
1692 asm volatile (
1693 "movdqa %0,%%xmm2 \n"
1694 "movdqa %1,%%xmm3 \n"
1695 "movdqa %2,%%xmm4 \n"
1696 "pxor %%xmm5,%%xmm5 \n"
1697 :
1698 : "m"(kShufAc), // %0
1699 "m"(kShufAc3), // %1
1700 "m"(kScaleAc33) // %2
1701 );
1702 asm volatile (
1703 ".p2align 4 \n"
1704 "1: \n"
1705 "movdqa (%0),%%xmm0 \n"
1706 "movdqa (%0,%3,1),%%xmm6 \n"
1707 "movhlps %%xmm0,%%xmm1 \n"
1708 "movhlps %%xmm6,%%xmm7 \n"
1709 "punpcklbw %%xmm5,%%xmm0 \n"
1710 "punpcklbw %%xmm5,%%xmm1 \n"
1711 "punpcklbw %%xmm5,%%xmm6 \n"
1712 "punpcklbw %%xmm5,%%xmm7 \n"
1713 "paddusw %%xmm6,%%xmm0 \n"
1714 "paddusw %%xmm7,%%xmm1 \n"
1715 "movdqa (%0,%3,2),%%xmm6 \n"
1716 "lea 0x10(%0),%0 \n"
1717 "movhlps %%xmm6,%%xmm7 \n"
1718 "punpcklbw %%xmm5,%%xmm6 \n"
1719 "punpcklbw %%xmm5,%%xmm7 \n"
1720 "paddusw %%xmm6,%%xmm0 \n"
1721 "paddusw %%xmm7,%%xmm1 \n"
1722 "movdqa %%xmm0,%%xmm6 \n"
1723 "psrldq $0x2,%%xmm0 \n"
1724 "paddusw %%xmm0,%%xmm6 \n"
1725 "psrldq $0x2,%%xmm0 \n"
1726 "paddusw %%xmm0,%%xmm6 \n"
1727 "pshufb %%xmm2,%%xmm6 \n"
1728 "movdqa %%xmm1,%%xmm7 \n"
1729 "psrldq $0x2,%%xmm1 \n"
1730 "paddusw %%xmm1,%%xmm7 \n"
1731 "psrldq $0x2,%%xmm1 \n"
1732 "paddusw %%xmm1,%%xmm7 \n"
1733 "pshufb %%xmm3,%%xmm7 \n"
1734 "paddusw %%xmm7,%%xmm6 \n"
1735 "pmulhuw %%xmm4,%%xmm6 \n"
1736 "packuswb %%xmm6,%%xmm6 \n"
1737 "sub $0x6,%2 \n"
1738 "movd %%xmm6,(%1) \n"
1739 "psrlq $0x10,%%xmm6 \n"
1740 "movd %%xmm6,0x2(%1) \n"
1741 "lea 0x6(%1),%1 \n"
1742 "jg 1b \n"
1743 : "+r"(src_ptr), // %0
1744 "+r"(dst_ptr), // %1
1745 "+r"(dst_width) // %2
1746 : "r"(static_cast<intptr_t>(src_stride)) // %3
1747 : "memory", "cc"
1748 #if defined(__SSE2__)
1749 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1750 #endif
1751 );
1752 }
1753
1754 #define HAS_SCALEADDROWS_SSE2
1755 static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1756 uint16* dst_ptr, int src_width, int src_height) {
1757 int tmp_height = 0;
1758 intptr_t tmp_src = 0;
1759 asm volatile (
1760 "pxor %%xmm4,%%xmm4 \n"
1761 "sub $0x1,%5 \n"
1762 ".p2align 4 \n"
1763 "1: \n"
1764 "movdqa (%0),%%xmm0 \n"
1765 "mov %0,%3 \n"
1766 "add %6,%0 \n"
1767 "movdqa %%xmm0,%%xmm1 \n"
1768 "punpcklbw %%xmm4,%%xmm0 \n"
1769 "punpckhbw %%xmm4,%%xmm1 \n"
1770 "mov %5,%2 \n"
1771 "test %2,%2 \n"
1772 "je 3f \n"
1773 "2: \n"
1774 "movdqa (%0),%%xmm2 \n"
1775 "add %6,%0 \n"
1776 "movdqa %%xmm2,%%xmm3 \n"
1777 "punpcklbw %%xmm4,%%xmm2 \n"
1778 "punpckhbw %%xmm4,%%xmm3 \n"
1779 "paddusw %%xmm2,%%xmm0 \n"
1780 "paddusw %%xmm3,%%xmm1 \n"
1781 "sub $0x1,%2 \n"
1782 "jg 2b \n"
1783 "3: \n"
1784 "movdqa %%xmm0,(%1) \n"
1785 "movdqa %%xmm1,0x10(%1) \n"
1786 "lea 0x10(%3),%0 \n"
1787 "lea 0x20(%1),%1 \n"
1788 "sub $0x10,%4 \n"
1789 "jg 1b \n"
1790 : "+r"(src_ptr), // %0
1791 "+r"(dst_ptr), // %1
1792 "+r"(tmp_height), // %2
1793 "+r"(tmp_src), // %3
1794 "+r"(src_width), // %4
1795 "+rm"(src_height) // %5
1796 : "rm"(static_cast<intptr_t>(src_stride)) // %6
1797 : "memory", "cc"
1798 #if defined(__SSE2__)
1799 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1800 #endif
1801 );
1802 }
1803
1804 #ifndef SSE2_DISABLED
1805 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
1806 #define HAS_SCALEFILTERROWS_SSE2_DISABLED
1807 static void ScaleFilterRows_SSE2(uint8* dst_ptr,
1808 const uint8* src_ptr, ptrdiff_t src_stride,
1809 int dst_width, int source_y_fraction) {
1810 asm volatile (
1811 "sub %1,%0 \n"
1812 "cmp $0x0,%3 \n"
1813 "je 2f \n"
1814 "cmp $0x80,%3 \n"
1815 "je 3f \n"
1816 "movd %3,%%xmm5 \n"
1817 "punpcklbw %%xmm5,%%xmm5 \n"
1818 "punpcklwd %%xmm5,%%xmm5 \n"
1819 "pshufd $0x0,%%xmm5,%%xmm5 \n"
1820 "pxor %%xmm4,%%xmm4 \n"
1821 ".p2align 4 \n"
1822 "1: \n"
1823 "movdqa (%1),%%xmm0 \n"
1824 "movdqa (%1,%4,1),%%xmm2 \n"
1825 "movdqa %%xmm0,%%xmm1 \n"
1826 "movdqa %%xmm2,%%xmm3 \n"
1827 "punpcklbw %%xmm4,%%xmm2 \n"
1828 "punpckhbw %%xmm4,%%xmm3 \n"
1829 "punpcklbw %%xmm4,%%xmm0 \n"
1830 "punpckhbw %%xmm4,%%xmm1 \n"
1831 "psubw %%xmm0,%%xmm2 \n"
1832 "psubw %%xmm1,%%xmm3 \n"
1833 "pmulhw %%xmm5,%%xmm2 \n"
1834 "pmulhw %%xmm5,%%xmm3 \n"
1835 "paddw %%xmm2,%%xmm0 \n"
1836 "paddw %%xmm3,%%xmm1 \n"
1837 "packuswb %%xmm1,%%xmm0 \n"
1838 "sub $0x10,%2 \n"
1839 "movdqa %%xmm0,(%1,%0,1) \n"
1840 "lea 0x10(%1),%1 \n"
1841 "jg 1b \n"
1842 "jmp 4f \n"
1843 ".p2align 4 \n"
1844 "2: \n"
1845 "movdqa (%1),%%xmm0 \n"
1846 "sub $0x10,%2 \n"
1847 "movdqa %%xmm0,(%1,%0,1) \n"
1848 "lea 0x10(%1),%1 \n"
1849 "jg 2b \n"
1850 "jmp 4f \n"
1851 ".p2align 4 \n"
1852 "3: \n"
1853 "movdqa (%1),%%xmm0 \n"
1854 "pavgb (%1,%4,1),%%xmm0 \n"
1855 "sub $0x10,%2 \n"
1856 "movdqa %%xmm0,(%1,%0,1) \n"
1857 "lea 0x10(%1),%1 \n"
1858 "jg 3b \n"
1859 ".p2align 4 \n"
1860 "4: \n"
1861 "punpckhbw %%xmm0,%%xmm0 \n"
1862 "pshufhw $0xff,%%xmm0,%%xmm0 \n"
1863 "punpckhqdq %%xmm0,%%xmm0 \n"
1864 "movdqa %%xmm0,(%1,%0,1) \n"
1865 : "+r"(dst_ptr), // %0
1866 "+r"(src_ptr), // %1
1867 "+r"(dst_width), // %2
1868 "+r"(source_y_fraction) // %3
1869 : "r"(static_cast<intptr_t>(src_stride)) // %4
1870 : "memory", "cc"
1871 #if defined(__SSE2__)
1872 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1873 #endif
1874 );
1875 }
1876 #endif // SSE2_DISABLED
1877
1878 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
1879 #define HAS_SCALEFILTERROWS_SSSE3
1880 static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
1881 const uint8* src_ptr, ptrdiff_t src_stride,
1882 int dst_width, int source_y_fraction) {
1883 asm volatile (
1884 "sub %1,%0 \n"
1885 "shr %3 \n"
1886 "cmp $0x0,%3 \n"
1887 "je 2f \n"
1888 "cmp $0x40,%3 \n"
1889 "je 3f \n"
1890 "movd %3,%%xmm0 \n"
1891 "neg %3 \n"
1892 "add $0x80,%3 \n"
1893 "movd %3,%%xmm5 \n"
1894 "punpcklbw %%xmm0,%%xmm5 \n"
1895 "punpcklwd %%xmm5,%%xmm5 \n"
1896 "pshufd $0x0,%%xmm5,%%xmm5 \n"
1897 ".p2align 4 \n"
1898 "1: \n"
1899 "movdqa (%1),%%xmm0 \n"
1900 "movdqa (%1,%4,1),%%xmm2 \n"
1901 "movdqa %%xmm0,%%xmm1 \n"
1902 "punpcklbw %%xmm2,%%xmm0 \n"
1903 "punpckhbw %%xmm2,%%xmm1 \n"
1904 "pmaddubsw %%xmm5,%%xmm0 \n"
1905 "pmaddubsw %%xmm5,%%xmm1 \n"
1906 "psrlw $0x7,%%xmm0 \n"
1907 "psrlw $0x7,%%xmm1 \n"
1908 "packuswb %%xmm1,%%xmm0 \n"
1909 "sub $0x10,%2 \n"
1910 "movdqa %%xmm0,(%1,%0,1) \n"
1911 "lea 0x10(%1),%1 \n"
1912 "jg 1b \n"
1913 "jmp 4f \n"
1914 ".p2align 4 \n"
1915 "2: \n"
1916 "movdqa (%1),%%xmm0 \n"
1917 "sub $0x10,%2 \n"
1918 "movdqa %%xmm0,(%1,%0,1) \n"
1919 "lea 0x10(%1),%1 \n"
1920 "jg 2b \n"
1921 "jmp 4f \n"
1922 ".p2align 4 \n"
1923 "3: \n"
1924 "movdqa (%1),%%xmm0 \n"
1925 "pavgb (%1,%4,1),%%xmm0 \n"
1926 "sub $0x10,%2 \n"
1927 "movdqa %%xmm0,(%1,%0,1) \n"
1928 "lea 0x10(%1),%1 \n"
1929 "jg 3b \n"
1930 ".p2align 4 \n"
1931 "4: \n"
1932 "punpckhbw %%xmm0,%%xmm0 \n"
1933 "pshufhw $0xff,%%xmm0,%%xmm0 \n"
1934 "punpckhqdq %%xmm0,%%xmm0 \n"
1935 "movdqa %%xmm0,(%1,%0,1) \n"
1936 : "+r"(dst_ptr), // %0
1937 "+r"(src_ptr), // %1
1938 "+r"(dst_width), // %2
1939 "+r"(source_y_fraction) // %3
1940 : "r"(static_cast<intptr_t>(src_stride)) // %4
1941 : "memory", "cc"
1942 #if defined(__SSE2__)
1943 , "xmm0", "xmm1", "xmm2", "xmm5"
1944 #endif
1945 );
1946 }
1947 #endif // defined(__x86_64__) || defined(__i386__)
1948
1949 // CPU-agnostic row functions.
1950 static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
1951 uint8* dst, int dst_width) {
1952 uint8* dend = dst + dst_width - 1;
1953 do {
1954 dst[0] = src_ptr[0];
1955 dst[1] = src_ptr[2];
1956 dst += 2;
1957 src_ptr += 4;
1958 } while (dst < dend);
1959 if (dst_width & 1) {
1960 dst[0] = src_ptr[0];
1961 }
1962 }
1963
1964 void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
1965 uint8* dst, int dst_width) {
1966 const uint8* s = src_ptr;
1967 const uint8* t = src_ptr + src_stride;
1968 uint8* dend = dst + dst_width - 1;
1969 do {
1970 dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
1971 dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
1972 dst += 2;
1973 s += 4;
1974 t += 4;
1975 } while (dst < dend);
1976 if (dst_width & 1) {
1977 dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
1978 }
1979 }
1980
1981 static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
1982 uint8* dst, int dst_width) {
1983 uint8* dend = dst + dst_width - 1;
1984 do {
1985 dst[0] = src_ptr[0];
1986 dst[1] = src_ptr[4];
1987 dst += 2;
1988 src_ptr += 8;
1989 } while (dst < dend);
1990 if (dst_width & 1) {
1991 dst[0] = src_ptr[0];
1992 }
1993 }
1994
1995 static void ScaleRowDown4Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
1996 uint8* dst, int dst_width) {
1997 intptr_t stride = src_stride;
1998 uint8* dend = dst + dst_width - 1;
1999 do {
2000 dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
2001 src_ptr[stride + 0] + src_ptr[stride + 1] +
2002 src_ptr[stride + 2] + src_ptr[stride + 3] +
2003 src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
2004 src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
2005 src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
2006 src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
2007 8) >> 4;
2008 dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
2009 src_ptr[stride + 4] + src_ptr[stride + 5] +
2010 src_ptr[stride + 6] + src_ptr[stride + 7] +
2011 src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
2012 src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
2013 src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
2014 src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
2015 8) >> 4;
2016 dst += 2;
2017 src_ptr += 8;
2018 } while (dst < dend);
2019 if (dst_width & 1) {
2020 dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
2021 src_ptr[stride + 0] + src_ptr[stride + 1] +
2022 src_ptr[stride + 2] + src_ptr[stride + 3] +
2023 src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
2024 src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
2025 src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
2026 src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
2027 8) >> 4;
2028 }
2029 }
2030
2031 // 640 output pixels are enough to allow 5120 input pixels with a 1/8 scale down.
2032 // Keeping the total buffer under 4096 bytes avoids a stack check, saving 4% CPU.
2033 static const int kMaxOutputWidth = 640;
2034 static const int kMaxRow12 = kMaxOutputWidth * 2;
2035
2036 static void ScaleRowDown8_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
2037 uint8* dst, int dst_width) {
2038 uint8* dend = dst + dst_width - 1;
2039 do {
2040 dst[0] = src_ptr[0];
2041 dst[1] = src_ptr[8];
2042 dst += 2;
2043 src_ptr += 16;
2044 } while (dst < dend);
2045 if (dst_width & 1) {
2046 dst[0] = src_ptr[0];
2047 }
2048 }
2049
2050 // Note: calling code checks that the width is at most kMaxOutputWidth and,
2051 // if it is larger, uses ScaleRowDown8_C instead.
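// The 1/8 box filter below is built from two 1/4 box passes (source rows 0..3
// and 4..7) followed by a final 1/2 box pass over the two intermediate rows.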
2052 static void ScaleRowDown8Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
2053 uint8* dst, int dst_width) {
2054 SIMD_ALIGNED(uint8 src_row[kMaxRow12 * 2]);
2055 assert(dst_width <= kMaxOutputWidth);
2056 ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
2057 ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
2058 src_row + kMaxOutputWidth,
2059 dst_width * 2);
2060 ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
2061 }
2062
2063 static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
2064 uint8* dst, int dst_width) {
2065 assert((dst_width % 3 == 0) && (dst_width > 0));
2066 uint8* dend = dst + dst_width;
2067 do {
2068 dst[0] = src_ptr[0];
2069 dst[1] = src_ptr[1];
2070 dst[2] = src_ptr[3];
2071 dst += 3;
2072 src_ptr += 4;
2073 } while (dst < dend);
2074 }
2075
2076 // Filter rows 0 and 1 together, 3 : 1
2077 static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
2078 uint8* d, int dst_width) {
2079 assert((dst_width % 3 == 0) && (dst_width > 0));
2080 const uint8* s = src_ptr;
2081 const uint8* t = src_ptr + src_stride;
2082 uint8* dend = d + dst_width;
2083 do {
2084 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2085 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2086 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2087 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
2088 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
2089 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
2090 d[0] = (a0 * 3 + b0 + 2) >> 2;
2091 d[1] = (a1 * 3 + b1 + 2) >> 2;
2092 d[2] = (a2 * 3 + b2 + 2) >> 2;
2093 d += 3;
2094 s += 4;
2095 t += 4;
2096 } while (d < dend);
2097 }
2098
2099 // Filter rows 1 and 2 together, 1 : 1
2100 static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
2101 uint8* d, int dst_width) {
2102 assert((dst_width % 3 == 0) && (dst_width > 0));
2103 const uint8* s = src_ptr;
2104 const uint8* t = src_ptr + src_stride;
2105 uint8* dend = d + dst_width;
2106 do {
2107 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2108 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2109 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2110 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
2111 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
2112 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
2113 d[0] = (a0 + b0 + 1) >> 1;
2114 d[1] = (a1 + b1 + 1) >> 1;
2115 d[2] = (a2 + b2 + 1) >> 1;
2116 d += 3;
2117 s += 4;
2118 t += 4;
2119 } while (d < dend);
2120 }
2121
2122 // (1-f)a + fb can be replaced with a + f(b-a)
2123 #define BLENDER(a, b, f) (static_cast<int>(a) + \
2124 ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
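// Illustrative example: with a = 100, b = 200 and f = 0x8000 (0.5 in 16.16
// fixed point), BLENDER(a, b, f) = 100 + ((0x8000 * 100) >> 16) = 150.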
2125
2126 static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
2127 int dst_width, int x, int dx) {
2128 for (int j = 0; j < dst_width - 1; j += 2) {
2129 int xi = x >> 16;
2130 int a = src_ptr[xi];
2131 int b = src_ptr[xi + 1];
2132 dst_ptr[0] = BLENDER(a, b, x & 0xffff);
2133 x += dx;
2134 xi = x >> 16;
2135 a = src_ptr[xi];
2136 b = src_ptr[xi + 1];
2137 dst_ptr[1] = BLENDER(a, b, x & 0xffff);
2138 x += dx;
2139 dst_ptr += 2;
2140 }
2141 if (dst_width & 1) {
2142 int xi = x >> 16;
2143 int a = src_ptr[xi];
2144 int b = src_ptr[xi + 1];
2145 dst_ptr[0] = BLENDER(a, b, x & 0xffff);
2146 }
2147 }
2148
2149 static const int kMaxInputWidth = 2560;
2150
2151 #if defined(HAS_SCALEFILTERROWS_SSE2)
2152 // Filter row to 3/4
2153 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
2154 int dst_width) {
2155 assert((dst_width % 3 == 0) && (dst_width > 0));
2156 const uint8* s = src_ptr;
2157 uint8* dend = dst_ptr + dst_width;
2158 do {
2159 dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2160 dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2161 dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2162 dst_ptr += 3;
2163 s += 4;
2164 } while (dst_ptr < dend);
2165 }
2166
2167 #define HAS_SCALEROWDOWN34_SSE2_DISABLED
2168 // Filter rows 0 and 1 together, 3 : 1
2169 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr,
2170 ptrdiff_t src_stride,
2171 uint8* dst_ptr, int dst_width) {
2172 assert((dst_width % 3 == 0) && (dst_width > 0));
2173 SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
2174 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
2175 ScaleFilterCols34_C(dst_ptr, row, dst_width);
2176 }
2177
2178 // Filter rows 1 and 2 together, 1 : 1
2179 static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr,
2180 ptrdiff_t src_stride,
2181 uint8* dst_ptr, int dst_width) {
2182 assert((dst_width % 3 == 0) && (dst_width > 0));
2183 SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
2184 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
2185 ScaleFilterCols34_C(dst_ptr, row, dst_width);
2186 }
2187 #endif
2188
2189 static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
2190 uint8* dst, int dst_width) {
2191 assert(dst_width % 3 == 0);
2192 for (int x = 0; x < dst_width; x += 3) {
2193 dst[0] = src_ptr[0];
2194 dst[1] = src_ptr[3];
2195 dst[2] = src_ptr[6];
2196 dst += 3;
2197 src_ptr += 8;
2198 }
2199 }
2200
2201 // 8x3 -> 3x1
2202 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr,
2203 ptrdiff_t src_stride,
2204 uint8* dst_ptr, int dst_width) {
2205 assert((dst_width % 3 == 0) && (dst_width > 0));
2206 intptr_t stride = src_stride;
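  // Division by the box size (9, or 6 for the narrower last box) is done with
  // a reciprocal multiply in 16.16 fixed point: sum * (65536 / N) >> 16.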
2207 for (int i = 0; i < dst_width; i += 3) {
2208 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
2209 src_ptr[stride + 0] + src_ptr[stride + 1] +
2210 src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
2211 src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
2212 (65536 / 9) >> 16;
2213 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
2214 src_ptr[stride + 3] + src_ptr[stride + 4] +
2215 src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
2216 src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
2217 (65536 / 9) >> 16;
2218 dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
2219 src_ptr[stride + 6] + src_ptr[stride + 7] +
2220 src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
2221 (65536 / 6) >> 16;
2222 src_ptr += 8;
2223 dst_ptr += 3;
2224 }
2225 }
2226
2227 // 8x2 -> 3x1
2228 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
2229 uint8* dst_ptr, int dst_width) {
2230 assert((dst_width % 3 == 0) && (dst_width > 0));
2231 intptr_t stride = src_stride;
2232 for (int i = 0; i < dst_width; i += 3) {
2233 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
2234 src_ptr[stride + 0] + src_ptr[stride + 1] +
2235 src_ptr[stride + 2]) * (65536 / 6) >> 16;
2236 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
2237 src_ptr[stride + 3] + src_ptr[stride + 4] +
2238 src_ptr[stride + 5]) * (65536 / 6) >> 16;
2239 dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
2240 src_ptr[stride + 6] + src_ptr[stride + 7]) *
2241 (65536 / 4) >> 16;
2242 src_ptr += 8;
2243 dst_ptr += 3;
2244 }
2245 }
2246
2247 // C version 8x2 -> 8x1
2248 static void ScaleFilterRows_C(uint8* dst_ptr,
2249 const uint8* src_ptr, ptrdiff_t src_stride,
2250 int dst_width, int source_y_fraction) {
2251 assert(dst_width > 0);
2252 int y1_fraction = source_y_fraction;
2253 int y0_fraction = 256 - y1_fraction;
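  // e.g. source_y_fraction = 64 weights the first row 3/4 and the second 1/4.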
2254 const uint8* src_ptr1 = src_ptr + src_stride;
2255 uint8* end = dst_ptr + dst_width;
2256 do {
2257 dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
2258 dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
2259 dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
2260 dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
2261 dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
2262 dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
2263 dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
2264 dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
2265 src_ptr += 8;
2266 src_ptr1 += 8;
2267 dst_ptr += 8;
2268 } while (dst_ptr < end);
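  // Duplicate the last pixel so that a following horizontal filter may safely
  // read one element past the end of the row.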
2269 dst_ptr[0] = dst_ptr[-1];
2270 }
2271
2272 void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
2273 uint16* dst_ptr, int src_width, int src_height) {
2274 assert(src_width > 0);
2275 assert(src_height > 0);
2276 for (int x = 0; x < src_width; ++x) {
2277 const uint8* s = src_ptr + x;
2278 int sum = 0;
2279 for (int y = 0; y < src_height; ++y) {
2280 sum += s[0];
2281 s += src_stride;
2282 }
2283 dst_ptr[x] = sum;
2284 }
2285 }
2286
2287 /**
2288 * Scale plane, 1/2
2289 *
2290 * This is an optimized version for scaling down a plane to 1/2 of
2291 * its original size.
2292 *
2293 */
2294 static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
2295 int dst_width, int dst_height,
2296 int src_stride, int dst_stride,
2297 const uint8* src_ptr, uint8* dst_ptr,
2298 FilterMode filtering) {
2299 void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
2300 uint8* dst_ptr, int dst_width) =
2301 filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
2302 #if defined(HAS_SCALEROWDOWN2_NEON)
2303 if (TestCpuFlag(kCpuHasNEON) &&
2304 IS_ALIGNED(dst_width, 16)) {
2305 ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
2306 }
2307 #elif defined(HAS_SCALEROWDOWN2_SSE2)
2308 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
2309 ScaleRowDown2 = filtering ? ScaleRowDown2Int_Unaligned_SSE2 :
2310 ScaleRowDown2_Unaligned_SSE2;
2311 if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
2312 IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
2313 ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
2314 }
2315 }
2316 #endif
2317
2318 // TODO(fbarchard): Loop through source height to allow odd height.
2319 for (int y = 0; y < dst_height; ++y) {
2320 ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
2321 src_ptr += (src_stride << 1);
2322 dst_ptr += dst_stride;
2323 }
2324 }
2325
2326 /**
2327 * Scale plane, 1/4
2328 *
2329 * This is an optimized version for scaling down a plane to 1/4 of
2330 * its original size.
2331 */
2332 static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
2333 int dst_width, int dst_height,
2334 int src_stride, int dst_stride,
2335 const uint8* src_ptr, uint8* dst_ptr,
2336 FilterMode filtering) {
2337 void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
2338 uint8* dst_ptr, int dst_width) =
2339 filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
2340 #if defined(HAS_SCALEROWDOWN4_NEON)
2341 if (TestCpuFlag(kCpuHasNEON) &&
2342 IS_ALIGNED(dst_width, 4)) {
2343 ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
2344 }
2345 #elif defined(HAS_SCALEROWDOWN4_SSE2)
2346 if (TestCpuFlag(kCpuHasSSE2) &&
2347 IS_ALIGNED(dst_width, 8) &&
2348 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
2349 ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
2350 }
2351 #endif
2352
2353 for (int y = 0; y < dst_height; ++y) {
2354 ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
2355 src_ptr += (src_stride << 2);
2356 dst_ptr += dst_stride;
2357 }
2358 }
2359
2360 /**
2361 * Scale plane, 1/8
2362 *
2363 * This is an optimized version for scaling down a plane to 1/8
2364 * of its original size.
2365 *
2366 */
2367 static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
2368 int dst_width, int dst_height,
2369 int src_stride, int dst_stride,
2370 const uint8* src_ptr, uint8* dst_ptr,
2371 FilterMode filtering) {
2372 void (*ScaleRowDown8)(const uint8* src_ptr, ptrdiff_t src_stride,
2373 uint8* dst_ptr, int dst_width) =
2374 filtering && (dst_width <= kMaxOutputWidth) ?
2375 ScaleRowDown8Int_C : ScaleRowDown8_C;
2376 #if defined(HAS_SCALEROWDOWN8_SSE2)
2377 if (TestCpuFlag(kCpuHasSSE2) &&
2378 IS_ALIGNED(dst_width, 4) &&
2379 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
2380 ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
2381 }
2382 #endif
2383
2384 for (int y = 0; y < dst_height; ++y) {
2385 ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
2386 src_ptr += (src_stride << 3);
2387 dst_ptr += dst_stride;
2388 }
2389 }
2390
2391 /**
2392 * Scale plane down, 3/4
2393 *
2394 * Provided by Frank Barchard (fbarchard@google.com)
2395 *
2396 */
2397 static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
2398 int dst_width, int dst_height,
2399 int src_stride, int dst_stride,
2400 const uint8* src_ptr, uint8* dst_ptr,
2401 FilterMode filtering) {
2402 assert(dst_width % 3 == 0);
2403 void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
2404 uint8* dst_ptr, int dst_width);
2405 void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
2406 uint8* dst_ptr, int dst_width);
2407 if (!filtering) {
2408 ScaleRowDown34_0 = ScaleRowDown34_C;
2409 ScaleRowDown34_1 = ScaleRowDown34_C;
2410 } else {
2411 ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
2412 ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
2413 }
2414 #if defined(HAS_SCALEROWDOWN34_NEON)
2415 if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
2416 if (!filtering) {
2417 ScaleRowDown34_0 = ScaleRowDown34_NEON;
2418 ScaleRowDown34_1 = ScaleRowDown34_NEON;
2419 } else {
2420 ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
2421 ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
2422 }
2423 }
2424 #endif
2425 #if defined(HAS_SCALEROWDOWN34_SSE2)
2426 if (TestCpuFlag(kCpuHasSSE2) && (dst_width % 24 == 0) &&
2427 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && filtering) {
2428 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
2429 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
2430 }
2431 #endif
2432 #if defined(HAS_SCALEROWDOWN34_SSSE3)
2433 if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
2434 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
2435 if (!filtering) {
2436 ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
2437 ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
2438 } else {
2439 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
2440 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
2441 }
2442 }
2443 #endif
2444
2445 for (int y = 0; y < dst_height - 2; y += 3) {
2446 ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
2447 src_ptr += src_stride;
2448 dst_ptr += dst_stride;
2449 ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
2450 src_ptr += src_stride;
2451 dst_ptr += dst_stride;
2452 ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
2453 dst_ptr, dst_width);
2454 src_ptr += src_stride * 2;
2455 dst_ptr += dst_stride;
2456 }
2457
2458   // Handle the remaining 1 or 2 rows; the last row is not filtered vertically.
2459 if ((dst_height % 3) == 2) {
2460 ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
2461 src_ptr += src_stride;
2462 dst_ptr += dst_stride;
2463 ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
2464 } else if ((dst_height % 3) == 1) {
2465 ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
2466 }
2467 }
2468
2469 /**
2470 * Scale plane, 3/8
2471 *
2472 * This is an optimized version for scaling down a plane to 3/8
2473 * of its original size.
2474 *
2475  * Uses box filter arrangements like this:
2476 * aaabbbcc -> abc
2477 * aaabbbcc def
2478 * aaabbbcc ghi
2479 * dddeeeff
2480 * dddeeeff
2481 * dddeeeff
2482 * ggghhhii
2483 * ggghhhii
2484 * Boxes are 3x3, 2x3, 3x2 and 2x2
2485 */
2486 static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
2487 int dst_width, int dst_height,
2488 int src_stride, int dst_stride,
2489 const uint8* src_ptr, uint8* dst_ptr,
2490 FilterMode filtering) {
2491 assert(dst_width % 3 == 0);
2492 void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
2493 uint8* dst_ptr, int dst_width);
2494 void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
2495 uint8* dst_ptr, int dst_width);
2496 if (!filtering) {
2497 ScaleRowDown38_3 = ScaleRowDown38_C;
2498 ScaleRowDown38_2 = ScaleRowDown38_C;
2499 } else {
2500 ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
2501 ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
2502 }
2503 #if defined(HAS_SCALEROWDOWN38_NEON)
2504 if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
2505 if (!filtering) {
2506 ScaleRowDown38_3 = ScaleRowDown38_NEON;
2507 ScaleRowDown38_2 = ScaleRowDown38_NEON;
2508 } else {
2509 ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
2510 ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
2511 }
2512 }
2513 #elif defined(HAS_SCALEROWDOWN38_SSSE3)
2514 if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
2515 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
2516 if (!filtering) {
2517 ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
2518 ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
2519 } else {
2520 ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
2521 ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
2522 }
2523 }
2524 #endif
2525
2526 for (int y = 0; y < dst_height - 2; y += 3) {
2527 ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
2528 src_ptr += src_stride * 3;
2529 dst_ptr += dst_stride;
2530 ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
2531 src_ptr += src_stride * 3;
2532 dst_ptr += dst_stride;
2533 ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
2534 src_ptr += src_stride * 2;
2535 dst_ptr += dst_stride;
2536 }
2537
2538   // Handle the remaining 1 or 2 rows; the last row is not filtered vertically.
2539 if ((dst_height % 3) == 2) {
2540 ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
2541 src_ptr += src_stride * 3;
2542 dst_ptr += dst_stride;
2543 ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
2544 } else if ((dst_height % 3) == 1) {
2545 ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
2546 }
2547 }
2548
2549 static __inline uint32 SumBox(int iboxwidth, int iboxheight,
2550 ptrdiff_t src_stride, const uint8* src_ptr) {
2551 assert(iboxwidth > 0);
2552 assert(iboxheight > 0);
2553 uint32 sum = 0u;
2554 for (int y = 0; y < iboxheight; ++y) {
2555 for (int x = 0; x < iboxwidth; ++x) {
2556 sum += src_ptr[x];
2557 }
2558 src_ptr += src_stride;
2559 }
2560 return sum;
2561 }
2562
2563 static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
2564 int x, int dx, ptrdiff_t src_stride,
2565 const uint8* src_ptr, uint8* dst_ptr) {
2566 for (int i = 0; i < dst_width; ++i) {
2567 int ix = x >> 16;
2568 x += dx;
2569 int boxwidth = (x >> 16) - ix;
2570 *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
2571 (boxwidth * boxheight);
2572 }
2573 }
2574
2575 static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
2576 assert(iboxwidth > 0);
2577 uint32 sum = 0u;
2578 for (int x = 0; x < iboxwidth; ++x) {
2579 sum += src_ptr[x];
2580 }
2581 return sum;
2582 }
2583
2584 static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
2585 const uint16* src_ptr, uint8* dst_ptr) {
2586 int scaletbl[2];
2587 int minboxwidth = (dx >> 16);
2588 scaletbl[0] = 65536 / (minboxwidth * boxheight);
2589 scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
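  // Bias the pointer so that scaleptr[boxwidth] selects the reciprocal for a
  // box of either minboxwidth or minboxwidth + 1 pixels.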
2590   int* scaleptr = scaletbl - minboxwidth;
2591 for (int i = 0; i < dst_width; ++i) {
2592 int ix = x >> 16;
2593 x += dx;
2594 int boxwidth = (x >> 16) - ix;
2595 *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
2596 }
2597 }
2598
2599 static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
2600 const uint16* src_ptr, uint8* dst_ptr) {
2601 int boxwidth = (dx >> 16);
2602 int scaleval = 65536 / (boxwidth * boxheight);
2603 for (int i = 0; i < dst_width; ++i) {
2604 *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
2605 x += boxwidth;
2606 }
2607 }
2608
2609 /**
2610 * Scale plane down to any dimensions, with interpolation.
2611 * (boxfilter).
2612 *
2613  * Same method as SimpleScale, which is fixed point: it outputs
2614  * one destination pixel at a time, using fixed point (16.16) to step
2615  * through the source and sampling a box of pixels with simple
2616  * averaging.
2617 */
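// Illustrative example: scaling a 1280 pixel wide source to 400 pixels gives
// dx = (1280 << 16) / 400 = 0x33333 (about 3.2), so the sampled boxes are
// either 3 or 4 source pixels wide.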
2618 static void ScalePlaneBox(int src_width, int src_height,
2619 int dst_width, int dst_height,
2620 int src_stride, int dst_stride,
2621 const uint8* src_ptr, uint8* dst_ptr) {
2622 assert(dst_width > 0);
2623 assert(dst_height > 0);
2624 int dx = (src_width << 16) / dst_width;
2625 int dy = (src_height << 16) / dst_height;
2626 int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
2627 int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
2628 int maxy = (src_height << 16);
2629 if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
2630 dst_height * 2 > src_height) {
2631 uint8* dst = dst_ptr;
2632 for (int j = 0; j < dst_height; ++j) {
2633 int iy = y >> 16;
2634 const uint8* src = src_ptr + iy * src_stride;
2635 y += dy;
2636 if (y > maxy) {
2637 y = maxy;
2638 }
2639 int boxheight = (y >> 16) - iy;
2640 ScalePlaneBoxRow_C(dst_width, boxheight,
2641 x, dx, src_stride,
2642 src, dst);
2643 dst += dst_stride;
2644 }
2645 } else {
2646 SIMD_ALIGNED(uint16 row[kMaxInputWidth]);
2647 void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
2648 uint16* dst_ptr, int src_width, int src_height)=
2649 ScaleAddRows_C;
2650 void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
2651 const uint16* src_ptr, uint8* dst_ptr);
2652 if (dx & 0xffff) {
2653 ScaleAddCols = ScaleAddCols2_C;
2654 } else {
2655 ScaleAddCols = ScaleAddCols1_C;
2656 }
2657 #if defined(HAS_SCALEADDROWS_SSE2)
2658 if (TestCpuFlag(kCpuHasSSE2) &&
2659 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
2660 ScaleAddRows = ScaleAddRows_SSE2;
2661 }
2662 #endif
2663
2664 for (int j = 0; j < dst_height; ++j) {
2665 int iy = y >> 16;
2666 const uint8* src = src_ptr + iy * src_stride;
2667 y += dy;
2668 if (y > (src_height << 16)) {
2669 y = (src_height << 16);
2670 }
2671 int boxheight = (y >> 16) - iy;
2672 ScaleAddRows(src, src_stride, row, src_width, boxheight);
2673 ScaleAddCols(dst_width, boxheight, x, dx, row, dst_ptr);
2674 dst_ptr += dst_stride;
2675 }
2676 }
2677 }
2678
2679 /**
2680 * Scale plane to/from any dimensions, with interpolation.
2681 */
2682 static void ScalePlaneBilinearSimple(int src_width, int src_height,
2683 int dst_width, int dst_height,
2684 int src_stride, int dst_stride,
2685 const uint8* src_ptr, uint8* dst_ptr) {
2686 int dx = (src_width << 16) / dst_width;
2687 int dy = (src_height << 16) / dst_height;
2688 int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
2689 int maxx = (src_width > 1) ? ((src_width - 1) << 16) - 1 : 0;
2690 int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
2691 for (int i = 0; i < dst_height; ++i) {
2692 int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
2693 int yi = y >> 16;
2694 int yf = y & 0xffff;
2695 const uint8* src0 = src_ptr + yi * src_stride;
2696 const uint8* src1 = (yi < src_height - 1) ? src0 + src_stride : src0;
2697 uint8* dst = dst_ptr;
2698 for (int j = 0; j < dst_width; ++j) {
2699 int xi = x >> 16;
2700 int xf = x & 0xffff;
2701 int x1 = (xi < src_width - 1) ? xi + 1 : xi;
2702 int a = src0[xi];
2703 int b = src0[x1];
2704 int r0 = BLENDER(a, b, xf);
2705 a = src1[xi];
2706 b = src1[x1];
2707 int r1 = BLENDER(a, b, xf);
2708 *dst++ = BLENDER(r0, r1, yf);
2709 x += dx;
2710 if (x > maxx)
2711 x = maxx;
2712 }
2713 dst_ptr += dst_stride;
2714 y += dy;
2715 if (y > maxy)
2716 y = maxy;
2717 }
2718 }
2719
2720 /**
2721 * Scale plane to/from any dimensions, with bilinear
2722 * interpolation.
2723 */
2724 void ScalePlaneBilinear(int src_width, int src_height,
2725 int dst_width, int dst_height,
2726 int src_stride, int dst_stride,
2727 const uint8* src_ptr, uint8* dst_ptr) {
2728 assert(dst_width > 0);
2729 assert(dst_height > 0);
2730 if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
2731 ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
2732 src_stride, dst_stride, src_ptr, dst_ptr);
2733
2734 } else {
2735 SIMD_ALIGNED(uint8 row[kMaxInputWidth + 16]);
2736 void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
2737 ptrdiff_t src_stride,
2738 int dst_width, int source_y_fraction) =
2739 ScaleFilterRows_C;
2740 #if defined(HAS_SCALEFILTERROWS_NEON)
2741 if (TestCpuFlag(kCpuHasNEON)) {
2742 ScaleFilterRows = ScaleFilterRows_NEON;
2743 }
2744 #endif
2745 #if defined(HAS_SCALEFILTERROWS_SSE2)
2746 if (TestCpuFlag(kCpuHasSSE2) &&
2747 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
2748 ScaleFilterRows = ScaleFilterRows_SSE2;
2749 }
2750 #endif
2751 #if defined(HAS_SCALEFILTERROWS_SSSE3)
2752 if (TestCpuFlag(kCpuHasSSSE3) &&
2753 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
2754 ScaleFilterRows = ScaleFilterRows_SSSE3;
2755 }
2756 #endif
2757
2758 int dx = (src_width << 16) / dst_width;
2759 int dy = (src_height << 16) / dst_height;
2760 int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
2761 int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
2762 int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
2763 for (int j = 0; j < dst_height; ++j) {
2764 int yi = y >> 16;
2765 int yf = (y >> 8) & 255;
2766 const uint8* src = src_ptr + yi * src_stride;
2767 ScaleFilterRows(row, src, src_stride, src_width, yf);
2768 ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
2769 dst_ptr += dst_stride;
2770 y += dy;
2771 if (y > maxy) {
2772 y = maxy;
2773 }
2774 }
2775 }
2776 }
2777
2778 /**
2779 * Scale plane to/from any dimensions, without interpolation.
2780  * Fixed point math is used for performance: the upper 16 bits
2781  * of x and dx are the integer part of the source position and
2782  * the lower 16 bits are the fractional part.
2783 */
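// Illustrative example: scaling 640 source pixels to 480 gives
// dx = (640 << 16) / 480 = 0x15555 (about 1.333), so three output pixels are
// produced for every four source pixels as x steps through the row.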
2784 static void ScalePlaneSimple(int src_width, int src_height,
2785 int dst_width, int dst_height,
2786 int src_stride, int dst_stride,
2787 const uint8* src_ptr, uint8* dst_ptr) {
2788 int dx = (src_width << 16) / dst_width;
2789 int dy = (src_height << 16) / dst_height;
2790 int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
2791 for (int j = 0; j < dst_height; ++j) {
2792 int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
2793 int yi = y >> 16;
2794 const uint8* src = src_ptr + yi * src_stride;
2795 uint8* dst = dst_ptr;
2796 for (int i = 0; i < dst_width; ++i) {
2797 *dst++ = src[x >> 16];
2798 x += dx;
2799 }
2800 dst_ptr += dst_stride;
2801 y += dy;
2802 }
2803 }
2804
2805 /**
2806 * Scale plane to/from any dimensions.
2807 */
2808 static void ScalePlaneAnySize(int src_width, int src_height,
2809 int dst_width, int dst_height,
2810 int src_stride, int dst_stride,
2811 const uint8* src_ptr, uint8* dst_ptr,
2812 FilterMode filtering) {
2813 if (!filtering) {
2814 ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
2815 src_stride, dst_stride, src_ptr, dst_ptr);
2816 } else {
2817 // fall back to non-optimized version
2818 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
2819 src_stride, dst_stride, src_ptr, dst_ptr);
2820 }
2821 }
2822
2823 /**
2824 * Scale plane down, any size
2825 *
2826 * This is an optimized version for scaling down a plane to any size.
2827  * The current implementation is ~10 times faster than the
2828  * reference implementation for e.g. XGA->LowResPAL
2829 *
2830 */
2831 static void ScalePlaneDown(int src_width, int src_height,
2832 int dst_width, int dst_height,
2833 int src_stride, int dst_stride,
2834 const uint8* src_ptr, uint8* dst_ptr,
2835 FilterMode filtering) {
2836 if (!filtering) {
2837 ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
2838 src_stride, dst_stride, src_ptr, dst_ptr);
2839 } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) {
2840 // between 1/2x and 1x use bilinear
2841 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
2842 src_stride, dst_stride, src_ptr, dst_ptr);
2843 } else {
2844 ScalePlaneBox(src_width, src_height, dst_width, dst_height,
2845 src_stride, dst_stride, src_ptr, dst_ptr);
2846 }
2847 }
2848
2849 // Scale a plane.
2850 // This function in turn calls a scaling function suitable for handling
2851 // the desired resolutions.
2852
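// A minimal usage sketch (illustrative only; src and dst are caller-provided
// uint8 buffers of 640x360 and 320x180 bytes respectively):
//   ScalePlane(src, 640, 640, 360, dst, 320, 320, 180, kFilterBox);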
2853 LIBYUV_API
2854 void ScalePlane(const uint8* src, int src_stride,
2855 int src_width, int src_height,
2856 uint8* dst, int dst_stride,
2857 int dst_width, int dst_height,
2858 FilterMode filtering) {
2859 #ifdef CPU_X86
2860 // environment variable overrides for testing.
2861 char *filter_override = getenv("LIBYUV_FILTER");
2862 if (filter_override) {
2863 filtering = (FilterMode)atoi(filter_override); // NOLINT
2864 }
2865 #endif
2866 // Use specialized scales to improve performance for common resolutions.
2867 // For example, all the 1/2 scalings will use ScalePlaneDown2()
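  // e.g. 1280x720 -> 640x360 is one such 1/2 scaling and uses ScalePlaneDown2().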
2868 if (dst_width == src_width && dst_height == src_height) {
2869 // Straight copy.
2870 CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
2871 } else if (dst_width <= src_width && dst_height <= src_height) {
2872 // Scale down.
2873 if (use_reference_impl_) {
2874 // For testing, allow the optimized versions to be disabled.
2875 ScalePlaneDown(src_width, src_height, dst_width, dst_height,
2876 src_stride, dst_stride, src, dst, filtering);
2877 } else if (4 * dst_width == 3 * src_width &&
2878 4 * dst_height == 3 * src_height) {
2879 // optimized, 3/4
2880 ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
2881 src_stride, dst_stride, src, dst, filtering);
2882 } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
2883 // optimized, 1/2
2884 ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
2885 src_stride, dst_stride, src, dst, filtering);
2886 // 3/8 rounded up for odd sized chroma height.
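  // e.g. a 45 row chroma plane scales to (45 * 3 + 7) / 8 = 17 rows.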
2887 } else if (8 * dst_width == 3 * src_width &&
2888 dst_height == ((src_height * 3 + 7) / 8)) {
2889 // optimized, 3/8
2890 ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
2891 src_stride, dst_stride, src, dst, filtering);
2892 } else if (4 * dst_width == src_width && 4 * dst_height == src_height &&
2893 filtering != kFilterBilinear) {
2894 // optimized, 1/4
2895 ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
2896 src_stride, dst_stride, src, dst, filtering);
2897 } else if (8 * dst_width == src_width && 8 * dst_height == src_height &&
2898 filtering != kFilterBilinear) {
2899 // optimized, 1/8
2900 ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
2901 src_stride, dst_stride, src, dst, filtering);
2902 } else {
2903 // Arbitrary downsample
2904 ScalePlaneDown(src_width, src_height, dst_width, dst_height,
2905 src_stride, dst_stride, src, dst, filtering);
2906 }
2907 } else {
2908 // Arbitrary scale up and/or down.
2909 ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
2910 src_stride, dst_stride, src, dst, filtering);
2911 }
2912 }
2913
2914 // Scale an I420 image.
2915 // This function in turn calls a scaling function for each plane.
2916
2917 #define UNDER_ALLOCATED_HACK 1
2918
2919 LIBYUV_API
2920 int I420Scale(const uint8* src_y, int src_stride_y,
2921 const uint8* src_u, int src_stride_u,
2922 const uint8* src_v, int src_stride_v,
2923 int src_width, int src_height,
2924 uint8* dst_y, int dst_stride_y,
2925 uint8* dst_u, int dst_stride_u,
2926 uint8* dst_v, int dst_stride_v,
2927 int dst_width, int dst_height,
2928 FilterMode filtering) {
2929 if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
2930 !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
2931 return -1;
2932 }
2933 // Negative height means invert the image.
2934 if (src_height < 0) {
2935 src_height = -src_height;
2936 int halfheight = (src_height + 1) >> 1;
2937 src_y = src_y + (src_height - 1) * src_stride_y;
2938 src_u = src_u + (halfheight - 1) * src_stride_u;
2939 src_v = src_v + (halfheight - 1) * src_stride_v;
2940 src_stride_y = -src_stride_y;
2941 src_stride_u = -src_stride_u;
2942 src_stride_v = -src_stride_v;
2943 }
2944 int src_halfwidth = (src_width + 1) >> 1;
2945 int src_halfheight = (src_height + 1) >> 1;
2946 int dst_halfwidth = (dst_width + 1) >> 1;
2947 int dst_halfheight = (dst_height + 1) >> 1;
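  // e.g. a 99x99 source has 50x50 chroma planes after the rounding above.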
2948
2949 #ifdef UNDER_ALLOCATED_HACK
2950 // If caller passed width / 2 for stride, adjust halfwidth to match.
2951 if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
2952 src_halfwidth = src_width >> 1;
2953 }
2954 if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
2955 dst_halfwidth = dst_width >> 1;
2956 }
2957 // If caller used height / 2 when computing src_v, it will point into what
2958 // should be the src_u plane. Detect this and reduce halfheight to match.
2959 int uv_src_plane_size = src_halfwidth * src_halfheight;
2960 if ((src_height & 1) &&
2961 (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
2962 src_halfheight = src_height >> 1;
2963 }
2964 int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
2965 if ((dst_height & 1) &&
2966 (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
2967 dst_halfheight = dst_height >> 1;
2968 }
2969 #endif
2970
2971 ScalePlane(src_y, src_stride_y, src_width, src_height,
2972 dst_y, dst_stride_y, dst_width, dst_height,
2973 filtering);
2974 ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
2975 dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
2976 filtering);
2977 ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
2978 dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
2979 filtering);
2980 return 0;
2981 }
2982
2983 // Deprecated api
2984 LIBYUV_API
2985 int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
2986 int src_stride_y, int src_stride_u, int src_stride_v,
2987 int src_width, int src_height,
2988 uint8* dst_y, uint8* dst_u, uint8* dst_v,
2989 int dst_stride_y, int dst_stride_u, int dst_stride_v,
2990 int dst_width, int dst_height,
2991 bool interpolate) {
2992 if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
2993 !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
2994 return -1;
2995 }
2996 // Negative height means invert the image.
2997 if (src_height < 0) {
2998 src_height = -src_height;
2999 int halfheight = (src_height + 1) >> 1;
3000 src_y = src_y + (src_height - 1) * src_stride_y;
3001 src_u = src_u + (halfheight - 1) * src_stride_u;
3002 src_v = src_v + (halfheight - 1) * src_stride_v;
3003 src_stride_y = -src_stride_y;
3004 src_stride_u = -src_stride_u;
3005 src_stride_v = -src_stride_v;
3006 }
3007 int src_halfwidth = (src_width + 1) >> 1;
3008 int src_halfheight = (src_height + 1) >> 1;
3009 int dst_halfwidth = (dst_width + 1) >> 1;
3010 int dst_halfheight = (dst_height + 1) >> 1;
3011 FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
3012
3013 #ifdef UNDER_ALLOCATED_HACK
3014 // If caller passed width / 2 for stride, adjust halfwidth to match.
3015 if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
3016 src_halfwidth = src_width >> 1;
3017 }
3018 if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
3019 dst_halfwidth = dst_width >> 1;
3020 }
3021 // If caller used height / 2 when computing src_v, it will point into what
3022 // should be the src_u plane. Detect this and reduce halfheight to match.
3023 int uv_src_plane_size = src_halfwidth * src_halfheight;
3024 if ((src_height & 1) &&
3025 (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
3026 src_halfheight = src_height >> 1;
3027 }
3028 int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
3029 if ((dst_height & 1) &&
3030 (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
3031 dst_halfheight = dst_height >> 1;
3032 }
3033 #endif
3034
3035 ScalePlane(src_y, src_stride_y, src_width, src_height,
3036 dst_y, dst_stride_y, dst_width, dst_height,
3037 filtering);
3038 ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
3039 dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
3040 filtering);
3041 ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
3042 dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
3043 filtering);
3044 return 0;
3045 }
3046
3047 // Deprecated api
3048 LIBYUV_API
3049 int ScaleOffset(const uint8* src, int src_width, int src_height,
3050 uint8* dst, int dst_width, int dst_height, int dst_yoffset,
3051 bool interpolate) {
3052 if (!src || src_width <= 0 || src_height <= 0 ||
3053 !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
3054 dst_yoffset >= dst_height) {
3055 return -1;
3056 }
3057 dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2.
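  // e.g. a dst_yoffset of 5 is rounded down to 4 so the chroma offset
  // (4 >> 1 = 2) stays consistent with the luma offset.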
3058 int src_halfwidth = (src_width + 1) >> 1;
3059 int src_halfheight = (src_height + 1) >> 1;
3060 int dst_halfwidth = (dst_width + 1) >> 1;
3061 int dst_halfheight = (dst_height + 1) >> 1;
3062 int aheight = dst_height - dst_yoffset * 2; // actual output height
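  // src and dst are treated as contiguous I420 buffers: the Y plane is
  // followed by the U plane and then the V plane.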
3063 const uint8* src_y = src;
3064 const uint8* src_u = src + src_width * src_height;
3065 const uint8* src_v = src + src_width * src_height +
3066 src_halfwidth * src_halfheight;
3067 uint8* dst_y = dst + dst_yoffset * dst_width;
3068 uint8* dst_u = dst + dst_width * dst_height +
3069 (dst_yoffset >> 1) * dst_halfwidth;
3070 uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
3071 (dst_yoffset >> 1) * dst_halfwidth;
3072 return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
3073 src_width, src_height, dst_y, dst_u, dst_v, dst_width,
3074 dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
3075 }
3076
3077 #ifdef __cplusplus
3078 } // extern "C"
3079 } // namespace libyuv
3080 #endif
3081