1 /*
2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/scale.h"
12
13 #include <assert.h>
14 #include <string.h>
15
16 #include "libyuv/cpu_id.h"
17
18 #if defined(_MSC_VER)
19 #define ALIGN16(var) __declspec(align(16)) var
20 #else
21 #define ALIGN16(var) var __attribute__((aligned(16)))
22 #endif
23
24 // Note: A Neon reference manual
25 // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
26 // Note: Some SSE2 reference manuals
27 // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
28
29 namespace libyuv {
30
31 // Set the following flag to true to revert to only
32 // using the reference implementation ScalePlaneBox(), and
33 // NOT the optimized versions. Useful for debugging and
34 // when comparing the quality of the resulting YUV planes
35 // as produced by the optimized and non-optimized versions.
36
37 static bool use_reference_impl_ = false;
38
39 void SetUseReferenceImpl(bool use) {
40 use_reference_impl_ = use;
41 }
42
43 /**
44 * NEON downscalers with interpolation.
45 *
46 * Provided by Fritz Koenig
47 *
48 */
49
50 #if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
51 #define HAS_SCALEROWDOWN2_NEON
52 void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
53 uint8* dst, int dst_width) {
54 __asm__ volatile
55 (
56 "1:\n"
57 "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1
58 "vst1.u8 {q0}, [%1]! \n" // store even pixels
59 "subs %2, %2, #16 \n" // 16 processed per loop
60 "bhi 1b \n"
61 : "+r"(src_ptr), // %0
62 "+r"(dst), // %1
63 "+r"(dst_width) // %2
64 :
65 : "q0", "q1" // Clobber List
66 );
67 }
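
// A minimal scalar sketch of what the NEON loop above computes: keep every
// even source pixel (point sample by 2). The helper name is illustrative
// only and is not wired into the dispatch code below.
static inline void ScaleRowDown2_Sketch_C(const uint8* src_ptr,
                                          uint8* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2];  // even pixels kept, odd pixels discarded
  }
}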
68
69 void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
70 uint8* dst, int dst_width) {
71 __asm__ volatile
72 (
73 "mov r4, #2 \n" // rounding constant
74 "add %1, %0 \n" // change the stride to row 2 pointer
75 "vdup.16 q4, r4 \n"
76 "1:\n"
77 "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment
78 "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment
79 "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
80 "vpaddl.u8 q1, q1 \n"
81 "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2
82 "vpadal.u8 q1, q3 \n"
83 "vadd.u16 q0, q4 \n" // rounding
84 "vadd.u16 q1, q4 \n"
85 "vshrn.u16 d0, q0, #2 \n" // downshift and pack
86 "vshrn.u16 d1, q1, #2 \n"
87 "vst1.u8 {q0}, [%2]! \n"
88 "subs %3, %3, #16 \n" // 16 processed per loop
89 "bhi 1b \n"
90 : "+r"(src_ptr), // %0
91 "+r"(src_stride), // %1
92 "+r"(dst), // %2
93 "+r"(dst_width) // %3
94 :
95 : "r4", "q0", "q1", "q2", "q3", "q4" // Clobber List
96 );
97 }
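
// A minimal scalar sketch of the box filter above, using the same rounding:
// each output pixel is the average of a 2x2 source block, (sum + 2) >> 2.
// Illustrative only; not part of the dispatch code.
static inline void ScaleRowDown2Int_Sketch_C(const uint8* src_ptr,
                                             int src_stride,
                                             uint8* dst, int dst_width) {
  const uint8* src_row2 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = static_cast<uint8>(
        (src_ptr[x * 2] + src_ptr[x * 2 + 1] +
         src_row2[x * 2] + src_row2[x * 2 + 1] + 2) >> 2);
  }
}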
98
99 #define HAS_SCALEROWDOWN4_NEON
100 // Expecting widths on ARM devices to be smaller, so 8x4 blocks were used
101 // to get the most coverage. Revisit later to evaluate 16x4 blocks with
102 // handling of leftovers.
103 static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
104 uint8* dst_ptr, int dst_width) {
105 __asm__ volatile
106 (
107 "mov r4, #4 \n"
108 "1: \n"
109 "vld1.u8 {d0[0]}, [%0],r4 \n" // load up only 2 pixels of data to
110 "vld1.u8 {d0[1]}, [%0],r4 \n" // represent the entire 8x4 block
111
112 "vst1.u16 {d0[0]}, [%1]! \n"
113
114 "subs %2, #2 \n" // dst_width -= 2
115 "bhi 1b \n"
116 : "+r"(src_ptr), // %0
117 "+r"(dst_ptr), // %1
118 "+r"(dst_width) // %2
119 :
120 : "r4", "q0", "q1", "memory", "cc"
121 );
122 }
123
124 static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
125 uint8* dst_ptr, int dst_width) {
126 __asm__ volatile
127 (
128 "1: \n"
129 "mov r4, %0 \n"
130 "vld1.u8 {d0}, [r4],%3 \n" // load up 8x4 block of input data
131 "vld1.u8 {d1}, [r4],%3 \n"
132 "vld1.u8 {d2}, [r4],%3 \n"
133 "vld1.u8 {d3}, [r4] \n"
134
135 // data is loaded up into q0 and q1
136 // q0 = a00 a01 a02 a03 b00 b01 b02 b03 a10 a11 a12 a13 b10 b11 b12 b13
137 // q1 = a20 a21 a22 a23 b20 b21 b22 b23 a30 a31 a32 a33 b30 b31 b32 b33
138 // q0 = a00+a01 a02+a03 b00+b01 b02+b03 a10+a11 a12+a13 b10+b11 b12+b13
139 "vpaddl.u8 q0, q0 \n"
140
141 // d0 = a00+a01+a20+a21 a02+a03+a22+a23 b00+b01+b20+b21 b02+b03+b22+b23
142 // d1 = a10+a11+a30+a31 a12+a13+a32+a33 b10+b11+b30+b31 b12+b13+b32+b33
143 "vpadal.u8 q0, q1 \n"
144
145 // d0 = a00+a01+a20+a21+a02+a03+a22+a23 b00+b01+b20+b21+b02+b03+b22+b23
146 // d1 = a10+a11+a30+a31+a12+a13+a32+a33 b10+b11+b30+b31+b12+b13+b32+b33
147 "vpaddl.u16 q0, q0 \n"
148
149
150 // d0 = a00+a01+a20+a21+a02+a03+a22+a23+a10+a11+a30+a31+a12+a13+a32+a33
151 //      b00+b01+b20+b21+b02+b03+b22+b23+b10+b11+b30+b31+b12+b13+b32+b33
152 "vadd.u32 d0, d1 \n"
153
154 "vrshr.u32 d0, d0, #4 \n" // divide by 16 w/rounding
155
156 "vst1.u8 {d0[0]}, [%1]! \n"
157 "vst1.u8 {d0[4]}, [%1]! \n"
158
159 "add %0, #8 \n" // move src pointer to next 8 pixels
160 "subs %2, #2 \n" // dst_width -= 2
161 "bhi 1b \n"
162
163 : "+r"(src_ptr), // %0
164 "+r"(dst_ptr), // %1
165 "+r"(dst_width) // %2
166 : "r"(src_stride) // %3
167 : "r4", "q0", "q1", "memory", "cc"
168 );
169 }
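
// Scalar sketch of the reduction above, with the same rounding as vrshr:
// each output pixel is the rounded average of a 4x4 source block,
// (sum of 16 pixels + 8) >> 4. Illustrative only.
static inline void ScaleRowDown4Int_Sketch_C(const uint8* src_ptr,
                                             int src_stride,
                                             uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int dy = 0; dy < 4; ++dy) {
      for (int dx = 0; dx < 4; ++dx) {
        sum += src_ptr[dy * src_stride + x * 4 + dx];
      }
    }
    dst_ptr[x] = static_cast<uint8>((sum + 8) >> 4);
  }
}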
170
171 /**
172 * SSE2 downscalers with interpolation.
173 *
174 * Provided by Frank Barchard (fbarchard@google.com)
175 *
176 */
177
178 // Constants for SSE2 code
179 #elif (defined(WIN32) || defined(__i386__) || defined(__x86_64__)) && \
180 !defined(COVERAGE_ENABLED) && !TARGET_IPHONE_SIMULATOR
181 #if defined(_MSC_VER)
182 #define TALIGN16(t, var) __declspec(align(16)) t _ ## var
183 #elif defined(OSX)
184 #define TALIGN16(t, var) t var __attribute__((aligned(16)))
185 #else
186 #define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
187 #endif
188
189 // Offsets for source bytes 0 to 9
190 extern "C" TALIGN16(const uint8, shuf0[16]) =
191 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
192
193 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
194 extern "C" TALIGN16(const uint8, shuf1[16]) =
195 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
196
197 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
198 extern "C" TALIGN16(const uint8, shuf2[16]) =
199 { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
200
201 // Offsets for source bytes 0 to 10
202 extern "C" TALIGN16(const uint8, shuf01[16]) =
203 { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
204
205 // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
206 extern "C" TALIGN16(const uint8, shuf11[16]) =
207 { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
208
209 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
210 extern "C" TALIGN16(const uint8, shuf21[16]) =
211 { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
212
213 // Coefficients for source bytes 0 to 10
214 extern "C" TALIGN16(const uint8, madd01[16]) =
215 { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
216
217 // Coefficients for source bytes 10 to 21
218 extern "C" TALIGN16(const uint8, madd11[16]) =
219 { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
220
221 // Coefficients for source bytes 21 to 31
222 extern "C" TALIGN16(const uint8, madd21[16]) =
223 { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
224
225 // Rounding constant for the 3/4 scaling filters
226 extern "C" TALIGN16(const int16, round34[8]) =
227 { 2, 2, 2, 2, 2, 2, 2, 2 };
228
229 extern "C" TALIGN16(const uint8, shuf38a[16]) =
230 { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
231
232 extern "C" TALIGN16(const uint8, shuf38b[16]) =
233 { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
234
235 // Arrange words 0,3,6 into 0,1,2
236 extern "C" TALIGN16(const uint8, shufac0[16]) =
237 { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
238
239 // Arrange words 0,3,6 into 3,4,5
240 extern "C" TALIGN16(const uint8, shufac3[16]) =
241 { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
242
243 // Scaling values for boxes of 3x3 and 2x3
244 extern "C" TALIGN16(const uint16, scaleac3[8]) =
245 { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
246
247 // Arrange first value for pixels 0,1,2,3,4,5
248 extern "C" TALIGN16(const uint8, shufab0[16]) =
249 { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
250
251 // Arrange second value for pixels 0,1,2,3,4,5
252 extern "C" TALIGN16(const uint8, shufab1[16]) =
253 { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
254
255 // Arrange third value for pixels 0,1,2,3,4,5
256 extern "C" TALIGN16(const uint8, shufab2[16]) =
257 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
258
259 // Scaling values for boxes of 3x2 and 2x2
260 extern "C" TALIGN16(const uint16, scaleab2[8]) =
261 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
262 #endif
263
264 #if defined(WIN32) && !defined(COVERAGE_ENABLED)
265
266 #define HAS_SCALEROWDOWN2_SSE2
267 // Reads 32 pixels, throws half away and writes 16 pixels.
268 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
269 __declspec(naked)
270 static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
271 uint8* dst_ptr, int dst_width) {
272 __asm {
273 mov eax, [esp + 4] // src_ptr
274 // src_stride ignored
275 mov edx, [esp + 12] // dst_ptr
276 mov ecx, [esp + 16] // dst_width
277 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
278 psrlw xmm7, 8
279
280 wloop:
281 movdqa xmm0, [eax]
282 movdqa xmm1, [eax + 16]
283 lea eax, [eax + 32]
284 pand xmm0, xmm7
285 pand xmm1, xmm7
286 packuswb xmm0, xmm1
287 movdqa [edx], xmm0
288 lea edx, [edx + 16]
289 sub ecx, 16
290 ja wloop
291
292 ret
293 }
294 }
295 // Blends 32x2 rectangle to 16x1.
296 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
297 __declspec(naked)
298 static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
299 uint8* dst_ptr, int dst_width) {
300 __asm {
301 push esi
302 mov eax, [esp + 4 + 4] // src_ptr
303 mov esi, [esp + 4 + 8] // src_stride
304 mov edx, [esp + 4 + 12] // dst_ptr
305 mov ecx, [esp + 4 + 16] // dst_width
306 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
307 psrlw xmm7, 8
308
309 wloop:
310 movdqa xmm0, [eax]
311 movdqa xmm1, [eax + 16]
312 movdqa xmm2, [eax + esi]
313 movdqa xmm3, [eax + esi + 16]
314 lea eax, [eax + 32]
315 pavgb xmm0, xmm2 // average rows
316 pavgb xmm1, xmm3
317
318 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
319 psrlw xmm0, 8
320 movdqa xmm3, xmm1
321 psrlw xmm1, 8
322 pand xmm2, xmm7
323 pand xmm3, xmm7
324 pavgw xmm0, xmm2
325 pavgw xmm1, xmm3
326 packuswb xmm0, xmm1
327
328 movdqa [edx], xmm0
329 lea edx, [edx + 16]
330 sub ecx, 16
331 ja wloop
332
333 pop esi
334 ret
335 }
336 }
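
// The SSE2 box filter above has no widening add; it nests pavgb/pavgw
// averages instead. A scalar sketch of that arithmetic, where avg(a, b) is
// (a + b + 1) >> 1 as pavgb/pavgw define it. Note this rounds slightly
// differently than an exact (a + b + c + d + 2) >> 2 box average.
static inline uint8 AverageOfAverages_Sketch(uint8 r0c0, uint8 r0c1,
                                             uint8 r1c0, uint8 r1c1) {
  int col0 = (r0c0 + r1c0 + 1) >> 1;  // vertical average of column 0
  int col1 = (r0c1 + r1c1 + 1) >> 1;  // vertical average of column 1
  return static_cast<uint8>((col0 + col1 + 1) >> 1);  // horizontal average
}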
337
338 #define HAS_SCALEROWDOWN4_SSE2
339 // Point samples 32 pixels to 8 pixels.
340 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
341 __declspec(naked)
342 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
343 uint8* dst_ptr, int dst_width) {
344 __asm {
345 pushad
346 mov esi, [esp + 32 + 4] // src_ptr
347 // src_stride ignored
348 mov edi, [esp + 32 + 12] // dst_ptr
349 mov ecx, [esp + 32 + 16] // dst_width
350 pcmpeqb xmm7, xmm7 // generate mask 0x000000ff
351 psrld xmm7, 24
352
353 wloop:
354 movdqa xmm0, [esi]
355 movdqa xmm1, [esi + 16]
356 lea esi, [esi + 32]
357 pand xmm0, xmm7
358 pand xmm1, xmm7
359 packuswb xmm0, xmm1
360 packuswb xmm0, xmm0
361 movq qword ptr [edi], xmm0
362 lea edi, [edi + 8]
363 sub ecx, 8
364 ja wloop
365
366 popad
367 ret
368 }
369 }
370
371 // Blends 32x4 rectangle to 8x1.
372 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
373 __declspec(naked)
374 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
375 uint8* dst_ptr, int dst_width) {
376 __asm {
377 pushad
378 mov esi, [esp + 32 + 4] // src_ptr
379 mov ebx, [esp + 32 + 8] // src_stride
380 mov edi, [esp + 32 + 12] // dst_ptr
381 mov ecx, [esp + 32 + 16] // dst_width
382 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
383 psrlw xmm7, 8
384 lea edx, [ebx + ebx * 2] // src_stride * 3
385
386 wloop:
387 movdqa xmm0, [esi]
388 movdqa xmm1, [esi + 16]
389 movdqa xmm2, [esi + ebx]
390 movdqa xmm3, [esi + ebx + 16]
391 pavgb xmm0, xmm2 // average rows
392 pavgb xmm1, xmm3
393 movdqa xmm2, [esi + ebx * 2]
394 movdqa xmm3, [esi + ebx * 2 + 16]
395 movdqa xmm4, [esi + edx]
396 movdqa xmm5, [esi + edx + 16]
397 lea esi, [esi + 32]
398 pavgb xmm2, xmm4
399 pavgb xmm3, xmm5
400 pavgb xmm0, xmm2
401 pavgb xmm1, xmm3
402
403 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
404 psrlw xmm0, 8
405 movdqa xmm3, xmm1
406 psrlw xmm1, 8
407 pand xmm2, xmm7
408 pand xmm3, xmm7
409 pavgw xmm0, xmm2
410 pavgw xmm1, xmm3
411 packuswb xmm0, xmm1
412
413 movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
414 psrlw xmm0, 8
415 pand xmm2, xmm7
416 pavgw xmm0, xmm2
417 packuswb xmm0, xmm0
418
419 movq qword ptr [edi], xmm0
420 lea edi, [edi + 8]
421 sub ecx, 8
422 ja wloop
423
424 popad
425 ret
426 }
427 }
428
429 #define HAS_SCALEROWDOWN8_SSE2
430 // Point samples 32 pixels to 4 pixels.
431 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
432 __declspec(naked)
433 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
434 uint8* dst_ptr, int dst_width) {
435 __asm {
436 pushad
437 mov esi, [esp + 32 + 4] // src_ptr
438 // src_stride ignored
439 mov edi, [esp + 32 + 12] // dst_ptr
440 mov ecx, [esp + 32 + 16] // dst_width
441 pcmpeqb xmm7, xmm7 // generate mask isolating 1 src 8 bytes
442 psrlq xmm7, 56
443
444 wloop:
445 movdqa xmm0, [esi]
446 movdqa xmm1, [esi + 16]
447 lea esi, [esi + 32]
448 pand xmm0, xmm7
449 pand xmm1, xmm7
450 packuswb xmm0, xmm1 // 32->16
451 packuswb xmm0, xmm0 // 16->8
452 packuswb xmm0, xmm0 // 8->4
453 movd dword ptr [edi], xmm0
454 lea edi, [edi + 4]
455 sub ecx, 4
456 ja wloop
457
458 popad
459 ret
460 }
461 }
462
463 // Blends 32x8 rectangle to 4x1.
464 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
465 __declspec(naked)
466 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
467 uint8* dst_ptr, int dst_width) {
468 __asm {
469 pushad
470 mov esi, [esp + 32 + 4] // src_ptr
471 mov ebx, [esp + 32 + 8] // src_stride
472 mov edi, [esp + 32 + 12] // dst_ptr
473 mov ecx, [esp + 32 + 16] // dst_width
474 lea edx, [ebx + ebx * 2] // src_stride * 3
475 pxor xmm7, xmm7
476
477 wloop:
478 movdqa xmm0, [esi] // average 8 rows to 1
479 movdqa xmm1, [esi + 16]
480 movdqa xmm2, [esi + ebx]
481 movdqa xmm3, [esi + ebx + 16]
482 pavgb xmm0, xmm2
483 pavgb xmm1, xmm3
484 movdqa xmm2, [esi + ebx * 2]
485 movdqa xmm3, [esi + ebx * 2 + 16]
486 movdqa xmm4, [esi + edx]
487 movdqa xmm5, [esi + edx + 16]
488 lea ebp, [esi + ebx * 4]
489 lea esi, [esi + 32]
490 pavgb xmm2, xmm4
491 pavgb xmm3, xmm5
492 pavgb xmm0, xmm2
493 pavgb xmm1, xmm3
494
495 movdqa xmm2, [ebp]
496 movdqa xmm3, [ebp + 16]
497 movdqa xmm4, [ebp + ebx]
498 movdqa xmm5, [ebp + ebx + 16]
499 pavgb xmm2, xmm4
500 pavgb xmm3, xmm5
501 movdqa xmm4, [ebp + ebx * 2]
502 movdqa xmm5, [ebp + ebx * 2 + 16]
503 movdqa xmm6, [ebp + edx]
504 pavgb xmm4, xmm6
505 movdqa xmm6, [ebp + edx + 16]
506 pavgb xmm5, xmm6
507 pavgb xmm2, xmm4
508 pavgb xmm3, xmm5
509 pavgb xmm0, xmm2
510 pavgb xmm1, xmm3
511
512 psadbw xmm0, xmm7 // average 32 pixels to 4
513 psadbw xmm1, xmm7
514 pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01
515 pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx
516 por xmm0, xmm1 // -> 3201
517 psrlw xmm0, 3
518 packuswb xmm0, xmm0
519 packuswb xmm0, xmm0
520 movd dword ptr [edi], xmm0
521
522 lea edi, [edi + 4]
523 sub ecx, 4
524 ja wloop
525
526 popad
527 ret
528 }
529 }
530
531 #define HAS_SCALEROWDOWN34_SSSE3
532 // Point samples 32 pixels to 24 pixels.
533 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
534 // Then shuffled to do the scaling.
535
536 // Note that movdqa+palign may be better than movdqu.
537 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
538 __declspec(naked)
539 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
540 uint8* dst_ptr, int dst_width) {
541 __asm {
542 pushad
543 mov esi, [esp + 32 + 4] // src_ptr
544 // src_stride ignored
545 mov edi, [esp + 32 + 12] // dst_ptr
546 mov ecx, [esp + 32 + 16] // dst_width
547 movdqa xmm3, _shuf0
548 movdqa xmm4, _shuf1
549 movdqa xmm5, _shuf2
550
551 wloop:
552 movdqa xmm0, [esi]
553 movdqa xmm2, [esi + 16]
554 lea esi, [esi + 32]
555 movdqa xmm1, xmm2
556 palignr xmm1, xmm0, 8
557 pshufb xmm0, xmm3
558 pshufb xmm1, xmm4
559 pshufb xmm2, xmm5
560 movq qword ptr [edi], xmm0
561 movq qword ptr [edi + 8], xmm1
562 movq qword ptr [edi + 16], xmm2
563 lea edi, [edi + 24]
564 sub ecx, 24
565 ja wloop
566
567 popad
568 ret
569 }
570 }
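
// Scalar sketch of the 3/4 point sampler above: of every 4 source pixels,
// pixels 0, 1 and 3 are kept, matching the _shuf0/_shuf1/_shuf2 tables.
// Illustrative only.
static inline void ScaleRowDown34_Sketch_C(const uint8* src_ptr,
                                           uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}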
571
572 // Blends 32x2 rectangle to 24x1
573 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
574 // Then shuffled to do the scaling.
575
576 // Register usage:
577 // xmm0 src_row 0
578 // xmm1 src_row 1
579 // xmm2 shuf 0
580 // xmm3 shuf 1
581 // xmm4 shuf 2
582 // xmm5 madd 0
583 // xmm6 madd 1
584 // xmm7 round34
585
586 // Note that movdqa+palign may be better than movdqu.
587 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
588 __declspec(naked)
589 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
590 uint8* dst_ptr, int dst_width) {
591 __asm {
592 pushad
593 mov esi, [esp + 32 + 4] // src_ptr
594 mov ebx, [esp + 32 + 8] // src_stride
595 mov edi, [esp + 32 + 12] // dst_ptr
596 mov ecx, [esp + 32 + 16] // dst_width
597 movdqa xmm2, _shuf01
598 movdqa xmm3, _shuf11
599 movdqa xmm4, _shuf21
600 movdqa xmm5, _madd01
601 movdqa xmm6, _madd11
602 movdqa xmm7, _round34
603
604 wloop:
605 movdqa xmm0, [esi] // pixels 0..7
606 movdqa xmm1, [esi+ebx]
607 pavgb xmm0, xmm1
608 pshufb xmm0, xmm2
609 pmaddubsw xmm0, xmm5
610 paddsw xmm0, xmm7
611 psrlw xmm0, 2
612 packuswb xmm0, xmm0
613 movq qword ptr [edi], xmm0
614 movdqu xmm0, [esi+8] // pixels 8..15
615 movdqu xmm1, [esi+ebx+8]
616 pavgb xmm0, xmm1
617 pshufb xmm0, xmm3
618 pmaddubsw xmm0, xmm6
619 paddsw xmm0, xmm7
620 psrlw xmm0, 2
621 packuswb xmm0, xmm0
622 movq qword ptr [edi+8], xmm0
623 movdqa xmm0, [esi+16] // pixels 16..23
624 movdqa xmm1, [esi+ebx+16]
625 lea esi, [esi+32]
626 pavgb xmm0, xmm1
627 pshufb xmm0, xmm4
628 movdqa xmm1, _madd21
629 pmaddubsw xmm0, xmm1
630 paddsw xmm0, xmm7
631 psrlw xmm0, 2
632 packuswb xmm0, xmm0
633 movq qword ptr [edi+16], xmm0
634 lea edi, [edi+24]
635 sub ecx, 24
636 ja wloop
637
638 popad
639 ret
640 }
641 }
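
// Scalar sketch of the horizontal weighting used above, after the two source
// rows have been blended with pavgb: every 4 input pixels s0..s3 produce
// three outputs with 3:1, 2:2 and 1:3 weights plus rounding, matching the
// _shuf01/_madd01 style tables. Illustrative only.
static inline void ScaleRowDown34Filter_Sketch_C(const uint8* s, uint8* d,
                                                 int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    d[x + 0] = static_cast<uint8>((s[0] * 3 + s[1] + 2) >> 2);
    d[x + 1] = static_cast<uint8>((s[1] * 2 + s[2] * 2 + 2) >> 2);
    d[x + 2] = static_cast<uint8>((s[2] + s[3] * 3 + 2) >> 2);
    s += 4;
  }
}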
642
643 // Note that movdqa+palign may be better than movdqu.
644 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
645 __declspec(naked)
646 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
647 uint8* dst_ptr, int dst_width) {
648 __asm {
649 pushad
650 mov esi, [esp + 32 + 4] // src_ptr
651 mov ebx, [esp + 32 + 8] // src_stride
652 mov edi, [esp + 32 + 12] // dst_ptr
653 mov ecx, [esp + 32 + 16] // dst_width
654 movdqa xmm2, _shuf01
655 movdqa xmm3, _shuf11
656 movdqa xmm4, _shuf21
657 movdqa xmm5, _madd01
658 movdqa xmm6, _madd11
659 movdqa xmm7, _round34
660
661 wloop:
662 movdqa xmm0, [esi] // pixels 0..7
663 movdqa xmm1, [esi+ebx]
664 pavgb xmm1, xmm0
665 pavgb xmm0, xmm1
666 pshufb xmm0, xmm2
667 pmaddubsw xmm0, xmm5
668 paddsw xmm0, xmm7
669 psrlw xmm0, 2
670 packuswb xmm0, xmm0
671 movq qword ptr [edi], xmm0
672 movdqu xmm0, [esi+8] // pixels 8..15
673 movdqu xmm1, [esi+ebx+8]
674 pavgb xmm1, xmm0
675 pavgb xmm0, xmm1
676 pshufb xmm0, xmm3
677 pmaddubsw xmm0, xmm6
678 paddsw xmm0, xmm7
679 psrlw xmm0, 2
680 packuswb xmm0, xmm0
681 movq qword ptr [edi+8], xmm0
682 movdqa xmm0, [esi+16] // pixels 16..23
683 movdqa xmm1, [esi+ebx+16]
684 lea esi, [esi+32]
685 pavgb xmm1, xmm0
686 pavgb xmm0, xmm1
687 pshufb xmm0, xmm4
688 movdqa xmm1, _madd21
689 pmaddubsw xmm0, xmm1
690 paddsw xmm0, xmm7
691 psrlw xmm0, 2
692 packuswb xmm0, xmm0
693 movq qword ptr [edi+16], xmm0
694 lea edi, [edi+24]
695 sub ecx, 24
696 ja wloop
697
698 popad
699 ret
700 }
701 }
702
703 #define HAS_SCALEROWDOWN38_SSSE3
704 // 3/8 point sampler
705
706 // Scale 32 pixels to 12
707 __declspec(naked)
708 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
709 uint8* dst_ptr, int dst_width) {
710 __asm {
711 pushad
712 mov esi, [esp + 32 + 4] // src_ptr
713 mov edx, [esp + 32 + 8] // src_stride
714 mov edi, [esp + 32 + 12] // dst_ptr
715 mov ecx, [esp + 32 + 16] // dst_width
716 movdqa xmm5, _shuf38a
717 movdqa xmm6, _shuf38b
718 pxor xmm7, xmm7
719
720 xloop:
721 movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5
722 movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11
723 lea esi, [esi + 32]
724 pshufb xmm0, xmm5
725 pshufb xmm1, xmm6
726 paddusb xmm0, xmm1
727
728 movq qword ptr [edi], xmm0 // write 12 pixels
729 movhlps xmm1, xmm0
730 movd [edi + 8], xmm1
731 lea edi, [edi + 12]
732 sub ecx, 12
733 ja xloop
734
735 popad
736 ret
737 }
738 }
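
// Scalar sketch of the 3/8 point sampler above: of every 16 source pixels,
// the ones at offsets 0, 3, 6, 8, 11 and 14 are kept, per the
// _shuf38a/_shuf38b tables. Illustrative only.
static inline void ScaleRowDown38_Sketch_C(const uint8* src_ptr,
                                           uint8* dst_ptr, int dst_width) {
  static const int kOffsets[6] = { 0, 3, 6, 8, 11, 14 };
  for (int x = 0; x < dst_width; x += 6) {
    for (int i = 0; i < 6; ++i) {
      dst_ptr[x + i] = src_ptr[kOffsets[i]];
    }
    src_ptr += 16;
  }
}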
739
740 // Scale 16x3 pixels to 6x1 with interpolation
741 __declspec(naked)
742 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
743 uint8* dst_ptr, int dst_width) {
744 __asm {
745 pushad
746 mov esi, [esp + 32 + 4] // src_ptr
747 mov edx, [esp + 32 + 8] // src_stride
748 mov edi, [esp + 32 + 12] // dst_ptr
749 mov ecx, [esp + 32 + 16] // dst_width
750 movdqa xmm4, _shufac0
751 movdqa xmm5, _shufac3
752 movdqa xmm6, _scaleac3
753 pxor xmm7, xmm7
754
755 xloop:
756 movdqa xmm0, [esi] // sum up 3 rows into xmm0/1
757 movdqa xmm2, [esi + edx]
758 movhlps xmm1, xmm0
759 movhlps xmm3, xmm2
760 punpcklbw xmm0, xmm7
761 punpcklbw xmm1, xmm7
762 punpcklbw xmm2, xmm7
763 punpcklbw xmm3, xmm7
764 paddusw xmm0, xmm2
765 paddusw xmm1, xmm3
766 movdqa xmm2, [esi + edx * 2]
767 lea esi, [esi + 16]
768 movhlps xmm3, xmm2
769 punpcklbw xmm2, xmm7
770 punpcklbw xmm3, xmm7
771 paddusw xmm0, xmm2
772 paddusw xmm1, xmm3
773
774 movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2
775 psrldq xmm0, 2
776 paddusw xmm2, xmm0
777 psrldq xmm0, 2
778 paddusw xmm2, xmm0
779 pshufb xmm2, xmm4
780
781 movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2
782 psrldq xmm1, 2
783 paddusw xmm3, xmm1
784 psrldq xmm1, 2
785 paddusw xmm3, xmm1
786 pshufb xmm3, xmm5
787 paddusw xmm2, xmm3
788
789 pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6
790 packuswb xmm2, xmm2
791
792 movd [edi], xmm2 // write 6 pixels
793 pextrw eax, xmm2, 2
794 mov [edi + 4], ax
795 lea edi, [edi + 6]
796 sub ecx, 6
797 ja xloop
798
799 popad
800 ret
801 }
802 }
803
804 // Scale 16x2 pixels to 6x1 with interpolation
805 __declspec(naked)
806 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
807 uint8* dst_ptr, int dst_width) {
808 __asm {
809 pushad
810 mov esi, [esp + 32 + 4] // src_ptr
811 mov edx, [esp + 32 + 8] // src_stride
812 mov edi, [esp + 32 + 12] // dst_ptr
813 mov ecx, [esp + 32 + 16] // dst_width
814 movdqa xmm4, _shufab0
815 movdqa xmm5, _shufab1
816 movdqa xmm6, _shufab2
817 movdqa xmm7, _scaleab2
818
819 xloop:
820 movdqa xmm2, [esi] // average 2 rows into xmm2
821 pavgb xmm2, [esi + edx]
822 lea esi, [esi + 16]
823
824 movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0
825 pshufb xmm0, xmm4
826 movdqa xmm1, xmm2
827 pshufb xmm1, xmm5
828 paddusw xmm0, xmm1
829 pshufb xmm2, xmm6
830 paddusw xmm0, xmm2
831
832 pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2
833 packuswb xmm0, xmm0
834
835 movd [edi], xmm0 // write 6 pixels
836 pextrw eax, xmm0, 2
837 mov [edi + 4], ax
838 lea edi, [edi + 6]
839 sub ecx, 6
840 ja xloop
841
842 popad
843 ret
844 }
845 }
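
// The divide in the two routines above is done with pmulhuw: multiplying a
// 16-bit sum by 65536/N and keeping the high 16 bits approximates sum / N
// without an integer divide (see _scaleac3 and _scaleab2). A scalar sketch
// of that trick; the helper name and parameters are illustrative.
static inline uint8 FixedPointDivide_Sketch(int sum, int n) {
  return static_cast<uint8>((sum * (65536 / n)) >> 16);
}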
846
847 #define HAS_SCALEADDROWS_SSE2
848
849 // Reads 16xN bytes and produces 16 shorts at a time.
850 __declspec(naked)
851 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
852 uint16* dst_ptr, int src_width,
853 int src_height) {
854 __asm {
855 pushad
856 mov esi, [esp + 32 + 4] // src_ptr
857 mov edx, [esp + 32 + 8] // src_stride
858 mov edi, [esp + 32 + 12] // dst_ptr
859 mov ecx, [esp + 32 + 16] // src_width
860 mov ebx, [esp + 32 + 20] // src_height
861 pxor xmm7, xmm7
862 dec ebx
863
864 xloop:
865 // first row
866 movdqa xmm2, [esi]
867 lea eax, [esi + edx]
868 movhlps xmm3, xmm2
869 mov ebp, ebx
870 punpcklbw xmm2, xmm7
871 punpcklbw xmm3, xmm7
872
873 // sum remaining rows
874 yloop:
875 movdqa xmm0, [eax] // read 16 pixels
876 lea eax, [eax + edx] // advance to next row
877 movhlps xmm1, xmm0
878 punpcklbw xmm0, xmm7
879 punpcklbw xmm1, xmm7
880 paddusw xmm2, xmm0 // sum 16 words
881 paddusw xmm3, xmm1
882 sub ebp, 1
883 ja yloop
884
885 movdqa [edi], xmm2
886 movdqa [edi + 16], xmm3
887 lea edi, [edi + 32]
888 lea esi, [esi + 16]
889
890 sub ecx, 16
891 ja xloop
892
893 popad
894 ret
895 }
896 }
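
// Scalar sketch of the row accumulator above: each of src_width columns sums
// src_height source bytes into a 16-bit total. The SSE2 code uses saturating
// adds (paddusw); this sketch assumes the sums stay below 65535.
static inline void ScaleAddRows_Sketch_C(const uint8* src_ptr, int src_stride,
                                         uint16* dst_ptr, int src_width,
                                         int src_height) {
  for (int x = 0; x < src_width; ++x) {
    int sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += src_ptr[y * src_stride + x];
    }
    dst_ptr[x] = static_cast<uint16>(sum);
  }
}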
897
898 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
899 #define HAS_SCALEFILTERROWS_SSE2
900 __declspec(naked)
901 static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
902 int src_stride, int dst_width,
903 int source_y_fraction) {
904 __asm {
905 push esi
906 push edi
907 mov edi, [esp + 8 + 4] // dst_ptr
908 mov esi, [esp + 8 + 8] // src_ptr
909 mov edx, [esp + 8 + 12] // src_stride
910 mov ecx, [esp + 8 + 16] // dst_width
911 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
912 cmp eax, 0
913 je xloop1
914 cmp eax, 128
915 je xloop2
916
917 movd xmm6, eax // xmm6 = y fraction
918 punpcklwd xmm6, xmm6
919 pshufd xmm6, xmm6, 0
920 neg eax // xmm5 = 256 - y fraction
921 add eax, 256
922 movd xmm5, eax
923 punpcklwd xmm5, xmm5
924 pshufd xmm5, xmm5, 0
925 pxor xmm7, xmm7
926
927 xloop:
928 movdqa xmm0, [esi]
929 movdqa xmm2, [esi + edx]
930 lea esi, [esi + 16]
931 movdqa xmm1, xmm0
932 movdqa xmm3, xmm2
933 punpcklbw xmm0, xmm7
934 punpcklbw xmm2, xmm7
935 punpckhbw xmm1, xmm7
936 punpckhbw xmm3, xmm7
937 pmullw xmm0, xmm5 // scale row 0
938 pmullw xmm1, xmm5
939 pmullw xmm2, xmm6 // scale row 1
940 pmullw xmm3, xmm6
941 paddusw xmm0, xmm2 // sum rows
942 paddusw xmm1, xmm3
943 psrlw xmm0, 8
944 psrlw xmm1, 8
945 packuswb xmm0, xmm1
946 movdqa [edi], xmm0
947 lea edi, [edi + 16]
948 sub ecx, 16
949 ja xloop
950
951 mov al, [edi - 1]
952 mov [edi], al
953 pop edi
954 pop esi
955 ret
956
957 xloop1:
958 movdqa xmm0, [esi]
959 lea esi, [esi + 16]
960 movdqa [edi], xmm0
961 lea edi, [edi + 16]
962 sub ecx, 16
963 ja xloop1
964
965 mov al, [edi - 1]
966 mov [edi], al
967 pop edi
968 pop esi
969 ret
970
971 xloop2:
972 movdqa xmm0, [esi]
973 movdqa xmm2, [esi + edx]
974 lea esi, [esi + 16]
975 pavgb xmm0, xmm2
976 movdqa [edi], xmm0
977 lea edi, [edi + 16]
978 sub ecx, 16
979 ja xloop2
980
981 mov al, [edi - 1]
982 mov [edi], al
983 pop edi
984 pop esi
985 ret
986 }
987 }
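
// Scalar sketch of the bilinear row blend above: each output pixel is
// (row0 * (256 - f) + row1 * f) >> 8, where f = source_y_fraction in 0..255.
// Like the assembly, the last pixel is replicated one past dst_width.
// Illustrative only.
static inline void ScaleFilterRows_Sketch_C(uint8* dst_ptr,
                                            const uint8* src_ptr,
                                            int src_stride, int dst_width,
                                            int source_y_fraction) {
  const uint8* src_row1 = src_ptr + src_stride;
  int y1 = source_y_fraction;
  int y0 = 256 - y1;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = static_cast<uint8>((src_ptr[x] * y0 + src_row1[x] * y1) >> 8);
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // replicate last pixel
}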
988
989 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
990 #define HAS_SCALEFILTERROWS_SSSE3
991 __declspec(naked)
992 static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
993 int src_stride, int dst_width,
994 int source_y_fraction) {
995 __asm {
996 push esi
997 push edi
998 mov edi, [esp + 8 + 4] // dst_ptr
999 mov esi, [esp + 8 + 8] // src_ptr
1000 mov edx, [esp + 8 + 12] // src_stride
1001 mov ecx, [esp + 8 + 16] // dst_width
1002 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
1003 cmp eax, 0
1004 je xloop1
1005 cmp eax, 128
1006 je xloop2
1007
1008 shr eax, 1
1009 mov ah,al
1010 neg al
1011 add al, 128
1012 movd xmm7, eax
1013 punpcklwd xmm7, xmm7
1014 pshufd xmm7, xmm7, 0
1015
1016 xloop:
1017 movdqa xmm0, [esi]
1018 movdqa xmm2, [esi + edx]
1019 lea esi, [esi + 16]
1020 movdqa xmm1, xmm0
1021 punpcklbw xmm0, xmm2
1022 punpckhbw xmm1, xmm2
1023 pmaddubsw xmm0, xmm7
1024 pmaddubsw xmm1, xmm7
1025 psrlw xmm0, 7
1026 psrlw xmm1, 7
1027 packuswb xmm0, xmm1
1028 movdqa [edi], xmm0
1029 lea edi, [edi + 16]
1030 sub ecx, 16
1031 ja xloop
1032
1033 mov al, [edi - 1]
1034 mov [edi], al
1035 pop edi
1036 pop esi
1037 ret
1038
1039 xloop1:
1040 movdqa xmm0, [esi]
1041 lea esi, [esi + 16]
1042 movdqa [edi], xmm0
1043 lea edi, [edi + 16]
1044 sub ecx, 16
1045 ja xloop1
1046
1047 mov al, [edi - 1]
1048 mov [edi], al
1049 pop edi
1050 pop esi
1051 ret
1052
1053 xloop2:
1054 movdqa xmm0, [esi]
1055 movdqa xmm2, [esi + edx]
1056 lea esi, [esi + 16]
1057 pavgb xmm0, xmm2
1058 movdqa [edi], xmm0
1059 lea edi, [edi + 16]
1060 sub ecx, 16
1061 ja xloop2
1062
1063 mov al, [edi - 1]
1064 mov [edi], al
1065 pop edi
1066 pop esi
1067 ret
1068
1069 }
1070 }
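
// Scalar sketch of the SSSE3 variant's arithmetic: the fraction is halved to
// 7 bits so that both weights fit in a byte pair for pmaddubsw, giving
// (row0 * (128 - f/2) + row1 * (f/2)) >> 7 per pixel. Illustrative only.
static inline uint8 BlendRows7Bit_Sketch(uint8 p0, uint8 p1,
                                         int source_y_fraction) {
  int f = source_y_fraction >> 1;  // 0..127
  return static_cast<uint8>((p0 * (128 - f) + p1 * f) >> 7);
}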
1071
1072 // Note that movdqa+palign may be better than movdqu.
1073 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
1074 __declspec(naked)
1075 static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
1076 int dst_width) {
1077 __asm {
1078 mov edx, [esp + 4] // dst_ptr
1079 mov eax, [esp + 8] // src_ptr
1080 mov ecx, [esp + 12] // dst_width
1081 movdqa xmm1, _round34
1082 movdqa xmm2, _shuf01
1083 movdqa xmm3, _shuf11
1084 movdqa xmm4, _shuf21
1085 movdqa xmm5, _madd01
1086 movdqa xmm6, _madd11
1087 movdqa xmm7, _madd21
1088
1089 wloop:
1090 movdqa xmm0, [eax] // pixels 0..7
1091 pshufb xmm0, xmm2
1092 pmaddubsw xmm0, xmm5
1093 paddsw xmm0, xmm1
1094 psrlw xmm0, 2
1095 packuswb xmm0, xmm0
1096 movq qword ptr [edx], xmm0
1097 movdqu xmm0, [eax+8] // pixels 8..15
1098 pshufb xmm0, xmm3
1099 pmaddubsw xmm0, xmm6
1100 paddsw xmm0, xmm1
1101 psrlw xmm0, 2
1102 packuswb xmm0, xmm0
1103 movq qword ptr [edx+8], xmm0
1104 movdqa xmm0, [eax+16] // pixels 16..23
1105 lea eax, [eax+32]
1106 pshufb xmm0, xmm4
1107 pmaddubsw xmm0, xmm7
1108 paddsw xmm0, xmm1
1109 psrlw xmm0, 2
1110 packuswb xmm0, xmm0
1111 movq qword ptr [edx+16], xmm0
1112 lea edx, [edx+24]
1113 sub ecx, 24
1114 ja wloop
1115 ret
1116 }
1117 }
1118
1119 #elif (defined(__x86_64__) || defined(__i386__)) && \
1120 !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
1121
1122 // GCC versions of row functions are verbatim conversions from Visual C.
1123 // Generated using gcc disassembly on Visual C object file:
1124 // objdump -D yuvscaler.obj >yuvscaler.txt
1125 #define HAS_SCALEROWDOWN2_SSE2
1126 static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
1127 uint8* dst_ptr, int dst_width) {
1128 asm volatile(
1129 "pcmpeqb %%xmm7,%%xmm7\n"
1130 "psrlw $0x8,%%xmm7\n"
1131 "1:"
1132 "movdqa (%0),%%xmm0\n"
1133 "movdqa 0x10(%0),%%xmm1\n"
1134 "lea 0x20(%0),%0\n"
1135 "pand %%xmm7,%%xmm0\n"
1136 "pand %%xmm7,%%xmm1\n"
1137 "packuswb %%xmm1,%%xmm0\n"
1138 "movdqa %%xmm0,(%1)\n"
1139 "lea 0x10(%1),%1\n"
1140 "sub $0x10,%2\n"
1141 "ja 1b\n"
1142 : "+r"(src_ptr), // %0
1143 "+r"(dst_ptr), // %1
1144 "+r"(dst_width) // %2
1145 :
1146 : "memory"
1147 );
1148 }
1149
1150 static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
1151 uint8* dst_ptr, int dst_width) {
1152 asm volatile(
1153 "pcmpeqb %%xmm7,%%xmm7\n"
1154 "psrlw $0x8,%%xmm7\n"
1155 "1:"
1156 "movdqa (%0),%%xmm0\n"
1157 "movdqa 0x10(%0),%%xmm1\n"
1158 "movdqa (%0,%3,1),%%xmm2\n"
1159 "movdqa 0x10(%0,%3,1),%%xmm3\n"
1160 "lea 0x20(%0),%0\n"
1161 "pavgb %%xmm2,%%xmm0\n"
1162 "pavgb %%xmm3,%%xmm1\n"
1163 "movdqa %%xmm0,%%xmm2\n"
1164 "psrlw $0x8,%%xmm0\n"
1165 "movdqa %%xmm1,%%xmm3\n"
1166 "psrlw $0x8,%%xmm1\n"
1167 "pand %%xmm7,%%xmm2\n"
1168 "pand %%xmm7,%%xmm3\n"
1169 "pavgw %%xmm2,%%xmm0\n"
1170 "pavgw %%xmm3,%%xmm1\n"
1171 "packuswb %%xmm1,%%xmm0\n"
1172 "movdqa %%xmm0,(%1)\n"
1173 "lea 0x10(%1),%1\n"
1174 "sub $0x10,%2\n"
1175 "ja 1b\n"
1176 : "+r"(src_ptr), // %0
1177 "+r"(dst_ptr), // %1
1178 "+r"(dst_width) // %2
1179 : "r"(static_cast<intptr_t>(src_stride)) // %3
1180 : "memory"
1181 );
1182 }
1183
1184 #define HAS_SCALEROWDOWN4_SSE2
1185 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
1186 uint8* dst_ptr, int dst_width) {
1187 asm volatile(
1188 "pcmpeqb %%xmm7,%%xmm7\n"
1189 "psrld $0x18,%%xmm7\n"
1190 "1:"
1191 "movdqa (%0),%%xmm0\n"
1192 "movdqa 0x10(%0),%%xmm1\n"
1193 "lea 0x20(%0),%0\n"
1194 "pand %%xmm7,%%xmm0\n"
1195 "pand %%xmm7,%%xmm1\n"
1196 "packuswb %%xmm1,%%xmm0\n"
1197 "packuswb %%xmm0,%%xmm0\n"
1198 "movq %%xmm0,(%1)\n"
1199 "lea 0x8(%1),%1\n"
1200 "sub $0x8,%2\n"
1201 "ja 1b\n"
1202 : "+r"(src_ptr), // %0
1203 "+r"(dst_ptr), // %1
1204 "+r"(dst_width) // %2
1205 :
1206 : "memory"
1207 );
1208 }
1209
1210 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
1211 uint8* dst_ptr, int dst_width) {
1212 intptr_t temp = 0;
1213 asm volatile(
1214 "pcmpeqb %%xmm7,%%xmm7\n"
1215 "psrlw $0x8,%%xmm7\n"
1216 "lea (%4,%4,2),%3\n"
1217 "1:"
1218 "movdqa (%0),%%xmm0\n"
1219 "movdqa 0x10(%0),%%xmm1\n"
1220 "movdqa (%0,%4,1),%%xmm2\n"
1221 "movdqa 0x10(%0,%4,1),%%xmm3\n"
1222 "pavgb %%xmm2,%%xmm0\n"
1223 "pavgb %%xmm3,%%xmm1\n"
1224 "movdqa (%0,%4,2),%%xmm2\n"
1225 "movdqa 0x10(%0,%4,2),%%xmm3\n"
1226 "movdqa (%0,%3,1),%%xmm4\n"
1227 "movdqa 0x10(%0,%3,1),%%xmm5\n"
1228 "lea 0x20(%0),%0\n"
1229 "pavgb %%xmm4,%%xmm2\n"
1230 "pavgb %%xmm2,%%xmm0\n"
1231 "pavgb %%xmm5,%%xmm3\n"
1232 "pavgb %%xmm3,%%xmm1\n"
1233 "movdqa %%xmm0,%%xmm2\n"
1234 "psrlw $0x8,%%xmm0\n"
1235 "movdqa %%xmm1,%%xmm3\n"
1236 "psrlw $0x8,%%xmm1\n"
1237 "pand %%xmm7,%%xmm2\n"
1238 "pand %%xmm7,%%xmm3\n"
1239 "pavgw %%xmm2,%%xmm0\n"
1240 "pavgw %%xmm3,%%xmm1\n"
1241 "packuswb %%xmm1,%%xmm0\n"
1242 "movdqa %%xmm0,%%xmm2\n"
1243 "psrlw $0x8,%%xmm0\n"
1244 "pand %%xmm7,%%xmm2\n"
1245 "pavgw %%xmm2,%%xmm0\n"
1246 "packuswb %%xmm0,%%xmm0\n"
1247 "movq %%xmm0,(%1)\n"
1248 "lea 0x8(%1),%1\n"
1249 "sub $0x8,%2\n"
1250 "ja 1b\n"
1251 : "+r"(src_ptr), // %0
1252 "+r"(dst_ptr), // %1
1253 "+r"(dst_width), // %2
1254 "+r"(temp) // %3
1255 : "r"(static_cast<intptr_t>(src_stride)) // %4
1256 : "memory"
1257 );
1258 }
1259
1260 #define HAS_SCALEROWDOWN8_SSE2
1261 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
1262 uint8* dst_ptr, int dst_width) {
1263 asm volatile(
1264 "pcmpeqb %%xmm7,%%xmm7\n"
1265 "psrlq $0x38,%%xmm7\n"
1266 "1:"
1267 "movdqa (%0),%%xmm0\n"
1268 "movdqa 0x10(%0),%%xmm1\n"
1269 "lea 0x20(%0),%0\n"
1270 "pand %%xmm7,%%xmm0\n"
1271 "pand %%xmm7,%%xmm1\n"
1272 "packuswb %%xmm1,%%xmm0\n"
1273 "packuswb %%xmm0,%%xmm0\n"
1274 "packuswb %%xmm0,%%xmm0\n"
1275 "movd %%xmm0,(%1)\n"
1276 "lea 0x4(%1),%1\n"
1277 "sub $0x4,%2\n"
1278 "ja 1b\n"
1279 : "+r"(src_ptr), // %0
1280 "+r"(dst_ptr), // %1
1281 "+r"(dst_width) // %2
1282 :
1283 : "memory"
1284 );
1285 }
1286
1287 #if defined(__i386__)
1288 extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
1289 uint8* dst_ptr, int dst_width);
1290 asm(
1291 ".text\n"
1292 #if defined(OSX)
1293 ".globl _ScaleRowDown8Int_SSE2\n"
1294 "_ScaleRowDown8Int_SSE2:\n"
1295 #else
1296 ".global ScaleRowDown8Int_SSE2\n"
1297 "ScaleRowDown8Int_SSE2:\n"
1298 #endif
1299 "pusha\n"
1300 "mov 0x24(%esp),%esi\n"
1301 "mov 0x28(%esp),%ebx\n"
1302 "mov 0x2c(%esp),%edi\n"
1303 "mov 0x30(%esp),%ecx\n"
1304 "lea (%ebx,%ebx,2),%edx\n"
1305 "pxor %xmm7,%xmm7\n"
1306
1307 "1:"
1308 "movdqa (%esi),%xmm0\n"
1309 "movdqa 0x10(%esi),%xmm1\n"
1310 "movdqa (%esi,%ebx,1),%xmm2\n"
1311 "movdqa 0x10(%esi,%ebx,1),%xmm3\n"
1312 "pavgb %xmm2,%xmm0\n"
1313 "pavgb %xmm3,%xmm1\n"
1314 "movdqa (%esi,%ebx,2),%xmm2\n"
1315 "movdqa 0x10(%esi,%ebx,2),%xmm3\n"
1316 "movdqa (%esi,%edx,1),%xmm4\n"
1317 "movdqa 0x10(%esi,%edx,1),%xmm5\n"
1318 "lea (%esi,%ebx,4),%ebp\n"
1319 "lea 0x20(%esi),%esi\n"
1320 "pavgb %xmm4,%xmm2\n"
1321 "pavgb %xmm5,%xmm3\n"
1322 "pavgb %xmm2,%xmm0\n"
1323 "pavgb %xmm3,%xmm1\n"
1324 "movdqa 0x0(%ebp),%xmm2\n"
1325 "movdqa 0x10(%ebp),%xmm3\n"
1326 "movdqa 0x0(%ebp,%ebx,1),%xmm4\n"
1327 "movdqa 0x10(%ebp,%ebx,1),%xmm5\n"
1328 "pavgb %xmm4,%xmm2\n"
1329 "pavgb %xmm5,%xmm3\n"
1330 "movdqa 0x0(%ebp,%ebx,2),%xmm4\n"
1331 "movdqa 0x10(%ebp,%ebx,2),%xmm5\n"
1332 "movdqa 0x0(%ebp,%edx,1),%xmm6\n"
1333 "pavgb %xmm6,%xmm4\n"
1334 "movdqa 0x10(%ebp,%edx,1),%xmm6\n"
1335 "pavgb %xmm6,%xmm5\n"
1336 "pavgb %xmm4,%xmm2\n"
1337 "pavgb %xmm5,%xmm3\n"
1338 "pavgb %xmm2,%xmm0\n"
1339 "pavgb %xmm3,%xmm1\n"
1340 "psadbw %xmm7,%xmm0\n"
1341 "psadbw %xmm7,%xmm1\n"
1342 "pshufd $0xd8,%xmm0,%xmm0\n"
1343 "pshufd $0x8d,%xmm1,%xmm1\n"
1344 "por %xmm1,%xmm0\n"
1345 "psrlw $0x3,%xmm0\n"
1346 "packuswb %xmm0,%xmm0\n"
1347 "packuswb %xmm0,%xmm0\n"
1348 "movd %xmm0,(%edi)\n"
1349 "lea 0x4(%edi),%edi\n"
1350 "sub $0x4,%ecx\n"
1351 "ja 1b\n"
1352 "popa\n"
1353 "ret\n"
1354 );
1355
1356 // fpic is used for magiccam plugin
1357 #if !defined(__PIC__)
1358 #define HAS_SCALEROWDOWN34_SSSE3
1359 extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
1360 uint8* dst_ptr, int dst_width);
1361 asm(
1362 ".text\n"
1363 #if defined(OSX)
1364 ".globl _ScaleRowDown34_SSSE3\n"
1365 "_ScaleRowDown34_SSSE3:\n"
1366 #else
1367 ".global ScaleRowDown34_SSSE3\n"
1368 "ScaleRowDown34_SSSE3:\n"
1369 #endif
1370 "pusha\n"
1371 "mov 0x24(%esp),%esi\n"
1372 "mov 0x2c(%esp),%edi\n"
1373 "mov 0x30(%esp),%ecx\n"
1374 "movdqa _shuf0,%xmm3\n"
1375 "movdqa _shuf1,%xmm4\n"
1376 "movdqa _shuf2,%xmm5\n"
1377
1378 "1:"
1379 "movdqa (%esi),%xmm0\n"
1380 "movdqa 0x10(%esi),%xmm2\n"
1381 "lea 0x20(%esi),%esi\n"
1382 "movdqa %xmm2,%xmm1\n"
1383 "palignr $0x8,%xmm0,%xmm1\n"
1384 "pshufb %xmm3,%xmm0\n"
1385 "pshufb %xmm4,%xmm1\n"
1386 "pshufb %xmm5,%xmm2\n"
1387 "movq %xmm0,(%edi)\n"
1388 "movq %xmm1,0x8(%edi)\n"
1389 "movq %xmm2,0x10(%edi)\n"
1390 "lea 0x18(%edi),%edi\n"
1391 "sub $0x18,%ecx\n"
1392 "ja 1b\n"
1393 "popa\n"
1394 "ret\n"
1395 );
1396
1397 extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
1398 uint8* dst_ptr, int dst_width);
1399 asm(
1400 ".text\n"
1401 #if defined(OSX)
1402 ".globl _ScaleRowDown34_1_Int_SSSE3\n"
1403 "_ScaleRowDown34_1_Int_SSSE3:\n"
1404 #else
1405 ".global ScaleRowDown34_1_Int_SSSE3\n"
1406 "ScaleRowDown34_1_Int_SSSE3:\n"
1407 #endif
1408 "pusha\n"
1409 "mov 0x24(%esp),%esi\n"
1410 "mov 0x28(%esp),%ebp\n"
1411 "mov 0x2c(%esp),%edi\n"
1412 "mov 0x30(%esp),%ecx\n"
1413 "movdqa _shuf01,%xmm2\n"
1414 "movdqa _shuf11,%xmm3\n"
1415 "movdqa _shuf21,%xmm4\n"
1416 "movdqa _madd01,%xmm5\n"
1417 "movdqa _madd11,%xmm6\n"
1418 "movdqa _round34,%xmm7\n"
1419
1420 "1:"
1421 "movdqa (%esi),%xmm0\n"
1422 "movdqa (%esi,%ebp),%xmm1\n"
1423 "pavgb %xmm1,%xmm0\n"
1424 "pshufb %xmm2,%xmm0\n"
1425 "pmaddubsw %xmm5,%xmm0\n"
1426 "paddsw %xmm7,%xmm0\n"
1427 "psrlw $0x2,%xmm0\n"
1428 "packuswb %xmm0,%xmm0\n"
1429 "movq %xmm0,(%edi)\n"
1430 "movdqu 0x8(%esi),%xmm0\n"
1431 "movdqu 0x8(%esi,%ebp),%xmm1\n"
1432 "pavgb %xmm1,%xmm0\n"
1433 "pshufb %xmm3,%xmm0\n"
1434 "pmaddubsw %xmm6,%xmm0\n"
1435 "paddsw %xmm7,%xmm0\n"
1436 "psrlw $0x2,%xmm0\n"
1437 "packuswb %xmm0,%xmm0\n"
1438 "movq %xmm0,0x8(%edi)\n"
1439 "movdqa 0x10(%esi),%xmm0\n"
1440 "movdqa 0x10(%esi,%ebp),%xmm1\n"
1441 "lea 0x20(%esi),%esi\n"
1442 "pavgb %xmm1,%xmm0\n"
1443 "pshufb %xmm4,%xmm0\n"
1444 "movdqa _madd21,%xmm1\n"
1445 "pmaddubsw %xmm1,%xmm0\n"
1446 "paddsw %xmm7,%xmm0\n"
1447 "psrlw $0x2,%xmm0\n"
1448 "packuswb %xmm0,%xmm0\n"
1449 "movq %xmm0,0x10(%edi)\n"
1450 "lea 0x18(%edi),%edi\n"
1451 "sub $0x18,%ecx\n"
1452 "ja 1b\n"
1453
1454 "popa\n"
1455 "ret\n"
1456 );
1457
1458 extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
1459 uint8* dst_ptr, int dst_width);
1460 asm(
1461 ".text\n"
1462 #if defined(OSX)
1463 ".globl _ScaleRowDown34_0_Int_SSSE3\n"
1464 "_ScaleRowDown34_0_Int_SSSE3:\n"
1465 #else
1466 ".global ScaleRowDown34_0_Int_SSSE3\n"
1467 "ScaleRowDown34_0_Int_SSSE3:\n"
1468 #endif
1469 "pusha\n"
1470 "mov 0x24(%esp),%esi\n"
1471 "mov 0x28(%esp),%ebp\n"
1472 "mov 0x2c(%esp),%edi\n"
1473 "mov 0x30(%esp),%ecx\n"
1474 "movdqa _shuf01,%xmm2\n"
1475 "movdqa _shuf11,%xmm3\n"
1476 "movdqa _shuf21,%xmm4\n"
1477 "movdqa _madd01,%xmm5\n"
1478 "movdqa _madd11,%xmm6\n"
1479 "movdqa _round34,%xmm7\n"
1480
1481 "1:"
1482 "movdqa (%esi),%xmm0\n"
1483 "movdqa (%esi,%ebp,1),%xmm1\n"
1484 "pavgb %xmm0,%xmm1\n"
1485 "pavgb %xmm1,%xmm0\n"
1486 "pshufb %xmm2,%xmm0\n"
1487 "pmaddubsw %xmm5,%xmm0\n"
1488 "paddsw %xmm7,%xmm0\n"
1489 "psrlw $0x2,%xmm0\n"
1490 "packuswb %xmm0,%xmm0\n"
1491 "movq %xmm0,(%edi)\n"
1492 "movdqu 0x8(%esi),%xmm0\n"
1493 "movdqu 0x8(%esi,%ebp,1),%xmm1\n"
1494 "pavgb %xmm0,%xmm1\n"
1495 "pavgb %xmm1,%xmm0\n"
1496 "pshufb %xmm3,%xmm0\n"
1497 "pmaddubsw %xmm6,%xmm0\n"
1498 "paddsw %xmm7,%xmm0\n"
1499 "psrlw $0x2,%xmm0\n"
1500 "packuswb %xmm0,%xmm0\n"
1501 "movq %xmm0,0x8(%edi)\n"
1502 "movdqa 0x10(%esi),%xmm0\n"
1503 "movdqa 0x10(%esi,%ebp,1),%xmm1\n"
1504 "lea 0x20(%esi),%esi\n"
1505 "pavgb %xmm0,%xmm1\n"
1506 "pavgb %xmm1,%xmm0\n"
1507 "pshufb %xmm4,%xmm0\n"
1508 "movdqa _madd21,%xmm1\n"
1509 "pmaddubsw %xmm1,%xmm0\n"
1510 "paddsw %xmm7,%xmm0\n"
1511 "psrlw $0x2,%xmm0\n"
1512 "packuswb %xmm0,%xmm0\n"
1513 "movq %xmm0,0x10(%edi)\n"
1514 "lea 0x18(%edi),%edi\n"
1515 "sub $0x18,%ecx\n"
1516 "ja 1b\n"
1517 "popa\n"
1518 "ret\n"
1519 );
1520
1521 #define HAS_SCALEROWDOWN38_SSSE3
1522 extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
1523 uint8* dst_ptr, int dst_width);
1524 asm(
1525 ".text\n"
1526 #if defined(OSX)
1527 ".globl _ScaleRowDown38_SSSE3\n"
1528 "_ScaleRowDown38_SSSE3:\n"
1529 #else
1530 ".global ScaleRowDown38_SSSE3\n"
1531 "ScaleRowDown38_SSSE3:\n"
1532 #endif
1533 "pusha\n"
1534 "mov 0x24(%esp),%esi\n"
1535 "mov 0x28(%esp),%edx\n"
1536 "mov 0x2c(%esp),%edi\n"
1537 "mov 0x30(%esp),%ecx\n"
1538 "movdqa _shuf38a ,%xmm5\n"
1539 "movdqa _shuf38b ,%xmm6\n"
1540 "pxor %xmm7,%xmm7\n"
1541
1542 "1:"
1543 "movdqa (%esi),%xmm0\n"
1544 "movdqa 0x10(%esi),%xmm1\n"
1545 "lea 0x20(%esi),%esi\n"
1546 "pshufb %xmm5,%xmm0\n"
1547 "pshufb %xmm6,%xmm1\n"
1548 "paddusb %xmm1,%xmm0\n"
1549 "movq %xmm0,(%edi)\n"
1550 "movhlps %xmm0,%xmm1\n"
1551 "movd %xmm1,0x8(%edi)\n"
1552 "lea 0xc(%edi),%edi\n"
1553 "sub $0xc,%ecx\n"
1554 "ja 1b\n"
1555 "popa\n"
1556 "ret\n"
1557 );
1558
1559 extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
1560 uint8* dst_ptr, int dst_width);
1561 asm(
1562 ".text\n"
1563 #if defined(OSX)
1564 ".globl _ScaleRowDown38_3_Int_SSSE3\n"
1565 "_ScaleRowDown38_3_Int_SSSE3:\n"
1566 #else
1567 ".global ScaleRowDown38_3_Int_SSSE3\n"
1568 "ScaleRowDown38_3_Int_SSSE3:\n"
1569 #endif
1570 "pusha\n"
1571 "mov 0x24(%esp),%esi\n"
1572 "mov 0x28(%esp),%edx\n"
1573 "mov 0x2c(%esp),%edi\n"
1574 "mov 0x30(%esp),%ecx\n"
1575 "movdqa _shufac0,%xmm4\n"
1576 "movdqa _shufac3,%xmm5\n"
1577 "movdqa _scaleac3,%xmm6\n"
1578 "pxor %xmm7,%xmm7\n"
1579
1580 "1:"
1581 "movdqa (%esi),%xmm0\n"
1582 "movdqa (%esi,%edx,1),%xmm2\n"
1583 "movhlps %xmm0,%xmm1\n"
1584 "movhlps %xmm2,%xmm3\n"
1585 "punpcklbw %xmm7,%xmm0\n"
1586 "punpcklbw %xmm7,%xmm1\n"
1587 "punpcklbw %xmm7,%xmm2\n"
1588 "punpcklbw %xmm7,%xmm3\n"
1589 "paddusw %xmm2,%xmm0\n"
1590 "paddusw %xmm3,%xmm1\n"
1591 "movdqa (%esi,%edx,2),%xmm2\n"
1592 "lea 0x10(%esi),%esi\n"
1593 "movhlps %xmm2,%xmm3\n"
1594 "punpcklbw %xmm7,%xmm2\n"
1595 "punpcklbw %xmm7,%xmm3\n"
1596 "paddusw %xmm2,%xmm0\n"
1597 "paddusw %xmm3,%xmm1\n"
1598 "movdqa %xmm0,%xmm2\n"
1599 "psrldq $0x2,%xmm0\n"
1600 "paddusw %xmm0,%xmm2\n"
1601 "psrldq $0x2,%xmm0\n"
1602 "paddusw %xmm0,%xmm2\n"
1603 "pshufb %xmm4,%xmm2\n"
1604 "movdqa %xmm1,%xmm3\n"
1605 "psrldq $0x2,%xmm1\n"
1606 "paddusw %xmm1,%xmm3\n"
1607 "psrldq $0x2,%xmm1\n"
1608 "paddusw %xmm1,%xmm3\n"
1609 "pshufb %xmm5,%xmm3\n"
1610 "paddusw %xmm3,%xmm2\n"
1611 "pmulhuw %xmm6,%xmm2\n"
1612 "packuswb %xmm2,%xmm2\n"
1613 "movd %xmm2,(%edi)\n"
1614 "pextrw $0x2,%xmm2,%eax\n"
1615 "mov %ax,0x4(%edi)\n"
1616 "lea 0x6(%edi),%edi\n"
1617 "sub $0x6,%ecx\n"
1618 "ja 1b\n"
1619 "popa\n"
1620 "ret\n"
1621 );
1622
1623 extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
1624 uint8* dst_ptr, int dst_width);
1625 asm(
1626 ".text\n"
1627 #if defined(OSX)
1628 ".globl _ScaleRowDown38_2_Int_SSSE3\n"
1629 "_ScaleRowDown38_2_Int_SSSE3:\n"
1630 #else
1631 ".global ScaleRowDown38_2_Int_SSSE3\n"
1632 "ScaleRowDown38_2_Int_SSSE3:\n"
1633 #endif
1634 "pusha\n"
1635 "mov 0x24(%esp),%esi\n"
1636 "mov 0x28(%esp),%edx\n"
1637 "mov 0x2c(%esp),%edi\n"
1638 "mov 0x30(%esp),%ecx\n"
1639 "movdqa _shufab0,%xmm4\n"
1640 "movdqa _shufab1,%xmm5\n"
1641 "movdqa _shufab2,%xmm6\n"
1642 "movdqa _scaleab2,%xmm7\n"
1643
1644 "1:"
1645 "movdqa (%esi),%xmm2\n"
1646 "pavgb (%esi,%edx,1),%xmm2\n"
1647 "lea 0x10(%esi),%esi\n"
1648 "movdqa %xmm2,%xmm0\n"
1649 "pshufb %xmm4,%xmm0\n"
1650 "movdqa %xmm2,%xmm1\n"
1651 "pshufb %xmm5,%xmm1\n"
1652 "paddusw %xmm1,%xmm0\n"
1653 "pshufb %xmm6,%xmm2\n"
1654 "paddusw %xmm2,%xmm0\n"
1655 "pmulhuw %xmm7,%xmm0\n"
1656 "packuswb %xmm0,%xmm0\n"
1657 "movd %xmm0,(%edi)\n"
1658 "pextrw $0x2,%xmm0,%eax\n"
1659 "mov %ax,0x4(%edi)\n"
1660 "lea 0x6(%edi),%edi\n"
1661 "sub $0x6,%ecx\n"
1662 "ja 1b\n"
1663 "popa\n"
1664 "ret\n"
1665 );
1666 #endif // __PIC__
1667
1668 #define HAS_SCALEADDROWS_SSE2
1669 extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
1670 uint16* dst_ptr, int src_width,
1671 int src_height);
1672 asm(
1673 ".text\n"
1674 #if defined(OSX)
1675 ".globl _ScaleAddRows_SSE2\n"
1676 "_ScaleAddRows_SSE2:\n"
1677 #else
1678 ".global ScaleAddRows_SSE2\n"
1679 "ScaleAddRows_SSE2:\n"
1680 #endif
1681 "pusha\n"
1682 "mov 0x24(%esp),%esi\n"
1683 "mov 0x28(%esp),%edx\n"
1684 "mov 0x2c(%esp),%edi\n"
1685 "mov 0x30(%esp),%ecx\n"
1686 "mov 0x34(%esp),%ebx\n"
1687 "pxor %xmm7,%xmm7\n"
1688
1689 "1:"
1690 "movdqa (%esi),%xmm2\n"
1691 "lea (%esi,%edx,1),%eax\n"
1692 "movhlps %xmm2,%xmm3\n"
1693 "lea -0x1(%ebx),%ebp\n"
1694 "punpcklbw %xmm7,%xmm2\n"
1695 "punpcklbw %xmm7,%xmm3\n"
1696
1697 "2:"
1698 "movdqa (%eax),%xmm0\n"
1699 "lea (%eax,%edx,1),%eax\n"
1700 "movhlps %xmm0,%xmm1\n"
1701 "punpcklbw %xmm7,%xmm0\n"
1702 "punpcklbw %xmm7,%xmm1\n"
1703 "paddusw %xmm0,%xmm2\n"
1704 "paddusw %xmm1,%xmm3\n"
1705 "sub $0x1,%ebp\n"
1706 "ja 2b\n"
1707
1708 "movdqa %xmm2,(%edi)\n"
1709 "movdqa %xmm3,0x10(%edi)\n"
1710 "lea 0x20(%edi),%edi\n"
1711 "lea 0x10(%esi),%esi\n"
1712 "sub $0x10,%ecx\n"
1713 "ja 1b\n"
1714 "popa\n"
1715 "ret\n"
1716 );
1717
1718 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
1719 #define HAS_SCALEFILTERROWS_SSE2
1720 extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
1721 const uint8* src_ptr, int src_stride,
1722 int dst_width, int source_y_fraction);
1723 asm(
1724 ".text\n"
1725 #if defined(OSX)
1726 ".globl _ScaleFilterRows_SSE2\n"
1727 "_ScaleFilterRows_SSE2:\n"
1728 #else
1729 ".global ScaleFilterRows_SSE2\n"
1730 "ScaleFilterRows_SSE2:\n"
1731 #endif
1732 "push %esi\n"
1733 "push %edi\n"
1734 "mov 0xc(%esp),%edi\n"
1735 "mov 0x10(%esp),%esi\n"
1736 "mov 0x14(%esp),%edx\n"
1737 "mov 0x18(%esp),%ecx\n"
1738 "mov 0x1c(%esp),%eax\n"
1739 "cmp $0x0,%eax\n"
1740 "je 2f\n"
1741 "cmp $0x80,%eax\n"
1742 "je 3f\n"
1743 "movd %eax,%xmm6\n"
1744 "punpcklwd %xmm6,%xmm6\n"
1745 "pshufd $0x0,%xmm6,%xmm6\n"
1746 "neg %eax\n"
1747 "add $0x100,%eax\n"
1748 "movd %eax,%xmm5\n"
1749 "punpcklwd %xmm5,%xmm5\n"
1750 "pshufd $0x0,%xmm5,%xmm5\n"
1751 "pxor %xmm7,%xmm7\n"
1752
1753 "1:"
1754 "movdqa (%esi),%xmm0\n"
1755 "movdqa (%esi,%edx,1),%xmm2\n"
1756 "lea 0x10(%esi),%esi\n"
1757 "movdqa %xmm0,%xmm1\n"
1758 "movdqa %xmm2,%xmm3\n"
1759 "punpcklbw %xmm7,%xmm0\n"
1760 "punpcklbw %xmm7,%xmm2\n"
1761 "punpckhbw %xmm7,%xmm1\n"
1762 "punpckhbw %xmm7,%xmm3\n"
1763 "pmullw %xmm5,%xmm0\n"
1764 "pmullw %xmm5,%xmm1\n"
1765 "pmullw %xmm6,%xmm2\n"
1766 "pmullw %xmm6,%xmm3\n"
1767 "paddusw %xmm2,%xmm0\n"
1768 "paddusw %xmm3,%xmm1\n"
1769 "psrlw $0x8,%xmm0\n"
1770 "psrlw $0x8,%xmm1\n"
1771 "packuswb %xmm1,%xmm0\n"
1772 "movdqa %xmm0,(%edi)\n"
1773 "lea 0x10(%edi),%edi\n"
1774 "sub $0x10,%ecx\n"
1775 "ja 1b\n"
1776 "mov -0x1(%edi),%al\n"
1777 "mov %al,(%edi)\n"
1778 "pop %edi\n"
1779 "pop %esi\n"
1780 "ret\n"
1781
1782 "2:"
1783 "movdqa (%esi),%xmm0\n"
1784 "lea 0x10(%esi),%esi\n"
1785 "movdqa %xmm0,(%edi)\n"
1786 "lea 0x10(%edi),%edi\n"
1787 "sub $0x10,%ecx\n"
1788 "ja 2b\n"
1789
1790 "mov -0x1(%edi),%al\n"
1791 "mov %al,(%edi)\n"
1792 "pop %edi\n"
1793 "pop %esi\n"
1794 "ret\n"
1795
1796 "3:"
1797 "movdqa (%esi),%xmm0\n"
1798 "movdqa (%esi,%edx,1),%xmm2\n"
1799 "lea 0x10(%esi),%esi\n"
1800 "pavgb %xmm2,%xmm0\n"
1801 "movdqa %xmm0,(%edi)\n"
1802 "lea 0x10(%edi),%edi\n"
1803 "sub $0x10,%ecx\n"
1804 "ja 3b\n"
1805
1806 "mov -0x1(%edi),%al\n"
1807 "mov %al,(%edi)\n"
1808 "pop %edi\n"
1809 "pop %esi\n"
1810 "ret\n"
1811 );
1812
1813 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
1814 #define HAS_SCALEFILTERROWS_SSSE3
1815 extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
1816 const uint8* src_ptr, int src_stride,
1817 int dst_width, int source_y_fraction);
1818 asm(
1819 ".text\n"
1820 #if defined(OSX)
1821 ".globl _ScaleFilterRows_SSSE3\n"
1822 "_ScaleFilterRows_SSSE3:\n"
1823 #else
1824 ".global ScaleFilterRows_SSSE3\n"
1825 "ScaleFilterRows_SSSE3:\n"
1826 #endif
1827 "push %esi\n"
1828 "push %edi\n"
1829 "mov 0xc(%esp),%edi\n"
1830 "mov 0x10(%esp),%esi\n"
1831 "mov 0x14(%esp),%edx\n"
1832 "mov 0x18(%esp),%ecx\n"
1833 "mov 0x1c(%esp),%eax\n"
1834 "cmp $0x0,%eax\n"
1835 "je 2f\n"
1836 "cmp $0x80,%eax\n"
1837 "je 3f\n"
1838 "shr %eax\n"
1839 "mov %al,%ah\n"
1840 "neg %al\n"
1841 "add $0x80,%al\n"
1842 "movd %eax,%xmm7\n"
1843 "punpcklwd %xmm7,%xmm7\n"
1844 "pshufd $0x0,%xmm7,%xmm7\n"
1845
1846 "1:"
1847 "movdqa (%esi),%xmm0\n"
1848 "movdqa (%esi,%edx,1),%xmm2\n"
1849 "lea 0x10(%esi),%esi\n"
1850 "movdqa %xmm0,%xmm1\n"
1851 "punpcklbw %xmm2,%xmm0\n"
1852 "punpckhbw %xmm2,%xmm1\n"
1853 "pmaddubsw %xmm7,%xmm0\n"
1854 "pmaddubsw %xmm7,%xmm1\n"
1855 "psrlw $0x7,%xmm0\n"
1856 "psrlw $0x7,%xmm1\n"
1857 "packuswb %xmm1,%xmm0\n"
1858 "movdqa %xmm0,(%edi)\n"
1859 "lea 0x10(%edi),%edi\n"
1860 "sub $0x10,%ecx\n"
1861 "ja 1b\n"
1862 "mov -0x1(%edi),%al\n"
1863 "mov %al,(%edi)\n"
1864 "pop %edi\n"
1865 "pop %esi\n"
1866 "ret\n"
1867
1868 "2:"
1869 "movdqa (%esi),%xmm0\n"
1870 "lea 0x10(%esi),%esi\n"
1871 "movdqa %xmm0,(%edi)\n"
1872 "lea 0x10(%edi),%edi\n"
1873 "sub $0x10,%ecx\n"
1874 "ja 2b\n"
1875 "mov -0x1(%edi),%al\n"
1876 "mov %al,(%edi)\n"
1877 "pop %edi\n"
1878 "pop %esi\n"
1879 "ret\n"
1880
1881 "3:"
1882 "movdqa (%esi),%xmm0\n"
1883 "movdqa (%esi,%edx,1),%xmm2\n"
1884 "lea 0x10(%esi),%esi\n"
1885 "pavgb %xmm2,%xmm0\n"
1886 "movdqa %xmm0,(%edi)\n"
1887 "lea 0x10(%edi),%edi\n"
1888 "sub $0x10,%ecx\n"
1889 "ja 3b\n"
1890 "mov -0x1(%edi),%al\n"
1891 "mov %al,(%edi)\n"
1892 "pop %edi\n"
1893 "pop %esi\n"
1894 "ret\n"
1895 );
1896
1897 #elif defined(__x86_64__)
1898 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
1899 uint8* dst_ptr, int dst_width) {
1900 asm volatile(
1901 "lea (%3,%3,2),%%r10\n"
1902 "pxor %%xmm7,%%xmm7\n"
1903 "1:"
1904 "movdqa (%0),%%xmm0\n"
1905 "movdqa 0x10(%0),%%xmm1\n"
1906 "movdqa (%0,%3,1),%%xmm2\n"
1907 "movdqa 0x10(%0,%3,1),%%xmm3\n"
1908 "pavgb %%xmm2,%%xmm0\n"
1909 "pavgb %%xmm3,%%xmm1\n"
1910 "movdqa (%0,%3,2),%%xmm2\n"
1911 "movdqa 0x10(%0,%3,2),%%xmm3\n"
1912 "movdqa (%0,%%r10,1),%%xmm4\n"
1913 "movdqa 0x10(%0,%%r10,1),%%xmm5\n"
1914 "lea (%0,%3,4),%%r11\n"
1915 "lea 0x20(%0),%0\n"
1916 "pavgb %%xmm4,%%xmm2\n"
1917 "pavgb %%xmm5,%%xmm3\n"
1918 "pavgb %%xmm2,%%xmm0\n"
1919 "pavgb %%xmm3,%%xmm1\n"
1920 "movdqa 0x0(%%r11),%%xmm2\n"
1921 "movdqa 0x10(%%r11),%%xmm3\n"
1922 "movdqa 0x0(%%r11,%3,1),%%xmm4\n"
1923 "movdqa 0x10(%%r11,%3,1),%%xmm5\n"
1924 "pavgb %%xmm4,%%xmm2\n"
1925 "pavgb %%xmm5,%%xmm3\n"
1926 "movdqa 0x0(%%r11,%3,2),%%xmm4\n"
1927 "movdqa 0x10(%%r11,%3,2),%%xmm5\n"
1928 "movdqa 0x0(%%r11,%%r10,1),%%xmm6\n"
1929 "pavgb %%xmm6,%%xmm4\n"
1930 "movdqa 0x10(%%r11,%%r10,1),%%xmm6\n"
1931 "pavgb %%xmm6,%%xmm5\n"
1932 "pavgb %%xmm4,%%xmm2\n"
1933 "pavgb %%xmm5,%%xmm3\n"
1934 "pavgb %%xmm2,%%xmm0\n"
1935 "pavgb %%xmm3,%%xmm1\n"
1936 "psadbw %%xmm7,%%xmm0\n"
1937 "psadbw %%xmm7,%%xmm1\n"
1938 "pshufd $0xd8,%%xmm0,%%xmm0\n"
1939 "pshufd $0x8d,%%xmm1,%%xmm1\n"
1940 "por %%xmm1,%%xmm0\n"
1941 "psrlw $0x3,%%xmm0\n"
1942 "packuswb %%xmm0,%%xmm0\n"
1943 "packuswb %%xmm0,%%xmm0\n"
1944 "movd %%xmm0,(%1)\n"
1945 "lea 0x4(%1),%1\n"
1946 "sub $0x4,%2\n"
1947 "ja 1b\n"
1948 : "+r"(src_ptr), // %0
1949 "+r"(dst_ptr), // %1
1950 "+r"(dst_width) // %2
1951 : "r"(static_cast<intptr_t>(src_stride)) // %3
1952 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3",
1953 "xmm4", "xmm5", "xmm6", "xmm7"
1954 );
1955 }
1956
1957 #define HAS_SCALEROWDOWN34_SSSE3
1958 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
1959 uint8* dst_ptr, int dst_width) {
1960 asm volatile(
1961 "movdqa (%3),%%xmm3\n"
1962 "movdqa (%4),%%xmm4\n"
1963 "movdqa (%5),%%xmm5\n"
1964 "1:"
1965 "movdqa (%0),%%xmm0\n"
1966 "movdqa 0x10(%0),%%xmm2\n"
1967 "lea 0x20(%0),%0\n"
1968 "movdqa %%xmm2,%%xmm1\n"
1969 "palignr $0x8,%%xmm0,%%xmm1\n"
1970 "pshufb %%xmm3,%%xmm0\n"
1971 "pshufb %%xmm4,%%xmm1\n"
1972 "pshufb %%xmm5,%%xmm2\n"
1973 "movq %%xmm0,(%1)\n"
1974 "movq %%xmm1,0x8(%1)\n"
1975 "movq %%xmm2,0x10(%1)\n"
1976 "lea 0x18(%1),%1\n"
1977 "sub $0x18,%2\n"
1978 "ja 1b\n"
1979 : "+r"(src_ptr), // %0
1980 "+r"(dst_ptr), // %1
1981 "+r"(dst_width) // %2
1982 : "r"(_shuf0), // %3
1983 "r"(_shuf1), // %4
1984 "r"(_shuf2) // %5
1985 : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1986 );
1987 }
1988
1989 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
1990 uint8* dst_ptr, int dst_width) {
1991 asm volatile(
1992 "movdqa (%4),%%xmm2\n" // _shuf01
1993 "movdqa (%5),%%xmm3\n" // _shuf11
1994 "movdqa (%6),%%xmm4\n" // _shuf21
1995 "movdqa (%7),%%xmm5\n" // _madd01
1996 "movdqa (%8),%%xmm6\n" // _madd11
1997 "movdqa (%9),%%xmm7\n" // _round34
1998 "movdqa (%10),%%xmm8\n" // _madd21
1999 "1:"
2000 "movdqa (%0),%%xmm0\n"
2001 "movdqa (%0,%3),%%xmm1\n"
2002 "pavgb %%xmm1,%%xmm0\n"
2003 "pshufb %%xmm2,%%xmm0\n"
2004 "pmaddubsw %%xmm5,%%xmm0\n"
2005 "paddsw %%xmm7,%%xmm0\n"
2006 "psrlw $0x2,%%xmm0\n"
2007 "packuswb %%xmm0,%%xmm0\n"
2008 "movq %%xmm0,(%1)\n"
2009 "movdqu 0x8(%0),%%xmm0\n"
2010 "movdqu 0x8(%0,%3),%%xmm1\n"
2011 "pavgb %%xmm1,%%xmm0\n"
2012 "pshufb %%xmm3,%%xmm0\n"
2013 "pmaddubsw %%xmm6,%%xmm0\n"
2014 "paddsw %%xmm7,%%xmm0\n"
2015 "psrlw $0x2,%%xmm0\n"
2016 "packuswb %%xmm0,%%xmm0\n"
2017 "movq %%xmm0,0x8(%1)\n"
2018 "movdqa 0x10(%0),%%xmm0\n"
2019 "movdqa 0x10(%0,%3),%%xmm1\n"
2020 "lea 0x20(%0),%0\n"
2021 "pavgb %%xmm1,%%xmm0\n"
2022 "pshufb %%xmm4,%%xmm0\n"
2023 "pmaddubsw %%xmm8,%%xmm0\n"
2024 "paddsw %%xmm7,%%xmm0\n"
2025 "psrlw $0x2,%%xmm0\n"
2026 "packuswb %%xmm0,%%xmm0\n"
2027 "movq %%xmm0,0x10(%1)\n"
2028 "lea 0x18(%1),%1\n"
2029 "sub $0x18,%2\n"
2030 "ja 1b\n"
2031 : "+r"(src_ptr), // %0
2032 "+r"(dst_ptr), // %1
2033 "+r"(dst_width) // %2
2034 : "r"(static_cast<intptr_t>(src_stride)), // %3
2035 "r"(_shuf01), // %4
2036 "r"(_shuf11), // %5
2037 "r"(_shuf21), // %6
2038 "r"(_madd01), // %7
2039 "r"(_madd11), // %8
2040 "r"(_round34), // %9
2041 "r"(_madd21) // %10
2042 : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
2043 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
2044 );
2045 }
2046
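// Same as above, but the two source rows are weighted 3:1: pavgb is applied
// twice (avg(row0, avg(row0, row1))), giving row 0 roughly a 3/4 weight, to
// match ScaleRowDown34_0_Int_C below.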
2047 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
2048 uint8* dst_ptr, int dst_width) {
2049 asm volatile(
2050 "movdqa (%4),%%xmm2\n" // _shuf01
2051 "movdqa (%5),%%xmm3\n" // _shuf11
2052 "movdqa (%6),%%xmm4\n" // _shuf21
2053 "movdqa (%7),%%xmm5\n" // _madd01
2054 "movdqa (%8),%%xmm6\n" // _madd11
2055 "movdqa (%9),%%xmm7\n" // _round34
2056 "movdqa (%10),%%xmm8\n" // _madd21
2057 "1:"
2058 "movdqa (%0),%%xmm0\n"
2059 "movdqa (%0,%3,1),%%xmm1\n"
2060 "pavgb %%xmm0,%%xmm1\n"
2061 "pavgb %%xmm1,%%xmm0\n"
2062 "pshufb %%xmm2,%%xmm0\n"
2063 "pmaddubsw %%xmm5,%%xmm0\n"
2064 "paddsw %%xmm7,%%xmm0\n"
2065 "psrlw $0x2,%%xmm0\n"
2066 "packuswb %%xmm0,%%xmm0\n"
2067 "movq %%xmm0,(%1)\n"
2068 "movdqu 0x8(%0),%%xmm0\n"
2069 "movdqu 0x8(%0,%3,1),%%xmm1\n"
2070 "pavgb %%xmm0,%%xmm1\n"
2071 "pavgb %%xmm1,%%xmm0\n"
2072 "pshufb %%xmm3,%%xmm0\n"
2073 "pmaddubsw %%xmm6,%%xmm0\n"
2074 "paddsw %%xmm7,%%xmm0\n"
2075 "psrlw $0x2,%%xmm0\n"
2076 "packuswb %%xmm0,%%xmm0\n"
2077 "movq %%xmm0,0x8(%1)\n"
2078 "movdqa 0x10(%0),%%xmm0\n"
2079 "movdqa 0x10(%0,%3,1),%%xmm1\n"
2080 "lea 0x20(%0),%0\n"
2081 "pavgb %%xmm0,%%xmm1\n"
2082 "pavgb %%xmm1,%%xmm0\n"
2083 "pshufb %%xmm4,%%xmm0\n"
2084 "pmaddubsw %%xmm8,%%xmm0\n"
2085 "paddsw %%xmm7,%%xmm0\n"
2086 "psrlw $0x2,%%xmm0\n"
2087 "packuswb %%xmm0,%%xmm0\n"
2088 "movq %%xmm0,0x10(%1)\n"
2089 "lea 0x18(%1),%1\n"
2090 "sub $0x18,%2\n"
2091 "ja 1b\n"
2092 : "+r"(src_ptr), // %0
2093 "+r"(dst_ptr), // %1
2094 "+r"(dst_width) // %2
2095 : "r"(static_cast<intptr_t>(src_stride)), // %3
2096 "r"(_shuf01), // %4
2097 "r"(_shuf11), // %5
2098 "r"(_shuf21), // %6
2099 "r"(_madd01), // %7
2100 "r"(_madd11), // %8
2101 "r"(_round34), // %9
2102 "r"(_madd21) // %10
2103 : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
2104 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
2105 );
2106 }
2107
2108 #define HAS_SCALEROWDOWN38_SSSE3
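// Point-sampled 3/8 scale: 32 source pixels produce 12 output pixels. The two
// pshufb masks (_shuf38a/_shuf38b) pick complementary output positions from
// each 16-byte half, so paddusb simply merges the partial results.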
2109 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
2110 uint8* dst_ptr, int dst_width) {
2111 asm volatile(
2112 "movdqa (%3),%%xmm5\n"
2113 "movdqa (%4),%%xmm6\n"
2114 "pxor %%xmm7,%%xmm7\n"
2115 "1:"
2116 "movdqa (%0),%%xmm0\n"
2117 "movdqa 0x10(%0),%%xmm1\n"
2118 "lea 0x20(%0),%0\n"
2119 "pshufb %%xmm5,%%xmm0\n"
2120 "pshufb %%xmm6,%%xmm1\n"
2121 "paddusb %%xmm1,%%xmm0\n"
2122 "movq %%xmm0,(%1)\n"
2123 "movhlps %%xmm0,%%xmm1\n"
2124 "movd %%xmm1,0x8(%1)\n"
2125 "lea 0xc(%1),%1\n"
2126 "sub $0xc,%2\n"
2127 "ja 1b\n"
2128 : "+r"(src_ptr), // %0
2129 "+r"(dst_ptr), // %1
2130 "+r"(dst_width) // %2
2131 : "r"(_shuf38a), // %3
2132 "r"(_shuf38b) // %4
2133 : "memory", "xmm0", "xmm1", "xmm5", "xmm6", "xmm7"
2134 );
2135 }
2136
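// Filtered 3/8 scale over three source rows: pixels are widened to 16 bits,
// the three rows are summed with paddusw, adjacent columns are folded in with
// psrldq/paddusw, and pmulhuw by _scaleac3 converts each box sum into an
// average (the C version uses 65536/9 and 65536/6 for the same purpose).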
2137 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
2138 uint8* dst_ptr, int dst_width) {
2139 asm volatile(
2140 "movdqa (%4),%%xmm4\n"
2141 "movdqa (%5),%%xmm5\n"
2142 "movdqa (%6),%%xmm6\n"
2143 "pxor %%xmm7,%%xmm7\n"
2144 "1:"
2145 "movdqa (%0),%%xmm0\n"
2146 "movdqa (%0,%3,1),%%xmm2\n"
2147 "movhlps %%xmm0,%%xmm1\n"
2148 "movhlps %%xmm2,%%xmm3\n"
2149 "punpcklbw %%xmm7,%%xmm0\n"
2150 "punpcklbw %%xmm7,%%xmm1\n"
2151 "punpcklbw %%xmm7,%%xmm2\n"
2152 "punpcklbw %%xmm7,%%xmm3\n"
2153 "paddusw %%xmm2,%%xmm0\n"
2154 "paddusw %%xmm3,%%xmm1\n"
2155 "movdqa (%0,%3,2),%%xmm2\n"
2156 "lea 0x10(%0),%0\n"
2157 "movhlps %%xmm2,%%xmm3\n"
2158 "punpcklbw %%xmm7,%%xmm2\n"
2159 "punpcklbw %%xmm7,%%xmm3\n"
2160 "paddusw %%xmm2,%%xmm0\n"
2161 "paddusw %%xmm3,%%xmm1\n"
2162 "movdqa %%xmm0,%%xmm2\n"
2163 "psrldq $0x2,%%xmm0\n"
2164 "paddusw %%xmm0,%%xmm2\n"
2165 "psrldq $0x2,%%xmm0\n"
2166 "paddusw %%xmm0,%%xmm2\n"
2167 "pshufb %%xmm4,%%xmm2\n"
2168 "movdqa %%xmm1,%%xmm3\n"
2169 "psrldq $0x2,%%xmm1\n"
2170 "paddusw %%xmm1,%%xmm3\n"
2171 "psrldq $0x2,%%xmm1\n"
2172 "paddusw %%xmm1,%%xmm3\n"
2173 "pshufb %%xmm5,%%xmm3\n"
2174 "paddusw %%xmm3,%%xmm2\n"
2175 "pmulhuw %%xmm6,%%xmm2\n"
2176 "packuswb %%xmm2,%%xmm2\n"
2177 "movd %%xmm2,(%1)\n"
2178 "pextrw $0x2,%%xmm2,%%eax\n"
2179 "mov %%ax,0x4(%1)\n"
2180 "lea 0x6(%1),%1\n"
2181 "sub $0x6,%2\n"
2182 "ja 1b\n"
2183 : "+r"(src_ptr), // %0
2184 "+r"(dst_ptr), // %1
2185 "+r"(dst_width) // %2
2186 : "r"(static_cast<intptr_t>(src_stride)), // %3
2187 "r"(_shufac0), // %4
2188 "r"(_shufac3), // %5
2189 "r"(_scaleac3) // %6
2190 : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
2191 "xmm4", "xmm5", "xmm6", "xmm7"
2192 );
2193 }
2194
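// Two-row variant of the 3/8 filter: pavgb averages the two source rows, then
// the same shuffle/add/pmulhuw reduction (with _scaleab2) yields 6 outputs.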
2195 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
2196 uint8* dst_ptr, int dst_width) {
2197 asm volatile(
2198 "movdqa (%4),%%xmm4\n"
2199 "movdqa (%5),%%xmm5\n"
2200 "movdqa (%6),%%xmm6\n"
2201 "movdqa (%7),%%xmm7\n"
2202 "1:"
2203 "movdqa (%0),%%xmm2\n"
2204 "pavgb (%0,%3,1),%%xmm2\n"
2205 "lea 0x10(%0),%0\n"
2206 "movdqa %%xmm2,%%xmm0\n"
2207 "pshufb %%xmm4,%%xmm0\n"
2208 "movdqa %%xmm2,%%xmm1\n"
2209 "pshufb %%xmm5,%%xmm1\n"
2210 "paddusw %%xmm1,%%xmm0\n"
2211 "pshufb %%xmm6,%%xmm2\n"
2212 "paddusw %%xmm2,%%xmm0\n"
2213 "pmulhuw %%xmm7,%%xmm0\n"
2214 "packuswb %%xmm0,%%xmm0\n"
2215 "movd %%xmm0,(%1)\n"
2216 "pextrw $0x2,%%xmm0,%%eax\n"
2217 "mov %%ax,0x4(%1)\n"
2218 "lea 0x6(%1),%1\n"
2219 "sub $0x6,%2\n"
2220 "ja 1b\n"
2221 : "+r"(src_ptr), // %0
2222 "+r"(dst_ptr), // %1
2223 "+r"(dst_width) // %2
2224 : "r"(static_cast<intptr_t>(src_stride)), // %3
2225 "r"(_shufab0), // %4
2226 "r"(_shufab1), // %5
2227 "r"(_shufab2), // %6
2228 "r"(_scaleab2) // %7
2229 : "memory", "rax", "xmm0", "xmm1", "xmm2",
2230 "xmm4", "xmm5", "xmm6", "xmm7"
2231 );
2232 }
2233
2234 #define HAS_SCALEADDROWS_SSE2
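// Accumulates src_height rows into 16-bit column sums for the box filter.
// The outer loop (1:) covers 16 source columns at a time; the inner loop (2:)
// walks down the remaining src_height - 1 rows, accumulating with paddusw.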
2235 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
2236 uint16* dst_ptr, int src_width,
2237 int src_height) {
2238 asm volatile(
2239 "pxor %%xmm7,%%xmm7\n"
2240 "1:"
2241 "movdqa (%0),%%xmm2\n"
2242 "lea (%0,%4,1),%%r10\n"
2243 "movhlps %%xmm2,%%xmm3\n"
2244 "lea -0x1(%3),%%r11\n"
2245 "punpcklbw %%xmm7,%%xmm2\n"
2246 "punpcklbw %%xmm7,%%xmm3\n"
2247
2248 "2:"
2249 "movdqa (%%r10),%%xmm0\n"
2250 "lea (%%r10,%4,1),%%r10\n"
2251 "movhlps %%xmm0,%%xmm1\n"
2252 "punpcklbw %%xmm7,%%xmm0\n"
2253 "punpcklbw %%xmm7,%%xmm1\n"
2254 "paddusw %%xmm0,%%xmm2\n"
2255 "paddusw %%xmm1,%%xmm3\n"
2256 "sub $0x1,%%r11\n"
2257 "ja 2b\n"
2258
2259 "movdqa %%xmm2,(%1)\n"
2260 "movdqa %%xmm3,0x10(%1)\n"
2261 "lea 0x20(%1),%1\n"
2262 "lea 0x10(%0),%0\n"
2263 "sub $0x10,%2\n"
2264 "ja 1b\n"
2265 : "+r"(src_ptr), // %0
2266 "+r"(dst_ptr), // %1
2267 "+r"(src_width), // %2
2268 "+r"(src_height) // %3
2269 : "r"(static_cast<intptr_t>(src_stride)) // %4
2270 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
2271 );
2272 }
2273
2274 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
2275 #define HAS_SCALEFILTERROWS_SSE2
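// Special cases: a fraction of 0 copies row 0, 128 averages the rows with
// pavgb. The general case widens both rows to 16 bits, weights them with
// pmullw by (256 - fraction) and fraction, and shifts right by 8. Each path
// also duplicates the last output pixel one byte past the end so a following
// horizontal filter can safely read one pixel beyond dst_width.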
2276 static void ScaleFilterRows_SSE2(uint8* dst_ptr,
2277 const uint8* src_ptr, int src_stride,
2278 int dst_width, int source_y_fraction) {
2279 if (source_y_fraction == 0) {
2280 asm volatile(
2281 "1:"
2282 "movdqa (%1),%%xmm0\n"
2283 "lea 0x10(%1),%1\n"
2284 "movdqa %%xmm0,(%0)\n"
2285 "lea 0x10(%0),%0\n"
2286 "sub $0x10,%2\n"
2287 "ja 1b\n"
2288 "mov -0x1(%0),%%al\n"
2289 "mov %%al,(%0)\n"
2290 : "+r"(dst_ptr), // %0
2291 "+r"(src_ptr), // %1
2292 "+r"(dst_width) // %2
2293 :
2294 : "memory", "rax", "xmm0"
2295 );
2296 return;
2297 } else if (source_y_fraction == 128) {
2298 asm volatile(
2299 "1:"
2300 "movdqa (%1),%%xmm0\n"
2301 "movdqa (%1,%3,1),%%xmm2\n"
2302 "lea 0x10(%1),%1\n"
2303 "pavgb %%xmm2,%%xmm0\n"
2304 "movdqa %%xmm0,(%0)\n"
2305 "lea 0x10(%0),%0\n"
2306 "sub $0x10,%2\n"
2307 "ja 1b\n"
2308 "mov -0x1(%0),%%al\n"
2309 "mov %%al,(%0)\n"
2310 : "+r"(dst_ptr), // %0
2311 "+r"(src_ptr), // %1
2312 "+r"(dst_width) // %2
2313 : "r"(static_cast<intptr_t>(src_stride)) // %3
2314 : "memory", "rax", "xmm0", "xmm2"
2315 );
2316 return;
2317 } else {
2318 asm volatile(
2319 "mov %3,%%eax\n"
2320 "movd %%eax,%%xmm6\n"
2321 "punpcklwd %%xmm6,%%xmm6\n"
2322 "pshufd $0x0,%%xmm6,%%xmm6\n"
2323 "neg %%eax\n"
2324 "add $0x100,%%eax\n"
2325 "movd %%eax,%%xmm5\n"
2326 "punpcklwd %%xmm5,%%xmm5\n"
2327 "pshufd $0x0,%%xmm5,%%xmm5\n"
2328 "pxor %%xmm7,%%xmm7\n"
2329 "1:"
2330 "movdqa (%1),%%xmm0\n"
2331 "movdqa (%1,%4,1),%%xmm2\n"
2332 "lea 0x10(%1),%1\n"
2333 "movdqa %%xmm0,%%xmm1\n"
2334 "movdqa %%xmm2,%%xmm3\n"
2335 "punpcklbw %%xmm7,%%xmm0\n"
2336 "punpcklbw %%xmm7,%%xmm2\n"
2337 "punpckhbw %%xmm7,%%xmm1\n"
2338 "punpckhbw %%xmm7,%%xmm3\n"
2339 "pmullw %%xmm5,%%xmm0\n"
2340 "pmullw %%xmm5,%%xmm1\n"
2341 "pmullw %%xmm6,%%xmm2\n"
2342 "pmullw %%xmm6,%%xmm3\n"
2343 "paddusw %%xmm2,%%xmm0\n"
2344 "paddusw %%xmm3,%%xmm1\n"
2345 "psrlw $0x8,%%xmm0\n"
2346 "psrlw $0x8,%%xmm1\n"
2347 "packuswb %%xmm1,%%xmm0\n"
2348 "movdqa %%xmm0,(%0)\n"
2349 "lea 0x10(%0),%0\n"
2350 "sub $0x10,%2\n"
2351 "ja 1b\n"
2352 "mov -0x1(%0),%%al\n"
2353 "mov %%al,(%0)\n"
2354 : "+r"(dst_ptr), // %0
2355 "+r"(src_ptr), // %1
2356 "+r"(dst_width), // %2
2357 "+r"(source_y_fraction) // %3
2358 : "r"(static_cast<intptr_t>(src_stride)) // %4
2359 : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
2360 "xmm5", "xmm6", "xmm7"
2361 );
2362 }
2363 return;
2364 }
2365
2366 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
2367 #define HAS_SCALEFILTERROWS_SSSE3
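// SSSE3 version: both weights are packed into one register (al = 128 -
// fraction / 2, ah = fraction / 2, broadcast to every lane), the two rows are
// interleaved with punpcklbw/punpckhbw, and a single pmaddubsw per half blends
// them; psrlw $7 removes the 7-bit fraction.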
2368 static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
2369 const uint8* src_ptr, int src_stride,
2370 int dst_width, int source_y_fraction) {
2371 if (source_y_fraction == 0) {
2372 asm volatile(
2373 "1:"
2374 "movdqa (%1),%%xmm0\n"
2375 "lea 0x10(%1),%1\n"
2376 "movdqa %%xmm0,(%0)\n"
2377 "lea 0x10(%0),%0\n"
2378 "sub $0x10,%2\n"
2379 "ja 1b\n"
2380 "mov -0x1(%0),%%al\n"
2381 "mov %%al,(%0)\n"
2382 : "+r"(dst_ptr), // %0
2383 "+r"(src_ptr), // %1
2384 "+r"(dst_width) // %2
2385 :
2386 : "memory", "rax", "xmm0"
2387 );
2388 return;
2389 } else if (source_y_fraction == 128) {
2390 asm volatile(
2391 "1:"
2392 "movdqa (%1),%%xmm0\n"
2393 "movdqa (%1,%3,1),%%xmm2\n"
2394 "lea 0x10(%1),%1\n"
2395 "pavgb %%xmm2,%%xmm0\n"
2396 "movdqa %%xmm0,(%0)\n"
2397 "lea 0x10(%0),%0\n"
2398 "sub $0x10,%2\n"
2399 "ja 1b\n"
2400 "mov -0x1(%0),%%al\n"
2401 "mov %%al,(%0)\n"
2402 : "+r"(dst_ptr), // %0
2403 "+r"(src_ptr), // %1
2404 "+r"(dst_width) // %2
2405 : "r"(static_cast<intptr_t>(src_stride)) // %3
2406 : "memory", "rax", "xmm0", "xmm2"
2407 );
2408 return;
2409 } else {
2410 asm volatile(
2411 "mov %3,%%eax\n"
2412 "shr %%eax\n"
2413 "mov %%al,%%ah\n"
2414 "neg %%al\n"
2415 "add $0x80,%%al\n"
2416 "movd %%eax,%%xmm7\n"
2417 "punpcklwd %%xmm7,%%xmm7\n"
2418 "pshufd $0x0,%%xmm7,%%xmm7\n"
2419 "1:"
2420 "movdqa (%1),%%xmm0\n"
2421 "movdqa (%1,%4,1),%%xmm2\n"
2422 "lea 0x10(%1),%1\n"
2423 "movdqa %%xmm0,%%xmm1\n"
2424 "punpcklbw %%xmm2,%%xmm0\n"
2425 "punpckhbw %%xmm2,%%xmm1\n"
2426 "pmaddubsw %%xmm7,%%xmm0\n"
2427 "pmaddubsw %%xmm7,%%xmm1\n"
2428 "psrlw $0x7,%%xmm0\n"
2429 "psrlw $0x7,%%xmm1\n"
2430 "packuswb %%xmm1,%%xmm0\n"
2431 "movdqa %%xmm0,(%0)\n"
2432 "lea 0x10(%0),%0\n"
2433 "sub $0x10,%2\n"
2434 "ja 1b\n"
2435 "mov -0x1(%0),%%al\n"
2436 "mov %%al,(%0)\n"
2437 : "+r"(dst_ptr), // %0
2438 "+r"(src_ptr), // %1
2439 "+r"(dst_width), // %2
2440 "+r"(source_y_fraction) // %3
2441 : "r"(static_cast<intptr_t>(src_stride)) // %4
2442 : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm7"
2443 );
2444 }
2445 return;
2446 }
2447 #endif
2448 #endif
2449
2450 // CPU-agnostic row functions
2451 static void ScaleRowDown2_C(const uint8* src_ptr, int,
2452 uint8* dst, int dst_width) {
2453 for (int x = 0; x < dst_width; ++x) {
2454 *dst++ = *src_ptr;
2455 src_ptr += 2;
2456 }
2457 }
2458
2459 static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
2460 uint8* dst, int dst_width) {
2461 for (int x = 0; x < dst_width; ++x) {
2462 *dst++ = (src_ptr[0] + src_ptr[1] +
2463 src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
2464 src_ptr += 2;
2465 }
2466 }
2467
2468 static void ScaleRowDown4_C(const uint8* src_ptr, int,
2469 uint8* dst, int dst_width) {
2470 for (int x = 0; x < dst_width; ++x) {
2471 *dst++ = *src_ptr;
2472 src_ptr += 4;
2473 }
2474 }
2475
2476 static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
2477 uint8* dst, int dst_width) {
2478 for (int x = 0; x < dst_width; ++x) {
2479 *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
2480 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
2481 src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
2482 src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
2483 src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
2484 src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
2485 src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
2486 8) >> 4;
2487 src_ptr += 4;
2488 }
2489 }
2490
2491 // 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
2492 // Keeping the total buffer under 4096 bytes avoids a stack check, saving 4% CPU.
2493 static const int kMaxOutputWidth = 640;
2494 static const int kMaxRow12 = kMaxOutputWidth * 2;
2495
2496 static void ScaleRowDown8_C(const uint8* src_ptr, int,
2497 uint8* dst, int dst_width) {
2498 for (int x = 0; x < dst_width; ++x) {
2499 *dst++ = *src_ptr;
2500 src_ptr += 8;
2501 }
2502 }
2503
2504 // Note: calling code checks that dst_width is at most kMaxOutputWidth and,
2505 // if not, uses ScaleRowDown8_C instead.
2506 static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
2507 uint8* dst, int dst_width) {
2508 ALIGN16(uint8 src_row[kMaxRow12 * 2]);
2509 assert(dst_width <= kMaxOutputWidth);
2510 ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
2511 ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
2512 src_row + kMaxOutputWidth,
2513 dst_width * 2);
2514 ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
2515 }
2516
2517 static void ScaleRowDown34_C(const uint8* src_ptr, int,
2518 uint8* dst, int dst_width) {
2519 assert((dst_width % 3 == 0) && (dst_width > 0));
2520 uint8* dend = dst + dst_width;
2521 do {
2522 dst[0] = src_ptr[0];
2523 dst[1] = src_ptr[1];
2524 dst[2] = src_ptr[3];
2525 dst += 3;
2526 src_ptr += 4;
2527 } while (dst < dend);
2528 }
2529
2530 // Filter rows 0 and 1 together, 3 : 1
2531 static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
2532 uint8* d, int dst_width) {
2533 assert((dst_width % 3 == 0) && (dst_width > 0));
2534 uint8* dend = d + dst_width;
2535 const uint8* s = src_ptr;
2536 const uint8* t = src_ptr + src_stride;
2537 do {
2538 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2539 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2540 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2541 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
2542 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
2543 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
2544 d[0] = (a0 * 3 + b0 + 2) >> 2;
2545 d[1] = (a1 * 3 + b1 + 2) >> 2;
2546 d[2] = (a2 * 3 + b2 + 2) >> 2;
2547 d += 3;
2548 s += 4;
2549 t += 4;
2550 } while (d < dend);
2551 }
2552
2553 // Filter rows 1 and 2 together, 1 : 1
2554 static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
2555 uint8* d, int dst_width) {
2556 assert((dst_width % 3 == 0) && (dst_width > 0));
2557 uint8* dend = d + dst_width;
2558 const uint8* s = src_ptr;
2559 const uint8* t = src_ptr + src_stride;
2560 do {
2561 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2562 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2563 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2564 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
2565 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
2566 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
2567 d[0] = (a0 + b0 + 1) >> 1;
2568 d[1] = (a1 + b1 + 1) >> 1;
2569 d[2] = (a2 + b2 + 1) >> 1;
2570 d += 3;
2571 s += 4;
2572 t += 4;
2573 } while (d < dend);
2574 }
2575
2576 #if defined(HAS_SCALEFILTERROWS_SSE2)
2577 // Filter row to 3/4
2578 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
2579 int dst_width) {
2580 assert((dst_width % 3 == 0) && (dst_width > 0));
2581 uint8* dend = dst_ptr + dst_width;
2582 const uint8* s = src_ptr;
2583 do {
2584 dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2585 dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2586 dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2587 dst_ptr += 3;
2588 s += 4;
2589 } while (dst_ptr < dend);
2590 }
2591 #endif
2592
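// Horizontal bilinear filter in 16.16 fixed point: the upper 16 bits of x
// select the source pixel and the lower 16 bits weight the blend between it
// and its right-hand neighbor.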
2593 static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
2594 int dst_width, int dx) {
2595 int x = 0;
2596 for (int j = 0; j < dst_width; ++j) {
2597 int xi = x >> 16;
2598 int xf1 = x & 0xffff;
2599 int xf0 = 65536 - xf1;
2600
2601 *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
2602 x += dx;
2603 }
2604 }
2605
2606 static const int kMaxInputWidth = 2560;
2607 #if defined(HAS_SCALEFILTERROWS_SSE2)
2608 #define HAS_SCALEROWDOWN34_SSE2
2609 // Filter rows 0 and 1 together, 3 : 1
2610 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
2611 uint8* dst_ptr, int dst_width) {
2612 assert((dst_width % 3 == 0) && (dst_width > 0));
2613 ALIGN16(uint8 row[kMaxInputWidth]);
2614 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3,
2615 256 / 4);
2616 ScaleFilterCols34_C(dst_ptr, row, dst_width);
2617 }
2618
2619 // Filter rows 1 and 2 together, 1 : 1
2620 static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
2621 uint8* dst_ptr, int dst_width) {
2622 assert((dst_width % 3 == 0) && (dst_width > 0));
2623 ALIGN16(uint8 row[kMaxInputWidth]);
2624 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
2625 ScaleFilterCols34_C(dst_ptr, row, dst_width);
2626 }
2627 #endif
2628
2629 static void ScaleRowDown38_C(const uint8* src_ptr, int,
2630 uint8* dst, int dst_width) {
2631 assert(dst_width % 3 == 0);
2632 for (int x = 0; x < dst_width; x += 3) {
2633 dst[0] = src_ptr[0];
2634 dst[1] = src_ptr[3];
2635 dst[2] = src_ptr[6];
2636 dst += 3;
2637 src_ptr += 8;
2638 }
2639 }
2640
2641 // 8x3 -> 3x1
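// The averages are computed without a divide: multiplying a box sum by
// 65536 / 9 (or 65536 / 6 for the narrower last box) and shifting right by 16
// approximates sum / 9 (or sum / 6).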
2642 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
2643 uint8* dst_ptr, int dst_width) {
2644 assert((dst_width % 3 == 0) && (dst_width > 0));
2645 for (int i = 0; i < dst_width; i+=3) {
2646 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
2647 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
2648 src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
2649 src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
2650 (65536 / 9) >> 16;
2651 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
2652 src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
2653 src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
2654 src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
2655 (65536 / 9) >> 16;
2656 dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
2657 src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
2658 src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
2659 (65536 / 6) >> 16;
2660 src_ptr += 8;
2661 dst_ptr += 3;
2662 }
2663 }
2664
2665 // 8x2 -> 3x1
2666 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
2667 uint8* dst_ptr, int dst_width) {
2668 assert((dst_width % 3 == 0) && (dst_width > 0));
2669 for (int i = 0; i < dst_width; i+=3) {
2670 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
2671 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
2672 src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
2673 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
2674 src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
2675 src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
2676 dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
2677 src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
2678 (65536 / 4) >> 16;
2679 src_ptr += 8;
2680 dst_ptr += 3;
2681 }
2682 }
2683
2684 // C version 8x2 -> 8x1
2685 static void ScaleFilterRows_C(uint8* dst_ptr,
2686 const uint8* src_ptr, int src_stride,
2687 int dst_width, int source_y_fraction) {
2688 assert(dst_width > 0);
2689 int y1_fraction = source_y_fraction;
2690 int y0_fraction = 256 - y1_fraction;
2691 const uint8* src_ptr1 = src_ptr + src_stride;
2692 uint8* end = dst_ptr + dst_width;
2693 do {
2694 dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
2695 dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
2696 dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
2697 dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
2698 dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
2699 dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
2700 dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
2701 dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
2702 src_ptr += 8;
2703 src_ptr1 += 8;
2704 dst_ptr += 8;
2705 } while (dst_ptr < end);
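  // Duplicate the last pixel one past the end so a following horizontal
  // filter can safely read one pixel beyond dst_width.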
2706 dst_ptr[0] = dst_ptr[-1];
2707 }
2708
2709 void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
2710 uint16* dst_ptr, int src_width, int src_height) {
2711 assert(src_width > 0);
2712 assert(src_height > 0);
2713 for (int x = 0; x < src_width; ++x) {
2714 const uint8* s = src_ptr + x;
2715 int sum = 0;
2716 for (int y = 0; y < src_height; ++y) {
2717 sum += s[0];
2718 s += src_stride;
2719 }
2720 dst_ptr[x] = sum;
2721 }
2722 }
2723
2724 /**
2725 * Scale plane, 1/2
2726 *
2727 * This is an optimized version for scaling down a plane to 1/2 of
2728 * its original size.
2729 *
2730 */
2731 static void ScalePlaneDown2(int src_width, int src_height,
2732 int dst_width, int dst_height,
2733 int src_stride, int dst_stride,
2734 const uint8* src_ptr, uint8* dst_ptr,
2735 FilterMode filtering) {
2736 assert(src_width % 2 == 0);
2737 assert(src_height % 2 == 0);
2738 void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
2739 uint8* dst_ptr, int dst_width);
2740
2741 #if defined(HAS_SCALEROWDOWN2_NEON)
2742 if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
2743 (dst_width % 16 == 0) && (src_stride % 16 == 0) &&
2744 (dst_stride % 16 == 0) &&
2745 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
2746 ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
2747 } else
2748 #endif
2749 #if defined(HAS_SCALEROWDOWN2_SSE2)
2750 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
2751 (dst_width % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
2752 IS_ALIGNED(dst_ptr, 16)) {
2753 ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
2754 } else
2755 #endif
2756 {
2757 ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
2758 }
2759
2760 for (int y = 0; y < dst_height; ++y) {
2761 ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
2762 src_ptr += (src_stride << 1);
2763 dst_ptr += dst_stride;
2764 }
2765 }
2766
2767 /**
2768 * Scale plane, 1/4
2769 *
2770 * This is an optimized version for scaling down a plane to 1/4 of
2771 * its original size.
2772 */
2773 static void ScalePlaneDown4(int src_width, int src_height,
2774 int dst_width, int dst_height,
2775 int src_stride, int dst_stride,
2776 const uint8* src_ptr, uint8* dst_ptr,
2777 FilterMode filtering) {
2778 assert(src_width % 4 == 0);
2779 assert(src_height % 4 == 0);
2780 void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
2781 uint8* dst_ptr, int dst_width);
2782
2783 #if defined(HAS_SCALEROWDOWN4_NEON)
2784 if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
2785 (dst_width % 2 == 0) && (src_stride % 8 == 0) &&
2786 IS_ALIGNED(src_ptr, 8)) {
2787 ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
2788 } else
2789 #endif
2790 #if defined(HAS_SCALEROWDOWN4_SSE2)
2791 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
2792 (dst_width % 8 == 0) && (src_stride % 16 == 0) &&
2793 (dst_stride % 8 == 0) &&
2794 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
2795 ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
2796 } else
2797 #endif
2798 {
2799 ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
2800 }
2801
2802 for (int y = 0; y < dst_height; ++y) {
2803 ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
2804 src_ptr += (src_stride << 2);
2805 dst_ptr += dst_stride;
2806 }
2807 }
2808
2809 /**
2810 * Scale plane, 1/8
2811 *
2812 * This is an optimized version for scaling down a plane to 1/8
2813 * of its original size.
2814 *
2815 */
2816 static void ScalePlaneDown8(int src_width, int src_height,
2817 int dst_width, int dst_height,
2818 int src_stride, int dst_stride,
2819 const uint8* src_ptr, uint8* dst_ptr,
2820 FilterMode filtering) {
2821 assert(src_width % 8 == 0);
2822 assert(src_height % 8 == 0);
2823 void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
2824 uint8* dst_ptr, int dst_width);
2825 #if defined(HAS_SCALEROWDOWN8_SSE2)
2826 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
2827 (dst_width % 16 == 0) && dst_width <= kMaxOutputWidth &&
2828 (src_stride % 16 == 0) && (dst_stride % 16 == 0) &&
2829 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
2830 ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
2831 } else
2832 #endif
2833 {
2834 ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?
2835 ScaleRowDown8Int_C : ScaleRowDown8_C;
2836 }
2837 for (int y = 0; y < dst_height; ++y) {
2838 ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
2839 src_ptr += (src_stride << 3);
2840 dst_ptr += dst_stride;
2841 }
2842 }
2843
2844 /**
2845 * Scale plane down, 3/4
2846 *
2847 * Provided by Frank Barchard (fbarchard@google.com)
2848 *
2849 */
2850 static void ScalePlaneDown34(int src_width, int src_height,
2851 int dst_width, int dst_height,
2852 int src_stride, int dst_stride,
2853 const uint8* src_ptr, uint8* dst_ptr,
2854 FilterMode filtering) {
2855 assert(dst_width % 3 == 0);
2856 void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
2857 uint8* dst_ptr, int dst_width);
2858 void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
2859 uint8* dst_ptr, int dst_width);
2860 #if defined(HAS_SCALEROWDOWN34_SSSE3)
2861 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
2862 (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
2863 (dst_stride % 8 == 0) &&
2864 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
2865 if (!filtering) {
2866 ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
2867 ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
2868 } else {
2869 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
2870 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
2871 }
2872 } else
2873 #endif
2874 #if defined(HAS_SCALEROWDOWN34_SSE2)
2875 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
2876 (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
2877 (dst_stride % 8 == 0) &&
2878 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) &&
2879 filtering) {
2880 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
2881 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
2882 } else
2883 #endif
2884 {
2885 if (!filtering) {
2886 ScaleRowDown34_0 = ScaleRowDown34_C;
2887 ScaleRowDown34_1 = ScaleRowDown34_C;
2888 } else {
2889 ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
2890 ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
2891 }
2892 }
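  // Every 4 source rows produce 3 output rows:
  //   output 0 filters source rows 0 and 1 weighted 3:1,
  //   output 1 filters source rows 1 and 2 weighted 1:1,
  //   output 2 filters source rows 3 and 2 weighted 3:1 (via negative stride).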
2893 int src_row = 0;
2894 for (int y = 0; y < dst_height; ++y) {
2895 switch (src_row) {
2896 case 0:
2897 ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
2898 break;
2899
2900 case 1:
2901 ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
2902 break;
2903
2904 case 2:
2905 ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
2906 dst_ptr, dst_width);
2907 break;
2908 }
2909 ++src_row;
2910 src_ptr += src_stride;
2911 dst_ptr += dst_stride;
2912 if (src_row >= 3) {
2913 src_ptr += src_stride;
2914 src_row = 0;
2915 }
2916 }
2917 }
2918
2919 /**
2920 * Scale plane, 3/8
2921 *
2922 * This is an optimized version for scaling down a plane to 3/8
2923 * of its original size.
2924 *
2925 * Reduces 16x3 to 6x1
2926 */
2927 static void ScalePlaneDown38(int src_width, int src_height,
2928 int dst_width, int dst_height,
2929 int src_stride, int dst_stride,
2930 const uint8* src_ptr, uint8* dst_ptr,
2931 FilterMode filtering) {
2932 assert(dst_width % 3 == 0);
2933 void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
2934 uint8* dst_ptr, int dst_width);
2935 void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
2936 uint8* dst_ptr, int dst_width);
2937 #if defined(HAS_SCALEROWDOWN38_SSSE3)
2938 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
2939 (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
2940 (dst_stride % 8 == 0) &&
2941 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
2942 if (!filtering) {
2943 ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
2944 ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
2945 } else {
2946 ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
2947 ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
2948 }
2949 } else
2950 #endif
2951 {
2952 if (!filtering) {
2953 ScaleRowDown38_3 = ScaleRowDown38_C;
2954 ScaleRowDown38_2 = ScaleRowDown38_C;
2955 } else {
2956 ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
2957 ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
2958 }
2959 }
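  // Every 8 source rows produce 3 output rows: two passes average 3 rows each
  // and the final pass averages the remaining 2 rows.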
2960 int src_row = 0;
2961 for (int y = 0; y < dst_height; ++y) {
2962 switch (src_row) {
2963 case 0:
2964 case 1:
2965 ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
2966 src_ptr += src_stride * 3;
2967 ++src_row;
2968 break;
2969
2970 case 2:
2971 ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
2972 src_ptr += src_stride * 2;
2973 src_row = 0;
2974 break;
2975 }
2976 dst_ptr += dst_stride;
2977 }
2978 }
2979
2980 inline static uint32 SumBox(int iboxwidth, int iboxheight,
2981 int src_stride, const uint8* src_ptr) {
2982 assert(iboxwidth > 0);
2983 assert(iboxheight > 0);
2984 uint32 sum = 0u;
2985 for (int y = 0; y < iboxheight; ++y) {
2986 for (int x = 0; x < iboxwidth; ++x) {
2987 sum += src_ptr[x];
2988 }
2989 src_ptr += src_stride;
2990 }
2991 return sum;
2992 }
2993
2994 static void ScalePlaneBoxRow(int dst_width, int boxheight,
2995 int dx, int src_stride,
2996 const uint8* src_ptr, uint8* dst_ptr) {
2997 int x = 0;
2998 for (int i = 0; i < dst_width; ++i) {
2999 int ix = x >> 16;
3000 x += dx;
3001 int boxwidth = (x >> 16) - ix;
3002 *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
3003 (boxwidth * boxheight);
3004 }
3005 }
3006
3007 inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
3008 assert(iboxwidth > 0);
3009 uint32 sum = 0u;
3010 for (int x = 0; x < iboxwidth; ++x) {
3011 sum += src_ptr[x];
3012 }
3013 return sum;
3014 }
3015
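// Horizontal box reduction when dx has a fractional part: boxwidth alternates
// between minboxwidth and minboxwidth + 1, so a two-entry reciprocal table
// (indexed through scaleptr[boxwidth]) turns each SumPixels result into an
// average without a per-pixel divide.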
3016 static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
3017 const uint16* src_ptr, uint8* dst_ptr) {
3018 int scaletbl[2];
3019 int minboxwidth = (dx >> 16);
3020 scaletbl[0] = 65536 / (minboxwidth * boxheight);
3021 scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
3022 int *scaleptr = scaletbl - minboxwidth;
3023 int x = 0;
3024 for (int i = 0; i < dst_width; ++i) {
3025 int ix = x >> 16;
3026 x += dx;
3027 int boxwidth = (x >> 16) - ix;
3028 *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
3029 }
3030 }
3031
3032 static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
3033 const uint16* src_ptr, uint8* dst_ptr) {
3034 int boxwidth = (dx >> 16);
3035 int scaleval = 65536 / (boxwidth * boxheight);
3036 int x = 0;
3037 for (int i = 0; i < dst_width; ++i) {
3038 *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
3039 x += boxwidth;
3040 }
3041 }
3042
3043 /**
3044  * Scale plane down to any dimensions, with interpolation
3045  * (box filter).
3046  *
3047  * Same method as SimpleScale: 16.16 fixed point is used to step
3048  * through the source, and each destination pixel is produced by
3049  * sampling a box of source pixels with simple
3050  * averaging.
3051 */
3052 static void ScalePlaneBox(int src_width, int src_height,
3053 int dst_width, int dst_height,
3054 int src_stride, int dst_stride,
3055 const uint8* src_ptr, uint8* dst_ptr) {
3056 assert(dst_width > 0);
3057 assert(dst_height > 0);
3058 int dy = (src_height << 16) / dst_height;
3059 int dx = (src_width << 16) / dst_width;
3060 if ((src_width % 16 != 0) || (src_width > kMaxInputWidth) ||
3061 dst_height * 2 > src_height) {
3062 uint8* dst = dst_ptr;
3063 int dy = (src_height << 16) / dst_height;
3064 int dx = (src_width << 16) / dst_width;
3065 int y = 0;
3066 for (int j = 0; j < dst_height; ++j) {
3067 int iy = y >> 16;
3068 const uint8* const src = src_ptr + iy * src_stride;
3069 y += dy;
3070 if (y > (src_height << 16)) {
3071 y = (src_height << 16);
3072 }
3073 int boxheight = (y >> 16) - iy;
3074 ScalePlaneBoxRow(dst_width, boxheight,
3075 dx, src_stride,
3076 src, dst);
3077
3078 dst += dst_stride;
3079 }
3080 } else {
3081 ALIGN16(uint16 row[kMaxInputWidth]);
3082 void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
3083 uint16* dst_ptr, int src_width, int src_height);
3084 void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
3085 const uint16* src_ptr, uint8* dst_ptr);
3086 #if defined(HAS_SCALEADDROWS_SSE2)
3087 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
3088 (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
3089 (src_width % 16) == 0) {
3090 ScaleAddRows = ScaleAddRows_SSE2;
3091 } else
3092 #endif
3093 {
3094 ScaleAddRows = ScaleAddRows_C;
3095 }
3096 if (dx & 0xffff) {
3097 ScaleAddCols = ScaleAddCols2_C;
3098 } else {
3099 ScaleAddCols = ScaleAddCols1_C;
3100 }
3101
3102 int y = 0;
3103 for (int j = 0; j < dst_height; ++j) {
3104 int iy = y >> 16;
3105 const uint8* const src = src_ptr + iy * src_stride;
3106 y += dy;
3107 if (y > (src_height << 16)) {
3108 y = (src_height << 16);
3109 }
3110 int boxheight = (y >> 16) - iy;
3111 ScaleAddRows(src, src_stride, row, src_width, boxheight);
3112 ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
3113 dst_ptr += dst_stride;
3114 }
3115 }
3116 }
3117
3118 /**
3119 * Scale plane to/from any dimensions, with interpolation.
3120 */
3121 static void ScalePlaneBilinearSimple(int src_width, int src_height,
3122 int dst_width, int dst_height,
3123 int src_stride, int dst_stride,
3124 const uint8* src_ptr, uint8* dst_ptr) {
3125 uint8* dst = dst_ptr;
3126 int dx = (src_width << 16) / dst_width;
3127 int dy = (src_height << 16) / dst_height;
3128 int maxx = ((src_width - 1) << 16) - 1;
3129 int maxy = ((src_height - 1) << 16) - 1;
3130 int y = (dst_height < src_height) ? 32768 :
3131 (src_height << 16) / dst_height - 32768;
3132 for (int i = 0; i < dst_height; ++i) {
3133 int cy = (y < 0) ? 0 : y;
3134 int yi = cy >> 16;
3135 int yf = cy & 0xffff;
3136 const uint8* const src = src_ptr + yi * src_stride;
3137 int x = (dst_width < src_width) ? 32768 :
3138 (src_width << 16) / dst_width - 32768;
3139 for (int j = 0; j < dst_width; ++j) {
3140 int cx = (x < 0) ? 0 : x;
3141 int xi = cx >> 16;
3142 int xf = cx & 0xffff;
3143 int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
3144 int r1 = (src[xi + src_stride] * (65536 - xf) +
3145 src[xi + src_stride + 1] * xf) >> 16;
3146 *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
3147 x += dx;
3148 if (x > maxx)
3149 x = maxx;
3150 }
3151 dst += dst_stride - dst_width;
3152 y += dy;
3153 if (y > maxy)
3154 y = maxy;
3155 }
3156 }
3157
3158 /**
3159 * Scale plane to/from any dimensions, with bilinear
3160 * interpolation.
3161 */
3162 static void ScalePlaneBilinear(int src_width, int src_height,
3163 int dst_width, int dst_height,
3164 int src_stride, int dst_stride,
3165 const uint8* src_ptr, uint8* dst_ptr) {
3166 assert(dst_width > 0);
3167 assert(dst_height > 0);
3168 int dy = (src_height << 16) / dst_height;
3169 int dx = (src_width << 16) / dst_width;
3170 if ((src_width % 8 != 0) || (src_width > kMaxInputWidth)) {
3171 ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
3172 src_stride, dst_stride, src_ptr, dst_ptr);
3173
3174 } else {
3175 ALIGN16(uint8 row[kMaxInputWidth + 1]);
3176 void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
3177 int src_stride,
3178 int dst_width, int source_y_fraction);
3179 void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
3180 int dst_width, int dx);
3181 #if defined(HAS_SCALEFILTERROWS_SSSE3)
3182 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
3183 (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
3184 (src_width % 16) == 0) {
3185 ScaleFilterRows = ScaleFilterRows_SSSE3;
3186 } else
3187 #endif
3188 #if defined(HAS_SCALEFILTERROWS_SSE2)
3189 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
3190 (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
3191 (src_width % 16) == 0) {
3192 ScaleFilterRows = ScaleFilterRows_SSE2;
3193 } else
3194 #endif
3195 {
3196 ScaleFilterRows = ScaleFilterRows_C;
3197 }
3198 ScaleFilterCols = ScaleFilterCols_C;
3199
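    // Step through the source rows in 16.16 fixed point; fy takes the top
    // 8 bits of the fraction (0..255) as the vertical blend weight passed to
    // ScaleFilterRows.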
3200 int y = 0;
3201 int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows.
3202 for (int j = 0; j < dst_height; ++j) {
3203 int iy = y >> 16;
3204 int fy = (y >> 8) & 255;
3205 const uint8* const src = src_ptr + iy * src_stride;
3206 ScaleFilterRows(row, src, src_stride, src_width, fy);
3207 ScaleFilterCols(dst_ptr, row, dst_width, dx);
3208 dst_ptr += dst_stride;
3209 y += dy;
3210 if (y > maxy) {
3211 y = maxy;
3212 }
3213 }
3214 }
3215 }
3216
3217 /**
3218 * Scale plane to/from any dimensions, without interpolation.
3219  * Fixed point math is used for performance: the upper 16 bits
3220  * of x and dx are the integer part of the source position and
3221  * the lower 16 bits are the fractional part.
3222 */
3223 static void ScalePlaneSimple(int src_width, int src_height,
3224 int dst_width, int dst_height,
3225 int src_stride, int dst_stride,
3226 const uint8* src_ptr, uint8* dst_ptr) {
3227 uint8* dst = dst_ptr;
3228 int dx = (src_width << 16) / dst_width;
3229 for (int y = 0; y < dst_height; ++y) {
3230 const uint8* const src = src_ptr + (y * src_height / dst_height) *
3231 src_stride;
3232 // TODO(fbarchard): Round X coordinate by setting x=0x8000.
3233 int x = 0;
3234 for (int i = 0; i < dst_width; ++i) {
3235 *dst++ = src[x >> 16];
3236 x += dx;
3237 }
3238 dst += dst_stride - dst_width;
3239 }
3240 }
3241
3242 /**
3243 * Scale plane to/from any dimensions.
3244 */
3245 static void ScalePlaneAnySize(int src_width, int src_height,
3246 int dst_width, int dst_height,
3247 int src_stride, int dst_stride,
3248 const uint8* src_ptr, uint8* dst_ptr,
3249 FilterMode filtering) {
3250 if (!filtering) {
3251 ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
3252 src_stride, dst_stride, src_ptr, dst_ptr);
3253 } else {
3254 // fall back to non-optimized version
3255 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
3256 src_stride, dst_stride, src_ptr, dst_ptr);
3257 }
3258 }
3259
3260 /**
3261 * Scale plane down, any size
3262 *
3263 * This is an optimized version for scaling down a plane to any size.
3264 * The current implementation is ~10 times faster compared to the
3265 * reference implementation for e.g. XGA->LowResPAL
3266 *
3267 */
3268 static void ScalePlaneDown(int src_width, int src_height,
3269 int dst_width, int dst_height,
3270 int src_stride, int dst_stride,
3271 const uint8* src_ptr, uint8* dst_ptr,
3272 FilterMode filtering) {
3273 if (!filtering) {
3274 ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
3275 src_stride, dst_stride, src_ptr, dst_ptr);
3276   } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) {
3277     // Between 1/2x and 1x scale down, use bilinear.
3278 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
3279 src_stride, dst_stride, src_ptr, dst_ptr);
3280 } else {
3281 ScalePlaneBox(src_width, src_height, dst_width, dst_height,
3282 src_stride, dst_stride, src_ptr, dst_ptr);
3283 }
3284 }
3285
3286 /**
3287 * Copy plane, no scaling
3288 *
3289 * This simply copies the given plane without scaling.
3290  * The current implementation is ~115 times faster
3291  * than the reference implementation.
3292 *
3293 */
3294 static void CopyPlane(int src_width, int src_height,
3295 int dst_width, int dst_height,
3296 int src_stride, int dst_stride,
3297 const uint8* src_ptr, uint8* dst_ptr) {
3298 if (src_stride == src_width && dst_stride == dst_width) {
3299 // All contiguous, so can use REALLY fast path.
3300 memcpy(dst_ptr, src_ptr, src_width * src_height);
3301 } else {
3302 // Not all contiguous; must copy scanlines individually
3303 const uint8* src = src_ptr;
3304 uint8* dst = dst_ptr;
3305 for (int i = 0; i < src_height; ++i) {
3306 memcpy(dst, src, src_width);
3307 dst += dst_stride;
3308 src += src_stride;
3309 }
3310 }
3311 }
3312
3313 static void ScalePlane(const uint8* src, int src_stride,
3314 int src_width, int src_height,
3315 uint8* dst, int dst_stride,
3316 int dst_width, int dst_height,
3317 FilterMode filtering, bool use_ref) {
3318 // Use specialized scales to improve performance for common resolutions.
3319 // For example, all the 1/2 scalings will use ScalePlaneDown2()
3320 if (dst_width == src_width && dst_height == src_height) {
3321 // Straight copy.
3322 CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
3323 dst_stride, src, dst);
3324 } else if (dst_width <= src_width && dst_height <= src_height) {
3325 // Scale down.
3326 if (use_ref) {
3327 // For testing, allow the optimized versions to be disabled.
3328 ScalePlaneDown(src_width, src_height, dst_width, dst_height,
3329 src_stride, dst_stride, src, dst, filtering);
3330 } else if (4 * dst_width == 3 * src_width &&
3331 4 * dst_height == 3 * src_height) {
3332 // optimized, 3/4
3333 ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
3334 src_stride, dst_stride, src, dst, filtering);
3335 } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
3336 // optimized, 1/2
3337 ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
3338 src_stride, dst_stride, src, dst, filtering);
3339 // 3/8 rounded up for odd sized chroma height.
3340 } else if (8 * dst_width == 3 * src_width &&
3341 dst_height == ((src_height * 3 + 7) / 8)) {
3342 // optimized, 3/8
3343 ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
3344 src_stride, dst_stride, src, dst, filtering);
3345 } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
3346 // optimized, 1/4
3347 ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
3348 src_stride, dst_stride, src, dst, filtering);
3349 } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
3350 // optimized, 1/8
3351 ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
3352 src_stride, dst_stride, src, dst, filtering);
3353 } else {
3354 // Arbitrary downsample
3355 ScalePlaneDown(src_width, src_height, dst_width, dst_height,
3356 src_stride, dst_stride, src, dst, filtering);
3357 }
3358 } else {
3359 // Arbitrary scale up and/or down.
3360 ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
3361 src_stride, dst_stride, src, dst, filtering);
3362 }
3363 }
3364
3365 /**
3366 * Scale a plane.
3367 *
3368 * This function in turn calls a scaling function
3369 * suitable for handling the desired resolutions.
3370 *
3371 */
3372
3373 int I420Scale(const uint8* src_y, int src_stride_y,
3374 const uint8* src_u, int src_stride_u,
3375 const uint8* src_v, int src_stride_v,
3376 int src_width, int src_height,
3377 uint8* dst_y, int dst_stride_y,
3378 uint8* dst_u, int dst_stride_u,
3379 uint8* dst_v, int dst_stride_v,
3380 int dst_width, int dst_height,
3381 FilterMode filtering) {
3382 if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
3383 !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
3384 return -1;
3385 }
3386 // Negative height means invert the image.
3387 if (src_height < 0) {
3388 src_height = -src_height;
3389 int halfheight = (src_height + 1) >> 1;
3390 src_y = src_y + (src_height - 1) * src_stride_y;
3391 src_u = src_u + (halfheight - 1) * src_stride_u;
3392 src_v = src_v + (halfheight - 1) * src_stride_v;
3393 src_stride_y = -src_stride_y;
3394 src_stride_u = -src_stride_u;
3395 src_stride_v = -src_stride_v;
3396 }
3397 int halfsrc_width = (src_width + 1) >> 1;
3398 int halfsrc_height = (src_height + 1) >> 1;
3399 int halfdst_width = (dst_width + 1) >> 1;
3400 int halfoheight = (dst_height + 1) >> 1;
3401
3402 ScalePlane(src_y, src_stride_y, src_width, src_height,
3403 dst_y, dst_stride_y, dst_width, dst_height,
3404 filtering, use_reference_impl_);
3405 ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
3406 dst_u, dst_stride_u, halfdst_width, halfoheight,
3407 filtering, use_reference_impl_);
3408 ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
3409 dst_v, dst_stride_v, halfdst_width, halfoheight,
3410 filtering, use_reference_impl_);
3411 return 0;
3412 }
3413
3414 int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
3415 int src_stride_y, int src_stride_u, int src_stride_v,
3416 int src_width, int src_height,
3417 uint8* dst_y, uint8* dst_u, uint8* dst_v,
3418 int dst_stride_y, int dst_stride_u, int dst_stride_v,
3419 int dst_width, int dst_height,
3420 bool interpolate) {
3421 if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
3422 !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
3423 return -1;
3424 }
3425 // Negative height means invert the image.
3426 if (src_height < 0) {
3427 src_height = -src_height;
3428 int halfheight = (src_height + 1) >> 1;
3429 src_y = src_y + (src_height - 1) * src_stride_y;
3430 src_u = src_u + (halfheight - 1) * src_stride_u;
3431 src_v = src_v + (halfheight - 1) * src_stride_v;
3432 src_stride_y = -src_stride_y;
3433 src_stride_u = -src_stride_u;
3434 src_stride_v = -src_stride_v;
3435 }
3436 int halfsrc_width = (src_width + 1) >> 1;
3437 int halfsrc_height = (src_height + 1) >> 1;
3438 int halfdst_width = (dst_width + 1) >> 1;
3439 int halfoheight = (dst_height + 1) >> 1;
3440 FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
3441
3442 ScalePlane(src_y, src_stride_y, src_width, src_height,
3443 dst_y, dst_stride_y, dst_width, dst_height,
3444 filtering, use_reference_impl_);
3445 ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
3446 dst_u, dst_stride_u, halfdst_width, halfoheight,
3447 filtering, use_reference_impl_);
3448 ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
3449 dst_v, dst_stride_v, halfdst_width, halfoheight,
3450 filtering, use_reference_impl_);
3451 return 0;
3452 }
3453
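// Legacy single-buffer API: src and dst are contiguous I420 buffers (the Y
// plane followed by the U and V planes), so the chroma pointers are derived
// from the plane sizes, and ooffset shifts the output down within dst.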
3454 int Scale(const uint8* src, int src_width, int src_height,
3455 uint8* dst, int dst_width, int dst_height, int ooffset,
3456 bool interpolate) {
3457 if (!src || src_width <= 0 || src_height <= 0 ||
3458 !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 ||
3459 ooffset >= dst_height) {
3460 return -1;
3461 }
3462   ooffset = ooffset & ~1; // chroma requires the offset to be a multiple of 2.
3463 int halfsrc_width = (src_width + 1) >> 1;
3464 int halfsrc_height = (src_height + 1) >> 1;
3465 int halfdst_width = (dst_width + 1) >> 1;
3466 int halfoheight = (dst_height + 1) >> 1;
3467 int aheight = dst_height - ooffset * 2; // actual output height
3468 const uint8* const iyptr = src;
3469 uint8* oyptr = dst + ooffset * dst_width;
3470 const uint8* const iuptr = src + src_width * src_height;
3471 uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width;
3472 const uint8* const ivptr = src + src_width * src_height +
3473 halfsrc_width * halfsrc_height;
3474 uint8* ovptr = dst + dst_width * dst_height + halfdst_width * halfoheight +
3475 (ooffset >> 1) * halfdst_width;
3476 return Scale(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width,
3477 src_width, src_height, oyptr, ouptr, ovptr, dst_width,
3478 halfdst_width, halfdst_width, dst_width, aheight, interpolate);
3479 }
3480
3481 } // namespace libyuv
3482