/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/scale.h"

#include <assert.h>
#include <string.h>

#include "libyuv/cpu_id.h"

#if defined(_MSC_VER)
#define ALIGN16(var) __declspec(align(16)) var
#else
#define ALIGN16(var) var __attribute__((aligned(16)))
#endif

// Note: A Neon reference manual
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
// Note: Some SSE2 reference manuals
// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf

namespace libyuv {

// Set the following flag to true to revert to only
// using the reference implementation ScalePlaneBox(), and
// NOT the optimized versions. Useful for debugging and
// when comparing the quality of the resulting YUV planes
// as produced by the optimized and non-optimized versions.

static bool use_reference_impl_ = false;

void SetUseReferenceImpl(bool use) {
  use_reference_impl_ = use;
}
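
// Illustrative usage (added for clarity, not part of the original source):
// a test comparing the optimized row functions against the reference path
// might toggle the flag around two scaling passes and diff the planes:
//   SetUseReferenceImpl(true);    // force the ScalePlaneBox() reference path
//   /* scale into reference buffers */
//   SetUseReferenceImpl(false);   // re-enable the optimized NEON/SSE2 paths
//   /* scale again and compare the two results */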

/**
 * NEON downscalers with interpolation.
 *
 * Provided by Fritz Koenig
 *
 */

#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
                        uint8* dst, int dst_width) {
  __asm__ volatile
  (
    "1:\n"
    "vld2.u8    {q0,q1}, [%0]!    \n"  // load even pixels into q0, odd into q1
    "vst1.u8    {q0}, [%1]!       \n"  // store even pixels
    "subs       %2, %2, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst),              // %1
      "+r"(dst_width)         // %2
    :
    : "q0", "q1"              // Clobber List
  );
}

void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
                           uint8* dst, int dst_width) {
  __asm__ volatile
  (
    "mov        r4, #2            \n"  // rounding constant
    "add        %1, %0            \n"  // change the stride to row 2 pointer
    "vdup.16    q4, r4            \n"
    "1:\n"
    "vld1.u8    {q0,q1}, [%0]!    \n"  // load row 1 and post increment
    "vld1.u8    {q2,q3}, [%1]!    \n"  // load row 2 and post increment
    "vpaddl.u8  q0, q0            \n"  // row 1 add adjacent
    "vpaddl.u8  q1, q1            \n"
    "vpadal.u8  q0, q2            \n"  // row 2 add adjacent, add row 1 to row 2
    "vpadal.u8  q1, q3            \n"
    "vadd.u16   q0, q4            \n"  // rounding
    "vadd.u16   q1, q4            \n"
    "vshrn.u16  d0, q0, #2        \n"  // downshift and pack
    "vshrn.u16  d1, q1, #2        \n"
    "vst1.u8    {q0}, [%2]!       \n"
    "subs       %3, %3, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    : "+r"(src_ptr),          // %0
      "+r"(src_stride),       // %1
      "+r"(dst),              // %2
      "+r"(dst_width)         // %3
    :
    : "r4", "q0", "q1", "q2", "q3", "q4"              // Clobber List
   );
}
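
// Illustrative scalar equivalent of the loop above (added for clarity, not
// part of the original source): each output pixel is the rounded average of
// a 2x2 box of input pixels.
//   for (int x = 0; x < dst_width; ++x) {
//     const uint8* r0 = src_ptr;               // row 1
//     const uint8* r1 = src_ptr + src_stride;  // row 2
//     dst[x] = (r0[2 * x] + r0[2 * x + 1] +
//               r1[2 * x] + r1[2 * x + 1] + 2) >> 2;
//   }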

#define HAS_SCALEROWDOWN4_NEON
// Expecting widths on ARM devices to be smaller.  Went with 8x4 blocks
//  to get most coverage.  Worth coming back to evaluate 16x4 blocks with
//  handling of leftovers.
static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
                               uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
    "mov        r4, #4            \n"
    "1:                           \n"
    "vld1.u8    {d0[0]}, [%0],r4  \n"   // load up only 2 pixels of data to
    "vld1.u8    {d0[1]}, [%0],r4  \n"   //  represent the entire 8x4 block

    "vst1.u16   {d0[0]}, [%1]!    \n"

    "subs       %2, #2            \n"   // dst_width -= 2
    "bhi        1b                \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width)         // %2
    :
    : "r4", "q0", "q1", "memory", "cc"
  );
}

static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
    "1:                           \n"
    "mov        r4, %0            \n"
    "vld1.u8    {d0}, [r4],%3     \n"   // load up 8x4 block of input data
    "vld1.u8    {d1}, [r4],%3     \n"
    "vld1.u8    {d2}, [r4],%3     \n"
    "vld1.u8    {d3}, [r4]        \n"

    // data is loaded up into q0 and q1
    // q0 = a00 a01 a02 a03 b00 b01 b02 b03 a10 a11 a12 a13 b10 b11 b12 b13
    // q1 = a20 a21 a22 a23 b20 b21 b22 b23 a30 a31 a32 a33 b30 b31 b32 b33
    // q0 = a00+a01 a02+a03 b00+b01 b02+b03 a10+a11 a12+a13 b10+b11 b12+b13
    "vpaddl.u8  q0, q0            \n"

    // d0 = a00+a01+a20+a21 a02+a03+a22+a23 b00+b01+b20+b21 b02+b03+b22+b23
    // d1 = a10+a11+a30+a31 a12+a13+a32+a33 b10+b11+b30+b31 b12+b13+b32+b33
    "vpadal.u8  q0, q1            \n"

    // d0 = a00+a01+a20+a21+a02+a03+a22+a23 b00+b01+b20+b21+b02+b03+b22+b23
    // d1 = a10+a11+a30+a31+a12+a13+a32+a33 b10+b11+b30+b31+b12+b13+b32+b33
    "vpaddl.u16 q0, q0            \n"


    // d0 = a00+a01+a20+a21+a02+a03+a22+a23+a10+a11+a30+a31+a12+a13+a32+a33
    //      b00+b01+b20+b21+b02+b03+b22+b23+b10+b11+b30+b31+b12+b13+b32+b33
    "vadd.u32   d0, d1            \n"

    "vrshr.u32  d0, d0, #4        \n"   // divide by 16 w/rounding

    "vst1.u8    {d0[0]}, [%1]!    \n"
    "vst1.u8    {d0[4]}, [%1]!    \n"

    "add        %0, #8            \n"   // move src pointer to next 8 pixels
    "subs       %2, #2            \n"   // dst_width -= 2
    "bhi        1b                \n"

    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width)         // %2
    : "r"(src_stride)         // %3
    : "r4", "q0", "q1", "memory", "cc"
  );
}
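
// Illustrative scalar equivalent (added for clarity, not part of the
// original source): each output pixel is the rounded average of a 4x4 box,
// which is what the paired-add/accumulate sequence above computes.
//   for (int x = 0; x < dst_width; ++x) {
//     uint32 sum = 0;
//     for (int i = 0; i < 4; ++i)
//       for (int j = 0; j < 4; ++j)
//         sum += src_ptr[i * src_stride + 4 * x + j];
//     dst_ptr[x] = (sum + 8) >> 4;  // divide by 16 with rounding (vrshr #4)
//   }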

/**
 * SSE2 downscalers with interpolation.
 *
 * Provided by Frank Barchard (fbarchard@google.com)
 *
 */

// Constants for SSE2 code
#elif (defined(WIN32) || defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !TARGET_IPHONE_SIMULATOR
#if defined(_MSC_VER)
#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
#elif defined(OSX)
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#else
#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
#endif

// Offsets for source bytes 0 to 9
extern "C" TALIGN16(const uint8, shuf0[16]) =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
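
// Note (added for clarity): in these tables, 128 sets the high bit of the
// pshufb index byte, which makes pshufb write 0 to that destination byte.
// So shuf0 above gathers source bytes 0,1,3,4,5,7,8,9 into the low 8 bytes
// of the result and zeroes the high 8 bytes.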

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
extern "C" TALIGN16(const uint8, shuf1[16]) =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
extern "C" TALIGN16(const uint8, shuf2[16]) =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
extern "C" TALIGN16(const uint8, shuf01[16]) =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
extern "C" TALIGN16(const uint8, shuf11[16]) =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
extern "C" TALIGN16(const uint8, shuf21[16]) =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
extern "C" TALIGN16(const uint8, madd01[16]) =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
extern "C" TALIGN16(const uint8, madd11[16]) =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
extern "C" TALIGN16(const uint8, madd21[16]) =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 scalers (added before the shift by 2)
extern "C" TALIGN16(const int16, round34[8]) =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

extern "C" TALIGN16(const uint8, shuf38a[16]) =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

extern "C" TALIGN16(const uint8, shuf38b[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
extern "C" TALIGN16(const uint8, shufac0[16]) =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
extern "C" TALIGN16(const uint8, shufac3[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
extern "C" TALIGN16(const uint16, scaleac3[8]) =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab0[16]) =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab1[16]) =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab2[16]) =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
extern "C" TALIGN16(const uint16, scaleab2[8]) =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
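
// Note (added for clarity): scaleac3/scaleab2 implement division by a box
// size with a 16-bit fixed-point reciprocal and pmulhuw, which keeps the
// high 16 bits of an unsigned 16x16 multiply. For a box sum s and divisor d:
//   (s * (65536 / d)) >> 16  ~=  s / d
// e.g. a full 3x3 box sum (at most 9 * 255 = 2295) divided by 9 stays in
// range; the truncated reciprocal can be off by one in the last step.
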
#endif

#if defined(WIN32) && !defined(COVERAGE_ENABLED)

#define HAS_SCALEROWDOWN2_SSE2
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm7
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop

    ret
  }
}
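
// Illustrative scalar equivalent (added for clarity, not part of the
// original source): the mask+pack sequence point-samples the even pixels.
//   for (int x = 0; x < dst_width; ++x)
//     dst_ptr[x] = src_ptr[2 * x];
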
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop

    pop        esi
    ret
  }
}

#define HAS_SCALEROWDOWN4_SSE2
// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x000000ff
    psrld      xmm7, 24

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi,  [esi + 32]
    pand       xmm0, xmm7
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 8
    ja         wloop

    popad
    ret
  }
}

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8
    lea        edx, [ebx + ebx * 2]  // src_stride * 3

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    movdqa     xmm2, [esi + ebx]
    movdqa     xmm3, [esi + ebx + 16]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, [esi + ebx * 2]
    movdqa     xmm3, [esi + ebx * 2 + 16]
    movdqa     xmm4, [esi + edx]
    movdqa     xmm5, [esi + edx + 16]
    lea        esi, [esi + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 8
    ja         wloop

    popad
    ret
  }
}

#define HAS_SCALEROWDOWN8_SSE2
// Point samples 32 pixels to 4 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask isolating 1 src 8 bytes
    psrlq      xmm7, 56

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi,  [esi + 32]
    pand       xmm0, xmm7
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1  // 32->16
    packuswb   xmm0, xmm0  // 16->8
    packuswb   xmm0, xmm0  // 8->4
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 4
    ja         wloop

    popad
    ret
  }
}

// Blends 32x8 rectangle to 4x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    lea        edx, [ebx + ebx * 2]  // src_stride * 3
    pxor       xmm7, xmm7

  wloop:
    movdqa     xmm0, [esi]           // average 8 rows to 1
    movdqa     xmm1, [esi + 16]
    movdqa     xmm2, [esi + ebx]
    movdqa     xmm3, [esi + ebx + 16]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    movdqa     xmm2, [esi + ebx * 2]
    movdqa     xmm3, [esi + ebx * 2 + 16]
    movdqa     xmm4, [esi + edx]
    movdqa     xmm5, [esi + edx + 16]
    lea        ebp, [esi + ebx * 4]
    lea        esi, [esi + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, [ebp]
    movdqa     xmm3, [ebp + 16]
    movdqa     xmm4, [ebp + ebx]
    movdqa     xmm5, [ebp + ebx + 16]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    movdqa     xmm4, [ebp + ebx * 2]
    movdqa     xmm5, [ebp + ebx * 2 + 16]
    movdqa     xmm6, [ebp + edx]
    pavgb      xmm4, xmm6
    movdqa     xmm6, [ebp + edx + 16]
    pavgb      xmm5, xmm6
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    psadbw     xmm0, xmm7            // average 32 pixels to 4
    psadbw     xmm1, xmm7
    pshufd     xmm0, xmm0, 0xd8      // x1x0 -> xx01
    pshufd     xmm1, xmm1, 0x8d      // x3x2 -> 32xx
    por        xmm0, xmm1            //      -> 3201
    psrlw      xmm0, 3
    packuswb   xmm0, xmm0
    packuswb   xmm0, xmm0
    movd       dword ptr [edi], xmm0

    lea        edi, [edi + 4]
    sub        ecx, 4
    ja         wloop

    popad
    ret
  }
}
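
// Note (added for clarity): the cascade of pavgb above averages the 8 rows
// pairwise, so each byte holds a (rounded) row average. psadbw against zero
// then sums each group of 8 bytes into a 16-bit lane, and psrlw 3 divides
// by 8, giving the mean of an 8x8 box per output pixel. Because pavgb
// rounds at every step, this is an approximation of the exact 64-pixel box
// sum, traded off for speed.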

#define HAS_SCALEROWDOWN34_SSSE3
// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm3, _shuf0
    movdqa     xmm4, _shuf1
    movdqa     xmm5, _shuf2

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + 16]
    lea        esi,  [esi + 32]
    movdqa     xmm1, xmm2
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edi], xmm0
    movq       qword ptr [edi + 8], xmm1
    movq       qword ptr [edi + 16], xmm2
    lea        edi, [edi + 24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 round34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _round34

  wloop:
    movdqa     xmm0, [esi]           // pixels 0..7
    movdqa     xmm1, [esi+ebx]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    movdqu     xmm0, [esi+8]         // pixels 8..15
    movdqu     xmm1, [esi+ebx+8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+8], xmm0
    movdqa     xmm0, [esi+16]        // pixels 16..23
    movdqa     xmm1, [esi+ebx+16]
    lea        esi, [esi+32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, _madd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+16], xmm0
    lea        edi, [edi+24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}
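
// Note (added for clarity): the shuf01/madd01 table pairs implement the
// classic 3/4 filter. pshufb pairs up neighboring bytes, pmaddubsw applies
// the {3,1}, {2,2}, {1,3} weights, so every 4 source pixels s0..s3 yield
// 3 outputs with rounding (+2, then >> 2):
//   d0 = (3 * s0 + s1 + 2) >> 2
//   d1 = (2 * s1 + 2 * s2 + 2) >> 2
//   d2 = (s2 + 3 * s3 + 2) >> 2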

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _round34

  wloop:
    movdqa     xmm0, [esi]           // pixels 0..7
    movdqa     xmm1, [esi+ebx]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    movdqu     xmm0, [esi+8]         // pixels 8..15
    movdqu     xmm1, [esi+ebx+8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+8], xmm0
    movdqa     xmm0, [esi+16]        // pixels 16..23
    movdqa     xmm1, [esi+ebx+16]
    lea        esi, [esi+32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, _madd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+16], xmm0
    lea        edi, [edi+24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}

#define HAS_SCALEROWDOWN38_SSSE3
// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked)
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        edx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm5, _shuf38a
    movdqa     xmm6, _shuf38b
    pxor       xmm7, xmm7

  xloop:
    movdqa     xmm0, [esi]           // 16 pixels -> 0,1,2,3,4,5
    movdqa     xmm1, [esi + 16]      // 16 pixels -> 6,7,8,9,10,11
    lea        esi, [esi + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm6
    paddusb    xmm0, xmm1

    movq       qword ptr [edi], xmm0 // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edi + 8], xmm1
    lea        edi, [edi + 12]
    sub        ecx, 12
    ja         xloop

    popad
    ret
  }
}
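
// Note (added for clarity): per the shuf38a/shuf38b tables, each group of
// 16 source pixels yields 6 outputs taken from source columns 0, 3, 6, 8,
// 11 and 14, i.e. nearest-neighbor sampling at a 3/8 ratio (16 * 3/8 = 6).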

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        edx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm4, _shufac0
    movdqa     xmm5, _shufac3
    movdqa     xmm6, _scaleac3
    pxor       xmm7, xmm7

  xloop:
    movdqa     xmm0, [esi]           // sum up 3 rows into xmm0/1
    movdqa     xmm2, [esi + edx]
    movhlps    xmm1, xmm0
    movhlps    xmm3, xmm2
    punpcklbw  xmm0, xmm7
    punpcklbw  xmm1, xmm7
    punpcklbw  xmm2, xmm7
    punpcklbw  xmm3, xmm7
    paddusw    xmm0, xmm2
    paddusw    xmm1, xmm3
    movdqa     xmm2, [esi + edx * 2]
    lea        esi, [esi + 16]
    movhlps    xmm3, xmm2
    punpcklbw  xmm2, xmm7
    punpcklbw  xmm3, xmm7
    paddusw    xmm0, xmm2
    paddusw    xmm1, xmm3

    movdqa     xmm2, xmm0            // 8 pixels -> 0,1,2 of xmm2
    psrldq     xmm0, 2
    paddusw    xmm2, xmm0
    psrldq     xmm0, 2
    paddusw    xmm2, xmm0
    pshufb     xmm2, xmm4

    movdqa     xmm3, xmm1            // 8 pixels -> 3,4,5 of xmm2
    psrldq     xmm1, 2
    paddusw    xmm3, xmm1
    psrldq     xmm1, 2
    paddusw    xmm3, xmm1
    pshufb     xmm3, xmm5
    paddusw    xmm2, xmm3

    pmulhuw    xmm2, xmm6            // divide by 9,9,6, 9,9,6
    packuswb   xmm2, xmm2

    movd       [edi], xmm2           // write 6 pixels
    pextrw     eax, xmm2, 2
    mov        [edi + 4], ax
    lea        edi, [edi + 6]
    sub        ecx, 6
    ja         xloop

    popad
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        edx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm4, _shufab0
    movdqa     xmm5, _shufab1
    movdqa     xmm6, _shufab2
    movdqa     xmm7, _scaleab2

  xloop:
    movdqa     xmm2, [esi]           // average 2 rows into xmm2
    pavgb      xmm2, [esi + edx]
    lea        esi, [esi + 16]

    movdqa     xmm0, xmm2            // 16 pixels -> 0,1,2,3,4,5 of xmm0
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmm2
    pshufb     xmm1, xmm5
    paddusw    xmm0, xmm1
    pshufb     xmm2, xmm6
    paddusw    xmm0, xmm2

    pmulhuw    xmm0, xmm7            // divide by 3,3,2, 3,3,2
    packuswb   xmm0, xmm0

    movd       [edi], xmm0           // write 6 pixels
    pextrw     eax, xmm0, 2
    mov        [edi + 4], ax
    lea        edi, [edi + 6]
    sub        ecx, 6
    ja         xloop

    popad
    ret
  }
}

#define HAS_SCALEADDROWS_SSE2

// Reads 16xN bytes and produces 16 shorts at a time.
__declspec(naked)
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                              uint16* dst_ptr, int src_width,
                              int src_height) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        edx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    mov        ebx, [esp + 32 + 20]  // height
    pxor       xmm7, xmm7
    dec        ebx

  xloop:
    // first row
    movdqa     xmm2, [esi]
    lea        eax, [esi + edx]
    movhlps    xmm3, xmm2
    mov        ebp, ebx
    punpcklbw  xmm2, xmm7
    punpcklbw  xmm3, xmm7

    // sum remaining rows
  yloop:
    movdqa     xmm0, [eax]       // read 16 pixels
    lea        eax, [eax + edx]  // advance to next row
    movhlps    xmm1, xmm0
    punpcklbw  xmm0, xmm7
    punpcklbw  xmm1, xmm7
    paddusw    xmm2, xmm0        // sum 16 words
    paddusw    xmm3, xmm1
    sub        ebp, 1
    ja         yloop

    movdqa     [edi], xmm2
    movdqa     [edi + 16], xmm3
    lea        edi, [edi + 32]
    lea        esi, [esi + 16]

    sub        ecx, 16
    ja         xloop

    popad
    ret
  }
}
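
// Illustrative use (added for clarity, not part of the original source):
// ScaleAddRows_SSE2 produces per-column sums over src_height rows, which a
// box filter then divides by the box height, e.g. with the same fixed-point
// reciprocal trick used elsewhere in this file:
//   ALIGN16(uint16 sums[16]);               // 16-byte aligned for movdqa
//   ScaleAddRows_SSE2(src, stride, sums, 16, N);
//   // dst[x] = sums[x] / N   (or (sums[x] * (65536 / N)) >> 16)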

// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
#define HAS_SCALEFILTERROWS_SSE2
__declspec(naked)
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                 int src_stride, int dst_width,
                                 int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    cmp        eax, 0
    je         xloop1
    cmp        eax, 128
    je         xloop2

    movd       xmm6, eax            // xmm6 = y fraction
    punpcklwd  xmm6, xmm6
    pshufd     xmm6, xmm6, 0
    neg        eax                  // xmm5 = 256 - y fraction
    add        eax, 256
    movd       xmm5, eax
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0
    pxor       xmm7, xmm7

  xloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    punpcklbw  xmm0, xmm7
    punpcklbw  xmm2, xmm7
    punpckhbw  xmm1, xmm7
    punpckhbw  xmm3, xmm7
    pmullw     xmm0, xmm5           // scale row 0
    pmullw     xmm1, xmm5
    pmullw     xmm2, xmm6           // scale row 1
    pmullw     xmm3, xmm6
    paddusw    xmm0, xmm2           // sum rows
    paddusw    xmm1, xmm3
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop1:
    movdqa     xmm0, [esi]
    lea        esi, [esi + 16]
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop1

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop2:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    pavgb      xmm0, xmm2
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop2

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret
  }
}
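
// Note (added for clarity): per 16-bit lane the main loop computes
//   dst = (row0 * (256 - f) + row1 * f) >> 8,  with f = source_y_fraction,
// and the two special cases are handled separately: f == 0 is a plain copy
// and f == 128 (the midpoint) uses pavgb. The trailing byte store writes
// one pixel past dst_width, duplicating the last output pixel.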

// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
#define HAS_SCALEFILTERROWS_SSSE3
__declspec(naked)
static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                  int src_stride, int dst_width,
                                  int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    cmp        eax, 0
    je         xloop1
    cmp        eax, 128
    je         xloop2

    shr        eax, 1
    mov        ah, al
    neg        al
    add        al, 128
    movd       xmm7, eax
    punpcklwd  xmm7, xmm7
    pshufd     xmm7, xmm7, 0

  xloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    psrlw      xmm0, 7
    psrlw      xmm1, 7
    packuswb   xmm0, xmm1
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop1:
    movdqa     xmm0, [esi]
    lea        esi, [esi + 16]
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop1

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop2:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    pavgb      xmm0, xmm2
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop2

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  }
}
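
// Note (added for clarity): the setup above packs two 7-bit weights into
// each 16-bit lane: al = 128 - f/2 (weight for row 0) and ah = f/2 (weight
// for row 1). After punpcklbw interleaves the two rows byte-wise,
// pmaddubsw computes row0 * (128 - f/2) + row1 * (f/2) per pixel and
// psrlw 7 divides by 128 -- the same bilinear blend as the SSE2 version,
// at half the fraction precision.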

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                    int dst_width) {
  __asm {
    mov        edx, [esp + 4]    // dst_ptr
    mov        eax, [esp + 8]    // src_ptr
    mov        ecx, [esp + 12]   // dst_width
    movdqa     xmm1, _round34
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _madd21

  wloop:
    movdqa     xmm0, [eax]           // pixels 0..7
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm1
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax+8]         // pixels 8..15
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm1
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx+8], xmm0
    movdqa     xmm0, [eax+16]        // pixels 16..23
    lea        eax, [eax+32]
    pshufb     xmm0, xmm4
    pmaddubsw  xmm0, xmm7
    paddsw     xmm0, xmm1
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx+16], xmm0
    lea        edx, [edx+24]
    sub        ecx, 24
    ja         wloop
    ret
  }
}

#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
#define HAS_SCALEROWDOWN2_SSE2
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pand       %%xmm7,%%xmm0\n"
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory"
);
}

static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "movdqa     (%0,%3,1),%%xmm2\n"
  "movdqa     0x10(%0,%3,1),%%xmm3\n"
  "lea        0x20(%0),%0\n"
  "pavgb      %%xmm2,%%xmm0\n"
  "pavgb      %%xmm3,%%xmm1\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "psrlw      $0x8,%%xmm0\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "psrlw      $0x8,%%xmm1\n"
  "pand       %%xmm7,%%xmm2\n"
  "pand       %%xmm7,%%xmm3\n"
  "pavgw      %%xmm2,%%xmm0\n"
  "pavgw      %%xmm3,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"(static_cast<intptr_t>(src_stride))   // %3
  : "memory"
);
}

#define HAS_SCALEROWDOWN4_SSE2
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrld      $0x18,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pand       %%xmm7,%%xmm0\n"
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "lea        0x8(%1),%1\n"
  "sub        $0x8,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory"
);
}

static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  intptr_t temp = 0;
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlw      $0x8,%%xmm7\n"
  "lea        (%4,%4,2),%3\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "movdqa     (%0,%4,1),%%xmm2\n"
  "movdqa     0x10(%0,%4,1),%%xmm3\n"
  "pavgb      %%xmm2,%%xmm0\n"
  "pavgb      %%xmm3,%%xmm1\n"
  "movdqa     (%0,%4,2),%%xmm2\n"
  "movdqa     0x10(%0,%4,2),%%xmm3\n"
  "movdqa     (%0,%3,1),%%xmm4\n"
  "movdqa     0x10(%0,%3,1),%%xmm5\n"
  "lea        0x20(%0),%0\n"
  "pavgb      %%xmm4,%%xmm2\n"
  "pavgb      %%xmm2,%%xmm0\n"
  "pavgb      %%xmm5,%%xmm3\n"
  "pavgb      %%xmm3,%%xmm1\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "psrlw      $0x8,%%xmm0\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "psrlw      $0x8,%%xmm1\n"
  "pand       %%xmm7,%%xmm2\n"
  "pand       %%xmm7,%%xmm3\n"
  "pavgw      %%xmm2,%%xmm0\n"
  "pavgw      %%xmm3,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "psrlw      $0x8,%%xmm0\n"
  "pand       %%xmm7,%%xmm2\n"
  "pavgw      %%xmm2,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "lea        0x8(%1),%1\n"
  "sub        $0x8,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(temp)         // %3
  : "r"(static_cast<intptr_t>(src_stride))    // %4
  : "memory"
);
}

#define HAS_SCALEROWDOWN8_SSE2
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlq      $0x38,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pand       %%xmm7,%%xmm0\n"
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movd       %%xmm0,(%1)\n"
  "lea        0x4(%1),%1\n"
  "sub        $0x4,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory"
);
}

#if defined(__i386__)
extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown8Int_SSE2\n"
"_ScaleRowDown8Int_SSE2:\n"
#else
    ".global ScaleRowDown8Int_SSE2\n"
"ScaleRowDown8Int_SSE2:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x28(%esp),%ebx\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "lea    (%ebx,%ebx,2),%edx\n"
    "pxor   %xmm7,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa 0x10(%esi),%xmm1\n"
    "movdqa (%esi,%ebx,1),%xmm2\n"
    "movdqa 0x10(%esi,%ebx,1),%xmm3\n"
    "pavgb  %xmm2,%xmm0\n"
    "pavgb  %xmm3,%xmm1\n"
    "movdqa (%esi,%ebx,2),%xmm2\n"
    "movdqa 0x10(%esi,%ebx,2),%xmm3\n"
    "movdqa (%esi,%edx,1),%xmm4\n"
    "movdqa 0x10(%esi,%edx,1),%xmm5\n"
    "lea    (%esi,%ebx,4),%ebp\n"
    "lea    0x20(%esi),%esi\n"
    "pavgb  %xmm4,%xmm2\n"
    "pavgb  %xmm5,%xmm3\n"
    "pavgb  %xmm2,%xmm0\n"
    "pavgb  %xmm3,%xmm1\n"
    "movdqa 0x0(%ebp),%xmm2\n"
    "movdqa 0x10(%ebp),%xmm3\n"
    "movdqa 0x0(%ebp,%ebx,1),%xmm4\n"
    "movdqa 0x10(%ebp,%ebx,1),%xmm5\n"
    "pavgb  %xmm4,%xmm2\n"
    "pavgb  %xmm5,%xmm3\n"
    "movdqa 0x0(%ebp,%ebx,2),%xmm4\n"
    "movdqa 0x10(%ebp,%ebx,2),%xmm5\n"
    "movdqa 0x0(%ebp,%edx,1),%xmm6\n"
    "pavgb  %xmm6,%xmm4\n"
    "movdqa 0x10(%ebp,%edx,1),%xmm6\n"
    "pavgb  %xmm6,%xmm5\n"
    "pavgb  %xmm4,%xmm2\n"
    "pavgb  %xmm5,%xmm3\n"
    "pavgb  %xmm2,%xmm0\n"
    "pavgb  %xmm3,%xmm1\n"
    "psadbw %xmm7,%xmm0\n"
    "psadbw %xmm7,%xmm1\n"
    "pshufd $0xd8,%xmm0,%xmm0\n"
    "pshufd $0x8d,%xmm1,%xmm1\n"
    "por    %xmm1,%xmm0\n"
    "psrlw  $0x3,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movd   %xmm0,(%edi)\n"
    "lea    0x4(%edi),%edi\n"
    "sub    $0x4,%ecx\n"
    "ja     1b\n"
    "popa\n"
    "ret\n"
);

// fpic is used for magiccam plugin
#if !defined(__PIC__)
#define HAS_SCALEROWDOWN34_SSSE3
extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                     uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown34_SSSE3\n"
"_ScaleRowDown34_SSSE3:\n"
#else
    ".global ScaleRowDown34_SSSE3\n"
"ScaleRowDown34_SSSE3:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "movdqa _shuf0,%xmm3\n"
    "movdqa _shuf1,%xmm4\n"
    "movdqa _shuf2,%xmm5\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa 0x10(%esi),%xmm2\n"
    "lea    0x20(%esi),%esi\n"
    "movdqa %xmm2,%xmm1\n"
    "palignr $0x8,%xmm0,%xmm1\n"
    "pshufb %xmm3,%xmm0\n"
    "pshufb %xmm4,%xmm1\n"
    "pshufb %xmm5,%xmm2\n"
    "movq   %xmm0,(%edi)\n"
    "movq   %xmm1,0x8(%edi)\n"
    "movq   %xmm2,0x10(%edi)\n"
    "lea    0x18(%edi),%edi\n"
    "sub    $0x18,%ecx\n"
    "ja     1b\n"
    "popa\n"
    "ret\n"
);

extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                           uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown34_1_Int_SSSE3\n"
"_ScaleRowDown34_1_Int_SSSE3:\n"
#else
    ".global ScaleRowDown34_1_Int_SSSE3\n"
"ScaleRowDown34_1_Int_SSSE3:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x28(%esp),%ebp\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "movdqa _shuf01,%xmm2\n"
    "movdqa _shuf11,%xmm3\n"
    "movdqa _shuf21,%xmm4\n"
    "movdqa _madd01,%xmm5\n"
    "movdqa _madd11,%xmm6\n"
    "movdqa _round34,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa (%esi,%ebp),%xmm1\n"
    "pavgb  %xmm1,%xmm0\n"
    "pshufb %xmm2,%xmm0\n"
    "pmaddubsw %xmm5,%xmm0\n"
    "paddsw %xmm7,%xmm0\n"
    "psrlw  $0x2,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movq   %xmm0,(%edi)\n"
    "movdqu 0x8(%esi),%xmm0\n"
    "movdqu 0x8(%esi,%ebp),%xmm1\n"
    "pavgb  %xmm1,%xmm0\n"
    "pshufb %xmm3,%xmm0\n"
    "pmaddubsw %xmm6,%xmm0\n"
    "paddsw %xmm7,%xmm0\n"
    "psrlw  $0x2,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movq   %xmm0,0x8(%edi)\n"
    "movdqa 0x10(%esi),%xmm0\n"
    "movdqa 0x10(%esi,%ebp),%xmm1\n"
    "lea    0x20(%esi),%esi\n"
    "pavgb  %xmm1,%xmm0\n"
    "pshufb %xmm4,%xmm0\n"
    "movdqa  _madd21,%xmm1\n"
    "pmaddubsw %xmm1,%xmm0\n"
    "paddsw %xmm7,%xmm0\n"
    "psrlw  $0x2,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movq   %xmm0,0x10(%edi)\n"
    "lea    0x18(%edi),%edi\n"
    "sub    $0x18,%ecx\n"
    "ja     1b\n"

    "popa\n"
    "ret\n"
);

extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                           uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown34_0_Int_SSSE3\n"
"_ScaleRowDown34_0_Int_SSSE3:\n"
#else
    ".global ScaleRowDown34_0_Int_SSSE3\n"
"ScaleRowDown34_0_Int_SSSE3:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x28(%esp),%ebp\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "movdqa _shuf01,%xmm2\n"
    "movdqa _shuf11,%xmm3\n"
    "movdqa _shuf21,%xmm4\n"
    "movdqa _madd01,%xmm5\n"
    "movdqa _madd11,%xmm6\n"
    "movdqa _round34,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa (%esi,%ebp,1),%xmm1\n"
    "pavgb  %xmm0,%xmm1\n"
    "pavgb  %xmm1,%xmm0\n"
    "pshufb %xmm2,%xmm0\n"
    "pmaddubsw %xmm5,%xmm0\n"
    "paddsw %xmm7,%xmm0\n"
    "psrlw  $0x2,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movq   %xmm0,(%edi)\n"
    "movdqu 0x8(%esi),%xmm0\n"
    "movdqu 0x8(%esi,%ebp,1),%xmm1\n"
    "pavgb  %xmm0,%xmm1\n"
    "pavgb  %xmm1,%xmm0\n"
    "pshufb %xmm3,%xmm0\n"
    "pmaddubsw %xmm6,%xmm0\n"
    "paddsw %xmm7,%xmm0\n"
    "psrlw  $0x2,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movq   %xmm0,0x8(%edi)\n"
    "movdqa 0x10(%esi),%xmm0\n"
    "movdqa 0x10(%esi,%ebp,1),%xmm1\n"
    "lea    0x20(%esi),%esi\n"
    "pavgb  %xmm0,%xmm1\n"
    "pavgb  %xmm1,%xmm0\n"
    "pshufb %xmm4,%xmm0\n"
    "movdqa  _madd21,%xmm1\n"
    "pmaddubsw %xmm1,%xmm0\n"
    "paddsw %xmm7,%xmm0\n"
    "psrlw  $0x2,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movq   %xmm0,0x10(%edi)\n"
    "lea    0x18(%edi),%edi\n"
    "sub    $0x18,%ecx\n"
    "ja     1b\n"
    "popa\n"
    "ret\n"
);

#define HAS_SCALEROWDOWN38_SSSE3
extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                     uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown38_SSSE3\n"
"_ScaleRowDown38_SSSE3:\n"
#else
    ".global ScaleRowDown38_SSSE3\n"
"ScaleRowDown38_SSSE3:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x28(%esp),%edx\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "movdqa _shuf38a ,%xmm5\n"
    "movdqa _shuf38b ,%xmm6\n"
    "pxor   %xmm7,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa 0x10(%esi),%xmm1\n"
    "lea    0x20(%esi),%esi\n"
    "pshufb %xmm5,%xmm0\n"
    "pshufb %xmm6,%xmm1\n"
    "paddusb %xmm1,%xmm0\n"
    "movq   %xmm0,(%edi)\n"
    "movhlps %xmm0,%xmm1\n"
    "movd   %xmm1,0x8(%edi)\n"
    "lea    0xc(%edi),%edi\n"
    "sub    $0xc,%ecx\n"
    "ja     1b\n"
    "popa\n"
    "ret\n"
);

extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                           uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown38_3_Int_SSSE3\n"
"_ScaleRowDown38_3_Int_SSSE3:\n"
#else
    ".global ScaleRowDown38_3_Int_SSSE3\n"
"ScaleRowDown38_3_Int_SSSE3:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x28(%esp),%edx\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "movdqa _shufac0,%xmm4\n"
    "movdqa _shufac3,%xmm5\n"
    "movdqa _scaleac3,%xmm6\n"
    "pxor   %xmm7,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa (%esi,%edx,1),%xmm2\n"
    "movhlps %xmm0,%xmm1\n"
    "movhlps %xmm2,%xmm3\n"
    "punpcklbw %xmm7,%xmm0\n"
    "punpcklbw %xmm7,%xmm1\n"
    "punpcklbw %xmm7,%xmm2\n"
    "punpcklbw %xmm7,%xmm3\n"
    "paddusw %xmm2,%xmm0\n"
    "paddusw %xmm3,%xmm1\n"
    "movdqa (%esi,%edx,2),%xmm2\n"
    "lea    0x10(%esi),%esi\n"
    "movhlps %xmm2,%xmm3\n"
    "punpcklbw %xmm7,%xmm2\n"
    "punpcklbw %xmm7,%xmm3\n"
    "paddusw %xmm2,%xmm0\n"
    "paddusw %xmm3,%xmm1\n"
    "movdqa %xmm0,%xmm2\n"
    "psrldq $0x2,%xmm0\n"
    "paddusw %xmm0,%xmm2\n"
    "psrldq $0x2,%xmm0\n"
    "paddusw %xmm0,%xmm2\n"
    "pshufb %xmm4,%xmm2\n"
    "movdqa %xmm1,%xmm3\n"
    "psrldq $0x2,%xmm1\n"
    "paddusw %xmm1,%xmm3\n"
    "psrldq $0x2,%xmm1\n"
    "paddusw %xmm1,%xmm3\n"
    "pshufb %xmm5,%xmm3\n"
    "paddusw %xmm3,%xmm2\n"
    "pmulhuw %xmm6,%xmm2\n"
    "packuswb %xmm2,%xmm2\n"
    "movd   %xmm2,(%edi)\n"
    "pextrw $0x2,%xmm2,%eax\n"
    "mov    %ax,0x4(%edi)\n"
    "lea    0x6(%edi),%edi\n"
    "sub    $0x6,%ecx\n"
    "ja     1b\n"
    "popa\n"
    "ret\n"
);

extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                           uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown38_2_Int_SSSE3\n"
"_ScaleRowDown38_2_Int_SSSE3:\n"
#else
    ".global ScaleRowDown38_2_Int_SSSE3\n"
"ScaleRowDown38_2_Int_SSSE3:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x28(%esp),%edx\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "movdqa _shufab0,%xmm4\n"
    "movdqa _shufab1,%xmm5\n"
    "movdqa _shufab2,%xmm6\n"
    "movdqa _scaleab2,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm2\n"
    "pavgb  (%esi,%edx,1),%xmm2\n"
    "lea    0x10(%esi),%esi\n"
    "movdqa %xmm2,%xmm0\n"
    "pshufb %xmm4,%xmm0\n"
    "movdqa %xmm2,%xmm1\n"
    "pshufb %xmm5,%xmm1\n"
    "paddusw %xmm1,%xmm0\n"
    "pshufb %xmm6,%xmm2\n"
    "paddusw %xmm2,%xmm0\n"
    "pmulhuw %xmm7,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movd   %xmm0,(%edi)\n"
    "pextrw $0x2,%xmm0,%eax\n"
    "mov    %ax,0x4(%edi)\n"
    "lea    0x6(%edi),%edi\n"
    "sub    $0x6,%ecx\n"
    "ja     1b\n"
    "popa\n"
    "ret\n"
);
#endif // __PIC__
1667 
1668 #define HAS_SCALEADDROWS_SSE2
1669 extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
1670                                   uint16* dst_ptr, int src_width,
1671                                   int src_height);
1672   asm(
1673     ".text\n"
1674 #if defined(OSX)
1675     ".globl _ScaleAddRows_SSE2\n"
1676 "_ScaleAddRows_SSE2:\n"
1677 #else
1678     ".global ScaleAddRows_SSE2\n"
1679 "ScaleAddRows_SSE2:\n"
1680 #endif
1681     "pusha\n"
1682     "mov    0x24(%esp),%esi\n"
1683     "mov    0x28(%esp),%edx\n"
1684     "mov    0x2c(%esp),%edi\n"
1685     "mov    0x30(%esp),%ecx\n"
1686     "mov    0x34(%esp),%ebx\n"
1687     "pxor   %xmm7,%xmm7\n"
1688 
1689 "1:"
1690     "movdqa (%esi),%xmm2\n"
1691     "lea    (%esi,%edx,1),%eax\n"
1692     "movhlps %xmm2,%xmm3\n"
1693     "lea    -0x1(%ebx),%ebp\n"
1694     "punpcklbw %xmm7,%xmm2\n"
1695     "punpcklbw %xmm7,%xmm3\n"
1696 
1697 "2:"
1698     "movdqa (%eax),%xmm0\n"
1699     "lea    (%eax,%edx,1),%eax\n"
1700     "movhlps %xmm0,%xmm1\n"
1701     "punpcklbw %xmm7,%xmm0\n"
1702     "punpcklbw %xmm7,%xmm1\n"
1703     "paddusw %xmm0,%xmm2\n"
1704     "paddusw %xmm1,%xmm3\n"
1705     "sub    $0x1,%ebp\n"
1706     "ja     2b\n"
1707 
1708     "movdqa %xmm2,(%edi)\n"
1709     "movdqa %xmm3,0x10(%edi)\n"
1710     "lea    0x20(%edi),%edi\n"
1711     "lea    0x10(%esi),%esi\n"
1712     "sub    $0x10,%ecx\n"
1713     "ja     1b\n"
1714     "popa\n"
1715     "ret\n"
1716 );
1717 
1718 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
1719 #define HAS_SCALEFILTERROWS_SSE2
1720 extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
1721                                      const uint8* src_ptr, int src_stride,
1722                                      int dst_width, int source_y_fraction);
1723   asm(
1724     ".text\n"
1725 #if defined(OSX)
1726     ".globl _ScaleFilterRows_SSE2\n"
1727 "_ScaleFilterRows_SSE2:\n"
1728 #else
1729     ".global ScaleFilterRows_SSE2\n"
1730 "ScaleFilterRows_SSE2:\n"
1731 #endif
1732     "push   %esi\n"
1733     "push   %edi\n"
1734     "mov    0xc(%esp),%edi\n"
1735     "mov    0x10(%esp),%esi\n"
1736     "mov    0x14(%esp),%edx\n"
1737     "mov    0x18(%esp),%ecx\n"
1738     "mov    0x1c(%esp),%eax\n"
1739     "cmp    $0x0,%eax\n"
1740     "je     2f\n"
1741     "cmp    $0x80,%eax\n"
1742     "je     3f\n"
1743     "movd   %eax,%xmm6\n"
1744     "punpcklwd %xmm6,%xmm6\n"
1745     "pshufd $0x0,%xmm6,%xmm6\n"
1746     "neg    %eax\n"
1747     "add    $0x100,%eax\n"
1748     "movd   %eax,%xmm5\n"
1749     "punpcklwd %xmm5,%xmm5\n"
1750     "pshufd $0x0,%xmm5,%xmm5\n"
1751     "pxor   %xmm7,%xmm7\n"
1752 
1753 "1:"
1754     "movdqa (%esi),%xmm0\n"
1755     "movdqa (%esi,%edx,1),%xmm2\n"
1756     "lea    0x10(%esi),%esi\n"
1757     "movdqa %xmm0,%xmm1\n"
1758     "movdqa %xmm2,%xmm3\n"
1759     "punpcklbw %xmm7,%xmm0\n"
1760     "punpcklbw %xmm7,%xmm2\n"
1761     "punpckhbw %xmm7,%xmm1\n"
1762     "punpckhbw %xmm7,%xmm3\n"
1763     "pmullw %xmm5,%xmm0\n"
1764     "pmullw %xmm5,%xmm1\n"
1765     "pmullw %xmm6,%xmm2\n"
1766     "pmullw %xmm6,%xmm3\n"
1767     "paddusw %xmm2,%xmm0\n"
1768     "paddusw %xmm3,%xmm1\n"
1769     "psrlw  $0x8,%xmm0\n"
1770     "psrlw  $0x8,%xmm1\n"
1771     "packuswb %xmm1,%xmm0\n"
1772     "movdqa %xmm0,(%edi)\n"
1773     "lea    0x10(%edi),%edi\n"
1774     "sub    $0x10,%ecx\n"
1775     "ja     1b\n"
1776     "mov    -0x1(%edi),%al\n"
1777     "mov    %al,(%edi)\n"
1778     "pop    %edi\n"
1779     "pop    %esi\n"
1780     "ret\n"
1781 
1782 "2:"
1783     "movdqa (%esi),%xmm0\n"
1784     "lea    0x10(%esi),%esi\n"
1785     "movdqa %xmm0,(%edi)\n"
1786     "lea    0x10(%edi),%edi\n"
1787     "sub    $0x10,%ecx\n"
1788     "ja     2b\n"
1789 
1790     "mov    -0x1(%edi),%al\n"
1791     "mov    %al,(%edi)\n"
1792     "pop    %edi\n"
1793     "pop    %esi\n"
1794     "ret\n"
1795 
1796 "3:"
1797     "movdqa (%esi),%xmm0\n"
1798     "movdqa (%esi,%edx,1),%xmm2\n"
1799     "lea    0x10(%esi),%esi\n"
1800     "pavgb  %xmm2,%xmm0\n"
1801     "movdqa %xmm0,(%edi)\n"
1802     "lea    0x10(%edi),%edi\n"
1803     "sub    $0x10,%ecx\n"
1804     "ja     3b\n"
1805 
1806     "mov    -0x1(%edi),%al\n"
1807     "mov    %al,(%edi)\n"
1808     "pop    %edi\n"
1809     "pop    %esi\n"
1810     "ret\n"
1811 );
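// Reading the block above: the three numbered loops dispatch on
// source_y_fraction. Label 2 copies row 0 unchanged (fraction 0), label 3
// averages the two rows with pavgb (fraction exactly 128/256), and label 1
// is the general case, weighting the rows by (256 - f) and f in 8.8 fixed
// point and shifting right by 8. Every path ends by replicating the last
// pixel one byte past the row, which lets a following column filter read
// src[xi + 1] safely (the bilinear row buffer further down is sized
// kMaxInputWidth + 1 for this reason). ScaleFilterRows_C later in this file
// performs the same computation in plain C.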
1812 
1813 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
1814 #define HAS_SCALEFILTERROWS_SSSE3
1815 extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
1816                                       const uint8* src_ptr, int src_stride,
1817                                       int dst_width, int source_y_fraction);
1818   asm(
1819     ".text\n"
1820 #if defined(OSX)
1821     ".globl _ScaleFilterRows_SSSE3\n"
1822 "_ScaleFilterRows_SSSE3:\n"
1823 #else
1824     ".global ScaleFilterRows_SSSE3\n"
1825 "ScaleFilterRows_SSSE3:\n"
1826 #endif
1827     "push   %esi\n"
1828     "push   %edi\n"
1829     "mov    0xc(%esp),%edi\n"
1830     "mov    0x10(%esp),%esi\n"
1831     "mov    0x14(%esp),%edx\n"
1832     "mov    0x18(%esp),%ecx\n"
1833     "mov    0x1c(%esp),%eax\n"
1834     "cmp    $0x0,%eax\n"
1835     "je     2f\n"
1836     "cmp    $0x80,%eax\n"
1837     "je     3f\n"
1838     "shr    %eax\n"
1839     "mov    %al,%ah\n"
1840     "neg    %al\n"
1841     "add    $0x80,%al\n"
1842     "movd   %eax,%xmm7\n"
1843     "punpcklwd %xmm7,%xmm7\n"
1844     "pshufd $0x0,%xmm7,%xmm7\n"
1845 
1846 "1:"
1847     "movdqa (%esi),%xmm0\n"
1848     "movdqa (%esi,%edx,1),%xmm2\n"
1849     "lea    0x10(%esi),%esi\n"
1850     "movdqa %xmm0,%xmm1\n"
1851     "punpcklbw %xmm2,%xmm0\n"
1852     "punpckhbw %xmm2,%xmm1\n"
1853     "pmaddubsw %xmm7,%xmm0\n"
1854     "pmaddubsw %xmm7,%xmm1\n"
1855     "psrlw  $0x7,%xmm0\n"
1856     "psrlw  $0x7,%xmm1\n"
1857     "packuswb %xmm1,%xmm0\n"
1858     "movdqa %xmm0,(%edi)\n"
1859     "lea    0x10(%edi),%edi\n"
1860     "sub    $0x10,%ecx\n"
1861     "ja     1b\n"
1862     "mov    -0x1(%edi),%al\n"
1863     "mov    %al,(%edi)\n"
1864     "pop    %edi\n"
1865     "pop    %esi\n"
1866     "ret\n"
1867 
1868 "2:"
1869     "movdqa (%esi),%xmm0\n"
1870     "lea    0x10(%esi),%esi\n"
1871     "movdqa %xmm0,(%edi)\n"
1872     "lea    0x10(%edi),%edi\n"
1873     "sub    $0x10,%ecx\n"
1874     "ja     2b\n"
1875     "mov    -0x1(%edi),%al\n"
1876     "mov    %al,(%edi)\n"
1877     "pop    %edi\n"
1878     "pop    %esi\n"
1879     "ret\n"
1880 
1881 "3:"
1882     "movdqa (%esi),%xmm0\n"
1883     "movdqa (%esi,%edx,1),%xmm2\n"
1884     "lea    0x10(%esi),%esi\n"
1885     "pavgb  %xmm2,%xmm0\n"
1886     "movdqa %xmm0,(%edi)\n"
1887     "lea    0x10(%edi),%edi\n"
1888     "sub    $0x10,%ecx\n"
1889     "ja     3b\n"
1890     "mov    -0x1(%edi),%al\n"
1891     "mov    %al,(%edi)\n"
1892     "pop    %edi\n"
1893     "pop    %esi\n"
1894     "ret\n"
1895 );
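// A minimal C sketch (names hypothetical) of what the SSSE3 general case
// above computes. The 8.8 fraction is halved to 7 bits so that both weights
// fit in a single byte pair; pmaddubsw then evaluates
// row0 * (128 - f/2) + row1 * (f/2) per pixel in one instruction, and the
// psrlw $0x7 restores pixel range.
static void ScaleFilterRowsSketch_C(uint8* dst_ptr, const uint8* src_ptr,
                                    int src_stride, int dst_width,
                                    int source_y_fraction) {
  int f1 = source_y_fraction >> 1;   // 0..128: weight of the lower row
  int f0 = 128 - f1;                 // weight of the upper row
  const uint8* src_ptr1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = static_cast<uint8>((src_ptr[x] * f0 + src_ptr1[x] * f1) >> 7);
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // replicate the last pixel
}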
1896 
1897 #elif defined(__x86_64__)
1898 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
1899                                   uint8* dst_ptr, int dst_width) {
1900   asm volatile(
1901   "lea        (%3,%3,2),%%r10\n"
1902   "pxor       %%xmm7,%%xmm7\n"
1903 "1:"
1904   "movdqa     (%0),%%xmm0\n"
1905   "movdqa     0x10(%0),%%xmm1\n"
1906   "movdqa     (%0,%3,1),%%xmm2\n"
1907   "movdqa     0x10(%0,%3,1),%%xmm3\n"
1908   "pavgb      %%xmm2,%%xmm0\n"
1909   "pavgb      %%xmm3,%%xmm1\n"
1910   "movdqa     (%0,%3,2),%%xmm2\n"
1911   "movdqa     0x10(%0,%3,2),%%xmm3\n"
1912   "movdqa     (%0,%%r10,1),%%xmm4\n"
1913   "movdqa     0x10(%0,%%r10,1),%%xmm5\n"
1914   "lea        (%0,%3,4),%%r11\n"
1915   "lea        0x20(%0),%0\n"
1916   "pavgb      %%xmm4,%%xmm2\n"
1917   "pavgb      %%xmm5,%%xmm3\n"
1918   "pavgb      %%xmm2,%%xmm0\n"
1919   "pavgb      %%xmm3,%%xmm1\n"
1920   "movdqa     0x0(%%r11),%%xmm2\n"
1921   "movdqa     0x10(%%r11),%%xmm3\n"
1922   "movdqa     0x0(%%r11,%3,1),%%xmm4\n"
1923   "movdqa     0x10(%%r11,%3,1),%%xmm5\n"
1924   "pavgb      %%xmm4,%%xmm2\n"
1925   "pavgb      %%xmm5,%%xmm3\n"
1926   "movdqa     0x0(%%r11,%3,2),%%xmm4\n"
1927   "movdqa     0x10(%%r11,%3,2),%%xmm5\n"
1928   "movdqa     0x0(%%r11,%%r10,1),%%xmm6\n"
1929   "pavgb      %%xmm6,%%xmm4\n"
1930   "movdqa     0x10(%%r11,%%r10,1),%%xmm6\n"
1931   "pavgb      %%xmm6,%%xmm5\n"
1932   "pavgb      %%xmm4,%%xmm2\n"
1933   "pavgb      %%xmm5,%%xmm3\n"
1934   "pavgb      %%xmm2,%%xmm0\n"
1935   "pavgb      %%xmm3,%%xmm1\n"
1936   "psadbw     %%xmm7,%%xmm0\n"
1937   "psadbw     %%xmm7,%%xmm1\n"
1938   "pshufd     $0xd8,%%xmm0,%%xmm0\n"
1939   "pshufd     $0x8d,%%xmm1,%%xmm1\n"
1940   "por        %%xmm1,%%xmm0\n"
1941   "psrlw      $0x3,%%xmm0\n"
1942   "packuswb   %%xmm0,%%xmm0\n"
1943   "packuswb   %%xmm0,%%xmm0\n"
1944   "movd       %%xmm0,(%1)\n"
1945   "lea        0x4(%1),%1\n"
1946   "sub        $0x4,%2\n"
1947   "ja         1b\n"
1948   : "+r"(src_ptr),     // %0
1949     "+r"(dst_ptr),     // %1
1950     "+r"(dst_width)    // %2
1951   : "r"(static_cast<intptr_t>(src_stride))   // %3
1952   : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3",
1953     "xmm4", "xmm5", "xmm6", "xmm7"
1954 );
1955 }
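// The psadbw trick above: a sum of absolute differences against zero
// (%xmm7) is simply the sum of the 8 bytes in each 64-bit lane, and the
// following psrlw $0x3 divides that sum by 8. A scalar sketch of this
// final horizontal step (helper name hypothetical):
static inline uint8 AverageOf8(const uint8* p) {
  int sum = 0;
  for (int i = 0; i < 8; ++i) {
    sum += p[i];                         // what psadbw against zero accumulates
  }
  return static_cast<uint8>(sum >> 3);   // what psrlw $0x3 performs
}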
1956 
1957 #define HAS_SCALEROWDOWN34_SSSE3
1958 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
1959                                  uint8* dst_ptr, int dst_width) {
1960   asm volatile(
1961   "movdqa     (%3),%%xmm3\n"
1962   "movdqa     (%4),%%xmm4\n"
1963   "movdqa     (%5),%%xmm5\n"
1964 "1:"
1965   "movdqa     (%0),%%xmm0\n"
1966   "movdqa     0x10(%0),%%xmm2\n"
1967   "lea        0x20(%0),%0\n"
1968   "movdqa     %%xmm2,%%xmm1\n"
1969   "palignr    $0x8,%%xmm0,%%xmm1\n"
1970   "pshufb     %%xmm3,%%xmm0\n"
1971   "pshufb     %%xmm4,%%xmm1\n"
1972   "pshufb     %%xmm5,%%xmm2\n"
1973   "movq       %%xmm0,(%1)\n"
1974   "movq       %%xmm1,0x8(%1)\n"
1975   "movq       %%xmm2,0x10(%1)\n"
1976   "lea        0x18(%1),%1\n"
1977   "sub        $0x18,%2\n"
1978   "ja         1b\n"
1979   : "+r"(src_ptr),     // %0
1980     "+r"(dst_ptr),     // %1
1981     "+r"(dst_width)    // %2
1982   : "r"(_shuf0),   // %3
1983     "r"(_shuf1),   // %4
1984     "r"(_shuf2)    // %5
1985   : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1986 );
1987 }
1988 
1989 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
1990                                        uint8* dst_ptr, int dst_width) {
1991   asm volatile(
1992   "movdqa     (%4),%%xmm2\n"  // _shuf01
1993   "movdqa     (%5),%%xmm3\n"  // _shuf11
1994   "movdqa     (%6),%%xmm4\n"  // _shuf21
1995   "movdqa     (%7),%%xmm5\n"  // _madd01
1996   "movdqa     (%8),%%xmm6\n"  // _madd11
1997   "movdqa     (%9),%%xmm7\n"  // _round34
1998   "movdqa     (%10),%%xmm8\n"  // _madd21
1999 "1:"
2000   "movdqa     (%0),%%xmm0\n"
2001   "movdqa     (%0,%3),%%xmm1\n"
2002   "pavgb      %%xmm1,%%xmm0\n"
2003   "pshufb     %%xmm2,%%xmm0\n"
2004   "pmaddubsw  %%xmm5,%%xmm0\n"
2005   "paddsw     %%xmm7,%%xmm0\n"
2006   "psrlw      $0x2,%%xmm0\n"
2007   "packuswb   %%xmm0,%%xmm0\n"
2008   "movq       %%xmm0,(%1)\n"
2009   "movdqu     0x8(%0),%%xmm0\n"
2010   "movdqu     0x8(%0,%3),%%xmm1\n"
2011   "pavgb      %%xmm1,%%xmm0\n"
2012   "pshufb     %%xmm3,%%xmm0\n"
2013   "pmaddubsw  %%xmm6,%%xmm0\n"
2014   "paddsw     %%xmm7,%%xmm0\n"
2015   "psrlw      $0x2,%%xmm0\n"
2016   "packuswb   %%xmm0,%%xmm0\n"
2017   "movq       %%xmm0,0x8(%1)\n"
2018   "movdqa     0x10(%0),%%xmm0\n"
2019   "movdqa     0x10(%0,%3),%%xmm1\n"
2020   "lea        0x20(%0),%0\n"
2021   "pavgb      %%xmm1,%%xmm0\n"
2022   "pshufb     %%xmm4,%%xmm0\n"
2023   "pmaddubsw  %%xmm8,%%xmm0\n"
2024   "paddsw     %%xmm7,%%xmm0\n"
2025   "psrlw      $0x2,%%xmm0\n"
2026   "packuswb   %%xmm0,%%xmm0\n"
2027   "movq       %%xmm0,0x10(%1)\n"
2028   "lea        0x18(%1),%1\n"
2029   "sub        $0x18,%2\n"
2030   "ja         1b\n"
2031   : "+r"(src_ptr),     // %0
2032     "+r"(dst_ptr),     // %1
2033     "+r"(dst_width)    // %2
2034   : "r"(static_cast<intptr_t>(src_stride)),  // %3
2035     "r"(_shuf01),   // %4
2036     "r"(_shuf11),   // %5
2037     "r"(_shuf21),   // %6
2038     "r"(_madd01),   // %7
2039     "r"(_madd11),   // %8
2040     "r"(_round34),  // %9
2041     "r"(_madd21)    // %10
2042   : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
2043     "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
2044 );
2045 }
2046 
2047 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
2048                                        uint8* dst_ptr, int dst_width) {
2049   asm volatile(
2050   "movdqa     (%4),%%xmm2\n"  // _shuf01
2051   "movdqa     (%5),%%xmm3\n"  // _shuf11
2052   "movdqa     (%6),%%xmm4\n"  // _shuf21
2053   "movdqa     (%7),%%xmm5\n"  // _madd01
2054   "movdqa     (%8),%%xmm6\n"  // _madd11
2055   "movdqa     (%9),%%xmm7\n"  // _round34
2056   "movdqa     (%10),%%xmm8\n"  // _madd21
2057 "1:"
2058   "movdqa     (%0),%%xmm0\n"
2059   "movdqa     (%0,%3,1),%%xmm1\n"
2060   "pavgb      %%xmm0,%%xmm1\n"
2061   "pavgb      %%xmm1,%%xmm0\n"
2062   "pshufb     %%xmm2,%%xmm0\n"
2063   "pmaddubsw  %%xmm5,%%xmm0\n"
2064   "paddsw     %%xmm7,%%xmm0\n"
2065   "psrlw      $0x2,%%xmm0\n"
2066   "packuswb   %%xmm0,%%xmm0\n"
2067   "movq       %%xmm0,(%1)\n"
2068   "movdqu     0x8(%0),%%xmm0\n"
2069   "movdqu     0x8(%0,%3,1),%%xmm1\n"
2070   "pavgb      %%xmm0,%%xmm1\n"
2071   "pavgb      %%xmm1,%%xmm0\n"
2072   "pshufb     %%xmm3,%%xmm0\n"
2073   "pmaddubsw  %%xmm6,%%xmm0\n"
2074   "paddsw     %%xmm7,%%xmm0\n"
2075   "psrlw      $0x2,%%xmm0\n"
2076   "packuswb   %%xmm0,%%xmm0\n"
2077   "movq       %%xmm0,0x8(%1)\n"
2078   "movdqa     0x10(%0),%%xmm0\n"
2079   "movdqa     0x10(%0,%3,1),%%xmm1\n"
2080   "lea        0x20(%0),%0\n"
2081   "pavgb      %%xmm0,%%xmm1\n"
2082   "pavgb      %%xmm1,%%xmm0\n"
2083   "pshufb     %%xmm4,%%xmm0\n"
2084   "pmaddubsw  %%xmm8,%%xmm0\n"
2085   "paddsw     %%xmm7,%%xmm0\n"
2086   "psrlw      $0x2,%%xmm0\n"
2087   "packuswb   %%xmm0,%%xmm0\n"
2088   "movq       %%xmm0,0x10(%1)\n"
2089   "lea        0x18(%1),%1\n"
2090   "sub        $0x18,%2\n"
2091   "ja         1b\n"
2092   : "+r"(src_ptr),     // %0
2093     "+r"(dst_ptr),     // %1
2094     "+r"(dst_width)    // %2
2095   : "r"(static_cast<intptr_t>(src_stride)),  // %3
2096     "r"(_shuf01),   // %4
2097     "r"(_shuf11),   // %5
2098     "r"(_shuf21),   // %6
2099     "r"(_madd01),   // %7
2100     "r"(_madd11),   // %8
2101     "r"(_round34),  // %9
2102     "r"(_madd21)    // %10
2103   : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
2104     "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
2105 );
2106 }
2107 
2108 #define HAS_SCALEROWDOWN38_SSSE3
2109 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
2110                                  uint8* dst_ptr, int dst_width) {
2111   asm volatile(
2112   "movdqa     (%3),%%xmm5\n"
2113   "movdqa     (%4),%%xmm6\n"
2114   "pxor       %%xmm7,%%xmm7\n"
2115 "1:"
2116   "movdqa     (%0),%%xmm0\n"
2117   "movdqa     0x10(%0),%%xmm1\n"
2118   "lea        0x20(%0),%0\n"
2119   "pshufb     %%xmm5,%%xmm0\n"
2120   "pshufb     %%xmm6,%%xmm1\n"
2121   "paddusb    %%xmm1,%%xmm0\n"
2122   "movq       %%xmm0,(%1)\n"
2123   "movhlps    %%xmm0,%%xmm1\n"
2124   "movd       %%xmm1,0x8(%1)\n"
2125   "lea        0xc(%1),%1\n"
2126   "sub        $0xc,%2\n"
2127   "ja         1b\n"
2128   : "+r"(src_ptr),     // %0
2129     "+r"(dst_ptr),     // %1
2130     "+r"(dst_width)    // %2
2131   : "r"(_shuf38a),  // %3
2132     "r"(_shuf38b)   // %4
2133   : "memory", "xmm0", "xmm1", "xmm5", "xmm6", "xmm7"
2134 );
2135 }
2136 
2137 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
2138                                        uint8* dst_ptr, int dst_width) {
2139   asm volatile(
2140   "movdqa     (%4),%%xmm4\n"
2141   "movdqa     (%5),%%xmm5\n"
2142   "movdqa     (%6),%%xmm6\n"
2143   "pxor       %%xmm7,%%xmm7\n"
2144 "1:"
2145   "movdqa     (%0),%%xmm0\n"
2146   "movdqa     (%0,%3,1),%%xmm2\n"
2147   "movhlps    %%xmm0,%%xmm1\n"
2148   "movhlps    %%xmm2,%%xmm3\n"
2149   "punpcklbw  %%xmm7,%%xmm0\n"
2150   "punpcklbw  %%xmm7,%%xmm1\n"
2151   "punpcklbw  %%xmm7,%%xmm2\n"
2152   "punpcklbw  %%xmm7,%%xmm3\n"
2153   "paddusw    %%xmm2,%%xmm0\n"
2154   "paddusw    %%xmm3,%%xmm1\n"
2155   "movdqa     (%0,%3,2),%%xmm2\n"
2156   "lea        0x10(%0),%0\n"
2157   "movhlps    %%xmm2,%%xmm3\n"
2158   "punpcklbw  %%xmm7,%%xmm2\n"
2159   "punpcklbw  %%xmm7,%%xmm3\n"
2160   "paddusw    %%xmm2,%%xmm0\n"
2161   "paddusw    %%xmm3,%%xmm1\n"
2162   "movdqa     %%xmm0,%%xmm2\n"
2163   "psrldq     $0x2,%%xmm0\n"
2164   "paddusw    %%xmm0,%%xmm2\n"
2165   "psrldq     $0x2,%%xmm0\n"
2166   "paddusw    %%xmm0,%%xmm2\n"
2167   "pshufb     %%xmm4,%%xmm2\n"
2168   "movdqa     %%xmm1,%%xmm3\n"
2169   "psrldq     $0x2,%%xmm1\n"
2170   "paddusw    %%xmm1,%%xmm3\n"
2171   "psrldq     $0x2,%%xmm1\n"
2172   "paddusw    %%xmm1,%%xmm3\n"
2173   "pshufb     %%xmm5,%%xmm3\n"
2174   "paddusw    %%xmm3,%%xmm2\n"
2175   "pmulhuw    %%xmm6,%%xmm2\n"
2176   "packuswb   %%xmm2,%%xmm2\n"
2177   "movd       %%xmm2,(%1)\n"
2178   "pextrw     $0x2,%%xmm2,%%eax\n"
2179   "mov        %%ax,0x4(%1)\n"
2180   "lea        0x6(%1),%1\n"
2181   "sub        $0x6,%2\n"
2182   "ja         1b\n"
2183   : "+r"(src_ptr),     // %0
2184     "+r"(dst_ptr),     // %1
2185     "+r"(dst_width)    // %2
2186   : "r"(static_cast<intptr_t>(src_stride)),  // %3
2187     "r"(_shufac0),   // %4
2188     "r"(_shufac3),   // %5
2189     "r"(_scaleac3)   // %6
2190   : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
2191     "xmm4", "xmm5", "xmm6", "xmm7"
2192 );
2193 }
2194 
2195 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
2196                                        uint8* dst_ptr, int dst_width) {
2197   asm volatile(
2198   "movdqa     (%4),%%xmm4\n"
2199   "movdqa     (%5),%%xmm5\n"
2200   "movdqa     (%6),%%xmm6\n"
2201   "movdqa     (%7),%%xmm7\n"
2202 "1:"
2203   "movdqa     (%0),%%xmm2\n"
2204   "pavgb      (%0,%3,1),%%xmm2\n"
2205   "lea        0x10(%0),%0\n"
2206   "movdqa     %%xmm2,%%xmm0\n"
2207   "pshufb     %%xmm4,%%xmm0\n"
2208   "movdqa     %%xmm2,%%xmm1\n"
2209   "pshufb     %%xmm5,%%xmm1\n"
2210   "paddusw    %%xmm1,%%xmm0\n"
2211   "pshufb     %%xmm6,%%xmm2\n"
2212   "paddusw    %%xmm2,%%xmm0\n"
2213   "pmulhuw    %%xmm7,%%xmm0\n"
2214   "packuswb   %%xmm0,%%xmm0\n"
2215   "movd       %%xmm0,(%1)\n"
2216   "pextrw     $0x2,%%xmm0,%%eax\n"
2217   "mov        %%ax,0x4(%1)\n"
2218   "lea        0x6(%1),%1\n"
2219   "sub        $0x6,%2\n"
2220   "ja         1b\n"
2221   : "+r"(src_ptr),     // %0
2222     "+r"(dst_ptr),     // %1
2223     "+r"(dst_width)    // %2
2224   : "r"(static_cast<intptr_t>(src_stride)),  // %3
2225     "r"(_shufab0),   // %4
2226     "r"(_shufab1),   // %5
2227     "r"(_shufab2),   // %6
2228     "r"(_scaleab2)   // %7
2229   : "memory", "rax", "xmm0", "xmm1", "xmm2",
2230     "xmm4", "xmm5", "xmm6", "xmm7"
2231 );
2232 }
2233 
2234 #define HAS_SCALEADDROWS_SSE2
2235 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
2236                               uint16* dst_ptr, int src_width,
2237                               int src_height) {
2238   asm volatile(
2239   "pxor       %%xmm7,%%xmm7\n"
2240 "1:"
2241   "movdqa     (%0),%%xmm2\n"
2242   "lea        (%0,%4,1),%%r10\n"
2243   "movhlps    %%xmm2,%%xmm3\n"
2244   "lea        -0x1(%3),%%r11\n"
2245   "punpcklbw  %%xmm7,%%xmm2\n"
2246   "punpcklbw  %%xmm7,%%xmm3\n"
2247 
2248 "2:"
2249   "movdqa     (%%r10),%%xmm0\n"
2250   "lea        (%%r10,%4,1),%%r10\n"
2251   "movhlps    %%xmm0,%%xmm1\n"
2252   "punpcklbw  %%xmm7,%%xmm0\n"
2253   "punpcklbw  %%xmm7,%%xmm1\n"
2254   "paddusw    %%xmm0,%%xmm2\n"
2255   "paddusw    %%xmm1,%%xmm3\n"
2256   "sub        $0x1,%%r11\n"
2257   "ja         2b\n"
2258 
2259   "movdqa     %%xmm2,(%1)\n"
2260   "movdqa     %%xmm3,0x10(%1)\n"
2261   "lea        0x20(%1),%1\n"
2262   "lea        0x10(%0),%0\n"
2263   "sub        $0x10,%2\n"
2264   "ja         1b\n"
2265   : "+r"(src_ptr),     // %0
2266     "+r"(dst_ptr),     // %1
2267     "+r"(src_width),   // %2
2268     "+r"(src_height)   // %3
2269   : "r"(static_cast<intptr_t>(src_stride))  // %4
2270   : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
2271 );
2272 }
2273 
2274 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
2275 #define HAS_SCALEFILTERROWS_SSE2
2276 static void ScaleFilterRows_SSE2(uint8* dst_ptr,
2277                                  const uint8* src_ptr, int src_stride,
2278                                  int dst_width, int source_y_fraction) {
2279   if (source_y_fraction == 0) {
2280     asm volatile(
2281     "1:"
2282       "movdqa     (%1),%%xmm0\n"
2283       "lea        0x10(%1),%1\n"
2284       "movdqa     %%xmm0,(%0)\n"
2285       "lea        0x10(%0),%0\n"
2286       "sub        $0x10,%2\n"
2287       "ja         1b\n"
2288       "mov        -0x1(%0),%%al\n"
2289       "mov        %%al,(%0)\n"
2290       : "+r"(dst_ptr),     // %0
2291         "+r"(src_ptr),     // %1
2292         "+r"(dst_width)    // %2
2293       :
2294       : "memory", "rax", "xmm0"
2295     );
2296     return;
2297   } else if (source_y_fraction == 128) {
2298     asm volatile(
2299     "1:"
2300       "movdqa     (%1),%%xmm0\n"
2301       "movdqa     (%1,%3,1),%%xmm2\n"
2302       "lea        0x10(%1),%1\n"
2303       "pavgb      %%xmm2,%%xmm0\n"
2304       "movdqa     %%xmm0,(%0)\n"
2305       "lea        0x10(%0),%0\n"
2306       "sub        $0x10,%2\n"
2307       "ja         1b\n"
2308       "mov        -0x1(%0),%%al\n"
2309       "mov        %%al,(%0)\n"
2310       : "+r"(dst_ptr),     // %0
2311         "+r"(src_ptr),     // %1
2312         "+r"(dst_width)    // %2
2313       : "r"(static_cast<intptr_t>(src_stride))  // %3
2314       : "memory", "rax", "xmm0", "xmm2"
2315     );
2316     return;
2317   } else {
2318     asm volatile(
2319       "mov        %3,%%eax\n"
2320       "movd       %%eax,%%xmm6\n"
2321       "punpcklwd  %%xmm6,%%xmm6\n"
2322       "pshufd     $0x0,%%xmm6,%%xmm6\n"
2323       "neg        %%eax\n"
2324       "add        $0x100,%%eax\n"
2325       "movd       %%eax,%%xmm5\n"
2326       "punpcklwd  %%xmm5,%%xmm5\n"
2327       "pshufd     $0x0,%%xmm5,%%xmm5\n"
2328       "pxor       %%xmm7,%%xmm7\n"
2329     "1:"
2330       "movdqa     (%1),%%xmm0\n"
2331       "movdqa     (%1,%4,1),%%xmm2\n"
2332       "lea        0x10(%1),%1\n"
2333       "movdqa     %%xmm0,%%xmm1\n"
2334       "movdqa     %%xmm2,%%xmm3\n"
2335       "punpcklbw  %%xmm7,%%xmm0\n"
2336       "punpcklbw  %%xmm7,%%xmm2\n"
2337       "punpckhbw  %%xmm7,%%xmm1\n"
2338       "punpckhbw  %%xmm7,%%xmm3\n"
2339       "pmullw     %%xmm5,%%xmm0\n"
2340       "pmullw     %%xmm5,%%xmm1\n"
2341       "pmullw     %%xmm6,%%xmm2\n"
2342       "pmullw     %%xmm6,%%xmm3\n"
2343       "paddusw    %%xmm2,%%xmm0\n"
2344       "paddusw    %%xmm3,%%xmm1\n"
2345       "psrlw      $0x8,%%xmm0\n"
2346       "psrlw      $0x8,%%xmm1\n"
2347       "packuswb   %%xmm1,%%xmm0\n"
2348       "movdqa     %%xmm0,(%0)\n"
2349       "lea        0x10(%0),%0\n"
2350       "sub        $0x10,%2\n"
2351       "ja         1b\n"
2352       "mov        -0x1(%0),%%al\n"
2353       "mov        %%al,(%0)\n"
2354       : "+r"(dst_ptr),     // %0
2355         "+r"(src_ptr),     // %1
2356         "+r"(dst_width),   // %2
2357         "+r"(source_y_fraction)  // %3
2358       : "r"(static_cast<intptr_t>(src_stride))  // %4
2359       : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
2360         "xmm5", "xmm6", "xmm7"
2361     );
2362   }
2363   return;
2364 }
2365 
2366 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
2367 #define HAS_SCALEFILTERROWS_SSSE3
2368 static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
2369                                   const uint8* src_ptr, int src_stride,
2370                                   int dst_width, int source_y_fraction) {
2371   if (source_y_fraction == 0) {
2372     asm volatile(
2373     "1:"
2374       "movdqa     (%1),%%xmm0\n"
2375       "lea        0x10(%1),%1\n"
2376       "movdqa     %%xmm0,(%0)\n"
2377       "lea        0x10(%0),%0\n"
2378       "sub        $0x10,%2\n"
2379       "ja         1b\n"
2380       "mov        -0x1(%0),%%al\n"
2381       "mov        %%al,(%0)\n"
2382       : "+r"(dst_ptr),     // %0
2383         "+r"(src_ptr),     // %1
2384         "+r"(dst_width)    // %2
2385       :
2386       : "memory", "rax", "xmm0"
2387     );
2388     return;
2389   } else if (source_y_fraction == 128) {
2390     asm volatile(
2391     "1:"
2392       "movdqa     (%1),%%xmm0\n"
2393       "movdqa     (%1,%3,1),%%xmm2\n"
2394       "lea        0x10(%1),%1\n"
2395       "pavgb      %%xmm2,%%xmm0\n"
2396       "movdqa     %%xmm0,(%0)\n"
2397       "lea        0x10(%0),%0\n"
2398       "sub        $0x10,%2\n"
2399       "ja         1b\n"
2400       "mov        -0x1(%0),%%al\n"
2401       "mov        %%al,(%0)\n"
2402       : "+r"(dst_ptr),     // %0
2403         "+r"(src_ptr),     // %1
2404         "+r"(dst_width)    // %2
2405       : "r"(static_cast<intptr_t>(src_stride))  // %3
2406       : "memory", "rax", "xmm0", "xmm2"
2407     );
2408     return;
2409   } else {
2410     asm volatile(
2411       "mov        %3,%%eax\n"
2412       "shr        %%eax\n"
2413       "mov        %%al,%%ah\n"
2414       "neg        %%al\n"
2415       "add        $0x80,%%al\n"
2416       "movd       %%eax,%%xmm7\n"
2417       "punpcklwd  %%xmm7,%%xmm7\n"
2418       "pshufd     $0x0,%%xmm7,%%xmm7\n"
2419     "1:"
2420       "movdqa     (%1),%%xmm0\n"
2421       "movdqa     (%1,%4,1),%%xmm2\n"
2422       "lea        0x10(%1),%1\n"
2423       "movdqa     %%xmm0,%%xmm1\n"
2424       "punpcklbw  %%xmm2,%%xmm0\n"
2425       "punpckhbw  %%xmm2,%%xmm1\n"
2426       "pmaddubsw  %%xmm7,%%xmm0\n"
2427       "pmaddubsw  %%xmm7,%%xmm1\n"
2428       "psrlw      $0x7,%%xmm0\n"
2429       "psrlw      $0x7,%%xmm1\n"
2430       "packuswb   %%xmm1,%%xmm0\n"
2431       "movdqa     %%xmm0,(%0)\n"
2432       "lea        0x10(%0),%0\n"
2433       "sub        $0x10,%2\n"
2434       "ja         1b\n"
2435       "mov        -0x1(%0),%%al\n"
2436       "mov        %%al,(%0)\n"
2437       : "+r"(dst_ptr),     // %0
2438         "+r"(src_ptr),     // %1
2439         "+r"(dst_width),   // %2
2440         "+r"(source_y_fraction)  // %3
2441       : "r"(static_cast<intptr_t>(src_stride))  // %4
2442       : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm7"
2443     );
2444   }
2445   return;
2446 }
2447 #endif
2448 #endif
2449 
2450 // CPU-agnostic row functions
2451 static void ScaleRowDown2_C(const uint8* src_ptr, int,
2452                             uint8* dst, int dst_width) {
2453   for (int x = 0; x < dst_width; ++x) {
2454     *dst++ = *src_ptr;
2455     src_ptr += 2;
2456   }
2457 }
2458 
2459 static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
2460                                uint8* dst, int dst_width) {
2461   for (int x = 0; x < dst_width; ++x) {
2462     *dst++ = (src_ptr[0] + src_ptr[1] +
2463               src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
2464     src_ptr += 2;
2465   }
2466 }
2467 
2468 static void ScaleRowDown4_C(const uint8* src_ptr, int,
2469                             uint8* dst, int dst_width) {
2470   for (int x = 0; x < dst_width; ++x) {
2471     *dst++ = *src_ptr;
2472     src_ptr += 4;
2473   }
2474 }
2475 
2476 static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
2477                                uint8* dst, int dst_width) {
2478   for (int x = 0; x < dst_width; ++x) {
2479     *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
2480               src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
2481               src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
2482               src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
2483               src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
2484               src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
2485               src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
2486               8) >> 4;
2487     src_ptr += 4;
2488   }
2489 }
2490 
2491 // 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
2492 // Keeping the total buffer under 4096 bytes avoids a stack check, saving 4% CPU.
2493 static const int kMaxOutputWidth = 640;
2494 static const int kMaxRow12 = kMaxOutputWidth * 2;
2495 
2496 static void ScaleRowDown8_C(const uint8* src_ptr, int,
2497                             uint8* dst, int dst_width) {
2498   for (int x = 0; x < dst_width; ++x) {
2499     *dst++ = *src_ptr;
2500     src_ptr += 8;
2501   }
2502 }
2503 
2504 // Note: calling code checks that dst_width is at most kMaxOutputWidth and
2505 // uses ScaleRowDown8_C instead when it is not.
2506 static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
2507                                uint8* dst, int dst_width) {
2508   ALIGN16(uint8 src_row[kMaxRow12 * 2]);
2509   assert(dst_width <= kMaxOutputWidth);
2510   ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
2511   ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
2512                      src_row + kMaxOutputWidth,
2513                      dst_width * 2);
2514   ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
2515 }
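// The composition above reuses existing kernels: two 4x4 box passes reduce
// source rows 0-3 and 4-7 to a pair of quarter-width intermediate rows, and
// a final 2x2 pass combines them, so each output pixel averages a full 8x8
// block (with per-stage rounding, so not necessarily bit-exact with a
// single 8x8 average).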
2516 
2517 static void ScaleRowDown34_C(const uint8* src_ptr, int,
2518                              uint8* dst, int dst_width) {
2519   assert((dst_width % 3 == 0) && (dst_width > 0));
2520   uint8* dend = dst + dst_width;
2521   do {
2522     dst[0] = src_ptr[0];
2523     dst[1] = src_ptr[1];
2524     dst[2] = src_ptr[3];
2525     dst += 3;
2526     src_ptr += 4;
2527   } while (dst < dend);
2528 }
2529 
2530 // Filter rows 0 and 1 together, 3 : 1
2531 static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
2532                                    uint8* d, int dst_width) {
2533   assert((dst_width % 3 == 0) && (dst_width > 0));
2534   uint8* dend = d + dst_width;
2535   const uint8* s = src_ptr;
2536   const uint8* t = src_ptr + src_stride;
2537   do {
2538     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2539     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2540     uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2541     uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
2542     uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
2543     uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
2544     d[0] = (a0 * 3 + b0 + 2) >> 2;
2545     d[1] = (a1 * 3 + b1 + 2) >> 2;
2546     d[2] = (a2 * 3 + b2 + 2) >> 2;
2547     d += 3;
2548     s += 4;
2549     t += 4;
2550   } while (d < dend);
2551 }
2552 
2553 // Filter rows 1 and 2 together, 1 : 1
2554 static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
2555                                    uint8* d, int dst_width) {
2556   assert((dst_width % 3 == 0) && (dst_width > 0));
2557   uint8* dend = d + dst_width;
2558   const uint8* s = src_ptr;
2559   const uint8* t = src_ptr + src_stride;
2560   do {
2561     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2562     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2563     uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2564     uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
2565     uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
2566     uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
2567     d[0] = (a0 + b0 + 1) >> 1;
2568     d[1] = (a1 + b1 + 1) >> 1;
2569     d[2] = (a2 + b2 + 1) >> 1;
2570     d += 3;
2571     s += 4;
2572     t += 4;
2573   } while (d < dend);
2574 }
2575 
2576 #if defined(HAS_SCALEFILTERROWS_SSE2)
2577 // Filter row to 3/4
2578 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
2579                                 int dst_width) {
2580   assert((dst_width % 3 == 0) && (dst_width > 0));
2581   uint8* dend = dst_ptr + dst_width;
2582   const uint8* s = src_ptr;
2583   do {
2584     dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2585     dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2586     dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2587     dst_ptr += 3;
2588     s += 4;
2589   } while (dst_ptr < dend);
2590 }
2591 #endif
2592 
2593 static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
2594                               int dst_width, int dx) {
2595   int x = 0;
2596   for (int j = 0; j < dst_width; ++j) {
2597     int xi = x >> 16;
2598     int xf1 = x & 0xffff;
2599     int xf0 = 65536 - xf1;
2600 
2601     *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
2602     x += dx;
2603   }
2604 }
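// Usage sketch for the 16.16 stepping above (function name and sizes
// hypothetical): shrinking one row from 640 to 480 pixels uses
// dx = (640 << 16) / 480 = 87381, i.e. ~1.333 source pixels per destination
// pixel, blending each pair of neighboring source pixels by the fractional
// position.
static void ShrinkRowExample(uint8* dst480, const uint8* src640) {
  int dx = (640 << 16) / 480;  // ~1.333 in 16.16 fixed point
  ScaleFilterCols_C(dst480, src640, 480, dx);
}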
2605 
2606 static const int kMaxInputWidth = 2560;
2607 #if defined(HAS_SCALEFILTERROWS_SSE2)
2608 #define HAS_SCALEROWDOWN34_SSE2
2609 // Filter rows 0 and 1 together, 3 : 1
2610 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
2611                                       uint8* dst_ptr, int dst_width) {
2612   assert((dst_width % 3 == 0) && (dst_width > 0));
2613   ALIGN16(uint8 row[kMaxInputWidth]);
2614   ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3,
2615                        256 / 4);
2616   ScaleFilterCols34_C(dst_ptr, row, dst_width);
2617 }
2618 
2619 // Filter rows 1 and 2 together, 1 : 1
2620 static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
2621                                       uint8* dst_ptr, int dst_width) {
2622   assert((dst_width % 3 == 0) && (dst_width > 0));
2623   ALIGN16(uint8 row[kMaxInputWidth]);
2624   ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
2625   ScaleFilterCols34_C(dst_ptr, row, dst_width);
2626 }
2627 #endif
2628 
2629 static void ScaleRowDown38_C(const uint8* src_ptr, int,
2630                              uint8* dst, int dst_width) {
2631   assert(dst_width % 3 == 0);
2632   for (int x = 0; x < dst_width; x += 3) {
2633     dst[0] = src_ptr[0];
2634     dst[1] = src_ptr[3];
2635     dst[2] = src_ptr[6];
2636     dst += 3;
2637     src_ptr += 8;
2638   }
2639 }
2640 
2641 // 8x3 -> 3x1
2642 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
2643                                    uint8* dst_ptr, int dst_width) {
2644   assert((dst_width % 3 == 0) && (dst_width > 0));
2645   for (int i = 0; i < dst_width; i += 3) {
2646     dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
2647         src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
2648         src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
2649         src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
2650         (65536 / 9) >> 16;
2651     dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
2652         src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
2653         src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
2654         src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
2655         (65536 / 9) >> 16;
2656     dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
2657         src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
2658         src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
2659         (65536 / 6) >> 16;
2660     src_ptr += 8;
2661     dst_ptr += 3;
2662   }
2663 }
2664 
2665 // 8x2 -> 3x1
2666 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
2667                                    uint8* dst_ptr, int dst_width) {
2668   assert((dst_width % 3 == 0) && (dst_width > 0));
2669   for (int i = 0; i < dst_width; i += 3) {
2670     dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
2671         src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
2672         src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
2673     dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
2674         src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
2675         src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
2676     dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
2677         src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
2678         (65536 / 4) >> 16;
2679     src_ptr += 8;
2680     dst_ptr += 3;
2681   }
2682 }
2683 
2684 // C version 8x2 -> 8x1
2685 static void ScaleFilterRows_C(uint8* dst_ptr,
2686                               const uint8* src_ptr, int src_stride,
2687                               int dst_width, int source_y_fraction) {
2688   assert(dst_width > 0);
2689   int y1_fraction = source_y_fraction;
2690   int y0_fraction = 256 - y1_fraction;
2691   const uint8* src_ptr1 = src_ptr + src_stride;
2692   uint8* end = dst_ptr + dst_width;
2693   do {
2694     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
2695     dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
2696     dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
2697     dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
2698     dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
2699     dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
2700     dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
2701     dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
2702     src_ptr += 8;
2703     src_ptr1 += 8;
2704     dst_ptr += 8;
2705   } while (dst_ptr < end);
2706   dst_ptr[0] = dst_ptr[-1];
2707 }
2708 
2709 void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
2710                     uint16* dst_ptr, int src_width, int src_height) {
2711   assert(src_width > 0);
2712   assert(src_height > 0);
2713   for (int x = 0; x < src_width; ++x) {
2714     const uint8* s = src_ptr + x;
2715     int sum = 0;
2716     for (int y = 0; y < src_height; ++y) {
2717       sum += s[0];
2718       s += src_stride;
2719     }
2720     dst_ptr[x] = sum;
2721   }
2722 }
2723 
2724 /**
2725  * Scale plane, 1/2
2726  *
2727  * This is an optimized version for scaling down a plane to 1/2 of
2728  * its original size.
2729  *
2730  */
2731 static void ScalePlaneDown2(int src_width, int src_height,
2732                             int dst_width, int dst_height,
2733                             int src_stride, int dst_stride,
2734                             const uint8* src_ptr, uint8* dst_ptr,
2735                             FilterMode filtering) {
2736   assert(src_width % 2 == 0);
2737   assert(src_height % 2 == 0);
2738   void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
2739                         uint8* dst_ptr, int dst_width);
2740 
2741 #if defined(HAS_SCALEROWDOWN2_NEON)
2742   if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
2743       (dst_width % 16 == 0) && (src_stride % 16 == 0) &&
2744       (dst_stride % 16 == 0) &&
2745       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
2746     ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
2747   } else
2748 #endif
2749 #if defined(HAS_SCALEROWDOWN2_SSE2)
2750   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
2751       (dst_width % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
2752       IS_ALIGNED(dst_ptr, 16)) {
2753     ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
2754   } else
2755 #endif
2756   {
2757     ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
2758   }
2759 
2760   for (int y = 0; y < dst_height; ++y) {
2761     ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
2762     src_ptr += (src_stride << 1);
2763     dst_ptr += dst_stride;
2764   }
2765 }
2766 
2767 /**
2768  * Scale plane, 1/4
2769  *
2770  * This is an optimized version for scaling down a plane to 1/4 of
2771  * its original size.
2772  */
2773 static void ScalePlaneDown4(int src_width, int src_height,
2774                             int dst_width, int dst_height,
2775                             int src_stride, int dst_stride,
2776                             const uint8* src_ptr, uint8* dst_ptr,
2777                             FilterMode filtering) {
2778   assert(src_width % 4 == 0);
2779   assert(src_height % 4 == 0);
2780   void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
2781                         uint8* dst_ptr, int dst_width);
2782 
2783 #if defined(HAS_SCALEROWDOWN4_NEON)
2784   if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
2785       (dst_width % 2 == 0) && (src_stride % 8 == 0) &&
2786       IS_ALIGNED(src_ptr, 8)) {
2787     ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
2788   } else
2789 #endif
2790 #if defined(HAS_SCALEROWDOWN4_SSE2)
2791   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
2792       (dst_width % 8 == 0) && (src_stride % 16 == 0) &&
2793       (dst_stride % 8 == 0) &&
2794       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
2795     ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
2796   } else
2797 #endif
2798   {
2799     ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
2800   }
2801 
2802   for (int y = 0; y < dst_height; ++y) {
2803     ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
2804     src_ptr += (src_stride << 2);
2805     dst_ptr += dst_stride;
2806   }
2807 }
2808 
2809 /**
2810  * Scale plane, 1/8
2811  *
2812  * This is an optimized version for scaling down a plane to 1/8
2813  * of its original size.
2814  *
2815  */
2816 static void ScalePlaneDown8(int src_width, int src_height,
2817                             int dst_width, int dst_height,
2818                             int src_stride, int dst_stride,
2819                             const uint8* src_ptr, uint8* dst_ptr,
2820                             FilterMode filtering) {
2821   assert(src_width % 8 == 0);
2822   assert(src_height % 8 == 0);
2823   void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
2824                         uint8* dst_ptr, int dst_width);
2825 #if defined(HAS_SCALEROWDOWN8_SSE2)
2826   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
2827       (dst_width % 16 == 0) && dst_width <= kMaxOutputWidth &&
2828       (src_stride % 16 == 0) && (dst_stride % 16 == 0) &&
2829       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
2830     ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
2831   } else
2832 #endif
2833   {
2834     ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?
2835         ScaleRowDown8Int_C : ScaleRowDown8_C;
2836   }
2837   for (int y = 0; y < dst_height; ++y) {
2838     ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
2839     src_ptr += (src_stride << 3);
2840     dst_ptr += dst_stride;
2841   }
2842 }
2843 
2844 /**
2845  * Scale plane down, 3/4
2846  *
2847  * Provided by Frank Barchard (fbarchard@google.com)
2848  *
2849  */
2850 static void ScalePlaneDown34(int src_width, int src_height,
2851                              int dst_width, int dst_height,
2852                              int src_stride, int dst_stride,
2853                              const uint8* src_ptr, uint8* dst_ptr,
2854                              FilterMode filtering) {
2855   assert(dst_width % 3 == 0);
2856   void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
2857                            uint8* dst_ptr, int dst_width);
2858   void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
2859                            uint8* dst_ptr, int dst_width);
2860 #if defined(HAS_SCALEROWDOWN34_SSSE3)
2861   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
2862       (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
2863       (dst_stride % 8 == 0) &&
2864       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
2865     if (!filtering) {
2866       ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
2867       ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
2868     } else {
2869       ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
2870       ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
2871     }
2872   } else
2873 #endif
2874 #if defined(HAS_SCALEROWDOWN34_SSE2)
2875   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
2876       (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
2877       (dst_stride % 8 == 0) &&
2878       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) &&
2879       filtering) {
2880     ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
2881     ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
2882   } else
2883 #endif
2884   {
2885     if (!filtering) {
2886       ScaleRowDown34_0 = ScaleRowDown34_C;
2887       ScaleRowDown34_1 = ScaleRowDown34_C;
2888     } else {
2889       ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
2890       ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
2891     }
2892   }
2893   int src_row = 0;
2894   for (int y = 0; y < dst_height; ++y) {
2895     switch (src_row) {
2896       case 0:
2897         ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
2898         break;
2899 
2900       case 1:
2901         ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
2902         break;
2903 
2904       case 2:
2905         ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
2906                          dst_ptr, dst_width);
2907         break;
2908     }
2909     ++src_row;
2910     src_ptr += src_stride;
2911     dst_ptr += dst_stride;
2912     if (src_row >= 3) {
2913       src_ptr += src_stride;
2914       src_row = 0;
2915     }
2916   }
2917 }
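// The cycle above consumes 4 source rows per 3 destination rows (stride
// advances of 1, 1 and 2), matching the 3/4 vertical ratio. In case 2 the
// call passes src_ptr + src_stride with a negated stride, so the same 3:1
// row filter used for case 0 weights source row 3 over row 2, mirroring
// the case 0 weighting of row 0 over row 1.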
2918 
2919 /**
2920  * Scale plane, 3/8
2921  *
2922  * This is an optimized version for scaling down a plane to 3/8
2923  * of its original size.
2924  *
2925  * Reduces 16x3 to 6x1
2926  */
2927 static void ScalePlaneDown38(int src_width, int src_height,
2928                              int dst_width, int dst_height,
2929                              int src_stride, int dst_stride,
2930                              const uint8* src_ptr, uint8* dst_ptr,
2931                              FilterMode filtering) {
2932   assert(dst_width % 3 == 0);
2933   void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
2934                            uint8* dst_ptr, int dst_width);
2935   void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
2936                            uint8* dst_ptr, int dst_width);
2937 #if defined(HAS_SCALEROWDOWN38_SSSE3)
2938   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
2939       (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
2940       (dst_stride % 8 == 0) &&
2941       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
2942     if (!filtering) {
2943       ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
2944       ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
2945     } else {
2946       ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
2947       ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
2948     }
2949   } else
2950 #endif
2951   {
2952     if (!filtering) {
2953       ScaleRowDown38_3 = ScaleRowDown38_C;
2954       ScaleRowDown38_2 = ScaleRowDown38_C;
2955     } else {
2956       ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
2957       ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
2958     }
2959   }
2960   int src_row = 0;
2961   for (int y = 0; y < dst_height; ++y) {
2962     switch (src_row) {
2963       case 0:
2964       case 1:
2965         ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
2966         src_ptr += src_stride * 3;
2967         ++src_row;
2968         break;
2969 
2970       case 2:
2971         ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
2972         src_ptr += src_stride * 2;
2973         src_row = 0;
2974         break;
2975     }
2976     dst_ptr += dst_stride;
2977   }
2978 }
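// The cycle above consumes 3 + 3 + 2 = 8 source rows per 3 destination
// rows, matching the 3/8 vertical ratio: two passes over three rows each,
// then one pass over the remaining two rows.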
2979 
2980 inline static uint32 SumBox(int iboxwidth, int iboxheight,
2981                             int src_stride, const uint8* src_ptr) {
2982   assert(iboxwidth > 0);
2983   assert(iboxheight > 0);
2984   uint32 sum = 0u;
2985   for (int y = 0; y < iboxheight; ++y) {
2986     for (int x = 0; x < iboxwidth; ++x) {
2987       sum += src_ptr[x];
2988     }
2989     src_ptr += src_stride;
2990   }
2991   return sum;
2992 }
2993 
2994 static void ScalePlaneBoxRow(int dst_width, int boxheight,
2995                              int dx, int src_stride,
2996                              const uint8* src_ptr, uint8* dst_ptr) {
2997   int x = 0;
2998   for (int i = 0; i < dst_width; ++i) {
2999     int ix = x >> 16;
3000     x += dx;
3001     int boxwidth = (x >> 16) - ix;
3002     *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
3003         (boxwidth * boxheight);
3004   }
3005 }
3006 
3007 inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
3008   assert(iboxwidth > 0);
3009   uint32 sum = 0u;
3010   for (int x = 0; x < iboxwidth; ++x) {
3011     sum += src_ptr[x];
3012   }
3013   return sum;
3014 }
3015 
3016 static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
3017                             const uint16* src_ptr, uint8* dst_ptr) {
3018   int scaletbl[2];
3019   int minboxwidth = (dx >> 16);
3020   scaletbl[0] = 65536 / (minboxwidth * boxheight);
3021   scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
3022   int *scaleptr = scaletbl - minboxwidth;
3023   int x = 0;
3024   for (int i = 0; i < dst_width; ++i) {
3025     int ix = x >> 16;
3026     x += dx;
3027     int boxwidth = (x >> 16) - ix;
3028     *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
3029   }
3030 }
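// Note on the pointer arithmetic above: boxwidth is always minboxwidth or
// minboxwidth + 1, so offsetting scaleptr by -minboxwidth lets
// scaleptr[boxwidth] select the matching 16.16 reciprocal without a branch.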
3031 
3032 static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
3033                             const uint16* src_ptr, uint8* dst_ptr) {
3034   int boxwidth = (dx >> 16);
3035   int scaleval = 65536 / (boxwidth * boxheight);
3036   int x = 0;
3037   for (int i = 0; i < dst_width; ++i) {
3038     *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
3039     x += boxwidth;
3040   }
3041 }
3042 
3043 /**
3044  * Scale plane down to any dimensions, with interpolation
3045  * (box filter).
3046  *
3047  * Same method as SimpleScale, which is fixed point: each
3048  * destination pixel is produced by stepping through the source
3049  * in 16.16 fixed point and averaging the box of source pixels
3050  * that the step covers.
3051  */
3052 static void ScalePlaneBox(int src_width, int src_height,
3053                           int dst_width, int dst_height,
3054                           int src_stride, int dst_stride,
3055                           const uint8* src_ptr, uint8* dst_ptr) {
3056   assert(dst_width > 0);
3057   assert(dst_height > 0);
3058   int dy = (src_height << 16) / dst_height;
3059   int dx = (src_width << 16) / dst_width;
3060   if ((src_width % 16 != 0) || (src_width > kMaxInputWidth) ||
3061       dst_height * 2 > src_height) {
3062     uint8* dst = dst_ptr;
3063     int dy = (src_height << 16) / dst_height;
3064     int dx = (src_width << 16) / dst_width;
3065     int y = 0;
3066     for (int j = 0; j < dst_height; ++j) {
3067       int iy = y >> 16;
3068       const uint8* const src = src_ptr + iy * src_stride;
3069       y += dy;
3070       if (y > (src_height << 16)) {
3071         y = (src_height << 16);
3072       }
3073       int boxheight = (y >> 16) - iy;
3074       ScalePlaneBoxRow(dst_width, boxheight,
3075                        dx, src_stride,
3076                        src, dst);
3077 
3078       dst += dst_stride;
3079     }
3080   } else {
3081     ALIGN16(uint16 row[kMaxInputWidth]);
3082     void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
3083                          uint16* dst_ptr, int src_width, int src_height);
3084     void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
3085                          const uint16* src_ptr, uint8* dst_ptr);
3086 #if defined(HAS_SCALEADDROWS_SSE2)
3087     if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
3088         (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
3089         (src_width % 16) == 0) {
3090       ScaleAddRows = ScaleAddRows_SSE2;
3091     } else
3092 #endif
3093     {
3094       ScaleAddRows = ScaleAddRows_C;
3095     }
3096     if (dx & 0xffff) {
3097       ScaleAddCols = ScaleAddCols2_C;
3098     } else {
3099       ScaleAddCols = ScaleAddCols1_C;
3100     }
3101 
3102     int y = 0;
3103     for (int j = 0; j < dst_height; ++j) {
3104       int iy = y >> 16;
3105       const uint8* const src = src_ptr + iy * src_stride;
3106       y += dy;
3107       if (y > (src_height << 16)) {
3108         y = (src_height << 16);
3109       }
3110       int boxheight = (y >> 16) - iy;
3111       ScaleAddRows(src, src_stride, row, src_width, boxheight);
3112       ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
3113       dst_ptr += dst_stride;
3114     }
3115   }
3116 }
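
// Editor's sketch (illustrative only): how the 16.16 y stepping above
// partitions source rows into boxes. boxheight is the number of whole
// source rows the accumulator crosses per destination row, so the boxes
// tile the source exactly even for non-integer ratios.
static void BoxHeightSketch(int src_height, int dst_height) {
  int dy = (src_height << 16) / dst_height;
  int y = 0;
  for (int j = 0; j < dst_height; ++j) {
    int iy = y >> 16;
    y += dy;
    if (y > (src_height << 16)) {
      y = (src_height << 16);
    }
    int boxheight = (y >> 16) - iy;
    (void)boxheight;  // e.g. src_height 5, dst_height 2: boxheight is 2 then 3.
  }
}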
3117 
3118 /**
3119  * Scale plane to/from any dimensions, with interpolation.
3120  */
3121 static void ScalePlaneBilinearSimple(int src_width, int src_height,
3122                                      int dst_width, int dst_height,
3123                                      int src_stride, int dst_stride,
3124                                      const uint8* src_ptr, uint8* dst_ptr) {
3125   uint8* dst = dst_ptr;
3126   int dx = (src_width << 16) / dst_width;
3127   int dy = (src_height << 16) / dst_height;
3128   int maxx = ((src_width - 1) << 16) - 1;
3129   int maxy = ((src_height - 1) << 16) - 1;
3130   int y = (dst_height < src_height) ? 32768 :
3131       (src_height << 16) / dst_height - 32768;
3132   for (int i = 0; i < dst_height; ++i) {
3133     int cy = (y < 0) ? 0 : y;
3134     int yi = cy >> 16;
3135     int yf = cy & 0xffff;
3136     const uint8* const src = src_ptr + yi * src_stride;
3137     int x = (dst_width < src_width) ? 32768 :
3138         (src_width << 16) / dst_width - 32768;
3139     for (int j = 0; j < dst_width; ++j) {
3140       int cx = (x < 0) ? 0 : x;
3141       int xi = cx >> 16;
3142       int xf = cx & 0xffff;
3143       int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
3144       int r1 = (src[xi + src_stride] * (65536 - xf) +
3145           src[xi + src_stride + 1] * xf) >> 16;
3146       *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
3147       x += dx;
3148       if (x > maxx)
3149         x = maxx;
3150     }
3151     dst += dst_stride - dst_width;
3152     y += dy;
3153     if (y > maxy)
3154       y = maxy;
3155   }
3156 }
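
// Editor's sketch (not libyuv API): one bilinear tap in the same 16.16
// arithmetic as the inner loop above. Two horizontal lerps (r0, r1) are
// followed by one vertical lerp. With xf = yf = 32768, neighbours
// 10, 20, 30, 40 blend to 25, the plain four-pixel average.
static inline uint8 BilinearTapSketch(const uint8* src, int src_stride,
                                      int xi, int xf, int yf) {
  int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
  int r1 = (src[xi + src_stride] * (65536 - xf) +
            src[xi + src_stride + 1] * xf) >> 16;
  return static_cast<uint8>((r0 * (65536 - yf) + r1 * yf) >> 16);
}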
3157 
3158 /**
3159  * Scale plane to/from any dimensions, with bilinear
3160  * interpolation.
3161  */
3162 static void ScalePlaneBilinear(int src_width, int src_height,
3163                                int dst_width, int dst_height,
3164                                int src_stride, int dst_stride,
3165                                const uint8* src_ptr, uint8* dst_ptr) {
3166   assert(dst_width > 0);
3167   assert(dst_height > 0);
3168   int dy = (src_height << 16) / dst_height;
3169   int dx = (src_width << 16) / dst_width;
3170   if ((src_width % 8 != 0) || (src_width > kMaxInputWidth)) {
3171     ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
3172                              src_stride, dst_stride, src_ptr, dst_ptr);
3173 
3174   } else {
3175     ALIGN16(uint8 row[kMaxInputWidth + 1]);
3176     void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
3177                             int src_stride,
3178                             int dst_width, int source_y_fraction);
3179     void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
3180                             int dst_width, int dx);
3181 #if defined(HAS_SCALEFILTERROWS_SSSE3)
3182     if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
3183         (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
3184         (src_width % 16) == 0) {
3185       ScaleFilterRows = ScaleFilterRows_SSSE3;
3186     } else
3187 #endif
3188 #if defined(HAS_SCALEFILTERROWS_SSE2)
3189     if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
3190         (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
3191         (src_width % 16) == 0) {
3192       ScaleFilterRows = ScaleFilterRows_SSE2;
3193     } else
3194 #endif
3195     {
3196       ScaleFilterRows = ScaleFilterRows_C;
3197     }
3198     ScaleFilterCols = ScaleFilterCols_C;
3199 
3200     int y = 0;
3201     int maxy = ((src_height - 1) << 16) - 1;  // Max is filter of last 2 rows.
3202     for (int j = 0; j < dst_height; ++j) {
3203       int iy = y >> 16;
3204       int fy = (y >> 8) & 255;
3205       const uint8* const src = src_ptr + iy * src_stride;
3206       ScaleFilterRows(row, src, src_stride, src_width, fy);
3207       ScaleFilterCols(dst_ptr, row, dst_width, dx);
3208       dst_ptr += dst_stride;
3209       y += dy;
3210       if (y > maxy) {
3211         y = maxy;
3212       }
3213     }
3214   }
3215 }
3216 
3217 /**
3218  * Scale plane to/from any dimensions, without interpolation.
3219  * Fixed point math is used for performance: the upper 16 bits
3220  * of x and dx are the integer part of the source position and
3221  * the lower 16 bits are the fractional part.
3222  */
3223 static void ScalePlaneSimple(int src_width, int src_height,
3224                              int dst_width, int dst_height,
3225                              int src_stride, int dst_stride,
3226                              const uint8* src_ptr, uint8* dst_ptr) {
3227   uint8* dst = dst_ptr;
3228   int dx = (src_width << 16) / dst_width;
3229   for (int y = 0; y < dst_height; ++y) {
3230     const uint8* const src = src_ptr + (y * src_height / dst_height) *
3231         src_stride;
3232     // TODO(fbarchard): Round X coordinate by setting x=0x8000.
3233     int x = 0;
3234     for (int i = 0; i < dst_width; ++i) {
3235       *dst++ = src[x >> 16];
3236       x += dx;
3237     }
3238     dst += dst_stride - dst_width;
3239   }
3240 }
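
// Editor's sketch: the 16.16 nearest-neighbor stepping above in
// isolation. Mapping an 8-pixel source row onto 3 destination pixels
// gives dx = (8 << 16) / 3 = 174762 (2.666... in 16.16), so x >> 16
// selects source columns 0, 2 and 5.
static void NearestStepSketch() {
  int src_width = 8;
  int dst_width = 3;
  int dx = (src_width << 16) / dst_width;
  int x = 0;
  for (int i = 0; i < dst_width; ++i) {
    int src_index = x >> 16;  // 0, 2, 5
    (void)src_index;
    x += dx;
  }
}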
3241 
3242 /**
3243  * Scale plane to/from any dimensions.
3244  */
3245 static void ScalePlaneAnySize(int src_width, int src_height,
3246                               int dst_width, int dst_height,
3247                               int src_stride, int dst_stride,
3248                               const uint8* src_ptr, uint8* dst_ptr,
3249                               FilterMode filtering) {
3250   if (!filtering) {
3251     ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
3252                      src_stride, dst_stride, src_ptr, dst_ptr);
3253   } else {
3254     // Fall back to the general bilinear implementation.
3255     ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
3256                        src_stride, dst_stride, src_ptr, dst_ptr);
3257   }
3258 }
3259 
3260 /**
3261  * Scale plane down to any size.
3262  *
3263  * This is an optimized version for scaling down a plane to any size.
3264  * The current implementation is ~10 times faster than the
3265  * reference implementation for e.g. XGA->LowResPAL.
3266  *
3267  */
3268 static void ScalePlaneDown(int src_width, int src_height,
3269                            int dst_width, int dst_height,
3270                            int src_stride, int dst_stride,
3271                            const uint8* src_ptr, uint8* dst_ptr,
3272                            FilterMode filtering) {
3273   if (!filtering) {
3274     ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
3275                      src_stride, dst_stride, src_ptr, dst_ptr);
3276   } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) {
3277     // Between 1/2x and 1x, use bilinear.
3278     ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
3279                        src_stride, dst_stride, src_ptr, dst_ptr);
3280   } else {
3281     ScalePlaneBox(src_width, src_height, dst_width, dst_height,
3282                   src_stride, dst_stride, src_ptr, dst_ptr);
3283   }
3284 }
3285 
3286 /**
3287  * Copy plane, no scaling
3288  *
3289  * This simply copies the given plane without scaling.
3290  * The current implementation is ~115 times faster
3291  * than the reference implementation.
3292  *
3293  */
3294 static void CopyPlane(int src_width, int src_height,
3295                       int dst_width, int dst_height,
3296                       int src_stride, int dst_stride,
3297                       const uint8* src_ptr, uint8* dst_ptr) {
3298   if (src_stride == src_width && dst_stride == dst_width) {
3299     // All contiguous, so can use REALLY fast path.
3300     memcpy(dst_ptr, src_ptr, src_width * src_height);
3301   } else {
3302     // Not all contiguous; must copy scanlines individually
3303     const uint8* src = src_ptr;
3304     uint8* dst = dst_ptr;
3305     for (int i = 0; i < src_height; ++i) {
3306       memcpy(dst, src, src_width);
3307       dst += dst_stride;
3308       src += src_stride;
3309     }
3310   }
3311 }
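
// Editor's sketch of a CopyPlane call with a padded stride (buffer
// shapes here are illustrative): because src_stride(8) != src_width(4),
// the rows are not adjacent in memory and the per-row path is taken;
// with stride == width a single memcpy covers the whole plane.
static void CopyPlaneSketch() {
  uint8 src[8 * 2] = {0};  // 4x2 plane stored with a stride of 8
  uint8 dst[4 * 2];
  CopyPlane(4, 2, 4, 2, 8, 4, src, dst);
}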
3312 
3313 static void ScalePlane(const uint8* src, int src_stride,
3314                        int src_width, int src_height,
3315                        uint8* dst, int dst_stride,
3316                        int dst_width, int dst_height,
3317                        FilterMode filtering, bool use_ref) {
3318   // Use specialized scales to improve performance for common resolutions.
3319   // For example, all the 1/2 scalings will use ScalePlaneDown2()
3320   if (dst_width == src_width && dst_height == src_height) {
3321     // Straight copy.
3322     CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
3323               dst_stride, src, dst);
3324   } else if (dst_width <= src_width && dst_height <= src_height) {
3325     // Scale down.
3326     if (use_ref) {
3327       // For testing, allow the optimized versions to be disabled.
3328       ScalePlaneDown(src_width, src_height, dst_width, dst_height,
3329                      src_stride, dst_stride, src, dst, filtering);
3330     } else if (4 * dst_width == 3 * src_width &&
3331                4 * dst_height == 3 * src_height) {
3332       // optimized, 3/4
3333       ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
3334                        src_stride, dst_stride, src, dst, filtering);
3335     } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
3336       // optimized, 1/2
3337       ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
3338                       src_stride, dst_stride, src, dst, filtering);
3339     // 3/8 rounded up for odd-sized chroma height.
3340     } else if (8 * dst_width == 3 * src_width &&
3341                dst_height == ((src_height * 3 + 7) / 8)) {
3342       // optimized, 3/8
3343       ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
3344                        src_stride, dst_stride, src, dst, filtering);
3345     } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
3346       // optimized, 1/4
3347       ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
3348                       src_stride, dst_stride, src, dst, filtering);
3349     } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
3350       // optimized, 1/8
3351       ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
3352                       src_stride, dst_stride, src, dst, filtering);
3353     } else {
3354       // Arbitrary downsample
3355       ScalePlaneDown(src_width, src_height, dst_width, dst_height,
3356                      src_stride, dst_stride, src, dst, filtering);
3357     }
3358   } else {
3359     // Arbitrary scale up and/or down.
3360     ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
3361                       src_stride, dst_stride, src, dst, filtering);
3362   }
3363 }
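
// Editor's note (sketch): the exact-ratio tests above cross-multiply to
// avoid integer-division rounding. For example, 640x480 -> 480x360
// satisfies 4 * 480 == 3 * 640 and 4 * 360 == 3 * 480, so ScalePlane
// dispatches to the specialized ScalePlaneDown34 path.
static bool IsThreeQuarterScaleSketch(int src_width, int src_height,
                                      int dst_width, int dst_height) {
  return 4 * dst_width == 3 * src_width &&
         4 * dst_height == 3 * src_height;
}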
3364 
3365 /**
3366  * Scale an I420 frame.
3367  *
3368  * For each plane, this function in turn calls a scaling function
3369  * suitable for handling the desired resolutions.
3370  *
3371  */
3372 
3373 int I420Scale(const uint8* src_y, int src_stride_y,
3374               const uint8* src_u, int src_stride_u,
3375               const uint8* src_v, int src_stride_v,
3376               int src_width, int src_height,
3377               uint8* dst_y, int dst_stride_y,
3378               uint8* dst_u, int dst_stride_u,
3379               uint8* dst_v, int dst_stride_v,
3380               int dst_width, int dst_height,
3381               FilterMode filtering) {
3382   if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
3383       !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
3384     return -1;
3385   }
3386   // Negative height means invert the image.
3387   if (src_height < 0) {
3388     src_height = -src_height;
3389     int halfheight = (src_height + 1) >> 1;
3390     src_y = src_y + (src_height - 1) * src_stride_y;
3391     src_u = src_u + (halfheight - 1) * src_stride_u;
3392     src_v = src_v + (halfheight - 1) * src_stride_v;
3393     src_stride_y = -src_stride_y;
3394     src_stride_u = -src_stride_u;
3395     src_stride_v = -src_stride_v;
3396   }
3397   int halfsrc_width = (src_width + 1) >> 1;
3398   int halfsrc_height = (src_height + 1) >> 1;
3399   int halfdst_width = (dst_width + 1) >> 1;
3400   int halfdst_height = (dst_height + 1) >> 1;
3401 
3402   ScalePlane(src_y, src_stride_y, src_width, src_height,
3403              dst_y, dst_stride_y, dst_width, dst_height,
3404              filtering, use_reference_impl_);
3405   ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
3406              dst_u, dst_stride_u, halfdst_width, halfdst_height,
3407              filtering, use_reference_impl_);
3408   ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
3409              dst_v, dst_stride_v, halfdst_width, halfdst_height,
3410              filtering, use_reference_impl_);
3411   return 0;
3412 }
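
// Editor's sketch of a typical I420Scale call, downscaling VGA to QVGA.
// The packed plane layout and stride == width are assumptions of this
// example, not requirements of the API.
static int I420ScaleSketch(const uint8* src, uint8* dst) {
  const uint8* src_y = src;              // 640x480 luma
  const uint8* src_u = src + 640 * 480;  // 320x240 chroma
  const uint8* src_v = src_u + 320 * 240;
  uint8* dst_y = dst;                    // 320x240 luma
  uint8* dst_u = dst + 320 * 240;        // 160x120 chroma
  uint8* dst_v = dst_u + 160 * 120;
  return I420Scale(src_y, 640, src_u, 320, src_v, 320, 640, 480,
                   dst_y, 320, dst_u, 160, dst_v, 160, 320, 240,
                   kFilterBox);
}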
3413 
3414 int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
3415           int src_stride_y, int src_stride_u, int src_stride_v,
3416           int src_width, int src_height,
3417           uint8* dst_y, uint8* dst_u, uint8* dst_v,
3418           int dst_stride_y, int dst_stride_u, int dst_stride_v,
3419           int dst_width, int dst_height,
3420           bool interpolate) {
3421   if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
3422       !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
3423     return -1;
3424   }
3425   // Negative height means invert the image.
3426   if (src_height < 0) {
3427     src_height = -src_height;
3428     int halfheight = (src_height + 1) >> 1;
3429     src_y = src_y + (src_height - 1) * src_stride_y;
3430     src_u = src_u + (halfheight - 1) * src_stride_u;
3431     src_v = src_v + (halfheight - 1) * src_stride_v;
3432     src_stride_y = -src_stride_y;
3433     src_stride_u = -src_stride_u;
3434     src_stride_v = -src_stride_v;
3435   }
3436   int halfsrc_width = (src_width + 1) >> 1;
3437   int halfsrc_height = (src_height + 1) >> 1;
3438   int halfdst_width = (dst_width + 1) >> 1;
3439   int halfdst_height = (dst_height + 1) >> 1;
3440   FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
3441 
3442   ScalePlane(src_y, src_stride_y, src_width, src_height,
3443              dst_y, dst_stride_y, dst_width, dst_height,
3444              filtering, use_reference_impl_);
3445   ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
3446              dst_u, dst_stride_u, halfdst_width, halfdst_height,
3447              filtering, use_reference_impl_);
3448   ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
3449              dst_v, dst_stride_v, halfdst_width, halfdst_height,
3450              filtering, use_reference_impl_);
3451   return 0;
3452 }
3453 
3454 int Scale(const uint8* src, int src_width, int src_height,
3455           uint8* dst, int dst_width, int dst_height, int ooffset,
3456           bool interpolate) {
3457   if (!src || src_width <= 0 || src_height <= 0 ||
3458       !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 ||
3459       ooffset >= dst_height) {
3460     return -1;
3461   }
3462   ooffset = ooffset & ~1;  // Chroma requires the offset to be a multiple of 2.
3463   int halfsrc_width = (src_width + 1) >> 1;
3464   int halfsrc_height = (src_height + 1) >> 1;
3465   int halfdst_width = (dst_width + 1) >> 1;
3466   int halfdst_height = (dst_height + 1) >> 1;
3467   int aheight = dst_height - ooffset * 2;  // actual output height
3468   const uint8* const iyptr = src;
3469   uint8* oyptr = dst + ooffset * dst_width;
3470   const uint8* const iuptr = src + src_width * src_height;
3471   uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width;
3472   const uint8* const ivptr = src + src_width * src_height +
3473                              halfsrc_width * halfsrc_height;
3474   uint8* ovptr = dst + dst_width * dst_height + halfdst_width *
3475                  halfdst_height + (ooffset >> 1) * halfdst_width;
3476   return Scale(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width,
3477                src_width, src_height, oyptr, ouptr, ovptr, dst_width,
3478                halfdst_width, halfdst_width, dst_width, aheight, interpolate);
3479 }
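
// Editor's note (sketch): the pointer arithmetic above assumes one
// packed I420 buffer per image: Y (width * height bytes) followed by U
// and then V (each halfwidth * halfheight bytes). For a 6x4 frame the
// offsets work out to Y at 0, U at 24, V at 30, 36 bytes in total.
static void PackedI420OffsetsSketch(int width, int height) {
  int half_width = (width + 1) >> 1;
  int half_height = (height + 1) >> 1;
  int u_offset = width * height;
  int v_offset = u_offset + half_width * half_height;
  (void)u_offset;  // 24 for a 6x4 frame
  (void)v_offset;  // 30 for a 6x4 frame
}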
3480 
3481 }  // namespace libyuv
3482