/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
    !defined(__clang__) && defined(_M_IX86)
// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
                              8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
                              10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant (+2) applied before the >>2 in the 3/4 box filters.
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
                               128, 128, 128, 128, 128, 128, 128, 128};

static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
                               6, 8, 11, 14, 128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
                              128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
                               6, 7, 12, 13, 128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                                  65536 / 9, 65536 / 6, 0, 0};

// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
                               11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
                               12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
                               13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                                 65536 / 3, 65536 / 2, 0, 0};

// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8_t* dst_ptr,
                                           int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    ret
  }
}
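
// For reference, a scalar sketch of what the loop above computes. The helper
// name and the #if 0 guard are illustrative only; this is not part of the
// libyuv API (the shipped C fallbacks live elsewhere in libyuv).
#if 0
static void ScaleRowDown2_SketchC(const uint8_t* src_ptr,
                                  uint8_t* dst_ptr,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];  // psrlw 8 keeps the odd byte of each pair.
  }
}
#endif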

// Blends 32x1 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                                                 ptrdiff_t src_stride,
                                                 uint8_t* dst_ptr,
                                                 int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5  // constant 0

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pavgw xmm0, xmm5  // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    ret
  }
}
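
// A scalar sketch of the horizontal-only blend above, assuming the same
// pmaddubsw-by-0x0101 pair sum followed by pavgw rounding; the helper name
// is hypothetical.
#if 0
static void ScaleRowDown2Linear_SketchC(const uint8_t* src_ptr,
                                        uint8_t* dst_ptr,
                                        int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    // pmaddubsw sums each byte pair; pavgw with 0 is (sum + 1) >> 1.
    dst_ptr[x] = (uint8_t)((src_ptr[x * 2] + src_ptr[x * 2 + 1] + 1) >> 1);
  }
}
#endif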

// Blends 32x2 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_ptr,
                                              int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width

    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5  // constant 0

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // vertical add
    paddw xmm1, xmm3
    psrlw xmm0, 1
    psrlw xmm1, 1
    pavgw xmm0, xmm5  // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    pop esi
    ret
  }
}
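
// A scalar sketch of the 2x2 box above, showing why psrlw 1 then pavgw 0 is
// a correctly rounded /4: (sum + 2) >> 2 == ((sum >> 1) + 1) >> 1 for any
// non-negative sum. The helper name is illustrative.
#if 0
static void ScaleRowDown2Box_SketchC(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8_t* dst_ptr,
                                     int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    int sum = s[x * 2] + s[x * 2 + 1] + t[x * 2] + t[x * 2 + 1];
    dst_ptr[x] = (uint8_t)((sum + 2) >> 2);  // round-to-nearest average of 4.
  }
}
#endif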

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8  // isolate odd pixels.
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
                                                ptrdiff_t src_stride,
                                                uint8_t* dst_ptr,
                                                int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

    vpcmpeqb ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpavgw ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8_t* dst_ptr,
                                             int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width

    vpcmpeqb ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // vertical add
    vpaddw ymm1, ymm1, ymm3
    vpsrlw ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw ymm1, ymm1, 1
    vpavgw ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8_t* dst_ptr,
                                           int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff0000
    psrld xmm5, 24
    pslld xmm5, 16

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_ptr,
                                              int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_ptr
    mov esi, [esp + 8 + 8]  // src_stride
    mov edx, [esp + 8 + 12]  // dst_ptr
    mov ecx, [esp + 8 + 16]  // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    movdqa xmm5, xmm4
    packuswb xmm4, xmm4
    psllw xmm5, 3  // constant 0x0008

  wloop:
    movdqu xmm0, [eax]  // average rows
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // vertical add rows 0, 1
    paddw xmm1, xmm3
    movdqu xmm2, [eax + esi * 2]
    movdqu xmm3, [eax + esi * 2 + 16]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // add row 2
    paddw xmm1, xmm3
    movdqu xmm2, [eax + edi]
    movdqu xmm3, [eax + edi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // add row 3
    paddw xmm1, xmm3
    phaddw xmm0, xmm1
    paddw xmm0, xmm5  // + 8 for round
    psrlw xmm0, 4  // /16 for average of 4 * 4
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop

    pop edi
    pop esi
    ret
  }
}
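
// A scalar sketch of the 4x4 box above: each output byte is the rounded
// average of a 4x4 block, i.e. (sum of 16 bytes + 8) >> 4. Names are
// illustrative.
#if 0
static void ScaleRowDown4Box_SketchC(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8_t* dst_ptr,
                                     int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int r, c, sum = 0;
    for (r = 0; r < 4; ++r) {
      for (c = 0; c < 4; ++c) {
        sum += src_ptr[r * src_stride + x * 4 + c];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);  // matches the +8, >>4 above.
  }
}
#endif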

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff0000
    vpsrld ymm5, ymm5, 24
    vpslld ymm5, ymm5, 16

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8_t* dst_ptr,
                                             int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_ptr
    mov esi, [esp + 8 + 8]  // src_stride
    mov edx, [esp + 8 + 12]  // dst_ptr
    mov ecx, [esp + 8 + 16]  // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb ymm4, ymm4, ymm4  // constant 0x0101
    vpsrlw ymm4, ymm4, 15
    vpsllw ymm5, ymm4, 3  // constant 0x0008
    vpackuswb ymm4, ymm4, ymm4

  wloop:
    vmovdqu ymm0, [eax]  // average rows
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // vertical add rows 0, 1
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + esi * 2]
    vmovdqu ymm3, [eax + esi * 2 + 32]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // add row 2
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + edi]
    vmovdqu ymm3, [eax + edi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // add row 3
    vpaddw ymm1, ymm1, ymm3
    vphaddw ymm0, ymm0, ymm1  // mutates
    vpermq ymm0, ymm0, 0xd8  // unmutate vphaddw
    vpaddw ymm0, ymm0, ymm5  // + 8 for round
    vpsrlw ymm0, ymm0, 4  // /16 for average of 4 * 4
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst_ptr,
                                            int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm3, xmmword ptr kShuf0
    movdqa xmm4, xmmword ptr kShuf1
    movdqa xmm5, xmmword ptr kShuf2

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm1
    palignr xmm1, xmm0, 8
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    ret
  }
}
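
// A scalar sketch of the 3/4 point sampler above: per the kShuf tables, each
// group of 4 source pixels keeps pixels 0, 1 and 3. The name is illustrative.
#if 0
static void ScaleRowDown34_SketchC(const uint8_t* src_ptr,
                                   uint8_t* dst_ptr,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}
#endif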

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palignr may be better than movdqu.
__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34

  wloop:
    movdqu xmm0, [eax]  // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16]  // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    pop esi
    ret
  }
}
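
// A scalar sketch of the filter above, assuming the same structure: pavgb
// the two rows 1:1, then the 3:1 / 2:2 / 1:3 horizontal blend from the kMadd
// tables with +2 rounding and >>2. The helper name is hypothetical.
#if 0
static void ScaleRowDown34_1_Box_SketchC(const uint8_t* src_ptr,
                                         ptrdiff_t src_stride,
                                         uint8_t* dst_ptr,
                                         int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; x += 3) {
    int p0 = (s[0] + t[0] + 1) >> 1;  // pavgb of the two rows.
    int p1 = (s[1] + t[1] + 1) >> 1;
    int p2 = (s[2] + t[2] + 1) >> 1;
    int p3 = (s[3] + t[3] + 1) >> 1;
    dst_ptr[x + 0] = (uint8_t)((p0 * 3 + p1 * 1 + 2) >> 2);
    dst_ptr[x + 1] = (uint8_t)((p1 * 2 + p2 * 2 + 2) >> 2);
    dst_ptr[x + 2] = (uint8_t)((p2 * 1 + p3 * 3 + 2) >> 2);
    s += 4;
    t += 4;
  }
}
#endif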

// Note that movdqa+palignr may be better than movdqu.
__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34

  wloop:
    movdqu xmm0, [eax]  // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm1, xmm0  // double pavgb weights row0 3 : 1 over row1
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16]  // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    pop esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst_ptr,
                                            int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm4, xmmword ptr kShuf38a
    movdqa xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
    movdqu xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea eax, [eax + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1

    movq qword ptr [edx], xmm0  // write 12 pixels
    movhlps xmm1, xmm0
    movd [edx + 8], xmm1
    lea edx, [edx + 12]
    sub ecx, 12
    jg xloop

    ret
  }
}
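
// A scalar sketch of the 3/8 point sampler above: kShuf38a/b pick source
// bytes {0, 3, 6, 8, 11, 14} out of every 16. Names are illustrative.
#if 0
static void ScaleRowDown38_SketchC(const uint8_t* src_ptr,
                                   uint8_t* dst_ptr,
                                   int dst_width) {
  static const int kIdx38[6] = {0, 3, 6, 8, 11, 14};
  int x, i;
  for (x = 0; x < dst_width; x += 6) {
    for (i = 0; i < 6; ++i) {
      dst_ptr[x + i] = src_ptr[kIdx38[i]];
    }
    src_ptr += 16;
  }
}
#endif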

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShufAc
    movdqa xmm3, xmmword ptr kShufAc3
    movdqa xmm4, xmmword ptr kScaleAc33
    pxor xmm5, xmm5

  xloop:
    movdqu xmm0, [eax]  // sum up 3 rows into xmm0/1
    movdqu xmm6, [eax + esi]
    movhlps xmm1, xmm0
    movhlps xmm7, xmm6
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqu xmm6, [eax + esi * 2]
    lea eax, [eax + 16]
    movhlps xmm7, xmm6
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7

    movdqa xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    pshufb xmm6, xmm2

    movdqa xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    pshufb xmm7, xmm3
    paddusw xmm6, xmm7

    pmulhuw xmm6, xmm4  // divide by 9,9,6, 9,9,6
    packuswb xmm6, xmm6

    movd [edx], xmm6  // write 6 pixels
    psrlq xmm6, 16
    movd [edx + 2], xmm6
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop

    pop esi
    ret
  }
}
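
// A scalar sketch of the 3-row box above: every 8 source columns become 3
// outputs (two 3x3 boxes and a trailing 2x3 box), divided via the fixed-point
// reciprocals in kScaleAc33 as pmulhuw does. The name is illustrative.
#if 0
static void ScaleRowDown38_3_Box_SketchC(const uint8_t* src_ptr,
                                         ptrdiff_t src_stride,
                                         uint8_t* dst_ptr,
                                         int dst_width) {
  int x, i;
  for (x = 0; x < dst_width; x += 3) {
    int sum0 = 0, sum1 = 0, sum2 = 0;
    for (i = 0; i < 3; ++i) {  // three source rows
      const uint8_t* s = src_ptr + i * src_stride;
      sum0 += s[0] + s[1] + s[2];
      sum1 += s[3] + s[4] + s[5];
      sum2 += s[6] + s[7];  // the last box is only 2 columns wide.
    }
    dst_ptr[x + 0] = (uint8_t)((sum0 * (65536 / 9)) >> 16);
    dst_ptr[x + 1] = (uint8_t)((sum1 * (65536 / 9)) >> 16);
    dst_ptr[x + 2] = (uint8_t)((sum2 * (65536 / 6)) >> 16);
    src_ptr += 8;
  }
}
#endif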

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShufAb0
    movdqa xmm3, xmmword ptr kShufAb1
    movdqa xmm4, xmmword ptr kShufAb2
    movdqa xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu xmm0, [eax]  // average 2 rows into xmm0
    movdqu xmm1, [eax + esi]
    lea eax, [eax + 16]
    pavgb xmm0, xmm1

    movdqa xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb xmm1, xmm2
    movdqa xmm6, xmm0
    pshufb xmm6, xmm3
    paddusw xmm1, xmm6
    pshufb xmm0, xmm4
    paddusw xmm1, xmm0

    pmulhuw xmm1, xmm5  // divide by 3,3,2, 3,3,2
    packuswb xmm1, xmm1

    movd [edx], xmm1  // write 6 pixels
    psrlq xmm1, 16
    movd [edx + 2], xmm1
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop

    pop esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                                        uint16_t* dst_ptr,
                                        int src_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    mov edx, [esp + 8]  // dst_ptr
    mov ecx, [esp + 12]  // src_width
    pxor xmm5, xmm5

    // sum rows
  xloop:
    movdqu xmm3, [eax]  // read 16 bytes
    lea eax, [eax + 16]
    movdqu xmm0, [edx]  // read 16 words from destination
    movdqu xmm1, [edx + 16]
    movdqa xmm2, xmm3
    punpcklbw xmm2, xmm5
    punpckhbw xmm3, xmm5
    paddusw xmm0, xmm2  // sum 16 words
    paddusw xmm1, xmm3
    movdqu [edx], xmm0  // write 16 words to destination
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 16
    jg xloop
    ret
  }
}
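
// A scalar sketch of the row accumulator above; paddusw saturates at 65535,
// which the sketch mimics. The helper name is hypothetical.
#if 0
static void ScaleAddRow_SketchC(const uint8_t* src_ptr,
                                uint16_t* dst_ptr,
                                int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    unsigned int sum = dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);  // paddusw saturation.
  }
}
#endif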

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                                        uint16_t* dst_ptr,
                                        int src_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    mov edx, [esp + 8]  // dst_ptr
    mov ecx, [esp + 12]  // src_width
    vpxor ymm5, ymm5, ymm5

    // sum rows
  xloop:
    vmovdqu ymm3, [eax]  // read 32 bytes
    lea eax, [eax + 32]
    vpermq ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw ymm2, ymm3, ymm5
    vpunpckhbw ymm3, ymm3, ymm5
    vpaddusw ymm0, ymm2, [edx]  // sum 16 words
    vpaddusw ymm1, ymm3, [edx + 32]
    vmovdqu [edx], ymm0  // write 32 words to destination
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 32
    jg xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                               0x4040, 0x4040, 0x4040, 0x4040};

// Bilinear column filtering. SSSE3 version.
__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
                                             const uint8_t* src_ptr,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4]  // dst_ptr
    mov esi, [esp + 12 + 8]  // src_ptr
    mov ecx, [esp + 12 + 12]  // dst_width
    movd xmm2, [esp + 12 + 16]  // x
    movd xmm3, [esp + 12 + 20]  // dx
    mov eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pcmpeqb xmm7, xmm7  // generate 0x0001
    psrlw xmm7, 15
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9  // 7 bit fractions.
    movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5  // 0011
    punpcklwd xmm0, xmm4
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm1, xmm6  // 0..7f and 7f..0
    paddusb xmm1, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm1, xmm0  // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    paddw xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm1, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm1, xmm1  // 8 bits, 2 pixels.
    movd ebx, xmm1
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2  // 2 pixels
    jge xloop2

  xloop29:
    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9  // 7 bit fractions.
    pshufb xmm2, xmm5  // 0011
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm2, xmm6  // 0..7f and 7f..0
    paddusb xmm2, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm2, xmm0  // 16 bit
    paddw xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm2, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm2, xmm2  // 8 bits
    movd ebx, xmm2
    mov [edi], bl

  xloop99:

    pop edi
    pop esi
    pop ebx
    ret
  }
}
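
// A scalar sketch of the bilinear column filter above: x and dx are 16.16
// fixed point, the blend uses the top 7 fraction bits, and kFsub80/kFadd40
// move pixels into the signed domain and back so pmaddubsw cannot saturate.
// The helper name is hypothetical.
#if 0
static void ScaleFilterCols_SketchC(uint8_t* dst_ptr,
                                    const uint8_t* src_ptr,
                                    int dst_width,
                                    int x,
                                    int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer source position (pextrw word 1).
    int f = (x >> 9) & 0x7f;  // 7 bit fraction (psrlw 9).
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)((a * (128 - f) + b * f + 64) >> 7);
    x += dx;
  }
}
#endif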

// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
                                         const uint8_t* src_ptr,
                                         int dst_width,
                                         int x,
                                         int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_ptr
    mov eax, [esp + 8]  // src_ptr
    mov ecx, [esp + 12]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_argb,
                                              int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd  // keep odd pixels
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    ret
  }
}
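
// A scalar sketch of the ARGB point sampler above, treating each ARGB pixel
// as one uint32_t; shufps 0xdd selects the odd pixels. Name is illustrative.
#if 0
static void ScaleARGBRowDown2_SketchC(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      int dst_width) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * 2 + 1];
  }
}
#endif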

// Blends 8x1 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
                                                    ptrdiff_t src_stride,
                                                    uint8_t* dst_argb,
                                                    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                                                 ptrdiff_t src_stride,
                                                 uint8_t* dst_argb,
                                                 int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
                                                 ptrdiff_t src_stride,
                                                 int src_stepx,
                                                 uint8_t* dst_argb,
                                                 int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    // src_stride ignored
    mov ebx, [esp + 8 + 12]  // src_stepx
    mov edx, [esp + 8 + 16]  // dst_argb
    mov ecx, [esp + 8 + 20]  // dst_width
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
                                                    ptrdiff_t src_stride,
                                                    int src_stepx,
                                                    uint8_t* dst_argb,
                                                    int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]  // src_argb
    mov esi, [esp + 12 + 8]  // src_stride
    mov ebx, [esp + 12 + 12]  // src_stepx
    mov edx, [esp + 12 + 16]  // dst_argb
    mov ecx, [esp + 12 + 20]  // dst_width
    lea esi, [eax + esi]  // row1 pointer
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

  wloop:
    movq xmm0, qword ptr [eax]  // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]  // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
                                          const uint8_t* src_argb,
                                          int dst_width,
                                          int x,
                                          int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx

    pshufd xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11  // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0  // x3 x2 x1 x0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw eax, xmm2, 1  // get x0 integer.
    pextrw edx, xmm2, 3  // get x1 integer.

    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    pextrw edx, xmm2, 7  // get x3 integer.
    paddd xmm2, xmm3  // x += dx
    punpckldq xmm0, xmm1  // x0 x1

    movd xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4  // 4 pixels
    jge xloop4

  xloop49:
    test ecx, 2
    je xloop29

    // 2 pixel remainder.
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    punpckldq xmm0, xmm1  // x0 x1

    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]

  xloop29:
    test ecx, 1
    je xloop99

    // 1 pixel remainder.
    movd xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd dword ptr [edi], xmm0

  xloop99:

    pop esi
    pop edi
    ret
  }
}

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static const uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static const uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
                                                 const uint8_t* src_argb,
                                                 int dst_width,
                                                 int x,
                                                 int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    movdqa xmm4, xmmword ptr kShuffleColARGB
    movdqa xmm5, xmmword ptr kShuffleFractions
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw xmm1, 9  // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb xmm1, xmm5  // 0000000011111111
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm1, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    psrlw xmm0, 7  // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2  // 2 pixels
    jge xloop2

  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9  // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5  // 00000000
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm2, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2  // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0  // argb 8 bits, 1 pixel.
    movd [edi], xmm0

  xloop99:

    pop edi
    pop esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
                                             const uint8_t* src_argb,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_argb
    mov eax, [esp + 8]  // src_argb
    mov ecx, [esp + 12]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0
    punpckhdq xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}
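
// Equivalent C for the 16.16 fixed-point divide above, assuming 64-bit
// intermediate math (cdq/shld/shl build num << 16 across edx:eax for idiv).
// The helper name is illustrative.
#if 0
static int FixedDiv_SketchC(int num, int div) {
  return (int)((((int64_t)num) << 16) / div);
}
#endif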

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    mov ecx, [esp + 8]  // denom
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}
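
// Equivalent C for the variant above: it computes
// ((num << 16) - 0x00010001) / (div - 1) in 64 bits, matching the sub/sbb
// pair. The helper name is illustrative.
#if 0
static int FixedDiv1_SketchC(int num, int div) {
  return (int)(((((int64_t)num) << 16) - 0x00010001) / (div - 1));
}
#endif
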
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif