/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
                        10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant for the 3/4 box filters: +2 added before the >> 2.
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
                         128, 128, 128, 128, 128, 128, 128, 128};

static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
                         6, 8, 11, 14, 128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
                        128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
                         6, 7, 12, 13, 128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                            65536 / 9, 65536 / 6, 0, 0};

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
                         11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
                         12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
                         13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                           65536 / 3, 65536 / 2, 0, 0};
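
// Reference-only scalar sketch (unused; an illustration, not the library's
// API): pmulhuw computes (a * b) >> 16, so multiplying a box sum by
// 65536 / N approximates an unsigned divide by N. This is the trick behind
// kScaleAc33 and kScaleAb2 above.
static uint8 ScaleSumDivN_Sketch(uint16 sum, int n) {
  return (uint8)((sum * (65536 / n)) >> 16);  // 0.16 fixed-point reciprocal
}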

// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8* dst_ptr,
                                           int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    ret
  }
}
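
// Reference-only scalar sketch (unused): the psrlw 8 + packuswb pair above
// keeps byte 1 of each 16-bit pair, i.e. the odd source pixels.
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];  // keep the odd pixel of each pair
  }
}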

// Blends 32x1 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
                                                 ptrdiff_t src_stride,
                                                 uint8* dst_ptr,
                                                 int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5  // constant 0

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pavgw xmm0, xmm5  // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8* dst_ptr,
                                              int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width

    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5  // constant 0

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // vertical add
    paddw xmm1, xmm3
    psrlw xmm0, 1
    psrlw xmm1, 1
    pavgw xmm0, xmm5  // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    pop esi
    ret
  }
}
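
// Reference-only scalar sketch (unused): the box filters above round with
// (sum + 2) / 4 computed as pavgw(sum >> 1, 0); the two forms are identical
// for all sums.
static uint8 ScaleBox2x2_Sketch(uint8 a, uint8 b, uint8 c, uint8 d) {
  int sum = a + b + c + d;                 // pmaddubsw + paddw in SIMD
  return (uint8)(((sum >> 1) + 1) >> 1);   // same as (sum + 2) >> 2
}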

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr,
                                          int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8  // isolate odd pixels.
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
                                                ptrdiff_t src_stride,
                                                uint8* dst_ptr,
                                                int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

    vpcmpeqb ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpavgw ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8* dst_ptr,
                                             int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width

    vpcmpeqb ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // vertical add
    vpaddw ymm1, ymm1, ymm3
    vpsrlw ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw ymm1, ymm1, 1
    vpavgw ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8* dst_ptr,
                                           int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff0000
    psrld xmm5, 24
    pslld xmm5, 16

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8* dst_ptr,
                                              int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_ptr
    mov esi, [esp + 8 + 8]  // src_stride
    mov edx, [esp + 8 + 12]  // dst_ptr
    mov ecx, [esp + 8 + 16]  // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    movdqa xmm5, xmm4
    packuswb xmm4, xmm4
    psllw xmm5, 3  // constant 0x0008

  wloop:
    movdqu xmm0, [eax]  // average rows
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // vertical add rows 0, 1
    paddw xmm1, xmm3
    movdqu xmm2, [eax + esi * 2]
    movdqu xmm3, [eax + esi * 2 + 16]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // add row 2
    paddw xmm1, xmm3
    movdqu xmm2, [eax + edi]
    movdqu xmm3, [eax + edi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // add row 3
    paddw xmm1, xmm3
    phaddw xmm0, xmm1
    paddw xmm0, xmm5  // + 8 for round
    psrlw xmm0, 4  // /16 for average of 4 * 4
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop

    pop edi
    pop esi
    ret
  }
}
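
// Reference-only scalar sketch (unused) of the 4x4 box average above:
// sum the 16 pixels, add 8 for rounding, then shift right by 4.
static uint8 ScaleBox4x4_Sketch(const uint8* src_ptr, ptrdiff_t src_stride) {
  int sum = 0;
  int x, y;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      sum += src_ptr[y * src_stride + x];
    }
  }
  return (uint8)((sum + 8) >> 4);  // + 8 for round, /16 for average
}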

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr,
                                          int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff0000
    vpsrld ymm5, ymm5, 24
    vpslld ymm5, ymm5, 16

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8* dst_ptr,
                                             int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_ptr
    mov esi, [esp + 8 + 8]  // src_stride
    mov edx, [esp + 8 + 12]  // dst_ptr
    mov ecx, [esp + 8 + 16]  // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb ymm4, ymm4, ymm4  // constant 0x0101
    vpsrlw ymm4, ymm4, 15
    vpsllw ymm5, ymm4, 3  // constant 0x0008
    vpackuswb ymm4, ymm4, ymm4

  wloop:
    vmovdqu ymm0, [eax]  // average rows
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // vertical add rows 0, 1
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + esi * 2]
    vmovdqu ymm3, [eax + esi * 2 + 32]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // add row 2
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + edi]
    vmovdqu ymm3, [eax + edi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // add row 3
    vpaddw ymm1, ymm1, ymm3
    vphaddw ymm0, ymm0, ymm1  // mutates
    vpermq ymm0, ymm0, 0xd8  // unmutate vphaddw
    vpaddw ymm0, ymm0, ymm5  // + 8 for round
    vpsrlw ymm0, ymm0, 4  // /16 for average of 4 * 4
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

__declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8* dst_ptr,
                                            int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm3, xmmword ptr kShuf0
    movdqa xmm4, xmmword ptr kShuf1
    movdqa xmm5, xmmword ptr kShuf2

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm1
    palignr xmm1, xmm0, 8
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palignr may be better than movdqu.
__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34

  wloop:
    movdqu xmm0, [eax]  // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16]  // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    pop esi
    ret
  }
}
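
// Reference-only scalar sketch (unused), derived from kShuf01/kMadd01 and
// kRound34 above: the 3/4 horizontal filter uses weights (3,1), (2,2), (1,3)
// to produce 3 output pixels from each group of 4 input pixels.
static void ScaleRowDown34_Filter_Sketch(const uint8* s, uint8* d) {
  d[0] = (uint8)((s[0] * 3 + s[1] * 1 + 2) >> 2);
  d[1] = (uint8)((s[1] * 2 + s[2] * 2 + 2) >> 2);
  d[2] = (uint8)((s[2] * 1 + s[3] * 3 + 2) >> 2);
}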

// Note that movdqa+palignr may be better than movdqu.
__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34

  wloop:
    movdqu xmm0, [eax]  // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16]  // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    pop esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8* dst_ptr,
                                            int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm4, xmmword ptr kShuf38a
    movdqa xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
    movdqu xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea eax, [eax + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1

    movq qword ptr [edx], xmm0  // write 12 pixels
    movhlps xmm1, xmm0
    movd [edx + 8], xmm1
    lea edx, [edx + 12]
    sub ecx, 12
    jg xloop

    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShufAc
    movdqa xmm3, xmmword ptr kShufAc3
    movdqa xmm4, xmmword ptr kScaleAc33
    pxor xmm5, xmm5

  xloop:
    movdqu xmm0, [eax]  // sum up 3 rows into xmm0/1
    movdqu xmm6, [eax + esi]
    movhlps xmm1, xmm0
    movhlps xmm7, xmm6
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqu xmm6, [eax + esi * 2]
    lea eax, [eax + 16]
    movhlps xmm7, xmm6
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7

    movdqa xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    pshufb xmm6, xmm2

    movdqa xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    pshufb xmm7, xmm3
    paddusw xmm6, xmm7

    pmulhuw xmm6, xmm4  // divide by 9,9,6, 9,9,6
    packuswb xmm6, xmm6

    movd [edx], xmm6  // write 6 pixels
    psrlq xmm6, 16
    movd [edx + 2], xmm6
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop

    pop esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShufAb0
    movdqa xmm3, xmmword ptr kShufAb1
    movdqa xmm4, xmmword ptr kShufAb2
    movdqa xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu xmm0, [eax]  // average 2 rows into xmm0
    movdqu xmm1, [eax + esi]
    lea eax, [eax + 16]
    pavgb xmm0, xmm1

    movdqa xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb xmm1, xmm2
    movdqa xmm6, xmm0
    pshufb xmm6, xmm3
    paddusw xmm1, xmm6
    pshufb xmm0, xmm4
    paddusw xmm1, xmm0

    pmulhuw xmm1, xmm5  // divide by 3,3,2, 3,3,2
    packuswb xmm1, xmm1

    movd [edx], xmm1  // write 6 pixels
    psrlq xmm1, 16
    movd [edx + 2], xmm1
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop

    pop esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
                                        uint16* dst_ptr,
                                        int src_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    mov edx, [esp + 8]  // dst_ptr
    mov ecx, [esp + 12]  // src_width
    pxor xmm5, xmm5

    // sum rows
  xloop:
    movdqu xmm3, [eax]  // read 16 bytes
    lea eax, [eax + 16]
    movdqu xmm0, [edx]  // read 16 words from destination
    movdqu xmm1, [edx + 16]
    movdqa xmm2, xmm3
    punpcklbw xmm2, xmm5
    punpckhbw xmm3, xmm5
    paddusw xmm0, xmm2  // sum 16 words
    paddusw xmm1, xmm3
    movdqu [edx], xmm0  // write 16 words to destination
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 16
    jg xloop
    ret
  }
}
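
// Reference-only scalar sketch (unused) of the row accumulation above;
// dst_ptr is a row of 16-bit sums. Note the SIMD code saturates with
// paddusw, while plain C addition would wrap.
static void ScaleAddRow_C_Sketch(const uint8* src_ptr, uint16* dst_ptr,
                                 int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] = (uint16)(dst_ptr[x] + src_ptr[x]);
  }
}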

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
                                        uint16* dst_ptr,
                                        int src_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    mov edx, [esp + 8]  // dst_ptr
    mov ecx, [esp + 12]  // src_width
    vpxor ymm5, ymm5, ymm5

    // sum rows
  xloop:
    vmovdqu ymm3, [eax]  // read 32 bytes
    lea eax, [eax + 32]
    vpermq ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw ymm2, ymm3, ymm5
    vpunpckhbw ymm3, ymm3, ymm5
    vpaddusw ymm0, ymm2, [edx]  // sum 16 words
    vpaddusw ymm1, ymm3, [edx + 32]
    vmovdqu [edx], ymm0  // write 32 words to destination
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 32
    jg xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                         0x4040, 0x4040, 0x4040, 0x4040};

// Bilinear column filtering. SSSE3 version.
__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
                                             const uint8* src_ptr,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4]  // dst_ptr
    mov esi, [esp + 12 + 8]  // src_ptr
    mov ecx, [esp + 12 + 12]  // dst_width
    movd xmm2, [esp + 12 + 16]  // x
    movd xmm3, [esp + 12 + 20]  // dx
    mov eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pcmpeqb xmm7, xmm7  // generate 0x0001
    psrlw xmm7, 15
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9  // 7 bit fractions.
    movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5  // 0011
    punpcklwd xmm0, xmm4
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm1, xmm6  // 0..7f and 7f..0
    paddusb xmm1, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm1, xmm0  // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    paddw xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm1, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm1, xmm1  // 8 bits, 2 pixels.
    movd ebx, xmm1
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2  // 2 pixels
    jge xloop2

  xloop29:
    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9  // 7 bit fractions.
    pshufb xmm2, xmm5  // 0011
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm2, xmm6  // 0..7f and 7f..0
    paddusb xmm2, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm2, xmm0  // 16 bit
    paddw xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm2, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm2, xmm2  // 8 bits
    movd ebx, xmm2
    mov [edi], bl

  xloop99:

    pop edi
    pop esi
    pop ebx
    ret
  }
}
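
// Reference-only scalar sketch (unused) of the bilinear blend above.
// x is 16.16 fixed point. The SIMD code uses the top 7 bits of the
// fraction, biases pixels by -0x80 (kFsub80) so pmaddubsw cannot saturate,
// then adds 0x4040 (kFadd40: the 0x4000 un-bias plus 0x40 for rounding)
// before shifting right by 7.
static uint8 ScaleFilterCols_Blend_Sketch(const uint8* src_ptr, int x) {
  int xi = x >> 16;         // integer pixel position
  int f = (x >> 9) & 0x7f;  // 7 bit fraction
  return (uint8)(((128 - f) * src_ptr[xi] + f * src_ptr[xi + 1] + 64) >> 7);
}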

// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr,
                                         const uint8* src_ptr,
                                         int dst_width,
                                         int x,
                                         int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_ptr
    mov eax, [esp + 8]  // src_ptr
    mov ecx, [esp + 12]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                                              ptrdiff_t src_stride,
                                              uint8* dst_argb,
                                              int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd  // keep the odd ARGB pixels
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                                    ptrdiff_t src_stride,
                                                    uint8* dst_argb,
                                                    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                                 ptrdiff_t src_stride,
                                                 uint8* dst_argb,
                                                 int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
                                                 ptrdiff_t src_stride,
                                                 int src_stepx,
                                                 uint8* dst_argb,
                                                 int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    // src_stride ignored
    mov ebx, [esp + 8 + 12]  // src_stepx
    mov edx, [esp + 8 + 16]  // dst_argb
    mov ecx, [esp + 8 + 20]  // dst_width
    lea ebx, [ebx * 4]  // src_stepx in bytes (4 bytes per ARGB pixel)
    lea edi, [ebx + ebx * 2]  // src_stepx * 3 in bytes

  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                                    ptrdiff_t src_stride,
                                                    int src_stepx,
                                                    uint8* dst_argb,
                                                    int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]  // src_argb
    mov esi, [esp + 12 + 8]  // src_stride
    mov ebx, [esp + 12 + 12]  // src_stepx
    mov edx, [esp + 12 + 16]  // dst_argb
    mov ecx, [esp + 12 + 20]  // dst_width
    lea esi, [eax + esi]  // row1 pointer
    lea ebx, [ebx * 4]  // src_stepx in bytes (4 bytes per ARGB pixel)
    lea edi, [ebx + ebx * 2]  // src_stepx * 3 in bytes

  wloop:
    movq xmm0, qword ptr [eax]  // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]  // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
                                          const uint8* src_argb,
                                          int dst_width,
                                          int x,
                                          int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx

    pshufd xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11  // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0  // x3 x2 x1 x0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw eax, xmm2, 1  // get x0 integer.
    pextrw edx, xmm2, 3  // get x1 integer.

    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixel
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixel
    pextrw eax, xmm2, 5  // get x2 integer.
    pextrw edx, xmm2, 7  // get x3 integer.
    paddd xmm2, xmm3  // x += dx
    punpckldq xmm0, xmm1  // x0 x1

    movd xmm1, [esi + eax * 4]  // 1 source x2 pixel
    movd xmm4, [esi + edx * 4]  // 1 source x3 pixel
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4  // 4 pixels
    jge xloop4

  xloop49:
    test ecx, 2
    je xloop29

    // 2 pixel remainder
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixel
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixel
    pextrw eax, xmm2, 5  // get x2 integer.
    punpckldq xmm0, xmm1  // x0 x1

    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]

  xloop29:
    test ecx, 1
    je xloop99

    // 1 pixel remainder
    movd xmm0, [esi + eax * 4]  // 1 source pixel
    movd dword ptr [edi], xmm0
  xloop99:

    pop esi
    pop edi
    ret
  }
}
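
// Reference-only scalar sketch (unused) of the unfiltered ARGB column point
// sample above; x and dx are 16.16 fixed point, one uint32 per ARGB pixel.
static void ScaleARGBCols_C_Sketch(uint32* dst_argb, const uint32* src_argb,
                                   int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst_argb[j] = src_argb[x >> 16];  // whole ARGB pixel at integer x
    x += dx;
  }
}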

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
                                                 const uint8* src_argb,
                                                 int dst_width,
                                                 int x,
                                                 int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    movdqa xmm4, xmmword ptr kShuffleColARGB
    movdqa xmm5, xmmword ptr kShuffleFractions
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw xmm1, 9  // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb xmm1, xmm5  // 0000000011111111
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm1, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    psrlw xmm0, 7  // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2  // 2 pixels
    jge xloop2

  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9  // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5  // 00000000
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm2, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2  // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0  // argb 8 bits, 1 pixel.
    movd [edi], xmm0

  xloop99:

    pop edi
    pop esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
                                             const uint8* src_argb,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_argb
    mov eax, [esp + 8]  // src_argb
    mov ecx, [esp + 12]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0
    punpckhdq xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    mov ecx, [esp + 8]  // denom
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}
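
// Reference-only scalar sketches (unused) of the two fixed-point divides
// above, assuming the project's int64 typedef for a 64-bit intermediate.
static int FixedDiv_C_Sketch(int num, int div) {
  return (int)((((int64)num) << 16) / div);  // (num << 16) / div
}

// FixedDiv1 subtracts a small bias from the shifted numerator and divides
// by (div - 1), matching the sub/sbb sequence in the assembly.
static int FixedDiv1_C_Sketch(int num, int div) {
  return (int)(((((int64)num) << 16) - 0x00010001) / (div - 1));
}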
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif