/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
    defined(_MSC_VER) && !defined(__clang__)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 box filter: makes (sum + 2) >> 2 round to nearest.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

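// Note: the kScale* constants divide by multiplying with a 0.16 fixed point
// reciprocal and keeping the high word (pmulhuw), e.g. for a 3x3 box sum:
//   (sum * (65536 / 9)) >> 16  ~=  sum / 9
// The result can be one low due to truncation, which is acceptable here
// since the box sums fit comfortably in 16 bits.
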
// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked)
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrlw      xmm0, 8               // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}

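// A rough C equivalent of ScaleRowDown2_SSE2 (illustrative sketch only, not
// part of the build): the shift/pack sequence keeps the odd source pixels.
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[x * 2 + 1];
//   }
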
// Blends 32x1 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}

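// In the linear kernel above, the mask/shift pair splits each 16-bit lane
// into its even byte (pand) and odd byte (psrlw); pavgw then produces the
// rounded average (even + odd + 1) / 2, so each output pixel blends one
// horizontal pair of source pixels.
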
// Blends 32x2 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        esi
    ret
  }
}

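// The box kernel averages vertically first (pavgb on the two rows), then
// horizontally, so each output is roughly (a + b + c + d + 2) / 4 for the
// 2x2 source block; because rounding happens at each of the two stages, the
// result can differ slightly from a single rounded divide.
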
#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked)
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpsrlw     ymm0, ymm0, 8         // isolate odd pixels.
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

    vpcmpeqb   ymm4, ymm4, ymm4      // '1' constant, 8b
    vpsrlw     ymm4, ymm4, 15
    vpackuswb  ymm4, ymm4, ymm4
    vpxor      ymm5, ymm5, ymm5      // constant 0

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]

    vpmaddubsw ymm0, ymm0, ymm4      // average horizontally
    vpmaddubsw ymm1, ymm1, ymm4
    vpavgw     ymm0, ymm0, ymm5      // (x + 1) / 2
    vpavgw     ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    vzeroupper
    ret
  }
}

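// The AVX2 linear kernel replaces the SSE2 mask/shift split with a single
// vpmaddubsw against a vector of 1s, which sums each adjacent byte pair into
// a word; vpavgw against zero then computes (sum + 1) >> 1, giving the same
// rounded pair average with fewer instructions.
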
// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width

    vpcmpeqb   ymm4, ymm4, ymm4      // '1' constant, 8b
    vpsrlw     ymm4, ymm4, 15
    vpackuswb  ymm4, ymm4, ymm4
    vpxor      ymm5, ymm5, ymm5      // constant 0

  wloop:
    vmovdqu    ymm0, [eax]           // average rows
    vmovdqu    ymm1, [eax + 32]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    lea        eax, [eax + 64]

    vpmaddubsw ymm0, ymm0, ymm4      // average horizontally
    vpmaddubsw ymm1, ymm1, ymm4
    vpavgw     ymm0, ymm0, ymm5      // (x + 1) / 2
    vpavgw     ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked)
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

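// Roughly equivalent C for ScaleRowDown4_SSE2 (sketch only): the 0x00ff0000
// mask keeps byte 2 of each 4-byte group, so the kernel point-samples
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[x * 4 + 2];
//   }
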
// Blends 32x4 rectangle to 8x1.
__declspec(naked)
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
    mov        esi, [esp + 8 + 8]    // src_stride
    mov        edx, [esp + 8 + 12]   // dst_ptr
    mov        ecx, [esp + 8 + 16]   // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqu     xmm0, [eax]           // average rows
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    movdqu     xmm2, [eax + esi * 2]
    movdqu     xmm3, [eax + esi * 2 + 16]
    movdqu     xmm4, [eax + edi]
    movdqu     xmm5, [eax + edi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}

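// The 4x4 box is built from cascaded pairwise averages: rows 0+1 and rows
// 2+3 are averaged with pavgb, those results are averaged again, and two
// column-average passes reduce 32 pixels to 8. Each pavg rounds, so the
// result approximates, but is not bit-exact with, (sum of 16 + 8) / 16.
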
#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked)
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff0000
    vpsrld     ymm5, ymm5, 24
    vpslld     ymm5, ymm5, 16

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpand      ymm0, ymm0, ymm5
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vpsrlw     ymm0, ymm0, 8
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vmovdqu    [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
    mov        esi, [esp + 8 + 8]    // src_stride
    mov        edx, [esp + 8 + 12]   // dst_ptr
    mov        ecx, [esp + 8 + 16]   // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb   ymm7, ymm7, ymm7      // generate mask 0x00ff00ff
    vpsrlw     ymm7, ymm7, 8

  wloop:
    vmovdqu    ymm0, [eax]           // average rows
    vmovdqu    ymm1, [eax + 32]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    vmovdqu    ymm2, [eax + esi * 2]
    vmovdqu    ymm3, [eax + esi * 2 + 32]
    vpavgb     ymm2, ymm2, [eax + edi]
    vpavgb     ymm3, ymm3, [eax + edi + 32]
    lea        eax, [eax + 64]
    vpavgb     ymm0, ymm0, ymm2
    vpavgb     ymm1, ymm1, ymm3

    vpand      ymm2, ymm0, ymm7      // average columns (64 to 32 pixels)
    vpand      ymm3, ymm1, ymm7
    vpsrlw     ymm0, ymm0, 8
    vpsrlw     ymm1, ymm1, 8
    vpavgw     ymm0, ymm0, ymm2
    vpavgw     ymm1, ymm1, ymm3
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vpand      ymm2, ymm0, ymm7      // average columns (32 to 16 pixels)
    vpsrlw     ymm0, ymm0, 8
    vpavgw     ymm0, ymm0, ymm2
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vmovdqu    [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

__declspec(naked)
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm3, kShuf0
    movdqa     xmm4, kShuf1
    movdqa     xmm5, kShuf2

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}

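// Per the kShuf0/1/2 tables, the point-sampled 3/4 kernel keeps source
// pixels 0, 1 and 3 of every group of 4. A C sketch of the selection:
//   for (int x = 0; x < dst_width; x += 3) {
//     dst_ptr[x + 0] = src_ptr[0];
//     dst_ptr[x + 1] = src_ptr[1];
//     dst_ptr[x + 2] = src_ptr[3];
//     src_ptr += 4;
//   }
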
// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

  wloop:
    movdqu     xmm0, [eax]           // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]      // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

  wloop:
    movdqu     xmm0, [eax]           // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]      // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

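// The _1_Box variant weights the two source rows equally (one pavgb), while
// the _0_Box variant's double pavgb weights them roughly 3:1 in favor of
// row 0: xmm1 = avg(row1, row0), then xmm0 = avg(row0, xmm1)
// ~= (3 * row0 + row1) / 4. The two variants cover the two vertical phases
// of the 3/4 scale.
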
// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm4, kShuf38a
    movdqa     xmm5, kShuf38b

  xloop:
    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

    movq       qword ptr [edx], xmm0 // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    sub        ecx, 12
    jg         xloop

    ret
  }
}

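// Per kShuf38a/b, the 3/8 point sampler keeps bytes 0, 3, 6, 8, 11 and 14 of
// every 16 source pixels (6 outputs per 16 inputs = 3/8); the two shuffles
// place their results in disjoint lanes, so the paddusb merely merges them.
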
// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAc
    movdqa     xmm3, kShufAc3
    movdqa     xmm4, kScaleAc33
    pxor       xmm5, xmm5

  xloop:
    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
    movdqu     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqu     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    movd       [edx], xmm6           // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAb0
    movdqa     xmm3, kShufAb1
    movdqa     xmm4, kShufAb2
    movdqa     xmm5, kScaleAb2

  xloop:
    movdqu     xmm0, [eax]           // average 2 rows into xmm0
    movdqu     xmm1, [eax + esi]
    lea        eax, [eax + 16]
    pavgb      xmm0, xmm1

    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    movd       [edx], xmm1           // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked)
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
    mov        edx, [esp + 8]        // dst_ptr
    mov        ecx, [esp + 12]       // src_width
    pxor       xmm5, xmm5

    // sum rows
  xloop:
    movdqu     xmm3, [eax]           // read 16 bytes
    lea        eax, [eax + 16]
    movdqu     xmm0, [edx]           // read 16 words from destination
    movdqu     xmm1, [edx + 16]
    movdqa     xmm2, xmm3
    punpcklbw  xmm2, xmm5
    punpckhbw  xmm3, xmm5
    paddusw    xmm0, xmm2            // sum 16 words
    paddusw    xmm1, xmm3
    movdqu     [edx], xmm0           // write 16 words to destination
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
    ret
  }
}

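// A rough C equivalent of ScaleAddRow_SSE2 (sketch only); paddusw saturates,
// so the accumulator clamps at 65535 rather than wrapping:
//   for (int x = 0; x < src_width; ++x) {
//     dst_ptr[x] += src_ptr[x];  // with unsigned saturation
//   }
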
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked)
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
    mov        edx, [esp + 8]        // dst_ptr
    mov        ecx, [esp + 12]       // src_width
    vpxor      ymm5, ymm5, ymm5

    // sum rows
  xloop:
    vmovdqu    ymm3, [eax]           // read 32 bytes
    lea        eax, [eax + 32]
    vpermq     ymm3, ymm3, 0xd8      // unmutate for vpunpck
    vpunpcklbw ymm2, ymm3, ymm5
    vpunpckhbw ymm3, ymm3, ymm5
    vpaddusw   ymm0, ymm2, [edx]     // sum 16 words
    vpaddusw   ymm1, ymm3, [edx + 32]
    vmovdqu    [edx], ymm0           // write 32 words to destination
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    sub        ecx, 32
    jg         xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Bilinear column filtering. SSSE3 version.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]   // dst_ptr
    mov        esi, [esp + 12 + 8]   // src_ptr
    mov        ecx, [esp + 12 + 12]  // dst_width
    movd       xmm2, [esp + 12 + 16] // x
    movd       xmm3, [esp + 12 + 20] // dx
    mov        eax, 0x04040000       // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6            // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1          // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2            // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0            // x0 x1
    punpckldq  xmm3, xmm3            // dx dx
    paddd      xmm3, xmm3            // dx * 2, dx * 2
    pextrw     edx, xmm2, 3          // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2            // x0, x1 fractions.
    paddd      xmm2, xmm3            // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9               // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5            // 0011
    punpcklwd  xmm0, xmm4
    pxor       xmm1, xmm6            // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1            // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1          // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3          // get x1 integer. next iteration.
    psrlw      xmm0, 7               // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0            // 8 bits, 2 pixels.
    movd       ebx, xmm0
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2                // 2 pixels
    jge        xloop2

  xloop29:
    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9               // 7 bit fractions.
    pshufb     xmm2, xmm5            // 0011
    pxor       xmm2, xmm6            // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2            // 16 bit
    psrlw      xmm0, 7               // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0            // 8 bits
    movd       ebx, xmm0
    mov        [edi], bl

  xloop99:
    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

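// The column walk uses 16.16 fixed point: the integer part (x >> 16) indexes
// the source pair and the top 7 fraction bits become the blend weights. A
// rough scalar sketch of one output pixel (illustrative only):
//   int xi = x >> 16;
//   int f = (x >> 9) & 0x7f;  // 7 bit fraction
//   dst = (src[xi] * (f ^ 0x7f) + src[xi + 1] * f) >> 7;
//   x += dx;
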
// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked)
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]        // dst_ptr
    mov        eax, [esp + 8]        // src_ptr
    mov        ecx, [esp + 12]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0
    punpckhbw  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
__declspec(naked)
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    shufps     xmm0, xmm1, 0xdd
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked)
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked)
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_argb
    mov        ecx, [esp + 4 + 16]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked)
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]    // src_argb
                                     // src_stride ignored
    mov        ebx, [esp + 8 + 12]   // src_stepx
    mov        edx, [esp + 8 + 16]   // dst_argb
    mov        ecx, [esp + 8 + 20]   // dst_width
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax, [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked)
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]   // src_argb
    mov        esi, [esp + 12 + 8]   // src_stride
    mov        ebx, [esp + 12 + 12]  // src_stepx
    mov        edx, [esp + 12 + 16]  // dst_argb
    mov        ecx, [esp + 12 + 20]  // dst_width
    lea        esi, [eax + esi]      // row1 pointer
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movq       xmm0, qword ptr [eax] // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax, [eax + ebx * 4]
    movq       xmm2, qword ptr [esi] // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi, [esi + ebx * 4]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11      // dx 0 dx 0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3            // 0, 0, 0, dx * 2
    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0            // x3 x2 x1 x0
    paddd      xmm3, xmm3            // 0, 0, 0, dx * 4
    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1          // get x0 integer.
    pextrw     edx, xmm2, 3          // get x1 integer.

    cmp        ecx, 0
    jle        xloop99
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
  xloop4:
    movd       xmm0, [esi + eax * 4] // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4] // 1 source x1 pixels
    pextrw     eax, xmm2, 5          // get x2 integer.
    pextrw     edx, xmm2, 7          // get x3 integer.
    paddd      xmm2, xmm3            // x += dx
    punpckldq  xmm0, xmm1            // x0 x1

    movd       xmm1, [esi + eax * 4] // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4] // 1 source x3 pixels
    pextrw     eax, xmm2, 1          // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3          // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4            // x2 x3
    punpcklqdq xmm0, xmm1            // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4                // 4 pixels
    jge        xloop4

  xloop49:
    test       ecx, 2
    je         xloop29

    // 2 Pixels.
    movd       xmm0, [esi + eax * 4] // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4] // 1 source x1 pixels
    pextrw     eax, xmm2, 5          // get x2 integer.
    punpckldq  xmm0, xmm1            // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

  xloop29:
    test       ecx, 1
    je         xloop99

    // 1 Pixel.
    movd       xmm0, [esi + eax * 4] // 1 source x2 pixels
    movd       dword ptr [edi], xmm0

  xloop99:
    pop        esi
    pop        edi
    ret
  }
}

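// ScaleARGBCols_SSE2 is nearest-neighbor: it keeps four 16.16 x positions in
// xmm2 and gathers one whole ARGB dword per position. A scalar sketch:
//   for (int i = 0; i < dst_width; ++i) {
//     ((uint32*)dst_argb)[i] = ((const uint32*)src_argb)[x >> 16];
//     x += dx;
//   }
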
// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, kShuffleColARGB
    movdqa     xmm5, kShuffleFractions
    pcmpeqb    xmm6, xmm6            // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1          // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2            // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0            // x0 x1
    punpckldq  xmm3, xmm3            // dx dx
    paddd      xmm3, xmm3            // dx * 2, dx * 2
    pextrw     edx, xmm2, 3          // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2            // x0, x1 fractions.
    paddd      xmm2, xmm3            // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9               // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5            // 0000000011111111
    pshufb     xmm0, xmm4            // arrange pixels into pairs
    pxor       xmm1, xmm6            // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1            // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1          // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3          // get x1 integer. next iteration.
    psrlw      xmm0, 7               // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0            // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2                // 2 pixels
    jge        xloop2

  xloop29:
    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9               // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5            // 00000000
    pshufb     xmm0, xmm4            // arrange pixels into pairs
    pxor       xmm2, xmm6            // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2            // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0            // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked)
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]        // dst_argb
    mov        eax, [esp + 8]        // src_argb
    mov        ecx, [esp + 12]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0
    punpckhdq  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked)
int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]        // num
    cdq                              // extend num to 64 bits
    shld       edx, eax, 16          // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}

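// The shld/shl pair builds the 64-bit value num << 16 in edx:eax, so the
// idiv computes the same thing as this C expression (sketch):
//   return (int)((((int64)num) << 16) / div);
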
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked)
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]        // num
    mov        ecx, [esp + 8]        // denom
    cdq                              // extend num to 64 bits
    shld       edx, eax, 16          // 32.16
    shl        eax, 16
    sub        eax, 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}
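
// Equivalent C for FixedDiv1_X86 (sketch): the sub/sbb pair subtracts
// 0x00010001 through the 64-bit numerator before the divide, i.e.
//   return (int)((((int64)num << 16) - 0x00010001) / (div - 1));
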
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif