/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
                              8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
                              10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant for the 3/4 filters, added before the >> 2.
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
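
// Each kMadd pair weights two adjacent shuffled source bytes as (3,1),
// (2,2) or (1,3); pmaddubsw forms the weighted sum and kRound34 adds 2
// before the >> 2, e.g. dst = (3 * s0 + 1 * s1 + 2) >> 2 for the first tap.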

static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
                               128, 128, 128, 128, 128, 128, 128, 128};

static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
                               6, 8, 11, 14, 128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
                              128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
                               6, 7, 12, 13, 128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                                  65536 / 9, 65536 / 6, 0, 0};

// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
                               11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
                               12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
                               13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                                 65536 / 3, 65536 / 2, 0, 0};
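
// pmulhuw keeps the high 16 bits of the 16x16 -> 32 bit product, so
// multiplying a box sum by 65536 / 9 (= 7281) approximates sum / 9.  The
// quotient can come out one low at exact multiples (9 * 7281 = 65529, just
// under 65536), which is acceptable for pixel data.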

// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8_t* dst_ptr,
                                           int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrlw      xmm0, 8  // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}
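
// For reference, a scalar sketch of the computation above (illustrative
// only, excluded from the build; the name is not part of the libyuv API):
#if 0
static void ScaleRowDown2_Sketch(const uint8_t* src_ptr,
                                 uint8_t* dst_ptr,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];  // keep odd pixels, as psrlw/packuswb do.
  }
}
#endif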

// Blends 32x1 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                                                 ptrdiff_t src_stride,
                                                 uint8_t* dst_ptr,
                                                 int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5  // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pavgw      xmm0, xmm5  // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}
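
// pmaddubsw against the 0x0101 weights adds each horizontal byte pair into a
// 16 bit word, and pavgw against zero then computes (sum + 1) >> 1, i.e. a
// correctly rounded two pixel average with no risk of byte overflow.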

// Blends 32x2 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_ptr,
                                              int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width

    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5  // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // vertical add
    paddw      xmm1, xmm3
    psrlw      xmm0, 1
    psrlw      xmm1, 1
    pavgw      xmm0, xmm5  // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        esi
    ret
  }
}
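
// A scalar sketch of the 2x2 box filter above (illustrative only, excluded
// from the build; the name is not part of the libyuv API):
#if 0
static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst_ptr,
                                    int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    // (sum + 2) >> 2, computed above as pavgw((sum >> 1), 0).
    dst_ptr[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}
#endif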

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpsrlw     ymm0, ymm0, 8  // isolate odd pixels.
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    vzeroupper
    ret
  }
}
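
// AVX2 note: vpackuswb packs within each 128 bit lane, so the packed bytes
// come out lane interleaved; vpermq with 0xd8 (qword order 0, 2, 1, 3)
// restores linear order, which is what the "unmutate" comments refer to.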

// Blends 64x1 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
                                                ptrdiff_t src_stride,
                                                uint8_t* dst_ptr,
                                                int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

    vpcmpeqb   ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw     ymm4, ymm4, 15
    vpackuswb  ymm4, ymm4, ymm4
    vpxor      ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpavgw     ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw     ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8_t* dst_ptr,
                                             int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width

    vpcmpeqb   ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw     ymm4, ymm4, 15
    vpackuswb  ymm4, ymm4, ymm4
    vpxor      ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + esi]
    vmovdqu    ymm3, [eax + esi + 32]
    lea        eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw     ymm0, ymm0, ymm2  // vertical add
    vpaddw     ymm1, ymm1, ymm3
    vpsrlw     ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw     ymm1, ymm1, 1
    vpavgw     ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw     ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8_t* dst_ptr,
                                           int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_ptr,
                                              int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_ptr
    mov        esi, [esp + 8 + 8]  // src_stride
    mov        edx, [esp + 8 + 12]  // dst_ptr
    mov        ecx, [esp + 8 + 16]  // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    movdqa     xmm5, xmm4
    packuswb   xmm4, xmm4
    psllw      xmm5, 3  // constant 0x0008

  wloop:
    movdqu     xmm0, [eax]  // average rows
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // vertical add rows 0, 1
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + esi * 2]
    movdqu     xmm3, [eax + esi * 2 + 16]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // add row 2
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + edi]
    movdqu     xmm3, [eax + edi + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // add row 3
    paddw      xmm1, xmm3
    phaddw     xmm0, xmm1
    paddw      xmm0, xmm5  // + 8 for round
    psrlw      xmm0, 4  // /16 for average of 4 * 4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}
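
// The 4x4 box filter sums a 4x4 block per output pixel: horizontal pairs via
// pmaddubsw, rows via paddw, pairs of pairs via phaddw, then computes
// (sum + 8) >> 4, a rounded average of 16 source pixels.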

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff0000
    vpsrld     ymm5, ymm5, 24
    vpslld     ymm5, ymm5, 16

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpand      ymm0, ymm0, ymm5
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vpsrlw     ymm0, ymm0, 8
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu    [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8_t* dst_ptr,
                                             int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_ptr
    mov        esi, [esp + 8 + 8]  // src_stride
    mov        edx, [esp + 8 + 12]  // dst_ptr
    mov        ecx, [esp + 8 + 16]  // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb   ymm4, ymm4, ymm4  // constant 0x0101
    vpsrlw     ymm4, ymm4, 15
    vpsllw     ymm5, ymm4, 3  // constant 0x0008
    vpackuswb  ymm4, ymm4, ymm4

  wloop:
    vmovdqu    ymm0, [eax]  // average rows
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + esi]
    vmovdqu    ymm3, [eax + esi + 32]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw     ymm0, ymm0, ymm2  // vertical add rows 0, 1
    vpaddw     ymm1, ymm1, ymm3
    vmovdqu    ymm2, [eax + esi * 2]
    vmovdqu    ymm3, [eax + esi * 2 + 32]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw     ymm0, ymm0, ymm2  // add row 2
    vpaddw     ymm1, ymm1, ymm3
    vmovdqu    ymm2, [eax + edi]
    vmovdqu    ymm3, [eax + edi + 32]
    lea        eax, [eax + 64]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw     ymm0, ymm0, ymm2  // add row 3
    vpaddw     ymm1, ymm1, ymm3
    vphaddw    ymm0, ymm0, ymm1  // mutates
    vpermq     ymm0, ymm0, 0xd8  // unmutate vphaddw
    vpaddw     ymm0, ymm0, ymm5  // + 8 for round
    vpsrlw     ymm0, ymm0, 4  // /16 for average of 4 * 4
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu    [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst_ptr,
                                            int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    movdqa     xmm3, xmmword ptr kShuf0
    movdqa     xmm4, xmmword ptr kShuf1
    movdqa     xmm5, xmmword ptr kShuf2

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palignr may be better than movdqu.
__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]  // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]  // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]  // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

// Note that movdqa+palignr may be better than movdqu.
__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]  // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]  // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]  // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}
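
// The two Box variants differ in vertical weighting: the _1_ version blends
// the rows evenly with one pavgb, (row0 + row1 + 1) / 2, while the _0_
// version applies pavgb twice to weight the rows roughly 3:1 toward row0.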

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst_ptr,
                                            int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    movdqa     xmm4, xmmword ptr kShuf38a
    movdqa     xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu     xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
    movdqu     xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    sub        ecx, 12
    jg         xloop

    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShufAc
    movdqa     xmm3, xmmword ptr kShufAc3
    movdqa     xmm4, xmmword ptr kScaleAc33
    pxor       xmm5, xmm5

  xloop:
    movdqu     xmm0, [eax]  // sum up 3 rows into xmm0/1
    movdqu     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqu     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4  // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    movd       [edx], xmm6  // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}
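
// kScaleAc33 divides the first two sums in each group by 9 (3x3 boxes) and
// the third by 6: 16 source pixels map to 6 outputs, so the last output of
// each group of three only covers a 2x3 box.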

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShufAb0
    movdqa     xmm3, xmmword ptr kShufAb1
    movdqa     xmm4, xmmword ptr kShufAb2
    movdqa     xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu     xmm0, [eax]  // average 2 rows into xmm0
    movdqu     xmm1, [eax + esi]
    lea        eax, [eax + 16]
    pavgb      xmm0, xmm1

    movdqa     xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5  // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    movd       [edx], xmm1  // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                                        uint16_t* dst_ptr,
                                        int src_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    mov        edx, [esp + 8]  // dst_ptr
    mov        ecx, [esp + 12]  // src_width
    pxor       xmm5, xmm5

    // sum rows
  xloop:
    movdqu     xmm3, [eax]  // read 16 bytes
    lea        eax, [eax + 16]
    movdqu     xmm0, [edx]  // read 16 words from destination
    movdqu     xmm1, [edx + 16]
    movdqa     xmm2, xmm3
    punpcklbw  xmm2, xmm5
    punpckhbw  xmm3, xmm5
    paddusw    xmm0, xmm2  // sum 16 words
    paddusw    xmm1, xmm3
    movdqu     [edx], xmm0  // write 16 words to destination
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
    ret
  }
}
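
// ScaleAddRow is the accumulate step of the general box filter: each call
// widens a source row to 16 bits and adds it into dst_ptr, and the caller
// divides the column sums once every row of the box has been added.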

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                                        uint16_t* dst_ptr,
                                        int src_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    mov        edx, [esp + 8]  // dst_ptr
    mov        ecx, [esp + 12]  // src_width
    vpxor      ymm5, ymm5, ymm5

    // sum rows
  xloop:
    vmovdqu    ymm3, [eax]  // read 32 bytes
    lea        eax, [eax + 32]
    vpermq     ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw ymm2, ymm3, ymm5
    vpunpckhbw ymm3, ymm3, ymm5
    vpaddusw   ymm0, ymm2, [edx]  // sum 16 words
    vpaddusw   ymm1, ymm3, [edx + 32]
    vmovdqu    [edx], ymm0  // write 32 words to destination
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    sub        ecx, 32
    jg         xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                               0x4040, 0x4040, 0x4040, 0x4040};

// Bilinear column filtering. SSSE3 version.
__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
                                             const uint8_t* src_ptr,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]  // dst_ptr
    mov        esi, [esp + 12 + 8]  // src_ptr
    mov        ecx, [esp + 12 + 12]  // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pcmpeqb    xmm7, xmm7  // generate 0x0001
    psrlw      xmm7, 15
    pextrw     eax, xmm2, 1  // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2  // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0  // x0 x1
    punpckldq  xmm3, xmm3  // dx dx
    paddd      xmm3, xmm3  // dx * 2, dx * 2
    pextrw     edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2  // x0, x1 fractions.
    paddd      xmm2, xmm3  // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9  // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5  // 0011
    punpcklwd  xmm0, xmm4
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm1, xmm6  // 0..7f and 7f..0
    paddusb    xmm1, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw  xmm1, xmm0  // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm1, 7  // 8.7 fixed point to low 8 bits.
    packuswb   xmm1, xmm1  // 8 bits, 2 pixels.
    movd       ebx, xmm1
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2  // 2 pixels
    jge        xloop2

  xloop29:
    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9  // 7 bit fractions.
    pshufb     xmm2, xmm5  // 0011
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm2, xmm6  // 0..7f and 7f..0
    paddusb    xmm2, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw  xmm2, xmm0  // 16 bit
    paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm2, 7  // 8.7 fixed point to low 8 bits.
    packuswb   xmm2, xmm2  // 8 bits
    movd       ebx, xmm2
    mov        [edi], bl

  xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
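
// A scalar sketch of the bilinear filter above (illustrative only, excluded
// from the build; the name is not part of the libyuv API).  x and dx are
// 16.16 fixed point; the asm keeps 7 fraction bits and biases pixels signed
// (kFsub80) then back (kFadd40) to avoid pmaddubsw saturation.
#if 0
static void ScaleFilterCols_Sketch(uint8_t* dst_ptr,
                                   const uint8_t* src_ptr,
                                   int dst_width,
                                   int x,
                                   int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer source position.
    int f = (x >> 9) & 0x7f;  // 7 bit fraction, as in the asm.
    dst_ptr[j] = (uint8_t)(
        (src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7);
    x += dx;
  }
}
#endif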

// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
                                         const uint8_t* src_ptr,
                                         int dst_width,
                                         int x,
                                         int dx) {
  __asm {
    mov        edx, [esp + 4]  // dst_ptr
    mov        eax, [esp + 8]  // src_ptr
    mov        ecx, [esp + 12]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0
    punpckhbw  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_argb,
                                              int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_argb
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    shufps     xmm0, xmm1, 0xdd
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
                                                    ptrdiff_t src_stride,
                                                    uint8_t* dst_argb,
                                                    int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_argb
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                                                 ptrdiff_t src_stride,
                                                 uint8_t* dst_argb,
                                                 int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2  // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
                                                 ptrdiff_t src_stride,
                                                 int src_stepx,
                                                 uint8_t* dst_argb,
                                                 int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]  // src_argb
    // src_stride ignored
    mov        ebx, [esp + 8 + 12]  // src_stepx
    mov        edx, [esp + 8 + 16]  // dst_argb
    mov        ecx, [esp + 8 + 20]  // dst_width
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax, [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
                                                    ptrdiff_t src_stride,
                                                    int src_stepx,
                                                    uint8_t* dst_argb,
                                                    int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]  // src_argb
    mov        esi, [esp + 12 + 8]  // src_stride
    mov        ebx, [esp + 12 + 12]  // src_stepx
    mov        edx, [esp + 12 + 16]  // dst_argb
    mov        ecx, [esp + 12 + 20]  // dst_width
    lea        esi, [eax + esi]  // row1 pointer
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax, [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi, [esi + ebx * 4]
    pavgb      xmm0, xmm2  // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
                                          const uint8_t* src_argb,
                                          int dst_width,
                                          int x,
                                          int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]  // dst_argb
    mov        esi, [esp + 8 + 8]  // src_argb
    mov        ecx, [esp + 8 + 12]  // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11  // dx 0 dx 0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3  // 0, 0, 0, dx * 2
    pshufd     xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0  // x3 x2 x1 x0
    paddd      xmm3, xmm3  // 0, 0, 0, dx * 4
    pshufd     xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1  // get x0 integer.
    pextrw     edx, xmm2, 3  // get x1 integer.

    cmp        ecx, 0
    jle        xloop99
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
  xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5  // get x2 integer.
    pextrw     edx, xmm2, 7  // get x3 integer.
    paddd      xmm2, xmm3  // x += dx
    punpckldq  xmm0, xmm1  // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4  // 4 pixels
    jge        xloop4

  xloop49:
    test       ecx, 2
    je         xloop29

    // 2 pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5  // get x2 integer.
    punpckldq  xmm0, xmm1  // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

  xloop29:
    test       ecx, 1
    je         xloop99

    // 1 pixel.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
  xloop99:

    pop        esi
    pop        edi
    ret
  }
}
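
// ScaleARGBCols_SSE2 is a nearest sampler: x advances in 16.16 fixed point
// and pextrw pulls the integer part of each position out of the high word of
// its dword lane, with no blending of neighboring pixels.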

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static const uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static const uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
                                                 const uint8_t* src_argb,
                                                 int dst_width,
                                                 int x,
                                                 int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]  // dst_argb
    mov        esi, [esp + 8 + 8]  // src_argb
    mov        ecx, [esp + 8 + 12]  // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, xmmword ptr kShuffleColARGB
    movdqa     xmm5, xmmword ptr kShuffleFractions
    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1  // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2  // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0  // x0 x1
    punpckldq  xmm3, xmm3  // dx dx
    paddd      xmm3, xmm3  // dx * 2, dx * 2
    pextrw     edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2  // x0, x1 fractions.
    paddd      xmm2, xmm3  // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9  // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5  // 0000000011111111
    pshufb     xmm0, xmm4  // arrange pixels into pairs
    pxor       xmm1, xmm6  // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    psrlw      xmm0, 7  // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2  // 2 pixels
    jge        xloop2

  xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9  // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5  // 00000000
    pshufb     xmm0, xmm4  // arrange pixels into pairs
    pxor       xmm2, xmm6  // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2  // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0  // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

  xloop99:

    pop        edi
    pop        esi
    ret
  }
}
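
// This works like ScaleFilterCols_SSSE3 but blends whole ARGB pixels:
// kShuffleColARGB interleaves the two source pixels channel by channel so a
// single pmaddubsw produces all four weighted B, G, R, A sums at once.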

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
                                             const uint8_t* src_argb,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    mov        edx, [esp + 4]  // dst_argb
    mov        eax, [esp + 8]  // src_argb
    mov        ecx, [esp + 12]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0
    punpckhdq  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld       edx, eax, 16  // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}
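
// FixedDiv_X86 above computes (num << 16) / div with a 64 bit dividend in
// edx:eax; e.g. FixedDiv_X86(1, 3) = 65536 / 3 = 21845, ~0.3333 in 16.16.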

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]  // num
    mov        ecx, [esp + 8]  // denom
    cdq  // extend num to 64 bits
    shld       edx, eax, 16  // 32.16
    shl        eax, 16
    sub        eax, 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}
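
// FixedDiv1_X86 above computes ((num << 16) - 0x00010001) / (div - 1), the
// end anchored variant that steps from the first sample exactly to the last.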
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif