1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC x86 and x64.
19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
20
21 // Offsets for source bytes 0 to 9
22 static uvec8 kShuf0 =
23 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
24
25 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
26 static uvec8 kShuf1 =
27 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
28
29 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
30 static uvec8 kShuf2 =
31 { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
32
33 // Offsets for source bytes 0 to 10
34 static uvec8 kShuf01 =
35 { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
36
37 // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
38 static uvec8 kShuf11 =
39 { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
40
41 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
42 static uvec8 kShuf21 =
43 { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
44
45 // Coefficients for source bytes 0 to 10
46 static uvec8 kMadd01 =
47 { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
48
49 // Coefficients for source bytes 10 to 21
50 static uvec8 kMadd11 =
51 { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
52
53 // Coefficients for source bytes 21 to 31
54 static uvec8 kMadd21 =
55 { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
56
57 // Rounding constant for the 3/4 box filters, added before the >> 2.
58 static vec16 kRound34 =
59 { 2, 2, 2, 2, 2, 2, 2, 2 };
60
61 static uvec8 kShuf38a =
62 { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
63
64 static uvec8 kShuf38b =
65 { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
66
67 // Arrange words 0,3,6 into 0,1,2
68 static uvec8 kShufAc =
69 { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
70
71 // Arrange words 0,3,6 into 3,4,5
72 static uvec8 kShufAc3 =
73 { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
74
75 // Scaling values for boxes of 3x3 and 2x3
76 static uvec16 kScaleAc33 =
77 { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
78
79 // Arrange first value for pixels 0,1,2,3,4,5
80 static uvec8 kShufAb0 =
81 { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
82
83 // Arrange second value for pixels 0,1,2,3,4,5
84 static uvec8 kShufAb1 =
85 { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
86
87 // Arrange third value for pixels 0,1,2,3,4,5
88 static uvec8 kShufAb2 =
89 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
90
91 // Scaling values for boxes of 3x2 and 2x2
92 static uvec16 kScaleAb2 =
93 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
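
// The box filters below add up 2, 3, 6 or 9 source pixels and then scale the
// sum with pmulhuw against one of the 65536/n entries above, keeping the high
// 16 bits of the product. A minimal scalar sketch of that fixed-point divide
// (illustrative only; the helper name is not a LibYuv function):
static inline uint8 ScaleSumToPixel_Sketch(uint16 sum, uint16 recip) {
  // recip is 65536 / n, so the high half of the 32-bit product is ~sum / n,
  // which is exactly what pmulhuw computes per lane.
  return (uint8)(((uint32)(sum) * recip) >> 16);
}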
94
95 // GCC versions of row functions are verbatim conversions from Visual C.
96 // Generated using gcc disassembly on Visual C object file:
97 // objdump -D yuvscaler.obj >yuvscaler.txt
98
99 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
100 uint8* dst_ptr, int dst_width) {
101 asm volatile (
102 LABELALIGN
103 "1: \n"
104 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
105 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
106 "lea " MEMLEA(0x20,0) ",%0 \n"
107 "psrlw $0x8,%%xmm0 \n"
108 "psrlw $0x8,%%xmm1 \n"
109 "packuswb %%xmm1,%%xmm0 \n"
110 "movdqa %%xmm0," MEMACCESS(1) " \n"
111 "lea " MEMLEA(0x10,1) ",%1 \n"
112 "sub $0x10,%2 \n"
113 "jg 1b \n"
114 : "+r"(src_ptr), // %0
115 "+r"(dst_ptr), // %1
116 "+r"(dst_width) // %2
117 :
118 : "memory", "cc"
119 #if defined(__SSE2__)
120 , "xmm0", "xmm1"
121 #endif
122 );
123 }
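
// Scalar sketch of the loop above (illustrative only; the helper name is not
// a LibYuv function). psrlw $0x8 keeps the odd byte of each 16-bit pair and
// packuswb repacks them, so the output is every second source pixel.
static inline void ScaleRowDown2_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                        int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];
  }
}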
124
125 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
126 uint8* dst_ptr, int dst_width) {
127 asm volatile (
128 "pcmpeqb %%xmm5,%%xmm5 \n"
129 "psrlw $0x8,%%xmm5 \n"
130
131 LABELALIGN
132 "1: \n"
133 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
134 "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
135 "lea " MEMLEA(0x20,0) ",%0 \n"
136 "movdqa %%xmm0,%%xmm2 \n"
137 "psrlw $0x8,%%xmm0 \n"
138 "movdqa %%xmm1,%%xmm3 \n"
139 "psrlw $0x8,%%xmm1 \n"
140 "pand %%xmm5,%%xmm2 \n"
141 "pand %%xmm5,%%xmm3 \n"
142 "pavgw %%xmm2,%%xmm0 \n"
143 "pavgw %%xmm3,%%xmm1 \n"
144 "packuswb %%xmm1,%%xmm0 \n"
145 "movdqa %%xmm0," MEMACCESS(1) " \n"
146 "lea " MEMLEA(0x10,1) ",%1 \n"
147 "sub $0x10,%2 \n"
148 "jg 1b \n"
149 : "+r"(src_ptr), // %0
150 "+r"(dst_ptr), // %1
151 "+r"(dst_width) // %2
152 :
153 : "memory", "cc"
154 #if defined(__SSE2__)
155 , "xmm0", "xmm1", "xmm5"
156 #endif
157 );
158 }
159
160 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
161 uint8* dst_ptr, int dst_width) {
162 asm volatile (
163 "pcmpeqb %%xmm5,%%xmm5 \n"
164 "psrlw $0x8,%%xmm5 \n"
165
166 LABELALIGN
167 "1: \n"
168 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
169 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
170 MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
171 BUNDLEALIGN
172 MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
173 "lea " MEMLEA(0x20,0) ",%0 \n"
174 "pavgb %%xmm2,%%xmm0 \n"
175 "pavgb %%xmm3,%%xmm1 \n"
176 "movdqa %%xmm0,%%xmm2 \n"
177 "psrlw $0x8,%%xmm0 \n"
178 "movdqa %%xmm1,%%xmm3 \n"
179 "psrlw $0x8,%%xmm1 \n"
180 "pand %%xmm5,%%xmm2 \n"
181 "pand %%xmm5,%%xmm3 \n"
182 "pavgw %%xmm2,%%xmm0 \n"
183 "pavgw %%xmm3,%%xmm1 \n"
184 "packuswb %%xmm1,%%xmm0 \n"
185 "movdqa %%xmm0," MEMACCESS(1) " \n"
186 "lea " MEMLEA(0x10,1) ",%1 \n"
187 "sub $0x10,%2 \n"
188 "jg 1b \n"
189 : "+r"(src_ptr), // %0
190 "+r"(dst_ptr), // %1
191 "+r"(dst_width) // %2
192 : "r"((intptr_t)(src_stride)) // %3
193 : "memory", "cc"
194 #if defined(__native_client__) && defined(__x86_64__)
195 , "r14"
196 #endif
197 #if defined(__SSE2__)
198 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
199 #endif
200 );
201 }
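
// Scalar sketch of the 2x2 box above (illustrative only; the helper name is
// not a LibYuv function). The SSE2 code uses two nested rounded averages
// (pavgb on the rows, then pavgw on the masked even/odd bytes) rather than a
// single (a + b + c + d + 2) >> 2, so it matches the true box average to
// within one level.
static inline void ScaleRowDown2Box_Sketch(const uint8* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8* dst_ptr, int dst_width) {
  const uint8* row1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    int left = (src_ptr[x * 2] + row1[x * 2] + 1) >> 1;           // pavgb
    int right = (src_ptr[x * 2 + 1] + row1[x * 2 + 1] + 1) >> 1;  // pavgb
    dst_ptr[x] = (uint8)((left + right + 1) >> 1);                // pavgw
  }
}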
202
203 void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
204 uint8* dst_ptr, int dst_width) {
205 asm volatile (
206 LABELALIGN
207 "1: \n"
208 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
209 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
210 "lea " MEMLEA(0x20,0) ",%0 \n"
211 "psrlw $0x8,%%xmm0 \n"
212 "psrlw $0x8,%%xmm1 \n"
213 "packuswb %%xmm1,%%xmm0 \n"
214 "movdqu %%xmm0," MEMACCESS(1) " \n"
215 "lea " MEMLEA(0x10,1) ",%1 \n"
216 "sub $0x10,%2 \n"
217 "jg 1b \n"
218 : "+r"(src_ptr), // %0
219 "+r"(dst_ptr), // %1
220 "+r"(dst_width) // %2
221 :
222 : "memory", "cc"
223 #if defined(__SSE2__)
224 , "xmm0", "xmm1"
225 #endif
226 );
227 }
228
229 void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
230 ptrdiff_t src_stride,
231 uint8* dst_ptr, int dst_width) {
232 asm volatile (
233 "pcmpeqb %%xmm5,%%xmm5 \n"
234 "psrlw $0x8,%%xmm5 \n"
235
236 LABELALIGN
237 "1: \n"
238 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
239 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
240 "lea " MEMLEA(0x20,0) ",%0 \n"
241 "movdqa %%xmm0,%%xmm2 \n"
242 "psrlw $0x8,%%xmm0 \n"
243 "movdqa %%xmm1,%%xmm3 \n"
244 "psrlw $0x8,%%xmm1 \n"
245 "pand %%xmm5,%%xmm2 \n"
246 "pand %%xmm5,%%xmm3 \n"
247 "pavgw %%xmm2,%%xmm0 \n"
248 "pavgw %%xmm3,%%xmm1 \n"
249 "packuswb %%xmm1,%%xmm0 \n"
250 "movdqu %%xmm0," MEMACCESS(1) " \n"
251 "lea " MEMLEA(0x10,1) ",%1 \n"
252 "sub $0x10,%2 \n"
253 "jg 1b \n"
254 : "+r"(src_ptr), // %0
255 "+r"(dst_ptr), // %1
256 "+r"(dst_width) // %2
257 :
258 : "memory", "cc"
259 #if defined(__SSE2__)
260 , "xmm0", "xmm1", "xmm5"
261 #endif
262 );
263 }
264
265 void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
266 ptrdiff_t src_stride,
267 uint8* dst_ptr, int dst_width) {
268 asm volatile (
269 "pcmpeqb %%xmm5,%%xmm5 \n"
270 "psrlw $0x8,%%xmm5 \n"
271
272 LABELALIGN
273 "1: \n"
274 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
275 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
276 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
277 BUNDLEALIGN
278 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
279 "lea " MEMLEA(0x20,0) ",%0 \n"
280 "pavgb %%xmm2,%%xmm0 \n"
281 "pavgb %%xmm3,%%xmm1 \n"
282 "movdqa %%xmm0,%%xmm2 \n"
283 "psrlw $0x8,%%xmm0 \n"
284 "movdqa %%xmm1,%%xmm3 \n"
285 "psrlw $0x8,%%xmm1 \n"
286 "pand %%xmm5,%%xmm2 \n"
287 "pand %%xmm5,%%xmm3 \n"
288 "pavgw %%xmm2,%%xmm0 \n"
289 "pavgw %%xmm3,%%xmm1 \n"
290 "packuswb %%xmm1,%%xmm0 \n"
291 "movdqu %%xmm0," MEMACCESS(1) " \n"
292 "lea " MEMLEA(0x10,1) ",%1 \n"
293 "sub $0x10,%2 \n"
294 "jg 1b \n"
295 : "+r"(src_ptr), // %0
296 "+r"(dst_ptr), // %1
297 "+r"(dst_width) // %2
298 : "r"((intptr_t)(src_stride)) // %3
299 : "memory", "cc"
300 #if defined(__native_client__) && defined(__x86_64__)
301 , "r14"
302 #endif
303 #if defined(__SSE2__)
304 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
305 #endif
306 );
307 }
308
309 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
310 uint8* dst_ptr, int dst_width) {
311 asm volatile (
312 "pcmpeqb %%xmm5,%%xmm5 \n"
313 "psrld $0x18,%%xmm5 \n"
314 "pslld $0x10,%%xmm5 \n"
315
316 LABELALIGN
317 "1: \n"
318 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
319 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
320 "lea " MEMLEA(0x20,0) ",%0 \n"
321 "pand %%xmm5,%%xmm0 \n"
322 "pand %%xmm5,%%xmm1 \n"
323 "packuswb %%xmm1,%%xmm0 \n"
324 "psrlw $0x8,%%xmm0 \n"
325 "packuswb %%xmm0,%%xmm0 \n"
326 "movq %%xmm0," MEMACCESS(1) " \n"
327 "lea " MEMLEA(0x8,1) ",%1 \n"
328 "sub $0x8,%2 \n"
329 "jg 1b \n"
330 : "+r"(src_ptr), // %0
331 "+r"(dst_ptr), // %1
332 "+r"(dst_width) // %2
333 :
334 : "memory", "cc"
335 #if defined(__SSE2__)
336 , "xmm0", "xmm1", "xmm5"
337 #endif
338 );
339 }
340
341 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
342 uint8* dst_ptr, int dst_width) {
343 intptr_t stridex3 = 0;
344 asm volatile (
345 "pcmpeqb %%xmm7,%%xmm7 \n"
346 "psrlw $0x8,%%xmm7 \n"
347 "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
348
349 LABELALIGN
350 "1: \n"
351 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
352 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
353 MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
354 BUNDLEALIGN
355 MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
356 "pavgb %%xmm2,%%xmm0 \n"
357 "pavgb %%xmm3,%%xmm1 \n"
358 MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2
359 BUNDLEALIGN
360 MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3
361 MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4
362 MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5
363 "lea " MEMLEA(0x20,0) ",%0 \n"
364 "pavgb %%xmm4,%%xmm2 \n"
365 "pavgb %%xmm2,%%xmm0 \n"
366 "pavgb %%xmm5,%%xmm3 \n"
367 "pavgb %%xmm3,%%xmm1 \n"
368 "movdqa %%xmm0,%%xmm2 \n"
369 "psrlw $0x8,%%xmm0 \n"
370 "movdqa %%xmm1,%%xmm3 \n"
371 "psrlw $0x8,%%xmm1 \n"
372 "pand %%xmm7,%%xmm2 \n"
373 "pand %%xmm7,%%xmm3 \n"
374 "pavgw %%xmm2,%%xmm0 \n"
375 "pavgw %%xmm3,%%xmm1 \n"
376 "packuswb %%xmm1,%%xmm0 \n"
377 "movdqa %%xmm0,%%xmm2 \n"
378 "psrlw $0x8,%%xmm0 \n"
379 "pand %%xmm7,%%xmm2 \n"
380 "pavgw %%xmm2,%%xmm0 \n"
381 "packuswb %%xmm0,%%xmm0 \n"
382 "movq %%xmm0," MEMACCESS(1) " \n"
383 "lea " MEMLEA(0x8,1) ",%1 \n"
384 "sub $0x8,%2 \n"
385 "jg 1b \n"
386 : "+r"(src_ptr), // %0
387 "+r"(dst_ptr), // %1
388 "+r"(dst_width), // %2
389 "+r"(stridex3) // %3
390 : "r"((intptr_t)(src_stride)) // %4
391 : "memory", "cc"
392 #if defined(__native_client__) && defined(__x86_64__)
393 , "r14"
394 #endif
395 #if defined(__SSE2__)
396 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
397 #endif
398 );
399 }
400
401 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
402 uint8* dst_ptr, int dst_width) {
403 asm volatile (
404 "movdqa %0,%%xmm3 \n"
405 "movdqa %1,%%xmm4 \n"
406 "movdqa %2,%%xmm5 \n"
407 :
408 : "m"(kShuf0), // %0
409 "m"(kShuf1), // %1
410 "m"(kShuf2) // %2
411 );
412 asm volatile (
413 LABELALIGN
414 "1: \n"
415 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
416 "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
417 "lea " MEMLEA(0x20,0) ",%0 \n"
418 "movdqa %%xmm2,%%xmm1 \n"
419 "palignr $0x8,%%xmm0,%%xmm1 \n"
420 "pshufb %%xmm3,%%xmm0 \n"
421 "pshufb %%xmm4,%%xmm1 \n"
422 "pshufb %%xmm5,%%xmm2 \n"
423 "movq %%xmm0," MEMACCESS(1) " \n"
424 "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
425 "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
426 "lea " MEMLEA(0x18,1) ",%1 \n"
427 "sub $0x18,%2 \n"
428 "jg 1b \n"
429 : "+r"(src_ptr), // %0
430 "+r"(dst_ptr), // %1
431 "+r"(dst_width) // %2
432 :
433 : "memory", "cc"
434 #if defined(__SSE2__)
435 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
436 #endif
437 );
438 }
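
// Scalar sketch of the shuffles above (illustrative only; the helper name is
// not a LibYuv function): 3/4 point sampling keeps source pixels 0, 1 and 3
// of every group of 4, as encoded by kShuf0/kShuf1/kShuf2.
static inline void ScaleRowDown34_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                         int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}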
439
440 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
441 ptrdiff_t src_stride,
442 uint8* dst_ptr, int dst_width) {
443 asm volatile (
444 "movdqa %0,%%xmm2 \n" // kShuf01
445 "movdqa %1,%%xmm3 \n" // kShuf11
446 "movdqa %2,%%xmm4 \n" // kShuf21
447 :
448 : "m"(kShuf01), // %0
449 "m"(kShuf11), // %1
450 "m"(kShuf21) // %2
451 );
452 asm volatile (
453 "movdqa %0,%%xmm5 \n" // kMadd01
454 "movdqa %1,%%xmm0 \n" // kMadd11
455 "movdqa %2,%%xmm1 \n" // kRound34
456 :
457 : "m"(kMadd01), // %0
458 "m"(kMadd11), // %1
459 "m"(kRound34) // %2
460 );
461 asm volatile (
462 LABELALIGN
463 "1: \n"
464 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
465 MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7
466 "pavgb %%xmm7,%%xmm6 \n"
467 "pshufb %%xmm2,%%xmm6 \n"
468 "pmaddubsw %%xmm5,%%xmm6 \n"
469 "paddsw %%xmm1,%%xmm6 \n"
470 "psrlw $0x2,%%xmm6 \n"
471 "packuswb %%xmm6,%%xmm6 \n"
472 "movq %%xmm6," MEMACCESS(1) " \n"
473 "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
474 MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
475 "pavgb %%xmm7,%%xmm6 \n"
476 "pshufb %%xmm3,%%xmm6 \n"
477 "pmaddubsw %%xmm0,%%xmm6 \n"
478 "paddsw %%xmm1,%%xmm6 \n"
479 "psrlw $0x2,%%xmm6 \n"
480 "packuswb %%xmm6,%%xmm6 \n"
481 "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
482 "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
483 BUNDLEALIGN
484 MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7
485 "lea " MEMLEA(0x20,0) ",%0 \n"
486 "pavgb %%xmm7,%%xmm6 \n"
487 "pshufb %%xmm4,%%xmm6 \n"
488 "pmaddubsw %4,%%xmm6 \n"
489 "paddsw %%xmm1,%%xmm6 \n"
490 "psrlw $0x2,%%xmm6 \n"
491 "packuswb %%xmm6,%%xmm6 \n"
492 "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
493 "lea " MEMLEA(0x18,1) ",%1 \n"
494 "sub $0x18,%2 \n"
495 "jg 1b \n"
496 : "+r"(src_ptr), // %0
497 "+r"(dst_ptr), // %1
498 "+r"(dst_width) // %2
499 : "r"((intptr_t)(src_stride)), // %3
500 "m"(kMadd21) // %4
501 : "memory", "cc"
502 #if defined(__native_client__) && defined(__x86_64__)
503 , "r14"
504 #endif
505 #if defined(__SSE2__)
506 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
507 #endif
508 );
509 }
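
// Scalar sketch of the weighted 3/4 filter above (illustrative only; the
// helper name is not a LibYuv function). After the two rows are averaged with
// pavgb, kShuf01/kShuf11/kShuf21 pair up neighbouring pixels and the kMadd
// tables weight each pair with (3,1), (2,2) or (1,3); kRound34 adds 2 before
// the divide by 4 (psrlw $0x2).
static inline uint8 Filter34_Sketch(uint8 a, uint8 b, int wa, int wb) {
  return (uint8)((a * wa + b * wb + 2) >> 2);  // wa + wb == 4
}
// For each 4 averaged source pixels s[0..3] the three outputs are:
//   Filter34_Sketch(s[0], s[1], 3, 1), Filter34_Sketch(s[1], s[2], 2, 2),
//   Filter34_Sketch(s[2], s[3], 1, 3).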
510
511 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
512 ptrdiff_t src_stride,
513 uint8* dst_ptr, int dst_width) {
514 asm volatile (
515 "movdqa %0,%%xmm2 \n" // kShuf01
516 "movdqa %1,%%xmm3 \n" // kShuf11
517 "movdqa %2,%%xmm4 \n" // kShuf21
518 :
519 : "m"(kShuf01), // %0
520 "m"(kShuf11), // %1
521 "m"(kShuf21) // %2
522 );
523 asm volatile (
524 "movdqa %0,%%xmm5 \n" // kMadd01
525 "movdqa %1,%%xmm0 \n" // kMadd11
526 "movdqa %2,%%xmm1 \n" // kRound34
527 :
528 : "m"(kMadd01), // %0
529 "m"(kMadd11), // %1
530 "m"(kRound34) // %2
531 );
532
533 asm volatile (
534 LABELALIGN
535 "1: \n"
536 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
537 MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7
538 "pavgb %%xmm6,%%xmm7 \n"
539 "pavgb %%xmm7,%%xmm6 \n"
540 "pshufb %%xmm2,%%xmm6 \n"
541 "pmaddubsw %%xmm5,%%xmm6 \n"
542 "paddsw %%xmm1,%%xmm6 \n"
543 "psrlw $0x2,%%xmm6 \n"
544 "packuswb %%xmm6,%%xmm6 \n"
545 "movq %%xmm6," MEMACCESS(1) " \n"
546 "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
547 MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
548 "pavgb %%xmm6,%%xmm7 \n"
549 "pavgb %%xmm7,%%xmm6 \n"
550 "pshufb %%xmm3,%%xmm6 \n"
551 "pmaddubsw %%xmm0,%%xmm6 \n"
552 "paddsw %%xmm1,%%xmm6 \n"
553 "psrlw $0x2,%%xmm6 \n"
554 "packuswb %%xmm6,%%xmm6 \n"
555 "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
556 "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
557 MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7
558 "lea " MEMLEA(0x20,0) ",%0 \n"
559 "pavgb %%xmm6,%%xmm7 \n"
560 "pavgb %%xmm7,%%xmm6 \n"
561 "pshufb %%xmm4,%%xmm6 \n"
562 "pmaddubsw %4,%%xmm6 \n"
563 "paddsw %%xmm1,%%xmm6 \n"
564 "psrlw $0x2,%%xmm6 \n"
565 "packuswb %%xmm6,%%xmm6 \n"
566 "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
567 "lea " MEMLEA(0x18,1) ",%1 \n"
568 "sub $0x18,%2 \n"
569 "jg 1b \n"
570 : "+r"(src_ptr), // %0
571 "+r"(dst_ptr), // %1
572 "+r"(dst_width) // %2
573 : "r"((intptr_t)(src_stride)), // %3
574 "m"(kMadd21) // %4
575 : "memory", "cc"
576 #if defined(__native_client__) && defined(__x86_64__)
577 , "r14"
578 #endif
579 #if defined(__SSE2__)
580 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
581 #endif
582 );
583 }
584
585 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
586 uint8* dst_ptr, int dst_width) {
587 asm volatile (
588 "movdqa %3,%%xmm4 \n"
589 "movdqa %4,%%xmm5 \n"
590
591 LABELALIGN
592 "1: \n"
593 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
594 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
595 "lea " MEMLEA(0x20,0) ",%0 \n"
596 "pshufb %%xmm4,%%xmm0 \n"
597 "pshufb %%xmm5,%%xmm1 \n"
598 "paddusb %%xmm1,%%xmm0 \n"
599 "movq %%xmm0," MEMACCESS(1) " \n"
600 "movhlps %%xmm0,%%xmm1 \n"
601 "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
602 "lea " MEMLEA(0xc,1) ",%1 \n"
603 "sub $0xc,%2 \n"
604 "jg 1b \n"
605 : "+r"(src_ptr), // %0
606 "+r"(dst_ptr), // %1
607 "+r"(dst_width) // %2
608 : "m"(kShuf38a), // %3
609 "m"(kShuf38b) // %4
610 : "memory", "cc"
611 #if defined(__SSE2__)
612 , "xmm0", "xmm1", "xmm4", "xmm5"
613 #endif
614 );
615 }
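
// Scalar sketch of the 3/8 point sampling above (illustrative only; the
// helper name is not a LibYuv function): each group of 16 source pixels
// yields 6 output pixels taken at offsets 0, 3, 6, 8, 11 and 14, the non-128
// entries of kShuf38a/kShuf38b.
static inline void ScaleRowDown38_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                         int dst_width) {
  static const int kOffsets[6] = { 0, 3, 6, 8, 11, 14 };
  for (int x = 0; x < dst_width; x += 6) {
    for (int i = 0; i < 6; ++i) {
      dst_ptr[x + i] = src_ptr[kOffsets[i]];
    }
    src_ptr += 16;
  }
}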
616
617 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
618 ptrdiff_t src_stride,
619 uint8* dst_ptr, int dst_width) {
620 asm volatile (
621 "movdqa %0,%%xmm2 \n"
622 "movdqa %1,%%xmm3 \n"
623 "movdqa %2,%%xmm4 \n"
624 "movdqa %3,%%xmm5 \n"
625 :
626 : "m"(kShufAb0), // %0
627 "m"(kShufAb1), // %1
628 "m"(kShufAb2), // %2
629 "m"(kScaleAb2) // %3
630 );
631 asm volatile (
632 LABELALIGN
633 "1: \n"
634 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
635 MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0
636 "lea " MEMLEA(0x10,0) ",%0 \n"
637 "movdqa %%xmm0,%%xmm1 \n"
638 "pshufb %%xmm2,%%xmm1 \n"
639 "movdqa %%xmm0,%%xmm6 \n"
640 "pshufb %%xmm3,%%xmm6 \n"
641 "paddusw %%xmm6,%%xmm1 \n"
642 "pshufb %%xmm4,%%xmm0 \n"
643 "paddusw %%xmm0,%%xmm1 \n"
644 "pmulhuw %%xmm5,%%xmm1 \n"
645 "packuswb %%xmm1,%%xmm1 \n"
646 "sub $0x6,%2 \n"
647 "movd %%xmm1," MEMACCESS(1) " \n"
648 "psrlq $0x10,%%xmm1 \n"
649 "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
650 "lea " MEMLEA(0x6,1) ",%1 \n"
651 "jg 1b \n"
652 : "+r"(src_ptr), // %0
653 "+r"(dst_ptr), // %1
654 "+r"(dst_width) // %2
655 : "r"((intptr_t)(src_stride)) // %3
656 : "memory", "cc"
657 #if defined(__native_client__) && defined(__x86_64__)
658 , "r14"
659 #endif
660 #if defined(__SSE2__)
661 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
662 #endif
663 );
664 }
665
666 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
667 ptrdiff_t src_stride,
668 uint8* dst_ptr, int dst_width) {
669 asm volatile (
670 "movdqa %0,%%xmm2 \n"
671 "movdqa %1,%%xmm3 \n"
672 "movdqa %2,%%xmm4 \n"
673 "pxor %%xmm5,%%xmm5 \n"
674 :
675 : "m"(kShufAc), // %0
676 "m"(kShufAc3), // %1
677 "m"(kScaleAc33) // %2
678 );
679 asm volatile (
680 LABELALIGN
681 "1: \n"
682 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
683 MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6
684 "movhlps %%xmm0,%%xmm1 \n"
685 "movhlps %%xmm6,%%xmm7 \n"
686 "punpcklbw %%xmm5,%%xmm0 \n"
687 "punpcklbw %%xmm5,%%xmm1 \n"
688 "punpcklbw %%xmm5,%%xmm6 \n"
689 "punpcklbw %%xmm5,%%xmm7 \n"
690 "paddusw %%xmm6,%%xmm0 \n"
691 "paddusw %%xmm7,%%xmm1 \n"
692 MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6
693 "lea " MEMLEA(0x10,0) ",%0 \n"
694 "movhlps %%xmm6,%%xmm7 \n"
695 "punpcklbw %%xmm5,%%xmm6 \n"
696 "punpcklbw %%xmm5,%%xmm7 \n"
697 "paddusw %%xmm6,%%xmm0 \n"
698 "paddusw %%xmm7,%%xmm1 \n"
699 "movdqa %%xmm0,%%xmm6 \n"
700 "psrldq $0x2,%%xmm0 \n"
701 "paddusw %%xmm0,%%xmm6 \n"
702 "psrldq $0x2,%%xmm0 \n"
703 "paddusw %%xmm0,%%xmm6 \n"
704 "pshufb %%xmm2,%%xmm6 \n"
705 "movdqa %%xmm1,%%xmm7 \n"
706 "psrldq $0x2,%%xmm1 \n"
707 "paddusw %%xmm1,%%xmm7 \n"
708 "psrldq $0x2,%%xmm1 \n"
709 "paddusw %%xmm1,%%xmm7 \n"
710 "pshufb %%xmm3,%%xmm7 \n"
711 "paddusw %%xmm7,%%xmm6 \n"
712 "pmulhuw %%xmm4,%%xmm6 \n"
713 "packuswb %%xmm6,%%xmm6 \n"
714 "sub $0x6,%2 \n"
715 "movd %%xmm6," MEMACCESS(1) " \n"
716 "psrlq $0x10,%%xmm6 \n"
717 "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
718 "lea " MEMLEA(0x6,1) ",%1 \n"
719 "jg 1b \n"
720 : "+r"(src_ptr), // %0
721 "+r"(dst_ptr), // %1
722 "+r"(dst_width) // %2
723 : "r"((intptr_t)(src_stride)) // %3
724 : "memory", "cc"
725 #if defined(__native_client__) && defined(__x86_64__)
726 , "r14"
727 #endif
728 #if defined(__SSE2__)
729 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
730 #endif
731 );
732 }
733
734 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
735 uint16* dst_ptr, int src_width, int src_height) {
736 int tmp_height = 0;
737 intptr_t tmp_src = 0;
738 asm volatile (
739 "pxor %%xmm4,%%xmm4 \n"
740 "sub $0x1,%5 \n"
741
742 LABELALIGN
743 "1: \n"
744 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
745 "mov %0,%3 \n"
746 "add %6,%0 \n"
747 "movdqa %%xmm0,%%xmm1 \n"
748 "punpcklbw %%xmm4,%%xmm0 \n"
749 "punpckhbw %%xmm4,%%xmm1 \n"
750 "mov %5,%2 \n"
751 "test %2,%2 \n"
752 "je 3f \n"
753
754 LABELALIGN
755 "2: \n"
756 "movdqa " MEMACCESS(0) ",%%xmm2 \n"
757 "add %6,%0 \n"
758 "movdqa %%xmm2,%%xmm3 \n"
759 "punpcklbw %%xmm4,%%xmm2 \n"
760 "punpckhbw %%xmm4,%%xmm3 \n"
761 "paddusw %%xmm2,%%xmm0 \n"
762 "paddusw %%xmm3,%%xmm1 \n"
763 "sub $0x1,%2 \n"
764 "jg 2b \n"
765
766 LABELALIGN
767 "3: \n"
768 "movdqa %%xmm0," MEMACCESS(1) " \n"
769 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
770 "lea " MEMLEA(0x10,3) ",%0 \n"
771 "lea " MEMLEA(0x20,1) ",%1 \n"
772 "sub $0x10,%4 \n"
773 "jg 1b \n"
774 : "+r"(src_ptr), // %0
775 "+r"(dst_ptr), // %1
776 "+r"(tmp_height), // %2
777 "+r"(tmp_src), // %3
778 "+r"(src_width), // %4
779 "+rm"(src_height) // %5
780 : "rm"((intptr_t)(src_stride)) // %6
781 : "memory", "cc"
782 #if defined(__SSE2__)
783 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
784 #endif
785 );
786 }
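
// Scalar sketch of the row accumulation above (illustrative only; the helper
// name is not a LibYuv function): each destination word is the saturating sum
// of src_height vertically adjacent source bytes (paddusw), ready to be
// scaled by one of the 65536/n reciprocals afterwards.
static inline void ScaleAddRows_Sketch(const uint8* src_ptr,
                                       ptrdiff_t src_stride, uint16* dst_ptr,
                                       int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint32 sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += src_ptr[x + y * src_stride];
    }
    dst_ptr[x] = (uint16)(sum > 65535u ? 65535u : sum);  // paddusw saturates
  }
}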
787
788 // Bilinear column filtering. SSSE3 version.
789 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
790 int dst_width, int x, int dx) {
791 intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
792 asm volatile (
793 "movd %6,%%xmm2 \n"
794 "movd %7,%%xmm3 \n"
795 "movl $0x04040000,%k2 \n"
796 "movd %k2,%%xmm5 \n"
797 "pcmpeqb %%xmm6,%%xmm6 \n"
798 "psrlw $0x9,%%xmm6 \n"
799 "pextrw $0x1,%%xmm2,%k3 \n"
800 "subl $0x2,%5 \n"
801 "jl 29f \n"
802 "movdqa %%xmm2,%%xmm0 \n"
803 "paddd %%xmm3,%%xmm0 \n"
804 "punpckldq %%xmm0,%%xmm2 \n"
805 "punpckldq %%xmm3,%%xmm3 \n"
806 "paddd %%xmm3,%%xmm3 \n"
807 "pextrw $0x3,%%xmm2,%k4 \n"
808
809 LABELALIGN
810 "2: \n"
811 "movdqa %%xmm2,%%xmm1 \n"
812 "paddd %%xmm3,%%xmm2 \n"
813 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
814 "movd %k2,%%xmm0 \n"
815 "psrlw $0x9,%%xmm1 \n"
816 BUNDLEALIGN
817 MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
818 "movd %k2,%%xmm4 \n"
819 "pshufb %%xmm5,%%xmm1 \n"
820 "punpcklwd %%xmm4,%%xmm0 \n"
821 "pxor %%xmm6,%%xmm1 \n"
822 "pmaddubsw %%xmm1,%%xmm0 \n"
823 "pextrw $0x1,%%xmm2,%k3 \n"
824 "pextrw $0x3,%%xmm2,%k4 \n"
825 "psrlw $0x7,%%xmm0 \n"
826 "packuswb %%xmm0,%%xmm0 \n"
827 "movd %%xmm0,%k2 \n"
828 "mov %w2," MEMACCESS(0) " \n"
829 "lea " MEMLEA(0x2,0) ",%0 \n"
830 "sub $0x2,%5 \n"
831 "jge 2b \n"
832
833 LABELALIGN
834 "29: \n"
835 "addl $0x1,%5 \n"
836 "jl 99f \n"
837 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
838 "movd %k2,%%xmm0 \n"
839 "psrlw $0x9,%%xmm2 \n"
840 "pshufb %%xmm5,%%xmm2 \n"
841 "pxor %%xmm6,%%xmm2 \n"
842 "pmaddubsw %%xmm2,%%xmm0 \n"
843 "psrlw $0x7,%%xmm0 \n"
844 "packuswb %%xmm0,%%xmm0 \n"
845 "movd %%xmm0,%k2 \n"
846 "mov %b2," MEMACCESS(0) " \n"
847 "99: \n"
848 : "+r"(dst_ptr), // %0
849 "+r"(src_ptr), // %1
850 "+a"(temp_pixel), // %2
851 "+r"(x0), // %3
852 "+r"(x1), // %4
853 "+rm"(dst_width) // %5
854 : "rm"(x), // %6
855 "rm"(dx) // %7
856 : "memory", "cc"
857 #if defined(__native_client__) && defined(__x86_64__)
858 , "r14"
859 #endif
860 #if defined(__SSE2__)
861 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
862 #endif
863 );
864 }
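
// Scalar sketch of one output pixel above (illustrative only; the helper name
// is not a LibYuv function). x and dx are 16.16 fixed point; the SSSE3 code
// keeps only the top 7 fraction bits, builds the weight pair (127 - f, f)
// with the pxor against 0x7f, and divides by 128 with psrlw $0x7.
static inline uint8 FilterCols_Sketch(const uint8* src_ptr, int x) {
  int xi = x >> 16;         // integer source position (pextrw $0x1)
  int f = (x >> 9) & 0x7f;  // 7-bit fraction (psrlw $0x9)
  return (uint8)((src_ptr[xi] * (127 - f) + src_ptr[xi + 1] * f) >> 7);
}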
865
866 // Reads 16 pixels, duplicates them and writes 32 pixels.
867 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
868 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
869 int dst_width, int x, int dx) {
870 asm volatile (
871 LABELALIGN
872 "1: \n"
873 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
874 "lea " MEMLEA(0x10,1) ",%1 \n"
875 "movdqa %%xmm0,%%xmm1 \n"
876 "punpcklbw %%xmm0,%%xmm0 \n"
877 "punpckhbw %%xmm1,%%xmm1 \n"
878 "sub $0x20,%2 \n"
879 "movdqa %%xmm0," MEMACCESS(0) " \n"
880 "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
881 "lea " MEMLEA(0x20,0) ",%0 \n"
882 "jg 1b \n"
883
884 : "+r"(dst_ptr), // %0
885 "+r"(src_ptr), // %1
886 "+r"(dst_width) // %2
887 :
888 : "memory", "cc"
889 #if defined(__SSE2__)
890 , "xmm0", "xmm1"
891 #endif
892 );
893 }
894
895 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
896 ptrdiff_t src_stride,
897 uint8* dst_argb, int dst_width) {
898 asm volatile (
899 LABELALIGN
900 "1: \n"
901 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
902 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
903 "lea " MEMLEA(0x20,0) ",%0 \n"
904 "shufps $0xdd,%%xmm1,%%xmm0 \n"
905 "sub $0x4,%2 \n"
906 "movdqa %%xmm0," MEMACCESS(1) " \n"
907 "lea " MEMLEA(0x10,1) ",%1 \n"
908 "jg 1b \n"
909 : "+r"(src_argb), // %0
910 "+r"(dst_argb), // %1
911 "+r"(dst_width) // %2
912 :
913 : "memory", "cc"
914 #if defined(__SSE2__)
915 , "xmm0", "xmm1"
916 #endif
917 );
918 }
919
920 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
921 ptrdiff_t src_stride,
922 uint8* dst_argb, int dst_width) {
923 asm volatile (
924 LABELALIGN
925 "1: \n"
926 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
927 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
928 "lea " MEMLEA(0x20,0) ",%0 \n"
929 "movdqa %%xmm0,%%xmm2 \n"
930 "shufps $0x88,%%xmm1,%%xmm0 \n"
931 "shufps $0xdd,%%xmm1,%%xmm2 \n"
932 "pavgb %%xmm2,%%xmm0 \n"
933 "sub $0x4,%2 \n"
934 "movdqa %%xmm0," MEMACCESS(1) " \n"
935 "lea " MEMLEA(0x10,1) ",%1 \n"
936 "jg 1b \n"
937 : "+r"(src_argb), // %0
938 "+r"(dst_argb), // %1
939 "+r"(dst_width) // %2
940 :
941 : "memory", "cc"
942 #if defined(__SSE2__)
943 , "xmm0", "xmm1"
944 #endif
945 );
946 }
947
948 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
949 ptrdiff_t src_stride,
950 uint8* dst_argb, int dst_width) {
951 asm volatile (
952 LABELALIGN
953 "1: \n"
954 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
955 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
956 BUNDLEALIGN
957 MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
958 MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
959 "lea " MEMLEA(0x20,0) ",%0 \n"
960 "pavgb %%xmm2,%%xmm0 \n"
961 "pavgb %%xmm3,%%xmm1 \n"
962 "movdqa %%xmm0,%%xmm2 \n"
963 "shufps $0x88,%%xmm1,%%xmm0 \n"
964 "shufps $0xdd,%%xmm1,%%xmm2 \n"
965 "pavgb %%xmm2,%%xmm0 \n"
966 "sub $0x4,%2 \n"
967 "movdqa %%xmm0," MEMACCESS(1) " \n"
968 "lea " MEMLEA(0x10,1) ",%1 \n"
969 "jg 1b \n"
970 : "+r"(src_argb), // %0
971 "+r"(dst_argb), // %1
972 "+r"(dst_width) // %2
973 : "r"((intptr_t)(src_stride)) // %3
974 : "memory", "cc"
975 #if defined(__native_client__) && defined(__x86_64__)
976 , "r14"
977 #endif
978 #if defined(__SSE2__)
979 , "xmm0", "xmm1", "xmm2", "xmm3"
980 #endif
981 );
982 }
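
// Scalar sketch of the ARGB 2x2 box above (illustrative only; the helper name
// is not a LibYuv function): each channel of an output pixel is two nested
// rounded averages (pavgb) of the four source pixels it covers, with the
// shufps pair separating even and odd source pixels.
static inline void ScaleARGBRowDown2Box_Sketch(const uint8* src_argb,
                                               ptrdiff_t src_stride,
                                               uint8* dst_argb,
                                               int dst_width) {
  const uint8* row1 = src_argb + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    for (int c = 0; c < 4; ++c) {  // B, G, R, A
      int even = (src_argb[x * 8 + c] + row1[x * 8 + c] + 1) >> 1;
      int odd = (src_argb[x * 8 + 4 + c] + row1[x * 8 + 4 + c] + 1) >> 1;
      dst_argb[x * 4 + c] = (uint8)((even + odd + 1) >> 1);
    }
  }
}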
983
984 // Reads 4 pixels at a time.
985 // Alignment requirement: dst_argb 16 byte aligned.
986 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
987 int src_stepx,
988 uint8* dst_argb, int dst_width) {
989 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
990 intptr_t src_stepx_x12 = 0;
991 asm volatile (
992 "lea " MEMLEA3(0x00,1,4) ",%1 \n"
993 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
994 LABELALIGN
995 "1: \n"
996 "movd " MEMACCESS(0) ",%%xmm0 \n"
997 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
998 "punpckldq %%xmm1,%%xmm0 \n"
999 BUNDLEALIGN
1000 MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
1001 MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
1002 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
1003 "punpckldq %%xmm3,%%xmm2 \n"
1004 "punpcklqdq %%xmm2,%%xmm0 \n"
1005 "sub $0x4,%3 \n"
1006 "movdqa %%xmm0," MEMACCESS(2) " \n"
1007 "lea " MEMLEA(0x10,2) ",%2 \n"
1008 "jg 1b \n"
1009 : "+r"(src_argb), // %0
1010 "+r"(src_stepx_x4), // %1
1011 "+r"(dst_argb), // %2
1012 "+r"(dst_width), // %3
1013 "+r"(src_stepx_x12) // %4
1014 :
1015 : "memory", "cc"
1016 #if defined(__native_client__) && defined(__x86_64__)
1017 , "r14"
1018 #endif
1019 #if defined(__SSE2__)
1020 , "xmm0", "xmm1", "xmm2", "xmm3"
1021 #endif
1022 );
1023 }
1024
1025 // Blends four 2x2 to 4x1.
1026 // Alignment requirement: dst_argb 16 byte aligned.
1027 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
1028 ptrdiff_t src_stride, int src_stepx,
1029 uint8* dst_argb, int dst_width) {
1030 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
1031 intptr_t src_stepx_x12 = 0;
1032 intptr_t row1 = (intptr_t)(src_stride);
1033 asm volatile (
1034 "lea " MEMLEA3(0x00,1,4) ",%1 \n"
1035 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
1036 "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
1037
1038 LABELALIGN
1039 "1: \n"
1040 "movq " MEMACCESS(0) ",%%xmm0 \n"
1041 MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
1042 MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
1043 BUNDLEALIGN
1044 MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
1045 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
1046 "movq " MEMACCESS(5) ",%%xmm2 \n"
1047 BUNDLEALIGN
1048 MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
1049 MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
1050 MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
1051 "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
1052 "pavgb %%xmm2,%%xmm0 \n"
1053 "pavgb %%xmm3,%%xmm1 \n"
1054 "movdqa %%xmm0,%%xmm2 \n"
1055 "shufps $0x88,%%xmm1,%%xmm0 \n"
1056 "shufps $0xdd,%%xmm1,%%xmm2 \n"
1057 "pavgb %%xmm2,%%xmm0 \n"
1058 "sub $0x4,%3 \n"
1059 "movdqa %%xmm0," MEMACCESS(2) " \n"
1060 "lea " MEMLEA(0x10,2) ",%2 \n"
1061 "jg 1b \n"
1062 : "+r"(src_argb), // %0
1063 "+r"(src_stepx_x4), // %1
1064 "+r"(dst_argb), // %2
1065 "+rm"(dst_width), // %3
1066 "+r"(src_stepx_x12), // %4
1067 "+r"(row1) // %5
1068 :
1069 : "memory", "cc"
1070 #if defined(__native_client__) && defined(__x86_64__)
1071 , "r14"
1072 #endif
1073 #if defined(__SSE2__)
1074 , "xmm0", "xmm1", "xmm2", "xmm3"
1075 #endif
1076 );
1077 }
1078
1079 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
1080 int dst_width, int x, int dx) {
1081 intptr_t x0 = 0, x1 = 0;
1082 asm volatile (
1083 "movd %5,%%xmm2 \n"
1084 "movd %6,%%xmm3 \n"
1085 "pshufd $0x0,%%xmm2,%%xmm2 \n"
1086 "pshufd $0x11,%%xmm3,%%xmm0 \n"
1087 "paddd %%xmm0,%%xmm2 \n"
1088 "paddd %%xmm3,%%xmm3 \n"
1089 "pshufd $0x5,%%xmm3,%%xmm0 \n"
1090 "paddd %%xmm0,%%xmm2 \n"
1091 "paddd %%xmm3,%%xmm3 \n"
1092 "pshufd $0x0,%%xmm3,%%xmm3 \n"
1093 "pextrw $0x1,%%xmm2,%k0 \n"
1094 "pextrw $0x3,%%xmm2,%k1 \n"
1095 "cmp $0x0,%4 \n"
1096 "jl 99f \n"
1097 "sub $0x4,%4 \n"
1098 "jl 49f \n"
1099
1100 LABELALIGN
1101 "40: \n"
1102 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
1103 MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
1104 "pextrw $0x5,%%xmm2,%k0 \n"
1105 "pextrw $0x7,%%xmm2,%k1 \n"
1106 "paddd %%xmm3,%%xmm2 \n"
1107 "punpckldq %%xmm1,%%xmm0 \n"
1108 MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
1109 MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
1110 "pextrw $0x1,%%xmm2,%k0 \n"
1111 "pextrw $0x3,%%xmm2,%k1 \n"
1112 "punpckldq %%xmm4,%%xmm1 \n"
1113 "punpcklqdq %%xmm1,%%xmm0 \n"
1114 "sub $0x4,%4 \n"
1115 "movdqu %%xmm0," MEMACCESS(2) " \n"
1116 "lea " MEMLEA(0x10,2) ",%2 \n"
1117 "jge 40b \n"
1118
1119 "49: \n"
1120 "test $0x2,%4 \n"
1121 "je 29f \n"
1122 BUNDLEALIGN
1123 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
1124 MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
1125 "pextrw $0x5,%%xmm2,%k0 \n"
1126 "punpckldq %%xmm1,%%xmm0 \n"
1127 "movq %%xmm0," MEMACCESS(2) " \n"
1128 "lea " MEMLEA(0x8,2) ",%2 \n"
1129 "29: \n"
1130 "test $0x1,%4 \n"
1131 "je 99f \n"
1132 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
1133 "movd %%xmm0," MEMACCESS(2) " \n"
1134 "99: \n"
1135 : "+a"(x0), // %0
1136 "+d"(x1), // %1
1137 "+r"(dst_argb), // %2
1138 "+r"(src_argb), // %3
1139 "+r"(dst_width) // %4
1140 : "rm"(x), // %5
1141 "rm"(dx) // %6
1142 : "memory", "cc"
1143 #if defined(__native_client__) && defined(__x86_64__)
1144 , "r14"
1145 #endif
1146 #if defined(__SSE2__)
1147 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1148 #endif
1149 );
1150 }
1151
1152 // Reads 4 pixels, duplicates them and writes 8 pixels.
1153 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
1154 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
1155 int dst_width, int x, int dx) {
1156 asm volatile (
1157 LABELALIGN
1158 "1: \n"
1159 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
1160 "lea " MEMLEA(0x10,1) ",%1 \n"
1161 "movdqa %%xmm0,%%xmm1 \n"
1162 "punpckldq %%xmm0,%%xmm0 \n"
1163 "punpckhdq %%xmm1,%%xmm1 \n"
1164 "sub $0x8,%2 \n"
1165 "movdqa %%xmm0," MEMACCESS(0) " \n"
1166 "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
1167 "lea " MEMLEA(0x20,0) ",%0 \n"
1168 "jg 1b \n"
1169
1170 : "+r"(dst_argb), // %0
1171 "+r"(src_argb), // %1
1172 "+r"(dst_width) // %2
1173 :
1174 : "memory", "cc"
1175 #if defined(__native_client__) && defined(__x86_64__)
1176 , "r14"
1177 #endif
1178 #if defined(__SSE2__)
1179 , "xmm0", "xmm1"
1180 #endif
1181 );
1182 }
1183
1184 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
1185 static uvec8 kShuffleColARGB = {
1186 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
1187 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
1188 };
1189
1190 // Shuffle table for duplicating 2 fractions into 8 bytes each
1191 static uvec8 kShuffleFractions = {
1192 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
1193 };
1194
1195 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
1196 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
1197 int dst_width, int x, int dx) {
1198 intptr_t x0 = 0, x1 = 0;
1199 asm volatile (
1200 "movdqa %0,%%xmm4 \n"
1201 "movdqa %1,%%xmm5 \n"
1202 :
1203 : "m"(kShuffleColARGB), // %0
1204 "m"(kShuffleFractions) // %1
1205 );
1206
1207 asm volatile (
1208 "movd %5,%%xmm2 \n"
1209 "movd %6,%%xmm3 \n"
1210 "pcmpeqb %%xmm6,%%xmm6 \n"
1211 "psrlw $0x9,%%xmm6 \n"
1212 "pextrw $0x1,%%xmm2,%k3 \n"
1213 "sub $0x2,%2 \n"
1214 "jl 29f \n"
1215 "movdqa %%xmm2,%%xmm0 \n"
1216 "paddd %%xmm3,%%xmm0 \n"
1217 "punpckldq %%xmm0,%%xmm2 \n"
1218 "punpckldq %%xmm3,%%xmm3 \n"
1219 "paddd %%xmm3,%%xmm3 \n"
1220 "pextrw $0x3,%%xmm2,%k4 \n"
1221
1222 LABELALIGN
1223 "2: \n"
1224 "movdqa %%xmm2,%%xmm1 \n"
1225 "paddd %%xmm3,%%xmm2 \n"
1226 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
1227 "psrlw $0x9,%%xmm1 \n"
1228 BUNDLEALIGN
1229 MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
1230 "pshufb %%xmm5,%%xmm1 \n"
1231 "pshufb %%xmm4,%%xmm0 \n"
1232 "pxor %%xmm6,%%xmm1 \n"
1233 "pmaddubsw %%xmm1,%%xmm0 \n"
1234 "psrlw $0x7,%%xmm0 \n"
1235 "pextrw $0x1,%%xmm2,%k3 \n"
1236 "pextrw $0x3,%%xmm2,%k4 \n"
1237 "packuswb %%xmm0,%%xmm0 \n"
1238 "movq %%xmm0," MEMACCESS(0) " \n"
1239 "lea " MEMLEA(0x8,0) ",%0 \n"
1240 "sub $0x2,%2 \n"
1241 "jge 2b \n"
1242
1243 LABELALIGN
1244 "29: \n"
1245 "add $0x1,%2 \n"
1246 "jl 99f \n"
1247 "psrlw $0x9,%%xmm2 \n"
1248 BUNDLEALIGN
1249 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
1250 "pshufb %%xmm5,%%xmm2 \n"
1251 "pshufb %%xmm4,%%xmm0 \n"
1252 "pxor %%xmm6,%%xmm2 \n"
1253 "pmaddubsw %%xmm2,%%xmm0 \n"
1254 "psrlw $0x7,%%xmm0 \n"
1255 "packuswb %%xmm0,%%xmm0 \n"
1256 "movd %%xmm0," MEMACCESS(0) " \n"
1257
1258 LABELALIGN
1259 "99: \n"
1260 : "+r"(dst_argb), // %0
1261 "+r"(src_argb), // %1
1262 "+rm"(dst_width), // %2
1263 "+r"(x0), // %3
1264 "+r"(x1) // %4
1265 : "rm"(x), // %5
1266 "rm"(dx) // %6
1267 : "memory", "cc"
1268 #if defined(__native_client__) && defined(__x86_64__)
1269 , "r14"
1270 #endif
1271 #if defined(__SSE2__)
1272 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1273 #endif
1274 );
1275 }
1276
1277 // Divide num by div and return as 16.16 fixed point result.
1278 int FixedDiv_X86(int num, int div) {
1279 asm volatile (
1280 "cdq \n"
1281 "shld $0x10,%%eax,%%edx \n"
1282 "shl $0x10,%%eax \n"
1283 "idiv %1 \n"
1284 "mov %0, %%eax \n"
1285 : "+a"(num) // %0
1286 : "c"(div) // %1
1287 : "memory", "cc", "edx"
1288 );
1289 return num;
1290 }
1291
1292 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
1293 int FixedDiv1_X86(int num, int div) {
1294 asm volatile (
1295 "cdq \n"
1296 "shld $0x10,%%eax,%%edx \n"
1297 "shl $0x10,%%eax \n"
1298 "sub $0x10001,%%eax \n"
1299 "sbb $0x0,%%edx \n"
1300 "sub $0x1,%1 \n"
1301 "idiv %1 \n"
1302 "mov %0, %%eax \n"
1303 : "+a"(num) // %0
1304 : "c"(div) // %1
1305 : "memory", "cc", "edx"
1306 );
1307 return num;
1308 }
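
// Portable sketches of the two helpers above (illustrative only; the names
// are not LibYuv functions). The asm widens num into edx:eax as a 64-bit
// value shifted left by 16 before the idiv, which a 64-bit intermediate
// reproduces.
static inline int FixedDiv_Sketch(int num, int div) {
  return (int)((((int64)(num)) << 16) / div);
}

// FixedDiv1_X86 subtracts 0x10001 from the widened numerator and 1 from the
// divisor before dividing, matching the sub/sbb pair above.
static inline int FixedDiv1_Sketch(int num, int div) {
  return (int)(((((int64)(num)) << 16) - 0x10001) / (div - 1));
}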
1309
1310 #endif // defined(__x86_64__) || defined(__i386__)
1311
1312 #ifdef __cplusplus
1313 } // extern "C"
1314 } // namespace libyuv
1315 #endif
1316