/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
                        10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant for the 3/4 box filters, added before the shift right by 2.
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
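
// Illustrative note on how the 3/4 box filters below use these tables: the
// kShuf* tables gather overlapping source byte pairs, the kMadd* tables
// supply pmaddubsw weights of 3:1, 2:2 and 1:3, and kRound34 rounds before
// the final shift, so each output byte is roughly
//   dst = (3 * a + 1 * b + 2) >> 2
// (or the 2:2 / 1:3 weightings) for the two contributing source bytes a, b.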

static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
                         128, 128, 128, 128, 128, 128, 128, 128};

static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
                         6, 8, 11, 14, 128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
                        128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
                         6, 7, 12, 13, 128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                            65536 / 9, 65536 / 6, 0, 0};

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
                         11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
                         12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
                         13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                           65536 / 3, 65536 / 2, 0, 0};
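
// kScaleAc33 and kScaleAb2 are 16.16 fixed-point reciprocals consumed by
// pmulhuw, which keeps only the high 16 bits of the product. Multiplying a
// box sum by 65536 / n and taking the high half is a truncating divide, e.g.
// for a 3x3 box (illustrative):
//   average = (sum * (65536 / 9)) >> 16  ~=  sum / 9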

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrlw $0xf,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pavgw %%xmm5,%%xmm0 \n"
    "pavgw %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr,
                            int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrlw $0xf,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)  // movdqu (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)  // movdqu 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    "psrlw $0x1,%%xmm0 \n"
    "psrlw $0x1,%%xmm1 \n"
    "pavgw %%xmm5,%%xmm0 \n"
    "pavgw %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
                              ptrdiff_t src_stride,
                              uint8* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
    "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
    "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  asm volatile (
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
    "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)  // vmovdqu (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)  // vmovdqu 0x20(%0,%3,1),%%ymm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
    "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
    "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrld $0x18,%%xmm5 \n"
    "pslld $0x10,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrlw $0xf,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "packuswb %%xmm4,%%xmm4 \n"
    "psllw $0x3,%%xmm5 \n"
    "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)  // movdqu (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)  // movdqu 0x10(%0,%4,1),%%xmm3
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,4,2,xmm2)  // movdqu (%0,%4,2),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,2,xmm3)  // movdqu 0x10(%0,%4,2),%%xmm3
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)  // movdqu (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)  // movdqu 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "psrlw $0x4,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "=&r"(stridex3)   // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpsrld $0x18,%%ymm5,%%ymm5 \n"
    "vpslld $0x10,%%ymm5,%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpand %%ymm5,%%ymm0,%%ymm0 \n"
    "vpand %%ymm5,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  asm volatile (
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
    "vpsllw $0x3,%%ymm4,%%ymm5 \n"
    "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)  // vmovdqu (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)  // vmovdqu 0x20(%0,%3,1),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)  // vmovdqu (%0,%3,2),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)  // vmovdqu 0x20(%0,%3,2),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)  // vmovdqu (%0,%4,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)  // vmovdqu 0x20(%0,%4,1),%%ymm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),    // %3
    "r"((intptr_t)(src_stride * 3)) // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
    "movdqa %0,%%xmm3 \n"
    "movdqa %1,%%xmm4 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kShuf0),  // %0
    "m"(kShuf1),  // %1
    "m"(kShuf2)   // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "palignr $0x8,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm3,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
    "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
    "movdqa %0,%%xmm2 \n"  // kShuf01
    "movdqa %1,%%xmm3 \n"  // kShuf11
    "movdqa %2,%%xmm4 \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile(
    "movdqa %0,%%xmm5 \n"  // kMadd01
    "movdqa %1,%%xmm0 \n"  // kMadd11
    "movdqa %2,%%xmm1 \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)  // movdqu (%0,%3),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)  // movdqu 0x8(%0,%3),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm0,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)  // movdqu 0x10(%0,%3),%%xmm7
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm4,%%xmm6 \n"
    "pmaddubsw %4,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)                  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
    "movdqa %0,%%xmm2 \n"  // kShuf01
    "movdqa %1,%%xmm3 \n"  // kShuf11
    "movdqa %2,%%xmm4 \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile(
    "movdqa %0,%%xmm5 \n"  // kMadd01
    "movdqa %1,%%xmm0 \n"  // kMadd11
    "movdqa %2,%%xmm1 \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );

  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)  // movdqu (%0,%3,1),%%xmm7
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)  // movdqu 0x8(%0,%3,1),%%xmm7
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm0,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)  // movdqu 0x10(%0,%3,1),%%xmm7
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm4,%%xmm6 \n"
    "pmaddubsw %4,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)                  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ScaleRowDown38_SSSE3(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movhlps %%xmm0,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
    "lea " MEMLEA(0xc,1) ",%1 \n"
    "sub $0xc,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
    "movdqa %0,%%xmm2 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm4 \n"
    "movdqa %3,%%xmm5 \n"
  :
  : "m"(kShufAb0),  // %0
    "m"(kShufAb1),  // %1
    "m"(kShufAb2),  // %2
    "m"(kScaleAb2)  // %3
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm1)  // movdqu (%0,%3,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pshufb %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "paddusw %%xmm6,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS(1) " \n"
    "psrlq $0x10,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
    "lea " MEMLEA(0x6,1) ",%1 \n"
    "sub $0x6,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
    "movdqa %0,%%xmm2 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
  :
  : "m"(kShufAc),    // %0
    "m"(kShufAc3),   // %1
    "m"(kScaleAc33)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm6)  // movdqu (%0,%3,1),%%xmm6
    "movhlps %%xmm0,%%xmm1 \n"
    "movhlps %%xmm6,%%xmm7 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm6 \n"
    "punpcklbw %%xmm5,%%xmm7 \n"
    "paddusw %%xmm6,%%xmm0 \n"
    "paddusw %%xmm7,%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,2,xmm6)  // movdqu (%0,%3,2),%%xmm6
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movhlps %%xmm6,%%xmm7 \n"
    "punpcklbw %%xmm5,%%xmm6 \n"
    "punpcklbw %%xmm5,%%xmm7 \n"
    "paddusw %%xmm6,%%xmm0 \n"
    "paddusw %%xmm7,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "psrldq $0x2,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm6 \n"
    "psrldq $0x2,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "psrldq $0x2,%%xmm1 \n"
    "paddusw %%xmm1,%%xmm7 \n"
    "psrldq $0x2,%%xmm1 \n"
    "paddusw %%xmm1,%%xmm7 \n"
    "pshufb %%xmm3,%%xmm7 \n"
    "paddusw %%xmm7,%%xmm6 \n"
    "pmulhuw %%xmm4,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movd %%xmm6," MEMACCESS(1) " \n"
    "psrlq $0x10,%%xmm6 \n"
    "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
    "lea " MEMLEA(0x6,1) ",%1 \n"
    "sub $0x6,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"  // src_ptr += 16
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpckhbw %%xmm5,%%xmm3 \n"
    "paddusw %%xmm2,%%xmm0 \n"
    "paddusw %%xmm3,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(src_width)  // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
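
// Scalar sketch of the row accumulation above (illustrative only):
//   for (int i = 0; i < src_width; ++i) dst_ptr[i] += src_ptr[i];
// The SIMD versions use saturating adds (paddusw / vpaddusw) for the sum.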

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm3 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"  // src_ptr += 32
    "vpermq $0xd8,%%ymm3,%%ymm3 \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
    "vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
    "vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(src_width)  // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                         0x4040, 0x4040, 0x4040, 0x4040};
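
// Illustrative note on the bias constants: pmaddubsw multiplies an unsigned
// operand by a signed operand. The filter below keeps the 7-bit fractions
// unsigned and makes the pixels signed by subtracting 0x80 (kFsub80), so
//   (p0 - 128) * (128 - f) + (p1 - 128) * f
//     = p0 * (128 - f) + p1 * f - 0x4000
// cannot saturate. Adding kFadd40 (0x4000 + 0x40) restores the bias and adds
// the 0.5 rounding term before the final shift right by 7.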

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr,
                           const uint8* src_ptr,
                           int dst_width,
                           int x,
                           int dx) {
  intptr_t x0, x1, temp_pixel;
  asm volatile (
    "movd %6,%%xmm2 \n"
    "movd %7,%%xmm3 \n"
    "movl $0x04040000,%k2 \n"
    "movd %k2,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x9,%%xmm6 \n"  // 0x007f007f
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $15,%%xmm7 \n"  // 0x00010001

    "pextrw $0x1,%%xmm2,%k3 \n"
    "subl $0x2,%5 \n"
    "jl 29f \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "punpckldq %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm3,%%xmm3 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"

    LABELALIGN
    "2: \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)  // movzwl (%1,%3,1),%k2
    "movd %k2,%%xmm0 \n"
    "psrlw $0x9,%%xmm1 \n"
    MEMOPARG(movzwl,0x00,1,4,1,k2)  // movzwl (%1,%4,1),%k2
    "movd %k2,%%xmm4 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "punpcklwd %%xmm4,%%xmm0 \n"
    "psubb %8,%%xmm0 \n"  // make pixels signed.
    "pxor %%xmm6,%%xmm1 \n"  // 128 - f = (f ^ 127) + 1
    "paddusb %%xmm7,%%xmm1 \n"
    "pmaddubsw %%xmm0,%%xmm1 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    "paddw %9,%%xmm1 \n"  // make pixels unsigned.
    "psrlw $0x7,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movd %%xmm1,%k2 \n"
    "mov %w2," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x2,0) ",%0 \n"
    "subl $0x2,%5 \n"
    "jge 2b \n"

    LABELALIGN
    "29: \n"
    "addl $0x1,%5 \n"
    "jl 99f \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)  // movzwl (%1,%3,1),%k2
    "movd %k2,%%xmm0 \n"
    "psrlw $0x9,%%xmm2 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "psubb %8,%%xmm0 \n"  // make pixels signed.
    "pxor %%xmm6,%%xmm2 \n"
    "paddusb %%xmm7,%%xmm2 \n"
    "pmaddubsw %%xmm0,%%xmm2 \n"
    "paddw %9,%%xmm2 \n"  // make pixels unsigned.
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm2 \n"
    "movd %%xmm2,%k2 \n"
    "mov %b2," MEMACCESS(0) " \n"
    "99: \n"
  : "+r"(dst_ptr),      // %0
    "+r"(src_ptr),      // %1
    "=&a"(temp_pixel),  // %2
    "=&r"(x0),          // %3
    "=&r"(x1),          // %4
#if defined(__x86_64__)
    "+rm"(dst_width)    // %5
#else
    "+m"(dst_width)     // %5
#endif
  : "rm"(x),   // %6
    "rm"(dx),  // %7
#if defined(__x86_64__)
    "x"(kFsub80),  // %8
    "x"(kFadd40)   // %9
#else
    "m"(kFsub80),  // %8
    "m"(kFadd40)   // %9
#endif
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
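
// Rough scalar equivalent of the loop above (illustrative sketch, assuming
// the same 7-bit fraction and rounding):
//   int xi = x >> 16;
//   int f = (x >> 9) & 0x7f;
//   dst_ptr[i] = (src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7;
//   x += dx;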

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleColsUp2_SSE2(uint8* dst_ptr,
                       const uint8* src_ptr,
                       int dst_width,
                       int x,
                       int dx) {
  (void)x;
  (void)dx;
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(0) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"

  : "+r"(dst_ptr),   // %0
    "+r"(src_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb,
                            int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "shufps $0xdd,%%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2"
  );
}

void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb,
                               int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)  // movdqu (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)  // movdqu 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb,
                               int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  (void)src_stride;
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
    LABELALIGN
    "1: \n"
    "movd " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)  // movd (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,2,xmm2)  // movd (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)  // movd (%0,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
    "punpckldq %%xmm3,%%xmm2 \n"
    "punpcklqdq %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+r"(dst_width),      // %3
    "=&r"(src_stepx_x12)  // %4
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb,
                                  int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
    "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"

    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)  // movhps (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)    // movq (%0,%1,2),%%xmm1
    MEMOPREG(movhps,0x00,0,4,1,xmm1)  // movhps (%0,%4,1),%%xmm1
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
    "movq " MEMACCESS(5) ",%%xmm2 \n"
    MEMOPREG(movhps,0x00,5,1,1,xmm2)  // movhps (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)    // movq (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)  // movhps (%5,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb),        // %0
    "+r"(src_stepx_x4),    // %1
    "+r"(dst_argb),        // %2
    "+rm"(dst_width),      // %3
    "=&r"(src_stepx_x12),  // %4
    "+r"(row1)             // %5
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

void ScaleARGBCols_SSE2(uint8* dst_argb,
                        const uint8* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  intptr_t x0, x1;
  asm volatile (
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "pshufd $0x11,%%xmm3,%%xmm0 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x5,%%xmm3,%%xmm0 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "cmp $0x0,%4 \n"
    "jl 99f \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"

    LABELALIGN
    "40: \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)  // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "pextrw $0x7,%%xmm2,%k1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1)  // movd (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)  // movd (%3,%1,4),%%xmm4
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "punpckldq %%xmm4,%%xmm1 \n"
    "punpcklqdq %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%4 \n"
    "jge 40b \n"

    "49: \n"
    "test $0x2,%4 \n"
    "je 29f \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)  // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x8,2) ",%2 \n"
    "29: \n"
    "test $0x1,%4 \n"
    "je 99f \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
    "99: \n"
  : "=&a"(x0),       // %0
    "=&d"(x1),       // %1
    "+r"(dst_argb),  // %2
    "+r"(src_argb),  // %3
    "+r"(dst_width)  // %4
  : "rm"(x),   // %5
    "rm"(dx)   // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
                           const uint8* src_argb,
                           int dst_width,
                           int x,
                           int dx) {
  (void)x;
  (void)dx;
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpckldq %%xmm0,%%xmm0 \n"
    "punpckhdq %%xmm1,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(0) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"

  : "+r"(dst_argb),  // %0
    "+r"(src_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,      // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
                               const uint8* src_argb,
                               int dst_width,
                               int x,
                               int dx) {
  intptr_t x0, x1;
  asm volatile(
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm5 \n"
  :
  : "m"(kShuffleColARGB),   // %0
    "m"(kShuffleFractions)  // %1
  );

  asm volatile (
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x9,%%xmm6 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "sub $0x2,%2 \n"
    "jl 29f \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "punpckldq %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm3,%%xmm3 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"

    LABELALIGN
    "2: \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)  // movq (%1,%3,4),%%xmm0
    "psrlw $0x9,%%xmm1 \n"
    MEMOPREG(movhps,0x00,1,4,4,xmm0)  // movhps (%1,%4,4),%%xmm0
    "pshufb %%xmm5,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm1 \n"
    "pmaddubsw %%xmm1,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x2,%2 \n"
    "jge 2b \n"

    LABELALIGN
    "29: \n"
    "add $0x1,%2 \n"
    "jl 99f \n"
    "psrlw $0x9,%%xmm2 \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)  // movq (%1,%3,4),%%xmm0
    "pshufb %%xmm5,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm2 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(0) " \n"

    LABELALIGN
    "99: \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "=&r"(x0),         // %3
    "=&r"(x1)          // %4
  : "rm"(x),   // %5
    "rm"(dx)   // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile(
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx");
  return num;
}
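
// Portable expression of the same computation (illustrative; assumes the
// quotient fits in 32 bits):
//   return (int)(((int64_t)num << 16) / div);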

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile(
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "sub $0x10001,%%eax \n"
    "sbb $0x0,%%edx \n"
    "sub $0x1,%1 \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx");
  return num;
}
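
// Portable expression of the same computation (illustrative; assumes div > 1
// and that the quotient fits in 32 bits):
//   return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));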

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif