1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12 #include "vpx_ports/config.h"
13 #include "vpx_ports/mem.h"
14 #include "vp8/common/subpixel.h"
15
/*
 * Filter coefficient tables, defined outside this file.  The predictors in
 * this file index vp8_six_tap_mmx with their xoffset / yoffset arguments.
 */
extern const short vp8_six_tap_mmx[8][6*8];
extern const short vp8_bilinear_filters_mmx[8][2*8];

/*
 * MMX filter kernels, defined outside this file.
 *
 * vp8_filter_block1d_h6_mmx reads 8-bit samples and writes 16-bit values;
 * the MMX predictors in this file use it for their first pass.
 */
extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
                                      unsigned short *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      unsigned int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const short *vp8_filter);

/*
 * vp8_filter_block1dc_v6_mmx reads 16-bit values and writes 8-bit samples;
 * the MMX predictors in this file use it for their second pass.
 */
extern void vp8_filter_block1dc_v6_mmx(unsigned short *src_ptr,
                                       unsigned char *output_ptr,
                                       int output_pitch,
                                       unsigned int pixels_per_line,
                                       unsigned int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const short *vp8_filter);
/*
 * SSE2 kernels used by the two-pass paths of the SSE2 predictors in this
 * file.  All are defined outside this file.
 *
 * The h6 kernels read 8-bit samples and write 16-bit values; the v6 kernels
 * read 16-bit values and write 8-bit samples.
 */
extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
                                        unsigned short *output_ptr,
                                        unsigned int src_pixels_per_line,
                                        unsigned int pixel_step,
                                        unsigned int output_height,
                                        unsigned int output_width,
                                        const short *vp8_filter);

extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int pixel_step,
                                         unsigned int output_height,
                                         unsigned int output_width,
                                         const short *vp8_filter);

/* NOTE(review): "dst_ptich" looks like a misspelling of dst_pitch; the
 * parameter name is kept unchanged here. */
extern void vp8_filter_block1d8_v6_sse2(unsigned short *src_ptr,
                                        unsigned char *output_ptr,
                                        int dst_ptich,
                                        unsigned int pixels_per_line,
                                        unsigned int pixel_step,
                                        unsigned int output_height,
                                        unsigned int output_width,
                                        const short *vp8_filter);

extern void vp8_filter_block1d16_v6_sse2(unsigned short *src_ptr,
                                         unsigned char *output_ptr,
                                         int dst_ptich,
                                         unsigned int pixels_per_line,
                                         unsigned int pixel_step,
                                         unsigned int output_height,
                                         unsigned int output_width,
                                         const short *vp8_filter);

/*
 * Takes no filter argument: fills a 16-bit buffer from 8-bit source samples.
 * Used by the 16x16 SSE2 predictor when only the vertical filter is applied.
 */
extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int output_height,
                                         unsigned int output_width);
/*
 * SSE2 single-pass kernels, defined outside this file.  Each one reads 8-bit
 * source samples and writes 8-bit output directly, so the SSE2 predictors
 * call them without an intermediate buffer when only one of the two offsets
 * is in play.
 */
extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_ptich,
                                             unsigned int output_height,
                                             const short *vp8_filter);

extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
                                              unsigned int src_pixels_per_line,
                                              unsigned char *output_ptr,
                                              int dst_ptich,
                                              unsigned int output_height,
                                              const short *vp8_filter);

extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_ptich,
                                             unsigned int output_height,
                                             const short *vp8_filter);

/*
 * 8x8 bilinear MMX predictor, declared through the prototype_subpixel_predict
 * macro (presumably provided by vp8/common/subpixel.h - TODO confirm).
 * vp8_bilinear_predict16x16_mmx in this file is built on top of it.
 */
extern prototype_subpixel_predict(vp8_bilinear_predict8x8_mmx);
118
119
120 #if HAVE_MMX
vp8_sixtap_predict4x4_mmx(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)121 void vp8_sixtap_predict4x4_mmx
122 (
123 unsigned char *src_ptr,
124 int src_pixels_per_line,
125 int xoffset,
126 int yoffset,
127 unsigned char *dst_ptr,
128 int dst_pitch
129 )
130 {
131 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); /* Temp data bufffer used in filtering */
132 const short *HFilter, *VFilter;
133 HFilter = vp8_six_tap_mmx[xoffset];
134 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
135 VFilter = vp8_six_tap_mmx[yoffset];
136 vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter);
137
138 }
139
140
vp8_sixtap_predict16x16_mmx(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)141 void vp8_sixtap_predict16x16_mmx
142 (
143 unsigned char *src_ptr,
144 int src_pixels_per_line,
145 int xoffset,
146 int yoffset,
147 unsigned char *dst_ptr,
148 int dst_pitch
149 )
150 {
151
152 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */
153
154 const short *HFilter, *VFilter;
155
156
157 HFilter = vp8_six_tap_mmx[xoffset];
158
159 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
160 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter);
161 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter);
162 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter);
163
164 VFilter = vp8_six_tap_mmx[yoffset];
165 vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter);
166 vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter);
167 vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter);
168 vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter);
169
170 }
171
172
vp8_sixtap_predict8x8_mmx(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)173 void vp8_sixtap_predict8x8_mmx
174 (
175 unsigned char *src_ptr,
176 int src_pixels_per_line,
177 int xoffset,
178 int yoffset,
179 unsigned char *dst_ptr,
180 int dst_pitch
181 )
182 {
183
184 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */
185
186 const short *HFilter, *VFilter;
187
188 HFilter = vp8_six_tap_mmx[xoffset];
189 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
190 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter);
191
192 VFilter = vp8_six_tap_mmx[yoffset];
193 vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter);
194 vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter);
195
196 }
197
198
vp8_sixtap_predict8x4_mmx(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)199 void vp8_sixtap_predict8x4_mmx
200 (
201 unsigned char *src_ptr,
202 int src_pixels_per_line,
203 int xoffset,
204 int yoffset,
205 unsigned char *dst_ptr,
206 int dst_pitch
207 )
208 {
209
210 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */
211
212 const short *HFilter, *VFilter;
213
214 HFilter = vp8_six_tap_mmx[xoffset];
215 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
216 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter);
217
218 VFilter = vp8_six_tap_mmx[yoffset];
219 vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter);
220 vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter);
221
222 }
223
224
225
/*
 * Bilinear sub-pixel prediction of a 16x16 block, built from four calls to
 * the 8x8 MMX predictor: top-left, top-right, bottom-left, bottom-right.
 * Every call receives the same offsets and strides; only the source and
 * destination pointers move.
 *
 *   src_ptr, src_pixels_per_line  source samples and their row stride
 *   xoffset, yoffset              sub-pixel offsets, forwarded unchanged
 *   dst_ptr, dst_pitch            destination samples and their row stride
 */
void vp8_bilinear_predict16x16_mmx(unsigned char *src_ptr,
                                   int src_pixels_per_line,
                                   int xoffset,
                                   int yoffset,
                                   unsigned char *dst_ptr,
                                   int dst_pitch)
{
    /* Start of the lower half: eight rows down in each buffer. */
    unsigned char *src_lower = src_ptr + 8 * src_pixels_per_line;
    unsigned char *dst_lower = dst_ptr + dst_pitch * 8;

    /* Upper half. */
    vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line,
                                xoffset, yoffset, dst_ptr, dst_pitch);
    vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line,
                                xoffset, yoffset, dst_ptr + 8, dst_pitch);

    /* Lower half. */
    vp8_bilinear_predict8x8_mmx(src_lower, src_pixels_per_line,
                                xoffset, yoffset, dst_lower, dst_pitch);
    vp8_bilinear_predict8x8_mmx(src_lower + 8, src_pixels_per_line,
                                xoffset, yoffset, dst_lower + 8, dst_pitch);
}
241 #endif
242
243
244 #if HAVE_SSE2
vp8_sixtap_predict16x16_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)245 void vp8_sixtap_predict16x16_sse2
246 (
247 unsigned char *src_ptr,
248 int src_pixels_per_line,
249 int xoffset,
250 int yoffset,
251 unsigned char *dst_ptr,
252 int dst_pitch
253
254 )
255 {
256 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */
257
258 const short *HFilter, *VFilter;
259
260 if (xoffset)
261 {
262 if (yoffset)
263 {
264 HFilter = vp8_six_tap_mmx[xoffset];
265 vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
266 VFilter = vp8_six_tap_mmx[yoffset];
267 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
268 }
269 else
270 {
271 /* First-pass only */
272 HFilter = vp8_six_tap_mmx[xoffset];
273 vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter);
274 }
275 }
276 else
277 {
278 /* Second-pass only */
279 VFilter = vp8_six_tap_mmx[yoffset];
280 vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32);
281 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
282 }
283 }
284
285
vp8_sixtap_predict8x8_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)286 void vp8_sixtap_predict8x8_sse2
287 (
288 unsigned char *src_ptr,
289 int src_pixels_per_line,
290 int xoffset,
291 int yoffset,
292 unsigned char *dst_ptr,
293 int dst_pitch
294 )
295 {
296 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */
297 const short *HFilter, *VFilter;
298
299 if (xoffset)
300 {
301 if (yoffset)
302 {
303 HFilter = vp8_six_tap_mmx[xoffset];
304 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
305 VFilter = vp8_six_tap_mmx[yoffset];
306 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter);
307 }
308 else
309 {
310 /* First-pass only */
311 HFilter = vp8_six_tap_mmx[xoffset];
312 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter);
313 }
314 }
315 else
316 {
317 /* Second-pass only */
318 VFilter = vp8_six_tap_mmx[yoffset];
319 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter);
320 }
321 }
322
323
vp8_sixtap_predict8x4_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)324 void vp8_sixtap_predict8x4_sse2
325 (
326 unsigned char *src_ptr,
327 int src_pixels_per_line,
328 int xoffset,
329 int yoffset,
330 unsigned char *dst_ptr,
331 int dst_pitch
332 )
333 {
334 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */
335 const short *HFilter, *VFilter;
336
337 if (xoffset)
338 {
339 if (yoffset)
340 {
341 HFilter = vp8_six_tap_mmx[xoffset];
342 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
343 VFilter = vp8_six_tap_mmx[yoffset];
344 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter);
345 }
346 else
347 {
348 /* First-pass only */
349 HFilter = vp8_six_tap_mmx[xoffset];
350 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter);
351 }
352 }
353 else
354 {
355 /* Second-pass only */
356 VFilter = vp8_six_tap_mmx[yoffset];
357 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter);
358 }
359 }
360
361 #endif
362
363 #if HAVE_SSSE3
364
/*
 * SSSE3 filter kernels, defined outside this file.
 *
 * Unlike the MMX/SSE2 kernels declared earlier in this file, these take a
 * filter index (vp8_filter_index) rather than a pointer to coefficients, and
 * both the h6 and v6 variants read and write 8-bit samples.
 */
extern void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          unsigned int vp8_filter_index);

extern void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pitch,
                                         unsigned char *output_ptr,
                                         unsigned int out_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);

extern void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);

extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pitch,
                                         unsigned char *output_ptr,
                                         unsigned int out_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);
424
vp8_sixtap_predict16x16_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)425 void vp8_sixtap_predict16x16_ssse3
426 (
427 unsigned char *src_ptr,
428 int src_pixels_per_line,
429 int xoffset,
430 int yoffset,
431 unsigned char *dst_ptr,
432 int dst_pitch
433
434 )
435 {
436 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24);
437
438 if (xoffset)
439 {
440 if (yoffset)
441 {
442 vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 16, 21, xoffset);
443 vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, 16, yoffset);
444 }
445 else
446 {
447 /* First-pass only */
448 vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, xoffset);
449 }
450 }
451 else
452 {
453 /* Second-pass only */
454 vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line) , src_pixels_per_line, dst_ptr, dst_pitch, 16, yoffset);
455 }
456 }
457
vp8_sixtap_predict8x8_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)458 void vp8_sixtap_predict8x8_ssse3
459 (
460 unsigned char *src_ptr,
461 int src_pixels_per_line,
462 int xoffset,
463 int yoffset,
464 unsigned char *dst_ptr,
465 int dst_pitch
466 )
467 {
468 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
469
470 if (xoffset)
471 {
472 if (yoffset)
473 {
474 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 13, xoffset);
475 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
476 }
477 else
478 {
479 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, xoffset);
480 }
481 }
482 else
483 {
484 /* Second-pass only */
485 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, yoffset);
486 }
487 }
488
489
vp8_sixtap_predict8x4_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)490 void vp8_sixtap_predict8x4_ssse3
491 (
492 unsigned char *src_ptr,
493 int src_pixels_per_line,
494 int xoffset,
495 int yoffset,
496 unsigned char *dst_ptr,
497 int dst_pitch
498 )
499 {
500 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
501
502 if (xoffset)
503 {
504 if (yoffset)
505 {
506 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 9, xoffset);
507 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
508 }
509 else
510 {
511 /* First-pass only */
512 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
513 }
514 }
515 else
516 {
517 /* Second-pass only */
518 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
519 }
520 }
521
vp8_sixtap_predict4x4_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)522 void vp8_sixtap_predict4x4_ssse3
523 (
524 unsigned char *src_ptr,
525 int src_pixels_per_line,
526 int xoffset,
527 int yoffset,
528 unsigned char *dst_ptr,
529 int dst_pitch
530 )
531 {
532 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9);
533
534 if (xoffset)
535 {
536 if (yoffset)
537 {
538 vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 4, 9, xoffset);
539 vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
540 }
541 else
542 {
543 vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
544 }
545 }
546 else
547 {
548 vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
549 }
550
551 }
552
553 #endif
554