• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "vpx_config.h"
12 #include "vp8_rtcd.h"
13 #include "vpx_ports/mem.h"
14 #include "filter_x86.h"
15 
/* Six-tap coefficient table; this file indexes it by xoffset / yoffset. */
extern const short vp8_six_tap_x86[8][6 * 8];

/*
 * Filter kernels used by the MMX and SSE2 predictors below. They are only
 * declared here; their definitions are not part of this file.
 *
 * The "h6" kernels take unsigned char input and produce unsigned short
 * output; the "v6" kernels take unsigned short input and produce
 * unsigned char output. The "_only" kernels read and write unsigned char
 * directly.
 */

/* MMX kernels. */
extern void vp8_filter_block1d_h6_mmx(
    unsigned char *src_ptr, unsigned short *output_ptr,
    unsigned int src_pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1dc_v6_mmx(
    unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);

/* SSE2 kernels: unsigned char in, unsigned short out. */
extern void vp8_filter_block1d8_h6_sse2(
    unsigned char *src_ptr, unsigned short *output_ptr,
    unsigned int src_pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d16_h6_sse2(
    unsigned char *src_ptr, unsigned short *output_ptr,
    unsigned int src_pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);

/* SSE2 kernels: unsigned short in, unsigned char out. */
extern void vp8_filter_block1d8_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d16_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);

/* SSE2 unpack kernel: takes no filter argument. */
extern void vp8_unpack_block1d16_h6_sse2(
    unsigned char *src_ptr, unsigned short *output_ptr,
    unsigned int src_pixels_per_line, unsigned int output_height,
    unsigned int output_width);

/* SSE2 single-pass kernels: unsigned char in, unsigned char out. */
extern void vp8_filter_block1d8_h6_only_sse2(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, int dst_pitch, unsigned int output_height,
    const short *vp8_filter);
extern void vp8_filter_block1d16_h6_only_sse2(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, int dst_pitch, unsigned int output_height,
    const short *vp8_filter);
extern void vp8_filter_block1d8_v6_only_sse2(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, int dst_pitch, unsigned int output_height,
    const short *vp8_filter);
77 
78 #if HAVE_MMX
vp8_sixtap_predict4x4_mmx(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)79 void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
80                                int xoffset, int yoffset, unsigned char *dst_ptr,
81                                int dst_pitch) {
82   DECLARE_ALIGNED(16, unsigned short,
83                   FData2[16 * 16]); /* Temp data bufffer used in filtering */
84   const short *HFilter, *VFilter;
85   HFilter = vp8_six_tap_x86[xoffset];
86   vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
87                             src_pixels_per_line, 1, 9, 8, HFilter);
88   VFilter = vp8_six_tap_x86[yoffset];
89   vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
90                              VFilter);
91 }
92 #endif
93 
94 #if HAVE_SSE2
vp8_sixtap_predict16x16_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)95 void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
96                                   int src_pixels_per_line, int xoffset,
97                                   int yoffset, unsigned char *dst_ptr,
98                                   int dst_pitch
99 
100                                   ) {
101   DECLARE_ALIGNED(16, unsigned short,
102                   FData2[24 * 24]); /* Temp data bufffer used in filtering */
103 
104   const short *HFilter, *VFilter;
105 
106   if (xoffset) {
107     if (yoffset) {
108       HFilter = vp8_six_tap_x86[xoffset];
109       vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
110                                    src_pixels_per_line, 1, 21, 32, HFilter);
111       VFilter = vp8_six_tap_x86[yoffset];
112       vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
113                                    dst_pitch, VFilter);
114     } else {
115       /* First-pass only */
116       HFilter = vp8_six_tap_x86[xoffset];
117       vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
118                                         dst_pitch, 16, HFilter);
119     }
120   } else {
121     /* Second-pass only */
122     VFilter = vp8_six_tap_x86[yoffset];
123     vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
124                                  src_pixels_per_line, 21, 32);
125     vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
126                                  dst_pitch, VFilter);
127   }
128 }
129 
vp8_sixtap_predict8x8_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)130 void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
131                                 int xoffset, int yoffset,
132                                 unsigned char *dst_ptr, int dst_pitch) {
133   DECLARE_ALIGNED(16, unsigned short,
134                   FData2[256]); /* Temp data bufffer used in filtering */
135   const short *HFilter, *VFilter;
136 
137   if (xoffset) {
138     if (yoffset) {
139       HFilter = vp8_six_tap_x86[xoffset];
140       vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
141                                   src_pixels_per_line, 1, 13, 16, HFilter);
142       VFilter = vp8_six_tap_x86[yoffset];
143       vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
144                                   dst_pitch, VFilter);
145     } else {
146       /* First-pass only */
147       HFilter = vp8_six_tap_x86[xoffset];
148       vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
149                                        dst_pitch, 8, HFilter);
150     }
151   } else {
152     /* Second-pass only */
153     VFilter = vp8_six_tap_x86[yoffset];
154     vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
155                                      src_pixels_per_line, dst_ptr, dst_pitch, 8,
156                                      VFilter);
157   }
158 }
159 
vp8_sixtap_predict8x4_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)160 void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
161                                 int xoffset, int yoffset,
162                                 unsigned char *dst_ptr, int dst_pitch) {
163   DECLARE_ALIGNED(16, unsigned short,
164                   FData2[256]); /* Temp data bufffer used in filtering */
165   const short *HFilter, *VFilter;
166 
167   if (xoffset) {
168     if (yoffset) {
169       HFilter = vp8_six_tap_x86[xoffset];
170       vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
171                                   src_pixels_per_line, 1, 9, 16, HFilter);
172       VFilter = vp8_six_tap_x86[yoffset];
173       vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
174                                   dst_pitch, VFilter);
175     } else {
176       /* First-pass only */
177       HFilter = vp8_six_tap_x86[xoffset];
178       vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
179                                        dst_pitch, 4, HFilter);
180     }
181   } else {
182     /* Second-pass only */
183     VFilter = vp8_six_tap_x86[yoffset];
184     vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
185                                      src_pixels_per_line, dst_ptr, dst_pitch, 4,
186                                      VFilter);
187   }
188 }
189 
190 #endif
191 
192 #if HAVE_SSSE3
193 
/*
 * Filter kernels used by the SSSE3 predictors below. They are only declared
 * here; their definitions are not part of this file.
 *
 * Unlike the MMX/SSE2 kernels, these take a filter index
 * (vp8_filter_index) rather than a pointer to filter coefficients, and both
 * the "h6" and "v6" kernels read and write unsigned char data.
 */

/* "h6" kernels for widths 4, 8 and 16. */
extern void vp8_filter_block1d4_h6_ssse3(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, unsigned int output_pitch,
    unsigned int output_height, unsigned int vp8_filter_index);
extern void vp8_filter_block1d8_h6_ssse3(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, unsigned int output_pitch,
    unsigned int output_height, unsigned int vp8_filter_index);
extern void vp8_filter_block1d16_h6_ssse3(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, unsigned int output_pitch,
    unsigned int output_height, unsigned int vp8_filter_index);

/* "v6" kernels for widths 4, 8 and 16. */
extern void vp8_filter_block1d4_v6_ssse3(
    unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr,
    unsigned int out_pitch, unsigned int output_height,
    unsigned int vp8_filter_index);
extern void vp8_filter_block1d8_v6_ssse3(
    unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr,
    unsigned int out_pitch, unsigned int output_height,
    unsigned int vp8_filter_index);
extern void vp8_filter_block1d16_v6_ssse3(
    unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr,
    unsigned int out_pitch, unsigned int output_height,
    unsigned int vp8_filter_index);
235 
vp8_sixtap_predict16x16_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)236 void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
237                                    int src_pixels_per_line, int xoffset,
238                                    int yoffset, unsigned char *dst_ptr,
239                                    int dst_pitch
240 
241                                    ) {
242   DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
243 
244   if (xoffset) {
245     if (yoffset) {
246       vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
247                                     src_pixels_per_line, FData2, 16, 21,
248                                     xoffset);
249       vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16,
250                                     yoffset);
251     } else {
252       /* First-pass only */
253       vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
254                                     dst_pitch, 16, xoffset);
255     }
256   } else {
257     if (yoffset) {
258       /* Second-pass only */
259       vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
260                                     src_pixels_per_line, dst_ptr, dst_pitch, 16,
261                                     yoffset);
262     } else {
263       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
264        * yoffset==0) case correctly. Add copy function here to guarantee
265        * six-tap function handles all possible offsets. */
266       vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
267     }
268   }
269 }
270 
vp8_sixtap_predict8x8_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)271 void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
272                                  int src_pixels_per_line, int xoffset,
273                                  int yoffset, unsigned char *dst_ptr,
274                                  int dst_pitch) {
275   DECLARE_ALIGNED(16, unsigned char, FData2[256]);
276 
277   if (xoffset) {
278     if (yoffset) {
279       vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
280                                    src_pixels_per_line, FData2, 8, 13, xoffset);
281       vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
282     } else {
283       vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
284                                    dst_pitch, 8, xoffset);
285     }
286   } else {
287     if (yoffset) {
288       /* Second-pass only */
289       vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
290                                    src_pixels_per_line, dst_ptr, dst_pitch, 8,
291                                    yoffset);
292     } else {
293       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
294        * yoffset==0) case correctly. Add copy function here to guarantee
295        * six-tap function handles all possible offsets. */
296       vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
297     }
298   }
299 }
300 
vp8_sixtap_predict8x4_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)301 void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
302                                  int src_pixels_per_line, int xoffset,
303                                  int yoffset, unsigned char *dst_ptr,
304                                  int dst_pitch) {
305   DECLARE_ALIGNED(16, unsigned char, FData2[256]);
306 
307   if (xoffset) {
308     if (yoffset) {
309       vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
310                                    src_pixels_per_line, FData2, 8, 9, xoffset);
311       vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
312     } else {
313       /* First-pass only */
314       vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
315                                    dst_pitch, 4, xoffset);
316     }
317   } else {
318     if (yoffset) {
319       /* Second-pass only */
320       vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
321                                    src_pixels_per_line, dst_ptr, dst_pitch, 4,
322                                    yoffset);
323     } else {
324       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
325        * yoffset==0) case correctly. Add copy function here to guarantee
326        * six-tap function handles all possible offsets. */
327       vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
328     }
329   }
330 }
331 
vp8_sixtap_predict4x4_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)332 void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
333                                  int src_pixels_per_line, int xoffset,
334                                  int yoffset, unsigned char *dst_ptr,
335                                  int dst_pitch) {
336   DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]);
337 
338   if (xoffset) {
339     if (yoffset) {
340       vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
341                                    src_pixels_per_line, FData2, 4, 9, xoffset);
342       vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
343     } else {
344       vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
345                                    dst_pitch, 4, xoffset);
346     }
347   } else {
348     if (yoffset) {
349       vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
350                                    src_pixels_per_line, dst_ptr, dst_pitch, 4,
351                                    yoffset);
352     } else {
353       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
354         * yoffset==0) case correctly. Add copy function here to guarantee
355         * six-tap function handles all possible offsets. */
356       int r;
357 
358       for (r = 0; r < 4; ++r) {
359         dst_ptr[0] = src_ptr[0];
360         dst_ptr[1] = src_ptr[1];
361         dst_ptr[2] = src_ptr[2];
362         dst_ptr[3] = src_ptr[3];
363         dst_ptr += dst_pitch;
364         src_ptr += src_pixels_per_line;
365       }
366     }
367   }
368 }
369 
370 #endif
371