1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "vpx_config.h"
12 #include "vp8_rtcd.h"
13 #include "vpx_ports/mem.h"
14 #include "filter_x86.h"
15
/* Six-tap filter coefficient table. The predict functions in this file index
 * its first dimension with xoffset / yoffset and hand the selected row to the
 * MMX/SSE2 kernels declared below. The table is not defined in this file. */
extern const short vp8_six_tap_x86[8][6 * 8];

/*
 * Prototypes of the MMX / SSE2 filter kernels called by the predict functions
 * in this file. Only the declarations live here; the definitions are external.
 */

/* Kernels that write an unsigned short intermediate buffer (first pass). */
extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
                                      unsigned short *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      unsigned int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const short *vp8_filter);
extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
                                        unsigned short *output_ptr,
                                        unsigned int src_pixels_per_line,
                                        unsigned int pixel_step,
                                        unsigned int output_height,
                                        unsigned int output_width,
                                        const short *vp8_filter);
extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int pixel_step,
                                         unsigned int output_height,
                                         unsigned int output_width,
                                         const short *vp8_filter);
extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int output_height,
                                         unsigned int output_width);

/* Kernels that read an unsigned short intermediate buffer and write pixels
 * (second pass). */
extern void vp8_filter_block1dc_v6_mmx(
    unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d8_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d16_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);

/* Single-pass kernels: pixels in, pixels out, no intermediate buffer. */
extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_pitch,
                                             unsigned int output_height,
                                             const short *vp8_filter);
extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
                                              unsigned int src_pixels_per_line,
                                              unsigned char *output_ptr,
                                              int dst_pitch,
                                              unsigned int output_height,
                                              const short *vp8_filter);
extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_pitch,
                                             unsigned int output_height,
                                             const short *vp8_filter);
77
78 #if HAVE_MMX
vp8_sixtap_predict4x4_mmx(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)79 void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
80 int xoffset, int yoffset, unsigned char *dst_ptr,
81 int dst_pitch) {
82 DECLARE_ALIGNED(16, unsigned short,
83 FData2[16 * 16]); /* Temp data bufffer used in filtering */
84 const short *HFilter, *VFilter;
85 HFilter = vp8_six_tap_x86[xoffset];
86 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
87 src_pixels_per_line, 1, 9, 8, HFilter);
88 VFilter = vp8_six_tap_x86[yoffset];
89 vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
90 VFilter);
91 }
92 #endif
93
94 #if HAVE_SSE2
vp8_sixtap_predict16x16_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)95 void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
96 int src_pixels_per_line, int xoffset,
97 int yoffset, unsigned char *dst_ptr,
98 int dst_pitch
99
100 ) {
101 DECLARE_ALIGNED(16, unsigned short,
102 FData2[24 * 24]); /* Temp data bufffer used in filtering */
103
104 const short *HFilter, *VFilter;
105
106 if (xoffset) {
107 if (yoffset) {
108 HFilter = vp8_six_tap_x86[xoffset];
109 vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
110 src_pixels_per_line, 1, 21, 32, HFilter);
111 VFilter = vp8_six_tap_x86[yoffset];
112 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
113 dst_pitch, VFilter);
114 } else {
115 /* First-pass only */
116 HFilter = vp8_six_tap_x86[xoffset];
117 vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
118 dst_pitch, 16, HFilter);
119 }
120 } else {
121 /* Second-pass only */
122 VFilter = vp8_six_tap_x86[yoffset];
123 vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
124 src_pixels_per_line, 21, 32);
125 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
126 dst_pitch, VFilter);
127 }
128 }
129
vp8_sixtap_predict8x8_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)130 void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
131 int xoffset, int yoffset,
132 unsigned char *dst_ptr, int dst_pitch) {
133 DECLARE_ALIGNED(16, unsigned short,
134 FData2[256]); /* Temp data bufffer used in filtering */
135 const short *HFilter, *VFilter;
136
137 if (xoffset) {
138 if (yoffset) {
139 HFilter = vp8_six_tap_x86[xoffset];
140 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
141 src_pixels_per_line, 1, 13, 16, HFilter);
142 VFilter = vp8_six_tap_x86[yoffset];
143 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
144 dst_pitch, VFilter);
145 } else {
146 /* First-pass only */
147 HFilter = vp8_six_tap_x86[xoffset];
148 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
149 dst_pitch, 8, HFilter);
150 }
151 } else {
152 /* Second-pass only */
153 VFilter = vp8_six_tap_x86[yoffset];
154 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
155 src_pixels_per_line, dst_ptr, dst_pitch, 8,
156 VFilter);
157 }
158 }
159
vp8_sixtap_predict8x4_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)160 void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
161 int xoffset, int yoffset,
162 unsigned char *dst_ptr, int dst_pitch) {
163 DECLARE_ALIGNED(16, unsigned short,
164 FData2[256]); /* Temp data bufffer used in filtering */
165 const short *HFilter, *VFilter;
166
167 if (xoffset) {
168 if (yoffset) {
169 HFilter = vp8_six_tap_x86[xoffset];
170 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
171 src_pixels_per_line, 1, 9, 16, HFilter);
172 VFilter = vp8_six_tap_x86[yoffset];
173 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
174 dst_pitch, VFilter);
175 } else {
176 /* First-pass only */
177 HFilter = vp8_six_tap_x86[xoffset];
178 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
179 dst_pitch, 4, HFilter);
180 }
181 } else {
182 /* Second-pass only */
183 VFilter = vp8_six_tap_x86[yoffset];
184 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
185 src_pixels_per_line, dst_ptr, dst_pitch, 4,
186 VFilter);
187 }
188 }
189
190 #endif
191
192 #if HAVE_SSSE3
193
/*
 * Prototypes of the SSSE3 filter kernels called by the SSSE3 predict functions
 * in this file. Only the declarations live here; the definitions are external.
 *
 * All six kernels share one parameter list: source pointer and stride,
 * destination pointer and stride, output height, and a filter index (the
 * callers pass xoffset or yoffset). Parameter names are kept identical across
 * the horizontal (h6) and vertical (v6) declarations.
 */

/* Horizontal kernels. */
extern void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);

extern void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          unsigned int vp8_filter_index);

/* Vertical kernels. */
extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);

extern void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          unsigned int vp8_filter_index);
235
vp8_sixtap_predict16x16_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)236 void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
237 int src_pixels_per_line, int xoffset,
238 int yoffset, unsigned char *dst_ptr,
239 int dst_pitch
240
241 ) {
242 DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
243
244 if (xoffset) {
245 if (yoffset) {
246 vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
247 src_pixels_per_line, FData2, 16, 21,
248 xoffset);
249 vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16,
250 yoffset);
251 } else {
252 /* First-pass only */
253 vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
254 dst_pitch, 16, xoffset);
255 }
256 } else {
257 if (yoffset) {
258 /* Second-pass only */
259 vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
260 src_pixels_per_line, dst_ptr, dst_pitch, 16,
261 yoffset);
262 } else {
263 /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
264 * yoffset==0) case correctly. Add copy function here to guarantee
265 * six-tap function handles all possible offsets. */
266 vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
267 }
268 }
269 }
270
vp8_sixtap_predict8x8_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)271 void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
272 int src_pixels_per_line, int xoffset,
273 int yoffset, unsigned char *dst_ptr,
274 int dst_pitch) {
275 DECLARE_ALIGNED(16, unsigned char, FData2[256]);
276
277 if (xoffset) {
278 if (yoffset) {
279 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
280 src_pixels_per_line, FData2, 8, 13, xoffset);
281 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
282 } else {
283 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
284 dst_pitch, 8, xoffset);
285 }
286 } else {
287 if (yoffset) {
288 /* Second-pass only */
289 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
290 src_pixels_per_line, dst_ptr, dst_pitch, 8,
291 yoffset);
292 } else {
293 /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
294 * yoffset==0) case correctly. Add copy function here to guarantee
295 * six-tap function handles all possible offsets. */
296 vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
297 }
298 }
299 }
300
vp8_sixtap_predict8x4_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)301 void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
302 int src_pixels_per_line, int xoffset,
303 int yoffset, unsigned char *dst_ptr,
304 int dst_pitch) {
305 DECLARE_ALIGNED(16, unsigned char, FData2[256]);
306
307 if (xoffset) {
308 if (yoffset) {
309 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
310 src_pixels_per_line, FData2, 8, 9, xoffset);
311 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
312 } else {
313 /* First-pass only */
314 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
315 dst_pitch, 4, xoffset);
316 }
317 } else {
318 if (yoffset) {
319 /* Second-pass only */
320 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
321 src_pixels_per_line, dst_ptr, dst_pitch, 4,
322 yoffset);
323 } else {
324 /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
325 * yoffset==0) case correctly. Add copy function here to guarantee
326 * six-tap function handles all possible offsets. */
327 vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
328 }
329 }
330 }
331
vp8_sixtap_predict4x4_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)332 void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
333 int src_pixels_per_line, int xoffset,
334 int yoffset, unsigned char *dst_ptr,
335 int dst_pitch) {
336 DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]);
337
338 if (xoffset) {
339 if (yoffset) {
340 vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
341 src_pixels_per_line, FData2, 4, 9, xoffset);
342 vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
343 } else {
344 vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
345 dst_pitch, 4, xoffset);
346 }
347 } else {
348 if (yoffset) {
349 vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
350 src_pixels_per_line, dst_ptr, dst_pitch, 4,
351 yoffset);
352 } else {
353 /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
354 * yoffset==0) case correctly. Add copy function here to guarantee
355 * six-tap function handles all possible offsets. */
356 int r;
357
358 for (r = 0; r < 4; ++r) {
359 dst_ptr[0] = src_ptr[0];
360 dst_ptr[1] = src_ptr[1];
361 dst_ptr[2] = src_ptr[2];
362 dst_ptr[3] = src_ptr[3];
363 dst_ptr += dst_pitch;
364 src_ptr += src_pixels_per_line;
365 }
366 }
367 }
368 }
369
370 #endif
371