1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "vpx_config.h"
12 #include "vp8_rtcd.h"
13 #include "vpx_ports/mem.h"
14
/* Six-tap filter coefficient table: 8 rows of 6 * 8 shorts.  The predict
 * functions in this file pick a row with xoffset or yoffset. */
extern const short vp8_six_tap_x86[8][6 * 8];

/*
 * MMX filter passes (defined elsewhere in the project).
 * The h6 routine takes byte input and writes unsigned short output; the v6
 * routine takes unsigned short input and writes byte output.
 */
void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
                               unsigned short *output_ptr,
                               unsigned int src_pixels_per_line,
                               unsigned int pixel_step,
                               unsigned int output_height,
                               unsigned int output_width,
                               const short *vp8_filter);
void vp8_filter_block1dc_v6_mmx(unsigned short *src_ptr,
                                unsigned char *output_ptr, int output_pitch,
                                unsigned int pixels_per_line,
                                unsigned int pixel_step,
                                unsigned int output_height,
                                unsigned int output_width,
                                const short *vp8_filter);

/*
 * SSE2 two-pass building blocks (defined elsewhere in the project).
 * h6: byte input -> unsigned short intermediate.
 * v6: unsigned short intermediate -> byte output.
 */
void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
                                 unsigned short *output_ptr,
                                 unsigned int src_pixels_per_line,
                                 unsigned int pixel_step,
                                 unsigned int output_height,
                                 unsigned int output_width,
                                 const short *vp8_filter);
void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
                                  unsigned short *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  const short *vp8_filter);
void vp8_filter_block1d8_v6_sse2(unsigned short *src_ptr,
                                 unsigned char *output_ptr, int dst_pitch,
                                 unsigned int pixels_per_line,
                                 unsigned int pixel_step,
                                 unsigned int output_height,
                                 unsigned int output_width,
                                 const short *vp8_filter);
void vp8_filter_block1d16_v6_sse2(unsigned short *src_ptr,
                                  unsigned char *output_ptr, int dst_pitch,
                                  unsigned int pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  const short *vp8_filter);

/* Fills an unsigned short buffer from byte input without taking a filter;
 * used in place of the h6 pass when xoffset is zero. */
void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
                                  unsigned short *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned int output_height,
                                  unsigned int output_width);

/* SSE2 single-pass variants: byte input straight to byte output. */
void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
                                      unsigned int src_pixels_per_line,
                                      unsigned char *output_ptr,
                                      int dst_pitch,
                                      unsigned int output_height,
                                      const short *vp8_filter);
void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
                                       unsigned int src_pixels_per_line,
                                       unsigned char *output_ptr,
                                       int dst_pitch,
                                       unsigned int output_height,
                                       const short *vp8_filter);
void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
                                      unsigned int src_pixels_per_line,
                                      unsigned char *output_ptr,
                                      int dst_pitch,
                                      unsigned int output_height,
                                      const short *vp8_filter);
76
77 #if HAVE_MMX
vp8_sixtap_predict4x4_mmx(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)78 void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
79 int xoffset, int yoffset, unsigned char *dst_ptr,
80 int dst_pitch) {
81 DECLARE_ALIGNED(16, unsigned short,
82 FData2[16 * 16]); /* Temp data bufffer used in filtering */
83 const short *HFilter, *VFilter;
84 HFilter = vp8_six_tap_x86[xoffset];
85 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
86 src_pixels_per_line, 1, 9, 8, HFilter);
87 VFilter = vp8_six_tap_x86[yoffset];
88 vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
89 VFilter);
90 }
91 #endif
92
93 #if HAVE_SSE2
vp8_sixtap_predict16x16_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)94 void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
95 int src_pixels_per_line, int xoffset,
96 int yoffset, unsigned char *dst_ptr,
97 int dst_pitch) {
98 DECLARE_ALIGNED(16, unsigned short,
99 FData2[24 * 24]); /* Temp data bufffer used in filtering */
100
101 const short *HFilter, *VFilter;
102
103 if (xoffset) {
104 if (yoffset) {
105 HFilter = vp8_six_tap_x86[xoffset];
106 vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
107 src_pixels_per_line, 1, 21, 32, HFilter);
108 VFilter = vp8_six_tap_x86[yoffset];
109 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
110 dst_pitch, VFilter);
111 } else {
112 /* First-pass only */
113 HFilter = vp8_six_tap_x86[xoffset];
114 vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
115 dst_pitch, 16, HFilter);
116 }
117 } else {
118 /* Second-pass only */
119 VFilter = vp8_six_tap_x86[yoffset];
120 vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
121 src_pixels_per_line, 21, 32);
122 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
123 dst_pitch, VFilter);
124 }
125 }
126
vp8_sixtap_predict8x8_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)127 void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
128 int xoffset, int yoffset,
129 unsigned char *dst_ptr, int dst_pitch) {
130 DECLARE_ALIGNED(16, unsigned short,
131 FData2[256]); /* Temp data bufffer used in filtering */
132 const short *HFilter, *VFilter;
133
134 if (xoffset) {
135 if (yoffset) {
136 HFilter = vp8_six_tap_x86[xoffset];
137 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
138 src_pixels_per_line, 1, 13, 16, HFilter);
139 VFilter = vp8_six_tap_x86[yoffset];
140 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
141 dst_pitch, VFilter);
142 } else {
143 /* First-pass only */
144 HFilter = vp8_six_tap_x86[xoffset];
145 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
146 dst_pitch, 8, HFilter);
147 }
148 } else {
149 /* Second-pass only */
150 VFilter = vp8_six_tap_x86[yoffset];
151 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
152 src_pixels_per_line, dst_ptr, dst_pitch, 8,
153 VFilter);
154 }
155 }
156
vp8_sixtap_predict8x4_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)157 void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
158 int xoffset, int yoffset,
159 unsigned char *dst_ptr, int dst_pitch) {
160 DECLARE_ALIGNED(16, unsigned short,
161 FData2[256]); /* Temp data bufffer used in filtering */
162 const short *HFilter, *VFilter;
163
164 if (xoffset) {
165 if (yoffset) {
166 HFilter = vp8_six_tap_x86[xoffset];
167 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
168 src_pixels_per_line, 1, 9, 16, HFilter);
169 VFilter = vp8_six_tap_x86[yoffset];
170 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
171 dst_pitch, VFilter);
172 } else {
173 /* First-pass only */
174 HFilter = vp8_six_tap_x86[xoffset];
175 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
176 dst_pitch, 4, HFilter);
177 }
178 } else {
179 /* Second-pass only */
180 VFilter = vp8_six_tap_x86[yoffset];
181 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
182 src_pixels_per_line, dst_ptr, dst_pitch, 4,
183 VFilter);
184 }
185 }
186
187 #endif
188
189 #if HAVE_SSSE3
190
/*
 * SSSE3 filter routines (defined elsewhere in the project).
 *
 * Unlike the SSE2 prototypes, these take a filter index rather than a
 * pointer to coefficients, and both input and output are bytes.
 */

/* h6 routines: called below with xoffset as the filter index. */
void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned char *output_ptr,
                                  unsigned int output_pitch,
                                  unsigned int output_height,
                                  unsigned int vp8_filter_index);
void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned char *output_ptr,
                                  unsigned int output_pitch,
                                  unsigned int output_height,
                                  unsigned int vp8_filter_index);
void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
                                   unsigned int src_pixels_per_line,
                                   unsigned char *output_ptr,
                                   unsigned int output_pitch,
                                   unsigned int output_height,
                                   unsigned int vp8_filter_index);

/* v6 routines: called below with yoffset as the filter index. */
void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
                                  unsigned int src_pitch,
                                  unsigned char *output_ptr,
                                  unsigned int out_pitch,
                                  unsigned int output_height,
                                  unsigned int vp8_filter_index);
void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
                                  unsigned int src_pitch,
                                  unsigned char *output_ptr,
                                  unsigned int out_pitch,
                                  unsigned int output_height,
                                  unsigned int vp8_filter_index);
void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
                                   unsigned int src_pitch,
                                   unsigned char *output_ptr,
                                   unsigned int out_pitch,
                                   unsigned int output_height,
                                   unsigned int vp8_filter_index);
232
vp8_sixtap_predict16x16_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)233 void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
234 int src_pixels_per_line, int xoffset,
235 int yoffset, unsigned char *dst_ptr,
236 int dst_pitch) {
237 DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
238
239 if (xoffset) {
240 if (yoffset) {
241 vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
242 src_pixels_per_line, FData2, 16, 21,
243 xoffset);
244 vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16,
245 yoffset);
246 } else {
247 /* First-pass only */
248 vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
249 dst_pitch, 16, xoffset);
250 }
251 } else {
252 if (yoffset) {
253 /* Second-pass only */
254 vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
255 src_pixels_per_line, dst_ptr, dst_pitch, 16,
256 yoffset);
257 } else {
258 /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
259 * yoffset==0) case correctly. Add copy function here to guarantee
260 * six-tap function handles all possible offsets. */
261 vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
262 }
263 }
264 }
265
vp8_sixtap_predict8x8_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)266 void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
267 int src_pixels_per_line, int xoffset,
268 int yoffset, unsigned char *dst_ptr,
269 int dst_pitch) {
270 DECLARE_ALIGNED(16, unsigned char, FData2[256]);
271
272 if (xoffset) {
273 if (yoffset) {
274 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
275 src_pixels_per_line, FData2, 8, 13, xoffset);
276 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
277 } else {
278 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
279 dst_pitch, 8, xoffset);
280 }
281 } else {
282 if (yoffset) {
283 /* Second-pass only */
284 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
285 src_pixels_per_line, dst_ptr, dst_pitch, 8,
286 yoffset);
287 } else {
288 /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
289 * yoffset==0) case correctly. Add copy function here to guarantee
290 * six-tap function handles all possible offsets. */
291 vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
292 }
293 }
294 }
295
vp8_sixtap_predict8x4_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)296 void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
297 int src_pixels_per_line, int xoffset,
298 int yoffset, unsigned char *dst_ptr,
299 int dst_pitch) {
300 DECLARE_ALIGNED(16, unsigned char, FData2[256]);
301
302 if (xoffset) {
303 if (yoffset) {
304 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
305 src_pixels_per_line, FData2, 8, 9, xoffset);
306 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
307 } else {
308 /* First-pass only */
309 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
310 dst_pitch, 4, xoffset);
311 }
312 } else {
313 if (yoffset) {
314 /* Second-pass only */
315 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
316 src_pixels_per_line, dst_ptr, dst_pitch, 4,
317 yoffset);
318 } else {
319 /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
320 * yoffset==0) case correctly. Add copy function here to guarantee
321 * six-tap function handles all possible offsets. */
322 vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
323 }
324 }
325 }
326
vp8_sixtap_predict4x4_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)327 void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
328 int src_pixels_per_line, int xoffset,
329 int yoffset, unsigned char *dst_ptr,
330 int dst_pitch) {
331 DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]);
332
333 if (xoffset) {
334 if (yoffset) {
335 vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
336 src_pixels_per_line, FData2, 4, 9, xoffset);
337 vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
338 } else {
339 vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
340 dst_pitch, 4, xoffset);
341 }
342 } else {
343 if (yoffset) {
344 vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
345 src_pixels_per_line, dst_ptr, dst_pitch, 4,
346 yoffset);
347 } else {
348 /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
349 * yoffset==0) case correctly. Add copy function here to guarantee
350 * six-tap function handles all possible offsets. */
351 int r;
352
353 for (r = 0; r < 4; ++r) {
354 dst_ptr[0] = src_ptr[0];
355 dst_ptr[1] = src_ptr[1];
356 dst_ptr[2] = src_ptr[2];
357 dst_ptr[3] = src_ptr[3];
358 dst_ptr += dst_pitch;
359 src_ptr += src_pixels_per_line;
360 }
361 }
362 }
363 }
364
365 #endif
366