• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 
13 #include "./vpx_config.h"
14 #include "./vp9_rtcd.h"
15 #include "vpx_ports/mem.h"
16 
/*
 * Function type of the 8-bit 1-D filter kernels declared further down.
 * FUN_CONV_1D calls such a kernel with, in order: the source pointer and
 * stride, the destination pointer and stride, the block height and the
 * selected tap array (filter_x or filter_y).
 */
typedef void filter8_1dfunction(const unsigned char *src_ptr,
                                const ptrdiff_t src_pitch,
                                unsigned char *output_ptr,
                                ptrdiff_t out_pitch,
                                unsigned int output_height,
                                const short *filter);
25 
/*
 * FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)
 *
 * Defines vp9_convolve8_<name>_<opt>(), a 1-D convolve wrapper around the
 * vp9_filter_block1d{16,8,4}_<dir>{8,2}_<avg><opt> kernels.
 *
 *   name       suffix of the generated function and of its C fallback
 *   step_q4    which step argument to test (x_step_q4 or y_step_q4)
 *   filter     which tap array to use (filter_x or filter_y)
 *   dir        direction letter pasted into the kernel names
 *   src_start  source pointer expression handed to the <dir>8 kernels
 *   avg        optional infix pasted in front of <opt> in the kernel names
 *   opt        instruction-set suffix
 *
 * The kernels are used only when the selected step is 16 and filter[3] is
 * not 128.  In that case the <dir>8 kernels run if any of filter[0..2] is
 * non-zero; otherwise the <dir>2 kernels run, always starting at src.  The
 * block is consumed in strips of 16, then 8, then 4 columns, advancing src
 * and dst and shrinking w after each strip.  Any width still left -- the
 * whole block when the kernels were not used -- is handed to
 * vp9_convolve8_<name>_c().
 *
 * NOTE(review): a step of 16 presumably means "unscaled" and filter[3] == 128
 * presumably marks the integer-position filter -- confirm against the filter
 * tables.
 */
#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
  void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
                                    uint8_t *dst, ptrdiff_t dst_stride, \
                                    const int16_t *filter_x, int x_step_q4, \
                                    const int16_t *filter_y, int y_step_q4, \
                                    int w, int h) { \
  if (step_q4 == 16 && filter[3] != 128) { \
    if (filter[0] || filter[1] || filter[2]) { \
      for (; w >= 16; src += 16, dst += 16, w -= 16) { \
        vp9_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, \
                                                 dst, dst_stride, \
                                                 h, filter); \
      } \
      for (; w >= 8; src += 8, dst += 8, w -= 8) { \
        vp9_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, \
                                                dst, dst_stride, \
                                                h, filter); \
      } \
      for (; w >= 4; src += 4, dst += 4, w -= 4) { \
        vp9_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, \
                                                dst, dst_stride, \
                                                h, filter); \
      } \
    } else { \
      for (; w >= 16; src += 16, dst += 16, w -= 16) { \
        vp9_filter_block1d16_##dir##2_##avg##opt(src, src_stride, \
                                                 dst, dst_stride, \
                                                 h, filter); \
      } \
      for (; w >= 8; src += 8, dst += 8, w -= 8) { \
        vp9_filter_block1d8_##dir##2_##avg##opt(src, src_stride, \
                                                dst, dst_stride, \
                                                h, filter); \
      } \
      for (; w >= 4; src += 4, dst += 4, w -= 4) { \
        vp9_filter_block1d4_##dir##2_##avg##opt(src, src_stride, \
                                                dst, dst_stride, \
                                                h, filter); \
      } \
    } \
  } \
  if (w) { \
    vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
                             filter_x, x_step_q4, filter_y, y_step_q4, \
                             w, h); \
  } \
}
109 
/*
 * FUN_CONV_2D(avg, opt)
 *
 * Defines vp9_convolve8_<avg><opt>(), a 2-D convolve built from the 1-D
 * wrappers vp9_convolve8_horiz_<opt>() and vp9_convolve8_<avg>vert_<opt>().
 * Blocks may be at most 64x64 (asserted).
 *
 * If either step differs from 16 the whole call is forwarded to
 * vp9_convolve8_<avg>c().  Otherwise the block is filtered in two passes
 * through the temporary buffer fdata2, whose row stride is 64:
 *
 *   - If any of taps [0..2] of either filter is non-zero, or tap [3] of
 *     either filter equals 128, the horizontal pass starts 3 rows above src
 *     and produces h + 7 rows; the vertical pass then starts at row 3 of
 *     fdata2.
 *   - Otherwise the horizontal pass starts at src and produces h + 1 rows,
 *     and the vertical pass starts at row 0 of fdata2.
 */
#define FUN_CONV_2D(avg, opt) \
void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
                              uint8_t *dst, ptrdiff_t dst_stride, \
                              const int16_t *filter_x, int x_step_q4, \
                              const int16_t *filter_y, int y_step_q4, \
                              int w, int h) { \
  assert(w <= 64); \
  assert(h <= 64); \
  if (x_step_q4 != 16 || y_step_q4 != 16) { \
    vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
                           filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
    return; \
  } \
  if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
      filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \
    vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
                              filter_x, x_step_q4, filter_y, y_step_q4, \
                              w, h + 7); \
    vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
                                    filter_x, x_step_q4, filter_y, \
                                    y_step_q4, w, h); \
  } else { \
    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \
    vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
                              filter_x, x_step_q4, filter_y, y_step_q4, \
                              w, h + 1); \
    vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
                                    filter_x, x_step_q4, filter_y, \
                                    y_step_q4, w, h); \
  } \
}
142 
143 #if CONFIG_VP9_HIGHBITDEPTH
144 
/*
 * Function type of the high-bitdepth 1-D filter kernels declared further
 * down.  HIGH_FUN_CONV_1D calls such a kernel with, in order: the uint16_t
 * source pointer and stride, the uint16_t destination pointer and stride,
 * the block height, the selected tap array and the bd argument.
 */
typedef void high_filter8_1dfunction(const uint16_t *src_ptr,
                                     const ptrdiff_t src_pitch,
                                     uint16_t *output_ptr,
                                     ptrdiff_t out_pitch,
                                     unsigned int output_height,
                                     const int16_t *filter,
                                     int bd);
154 
/*
 * HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)
 *
 * High-bitdepth counterpart of FUN_CONV_1D.  Defines
 * vp9_high_convolve8_<name>_<opt>(), a 1-D convolve wrapper around the
 * vp9_high_filter_block1d{16,8,4}_<dir>{8,2}_<avg><opt> kernels.
 *
 *   name       suffix of the generated function and of its C fallback
 *   step_q4    which step argument to test (x_step_q4 or y_step_q4)
 *   filter     which tap array to use (filter_x or filter_y)
 *   dir        direction letter pasted into the kernel names
 *   src_start  source pointer expression handed to the <dir>8 kernels
 *   avg        optional infix pasted in front of <opt> in the kernel names
 *   opt        instruction-set suffix
 *
 * src8 and dst8 are converted with CONVERT_TO_SHORTPTR into the uint16_t
 * pointers src and dst that the kernels work on.  The kernels are used only
 * when the selected step is 16 and filter[3] is not 128; the <dir>8 kernels
 * run if any of filter[0..2] is non-zero, otherwise the <dir>2 kernels run
 * starting at src.  The block is consumed in strips of 16, then 8, then 4
 * columns, advancing src and dst and shrinking w after each strip.
 *
 * Any width still left is handed to vp9_high_convolve8_<name>_c().  The
 * fallback receives the *advanced* src/dst (converted back with
 * CONVERT_TO_BYTEPTR) so that a leftover narrower than 4 columns is filtered
 * where the kernels stopped, as FUN_CONV_1D does for 8-bit data.  Passing
 * the untouched src8/dst8 would filter the left-most columns a second time
 * and leave the real remainder unwritten.
 *
 * NOTE(review): this relies on CONVERT_TO_BYTEPTR undoing
 * CONVERT_TO_SHORTPTR, the same round trip HIGH_FUN_CONV_2D depends on when
 * it passes CONVERT_TO_BYTEPTR(fdata2) to these wrappers -- confirm against
 * the macro definitions.
 */
#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
  void vp9_high_convolve8_##name##_##opt(const uint8_t *src8, \
                                         ptrdiff_t src_stride, \
                                         uint8_t *dst8, ptrdiff_t dst_stride, \
                                         const int16_t *filter_x, \
                                         int x_step_q4, \
                                         const int16_t *filter_y, \
                                         int y_step_q4, \
                                         int w, int h, int bd) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  if (step_q4 == 16 && filter[3] != 128) { \
    if (filter[0] || filter[1] || filter[2]) { \
      while (w >= 16) { \
        vp9_high_filter_block1d16_##dir##8_##avg##opt(src_start, \
                                                      src_stride, \
                                                      dst, \
                                                      dst_stride, \
                                                      h, \
                                                      filter, \
                                                      bd); \
        src += 16; \
        dst += 16; \
        w -= 16; \
      } \
      while (w >= 8) { \
        vp9_high_filter_block1d8_##dir##8_##avg##opt(src_start, \
                                                     src_stride, \
                                                     dst, \
                                                     dst_stride, \
                                                     h, \
                                                     filter, \
                                                     bd); \
        src += 8; \
        dst += 8; \
        w -= 8; \
      } \
      while (w >= 4) { \
        vp9_high_filter_block1d4_##dir##8_##avg##opt(src_start, \
                                                     src_stride, \
                                                     dst, \
                                                     dst_stride, \
                                                     h, \
                                                     filter, \
                                                     bd); \
        src += 4; \
        dst += 4; \
        w -= 4; \
      } \
    } else { \
      while (w >= 16) { \
        vp9_high_filter_block1d16_##dir##2_##avg##opt(src, \
                                                      src_stride, \
                                                      dst, \
                                                      dst_stride, \
                                                      h, \
                                                      filter, \
                                                      bd); \
        src += 16; \
        dst += 16; \
        w -= 16; \
      } \
      while (w >= 8) { \
        vp9_high_filter_block1d8_##dir##2_##avg##opt(src, \
                                                     src_stride, \
                                                     dst, \
                                                     dst_stride, \
                                                     h, \
                                                     filter, \
                                                     bd); \
        src += 8; \
        dst += 8; \
        w -= 8; \
      } \
      while (w >= 4) { \
        vp9_high_filter_block1d4_##dir##2_##avg##opt(src, \
                                                     src_stride, \
                                                     dst, \
                                                     dst_stride, \
                                                     h, \
                                                     filter, \
                                                     bd); \
        src += 4; \
        dst += 4; \
        w -= 4; \
      } \
    } \
  } \
  if (w) { \
    vp9_high_convolve8_##name##_c(CONVERT_TO_BYTEPTR(src), src_stride, \
                                  CONVERT_TO_BYTEPTR(dst), dst_stride, \
                                  filter_x, x_step_q4, filter_y, y_step_q4, \
                                  w, h, bd); \
  } \
}
249 
/*
 * HIGH_FUN_CONV_2D(avg, opt)
 *
 * High-bitdepth counterpart of FUN_CONV_2D.  Defines
 * vp9_high_convolve8_<avg><opt>(), built from the 1-D wrappers
 * vp9_high_convolve8_horiz_<opt>() and vp9_high_convolve8_<avg>vert_<opt>().
 * Blocks may be at most 64x64 (asserted).
 *
 * If either step differs from 16 the whole call is forwarded to
 * vp9_high_convolve8_<avg>c().  Otherwise the block is filtered in two
 * passes through the uint16_t temporary buffer fdata2 (row stride 64), which
 * is handed to the 1-D wrappers via CONVERT_TO_BYTEPTR:
 *
 *   - If any of taps [0..2] of either filter is non-zero, or tap [3] of
 *     either filter equals 128, the horizontal pass starts 3 rows above src
 *     and produces h + 7 rows; the vertical pass then starts 3 * 64 elements
 *     (three rows) into fdata2.
 *   - Otherwise the horizontal pass starts at src and produces h + 1 rows,
 *     and the vertical pass starts at the beginning of fdata2.
 */
#define HIGH_FUN_CONV_2D(avg, opt) \
void vp9_high_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
                                   uint8_t *dst, ptrdiff_t dst_stride, \
                                   const int16_t *filter_x, int x_step_q4, \
                                   const int16_t *filter_y, int y_step_q4, \
                                   int w, int h, int bd) { \
  assert(w <= 64); \
  assert(h <= 64); \
  if (x_step_q4 != 16 || y_step_q4 != 16) { \
    vp9_high_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
                                filter_x, x_step_q4, filter_y, y_step_q4, w, \
                                h, bd); \
    return; \
  } \
  if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
      filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
    DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 71); \
    vp9_high_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
                                   CONVERT_TO_BYTEPTR(fdata2), 64, \
                                   filter_x, x_step_q4, filter_y, y_step_q4, \
                                   w, h + 7, bd); \
    vp9_high_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 3 * 64, \
                                         64, dst, dst_stride, \
                                         filter_x, x_step_q4, filter_y, \
                                         y_step_q4, w, h, bd); \
  } else { \
    DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 65); \
    vp9_high_convolve8_horiz_##opt(src, src_stride, \
                                   CONVERT_TO_BYTEPTR(fdata2), 64, \
                                   filter_x, x_step_q4, filter_y, y_step_q4, \
                                   w, h + 1, bd); \
    vp9_high_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
                                         dst, dst_stride, \
                                         filter_x, x_step_q4, filter_y, \
                                         y_step_q4, w, h, bd); \
  } \
}
287 #endif  // CONFIG_VP9_HIGHBITDEPTH
288 
#if HAVE_AVX2 && HAVE_SSSE3
// Kernels that carry an _avx2 suffix of their own: 16 columns wide, 8 taps.
filter8_1dfunction vp9_filter_block1d16_h8_avx2;
filter8_1dfunction vp9_filter_block1d16_v8_avx2;

// Every other vp9_filter_block1d*_avx2 name that FUN_CONV_1D pastes together
// is an alias for an SSSE3 kernel.

// 8-tap kernels, 8 and 4 columns wide.
#if ARCH_X86_64
filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
#else  // ARCH_X86
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
#endif  // ARCH_X86_64 / ARCH_X86
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3

// 2-tap kernels, all widths.
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
#define vp9_filter_block1d8_h2_avx2  vp9_filter_block1d8_h2_ssse3
#define vp9_filter_block1d8_v2_avx2  vp9_filter_block1d8_v2_ssse3
#define vp9_filter_block1d4_h2_avx2  vp9_filter_block1d4_h2_ssse3
#define vp9_filter_block1d4_v2_avx2  vp9_filter_block1d4_v2_ssse3

// The instantiations below define the following functions, each taking
//   (const uint8_t *src, ptrdiff_t src_stride,
//    uint8_t *dst, ptrdiff_t dst_stride,
//    const int16_t *filter_x, int x_step_q4,
//    const int16_t *filter_y, int y_step_q4,
//    int w, int h):
//
//   void vp9_convolve8_horiz_avx2
//   void vp9_convolve8_vert_avx2
//   void vp9_convolve8_avx2
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

FUN_CONV_2D(, avx2);
#endif  // HAVE_AVX2 && HAVE_SSSE3
#if HAVE_SSSE3
// 8-tap kernels.  On x86-64 all of them except the 4-wide vertical one are
// provided under _intrin_ssse3 names and aliased to the plain _ssse3 names
// that FUN_CONV_1D pastes together.
#if ARCH_X86_64
filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
#else  // ARCH_X86
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
#endif  // ARCH_X86_64 / ARCH_X86
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;

// 8-tap kernels, _avg_ variants.
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;

// 2-tap kernels.
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;

// 2-tap kernels, _avg_ variants.
filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;

// The instantiations below define the following functions, each taking
//   (const uint8_t *src, ptrdiff_t src_stride,
//    uint8_t *dst, ptrdiff_t dst_stride,
//    const int16_t *filter_x, int x_step_q4,
//    const int16_t *filter_y, int y_step_q4,
//    int w, int h):
//
//   void vp9_convolve8_horiz_ssse3
//   void vp9_convolve8_vert_ssse3
//   void vp9_convolve8_avg_horiz_ssse3
//   void vp9_convolve8_avg_vert_ssse3
//   void vp9_convolve8_ssse3
//   void vp9_convolve8_avg_ssse3
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
            ssse3);

FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_ , ssse3);
#endif  // HAVE_SSSE3
421 
422 #if HAVE_SSE2
423 filter8_1dfunction vp9_filter_block1d16_v8_sse2;
424 filter8_1dfunction vp9_filter_block1d16_h8_sse2;
425 filter8_1dfunction vp9_filter_block1d8_v8_sse2;
426 filter8_1dfunction vp9_filter_block1d8_h8_sse2;
427 filter8_1dfunction vp9_filter_block1d4_v8_sse2;
428 filter8_1dfunction vp9_filter_block1d4_h8_sse2;
429 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
430 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
431 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
432 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
433 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
434 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
435 
436 filter8_1dfunction vp9_filter_block1d16_v2_sse2;
437 filter8_1dfunction vp9_filter_block1d16_h2_sse2;
438 filter8_1dfunction vp9_filter_block1d8_v2_sse2;
439 filter8_1dfunction vp9_filter_block1d8_h2_sse2;
440 filter8_1dfunction vp9_filter_block1d4_v2_sse2;
441 filter8_1dfunction vp9_filter_block1d4_h2_sse2;
442 filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
443 filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
444 filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
445 filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
446 filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
447 filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
448 
449 // void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
450 //                               uint8_t *dst, ptrdiff_t dst_stride,
451 //                               const int16_t *filter_x, int x_step_q4,
452 //                               const int16_t *filter_y, int y_step_q4,
453 //                               int w, int h);
454 // void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
455 //                              uint8_t *dst, ptrdiff_t dst_stride,
456 //                              const int16_t *filter_x, int x_step_q4,
457 //                              const int16_t *filter_y, int y_step_q4,
458 //                              int w, int h);
459 // void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
460 //                                   uint8_t *dst, ptrdiff_t dst_stride,
461 //                                   const int16_t *filter_x, int x_step_q4,
462 //                                   const int16_t *filter_y, int y_step_q4,
463 //                                   int w, int h);
464 // void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
465 //                                  uint8_t *dst, ptrdiff_t dst_stride,
466 //                                  const int16_t *filter_x, int x_step_q4,
467 //                                  const int16_t *filter_y, int y_step_q4,
468 //                                  int w, int h);
469 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
470 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
471 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
472 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
473 
474 // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
475 //                         uint8_t *dst, ptrdiff_t dst_stride,
476 //                         const int16_t *filter_x, int x_step_q4,
477 //                         const int16_t *filter_y, int y_step_q4,
478 //                         int w, int h);
479 // void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
480 //                             uint8_t *dst, ptrdiff_t dst_stride,
481 //                             const int16_t *filter_x, int x_step_q4,
482 //                             const int16_t *filter_y, int y_step_q4,
483 //                             int w, int h);
484 FUN_CONV_2D(, sse2);
485 FUN_CONV_2D(avg_ , sse2);
486 
#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
// High-bitdepth 8-tap kernels.
high_filter8_1dfunction vp9_high_filter_block1d16_h8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_v8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_h8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_v8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_h8_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_v8_sse2;

// High-bitdepth 8-tap kernels, _avg_ variants.
high_filter8_1dfunction vp9_high_filter_block1d16_h8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_v8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_h8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_v8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_h8_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_v8_avg_sse2;

// High-bitdepth 2-tap kernels.
high_filter8_1dfunction vp9_high_filter_block1d16_h2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_v2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_h2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_v2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_h2_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_v2_sse2;

// High-bitdepth 2-tap kernels, _avg_ variants.
high_filter8_1dfunction vp9_high_filter_block1d16_h2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d16_v2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_h2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d8_v2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_h2_avg_sse2;
high_filter8_1dfunction vp9_high_filter_block1d4_v2_avg_sse2;

// The instantiations below define the following functions, each taking
//   (const uint8_t *src, ptrdiff_t src_stride,
//    uint8_t *dst, ptrdiff_t dst_stride,
//    const int16_t *filter_x, int x_step_q4,
//    const int16_t *filter_y, int y_step_q4,
//    int w, int h, int bd):
//
//   void vp9_high_convolve8_horiz_sse2
//   void vp9_high_convolve8_vert_sse2
//   void vp9_high_convolve8_avg_horiz_sse2
//   void vp9_high_convolve8_avg_vert_sse2
//   void vp9_high_convolve8_sse2
//   void vp9_high_convolve8_avg_sse2
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
                 sse2);

HIGH_FUN_CONV_2D(, sse2);
HIGH_FUN_CONV_2D(avg_ , sse2);
#endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
557 #endif  // HAVE_SSE2
558