1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 // Due to a header conflict between math.h and intrinsics includes with ceil()
12 // in certain configurations under vs9 this include needs to precede
13 // tmmintrin.h.
14 
15 #include <tmmintrin.h>
16 
17 #include "./vpx_dsp_rtcd.h"
18 #include "vpx_dsp/vpx_filter.h"
19 #include "vpx_dsp/x86/convolve.h"
20 #include "vpx_mem/vpx_mem.h"
21 #include "vpx_ports/mem.h"
22 #include "vpx_ports/emmintrin_compat.h"
23 
24 // filters only for the 4_h8 convolution
25 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
26   0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
27 };
28 
29 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
30   4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
31 };
32 
33 // filters for 8_h8 and 16_h8
34 DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
35   0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
36 };
37 
38 DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
39   2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
40 };
41 
42 DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
43   4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
44 };
45 
46 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
47   6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
48 };
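// Note: the shuffle masks above pair up horizontally adjacent source bytes so
// that _mm_shuffle_epi8() presents them to _mm_maddubs_epi16() as the
// (pixel, pixel + 1) pairs matching the duplicated filter-tap pairs built in
// the functions below.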
49 
50 // These are reused by the avx2 intrinsics.
51 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
52 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
53 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
54 
55 void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
56                                          ptrdiff_t src_pixels_per_line,
57                                          uint8_t *output_ptr,
58                                          ptrdiff_t output_pitch,
59                                          uint32_t output_height,
60                                          const int16_t *filter) {
61   __m128i firstFilters, secondFilters, shuffle1, shuffle2;
62   __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
63   __m128i addFilterReg64, filtersReg, srcReg, minReg;
64   unsigned int i;
65 
66   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
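  // (64 is the rounding term, 1 << (FILTER_BITS - 1), added before the final
  // arithmetic shift right by FILTER_BITS == 7.)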
67   addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
68   filtersReg = _mm_loadu_si128((const __m128i *)filter);
69   // convert the 16-bit (short) coefficients to 8-bit (byte) and duplicate
70   // the same data in both lanes of the 128-bit register.
71   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
72 
73   // duplicate only the first 16 bits of the filter into the first lane
74   firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
75   // duplicate only the third 16 bits of the filter into the first lane
76   secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
77   // duplicate only the second 16 bits of the filter into the second lane
78   // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
79   firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
80   // duplicate only the fourth 16 bits of the filter into the second lane
81   // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
82   secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
83 
84   // loading the local filters
85   shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
86   shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
87 
88   for (i = 0; i < output_height; i++) {
89     srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
90 
91     // filter the source buffer
92     srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
93     srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
94 
95     // multiply 2 adjacent elements with the filter and add the result
96     srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
97     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
98 
99     // extract the upper half of each result
100     srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
101     srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
102 
103     minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
104 
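    // The partial sums are combined in a fixed order (outer taps, then the
    // smaller of the two middle sums, then the larger) so that the saturating
    // 16-bit adds clip only when the final result would clip as well.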
105     // add and saturate all the results together
106     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
107     srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
108     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
109     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
110     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
111 
112     // shift each 16-bit value right by 7 bits
113     srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
114 
115     // shrink each 16-bit value to 8 bits with unsigned saturation
116     srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
117     src_ptr+=src_pixels_per_line;
118 
119     // save only 4 bytes
120     *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
121 
122     output_ptr+=output_pitch;
123   }
124 }
125 
126 void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
127                                          ptrdiff_t src_pixels_per_line,
128                                          uint8_t *output_ptr,
129                                          ptrdiff_t output_pitch,
130                                          uint32_t output_height,
131                                          const int16_t *filter) {
132   __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
133   __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
134   __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
135   __m128i addFilterReg64, filtersReg, minReg;
136   unsigned int i;
137 
138   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
139   addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
140   filtersReg = _mm_loadu_si128((const __m128i *)filter);
141   // convert the 16-bit (short) coefficients to 8-bit (byte) and duplicate
142   // the same data in both lanes of the 128-bit register.
143   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
144 
145   // duplicate only the first 16 bits (first and second byte)
146   // across 128 bit register
147   firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
148   // duplicate only the second 16 bits (third and fourth byte)
149   // across 128 bit register
150   secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
151   // duplicate only the third 16 bits (fifth and sixth byte)
152   // across 128 bit register
153   thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
154   // duplicate only the fourth 16 bits (seventh and eighth byte)
155   // across 128 bit register
156   forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
157 
158   filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
159   filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
160   filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
161   filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
162 
163   for (i = 0; i < output_height; i++) {
164     srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
165 
166     // filter the source buffer
167     srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
168     srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
169 
170     // multiply 2 adjacent elements with the filter and add the result
171     srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
172     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
173 
174     // filter the source buffer
175     srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
176     srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
177 
178     // multiply 2 adjacent elements with the filter and add the result
179     srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
180     srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
181 
182     // add and saturate all the results together
183     minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
184     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
185 
186     srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);
187     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
188     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
189     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
190 
191     // shift each 16-bit value right by 7 bits
192     srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
193 
194     // shrink each 16-bit value to 8 bits with unsigned saturation
195     srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
196 
197     src_ptr+=src_pixels_per_line;
198 
199     // save only 8 bytes
200     _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
201 
202     output_ptr+=output_pitch;
203   }
204 }
205 
206 static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
207                                                  ptrdiff_t src_pixels_per_line,
208                                                  uint8_t *output_ptr,
209                                                  ptrdiff_t output_pitch,
210                                                  uint32_t output_height,
211                                                  const int16_t *filter) {
212   __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
213   __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
214   __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
215   __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
216   unsigned int i;
217 
218   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
219   addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
220   filtersReg = _mm_loadu_si128((const __m128i *)filter);
221   // convert the 16-bit (short) coefficients to 8-bit (byte) and duplicate
222   // the same data in both lanes of the 128-bit register.
223   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
224 
225   // duplicate only the first 16 bits (first and second byte)
226   // across 128 bit register
227   firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
228   // duplicate only the second 16 bits (third and fourth byte)
229   // across 128 bit register
230   secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
231   // duplicate only the third 16 bits (fifth and sixth byte)
232   // across 128 bit register
233   thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
234   // duplicate only the fourth 16 bits (seventh and eighth byte)
235   // across 128 bit register
236   forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
237 
238   filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
239   filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
240   filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
241   filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
242 
243   for (i = 0; i < output_height; i++) {
244     srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
245 
246     // filter the source buffer
247     srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
248     srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);
249 
250     // multiply 2 adjacent elements with the filter and add the result
251     srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
252     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
253 
254     // add and saturate the results together
255     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
256 
257     // filter the source buffer
258     srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);
259     srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
260 
261     // multiply 2 adjacent elements with the filter and add the result
262     srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
263     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
264 
265     // add and saturate the results together
266     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
267                                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
268 
269     // reading the next 16 bytes.
270     // (part of it was being read by earlier read)
271     srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
272 
273     // add and saturate the results together
274     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
275                                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
276 
277     // filter the source buffer
278     srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
279     srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);
280 
281     // multiply 2 adjacent elements with the filter and add the result
282     srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
283     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);
284 
285     // add and saturate the results together
286     srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
287 
288     // filter the source buffer
289     srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);
290     srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
291 
292     // multiply 2 adjacent elements with the filter and add the result
293     srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
294     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
295 
296     // add and saturate the results together
297     srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
298                                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
299     srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
300                                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
301 
302     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
303     srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
304 
305     // shift each 16-bit value right by 7 bits
306     srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
307     srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
308 
309     // shrink each 16-bit value to 8 bits; the first lane contains the first
310     // convolve result and the second lane contains the second convolve
311     // result
312     srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
313 
314     src_ptr+=src_pixels_per_line;
315 
316     // save 16 bytes
317     _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
318 
319     output_ptr+=output_pitch;
320   }
321 }
322 
323 void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
324                                          ptrdiff_t src_pitch,
325                                          uint8_t *output_ptr,
326                                          ptrdiff_t out_pitch,
327                                          uint32_t output_height,
328                                          const int16_t *filter) {
329   __m128i addFilterReg64, filtersReg, minReg;
330   __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
331   __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
332   __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
333   __m128i srcReg8;
334   unsigned int i;
335 
336   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
337   addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
338   filtersReg = _mm_loadu_si128((const __m128i *)filter);
339   // convert the 16-bit (short) coefficients to 8-bit (byte) and duplicate
340   // the same data in both lanes of the 128-bit register.
341   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
342 
343   // duplicate only the first 16 bits in the filter
344   firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
345   // duplicate only the second 16 bits in the filter
346   secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
347   // duplicate only the third 16 bits in the filter
348   thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
349   // duplicate only the fourth 16 bits in the filter
350   forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
351 
352   // load the first 7 rows of 8 bytes
353   srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
354   srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
355   srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
356   srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
357   srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
358   srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
359   srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
360 
361   for (i = 0; i < output_height; i++) {
362     // load the last 8 bytes
363     srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
364 
365     // merge the result together
366     srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
367     srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
368 
369     // merge the result together
370     srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
371     srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
372 
373     // multiply 2 adjacent elements with the filter and add the result
374     srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
375     srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
376     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
377     srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
378 
379     // add and saturate the results together
380     minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
381     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
382     srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
383     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
384     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
385     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
386 
387     // shift each 16-bit value right by 7 bits
388     srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
389 
390     // shrink each 16-bit value to 8 bits with unsigned saturation
391     srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
392 
393     src_ptr+=src_pitch;
394 
395     // shift down a row
396     srcReg1 = srcReg2;
397     srcReg2 = srcReg3;
398     srcReg3 = srcReg4;
399     srcReg4 = srcReg5;
400     srcReg5 = srcReg6;
401     srcReg6 = srcReg7;
402     srcReg7 = srcReg8;
403 
404     // save only 8 bytes convolve result
405     _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
406 
407     output_ptr+=out_pitch;
408   }
409 }
410 
411 static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
412                                                  ptrdiff_t src_pitch,
413                                                  uint8_t *output_ptr,
414                                                  ptrdiff_t out_pitch,
415                                                  uint32_t output_height,
416                                                  const int16_t *filter) {
417   __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
418   __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
419   __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
420   __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
421   __m128i srcReg8;
422   unsigned int i;
423 
424   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
425   addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
426   filtersReg = _mm_loadu_si128((const __m128i *)filter);
427   // convert the 16-bit (short) coefficients to 8-bit (byte) and duplicate
428   // the same data in both lanes of the 128-bit register.
429   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
430 
431   // duplicate only the first 16 bits in the filter
432   firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
433   // duplicate only the second 16 bits in the filter
434   secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
435   // duplicate only the third 16 bits in the filter
436   thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
437   // duplicate only the fourth 16 bits in the filter
438   forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
439 
440   // load the first 7 rows of 16 bytes
441   srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
442   srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
443   srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
444   srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
445   srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
446   srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
447   srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
448 
449   for (i = 0; i < output_height; i++) {
450     // load the last 16 bytes
451     srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
452 
453     // merge the result together
454     srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
455     srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
456     srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
457     srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
458 
459     // multiply 2 adjacent elements with the filter and add the result
460     srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
461     srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
462     srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
463     srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
464 
465     // add and saturate the results together
466     srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
467     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
468 
469     // merge the result together
470     srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
471     srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
472 
473     // multiply 2 adjacent elements with the filter and add the result
474     srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
475     srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
476 
477     // merge the result together
478     srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
479     srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
480 
481     // multiply 2 adjacent elements with the filter and add the result
482     srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
483     srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
484 
485     // add and saturate the results together
486     srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
487                                  _mm_min_epi16(srcRegFilt3, srcRegFilt7));
488     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
489                                  _mm_min_epi16(srcRegFilt6, srcRegFilt8));
490 
491     // add and saturate the results together
492     srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
493                                  _mm_max_epi16(srcRegFilt3, srcRegFilt7));
494     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
495                                  _mm_max_epi16(srcRegFilt6, srcRegFilt8));
496     srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
497     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
498 
499     // shift each 16-bit value right by 7 bits
500     srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
501     srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
502 
503     // shrink each 16-bit value to 8 bits; the first lane contains the first
504     // convolve result and the second lane contains the second convolve
505     // result
506     srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
507 
508     src_ptr+=src_pitch;
509 
510     // shift down a row
511     srcReg1 = srcReg2;
512     srcReg2 = srcReg3;
513     srcReg3 = srcReg4;
514     srcReg4 = srcReg5;
515     srcReg5 = srcReg6;
516     srcReg6 = srcReg7;
517     srcReg7 = srcReg8;
518 
519     // save 16 bytes convolve result
520     _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
521 
522     output_ptr+=out_pitch;
523   }
524 }
525 
526 #if ARCH_X86_64
527 filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3;
528 filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3;
529 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
530 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
531 filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
532 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
533 #define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3
534 #define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3
535 #define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
536 #define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
537 #define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
538 #else  // ARCH_X86
539 filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
540 filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
541 filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
542 filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
543 filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
544 filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
545 #endif  // ARCH_X86_64
546 filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
547 filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
548 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
549 filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
550 filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
551 filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
552 
553 filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
554 filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
555 filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
556 filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
557 filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
558 filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
559 filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
560 filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
561 filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
562 filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
563 filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
564 filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
565 
566 // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
567 //                                uint8_t *dst, ptrdiff_t dst_stride,
568 //                                const int16_t *filter_x, int x_step_q4,
569 //                                const int16_t *filter_y, int y_step_q4,
570 //                                int w, int h);
571 // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
572 //                               uint8_t *dst, ptrdiff_t dst_stride,
573 //                               const int16_t *filter_x, int x_step_q4,
574 //                               const int16_t *filter_y, int y_step_q4,
575 //                               int w, int h);
576 // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
577 //                                    uint8_t *dst, ptrdiff_t dst_stride,
578 //                                    const int16_t *filter_x, int x_step_q4,
579 //                                    const int16_t *filter_y, int y_step_q4,
580 //                                    int w, int h);
581 // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
582 //                                   uint8_t *dst, ptrdiff_t dst_stride,
583 //                                   const int16_t *filter_x, int x_step_q4,
584 //                                   const int16_t *filter_y, int y_step_q4,
585 //                                   int w, int h);
586 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
587 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
588 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
589 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
590             ssse3);
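// FUN_CONV_1D (defined in vpx_dsp/x86/convolve.h) expands each line above
// into the corresponding vpx_convolve8_*_ssse3() wrapper prototyped in the
// comments, dispatching on block width to the vpx_filter_block1d* functions
// declared earlier.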
591 
592 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,           \
593                       out0, out1, out2, out3, out4, out5, out6, out7) { \
594   const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                    \
595   const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                    \
596   const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                    \
597   const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                    \
598                                                                         \
599   const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);               \
600   const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);               \
601   const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);               \
602   const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);               \
603                                                                         \
604   const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);               \
605   const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);               \
606   const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);               \
607   const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);               \
608                                                                         \
609   out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                              \
610   out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                              \
611   out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                              \
612   out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                              \
613   out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                              \
614   out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                              \
615   out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                              \
616   out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
617 }
618 
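// TRANSPOSE_8X8 reads eight rows of 8 bytes (in the low half of each input
// register) and returns eight registers whose low 8 bytes hold the columns of
// that 8x8 block; the upper 8 bytes of each output duplicate the lower 8.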
619 static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
620                                   uint8_t *dst, const int16_t *x_filter) {
621   const __m128i k_256 = _mm_set1_epi16(1 << 8);
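  // _mm_mulhrs_epi16(x, 1 << 8) computes ((x << 8) + (1 << 14)) >> 15, which
  // equals x rounded and shifted right by FILTER_BITS (7), so no separate
  // rounding constant is needed in this path.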
622   const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
623   // pack and duplicate the filter values
624   const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
625   const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
626   const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
627   const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
628   const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
629   const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
630   const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
631   const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
632   const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
633   const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
634   const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
635   const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
636   // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
637   const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
638   // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
639   const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
640   // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
641   const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
642   // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
643   const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
644   // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
645   const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
646   // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
647   const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
648   // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
649   const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
650   // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
651   const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
652   // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
653   const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
654   const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
655   const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
656   const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
657   // multiply 2 adjacent elements with the filter and add the result
658   const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
659   const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
660   const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
661   const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
662   // add and saturate the results together
663   const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
664   const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
665   __m128i temp = _mm_adds_epi16(x0, x3);
666   temp = _mm_adds_epi16(temp, min_x2x1);
667   temp = _mm_adds_epi16(temp, max_x2x1);
668   // round and shift each 16-bit value right by 7 bits
669   temp = _mm_mulhrs_epi16(temp, k_256);
670   // shrink each 16-bit value to 8 bits with unsigned saturation
671   temp = _mm_packus_epi16(temp, temp);
672   // save only 8 bytes convolve result
673   _mm_storel_epi64((__m128i*)dst, temp);
674 }
675 
676 static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
677                                 uint8_t *dst, ptrdiff_t dst_stride) {
678   __m128i A, B, C, D, E, F, G, H;
679 
680   A = _mm_loadl_epi64((const __m128i *)src);
681   B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
682   C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
683   D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
684   E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
685   F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
686   G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
687   H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
688 
689   TRANSPOSE_8X8(A, B, C, D, E, F, G, H,
690                 A, B, C, D, E, F, G, H);
691 
692   _mm_storel_epi64((__m128i*)dst, A);
693   _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), B);
694   _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), C);
695   _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), D);
696   _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), E);
697   _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), F);
698   _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), G);
699   _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), H);
700 }
701 
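// Horizontal pass of the scaled convolve: filter_horiz_w8_ssse3() filters one
// sub-pixel x position for eight consecutive source rows per call. The inner
// loop below walks eight x positions, filling an 8x8 block in temp, which
// transpose8x8_to_dst() then rotates back into row order in dst.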
702 static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
703                                     uint8_t *dst, ptrdiff_t dst_stride,
704                                     const InterpKernel *x_filters,
705                                     int x0_q4, int x_step_q4, int w, int h) {
706   DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
707   int x, y, z;
708   src -= SUBPEL_TAPS / 2 - 1;
709 
710   // This function processes 8x8 areas.  The intermediate height is not always
711   // a multiple of 8, so force it to be a multiple of 8 here.
712   y = h + (8 - (h & 0x7));
713 
714   do {
715     int x_q4 = x0_q4;
716     for (x = 0; x < w; x += 8) {
717       // process 8 src_x steps
718       for (z = 0; z < 8; ++z) {
719         const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
720         const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
721         if (x_q4 & SUBPEL_MASK) {
722           filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
723         } else {
724           int i;
725           for (i = 0; i < 8; ++i) {
726             temp[z * 8 + i] = src_x[i * src_stride + 3];
727           }
728         }
729         x_q4 += x_step_q4;
730       }
731 
732       // transpose the 8x8 filters values back to dst
733       transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
734     }
735 
736     src += src_stride * 8;
737     dst += dst_stride * 8;
738   } while (y -= 8);
739 }
740 
741 static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
742                                   uint8_t *dst, const int16_t *filter) {
743   const __m128i k_256 = _mm_set1_epi16(1 << 8);
744   const __m128i f_values = _mm_load_si128((const __m128i *)filter);
745   // pack and duplicate the filter values
746   const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
747   const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
748   const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
749   const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
750   const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
751   const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
752   const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
753   const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
754   // TRANSPOSE...
755   // 00 01 02 03 04 05 06 07
756   // 10 11 12 13 14 15 16 17
757   // 20 21 22 23 24 25 26 27
758   // 30 31 32 33 34 35 36 37
759   //
760   // TO
761   //
762   // 00 10 20 30
763   // 01 11 21 31
764   // 02 12 22 32
765   // 03 13 23 33
766   // 04 14 24 34
767   // 05 15 25 35
768   // 06 16 26 36
769   // 07 17 27 37
770   //
771   // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
772   const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
773   // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
774   const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
775   // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
776   const __m128i s1s0  = _mm_unpacklo_epi32(tr0_0, tr0_1);
777   // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
778   const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
779   // 02 03 12 13 22 23 32 33
780   const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
781   // 06 07 16 17 26 27 36 37
782   const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
783   // multiply 2 adjacent elements with the filter and add the result
784   const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
785   const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
786   const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
787   const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
788   // add and saturate the results together
789   const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
790   const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
791   __m128i temp = _mm_adds_epi16(x0, x3);
792   temp = _mm_adds_epi16(temp, min_x2x1);
793   temp = _mm_adds_epi16(temp, max_x2x1);
794   // round and shift each 16-bit value right by 7 bits
795   temp = _mm_mulhrs_epi16(temp, k_256);
796   // shrink each 16-bit value to 8 bits with unsigned saturation
797   temp = _mm_packus_epi16(temp, temp);
798   // save only 4 bytes
799   *(int *)dst = _mm_cvtsi128_si32(temp);
800 }
801 
802 static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
803                                 uint8_t *dst, ptrdiff_t dst_stride) {
804   __m128i A = _mm_cvtsi32_si128(*(const int *)src);
805   __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
806   __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
807   __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
808   // 00 10 01 11 02 12 03 13
809   const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
810   // 20 30 21 31 22 32 23 33
811   const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
812   // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
813   A = _mm_unpacklo_epi16(tr0_0, tr0_1);
814   B = _mm_srli_si128(A, 4);
815   C = _mm_srli_si128(A, 8);
816   D = _mm_srli_si128(A, 12);
817 
818   *(int *)(dst) =  _mm_cvtsi128_si32(A);
819   *(int *)(dst + dst_stride) =  _mm_cvtsi128_si32(B);
820   *(int *)(dst + dst_stride * 2) =  _mm_cvtsi128_si32(C);
821   *(int *)(dst + dst_stride * 3) =  _mm_cvtsi128_si32(D);
822 }
823 
824 static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
825                                     uint8_t *dst, ptrdiff_t dst_stride,
826                                     const InterpKernel *x_filters,
827                                     int x0_q4, int x_step_q4, int w, int h) {
828   DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
829   int x, y, z;
830   src -= SUBPEL_TAPS / 2 - 1;
831 
832   for (y = 0; y < h; y += 4) {
833     int x_q4 = x0_q4;
834     for (x = 0; x < w; x += 4) {
835       // process 4 src_x steps
836       for (z = 0; z < 4; ++z) {
837         const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
838         const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
839         if (x_q4 & SUBPEL_MASK) {
840           filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
841         } else {
842           int i;
843           for (i = 0; i < 4; ++i) {
844             temp[z * 4 + i] = src_x[i * src_stride + 3];
845           }
846         }
847         x_q4 += x_step_q4;
848       }
849 
850       // transpose the 4x4 filters values back to dst
851       transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
852     }
853 
854     src += src_stride * 4;
855     dst += dst_stride * 4;
856   }
857 }
858 
859 static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
860                                  uint8_t *dst, const int16_t *filter) {
861   const __m128i k_256 = _mm_set1_epi16(1 << 8);
862   const __m128i f_values = _mm_load_si128((const __m128i *)filter);
863   // pack and duplicate the filter values
864   const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
865   const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
866   const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
867   const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
868   const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
869   const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
870   const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
871   const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
872   const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
873   const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
874   const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
875   const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
876   const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
877   const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
878   const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
879   const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
880   // multiply 2 adjacent elements with the filter and add the result
881   const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
882   const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
883   const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
884   const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
885   // add and saturate the results together
886   const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
887   const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
888   __m128i temp = _mm_adds_epi16(x0, x3);
889   temp = _mm_adds_epi16(temp, min_x2x1);
890   temp = _mm_adds_epi16(temp, max_x2x1);
891   // round and shift each 16-bit value right by 7 bits
892   temp = _mm_mulhrs_epi16(temp, k_256);
893   // shrink each 16-bit value to 8 bits with unsigned saturation
894   temp = _mm_packus_epi16(temp, temp);
895   // save only 4 bytes
896   *(int *)dst = _mm_cvtsi128_si32(temp);
897 }
898 
899 static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
900                                    uint8_t *dst, ptrdiff_t dst_stride,
901                                    const InterpKernel *y_filters,
902                                    int y0_q4, int y_step_q4, int w, int h) {
903   int y;
904   int y_q4 = y0_q4;
905 
906   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
907   for (y = 0; y < h; ++y) {
908     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
909     const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
910 
911     if (y_q4 & SUBPEL_MASK) {
912       filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
913     } else {
914       memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
915     }
916 
917     y_q4 += y_step_q4;
918   }
919 }
920 
921 static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
922                                  uint8_t *dst, const int16_t *filter) {
923   const __m128i k_256 = _mm_set1_epi16(1 << 8);
924   const __m128i f_values = _mm_load_si128((const __m128i *)filter);
925   // pack and duplicate the filter values
926   const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
927   const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
928   const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
929   const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
930   const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
931   const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
932   const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
933   const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
934   const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
935   const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
936   const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
937   const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
938   const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
939   const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
940   const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
941   const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
942   // multiply 2 adjacent elements with the filter and add the result
943   const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
944   const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
945   const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
946   const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
947   // add and saturate the results together
948   const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
949   const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
950   __m128i temp = _mm_adds_epi16(x0, x3);
951   temp = _mm_adds_epi16(temp, min_x2x1);
952   temp = _mm_adds_epi16(temp, max_x2x1);
953   // round and shift each 16-bit value right by 7 bits
954   temp = _mm_mulhrs_epi16(temp, k_256);
955   // shrink each 16-bit value to 8 bits with unsigned saturation
956   temp = _mm_packus_epi16(temp, temp);
957   // save only 8 bytes convolve result
958   _mm_storel_epi64((__m128i*)dst, temp);
959 }
960 
961 static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
962                                    uint8_t *dst, ptrdiff_t dst_stride,
963                                    const InterpKernel *y_filters,
964                                    int y0_q4, int y_step_q4, int w, int h) {
965   int y;
966   int y_q4 = y0_q4;
967 
968   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
969   for (y = 0; y < h; ++y) {
970     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
971     const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
972     if (y_q4 & SUBPEL_MASK) {
973       filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
974     } else {
975       memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
976     }
977     y_q4 += y_step_q4;
978   }
979 }
980 
981 static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
982                                   uint8_t *dst, const int16_t *filter, int w) {
983   const __m128i k_256 = _mm_set1_epi16(1 << 8);
984   const __m128i f_values = _mm_load_si128((const __m128i *)filter);
985   // pack and duplicate the filter values
986   const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
987   const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
988   const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
989   const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
990   int i;
991 
992   for (i = 0; i < w; i += 16) {
993     const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
994     const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
995     const __m128i C =
996         _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
997     const __m128i D =
998         _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
999     const __m128i E =
1000         _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
1001     const __m128i F =
1002         _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
1003     const __m128i G =
1004         _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
1005     const __m128i H =
1006         _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
1007     // merge the result together
1008     const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
1009     const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
1010     const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
1011     const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
1012     // multiply 2 adjacent elements with the filter and add the result
1013     const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
1014     const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
1015     const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
1016     const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
1017     // add and saturate the results together
1018     const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
1019     const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
1020     // merge the result together
1021     const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
1022     const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
1023     // multiply 2 adjacent elements with the filter and add the result
1024     const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
1025     const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
1026     // merge the result together
1027     const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
1028     const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
1029     // multiply 2 adjacent elements with the filter and add the result
1030     const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
1031     const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
1032     // add and saturate the results together
1033     __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
1034     __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
1035 
1036     // add and saturate the results together
1037     temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
1038     temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
1039     // round and shift each 16-bit value right by 7 bits
1040     temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
1041     temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
1042     // shrink each 16-bit value to 8 bits; the first lane contains the first
1043     // convolve result and the second lane contains the second convolve
1044     // result
1045     temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
1046     src_ptr += 16;
1047     // save 16 bytes of the convolve result
1048     _mm_store_si128((__m128i *)&dst[i], temp_hi);
1049   }
1050 }
1051 
1052 static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
1053                                     uint8_t *dst, ptrdiff_t dst_stride,
1054                                     const InterpKernel *y_filters,
1055                                     int y0_q4, int y_step_q4, int w, int h) {
1056   int y;
1057   int y_q4 = y0_q4;
1058 
1059   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1060   for (y = 0; y < h; ++y) {
1061     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1062     const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1063     if (y_q4 & SUBPEL_MASK) {
1064       filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
1065                             w);
1066     } else {
1067       memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
1068     }
1069     y_q4 += y_step_q4;
1070   }
1071 }
1072 
1073 static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
1074                              uint8_t *dst, ptrdiff_t dst_stride,
1075                              const InterpKernel *const x_filters,
1076                              int x0_q4, int x_step_q4,
1077                              const InterpKernel *const y_filters,
1078                              int y0_q4, int y_step_q4,
1079                              int w, int h) {
1080   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
1081   // 2d filtering proceeds in 2 steps:
1082   //   (1) Interpolate horizontally into an intermediate buffer, temp.
1083   //   (2) Interpolate temp vertically to derive the sub-pixel result.
1084   // Deriving the maximum number of rows in the temp buffer (135):
1085   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1086   // --Largest block size is 64x64 pixels.
1087   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
1088   //   original frame (in 1/16th pixel units).
1089   // --Must round-up because block may be located at sub-pixel position.
1090   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1091   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
1092   // --Require an additional 8 rows for the horiz_w8 transpose tail.
1093   DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
1094   const int intermediate_height =
1095       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1096 
1097   assert(w <= 64);
1098   assert(h <= 64);
1099   assert(y_step_q4 <= 32);
1100   assert(x_step_q4 <= 32);
1101 
1102   if (w >= 8) {
1103     scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1104                             src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
1105                             w, intermediate_height);
1106   } else {
1107     scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1108                             src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
1109                             w, intermediate_height);
1110   }
1111 
1112   if (w >= 16) {
1113     scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1114                             dst_stride, y_filters, y0_q4, y_step_q4, w, h);
1115   } else if (w == 8) {
1116     scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1117                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
1118   } else {
1119     scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1120                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
1121   }
1122 }
1123 
1124 static const InterpKernel *get_filter_base(const int16_t *filter) {
1125   // NOTE: This assumes that the filter table is 256-byte aligned.
1126   // TODO(agrange) Modify to make independent of table alignment.
1127   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
1128 }
1129 
1130 static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
1131   return (int)((const InterpKernel *)(intptr_t)f - base);
1132 }
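// The kernel table holds 16 eight-tap kernels (16 * 8 * sizeof(int16_t) ==
// 256 bytes), so the pointer difference computed above is exactly the q4
// sub-pixel phase (0..15) of the requested filter.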
1133 
1134 void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride,
1135                          uint8_t *dst, ptrdiff_t dst_stride,
1136                          const int16_t *filter_x, int x_step_q4,
1137                          const int16_t *filter_y, int y_step_q4,
1138                          int w, int h) {
1139   const InterpKernel *const filters_x = get_filter_base(filter_x);
1140   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1141 
1142   const InterpKernel *const filters_y = get_filter_base(filter_y);
1143   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1144 
1145   scaledconvolve2d(src, src_stride, dst, dst_stride,
1146                    filters_x, x0_q4, x_step_q4,
1147                    filters_y, y0_q4, y_step_q4, w, h);
1148 }
1149 
1150 // void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
1151 //                          uint8_t *dst, ptrdiff_t dst_stride,
1152 //                          const int16_t *filter_x, int x_step_q4,
1153 //                          const int16_t *filter_y, int y_step_q4,
1154 //                          int w, int h);
1155 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
1156 //                              uint8_t *dst, ptrdiff_t dst_stride,
1157 //                              const int16_t *filter_x, int x_step_q4,
1158 //                              const int16_t *filter_y, int y_step_q4,
1159 //                              int w, int h);
1160 FUN_CONV_2D(, ssse3);
1161 FUN_CONV_2D(avg_ , ssse3);
1162