/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

// Due to a header conflict between math.h and intrinsics includes with ceil()
// in certain configurations under vs9 this include needs to precede
// tmmintrin.h.

#include <tmmintrin.h>

#include <assert.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"

// filters only for the 4_h8 convolution
DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
};

DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
};

// filters for 8_h8 and 16_h8
DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
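
// The filt* tables above are pshufb masks that gather pairs of horizontally
// adjacent source bytes so that _mm_maddubs_epi16() can multiply each pair by
// a pair of packed filter taps.  filt1_global..filt4_global cover tap pairs
// (k0,k1)..(k6,k7) for eight consecutive output pixels, while the *_4_h8
// tables pack two tap pairs per register for the 4 wide case.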

// These are reused by the avx2 intrinsics.
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;

void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
                                         ptrdiff_t src_pixels_per_line,
                                         uint8_t *output_ptr,
                                         ptrdiff_t output_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, srcReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16 bit (short) to 8 bit (byte) and duplicate the same data
  // in both lanes of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
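  // (packing relies on every tap of the fractional sub pel kernels fitting in
  // a signed byte, which holds for the 8 tap filters this path is used with)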

  // duplicate only the first 16 bits in the filter into the first lane
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bits in the filter into the first lane
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the second 16 bits in the filter into the second lane
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the fourth 16 bits in the filter into the second lane
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // load the local shuffle masks
  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
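
  // With only 4 output pixels per row both halves of each register are used:
  // after the shuffles below, srcRegFilt1 holds the (k0,k1) partial sums in
  // its low half and the (k2,k3) partial sums in its high half (srcRegFilt2
  // likewise for (k4,k5)/(k6,k7)); the high halves are extracted with
  // _mm_srli_si128() and folded into the final sum.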

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // extract the higher half of the lane
    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

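    // The four partial sums are combined with saturating adds; adding the
    // outer pair first and then the smaller (min) before the larger (max) of
    // the two middle terms presumably keeps the intermediate totals as small
    // as possible so they do not clip before the final sum is formed.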
    // add and saturate all the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
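    // (64 is 1 << (FILTER_BITS - 1); the add above plus this arithmetic shift
    // rounds the 7 bit fixed point filter sum to the nearest integer)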

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
    src_ptr += src_pixels_per_line;

    // save only 4 bytes
    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);

    output_ptr += output_pitch;
  }
}

void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
                                         ptrdiff_t src_pixels_per_line,
                                         uint8_t *output_ptr,
                                         ptrdiff_t output_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16 bit (short) to 8 bit (byte) and duplicate the same data
  // in both lanes of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
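
  // Each _mm_set1_epi16() constant above is a pshufb control whose byte pairs
  // repeat the indices of two packed taps, e.g. 0x0302 selects bytes 2 and 3
  // (k2, k3) into every 16 bit position, so one _mm_maddubs_epi16() applies
  // that tap pair to all eight output pixels.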

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

    // add and saturate all the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pixels_per_line;

    // save only 8 bytes
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += output_pitch;
  }
}

void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
                                         ptrdiff_t src_pitch,
                                         uint8_t *output_ptr,
                                         ptrdiff_t out_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
  __m128i srcReg8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16 bit (short) to 8 bit (byte) and duplicate the same data
  // in both lanes of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // load the first 7 rows of 8 bytes
  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
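
  // Only the first seven rows are loaded above; each loop iteration reads one
  // new row and shifts the eight row window down, so every source row is
  // loaded from memory just once.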

  for (i = 0; i < output_height; i++) {
    // load the last 8 bytes
    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the result together
    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);

    // merge the result together
    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);

    // add and saturate the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pitch;

    // shift down a row
    srcReg1 = srcReg2;
    srcReg2 = srcReg3;
    srcReg3 = srcReg4;
    srcReg4 = srcReg5;
    srcReg5 = srcReg6;
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save only 8 bytes convolve result
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += out_pitch;
  }
}

filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;

filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;

// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const int16_t *filter_x, int x_step_q4,
//                                const int16_t *filter_y, int y_step_q4,
//                                int w, int h);
// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                    uint8_t *dst, ptrdiff_t dst_stride,
//                                    const int16_t *filter_x, int x_step_q4,
//                                    const int16_t *filter_y, int y_step_q4,
//                                    int w, int h);
// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                   uint8_t *dst, ptrdiff_t dst_stride,
//                                   const int16_t *filter_x, int x_step_q4,
//                                   const int16_t *filter_y, int y_step_q4,
//                                   int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
            ssse3);

#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,           \
                      out0, out1, out2, out3, out4, out5, out6, out7) { \
  const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                    \
  const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                    \
  const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                    \
  const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                    \
                                                                        \
  const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);               \
  const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);               \
  const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);               \
  const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);               \
                                                                        \
  const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);               \
  const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);               \
  const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);               \
  const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);               \
                                                                        \
  out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                              \
  out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                              \
  out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                              \
  out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                              \
  out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                              \
  out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                              \
  out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                              \
  out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
}
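
// TRANSPOSE_8X8() above transposes the 8x8 block of bytes held in the low
// eight bytes of in0..in7 by interleaving progressively wider groups (8, 16,
// 32 and then 64 bits); each out register ends up with one transposed row in
// its low eight bytes.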

static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
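  // (_mm_mulhrs_epi16(x, 256) evaluates to (x * 256 + (1 << 14)) >> 15, the
  // same rounding shift by FILTER_BITS used by the intrinsic filters above)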
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}

static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i A, B, C, D, E, F, G, H;

  A = _mm_loadl_epi64((const __m128i *)src);
  B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
  C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
  D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
  E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
  F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
  G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
  H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));

  TRANSPOSE_8X8(A, B, C, D, E, F, G, H,
                A, B, C, D, E, F, G, H);

  _mm_storel_epi64((__m128i *)dst, A);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
}

static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters,
                                    int x0_q4, int x_step_q4, int w, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas.  The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
  y = h + (8 - (h & 0x7));
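  // Note: when h is already a multiple of 8 this processes 8 extra rows; the
  // intermediate buffer in scaledconvolve2d() reserves 8 spare rows for this
  // overrun.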

  do {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 8) {
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
        } else {
          int i;
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
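        // (a zero sub pel phase selects the identity kernel
        // {0, 0, 0, 128, 0, 0, 0, 0}, so copying the centre pixel src_x[3]
        // above is the exact filter result)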
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 filtered values back to dst
      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
    }

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}

static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  // TRANSPOSE...
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  //
  // TO
  //
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  // 04 14 24 34
  // 05 15 25 35
  // 06 16 26 36
  // 07 17 27 37
  //
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
  // 00 10 01 11 02 12 03 13
  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
  // 20 30 21 31 22 32 23 33
  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
  B = _mm_srli_si128(A, 4);
  C = _mm_srli_si128(A, 8);
  D = _mm_srli_si128(A, 12);

  *(int *)(dst) = _mm_cvtsi128_si32(A);
  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
}

static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters,
                                    int x0_q4, int x_step_q4, int w, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; y += 4) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 4) {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
        } else {
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filtered values back to dst
      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
    }

    src += src_stride * 4;
    dst += dst_stride * 4;
  }
}

static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *y_filters,
                                   int y0_q4, int y_step_q4, int w, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }

    y_q4 += y_step_q4;
  }
}

static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}

static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *y_filters,
                                   int y0_q4, int y_step_q4, int w, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter, int w) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  int i;

  for (i = 0; i < w; i += 16) {
    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
    const __m128i C =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
    const __m128i D =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
    const __m128i E =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
    const __m128i F =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i G =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i H =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
    // merge the result together
    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
    // add and saturate the results together
    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
    // merge the result together
    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
    // merge the result together
    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
    // add and saturate the results together
    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));

    // add and saturate the results together
    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
    // round and shift by 7 bit each 16 bit
    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
    // shrink to 8 bit each 16 bits; the first lane contains the first eight
    // convolve results and the second lane contains the second eight
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    src_ptr += 16;
    // save 16 bytes convolve result
    _mm_store_si128((__m128i *)&dst[i], temp_hi);
  }
}

static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *y_filters,
                                    int y0_q4, int y_step_q4, int w, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
                            w);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters,
                             int x0_q4, int x_step_q4,
                             const InterpKernel *const y_filters,
                             int y0_q4, int y_step_q4,
                             int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
                            w, intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
                            w, intermediate_height);
  }

  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  }
}

static const InterpKernel *get_filter_base(const int16_t *filter) {
  // NOTE: This assumes that the filter table is 256-byte aligned.
  // TODO(agrange) Modify to make independent of table alignment.
  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
}
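// (An InterpKernel holds SUBPEL_TAPS (8) int16_t taps and a filter table has
// SUBPEL_SHIFTS (16) kernels, i.e. exactly 256 bytes, so masking the low 8
// address bits recovers the table base; the pointer difference taken in
// get_filter_offset() below is then the kernel's sub pel index.)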

static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  return (int)((const InterpKernel *)(intptr_t)f - base);
}

void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  scaledconvolve2d(src, src_stride, dst, dst_stride,
                   filters_x, x0_q4, x_step_q4,
                   filters_y, y0_q4, y_step_q4, w, h);
}

// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                          uint8_t *dst, ptrdiff_t dst_stride,
//                          const int16_t *filter_x, int x_step_q4,
//                          const int16_t *filter_y, int y_step_q4,
//                          int w, int h);
// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_, ssse3);