/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

// Due to a header conflict between math.h and intrinsics includes with ceil()
// in certain configurations under vs9 this include needs to precede
// tmmintrin.h.

#include <tmmintrin.h>

#include <assert.h>  // assert(), used by scaledconvolve2d()
#include <string.h>  // memcpy(), used by the scaled vertical convolutions

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"

// filters only for the 4_h8 convolution
DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
};

DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
};

// filters for 8_h8 and 16_h8
DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};

// These are reused by the avx2 intrinsics.
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;

void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
                                         ptrdiff_t src_pixels_per_line,
                                         uint8_t *output_ptr,
                                         ptrdiff_t output_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, srcReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x00400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter into the first lane
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bits in the filter into the first lane
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the second 16 bits in the filter into the second lane
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the fourth 16 bits in the filter into the second lane
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // loading the local filters
  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // extract the higher half of the lane
    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

    // add and saturate all the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
    src_ptr += src_pixels_per_line;

    // save only 4 bytes
    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);

    output_ptr += output_pitch;
  }
}

void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
                                         ptrdiff_t src_pixels_per_line,
                                         uint8_t *output_ptr,
                                         ptrdiff_t output_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x00400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

    // add and saturate all the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pixels_per_line;

    // save only 8 bytes
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += output_pitch;
  }
}

void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
                                         ptrdiff_t src_pitch,
                                         uint8_t *output_ptr,
                                         ptrdiff_t out_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
  __m128i srcReg8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x00400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // load the first 7 rows of 8 bytes
  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

  for (i = 0; i < output_height; i++) {
    // load the last 8 bytes
    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the result together
    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);

    // merge the result together
    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);

    // add and saturate the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pitch;

    // shift down a row
    srcReg1 = srcReg2;
    srcReg2 = srcReg3;
    srcReg3 = srcReg4;
    srcReg4 = srcReg5;
    srcReg5 = srcReg6;
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save only 8 bytes convolve result
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += out_pitch;
  }
}

filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;

filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;

// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const int16_t *filter_x, int x_step_q4,
//                                const int16_t *filter_y, int y_step_q4,
//                                int w, int h);
// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                    uint8_t *dst, ptrdiff_t dst_stride,
//                                    const int16_t *filter_x, int x_step_q4,
//                                    const int16_t *filter_y, int y_step_q4,
//                                    int w, int h);
// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                   uint8_t *dst, ptrdiff_t dst_stride,
//                                   const int16_t *filter_x, int x_step_q4,
//                                   const int16_t *filter_y, int y_step_q4,
//                                   int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
            ssse3);

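// Transposes the 8x8 matrix of bytes held in the low 8 bytes of in0..in7:
// out0..out7 receive the columns of the input block, each duplicated into
// both 64-bit halves of the register.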
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) { \
  const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \
  const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \
  const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \
  const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \
  \
  const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \
  const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \
  const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \
  const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \
  \
  const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \
  const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \
  const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \
  const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \
  \
  out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \
  out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \
  out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \
  out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \
  out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \
  out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \
  out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \
  out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \
}

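// Horizontally filters 8 source rows at a time: the rows are interleaved so
// that each pmaddubsw applies one filter-tap pair to all 8 rows, and the 8
// filtered pixels (one per input row) are written contiguously to dst.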
static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}

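// Loads an 8x8 block of bytes from src and stores its transpose to dst.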
static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i A, B, C, D, E, F, G, H;

  A = _mm_loadl_epi64((const __m128i *)src);
  B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
  C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
  D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
  E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
  F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
  G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
  H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));

  TRANSPOSE_8X8(A, B, C, D, E, F, G, H,
                A, B, C, D, E, F, G, H);

  _mm_storel_epi64((__m128i *)dst, A);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
}

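// Horizontal pass of the scaled convolution for w >= 8. Each row of the temp
// tile holds one output column (filtered when the q4 phase is non-zero,
// copied otherwise); the tile is then transposed into dst.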
static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters,
                                    int x0_q4, int x_step_q4, int w, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
  y = h + (8 - (h & 0x7));

  do {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 8) {
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
        } else {
          int i;
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 filtered values back to dst
      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
    }

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}

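// Horizontally filters 4 source rows at a time: the rows are interleaved so
// that each pmaddubsw applies one filter-tap pair to all 4 rows, and the 4
// filtered pixels (one per input row) are written contiguously to dst.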
static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  // TRANSPOSE...
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  //
  // TO
  //
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  // 04 14 24 34
  // 05 15 25 35
  // 06 16 26 36
  // 07 17 27 37
  //
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

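// Loads a 4x4 block of bytes from src and stores its transpose to dst.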
static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
  // 00 10 01 11 02 12 03 13
  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
  // 20 30 21 31 22 32 23 33
  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
  B = _mm_srli_si128(A, 4);
  C = _mm_srli_si128(A, 8);
  D = _mm_srli_si128(A, 12);

  *(int *)(dst) = _mm_cvtsi128_si32(A);
  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
}

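// Horizontal pass of the scaled convolution for w == 4. Each row of the temp
// tile holds one output column; the 4x4 tile is then transposed into dst.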
static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters,
                                    int x0_q4, int x_step_q4, int w, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; y += 4) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 4) {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
        } else {
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filtered values back to dst
      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
    }

    src += src_stride * 4;
    dst += dst_stride * 4;
  }
}

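// Vertically filters one row of 4 output pixels from the 8 source rows at
// src_ptr, interleaving adjacent rows so each pmaddubsw applies one
// filter-tap pair.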
static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

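// Vertical pass of the scaled convolution for w == 4: per output row, filter
// when the q4 phase is non-zero, otherwise copy the aligned source row.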
static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *y_filters,
                                   int y0_q4, int y_step_q4, int w, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }

    y_q4 += y_step_q4;
  }
}

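// Vertically filters one row of 8 output pixels from the 8 source rows at
// src_ptr; same interleave-and-pmaddubsw scheme as filter_vert_w4_ssse3.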
static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}

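// Vertical pass of the scaled convolution for w == 8; mirrors
// scaledconvolve_vert_w4().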
static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *y_filters,
                                   int y0_q4, int y_step_q4, int w, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

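// Vertically filters one row of w output pixels (w a multiple of 16) from the
// 8 source rows at src_ptr, processing 16 columns per iteration in low/high
// byte halves.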
static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter, int w) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  int i;

  for (i = 0; i < w; i += 16) {
    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
    const __m128i C =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
    const __m128i D =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
    const __m128i E =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
    const __m128i F =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i G =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i H =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
    // merge the result together
    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
    // add and saturate the results together
    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
    // merge the result together
    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
    // merge the result together
    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
    // add and saturate the results together
    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));

    // add and saturate the results together
    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
    // round and shift by 7 bit each 16 bit
    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
    // shrink to 8 bit each 16 bits, the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    src_ptr += 16;
    // save 16 bytes convolve result
    _mm_store_si128((__m128i *)&dst[i], temp_hi);
  }
}

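// Vertical pass of the scaled convolution for w >= 16; mirrors
// scaledconvolve_vert_w4().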
static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *y_filters,
                                    int y0_q4, int y_step_q4, int w, int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
                            w);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters,
                             int x0_q4, int x_step_q4,
                             const InterpKernel *const y_filters,
                             int y0_q4, int y_step_q4,
                             int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
                            w, intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
                            w, intermediate_height);
  }

  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  }
}

static const InterpKernel *get_filter_base(const int16_t *filter) {
  // NOTE: This assumes that the filter table is 256-byte aligned.
  // TODO(agrange) Modify to make independent of table alignment.
  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
}

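// Returns the sub-pixel (q4) phase of filter f, i.e. its kernel index within
// the table starting at base.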
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  return (int)((const InterpKernel *)(intptr_t)f - base);
}

void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

  scaledconvolve2d(src, src_stride, dst, dst_stride,
                   filters_x, x0_q4, x_step_q4,
                   filters_y, y0_q4, y_step_q4, w, h);
}

// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                          uint8_t *dst, ptrdiff_t dst_stride,
//                          const int16_t *filter_x, int x_step_q4,
//                          const int16_t *filter_y, int y_step_q4,
//                          int w, int h);
// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_, ssse3);