1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 // Due to a header conflict between math.h and intrinsics includes with ceil()
12 // in certain configurations under vs9 this include needs to precede
13 // tmmintrin.h.
14
#include <tmmintrin.h>

#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"
23
// Byte-shuffle masks for _mm_shuffle_epi8.  They gather the overlapping
// source-byte pairs that _mm_maddubs_epi16 multiplies against the packed
// filter taps in the horizontal convolutions below.

// filters only for the 4_h8 convolution: the low 8 bytes select inputs for
// one tap pair and the high 8 bytes the inputs for the next tap pair
// (offset by 2), so a single register covers 4 output pixels.
DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
};

DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
};

// filters for 8_h8 and 16_h8: each mask selects the byte pairs feeding one
// pair of adjacent filter taps for 8 consecutive output pixels.
DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
49
50 // These are reused by the avx2 intrinsics.
51 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
52 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
53 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
54
vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t * src_ptr,ptrdiff_t src_pixels_per_line,uint8_t * output_ptr,ptrdiff_t output_pitch,uint32_t output_height,const int16_t * filter)55 void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
56 ptrdiff_t src_pixels_per_line,
57 uint8_t *output_ptr,
58 ptrdiff_t output_pitch,
59 uint32_t output_height,
60 const int16_t *filter) {
61 __m128i firstFilters, secondFilters, shuffle1, shuffle2;
62 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
63 __m128i addFilterReg64, filtersReg, srcReg, minReg;
64 unsigned int i;
65
66 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
67 addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
68 filtersReg = _mm_loadu_si128((const __m128i *)filter);
69 // converting the 16 bit (short) to 8 bit (byte) and have the same data
70 // in both lanes of 128 bit register.
71 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
72
73 // duplicate only the first 16 bits in the filter into the first lane
74 firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
75 // duplicate only the third 16 bit in the filter into the first lane
76 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
77 // duplicate only the seconds 16 bits in the filter into the second lane
78 // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
79 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
80 // duplicate only the forth 16 bits in the filter into the second lane
81 // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
82 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
83
84 // loading the local filters
85 shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);
86 shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
87
88 for (i = 0; i < output_height; i++) {
89 srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
90
91 // filter the source buffer
92 srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
93 srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);
94
95 // multiply 2 adjacent elements with the filter and add the result
96 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
97 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
98
99 // extract the higher half of the lane
100 srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
101 srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
102
103 minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
104
105 // add and saturate all the results together
106 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
107 srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
108 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
109 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
110 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
111
112 // shift by 7 bit each 16 bits
113 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
114
115 // shrink to 8 bit each 16 bits
116 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
117 src_ptr+=src_pixels_per_line;
118
119 // save only 4 bytes
120 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
121
122 output_ptr+=output_pitch;
123 }
124 }
125
// Horizontal 8-tap, 8-pixel-wide convolution (SSSE3).
// For each of output_height rows: reads 16 bytes at src_ptr - 3, applies
// the 8-tap filter at 8 consecutive positions, rounds (+64 then >> 7),
// saturates to 8 bits and writes 8 bytes to output_ptr.
void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
                                         ptrdiff_t src_pixels_per_line,
                                         uint8_t *output_ptr,
                                         ptrdiff_t output_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (the rounding constant for the final >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // shuffle masks that gather the byte pairs for each tap pair
  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
    srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // filter the source buffer
    srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
    srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

    // add and saturate all the results together
    // NOTE(review): summing as (k0k1 + k6k7) + min + max mirrors the
    // assembly version -- presumably to bound intermediate overflow of the
    // saturating 16-bit adds; confirm.
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr+=src_pixels_per_line;

    // save only 8 bytes
    _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);

    output_ptr+=output_pitch;
  }
}
205
// Horizontal 8-tap, 16-pixel-wide convolution (SSSE3).
// Each iteration computes two 8-pixel convolve results: bytes 0-7 from a
// 16-byte load at src_ptr - 3, bytes 8-15 from a second load at
// src_ptr + 5.  Results are rounded (+64 then >> 7), packed to 8 bits and
// written with an aligned 16-byte store, so output_ptr and output_pitch
// must preserve 16-byte alignment.
static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
                                                 ptrdiff_t src_pixels_per_line,
                                                 uint8_t *output_ptr,
                                                 ptrdiff_t output_pitch,
                                                 uint32_t output_height,
                                                 const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (the rounding constant for the final >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // shuffle masks that gather the byte pairs for each tap pair
  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer (outer tap pairs k0k1 and k6k7 first)
    srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer (inner tap pairs k2k3 and k4k5)
    srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);
    srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // add and saturate the results together
    // NOTE(review): adding min before max mirrors the assembly version --
    // presumably to bound intermediate overflow of the saturating adds;
    // confirm.
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // reading the next 16 bytes.
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer (same scheme for the high 8 output pixels)
    srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);
    srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // round both halves
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    src_ptr+=src_pixels_per_line;

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);

    output_ptr+=output_pitch;
  }
}
322
vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t out_pitch,uint32_t output_height,const int16_t * filter)323 void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
324 ptrdiff_t src_pitch,
325 uint8_t *output_ptr,
326 ptrdiff_t out_pitch,
327 uint32_t output_height,
328 const int16_t *filter) {
329 __m128i addFilterReg64, filtersReg, minReg;
330 __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
331 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
332 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
333 __m128i srcReg8;
334 unsigned int i;
335
336 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
337 addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
338 filtersReg = _mm_loadu_si128((const __m128i *)filter);
339 // converting the 16 bit (short) to 8 bit (byte) and have the same data
340 // in both lanes of 128 bit register.
341 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
342
343 // duplicate only the first 16 bits in the filter
344 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
345 // duplicate only the second 16 bits in the filter
346 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
347 // duplicate only the third 16 bits in the filter
348 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
349 // duplicate only the forth 16 bits in the filter
350 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
351
352 // load the first 7 rows of 8 bytes
353 srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
354 srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
355 srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
356 srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
357 srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
358 srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
359 srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
360
361 for (i = 0; i < output_height; i++) {
362 // load the last 8 bytes
363 srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
364
365 // merge the result together
366 srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
367 srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
368
369 // merge the result together
370 srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
371 srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
372
373 // multiply 2 adjacent elements with the filter and add the result
374 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
375 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
376 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
377 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
378
379 // add and saturate the results together
380 minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
381 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
382 srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
383 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
384 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
385 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
386
387 // shift by 7 bit each 16 bit
388 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
389
390 // shrink to 8 bit each 16 bits
391 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
392
393 src_ptr+=src_pitch;
394
395 // shift down a row
396 srcReg1 = srcReg2;
397 srcReg2 = srcReg3;
398 srcReg3 = srcReg4;
399 srcReg4 = srcReg5;
400 srcReg5 = srcReg6;
401 srcReg6 = srcReg7;
402 srcReg7 = srcReg8;
403
404 // save only 8 bytes convolve result
405 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
406
407 output_ptr+=out_pitch;
408 }
409 }
410
// Vertical 8-tap, 16-pixel-wide convolution (SSSE3).
// Keeps a sliding window of 7 source rows in registers; each iteration
// loads the 8th row, filters the low and high 8 columns separately
// (unpacklo/unpackhi), rounds (+64 then >> 7), packs to 8 bits and writes
// 16 bytes with an aligned store, so output_ptr and out_pitch must
// preserve 16-byte alignment.
static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
                                                 ptrdiff_t src_pitch,
                                                 uint8_t *output_ptr,
                                                 ptrdiff_t out_pitch,
                                                 uint32_t output_height,
                                                 const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
  __m128i srcReg8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (the rounding constant for the final >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the forth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // load the first 7 rows of 16 bytes
  srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
  srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));

  for (i = 0; i < output_height; i++) {
    // load the last 16 bytes
    srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the result together: outer row pairs (0,1) and (6,7), low and
    // high byte halves
    srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
    srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
    srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
    srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);

    // add and saturate the results together
    // (srcRegFilt5 accumulates the low 8 columns, srcRegFilt1 the high 8)
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);

    // merge the result together: row pair (2,3)
    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
    srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);

    // merge the result together: row pair (4,5)
    srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);

    // add and saturate the results together
    // NOTE(review): adding min before max mirrors the assembly version --
    // presumably to bound intermediate overflow of the saturating adds;
    // confirm.
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
                                 _mm_min_epi16(srcRegFilt3, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));

    // add and saturate the results together
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
                                 _mm_max_epi16(srcRegFilt3, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);

    src_ptr+=src_pitch;

    // shift down a row
    srcReg1 = srcReg2;
    srcReg2 = srcReg3;
    srcReg3 = srcReg4;
    srcReg4 = srcReg5;
    srcReg5 = srcReg6;
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save 16 bytes convolve result
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);

    output_ptr+=out_pitch;
  }
}
525
526 #if ARCH_X86_64
527 filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3;
528 filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3;
529 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
530 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
531 filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
532 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
533 #define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3
534 #define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3
535 #define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
536 #define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
537 #define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
538 #else // ARCH_X86
539 filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
540 filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
541 filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
542 filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
543 filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
544 filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
545 #endif // ARCH_X86_64
546 filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
547 filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
548 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
549 filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
550 filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
551 filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
552
553 filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
554 filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
555 filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
556 filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
557 filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
558 filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
559 filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
560 filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
561 filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
562 filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
563 filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
564 filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
565
566 // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
567 // uint8_t *dst, ptrdiff_t dst_stride,
568 // const int16_t *filter_x, int x_step_q4,
569 // const int16_t *filter_y, int y_step_q4,
570 // int w, int h);
571 // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
572 // uint8_t *dst, ptrdiff_t dst_stride,
573 // const int16_t *filter_x, int x_step_q4,
574 // const int16_t *filter_y, int y_step_q4,
575 // int w, int h);
576 // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
577 // uint8_t *dst, ptrdiff_t dst_stride,
578 // const int16_t *filter_x, int x_step_q4,
579 // const int16_t *filter_y, int y_step_q4,
580 // int w, int h);
581 // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
582 // uint8_t *dst, ptrdiff_t dst_stride,
583 // const int16_t *filter_x, int x_step_q4,
584 // const int16_t *filter_y, int y_step_q4,
585 // int w, int h);
586 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
587 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
588 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
589 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
590 ssse3);
591
// Transposes an 8x8 block of bytes held in the low 8 bytes of in0..in7.
// After the unpack cascade (8->16->32->64 bit granularity), outN holds
// row N of the transpose duplicated into both 64-bit halves of the
// register (only the low half is meaningful to callers).  inN/outN may
// alias the same variables.
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) { \
  const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \
  const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \
  const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \
  const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \
\
  const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \
  const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \
  const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \
  const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \
\
  const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \
  const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \
  const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \
  const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \
\
  out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \
  out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \
  out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \
  out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \
  out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \
  out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \
  out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \
  out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \
}
618
// Filters one column position for 8 source rows (horizontal scaled
// convolve helper).  Reads 8 bytes from each of the 8 rows starting at
// src_x, transposes them so each row's taps line up for
// _mm_maddubs_epi16, applies the 8-tap x_filter, rounds via
// _mm_mulhrs_epi16 with 1<<8 (equivalent to +64 then >> 7), and stores
// the 8 packed results -- one per source row -- contiguously at dst.
static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  // NOTE(review): min/max ordering presumably bounds intermediate
  // overflow of the saturating adds; confirm.
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i*)dst, temp);
}
675
// Copies an 8x8 byte block from src to dst, transposed.  Loads 8 bytes
// from each source row, transposes in registers via TRANSPOSE_8X8, and
// stores one 8-byte row of the result per destination row.
static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i row[8];
  int i;

  for (i = 0; i < 8; ++i)
    row[i] = _mm_loadl_epi64((const __m128i *)(src + src_stride * i));

  TRANSPOSE_8X8(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7],
                row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]);

  for (i = 0; i < 8; ++i)
    _mm_storel_epi64((__m128i *)(dst + dst_stride * i), row[i]);
}
701
// Horizontal pass of the scaled (x_step_q4 != 16) convolution, 8 columns
// at a time.  For each 8x8 output tile it filters 8 sub-pixel positions
// into a column-major temp block, then transposes it into dst.  When the
// sub-pixel phase is 0 it copies the center pixel directly.
static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters,
                                    int x0_q4, int x_step_q4, int w, int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;
  // step back to the first tap of the 8-tap window
  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
  // NOTE(review): when h is already a multiple of 8 this yields h + 8, i.e.
  // one extra tile row is processed -- presumably the caller over-allocates
  // the intermediate buffer to absorb this; confirm.
  y = h + (8 - (h & 0x7));

  do {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 8) {
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
        } else {
          // phase 0: no filtering needed; +3 undoes the tap-window offset
          // applied to src above, recovering the center pixel
          int i;
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 filters values back to dst
      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
    }

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}
740
// Horizontally filter 4 rows of 8 source pixels each (rows src_pitch apart),
// writing the 4 filtered bytes to dst. The 4x8 input is transposed on the fly
// so each output pixel is an 8-tap dot product built from four
// _mm_maddubs_epi16 pair-products.
static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter) {
  // With _mm_mulhrs_epi16, multiplying by 1 << 8 rounds and shifts right by 7.
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  // Requires 16-byte aligned filter storage.
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values: taps (0,1), (2,3), (4,5), (6,7),
  // each pair broadcast across the register for maddubs.
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  // TRANSPOSE...
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  //
  // TO
  //
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  // 04 14 24 34
  // 05 15 25 35
  // 06 16 26 36
  // 07 17 27 37
  //
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together.
  // The two middle products are accumulated in min-then-max order so the
  // saturating adds occur in a fixed, well-defined sequence (with signed
  // saturation the accumulation order can change the result).
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}
801
// Transpose a 4x4 block of bytes: dst[r][c] = src[c][r].
static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  const __m128i r0 = _mm_cvtsi32_si128(*(const int *)src);
  const __m128i r1 = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
  const __m128i r2 = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
  const __m128i r3 = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
  // Interleave row pairs at byte granularity:
  // 00 10 01 11 02 12 03 13
  const __m128i lo01 = _mm_unpacklo_epi8(r0, r1);
  // 20 30 21 31 22 32 23 33
  const __m128i lo23 = _mm_unpacklo_epi8(r2, r3);
  // Interleave again at 16-bit granularity to finish the transpose:
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  const __m128i t = _mm_unpacklo_epi16(lo01, lo23);

  // Each output row is one 32-bit lane of t.
  *(int *)(dst) = _mm_cvtsi128_si32(t);
  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(_mm_srli_si128(t, 4));
  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(_mm_srli_si128(t, 8));
  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(_mm_srli_si128(t, 12));
}
823
scaledconvolve_horiz_w4(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)824 static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
825 uint8_t *dst, ptrdiff_t dst_stride,
826 const InterpKernel *x_filters,
827 int x0_q4, int x_step_q4, int w, int h) {
828 DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
829 int x, y, z;
830 src -= SUBPEL_TAPS / 2 - 1;
831
832 for (y = 0; y < h; y += 4) {
833 int x_q4 = x0_q4;
834 for (x = 0; x < w; x += 4) {
835 // process 4 src_x steps
836 for (z = 0; z < 4; ++z) {
837 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
838 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
839 if (x_q4 & SUBPEL_MASK) {
840 filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
841 } else {
842 int i;
843 for (i = 0; i < 4; ++i) {
844 temp[z * 4 + i] = src_x[i * src_stride + 3];
845 }
846 }
847 x_q4 += x_step_q4;
848 }
849
850 // transpose the 4x4 filters values back to dst
851 transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
852 }
853
854 src += src_stride * 4;
855 dst += dst_stride * 4;
856 }
857 }
858
// Vertically filter one 4-wide strip: loads 4 bytes from each of 8 rows
// (src_pitch apart), pairs adjacent rows, applies the 8-tap filter via four
// _mm_maddubs_epi16 pair-products, and stores 4 output bytes to dst.
static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  // With _mm_mulhrs_epi16, multiplying by 1 << 8 rounds and shifts right by 7.
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  // Requires 16-byte aligned filter storage.
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values: tap pairs (0,1), (2,3), (4,5), (6,7)
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  // Load 4 bytes from each of the 8 tap rows.
  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
  // Interleave adjacent tap rows so maddubs can multiply-add each pair.
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together.
  // Accumulate the two middle products in min-then-max order so the
  // saturating adds occur in a fixed, well-defined sequence.
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}
898
scaledconvolve_vert_w4(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)899 static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
900 uint8_t *dst, ptrdiff_t dst_stride,
901 const InterpKernel *y_filters,
902 int y0_q4, int y_step_q4, int w, int h) {
903 int y;
904 int y_q4 = y0_q4;
905
906 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
907 for (y = 0; y < h; ++y) {
908 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
909 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
910
911 if (y_q4 & SUBPEL_MASK) {
912 filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
913 } else {
914 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
915 }
916
917 y_q4 += y_step_q4;
918 }
919 }
920
// Vertically filter one 8-wide strip: loads 8 bytes from each of 8 rows
// (src_pitch apart), pairs adjacent rows, applies the 8-tap filter via four
// _mm_maddubs_epi16 pair-products, and stores 8 output bytes to dst.
static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  // With _mm_mulhrs_epi16, multiplying by 1 << 8 rounds and shifts right by 7.
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  // Requires 16-byte aligned filter storage.
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values: tap pairs (0,1), (2,3), (4,5), (6,7)
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  // Load 8 bytes from each of the 8 tap rows.
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
  // Interleave adjacent tap rows so maddubs can multiply-add each pair.
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together.
  // Accumulate the two middle products in min-then-max order so the
  // saturating adds occur in a fixed, well-defined sequence.
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i*)dst, temp);
}
960
scaledconvolve_vert_w8(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)961 static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
962 uint8_t *dst, ptrdiff_t dst_stride,
963 const InterpKernel *y_filters,
964 int y0_q4, int y_step_q4, int w, int h) {
965 int y;
966 int y_q4 = y0_q4;
967
968 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
969 for (y = 0; y < h; ++y) {
970 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
971 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
972 if (y_q4 & SUBPEL_MASK) {
973 filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
974 } else {
975 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
976 }
977 y_q4 += y_step_q4;
978 }
979 }
980
// Vertically filter a row of w pixels (w a multiple of 16), 16 columns per
// iteration: loads 16 bytes from each of 8 tap rows, applies the 8-tap filter
// separately to the low and high byte halves, and stores 16 output bytes.
// Note src_ptr is advanced by 16 each iteration while dst is indexed by i.
static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter, int w) {
  // With _mm_mulhrs_epi16, multiplying by 1 << 8 rounds and shifts right by 7.
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  // Requires 16-byte aligned filter storage.
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values: tap pairs (0,1), (2,3), (4,5), (6,7)
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  int i;

  for (i = 0; i < w; i += 16) {
    // Unaligned loads: 16 bytes from each of the 8 tap rows.
    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
    const __m128i C =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
    const __m128i D =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
    const __m128i E =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
    const __m128i F =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i G =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i H =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
    // merge the result together: low/high byte halves of tap rows 0,1 and 6,7
    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
    // add and saturate the results together
    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
    // merge the result together: tap rows 2,3
    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
    // merge the result together: tap rows 4,5
    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
    // add and saturate the results together.
    // The two middle products are accumulated in min-then-max order so the
    // saturating adds occur in a fixed, well-defined sequence.
    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));

    // add and saturate the results together
    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
    // round and shift by 7 bit each 16 bit
    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    src_ptr += 16;
    // save 16 bytes convolve result
    _mm_store_si128((__m128i*)&dst[i], temp_hi);
  }
}
1051
scaledconvolve_vert_w16(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)1052 static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
1053 uint8_t *dst, ptrdiff_t dst_stride,
1054 const InterpKernel *y_filters,
1055 int y0_q4, int y_step_q4, int w, int h) {
1056 int y;
1057 int y_q4 = y0_q4;
1058
1059 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1060 for (y = 0; y < h; ++y) {
1061 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1062 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1063 if (y_q4 & SUBPEL_MASK) {
1064 filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
1065 w);
1066 } else {
1067 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
1068 }
1069 y_q4 += y_step_q4;
1070 }
1071 }
1072
// Scaled (sub-pixel stepping) 2D convolution: horizontal pass into a fixed
// intermediate buffer, then vertical pass into dst. Positions and steps are
// in q4 fixed point (1/16-pel units). Width/height are capped at 64 and the
// scale factor at x1/2 (step <= 32) by the asserts below.
static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters,
                             int x0_q4, int x_step_q4,
                             const InterpKernel *const y_filters,
                             int y0_q4, int y_step_q4,
                             int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  // Horizontal pass: offset src up by half the filter length so the temp
  // buffer holds the rows the vertical filter's top taps will need.
  // The intermediate buffer uses a fixed stride of 64.
  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
                            w, intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
                            w, intermediate_height);
  }

  // Vertical pass: start at the row corresponding to the original src
  // position (undoing the half-filter offset applied above).
  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
  }
}
1123
get_filter_base(const int16_t * filter)1124 static const InterpKernel *get_filter_base(const int16_t *filter) {
1125 // NOTE: This assumes that the filter table is 256-byte aligned.
1126 // TODO(agrange) Modify to make independent of table alignment.
1127 return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
1128 }
1129
// Index of the 8-tap kernel f within the 256-byte aligned table at base.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}
1133
vpx_scaled_2d_ssse3(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)1134 void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride,
1135 uint8_t *dst, ptrdiff_t dst_stride,
1136 const int16_t *filter_x, int x_step_q4,
1137 const int16_t *filter_y, int y_step_q4,
1138 int w, int h) {
1139 const InterpKernel *const filters_x = get_filter_base(filter_x);
1140 const int x0_q4 = get_filter_offset(filter_x, filters_x);
1141
1142 const InterpKernel *const filters_y = get_filter_base(filter_y);
1143 const int y0_q4 = get_filter_offset(filter_y, filters_y);
1144
1145 scaledconvolve2d(src, src_stride, dst, dst_stride,
1146 filters_x, x0_q4, x_step_q4,
1147 filters_y, y0_q4, y_step_q4, w, h);
1148 }
1149
// The FUN_CONV_2D invocations below expand to the following functions:
// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                          uint8_t *dst, ptrdiff_t dst_stride,
//                          const int16_t *filter_x, int x_step_q4,
//                          const int16_t *filter_y, int y_step_q4,
//                          int w, int h);
// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_ , ssse3);
1162