/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/av1_rtcd.h"

#include "third_party/SVT-AV1/convolve_avx2.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/synonyms.h"

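// AVX2 implementations of the single-reference (sr) low-bitdepth vertical
// and horizontal convolutions. For orientation, each output pixel computed
// below corresponds to this scalar sketch (the tap count, coefficient
// pre-halving, and rounding split vary by branch; clip_pixel and
// ROUND_POWER_OF_TWO are from aom_dsp_common.h):
//
//   int32_t sum = 0;
//   for (int k = 0; k < taps; ++k)
//     sum += filter[k] * src[(r + k - (taps / 2 - 1)) * src_stride + c];
//   dst[r * dst_stride + c] =
//       clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));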
static AOM_INLINE void av1_convolve_y_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
  // The right shift is FILTER_BITS - 1 because the filter coefficients have
  // already been divided by 2.
  const int right_shift_bits = (FILTER_BITS - 1);
  __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
  __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);
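  // right_shift_const equals 1 << (right_shift_bits - 1), the offset that
  // turns the arithmetic right shift below into a round-to-nearest.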

  __m256i coeffs[6], s[12];
  __m128i d[10];

  int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  if (vert_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
  else if (vert_tap == 12) {
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
  }
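  // The lowbd variants halve the filter taps and pack adjacent pairs as
  // 8-bit values for _mm256_maddubs_epi16; prepare_coeffs_12taps keeps
  // 16-bit pairs for _mm256_madd_epi16 (see aom_dsp/x86/convolve_avx2.h),
  // which is why the 12-tap branch accumulates in 32 bits.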

  // 4-tap vertical filter
  if (vert_tap == 4) {
    const int fo_vert = 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

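      // Each s[] register now holds two vertically adjacent rows interleaved
      // byte-wise: rows (i, i+1) in the lower 128-bit lane and (i+1, i+2) in
      // the upper lane, so one convolve call yields two output rows.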
      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
        const __m256i src_45a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
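        // Slide the two-row window down by two rows; only the newest row
        // pair (s[2]/s[5]) is reloaded on the next iteration.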
        s[0] = s[1];
        s[1] = s[2];

        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 6) {
    const int fo_vert = vert_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
      const __m256i src_34a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

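      // src6 carries the most recently loaded row across iterations so the
      // bottom row of the previous pair does not have to be reloaded.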
      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_45a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            src6, 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 12) {
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    const __m256i v_zero = _mm256_setzero_si256();
    right_shift = _mm_cvtsi32_si128(FILTER_BITS);
    right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);

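    // The 12-tap filter keeps full-precision 16-bit coefficients: samples
    // are widened to 16 bits, products accumulate in 32 bits, and the final
    // shift is the full FILTER_BITS rather than FILTER_BITS - 1.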
    for (int j = 0; j < w; j += 8) {
      const uint8_t *data = &src_ptr[j];
      __m256i src10;

      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      const __m256i src_56a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);

      const __m256i src_67a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);

      const __m256i src_78a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);

      const __m256i src_89a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);

      src10 = _mm256_castsi128_si256(
          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
      const __m256i src_910a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);

      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);

      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
      s[4] = _mm256_unpacklo_epi16(src_89, src_910);

      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
      s[10] = _mm256_unpackhi_epi16(src_89, src_910);

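      // s[0..4] and s[6..10] hold the five preloaded row pairs for columns
      // 0..3 and 4..7; the sixth pair (s[5]/s[11]) is built inside the loop
      // from the two rows loaded there.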
      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_1011a = _mm256_permute2x128_si256(
            src10,
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            0x20);

        src10 = _mm256_castsi128_si256(
            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));

        const __m256i src_1112a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            src10, 0x20);

        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);

        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);

        const __m256i res_lo = convolve_12taps(s, coeffs);

        const __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 4) {
          const __m256i res_hi = convolve_12taps(s + 6, coeffs);

          const __m256i res_32b_hi = _mm256_sra_epi32(
              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 2) {
            *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
            *(int *)&dst[i * dst_stride + j + dst_stride] =
                _mm_cvtsi128_si32(res_1);
          } else {
            *(uint16_t *)&dst[i * dst_stride + j] =
                (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
                (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];
        s[3] = s[4];
        s[4] = s[5];

        s[6] = s[7];
        s[7] = s[8];
        s[8] = s[9];
        s[9] = s[10];
        s[10] = s[11];
      }
    }
  } else {
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      const __m256i src_56a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);

      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);

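      // Generic 8-tap path: same two-rows-per-iteration scheme as the 4- and
      // 6-tap branches, with a four-pair sliding window (s[0..3] for the low
      // bytes, s[4..7] for the high bytes).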
      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_67a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
        const __m256i src_78a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);

        const __m256i res_lo = convolve_lowbd(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}

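// Dispatcher for the vertical single-reference convolve. Only the 12-tap
// filter needs the general kernel above; shorter filters are handled by the
// specialized kernel from the third_party/SVT-AV1 header included at the top.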
void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t w,
                            int32_t h,
                            const InterpFilterParams *filter_params_y,
                            const int32_t subpel_y_q4) {
  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);

  if (vert_tap == 12) {
    av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_q4);
  } else {
    av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
                                       filter_params_y, subpel_y_q4);
  }
}

static AOM_INLINE void av1_convolve_x_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
  __m256i round_0_const =
      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
  __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
  __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
  int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
  assert(conv_params->round_0 > 0);

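  // The horizontal path rounds in two stages: by round_0 - 1 first (the
  // remaining bit comes from the halved coefficients), then by
  // bits = FILTER_BITS - round_0; the 12-tap branch below rebuilds these
  // constants since its coefficients are not halved.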
  __m256i coeffs[6], filt[4];
  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));

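  // filt[] holds the byte-shuffle patterns from filt_global_avx2 (declared
  // in aom_dsp/x86/convolve_avx2.h) that gather the sliding sample windows
  // consumed by the convolve_lowbd_x helpers.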
  if (horiz_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
  else if (horiz_tap == 12) {
    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
  }

  // 4-tap horizontal filter
  if (horiz_tap == 4) {
    const int fo_horiz = 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);

        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else if (horiz_tap == 6) {
    const int fo_horiz = horiz_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else if (horiz_tap == 12) {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    const __m256i v_zero = _mm256_setzero_si256();
    round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
    round_const = _mm256_set1_epi32((1 << bits) >> 1);
    round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
    __m256i s[6];

    if (w <= 4) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);
        // row0 0..7 row1 0..7
        const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
        // row0 8..F row1 8..F
        const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);

        // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
        const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
        // row0 04 04 .. 07 07 row1 04 04 .. 07 07
        const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

        // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
        const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
        // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
        const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

        // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
        s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
        // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
        s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
        // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
        s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
        // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
        s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
        // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
        s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
        // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
        s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);

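        // s[0..5] are the six overlapping 16-bit sample pairs per output
        // pixel that convolve_12taps multiplies against the six coefficient
        // pairs; the two rows live in the two 128-bit lanes.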
        const __m256i res_lo = convolve_12taps(s, coeffs);

        __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

        // 00 01 02 03 10 11 12 13
        res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
                                      round_shift);
        // 8 bit conversion and saturation to uint8
        // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
        if (w > 2) {
          // 00 01 02 03
          *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0);
          // 10 11 12 13
          *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1);
        } else {
          // 00 01
          *(uint16_t *)&dst[i * dst_stride] =
              (uint16_t)_mm_cvtsi128_si32(res_0);
          // 10 11
          *(uint16_t *)&dst[i * dst_stride + dst_stride] =
              (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; i++) {
        for (int j = 0; j < w; j += 8) {
          const __m256i data = _mm256_permute2x128_si256(
              _mm256_castsi128_si256(
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
              _mm256_castsi128_si256(_mm_loadu_si128(
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
              0x20);
          // row0 0..7 4..B
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
          // row0 8..F C..13
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);

          // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
          // row0 04 04 .. 07 07 08 08 .. 0B 0B
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

          // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
          // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);

          const __m256i res_lo = convolve_12taps(s, coeffs);

          __m256i res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

          res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_32b_lo, round_const), round_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
          *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1);
        }
      }
    }
  } else {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  }
}

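// Dispatcher for the horizontal single-reference convolve, mirroring the
// vertical wrapper above.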
void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t w,
                            int32_t h,
                            const InterpFilterParams *filter_params_x,
                            const int32_t subpel_x_q4,
                            ConvolveParams *conv_params) {
  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);

  if (horz_tap == 12) {
    av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_q4, conv_params);
  } else {
    av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
                                       filter_params_x, subpel_x_q4,
                                       conv_params);
  }
}