/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/av1_rtcd.h"

#include "third_party/SVT-AV1/convolve_avx2.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/synonyms.h"
static AOM_INLINE void av1_convolve_y_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) {
  // right shift is F-1 because we are already dividing
  // filter coefficients by 2
  const int right_shift_bits = (FILTER_BITS - 1);
  __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
  __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);
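  // Note: right_shift_const equals 1 << (right_shift_bits - 1), so adding it
  // before the arithmetic shift rounds the filtered sum to nearest.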

  __m256i coeffs[6], s[12];
  __m128i d[10];

  int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);

  if (vert_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
  else if (vert_tap == 12) {
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
  }
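  // Per the helpers in aom_dsp/x86/convolve_avx2.h, the lowbd variants pack
  // halved coefficients as 8-bit pairs for maddubs-style filtering, while the
  // 12-tap variant keeps 16-bit coefficients; that is why the 12-tap branch
  // below switches to 32-bit accumulation and a full FILTER_BITS shift.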

  // vert_filt as 4 tap
  if (vert_tap == 4) {
    const int fo_vert = 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));

      // Load lines a and b. Line a to lower 128, line b to upper 128
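      // (permute2x128 with control 0x20 takes the low 128 bits of each
      // operand, so every src_##a register carries two consecutive rows.)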
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

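      // Interleave bytes of vertically adjacent rows; the convolve helpers
      // multiply these (row k, row k+1) byte pairs against the packed
      // coefficient pairs with maddubs.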
      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
        const __m256i src_45a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];

        s[3] = s[4];
        s[4] = s[5];
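        // Slide the 4-tap window down two rows: the rotation above keeps the
        // interleaved pairs live, so only two new rows are loaded per pass.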
      }
    }
  } else if (vert_tap == 6) {
    const int fo_vert = vert_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
      const __m256i src_34a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_45a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            src6, 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 12) {
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    const __m256i v_zero = _mm256_setzero_si256();
    right_shift = _mm_cvtsi32_si128(FILTER_BITS);
    right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
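    // The 12-tap path widens samples to 16 bits and accumulates in 32 bits
    // with unhalved coefficients, so it rounds with the full FILTER_BITS
    // shift rather than FILTER_BITS - 1.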

    for (int j = 0; j < w; j += 8) {
      const uint8_t *data = &src_ptr[j];
      __m256i src10;

      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      const __m256i src_56a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);

      const __m256i src_67a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);

      const __m256i src_78a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);

      const __m256i src_89a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);

      src10 = _mm256_castsi128_si256(
          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
      const __m256i src_910a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);

      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);

      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
      s[4] = _mm256_unpacklo_epi16(src_89, src_910);

      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
      s[10] = _mm256_unpackhi_epi16(src_89, src_910);
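      // Rows 0..10 are now staged in s[0..4] (low halves) and s[6..10]
      // (high halves); each loop iteration only has to build the final
      // s[5]/s[11] pair from the two newest rows.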

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_1011a = _mm256_permute2x128_si256(
            src10,
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            0x20);

        src10 = _mm256_castsi128_si256(
            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));

        const __m256i src_1112a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            src10, 0x20);

        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);

        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);

        const __m256i res_lo = convolve_12taps(s, coeffs);

        const __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
        // saturating 32 bit to 16 bit conversion
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 4) {
          const __m256i res_hi = convolve_12taps(s + 6, coeffs);

          const __m256i res_32b_hi = _mm256_sra_epi32(
              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 2) {
            *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
            *(int *)&dst[i * dst_stride + j + dst_stride] =
                _mm_cvtsi128_si32(res_1);
          } else {
            *(uint16_t *)&dst[i * dst_stride + j] =
                (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
                (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];
        s[3] = s[4];
        s[4] = s[5];

        s[6] = s[7];
        s[7] = s[8];
        s[8] = s[9];
        s[9] = s[10];
        s[10] = s[11];
      }
    }
  } else {
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (int j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      const __m256i src_56a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);

      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
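      // Default 8-tap path: rows 0..6 are preloaded above; each iteration
      // below appends rows 7 and 8 and rotates the six live pair registers.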

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_67a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
        const __m256i src_78a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);

        const __m256i res_lo = convolve_lowbd(s, coeffs);

        /* rounding code */
        // shift by F - 1
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8 bit conversion and saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);

          /* rounding code */
          // shift by F - 1
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}

void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t w,
                            int32_t h,
                            const InterpFilterParams *filter_params_y,
                            const int32_t subpel_y_q4) {
  const int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);

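  // Only the 12-tap filter takes the general kernel above; everything else
  // goes to the specialized kernels pulled in via the SVT-AV1 header
  // included at the top of this file.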
  if (vert_tap == 12) {
    av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_q4);
  } else {
    av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
                                       filter_params_y, subpel_y_q4);
  }
}

static AOM_INLINE void av1_convolve_x_sr_general_avx2(
    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
  __m256i round_0_const =
      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
  __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
  __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
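  // Two-stage rounding: first by round_0 - 1 (the lowbd coefficients are
  // pre-halved), then by bits = FILTER_BITS - round_0; together with the
  // halved coefficients this matches the full FILTER_BITS downshift.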
  int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
  assert(conv_params->round_0 > 0);

  __m256i coeffs[6], filt[4];
  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
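  // filt_global_avx2 (defined in aom_dsp/x86/convolve_avx2.h) is understood
  // to hold the shuffle masks that build the sliding byte-pair windows
  // consumed by the maddubs-based horizontal kernels.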

  if (horiz_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
  else if (horiz_tap == 12) {
    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
  }

  // horz_filt as 4 tap
  if (horiz_tap == 4) {
    const int fo_horiz = 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);

        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
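          // 216 == 0xD8 selects 64-bit lanes {0, 2, 1, 3}, gathering the
          // valid low quadword of each 128-bit half into the bottom 128 bits
          // so the 16 output pixels are contiguous.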
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else if (horiz_tap == 6) {
    const int fo_horiz = horiz_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else if (horiz_tap == 12) {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    const __m256i v_zero = _mm256_setzero_si256();
    round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
    round_const = _mm256_set1_epi32((1 << bits) >> 1);
    round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
    __m256i s[6];

    if (w <= 4) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);
        // row0 0..7 row1 0..7
        const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
        // row0 8..F row1 8..F
        const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);

        // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
        const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
        // row0 04 04 .. 07 07 row1 04 04 .. 07 07
        const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

        // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
        const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
        // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
        const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

        // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
        s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
        // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
        s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
        // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
        s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
        // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
        s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
        // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
        s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
        // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
        s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);
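        // Because every 16-bit sample is duplicated by the self-unpacks
        // above, an alignr by 2 bytes yields consecutive-sample pairs
        // (k, k+1), and an alignr by 10 starts those pairs two samples
        // later, producing the six pair windows convolve_12taps multiplies
        // with madd.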

        const __m256i res_lo = convolve_12taps(s, coeffs);

        __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

        // 00 01 02 03 10 11 12 13
        res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
                                      round_shift);
        // saturating 32 bit to 16 bit conversion
        // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        // 8 bit conversion and saturation to uint8
        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
        if (w > 2) {
          // 00 01 02 03
          *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0);
          // 10 11 12 13
          *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1);
        } else {
          // 00 01
          *(uint16_t *)&dst[i * dst_stride] =
              (uint16_t)_mm_cvtsi128_si32(res_0);
          // 10 11
          *(uint16_t *)&dst[i * dst_stride + dst_stride] =
              (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; i++) {
        for (int j = 0; j < w; j += 8) {
          const __m256i data = _mm256_permute2x128_si256(
              _mm256_castsi128_si256(
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
              _mm256_castsi128_si256(_mm_loadu_si128(
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
              0x20);
          // row0 0..7 4..B
          const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero);
          // row0 8..F C..13
          const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero);

          // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
          const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo);
          // row0 04 04 .. 07 07 08 08 .. 0B 0B
          const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo);

          // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
          const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi);
          // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
          const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi);

          s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2);
          s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10);
          s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2);
          s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10);
          s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2);
          s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10);

          const __m256i res_lo = convolve_12taps(s, coeffs);

          __m256i res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

          res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_32b_lo, round_const), round_shift);
          // saturating 32 bit to 16 bit conversion
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
          // 8 bit conversion and saturation to uint8
          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0);
          *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1);
        }
      }
    }
  } else {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        /* rounding code */
        // 8 bit conversion and saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          /* rounding code */
          // 8 bit conversion and saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  }
}

void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t w,
                            int32_t h,
                            const InterpFilterParams *filter_params_x,
                            const int32_t subpel_x_q4,
                            ConvolveParams *conv_params) {
  const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);

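  // As in the vertical wrapper, only the 12-tap filter falls back to the
  // general kernel; all other tap counts use the specialized path.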
  if (horz_tap == 12) {
    av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_q4, conv_params);
  } else {
    av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
                                       filter_params_x, subpel_x_q4,
                                       conv_params);
  }
}