/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/synonyms.h"

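// Vertical-only single-reference convolution for 8-bit pixels. The frame is
// processed in 16-pixel-wide column strips, two output rows per iteration,
// with a dedicated path when the vertical filter reduces to 4 taps.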
void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_x_q4, const int subpel_y_q4,
                            ConvolveParams *conv_params) {
  int i, j, is_vert_4tap = 0;
  // The right shift is FILTER_BITS - 1 because the filter coefficients have
  // already been divided by 2.
  const int right_shift_bits = (FILTER_BITS - 1);
  const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
  const __m256i right_shift_const =
      _mm256_set1_epi16((1 << right_shift_bits) >> 1);

  assert(conv_params->round_0 <= FILTER_BITS);
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

  (void)filter_params_x;
  (void)subpel_x_q4;
  (void)conv_params;
  __m256i coeffs[4], s[8];
  __m128i d[6];

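  // prepare_coeffs_lowbd() broadcasts the filter taps as four (even, odd)
  // 8-bit pairs: coeffs[k] holds taps 2k and 2k + 1 in every lane, ready to
  // be multiplied against byte-interleaved source rows with maddubs.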
  prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);

  // Use the 4-tap path when the outer tap pairs (coeffs[0] and coeffs[3])
  // are all zero, i.e. the 8-tap filter reduces to a 4-tap filter.
  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
    is_vert_4tap = 1;

  // 4-tap vertical filter path
  if (is_vert_4tap) {
    const int fo_vert = 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    for (j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
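      // The 0x20 selector keeps the low 128 bits of each operand, so each
      // 256-bit register carries two consecutive source rows.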

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
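      // Byte-interleaving adjacent rows pairs each pixel with its vertical
      // neighbour: s[0..2] cover output pixels 0..7, s[3..5] pixels 8..15,
      // matching the (even, odd) coefficient pairs consumed by maddubs.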

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
        const __m256i src_45a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

        // d[4] is reused as a rolling buffer: it now holds the row that
        // becomes row 4 of the window on the next iteration.
        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
        // Round and shift by FILTER_BITS - 1.
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8-bit conversion with saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);

          // Round and shift by FILTER_BITS - 1.
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8-bit conversion with saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
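        // Slide the interleaved-row window down by two rows for the next
        // pair of output rows.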
        s[0] = s[1];
        s[1] = s[2];

        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else {
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      const __m256i src_56a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);

      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
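      // Six of the eight interleaved row pairs are primed here; the inner
      // loop fills in s[3] and s[7] as each new pair of rows arrives.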

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_67a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
        const __m256i src_78a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);

        const __m256i res_lo = convolve_lowbd(s, coeffs);

        // Round and shift by FILTER_BITS - 1.
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // 8-bit conversion with saturation to uint8
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);

          // Round and shift by FILTER_BITS - 1.
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // 8-bit conversion with saturation to uint8
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
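        // Slide the eight-row window down by two rows for the next pair of
        // output rows.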
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}

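// Horizontal-only single-reference convolution for 8-bit pixels, with
// dedicated 4-tap and 8-tap paths and a two-stage rounding scheme driven by
// conv_params->round_0.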
void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_x_q4, const int subpel_y_q4,
                            ConvolveParams *conv_params) {
  const int bits = FILTER_BITS - conv_params->round_0;

  const __m256i round_0_const =
      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
  const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
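  // Stage 1 rounds by round_0 - 1 (the filter coefficients were already
  // divided by 2); stage 2 applies the remaining FILTER_BITS - round_0 bits.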
  int i, is_horiz_4tap = 0;
  (void)filter_params_y;
  (void)subpel_y_q4;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
  assert(conv_params->round_0 > 0);

  __m256i coeffs[4], filt[4];
  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
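  // filt_global_avx2 holds the shuffle masks that gather the sliding byte
  // pairs for each tap pair; the 4-tap path needs only the first two masks.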

  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);

  // Use the 4-tap path when the outer tap pairs (coeffs[0] and coeffs[3])
  // are all zero, i.e. the 8-tap filter reduces to a 4-tap filter.
  if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
    is_horiz_4tap = 1;

  // 4-tap horizontal filter path
  if (is_horiz_4tap) {
    const int fo_horiz = 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        // 8-bit conversion with saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);

        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // Load 16 pixels into the low lane and pixels 8..23 into the high
          // lane: 0 1 2 ... 15 | 8 9 10 ... 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          // 8-bit conversion with saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
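          // 216 == 0xD8 picks 64-bit lanes 0, 2, 1, 3, undoing the per-lane
          // interleave introduced by the pack above.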
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        // 8-bit conversion with saturation to uint8
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // Load 16 pixels into the low lane and pixels 8..23 into the high
          // lane: 0 1 2 ... 15 | 8 9 10 ... 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          // 8-bit conversion with saturation to uint8
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
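          // 216 == 0xD8 picks 64-bit lanes 0, 2, 1, 3, undoing the per-lane
          // interleave introduced by the pack above.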
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  }
}