1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
#include <assert.h>
#include <emmintrin.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
18
/* Horizontal sub-pel convolution for distance-weighted ("jnt") compound
 * prediction, 8-bit input, SSE2.
 *
 * Each row of `src` is filtered with the horizontal filter selected by
 * filter_params_x / subpel_x_qn.  Depending on conv_params->do_average:
 *   - do_average == 0: the offset 16-bit intermediate result is stored into
 *     the compound buffer conv_params->dst (CONV_BUF_TYPE), or
 *   - do_average != 0: the intermediate is combined with the value already
 *     in conv_params->dst via comp_avg() (distance-weighted by
 *     fwd_offset/bck_offset when use_dist_wtd_comp_avg is set), rounded by
 *     convolve_rounding(), clamped to 8 bits and written to dst0.
 *
 * w must be 4 or a multiple of 8 (asserted for the wide path below).
 */
void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
                                  uint8_t *dst0, int dst_stride0, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  const int subpel_x_qn,
                                  ConvolveParams *conv_params) {
  const int bd = 8;  // bit depth of this low-bitdepth path
  CONV_BUF_TYPE *dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  // Center the filter on the output pixel: step back (taps/2 - 1) columns.
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - fo_horiz;
  // After the round_0 shift, results are shifted left by this amount so the
  // intermediate carries FILTER_BITS - round_1 extra precision.
  const int bits = FILTER_BITS - conv_params->round_1;
  const __m128i left_shift = _mm_cvtsi32_si128(bits);
  // Round-half-up constant/shift for the first rounding stage (round_0).
  const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1);
  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
  // Per-reference distance weights, interleaved pairwise for comp_avg().
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi16(w0);
  const __m128i wt1 = _mm_set1_epi16(w1);
  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
  const int do_average = conv_params->do_average;
  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  // Bias added so the signed intermediate packs into a non-negative 16-bit
  // range; presumably removed again inside convolve_rounding() on the
  // averaging pass (helper defined in convolve_sse2.h — not visible here).
  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m128i offset_const = _mm_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
  __m128i coeffs[4];  // filter taps packed in adjacent pairs for pmaddwd

  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);

  if (w == 4) {
    do {
      // NOTE(review): a full 16-byte load where only (4 + taps - 1) bytes
      // are needed — assumes the frame buffer has padded borders; confirm.
      const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
      __m128i s[4];

      // Interleave bytes at offsets (2k, 2k+1) so each 16-bit lane of s[k]
      // holds an adjacent pixel pair, matching the paired coeffs layout.
      s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
      s[1] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
      s[2] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
      s[3] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
      const __m128i res_lo = convolve_lo_x(s, coeffs);
      // First rounding stage (round_0), then restore precision (bits).
      const __m128i res_lo_round =
          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
      const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);

      // Pack to 16 bits and bias into the unsigned intermediate domain.
      const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift);
      const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);

      // Accumulate values into the destination buffer
      if (do_average) {
        const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);

        const __m128i comp_avg_res =
            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

        const __m128i round_result = convolve_rounding(
            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

        const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
        // NOTE(review): 4-byte store through an int* into a uint8_t buffer —
        // unaligned/type-punned; works on x86 but technically UB (strict
        // aliasing).  Consider memcpy if this is ever revisited.
        *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
      } else {
        // Only the low 4 lanes are meaningful, but a full 16-byte store is
        // used — assumes the compound buffer row is at least 8 entries wide.
        _mm_store_si128((__m128i *)(&dst[0]), res_unsigned);
      }
      src_ptr += src_stride;
      dst += dst_stride;
      dst0 += dst_stride0;
    } while (--h);
  } else {
    assert(!(w % 8));
    int i = 0;
    do {
      int j = 0;
      do {
        const __m128i data =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
        __m128i s[4];

        // Filter even-index pixels
        s[0] = data;
        s[1] = _mm_srli_si128(data, 2);
        s[2] = _mm_srli_si128(data, 4);
        s[3] = _mm_srli_si128(data, 6);
        const __m128i res_even = convolve_lo_x(s, coeffs);

        // Filter odd-index pixels
        s[0] = _mm_srli_si128(data, 1);
        s[1] = _mm_srli_si128(data, 3);
        s[2] = _mm_srli_si128(data, 5);
        s[3] = _mm_srli_si128(data, 7);
        const __m128i res_odd = convolve_lo_x(s, coeffs);

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
        // First rounding stage (round_0), then restore precision (bits).
        const __m128i res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
        const __m128i res_hi_round =
            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
        const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
        const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift);

        // Pack to 16 bits and bias into the unsigned intermediate domain.
        const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
        const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);

        // Accumulate values into the destination buffer
        if (do_average) {
          const __m128i data_ref_0 =
              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));

          const __m128i comp_avg_res =
              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

          const __m128i round_result = convolve_rounding(
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
        } else {
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
        }
        j += 8;
      } while (j < w);
    } while (++i < h);
  }
}
148
/* Vertical sub-pel convolution for distance-weighted ("jnt") compound
 * prediction, 8-bit input, SSE2.
 *
 * Each column of `src` is filtered with the 8-tap vertical filter selected
 * by filter_params_y / subpel_y_qn.  Output handling mirrors the horizontal
 * kernel: do_average == 0 stores the offset 16-bit intermediate into
 * conv_params->dst; do_average != 0 combines with the existing value via
 * comp_avg() / convolve_rounding() and writes 8-bit pixels to dst0.
 *
 * The loops are software-pipelined: s[0..5] hold the six row-pair
 * interleavings needed for the first output row, and the main loop
 * produces TWO output rows per iteration, then rotates the window.
 * h is therefore assumed to be even (hedged — no assert here; confirm
 * against callers).  w must be 4 or a multiple of 8.
 *
 * Note this kernel shifts left first and rounds by round_1 second —
 * the reverse stage order of the horizontal kernel (round_0 then shift).
 */
void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
                                  uint8_t *dst0, int dst_stride0, int w, int h,
                                  const InterpFilterParams *filter_params_y,
                                  const int subpel_y_qn,
                                  ConvolveParams *conv_params) {
  const int bd = 8;  // bit depth of this low-bitdepth path
  CONV_BUF_TYPE *dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  // Center the filter on the output row: step back (taps/2 - 1) rows.
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint8_t *src_ptr = src - fo_vert * src_stride;
  // Pre-shift restoring FILTER_BITS - round_0 bits of precision.
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i left_shift = _mm_cvtsi32_si128(bits);
  // Per-reference distance weights, interleaved pairwise for comp_avg().
  const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset);
  const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset);
  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
  const int do_average = conv_params->do_average;
  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  // Bias into a non-negative 16-bit intermediate; presumably undone inside
  // convolve_rounding() (helper from convolve_sse2.h — not visible here).
  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m128i offset_const = _mm_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
  // Round-half-up constant/shift for the round_1 stage.
  const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1);
  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
  __m128i coeffs[4];  // filter taps packed in adjacent pairs for pmaddwd

  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);

  if (w == 4) {
    __m128i s[8], src6, res, res_shift;
    // Prologue: load rows 0..6 (4 pixels each) and build the interleaved
    // row pairs s[k] = rows (k, k+1), as required by the paired coeffs.
    // src6 caches row 6 so each row is loaded only once across iterations.
    src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
    s[0] = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
        _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
    s[1] = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
        _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
    s[2] = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
        _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
    s[3] = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
        _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
    s[4] = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
        _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
    s[5] = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);

    do {
      // Extend the window by two new rows (7 and 8).  The row-8 load on the
      // last iteration reads one row past the filtered region — assumes
      // padded frame borders; confirm.
      s[6] = _mm_unpacklo_epi8(
          src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
      src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
      s[7] = _mm_unpacklo_epi8(
          _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);

      // First output row: shift left (bits), then round by round_1.
      res = convolve_lo_y(s + 0, coeffs);
      res_shift = _mm_sll_epi32(res, left_shift);
      res_shift =
          _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);

      __m128i res_16b = _mm_packs_epi32(res_shift, res_shift);
      __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);

      // Accumulate values into the destination buffer
      if (do_average) {
        const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);

        const __m128i comp_avg_res =
            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

        const __m128i round_result = convolve_rounding(
            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

        const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
        // NOTE(review): 4-byte type-punned store into a uint8_t buffer —
        // x86-safe but technically violates strict aliasing.
        *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);

      } else {
        _mm_store_si128((__m128i *)dst, res_unsigned);
      }

      src_ptr += src_stride;
      dst += dst_stride;
      dst0 += dst_stride0;

      // Second output row of the pair: same pipeline, window advanced by
      // one row pair (s + 1).
      res = convolve_lo_y(s + 1, coeffs);
      res_shift = _mm_sll_epi32(res, left_shift);
      res_shift =
          _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);

      res_16b = _mm_packs_epi32(res_shift, res_shift);
      res_unsigned = _mm_add_epi16(res_16b, offset_const);

      // Accumulate values into the destination buffer
      if (do_average) {
        const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);

        const __m128i comp_avg_res =
            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

        const __m128i round_result = convolve_rounding(
            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

        const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
        *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);

      } else {
        _mm_store_si128((__m128i *)dst, res_unsigned);
      }

      src_ptr += src_stride;
      dst += dst_stride;
      dst0 += dst_stride0;

      // Rotate the pipelined window down by two rows.
      s[0] = s[2];
      s[1] = s[3];
      s[2] = s[4];
      s[3] = s[5];
      s[4] = s[6];
      s[5] = s[7];
      h -= 2;
    } while (h);
  } else {
    assert(!(w % 8));
    int j = 0;
    do {
      // Process one 8-pixel-wide column strip at a time, top to bottom.
      __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift;
      const uint8_t *data = &src_ptr[j];

      // Prologue: interleave row pairs 0..6 (8 pixels each); src6 caches
      // row 6 so each row is loaded only once.
      src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
      s[0] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
      s[1] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
      s[2] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
      s[3] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
      s[4] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
      s[5] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);

      int i = 0;
      do {
        // Extend the window by rows 7 and 8 (row-8 load over-reads on the
        // last iteration — assumes padded frame borders; confirm).
        data = &src_ptr[i * src_stride + j];
        s[6] = _mm_unpacklo_epi8(
            src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
        src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
        s[7] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);

        res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
        res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels
        // Shift left (bits), then round by round_1.
        res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
        res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
        res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
                                     round_shift);
        res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
                                     round_shift);

        __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
        __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);

        // Accumulate values into the destination buffer
        if (do_average) {
          const __m128i data_ref_0 =
              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));

          const __m128i comp_avg_res =
              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

          const __m128i round_result = convolve_rounding(
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
        } else {
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
        }
        i++;

        // Second output row of the pair, window advanced by one (s + 1).
        res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
        res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels
        res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
        res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
        res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
                                     round_shift);
        res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
                                     round_shift);
        res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
        res_unsigned = _mm_add_epi16(res_16b, offset_const);

        // Accumulate values into the destination buffer
        if (do_average) {
          __m128i data_ref_0 =
              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));

          const __m128i comp_avg_res =
              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

          const __m128i round_result = convolve_rounding(
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
        } else {
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
        }
        i++;

        // Rotate the pipelined window down by two rows.
        s[0] = s[2];
        s[1] = s[3];
        s[2] = s[4];
        s[3] = s[5];
        s[4] = s[6];
        s[5] = s[7];
      } while (i < h);
      j += 8;
    } while (j < w);
  }
}
378