• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
#include <emmintrin.h>
#include <string.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
18 
/*
 * Horizontal pass of the distance-weighted compound convolution for 8-bit
 * sources (SSE2).
 *
 * For every output pixel the row is filtered with the kernel that
 * prepare_coeffs() derives from filter_params_x / subpel_x_qn, the sum is
 * rounded and shifted right by conv_params->round_0, shifted left by
 * (FILTER_BITS - conv_params->round_1), saturated to 16 bits and biased by
 * an offset constant.
 *
 *   - conv_params->do_average == 0: the biased 16-bit values are stored to
 *     the intermediate buffer conv_params->dst.
 *   - conv_params->do_average != 0: the values are combined with what is
 *     already in conv_params->dst via comp_avg() (using the fwd_offset /
 *     bck_offset weights and use_dist_wtd_comp_avg), passed through
 *     convolve_rounding(), packed to 8 bits with unsigned saturation and
 *     written to dst0.
 *
 * src / src_stride    8-bit source and its row stride; pixels to the left
 *                     of src (taps / 2 - 1 of them) are read as well.
 * dst0 / dst_stride0  8-bit destination, written only when averaging.
 * w, h                block size; w must be 4 or a multiple of 8 and h must
 *                     be at least 1 (both loops are do/while).
 * conv_params         rounding amounts, weights, averaging flags and the
 *                     intermediate buffer (dst, dst_stride).
 */
void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
                                  uint8_t *dst0, int dst_stride0, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  const int subpel_x_qn,
                                  ConvolveParams *conv_params) {
  const int bd = 8;
  CONV_BUF_TYPE *dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  // Start reading fo_horiz pixels to the left of the output position.
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - fo_horiz;
  const int bits = FILTER_BITS - conv_params->round_1;
  const __m128i left_shift = _mm_cvtsi32_si128(bits);
  const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1);
  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi16(w0);
  const __m128i wt1 = _mm_set1_epi16(w1);
  // Interleaved (w0, w1) pairs, handed to comp_avg().
  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
  const int do_average = conv_params->do_average;
  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m128i offset_const = _mm_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
  __m128i coeffs[4];

  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);

  if (w == 4) {
    do {
      // One unaligned 16-byte load covers the whole filter window of the
      // four output pixels in this row.
      const __m128i data = _mm_loadu_si128((const __m128i *)src_ptr);
      __m128i s[4];

      // s[k] holds the byte pairs (p[i + 2k], p[i + 2k + 1]) for i = 0..7.
      s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
      s[1] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
      s[2] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
      s[3] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
      const __m128i res_lo = convolve_lo_x(s, coeffs);
      const __m128i res_lo_round =
          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
      const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);

      // Both halves of the packed register carry the same four results.
      const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift);
      const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);

      // Accumulate values into the destination buffer
      if (do_average) {
        const __m128i data_ref_0 = _mm_loadu_si128((const __m128i *)dst);

        const __m128i comp_avg_res =
            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

        const __m128i round_result = convolve_rounding(
            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

        const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
        // Write the four output bytes with memcpy: dst0 is a byte pointer
        // with no alignment guarantee, so storing through an int * cast
        // would be undefined behaviour.
        const int res_32 = _mm_cvtsi128_si32(res_8);
        memcpy(dst0, &res_32, sizeof(res_32));
      } else {
        // Aligned full-register store: 16 bytes are written to dst for
        // this row, the upper half duplicating the lower half.
        _mm_store_si128((__m128i *)(&dst[0]), res_unsigned);
      }
      src_ptr += src_stride;
      dst += dst_stride;
      dst0 += dst_stride0;
    } while (--h);
  } else {
    assert(!(w % 8));
    int i = 0;
    do {
      int j = 0;
      do {
        const __m128i data =
            _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j]);
        __m128i s[4];

        // Filter even-index pixels
        s[0] = data;
        s[1] = _mm_srli_si128(data, 2);
        s[2] = _mm_srli_si128(data, 4);
        s[3] = _mm_srli_si128(data, 6);
        const __m128i res_even = convolve_lo_x(s, coeffs);

        // Filter odd-index pixels
        s[0] = _mm_srli_si128(data, 1);
        s[1] = _mm_srli_si128(data, 3);
        s[2] = _mm_srli_si128(data, 5);
        s[3] = _mm_srli_si128(data, 7);
        const __m128i res_odd = convolve_lo_x(s, coeffs);

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
        const __m128i res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
        const __m128i res_hi_round =
            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
        const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
        const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift);

        const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
        const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);

        // Accumulate values into the destination buffer
        if (do_average) {
          const __m128i data_ref_0 =
              _mm_loadu_si128((const __m128i *)(&dst[i * dst_stride + j]));

          const __m128i comp_avg_res =
              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

          const __m128i round_result = convolve_rounding(
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
        } else {
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
        }
        j += 8;
      } while (j < w);
    } while (++i < h);
  }
}
148 
// Loads 4 bytes from a possibly unaligned address into the low 32 bits of a
// vector (remaining bits zero). memcpy is used so the byte pointer is never
// dereferenced as an int *.
static inline __m128i dist_wtd_y_loadl_32(const uint8_t *p) {
  int val;
  memcpy(&val, p, sizeof(val));
  return _mm_cvtsi32_si128(val);
}

// Stores the low 32 bits of v to a possibly unaligned byte address.
static inline void dist_wtd_y_storel_32(uint8_t *p, __m128i v) {
  const int val = _mm_cvtsi128_si32(v);
  memcpy(p, &val, sizeof(val));
}

/*
 * Vertical pass of the distance-weighted compound convolution for 8-bit
 * sources (SSE2).
 *
 * For every output pixel the column is filtered with the kernel that
 * prepare_coeffs() derives from filter_params_y / subpel_y_qn, the sum is
 * shifted left by (FILTER_BITS - conv_params->round_0), rounded and shifted
 * right by conv_params->round_1, saturated to 16 bits and biased by an
 * offset constant.
 *
 *   - conv_params->do_average == 0: the biased 16-bit values are stored to
 *     the intermediate buffer conv_params->dst.
 *   - conv_params->do_average != 0: the values are combined with what is
 *     already in conv_params->dst via comp_avg() (using the fwd_offset /
 *     bck_offset weights and use_dist_wtd_comp_avg), passed through
 *     convolve_rounding(), packed to 8 bits with unsigned saturation and
 *     written to dst0.
 *
 * src / src_stride    8-bit source and its row stride; rows above src
 *                     (taps / 2 - 1 of them) are read as well.
 * dst0 / dst_stride0  8-bit destination, written only when averaging.
 * w, h                block size; w must be 4 or a multiple of 8. Two rows
 *                     are produced per loop iteration, so h is expected to
 *                     be a positive even number.
 * conv_params         rounding amounts, weights, averaging flags and the
 *                     intermediate buffer (dst, dst_stride).
 */
void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
                                  uint8_t *dst0, int dst_stride0, int w, int h,
                                  const InterpFilterParams *filter_params_y,
                                  const int subpel_y_qn,
                                  ConvolveParams *conv_params) {
  const int bd = 8;
  CONV_BUF_TYPE *dst = conv_params->dst;
  const int dst_stride = conv_params->dst_stride;
  // Start reading fo_vert rows above the output position.
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint8_t *src_ptr = src - fo_vert * src_stride;
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i left_shift = _mm_cvtsi32_si128(bits);
  const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset);
  const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset);
  // Interleaved (fwd_offset, bck_offset) pairs, handed to comp_avg().
  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
  const int do_average = conv_params->do_average;
  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m128i offset_const = _mm_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
  const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1);
  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
  __m128i coeffs[4];

  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);

  if (w == 4) {
    __m128i s[8], src6, res, res_shift;

    // Prime the window: s[k] interleaves the bytes of rows k and k + 1.
    const __m128i src0 = dist_wtd_y_loadl_32(src_ptr + 0 * src_stride);
    const __m128i src1 = dist_wtd_y_loadl_32(src_ptr + 1 * src_stride);
    const __m128i src2 = dist_wtd_y_loadl_32(src_ptr + 2 * src_stride);
    const __m128i src3 = dist_wtd_y_loadl_32(src_ptr + 3 * src_stride);
    const __m128i src4 = dist_wtd_y_loadl_32(src_ptr + 4 * src_stride);
    const __m128i src5 = dist_wtd_y_loadl_32(src_ptr + 5 * src_stride);
    src6 = dist_wtd_y_loadl_32(src_ptr + 6 * src_stride);
    s[0] = _mm_unpacklo_epi8(src0, src1);
    s[1] = _mm_unpacklo_epi8(src1, src2);
    s[2] = _mm_unpacklo_epi8(src2, src3);
    s[3] = _mm_unpacklo_epi8(src3, src4);
    s[4] = _mm_unpacklo_epi8(src4, src5);
    s[5] = _mm_unpacklo_epi8(src5, src6);

    do {
      // Bring in the two new rows needed for this pair of output rows;
      // src6 carries the last loaded row over to the next iteration.
      const __m128i src7 = dist_wtd_y_loadl_32(src_ptr + 7 * src_stride);
      s[6] = _mm_unpacklo_epi8(src6, src7);
      src6 = dist_wtd_y_loadl_32(src_ptr + 8 * src_stride);
      s[7] = _mm_unpacklo_epi8(src7, src6);

      // First output row of the pair.
      res = convolve_lo_y(s + 0, coeffs);
      res_shift = _mm_sll_epi32(res, left_shift);
      res_shift =
          _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);

      __m128i res_16b = _mm_packs_epi32(res_shift, res_shift);
      __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);

      // Accumulate values into the destination buffer
      if (do_average) {
        const __m128i data_ref_0 = _mm_loadu_si128((const __m128i *)dst);

        const __m128i comp_avg_res =
            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

        const __m128i round_result = convolve_rounding(
            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

        const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
        dist_wtd_y_storel_32(dst0, res_8);

      } else {
        // Aligned full-register store: 16 bytes are written to dst for
        // this row, the upper half duplicating the lower half.
        _mm_store_si128((__m128i *)dst, res_unsigned);
      }

      src_ptr += src_stride;
      dst += dst_stride;
      dst0 += dst_stride0;

      // Second output row of the pair: same window shifted down one row.
      res = convolve_lo_y(s + 1, coeffs);
      res_shift = _mm_sll_epi32(res, left_shift);
      res_shift =
          _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);

      res_16b = _mm_packs_epi32(res_shift, res_shift);
      res_unsigned = _mm_add_epi16(res_16b, offset_const);

      // Accumulate values into the destination buffer
      if (do_average) {
        const __m128i data_ref_0 = _mm_loadu_si128((const __m128i *)dst);

        const __m128i comp_avg_res =
            comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

        const __m128i round_result = convolve_rounding(
            &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

        const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
        dist_wtd_y_storel_32(dst0, res_8);

      } else {
        _mm_store_si128((__m128i *)dst, res_unsigned);
      }

      src_ptr += src_stride;
      dst += dst_stride;
      dst0 += dst_stride0;

      // Slide the window down by two rows.
      s[0] = s[2];
      s[1] = s[3];
      s[2] = s[4];
      s[3] = s[5];
      s[4] = s[6];
      s[5] = s[7];
      h -= 2;
    } while (h);
  } else {
    assert(!(w % 8));
    int j = 0;
    // Process the block in 8-pixel-wide columns.
    do {
      __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift;
      const uint8_t *data = &src_ptr[j];

      // Prime the window: s[k] interleaves the bytes of rows k and k + 1.
      const __m128i src0 =
          _mm_loadl_epi64((const __m128i *)(data + 0 * src_stride));
      const __m128i src1 =
          _mm_loadl_epi64((const __m128i *)(data + 1 * src_stride));
      const __m128i src2 =
          _mm_loadl_epi64((const __m128i *)(data + 2 * src_stride));
      const __m128i src3 =
          _mm_loadl_epi64((const __m128i *)(data + 3 * src_stride));
      const __m128i src4 =
          _mm_loadl_epi64((const __m128i *)(data + 4 * src_stride));
      const __m128i src5 =
          _mm_loadl_epi64((const __m128i *)(data + 5 * src_stride));
      src6 = _mm_loadl_epi64((const __m128i *)(data + 6 * src_stride));
      s[0] = _mm_unpacklo_epi8(src0, src1);
      s[1] = _mm_unpacklo_epi8(src1, src2);
      s[2] = _mm_unpacklo_epi8(src2, src3);
      s[3] = _mm_unpacklo_epi8(src3, src4);
      s[4] = _mm_unpacklo_epi8(src4, src5);
      s[5] = _mm_unpacklo_epi8(src5, src6);

      int i = 0;
      do {
        data = &src_ptr[i * src_stride + j];
        // Bring in the two new rows needed for this pair of output rows;
        // src6 carries the last loaded row over to the next iteration.
        const __m128i src7 =
            _mm_loadl_epi64((const __m128i *)(data + 7 * src_stride));
        s[6] = _mm_unpacklo_epi8(src6, src7);
        src6 = _mm_loadl_epi64((const __m128i *)(data + 8 * src_stride));
        s[7] = _mm_unpacklo_epi8(src7, src6);

        res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
        res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels
        res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
        res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
        res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
                                     round_shift);
        res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
                                     round_shift);

        __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
        __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);

        // Accumulate values into the destination buffer
        if (do_average) {
          const __m128i data_ref_0 =
              _mm_loadu_si128((const __m128i *)(&dst[i * dst_stride + j]));

          const __m128i comp_avg_res =
              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

          const __m128i round_result = convolve_rounding(
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
        } else {
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
        }
        i++;

        // Second output row of the pair: same window shifted down one row.
        res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
        res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels
        res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
        res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
        res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
                                     round_shift);
        res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
                                     round_shift);
        res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
        res_unsigned = _mm_add_epi16(res_16b, offset_const);

        // Accumulate values into the destination buffer
        if (do_average) {
          const __m128i data_ref_0 =
              _mm_loadu_si128((const __m128i *)(&dst[i * dst_stride + j]));

          const __m128i comp_avg_res =
              comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);

          const __m128i round_result = convolve_rounding(
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
        } else {
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
        }
        i++;

        // Slide the window down by two rows.
        s[0] = s[2];
        s[1] = s[3];
        s[2] = s[4];
        s[3] = s[5];
        s[4] = s[6];
        s[5] = s[7];
      } while (i < h);
      j += 8;
    } while (j < w);
  }
}
378