/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
#include <emmintrin.h>
#include <string.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "av1/common/convolve.h"
20
prepare_coeffs(const InterpFilterParams * const filter_params,const int subpel_q4,__m128i * const coeffs)21 static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
22 const int subpel_q4,
23 __m128i *const coeffs /* [4] */) {
24 const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
25 filter_params, subpel_q4 & SUBPEL_MASK);
26 const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
27 // coeffs 0 1 0 1 2 3 2 3
28 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
29 // coeffs 4 5 4 5 6 7 6 7
30 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
31
32 coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1
33 coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
34 coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
35 coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7
36 }
37
// Multiply-accumulates four registers of 16-bit samples against four registers
// of 16-bit coefficients. For every k in 0..3, _mm_madd_epi16 multiplies
// matching lanes of s[k] and coeffs[k] and adds each adjacent pair of products
// into a 32-bit lane; the four partial results are then summed lane-wise.
// Returns the four 32-bit sums.
static INLINE __m128i convolve(const __m128i *const s,
                               const __m128i *const coeffs) {
  __m128i sum = _mm_madd_epi16(s[0], coeffs[0]);
  for (int k = 1; k < 4; ++k) {
    sum = _mm_add_epi32(sum, _mm_madd_epi16(s[k], coeffs[k]));
  }
  return sum;
}
47
convolve_lo_x(const __m128i * const s,const __m128i * const coeffs)48 static INLINE __m128i convolve_lo_x(const __m128i *const s,
49 const __m128i *const coeffs) {
50 __m128i ss[4];
51 ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
52 ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
53 ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
54 ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
55 return convolve(ss, coeffs);
56 }
57
convolve_lo_y(const __m128i * const s,const __m128i * const coeffs)58 static INLINE __m128i convolve_lo_y(const __m128i *const s,
59 const __m128i *const coeffs) {
60 __m128i ss[4];
61 ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
62 ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
63 ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
64 ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
65 return convolve(ss, coeffs);
66 }
67
convolve_hi_y(const __m128i * const s,const __m128i * const coeffs)68 static INLINE __m128i convolve_hi_y(const __m128i *const s,
69 const __m128i *const coeffs) {
70 __m128i ss[4];
71 ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
72 ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
73 ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
74 ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
75 return convolve(ss, coeffs);
76 }
77
// Loads four bytes from `p` into the low 32 bits of a register; the remaining
// bits are zero. memcpy is used instead of dereferencing a casted uint32_t
// pointer because `p` is a byte pointer with no alignment guarantee, and a
// misaligned or type-punned dereference is undefined behaviour.
static INLINE __m128i convolve_y_load_u8x4(const uint8_t *const p) {
  int32_t v;
  memcpy(&v, p, sizeof(v));
  return _mm_cvtsi32_si128(v);
}

// Writes the low 16 bits of `v` (two bytes) to `dst` when w == 2, otherwise
// all four bytes of `v`. Uses memcpy so `dst` needs no particular alignment.
static INLINE void convolve_y_store_u8xw(uint8_t *const dst, const uint32_t v,
                                         const int w) {
  if (w == 2) {
    const uint16_t v16 = (uint16_t)v;
    memcpy(dst, &v16, sizeof(v16));
  } else {
    memcpy(dst, &v, sizeof(v));
  }
}

// Vertical convolution of an 8-bit block using SSE2.
//
// src/src_stride:  source pixels; reading starts (taps / 2 - 1) rows above src.
// dst/dst_stride:  destination for the w x h filtered block.
// w, h:            block size. w <= 4 takes the narrow path, any other width
//                  must be a multiple of 8 (asserted). Both paths emit two
//                  output rows per iteration, so an odd h is not handled (the
//                  narrow path's loop only terminates when h reaches zero).
// filter_params_y, subpel_y_q4: select the kernel via prepare_coeffs().
// filter_params_x, subpel_x_q4: unused.
// conv_params:     only inspected by the asserts.
//
// Each output is the sum of eight source rows weighted by the kernel, rounded
// by FILTER_BITS and saturated to 8 bits.
void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_x_q4, const int subpel_y_q4,
                            ConvolveParams *conv_params) {
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint8_t *src_ptr = src - fo_vert * src_stride;
  const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
  const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
  __m128i coeffs[4];

  (void)filter_params_x;
  (void)subpel_x_q4;
  (void)conv_params;  // Referenced only by the asserts below.

  assert(conv_params->round_0 <= FILTER_BITS);
  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);

  if (w <= 4) {
    // s[k] interleaves the bytes of source rows k and k + 1, so that after
    // widening, each madd pairs one pixel from each of two consecutive rows
    // with one coefficient pair.
    __m128i rows[7], s[8];
    for (int k = 0; k < 7; ++k) {
      rows[k] = convolve_y_load_u8x4(src_ptr + k * src_stride);
    }
    for (int k = 0; k < 6; ++k) {
      s[k] = _mm_unpacklo_epi8(rows[k], rows[k + 1]);
    }
    __m128i src6 = rows[6];

    do {
      const __m128i src7 = convolve_y_load_u8x4(src_ptr + 7 * src_stride);
      s[6] = _mm_unpacklo_epi8(src6, src7);
      src6 = convolve_y_load_u8x4(src_ptr + 8 * src_stride);
      s[7] = _mm_unpacklo_epi8(src7, src6);

      // Output row k of this pair uses s[k], s[k + 2], s[k + 4], s[k + 6].
      for (int k = 0; k < 2; ++k) {
        const __m128i res = convolve_lo_y(s + k, coeffs);
        const __m128i res_round =
            _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
        const __m128i res16 = _mm_packs_epi32(res_round, res_round);
        const uint32_t res_int =
            (uint32_t)_mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));

        convolve_y_store_u8xw(dst, res_int, w);

        src_ptr += src_stride;
        dst += dst_stride;
      }

      // Slide the window down by two rows.
      for (int k = 0; k < 6; ++k) {
        s[k] = s[k + 2];
      }
      h -= 2;
    } while (h);
  } else {
    assert(!(w % 8));
    int j = 0;
    do {
      // Process one 8-pixel-wide column strip at a time.
      const uint8_t *data = &src_ptr[j];
      __m128i rows[7], s[8];
      for (int k = 0; k < 7; ++k) {
        rows[k] = _mm_loadl_epi64((const __m128i *)(data + k * src_stride));
      }
      for (int k = 0; k < 6; ++k) {
        s[k] = _mm_unpacklo_epi8(rows[k], rows[k + 1]);
      }
      __m128i src6 = rows[6];

      int i = 0;
      do {
        data = &src_ptr[i * src_stride + j];
        const __m128i src7 =
            _mm_loadl_epi64((const __m128i *)(data + 7 * src_stride));
        s[6] = _mm_unpacklo_epi8(src6, src7);
        src6 = _mm_loadl_epi64((const __m128i *)(data + 8 * src_stride));
        s[7] = _mm_unpacklo_epi8(src7, src6);

        for (int k = 0; k < 2; ++k) {
          // Filter low index pixels
          const __m128i res_lo = convolve_lo_y(s + k, coeffs);
          // Filter high index pixels
          const __m128i res_hi = convolve_hi_y(s + k, coeffs);

          const __m128i res_lo_round =
              _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
          const __m128i res_hi_round =
              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

          const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
          const __m128i res = _mm_packus_epi16(res16, res16);

          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
          i++;
        }

        // Slide the window down by two rows.
        for (int k = 0; k < 6; ++k) {
          s[k] = s[k + 2];
        }
      } while (i < h);
      j += 8;
    } while (j < w);
  }
}
238
// Horizontal convolution of an 8-bit block using SSE2.
//
// src/src_stride:  source pixels; reading starts (taps / 2 - 1) bytes left of
//                  src. Every load fetches 16 bytes from the current position.
// dst/dst_stride:  destination for the w x h filtered block.
// w, h:            block size. w <= 4 takes the narrow path, any other width
//                  must be a multiple of 8 (asserted).
// filter_params_x, subpel_x_q4: select the kernel via prepare_coeffs().
// filter_params_y, subpel_y_q4: unused.
// conv_params:     round_0 sets the first rounding shift; the second shift is
//                  FILTER_BITS - round_0.
//
// Each output is the sum of eight neighbouring source pixels weighted by the
// kernel, rounded in two stages and saturated to 8 bits.
void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_x_q4, const int subpel_y_q4,
                            ConvolveParams *conv_params) {
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - fo_horiz;
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_0_const =
      _mm_set1_epi32((1 << conv_params->round_0) >> 1);
  const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
  __m128i coeffs[4];

  (void)filter_params_y;
  (void)subpel_y_q4;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);

  if (w <= 4) {
    do {
      const __m128i data = _mm_loadu_si128((const __m128i *)src_ptr);
      __m128i s[4];

      // Interleave the row with itself shifted by one byte so that, after
      // widening, 16-bit lanes 2x and 2x + 1 hold the two neighbouring pixels
      // that output pixel x multiplies with one coefficient pair.
      s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
      s[1] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
      s[2] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
      s[3] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
      const __m128i res_lo = convolve_lo_x(s, coeffs);
      __m128i res_lo_round =
          _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
      res_lo_round =
          _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift);

      const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
      const __m128i res = _mm_packus_epi16(res16, res16);

      // Store through memcpy: dst is a byte pointer with no alignment
      // guarantee, so writing through a casted uint16_t/uint32_t pointer
      // would be undefined behaviour.
      const uint32_t r = (uint32_t)_mm_cvtsi128_si32(res);
      if (w == 2) {
        const uint16_t r16 = (uint16_t)r;
        memcpy(dst, &r16, sizeof(r16));
      } else {
        memcpy(dst, &r, sizeof(r));
      }

      src_ptr += src_stride;
      dst += dst_stride;
    } while (--h);
  } else {
    assert(!(w % 8));
    int i = 0;
    do {
      int j = 0;
      do {
        const __m128i data =
            _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j]);
        __m128i s[4];

        // Filter even-index pixels
        s[0] = data;
        s[1] = _mm_srli_si128(data, 2);
        s[2] = _mm_srli_si128(data, 4);
        s[3] = _mm_srli_si128(data, 6);
        const __m128i res_even = convolve_lo_x(s, coeffs);

        // Filter odd-index pixels
        s[0] = _mm_srli_si128(data, 1);
        s[1] = _mm_srli_si128(data, 3);
        s[2] = _mm_srli_si128(data, 5);
        s[3] = _mm_srli_si128(data, 7);
        const __m128i res_odd = convolve_lo_x(s, coeffs);

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
        __m128i res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                     round_shift);
        __m128i res_hi_round =
            _mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift);
        res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
                                     round_shift);

        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
        const __m128i res = _mm_packus_epi16(res16, res16);

        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
        j += 8;
      } while (j < w);
    } while (++i < h);
  }
}
339