/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

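// Shuffle mask that splits eight 16-bit pixels into their low bytes
// (lanes 0-7) and high bytes (lanes 8-15), so that the boundary-padding
// code below can reuse the byte-oriented warp_pad_left/warp_pad_right
// shuffle tables shared with the low-bitdepth warp filter.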
static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4, 6, 8, 10,
                                                       12, 14, 1, 3, 5, 7,
                                                       9, 11, 13, 15 };

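// Shuffle masks used when alpha == 0: all eight output pixels then share a
// single 8-tap filter, so one 128-bit load of that filter can be broadcast
// into the per-coefficient-pair registers expected by the filter kernel.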
static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
};
static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
};
static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9,
                                                         10, 11, 8, 9, 10, 11,
                                                         8, 9, 10, 11 };
static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15,
                                                         12, 13, 14, 15,
                                                         12, 13, 14, 15,
                                                         12, 13, 14, 15 };

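// Gather the eight 8-tap filters selected by (sx + i * alpha) for output
// pixels i = 0..7 and transpose them so that coeff[2 * n] holds coefficient
// pair (2n, 2n + 1) for the even-index pixels and coeff[2 * n + 1] holds the
// same pair for the odd-index pixels, the layout consumed by
// highbd_filter_src_pixels().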
static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
                                                          __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));

  // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
  coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
  coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
  coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
  coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  // Filter odd-index pixels
  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

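// As above, but specialized for alpha == 0: every pixel uses the same
// filter, so a single load plus byte broadcasts replaces the eight-load
// transpose.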
static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
    int sx, __m128i *coeff) {
  // Filter coeff
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));

  coeff[0] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
  coeff[2] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
  coeff[4] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
  coeff[6] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));

  coeff[1] = coeff[0];
  coeff[3] = coeff[2];
  coeff[5] = coeff[4];
  coeff[7] = coeff[6];
}

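// Filter one row of 8 output pixels. *src holds input pixels
// (ix4 - 7) .. ix4 and *src2 holds (ix4 + 1) .. (ix4 + 8), so
// _mm_alignr_epi8(src2, src, 2 * c) yields the eight pixels multiplied by
// coefficient c. A scalar sketch of what each output lane x = 0..7
// computes, with f denoting that pixel's 8-tap filter (ref_row/tmp_row/f
// are illustrative names, not symbols defined in this file):
//
//   int32_t sum = (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1);
//   for (int c = 0; c < 8; ++c) sum += ref_row[ix4 - 7 + x + c] * f[c];
//   tmp_row[x] = (int16_t)(sum >> reduce_bits_horiz);  // vector code
//                                                      // saturates the pack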
static INLINE void highbd_filter_src_pixels(
    const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
    const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
  const __m128i src_1 = *src;
  const __m128i src2_1 = *src2;

  const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
  const __m128i res_2 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
  const __m128i res_4 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
  const __m128i res_6 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);

  __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
  res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
                           _mm_cvtsi32_si128(reduce_bits_horiz));

  const __m128i res_1 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
  const __m128i res_3 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
  const __m128i res_5 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
  const __m128i res_7 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);

  __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
  res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
                          _mm_cvtsi32_si128(reduce_bits_horiz));

  // Combine results into one register.
  // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
  // as this order helps with the vertical filter.
  tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
}

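// Convenience wrapper: build the coefficients for this row's filter offset
// sx and apply them to one row of source pixels.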
static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
                                       __m128i *tmp, int sx, int alpha, int k,
                                       const int offset_bits_horiz,
                                       const int reduce_bits_horiz) {
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
  highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
                           reduce_bits_horiz, k);
}

static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

static INLINE void highbd_warp_horizontal_filter_alpha0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    __m128i coeff[8];
    highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

static INLINE void highbd_warp_horizontal_filter_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

static INLINE void highbd_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
                        reduce_bits_horiz);
  }
}

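// Dispatch to the variant that hoists the most coefficient work out of the
// per-row loop: beta == 0 means one set of horizontal filters serves every
// row, alpha == 0 means all pixels within a row share one filter, and
// alpha == beta == 0 allows both simplifications at once.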
static INLINE void highbd_prepare_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    highbd_warp_horizontal_filter_alpha0_beta0(
        ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
        offset_bits_horiz, reduce_bits_horiz);
  else if (alpha == 0 && beta != 0)
    highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                         beta, p_height, height, i,
                                         offset_bits_horiz, reduce_bits_horiz);
  else if (alpha != 0 && beta == 0)
    highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else
    highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
}

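// SSE4.1 implementation of the high-bitdepth affine warp. For each output
// position the affine model in mat[] is evaluated at subpel precision, an
// 8-tap horizontal and an 8-tap vertical filter are applied, and the result
// is written to pred (or, in compound mode, to conv_params->dst as a 16-bit
// intermediate, optionally averaged with the other prediction).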
void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
                                   int width, int height, int stride,
                                   uint16_t *pred, int p_col, int p_row,
                                   int p_width, int p_height, int p_stride,
                                   int subsampling_x, int subsampling_y, int bd,
                                   ConvolveParams *conv_params, int16_t alpha,
                                   int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int reduce_bits_horiz =
      conv_params->round_0 +
      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(!(bd == 12 && reduce_bits_horiz < 5));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const __m128i res_sub_const =
      _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
  __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/

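  // Process the output in 8x8 tiles. Each tile needs 15 horizontally
  // filtered source rows in tmp[], since the vertical 8-tap filter slides
  // over 8 output rows.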
  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
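      // Masking off the low WARP_PARAM_REDUCE_BITS keeps the filter offsets
      // on a coarser grid, matching the reduced-precision filter phases used
      // by the C reference warp filter.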

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
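      // In that case every tap reads the same edge pixel and the taps sum to
      // (1 << FILTER_BITS), so the filtered value collapses to the horizontal
      // offset term plus the edge pixel scaled by the filter gain, both
      // pre-shifted by reduce_bits_horiz.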
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;

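        // The warp_pad_left/warp_pad_right tables shuffle bytes, so split the
        // 16-bit pixels into low/high byte planes (via
        // warp_highbd_arrange_bytes), replicate the edge pixels in each
        // plane, then re-interleave before filtering.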
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          const __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          const __m128i src2 =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

          const __m128i src_01 = _mm_shuffle_epi8(
              src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
          const __m128i src2_01 = _mm_shuffle_epi8(
              src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));

          __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
          __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);

          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
          }

          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
          }

          const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
          const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);

          highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
                              offset_bits_horiz, reduce_bits_horiz);
        }
      } else {
        highbd_prepare_warp_horizontal_filter(
            ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
            offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);

        // Load from tmp and rearrange pairs of consecutive rows into the
        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
        const __m128i *src = tmp + (k + 4);
        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

        // Filter even-index pixels
        const __m128i tmp_0 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_2 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_4 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_6 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

        const __m128i tmp_1 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_3 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_5 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_7 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

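        // In compound mode, write (or average) the rounded 16-bit
        // intermediate into conv_params->dst; otherwise round, clamp to
        // [0, (1 << bd) - 1] and store the final pixels to pred. The lower
        // and upper four lanes are handled separately so that 4-wide blocks
        // only touch four outputs.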
        if (conv_params->is_compound) {
          __m128i *const p =
              (__m128i *)&conv_params
                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
          res_lo = _mm_add_epi32(res_lo, res_add_const);
          res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
                                 reduce_bits_vert_shift);

          if (conv_params->do_average) {
            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
            __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));

            if (conv_params->use_dist_wtd_comp_avg) {
              res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
                                     _mm_mullo_epi32(res_lo, wt1));
              res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
            } else {
              res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
            }

            __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
            res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
                                     round_bits_shift);

            __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
            res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
            _mm_storel_epi64(dst16, res16_lo);
          } else {
            res_lo = _mm_packus_epi32(res_lo, res_lo);
            _mm_storel_epi64(p, res_lo);
          }
          if (p_width > 4) {
            __m128i *const p4 =
                (__m128i *)&conv_params
                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];

            res_hi = _mm_add_epi32(res_hi, res_add_const);
            res_hi =
                _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
                              reduce_bits_vert_shift);
            if (conv_params->do_average) {
              __m128i *const dst16_4 =
                  (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
              __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));

              if (conv_params->use_dist_wtd_comp_avg) {
                res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
                                       _mm_mullo_epi32(res_hi, wt1));
                res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
              } else {
                res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
              }

              __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
              res32_hi = _mm_sra_epi32(
                  _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
              __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
              res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
              _mm_storel_epi64(dst16_4, res16_hi);
            } else {
              res_hi = _mm_packus_epi32(res_hi, res_hi);
              _mm_storel_epi64(p4, res_hi);
            }
          }
        } else {
          // Round and pack down to pixel precision
          const __m128i round_const =
              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                             ((1 << reduce_bits_vert) >> 1));

          const __m128i res_lo_round = _mm_srai_epi32(
              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
          const __m128i res_hi_round = _mm_srai_epi32(
              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);

          __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
          // Clamp res_16bit to the range [0, 2^bd - 1]
          const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
          const __m128i zero = _mm_setzero_si128();
          res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);

          // Store, blending with 'pred' if needed
          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

          // Note: If we're outputting a 4x4 block, we need to be very careful
          // to only output 4 pixels at this point, to avoid encode/decode
          // mismatches when encoding with multiple threads.
          if (p_width == 4) {
            _mm_storel_epi64(p, res_16bit);
          } else {
            _mm_storeu_si128(p, res_16bit);
          }
        }
      }
    }
  }
}