/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

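// Shuffle mask to split eight 16-bit pixels into a low-byte plane
// (lanes 0-7) and a high-byte plane (lanes 8-15). This lets the 8-bit
// boundary-padding shuffles (warp_pad_left/warp_pad_right) be reused for
// high-bitdepth data: pad each byte plane separately, then re-interleave.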
static const uint8_t warp_highbd_arrange_bytes[16] = {
  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
};

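// When alpha == 0, all eight pixels in a row share the same filter. Each of
// these masks broadcasts one 32-bit pair of 16-bit filter taps from a single
// filter load across all four lanes of a register.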
static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
};
static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
};
static const uint8_t highbd_shuffle_alpha0_mask2[16] = {
  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
};
static const uint8_t highbd_shuffle_alpha0_mask3[16] = {
  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
};

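// Load the eight horizontal filters selected by sx + k * alpha for k = 0..7
// and transpose them: coeff[0]/coeff[2]/coeff[4]/coeff[6] hold tap pairs
// (0,1)/(2,3)/(4,5)/(6,7) for the even-index pixels, and the odd-index
// coeff registers hold the same pairs for the odd-index pixels.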
static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
                                                          __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));

  // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
  coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
  coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
  coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
  coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  // Filter odd-index pixels
  const __m128i tmp_1 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 = _mm_loadu_si128(
      (__m128i *)(warped_filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

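// alpha == 0 special case: load the single filter at sx once and broadcast
// its four tap pairs to all pixel positions using the shuffle masks above.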
static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
    int sx, __m128i *coeff) {
  // Filter coeff
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));

  coeff[0] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
  coeff[2] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
  coeff[4] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
  coeff[6] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));

  coeff[1] = coeff[0];
  coeff[3] = coeff[2];
  coeff[5] = coeff[4];
  coeff[7] = coeff[6];
}

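// Apply the prepared horizontal filter to one row. src holds pixels
// [ix4 - 7, ix4] and src2 holds pixels [ix4 + 1, ix4 + 8]; _mm_alignr_epi8
// by 2 * n bytes extracts the eight-pixel window starting at pixel n, so
// each _mm_madd_epi16 accumulates one tap pair for four output pixels.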
static INLINE void highbd_filter_src_pixels(
    const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
    const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
  const __m128i src_1 = *src;
  const __m128i src2_1 = *src2;

  const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
  const __m128i res_2 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
  const __m128i res_4 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
  const __m128i res_6 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);

  __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
  res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
                           _mm_cvtsi32_si128(reduce_bits_horiz));

  const __m128i res_1 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
  const __m128i res_3 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
  const __m128i res_5 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
  const __m128i res_7 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);

  __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
  res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
                          _mm_cvtsi32_si128(reduce_bits_horiz));

  // Combine results into one register.
  // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
  // as this order helps with the vertical filter.
  tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
}

static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
                                       __m128i *tmp, int sx, int alpha, int k,
                                       const int offset_bits_horiz,
                                       const int reduce_bits_horiz) {
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
  highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
                           reduce_bits_horiz, k);
}

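// alpha == 0 and beta == 0: the filter is constant across the whole 8x8
// block, so the coefficients are prepared once and reused for every row.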
static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

static INLINE void highbd_warp_horizontal_filter_alpha0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    __m128i coeff[8];
    highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

static INLINE void highbd_warp_horizontal_filter_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

static INLINE void highbd_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
                        reduce_bits_horiz);
  }
}

static INLINE void highbd_prepare_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    highbd_warp_horizontal_filter_alpha0_beta0(
        ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
        offset_bits_horiz, reduce_bits_horiz);

  else if (alpha == 0 && beta != 0)
    highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                         beta, p_height, height, i,
                                         offset_bits_horiz, reduce_bits_horiz);

  else if (alpha != 0 && beta == 0)
    highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else
    highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
}

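// Compute the high-bitdepth warped prediction for one block. The block is
// processed in 8x8 tiles: the horizontal filter fills tmp with 15
// intermediate rows (8 output rows plus 7 extra for the 8-tap vertical
// filter), then the vertical filter rounds and stores either a compound
// result in conv_params->dst or a clamped uint16_t prediction in pred.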
void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
                                   int width, int height, int stride,
                                   uint16_t *pred, int p_col, int p_row,
                                   int p_width, int p_height, int p_stride,
                                   int subsampling_x, int subsampling_y, int bd,
                                   ConvolveParams *conv_params, int16_t alpha,
                                   int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int reduce_bits_horiz =
      conv_params->round_0 +
      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(!(bd == 12 && reduce_bits_horiz < 5));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const __m128i res_sub_const =
      _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
  __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/

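  // Each iteration of the loops below produces one 8x8 tile of output.
  // (x4, y4) is the warped position of the tile centre in
  // WARPEDMODEL_PREC_BITS precision; ix4/iy4 are its integer parts and
  // sx4/sy4 the fractional parts used to select filter phases.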
  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

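      // Rounding sx4/sy4 down to a multiple of (1 << WARP_PARAM_REDUCE_BITS)
      // limits the number of distinct filter phases, as in the C reference
      // implementation (av1_highbd_warp_affine_c).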
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
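        // The tile straddles the left and/or right frame boundary. Split each
        // row of 16-bit pixels into low/high byte planes, replicate the edge
        // pixels into the out-of-bounds lanes with the 8-bit warp_pad_left /
        // warp_pad_right shuffles, then re-interleave before filtering.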
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;

        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          const __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          const __m128i src2 =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

          const __m128i src_01 = _mm_shuffle_epi8(
              src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
          const __m128i src2_01 = _mm_shuffle_epi8(
              src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));

          __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
          __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);

          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
          }

          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
          }

          const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
          const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);

          highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
                              offset_bits_horiz, reduce_bits_horiz);
        }
      } else {
        highbd_prepare_warp_horizontal_filter(
            ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
            offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);

        // Load from tmp and rearrange pairs of consecutive rows into the
        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
        const __m128i *src = tmp + (k + 4);
        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

        // Filter even-index pixels
        const __m128i tmp_0 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_2 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_4 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_6 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

        const __m128i tmp_1 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_3 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_5 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_7 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

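        // Compound path: either store the rounded intermediate to
        // conv_params->dst, or (when do_average is set) combine it with the
        // first prediction already stored there, using a simple or
        // distance-weighted average, and write the clamped result to pred.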
        if (conv_params->is_compound) {
          __m128i *const p =
              (__m128i *)&conv_params
                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
          res_lo = _mm_add_epi32(res_lo, res_add_const);
          res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
                                 reduce_bits_vert_shift);

          if (conv_params->do_average) {
            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
            __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));

            if (conv_params->use_dist_wtd_comp_avg) {
              res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
                                     _mm_mullo_epi32(res_lo, wt1));
              res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
            } else {
              res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
            }

            __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
            res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
                                     round_bits_shift);

            __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
            res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
            _mm_storel_epi64(dst16, res16_lo);
          } else {
            res_lo = _mm_packus_epi32(res_lo, res_lo);
            _mm_storel_epi64(p, res_lo);
          }
          if (p_width > 4) {
            __m128i *const p4 =
                (__m128i *)&conv_params
                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];

            res_hi = _mm_add_epi32(res_hi, res_add_const);
            res_hi =
                _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
                              reduce_bits_vert_shift);
            if (conv_params->do_average) {
              __m128i *const dst16_4 =
                  (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
              __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));

              if (conv_params->use_dist_wtd_comp_avg) {
                res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
                                       _mm_mullo_epi32(res_hi, wt1));
                res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
              } else {
                res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
              }

              __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
              res32_hi = _mm_sra_epi32(
                  _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
              __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
              res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
              _mm_storel_epi64(dst16_4, res16_hi);
            } else {
              res_hi = _mm_packus_epi32(res_hi, res_hi);
              _mm_storel_epi64(p4, res_hi);
            }
          }
        } else {
          // Round and pack into 16 bits
          const __m128i round_const =
              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                             ((1 << reduce_bits_vert) >> 1));

          const __m128i res_lo_round = _mm_srai_epi32(
              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
          const __m128i res_hi_round = _mm_srai_epi32(
              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);

          __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
          // Clamp res_16bit to the range [0, 2^bd - 1]
          const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
          const __m128i zero = _mm_setzero_si128();
          res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);

          // Store the result
          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

          // Note: If we're outputting a 4x4 block, we need to be very careful
          // to only output 4 pixels at this point, to avoid encode/decode
          // mismatches when encoding with multiple threads.
          if (p_width == 4) {
            _mm_storel_epi64(p, res_16bit);
          } else {
            _mm_storeu_si128(p, res_16bit);
          }
        }
      }
    }
  }
}