/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

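// Shuffle mask that splits each 16-bit pixel into its low byte (lanes 0-7)
// and high byte (lanes 8-15). This lets the 8-bit warp_pad_left/
// warp_pad_right shuffles be reused on 16-bit data in the out-of-boundary
// path below.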
static const uint8_t warp_highbd_arrange_bytes[16] = { 0,  2,  4,  6, 8, 10,
                                                       12, 14, 1,  3, 5, 7,
                                                       9,  11, 13, 15 };

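// When alpha == 0, all eight output pixels in a row share one filter kernel.
// These masks broadcast a single pair of 16-bit taps across the register:
// taps (0,1), (2,3), (4,5) and (6,7) respectively.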
static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
};
static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
};
static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8,  9,  10, 11, 8,  9,
                                                         10, 11, 8,  9,  10, 11,
                                                         8,  9,  10, 11 };
static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13,
                                                         14, 15, 12, 13, 14, 15,
                                                         12, 13, 14, 15 };

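// Load the eight 8-tap kernels for one output row (the kernel index advances
// by alpha per pixel) and transpose them so that coeff[2 * t] holds taps
// (2t, 2t + 1) for the even-index pixels and coeff[2 * t + 1] holds the same
// taps for the odd-index pixels, the layout consumed by
// highbd_filter_src_pixels().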
static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
                                                          __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));

  // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
  coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
  coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
  coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
  coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  // Filter odd-index pixels
  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

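// Specialization for alpha == 0: all eight pixels share a single kernel, so
// load it once and broadcast each tap pair using the shuffle masks above.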
static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
    int sx, __m128i *coeff) {
  // Filter coeff
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));

  coeff[0] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
  coeff[2] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
  coeff[4] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
  coeff[6] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));

  coeff[1] = coeff[0];
  coeff[3] = coeff[2];
  coeff[5] = coeff[4];
  coeff[7] = coeff[6];
}

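// Apply the 8-tap horizontal filter to eight pixels of row k. src holds
// pixels [ix4 - 7, ix4] and src2 holds pixels [ix4 + 1, ix4 + 8];
// _mm_alignr_epi8 extracts the shifted 8-pixel windows needed by each tap
// pair. The rounded result is stored in tmp[k + 7].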
static INLINE void highbd_filter_src_pixels(
    const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
    const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
  const __m128i src_1 = *src;
  const __m128i src2_1 = *src2;

  const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
  const __m128i res_2 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
  const __m128i res_4 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
  const __m128i res_6 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);

  __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
  res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
                           _mm_cvtsi32_si128(reduce_bits_horiz));

  const __m128i res_1 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
  const __m128i res_3 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
  const __m128i res_5 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
  const __m128i res_7 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);

  __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
  res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
                          _mm_cvtsi32_si128(reduce_bits_horiz));

  // Combine results into one register.
  // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
  // as this order helps with the vertical filter.
  tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
}

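// Filter one row in the general case: build the coefficients for horizontal
// offset sx, then filter the loaded source pixels.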
static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
                                       __m128i *tmp, int sx, int alpha, int k,
                                       const int offset_bits_horiz,
                                       const int reduce_bits_horiz) {
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
  highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
                           reduce_bits_horiz, k);
}

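// alpha == 0 and beta == 0: a single kernel covers the whole 8x8 block, so
// the coefficients are prepared once, outside the row loop.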
static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

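// alpha == 0, beta != 0: the kernel is constant within a row but varies
// between rows, so the broadcast coefficients are rebuilt for each row.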
static INLINE void highbd_warp_horizontal_filter_alpha0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    __m128i coeff[8];
    highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

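// beta == 0, alpha != 0: every row uses the same eight kernels, so the
// coefficient transpose is hoisted out of the row loop.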
static INLINE void highbd_warp_horizontal_filter_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

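// General case: the kernel varies both within a row (alpha != 0) and
// between rows (beta != 0), so coefficients are recomputed for every row.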
static INLINE void highbd_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
                        reduce_bits_horiz);
  }
}

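// Pick the cheapest horizontal filter variant for the given alpha and beta;
// zero parameters allow coefficient setup to be shared or hoisted.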
static INLINE void highbd_prepare_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    highbd_warp_horizontal_filter_alpha0_beta0(
        ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
        offset_bits_horiz, reduce_bits_horiz);

  else if (alpha == 0 && beta != 0)
    highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                         beta, p_height, height, i,
                                         offset_bits_horiz, reduce_bits_horiz);

  else if (alpha != 0 && beta == 0)
    highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else
    highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
}

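// SSE4.1 high-bitdepth affine warp. The prediction block is processed in
// 8x8 tiles: a horizontal pass filters 15 source rows into the 16-bit tmp[]
// buffer, then a vertical pass filters tmp[] into either the compound
// intermediate buffer (conv_params->dst) or the final clamped pixels in
// pred.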
void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
                                   int width, int height, int stride,
                                   uint16_t *pred, int p_col, int p_row,
                                   int p_width, int p_height, int p_stride,
                                   int subsampling_x, int subsampling_y, int bd,
                                   ConvolveParams *conv_params, int16_t alpha,
                                   int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int reduce_bits_horiz =
      conv_params->round_0 +
      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(!(bd == 12 && reduce_bits_horiz < 5));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const __m128i res_sub_const =
      _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
  __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
     for (j = 0; j < 13; ++j) {
       assert(ref[i * stride - 13 + j] == ref[i * stride]);
       assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
     }
   }*/

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;

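        // Some samples fall outside the frame. Load each row, split the
        // 16-bit pixels into byte planes with warp_highbd_arrange_bytes,
        // replicate the frame-edge pixels into the out-of-range lanes with
        // the 8-bit warp_pad_left/warp_pad_right shuffles, then
        // re-interleave and filter as usual.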
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          const __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          const __m128i src2 =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

          const __m128i src_01 = _mm_shuffle_epi8(
              src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
          const __m128i src2_01 = _mm_shuffle_epi8(
              src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));

          __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
          __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);

          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
          }

          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
          }

          const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
          const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);

          highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
                              offset_bits_horiz, reduce_bits_horiz);
        }
      } else {
        highbd_prepare_warp_horizontal_filter(
            ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
            offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);

        // Load from tmp and rearrange pairs of consecutive rows into the
        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
        const __m128i *src = tmp + (k + 4);
        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

        // Filter even-index pixels
        const __m128i tmp_0 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_2 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_4 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_6 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

        const __m128i tmp_1 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_3 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_5 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_7 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
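
        // Compound path: round to the intermediate precision and store in
        // conv_params->dst. With do_average, blend with the value already
        // there (optionally distance-weighted), then round, clamp and write
        // the final pixels to pred.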
        if (conv_params->is_compound) {
          __m128i *const p =
              (__m128i *)&conv_params
                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
          res_lo = _mm_add_epi32(res_lo, res_add_const);
          res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
                                 reduce_bits_vert_shift);

          if (conv_params->do_average) {
            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
            __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));

            if (conv_params->use_dist_wtd_comp_avg) {
              res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
                                     _mm_mullo_epi32(res_lo, wt1));
              res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
            } else {
              res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
            }

            __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
            res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
                                     round_bits_shift);

            __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
            res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
            _mm_storel_epi64(dst16, res16_lo);
          } else {
            res_lo = _mm_packus_epi32(res_lo, res_lo);
            _mm_storel_epi64(p, res_lo);
          }
          if (p_width > 4) {
            __m128i *const p4 =
                (__m128i *)&conv_params
                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];

            res_hi = _mm_add_epi32(res_hi, res_add_const);
            res_hi =
                _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
                              reduce_bits_vert_shift);
            if (conv_params->do_average) {
              __m128i *const dst16_4 =
                  (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
              __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));

              if (conv_params->use_dist_wtd_comp_avg) {
                res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
                                       _mm_mullo_epi32(res_hi, wt1));
                res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
              } else {
                res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
              }

              __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
              res32_hi = _mm_sra_epi32(
                  _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
              __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
              res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
              _mm_storel_epi64(dst16_4, res16_hi);
            } else {
              res_hi = _mm_packus_epi32(res_hi, res_hi);
              _mm_storel_epi64(p4, res_hi);
            }
          }
        } else {
          // Round and pack into 16 bits
          const __m128i round_const =
              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                             ((1 << reduce_bits_vert) >> 1));

          const __m128i res_lo_round = _mm_srai_epi32(
              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
          const __m128i res_hi_round = _mm_srai_epi32(
              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);

          __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
          // Clamp res_16bit to the range [0, 2^bd - 1]
          const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
          const __m128i zero = _mm_setzero_si128();
          res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);

          // Store, blending with 'pred' if needed
          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

          // Note: If we're outputting a 4x4 block, we need to be very careful
          // to only output 4 pixels at this point, to avoid encode/decode
          // mismatches when encoding with multiple threads.
          if (p_width == 4) {
            _mm_storel_epi64(p, res_16bit);
          } else {
            _mm_storeu_si128(p, res_16bit);
          }
        }
      }
    }
  }
}