/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/arm/mem_neon.h"
#include "av1/common/arm/compound_convolve_neon.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"

DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
};
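// Each 16-byte row of the table above selects four overlapping 4-byte windows
// from a source vector, so a single TBL instruction gathers the sliding input
// windows for four adjacent outputs of a 4-tap dot product.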

static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples,
                                         const int8x8_t x_filter,
                                         const uint8x16_t permute_tbl,
                                         const int32x4_t horiz_const) {
  // Permute samples ready for dot product.
  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
  uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);

  // First 4 output values.
  int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, x_filter, 0);

  // We halved the convolution filter values so -1 from the right shift.
  return vshrn_n_s32(sum, ROUND0_BITS - 1);
}

static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples,
                                         const int8x8_t x_filter,
                                         const uint8x16x3_t permute_tbl,
                                         const int32x4_t horiz_const) {
  uint8x16_t permuted_samples[3];
  int32x4_t sum[2];

  // Permute samples ready for dot product.
  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);

  // First 4 output values.
  sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], x_filter, 0);
  sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
  // Second 4 output values.
  sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], x_filter, 0);
  sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);
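  // Lane 0 of x_filter holds taps 0-3 and lane 1 holds taps 4-7, so each
  // USDOT pairs one half of the 8-tap filter with the matching block of
  // permuted samples; permuted_samples[1] contributes to both sums.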

  // Narrow and re-pack.
  // We halved the convolution filter values so -1 from the right shift.
  return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
                      vshrn_n_s32(sum[1], ROUND0_BITS - 1));
}

static INLINE void dist_wtd_convolve_2d_horiz_neon_i8mm(
    const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
    const int16_t *x_filter_ptr, const int im_h, int w) {
  const int bd = 8;
  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  // (The extra -1 is needed because we halved the filter values.)
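  // With FILTER_BITS == 7 and ROUND0_BITS == 3 (as defined elsewhere in
  // libaom), this works out to (1 << 13) + (1 << 1): a bit-depth offset plus
  // half an LSB of the (ROUND0_BITS - 1)-bit shift, which makes the
  // truncating shift in the convolve helpers round to nearest.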
  const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) +
                                            (1 << ((ROUND0_BITS - 1) - 1)));

  const uint8_t *src_ptr = src;
  int16_t *dst_ptr = im_block;
  int dst_stride = im_stride;
  int height = im_h;

  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter =
        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);

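    // The four non-zero taps of a 4-tap kernel occupy positions 2-5 of the
    // 8-tap filter array (hence x_filter_ptr + 2 above), so step the source
    // pointer forward by two to keep the filter centered on the same pixels.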
    src_ptr += 2;

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);

      int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
      int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
      int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
      int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const);

      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

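    // im_h is h + clamped_y_taps - 1, which is not a multiple of four, so
    // process the remaining rows one at a time.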
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr);

      int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const);

      vst1_s16(dst_ptr, d0);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);

    do {
      const uint8_t *s = src_ptr;
      int16_t *d = dst_ptr;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
        int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
        int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
        int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const);

        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    do {
      const uint8_t *s = src_ptr;
      int16_t *d = dst_ptr;
      int width = w;

      do {
        uint8x16_t s0 = vld1q_u8(s);

        int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const);

        vst1q_s16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}

void av1_dist_wtd_convolve_2d_neon_i8mm(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  DECLARE_ALIGNED(16, int16_t,
                  im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
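  // Intermediate buffer for the horizontal pass: MAX_SB_SIZE-wide rows, with
  // (SUBPEL_TAPS - 1) extra rows for the pixels the vertical filter reads
  // above and below the block.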

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
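  // Vertical kernels shorter than six taps have zero-valued outer taps, so
  // they can run through the 6-tap path; only 6-tap and 8-tap vertical
  // implementations are needed below.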

  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  dist_wtd_convolve_2d_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride,
                                       x_filter_ptr, im_h, w);

  if (clamped_y_taps == 6) {
    if (conv_params->do_average) {
      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
        dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon(
            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
            w);
      } else {
        dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8,
                                                dst8_stride, conv_params,
                                                y_filter, h, w);
      }
    } else {
      dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params,
                                          y_filter, h, w);
    }
  } else {
    if (conv_params->do_average) {
      if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
        dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon(
            im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h,
            w);
      } else {
        dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8,
                                                dst8_stride, conv_params,
                                                y_filter, h, w);
      }
    } else {
      dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params,
                                          y_filter, h, w);
    }
  }
}

static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples,
                                       const int8x8_t x_filter,
                                       const uint8x16_t permute_tbl,
                                       const int32x4_t round_offset) {
  // Permute samples ready for dot product.
  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
  uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);

  // First 4 output values.
  int32x4_t sum = vusdotq_lane_s32(round_offset, permuted_samples, x_filter, 0);

  // We halved the convolution filter values so -1 from the right shift.
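  // The round_offset baked into the accumulator keeps the result
  // non-negative, so reinterpreting the narrowed value as unsigned is safe.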
  return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1));
}

static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples,
                                       const int8x8_t x_filter,
                                       const uint8x16x3_t permute_tbl,
                                       const int32x4_t round_offset) {
  uint8x16_t permuted_samples[3];
  int32x4_t sum[2];

  // Permute samples ready for dot product.
  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
  permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
  permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
  permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);

  // First 4 output values.
  sum[0] = vusdotq_lane_s32(round_offset, permuted_samples[0], x_filter, 0);
  sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1);
  // Second 4 output values.
  sum[1] = vusdotq_lane_s32(round_offset, permuted_samples[1], x_filter, 0);
  sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1);

  // Narrow and re-pack.
  // We halved the convolution filter values so -1 from the right shift.
  int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1),
                               vshrn_n_s32(sum[1], ROUND0_BITS - 1));
  return vreinterpretq_u16_s16(res);
}

static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
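  // With the usual 8-bit constants (FILTER_BITS == 7, ROUND0_BITS == 3,
  // COMPOUND_ROUND1_BITS == 7), offset_bits is 19 and round_offset is
  // (1 << 12) + (1 << 11) = 6144, the compound offset that keeps the
  // intermediate values non-negative.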
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  // (The extra -1 is needed because we halved the filter values.)
  const int32x4_t round_offset_shim = vdupq_n_s32(
      (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));
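  // round_offset is pre-shifted left by (ROUND0_BITS - 1) so that, after the
  // narrowing shift in the convolve helpers, it emerges unscaled alongside
  // the rounded dot product.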

  const uint16_t fwd_offset = conv_params->fwd_offset;
  const uint16_t bck_offset = conv_params->bck_offset;

  // Horizontal filter.
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - horiz_offset;
  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  uint8_t *dst8_ptr = dst8;
  int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter =
        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);

    src_ptr += 2;

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 =
          convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d1 =
          convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d2 =
          convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d3 =
          convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim);

      uint16x4_t dd0, dd1, dd2, dd3;
      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);

      uint8x8_t d01_u8, d23_u8;
      compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                               bck_offset, round_offset_vec, &d01_u8, &d23_u8);

      store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8);
      store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8);

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      dst8_ptr += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);

    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint16x8_t d0 =
            convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d1 =
            convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d2 =
            convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d3 =
            convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset,
                                 bck_offset, round_offset_vec, &d0_u8, &d1_u8,
                                 &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);

        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      dst8_ptr += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  }
}

static INLINE void dist_wtd_convolve_x_avg_neon_i8mm(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const int16x8_t round_offset_vec = vdupq_n_s16(round_offset);
  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  // (The extra -1 is needed because we halved the filter values.)
  const int32x4_t round_offset_shim = vdupq_n_s32(
      (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));

  // Horizontal filter.
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - horiz_offset;
  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  uint8_t *dst8_ptr = dst8;
  int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter =
        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);

    src_ptr += 2;

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 =
          convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d1 =
          convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d2 =
          convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d3 =
          convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim);

      uint16x4_t dd0, dd1, dd2, dd3;
      load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3);

      uint8x8_t d01_u8, d23_u8;
      compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                            round_offset_vec, &d01_u8, &d23_u8);

      store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8);
      store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8);

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      dst8_ptr += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);

    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      uint8_t *d_u8 = dst8_ptr;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint16x8_t d0 =
            convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d1 =
            convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d2 =
            convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d3 =
            convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);

        uint16x8_t dd0, dd1, dd2, dd3;
        load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);

        uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8;
        compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3,
                              round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8);

        store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8);

        s += 8;
        d += 8;
        d_u8 += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      dst8_ptr += 4 * dst8_stride;
      height -= 4;
    } while (height != 0);
  }
}

static INLINE void dist_wtd_convolve_x_neon_i8mm(
    const uint8_t *src, int src_stride, int w, int h,
    const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  assert(w % 4 == 0);
  assert(h % 4 == 0);

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                               (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding
  // shifts - which are generally faster than rounding shifts on modern CPUs.
  // (The extra -1 is needed because we halved the filter values.)
  const int32x4_t round_offset_shim = vdupq_n_s32(
      (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1)));

  // Horizontal filter.
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - horiz_offset;
  CONV_BUF_TYPE *dst_ptr = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  int height = h;

  if (w == 4) {
    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
    // 4-tap filters are used for blocks having width <= 4.
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter =
        vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1);

    src_ptr += 2;

    do {
      uint8x16_t s0, s1, s2, s3;
      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);

      uint16x4_t d0 =
          convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d1 =
          convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d2 =
          convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim);
      uint16x4_t d3 =
          convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim);

      store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
    // Filter values are even, so halve to reduce intermediate precision reqs.
    const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);

    do {
      const uint8_t *s = src_ptr;
      CONV_BUF_TYPE *d = dst_ptr;
      int width = w;

      do {
        uint8x16_t s0, s1, s2, s3;
        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);

        uint16x8_t d0 =
            convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d1 =
            convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d2 =
            convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim);
        uint16x8_t d3 =
            convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}

void av1_dist_wtd_convolve_x_neon_i8mm(
    const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params) {
  if (conv_params->do_average) {
    if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) {
      dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm(
          src, src_stride, dst8, dst8_stride, w, h, filter_params_x,
          subpel_x_qn, conv_params);
    } else {
      dist_wtd_convolve_x_avg_neon_i8mm(src, src_stride, dst8, dst8_stride, w,
                                        h, filter_params_x, subpel_x_qn,
                                        conv_params);
    }
  } else {
    dist_wtd_convolve_x_neon_i8mm(src, src_stride, w, h, filter_params_x,
                                  subpel_x_qn, conv_params);
  }
}