• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2023, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <arm_neon.h>
14 
15 #include "config/aom_config.h"
16 #include "config/av1_rtcd.h"
17 
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_dsp/arm/mem_neon.h"
20 #include "aom_ports/mem.h"
21 #include "av1/common/convolve.h"
22 #include "av1/common/filter.h"
23 #include "av1/common/arm/highbd_compound_convolve_neon.h"
24 #include "av1/common/arm/highbd_convolve_neon.h"
25 
highbd_12_convolve6_4(const int16x4_t s0,const int16x4_t s1,const int16x4_t s2,const int16x4_t s3,const int16x4_t s4,const int16x4_t s5,const int16x8_t filter,const int32x4_t offset)26 static INLINE uint16x4_t highbd_12_convolve6_4(
27     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
28     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
29     const int16x8_t filter, const int32x4_t offset) {
30   // Values at indices 0 and 7 of y_filter are zero.
31   const int16x4_t filter_0_3 = vget_low_s16(filter);
32   const int16x4_t filter_4_7 = vget_high_s16(filter);
33 
34   int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1);
35   sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
36   sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
37   sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
38   sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
39   sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);
40 
41   return vqshrun_n_s32(sum, ROUND0_BITS + 2);
42 }
43 
44 static INLINE uint16x4_t
highbd_convolve6_4(const int16x4_t s0,const int16x4_t s1,const int16x4_t s2,const int16x4_t s3,const int16x4_t s4,const int16x4_t s5,const int16x8_t filter,const int32x4_t offset)45 highbd_convolve6_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
46                    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
47                    const int16x8_t filter, const int32x4_t offset) {
48   // Values at indices 0 and 7 of y_filter are zero.
49   const int16x4_t filter_0_3 = vget_low_s16(filter);
50   const int16x4_t filter_4_7 = vget_high_s16(filter);
51 
52   int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1);
53   sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
54   sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
55   sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
56   sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
57   sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);
58 
59   return vqshrun_n_s32(sum, ROUND0_BITS);
60 }
61 
highbd_12_convolve6_8(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t filter,const int32x4_t offset)62 static INLINE uint16x8_t highbd_12_convolve6_8(
63     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
64     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
65     const int16x8_t filter, const int32x4_t offset) {
66   // Values at indices 0 and 7 of y_filter are zero.
67   const int16x4_t filter_0_3 = vget_low_s16(filter);
68   const int16x4_t filter_4_7 = vget_high_s16(filter);
69 
70   int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1);
71   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
72   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
73   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
74   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
75   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);
76 
77   int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1);
78   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
79   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
80   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
81   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
82   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);
83 
84   return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2),
85                       vqshrun_n_s32(sum1, ROUND0_BITS + 2));
86 }
87 
88 static INLINE uint16x8_t
highbd_convolve6_8(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t filter,const int32x4_t offset)89 highbd_convolve6_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
90                    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
91                    const int16x8_t filter, const int32x4_t offset) {
92   // Values at indices 0 and 7 of y_filter are zero.
93   const int16x4_t filter_0_3 = vget_low_s16(filter);
94   const int16x4_t filter_4_7 = vget_high_s16(filter);
95 
96   int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1);
97   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
98   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
99   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
100   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
101   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);
102 
103   int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1);
104   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
105   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
106   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
107   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
108   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);
109 
110   return vcombine_u16(vqshrun_n_s32(sum0, 3), vqshrun_n_s32(sum1, ROUND0_BITS));
111 }
112 
highbd_12_dist_wtd_convolve_x_6tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * x_filter_ptr,const int offset)113 static INLINE void highbd_12_dist_wtd_convolve_x_6tap_neon(
114     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
115     int w, int h, const int16_t *x_filter_ptr, const int offset) {
116   const int32x4_t offset_vec = vdupq_n_s32(offset);
117 
118   const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
119 
120   int height = h;
121 
122   do {
123     int width = w;
124     const int16_t *s = (const int16_t *)src_ptr;
125     uint16_t *d = dst_ptr;
126 
127     do {
128       int16x8_t s0[6], s1[6], s2[6], s3[6];
129       load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
130                    &s0[4], &s0[5]);
131       load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
132                    &s1[4], &s1[5]);
133       load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
134                    &s2[4], &s2[5]);
135       load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
136                    &s3[4], &s3[5]);
137 
138       uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
139                                             s0[5], x_filter, offset_vec);
140       uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
141                                             s1[5], x_filter, offset_vec);
142       uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
143                                             s2[5], x_filter, offset_vec);
144       uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
145                                             s3[5], x_filter, offset_vec);
146 
147       store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
148 
149       s += 8;
150       d += 8;
151       width -= 8;
152     } while (width != 0);
153     src_ptr += 4 * src_stride;
154     dst_ptr += 4 * dst_stride;
155     height -= 4;
156   } while (height != 0);
157 }
158 
highbd_dist_wtd_convolve_x_6tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * x_filter_ptr,const int offset)159 static INLINE void highbd_dist_wtd_convolve_x_6tap_neon(
160     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
161     int w, int h, const int16_t *x_filter_ptr, const int offset) {
162   const int32x4_t offset_vec = vdupq_n_s32(offset);
163 
164   const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
165 
166   int height = h;
167 
168   do {
169     int width = w;
170     const int16_t *s = (const int16_t *)src_ptr;
171     uint16_t *d = dst_ptr;
172 
173     do {
174       int16x8_t s0[6], s1[6], s2[6], s3[6];
175       load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
176                    &s0[4], &s0[5]);
177       load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
178                    &s1[4], &s1[5]);
179       load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
180                    &s2[4], &s2[5]);
181       load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
182                    &s3[4], &s3[5]);
183 
184       uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
185                                          s0[5], x_filter, offset_vec);
186       uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
187                                          s1[5], x_filter, offset_vec);
188       uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
189                                          s2[5], x_filter, offset_vec);
190       uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
191                                          s3[5], x_filter, offset_vec);
192 
193       store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
194 
195       s += 8;
196       d += 8;
197       width -= 8;
198     } while (width != 0);
199     src_ptr += 4 * src_stride;
200     dst_ptr += 4 * dst_stride;
201     height -= 4;
202   } while (height != 0);
203 }
204 
highbd_12_convolve8_4(const int16x4_t s0,const int16x4_t s1,const int16x4_t s2,const int16x4_t s3,const int16x4_t s4,const int16x4_t s5,const int16x4_t s6,const int16x4_t s7,const int16x8_t filter,const int32x4_t offset)205 static INLINE uint16x4_t highbd_12_convolve8_4(
206     const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
207     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
208     const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
209     const int32x4_t offset) {
210   const int16x4_t filter_0_3 = vget_low_s16(filter);
211   const int16x4_t filter_4_7 = vget_high_s16(filter);
212 
213   int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0);
214   sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
215   sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
216   sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
217   sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
218   sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
219   sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
220   sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
221 
222   return vqshrun_n_s32(sum, ROUND0_BITS + 2);
223 }
224 
225 static INLINE uint16x4_t
highbd_convolve8_4(const int16x4_t s0,const int16x4_t s1,const int16x4_t s2,const int16x4_t s3,const int16x4_t s4,const int16x4_t s5,const int16x4_t s6,const int16x4_t s7,const int16x8_t filter,const int32x4_t offset)226 highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
227                    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
228                    const int16x4_t s6, const int16x4_t s7,
229                    const int16x8_t filter, const int32x4_t offset) {
230   const int16x4_t filter_0_3 = vget_low_s16(filter);
231   const int16x4_t filter_4_7 = vget_high_s16(filter);
232 
233   int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0);
234   sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
235   sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
236   sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
237   sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
238   sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
239   sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
240   sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
241 
242   return vqshrun_n_s32(sum, ROUND0_BITS);
243 }
244 
highbd_12_convolve8_8(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t s6,const int16x8_t s7,const int16x8_t filter,const int32x4_t offset)245 static INLINE uint16x8_t highbd_12_convolve8_8(
246     const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
247     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
248     const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
249     const int32x4_t offset) {
250   const int16x4_t filter_0_3 = vget_low_s16(filter);
251   const int16x4_t filter_4_7 = vget_high_s16(filter);
252 
253   int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0);
254   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
255   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
256   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
257   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
258   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
259   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
260   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
261 
262   int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0);
263   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
264   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
265   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
266   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
267   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
268   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
269   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
270 
271   return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2),
272                       vqshrun_n_s32(sum1, ROUND0_BITS + 2));
273 }
274 
275 static INLINE uint16x8_t
highbd_convolve8_8(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t s6,const int16x8_t s7,const int16x8_t filter,const int32x4_t offset)276 highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
277                    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
278                    const int16x8_t s6, const int16x8_t s7,
279                    const int16x8_t filter, const int32x4_t offset) {
280   const int16x4_t filter_0_3 = vget_low_s16(filter);
281   const int16x4_t filter_4_7 = vget_high_s16(filter);
282 
283   int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0);
284   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
285   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
286   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
287   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
288   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
289   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
290   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
291 
292   int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0);
293   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
294   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
295   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
296   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
297   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
298   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
299   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
300 
301   return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS),
302                       vqshrun_n_s32(sum1, ROUND0_BITS));
303 }
304 
highbd_12_convolve4_4_x(const int16x4_t s[4],const int16x4_t x_filter,const int32x4_t offset)305 static INLINE uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4],
306                                                  const int16x4_t x_filter,
307                                                  const int32x4_t offset) {
308   int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
309   sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
310   sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
311   sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
312 
313   return vqshrun_n_s32(sum, 5);
314 }
315 
highbd_convolve4_4_x(const int16x4_t s[4],const int16x4_t x_filter,const int32x4_t offset)316 static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
317                                               const int16x4_t x_filter,
318                                               const int32x4_t offset) {
319   int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
320   sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
321   sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
322   sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
323 
324   return vqshrun_n_s32(sum, ROUND0_BITS);
325 }
326 
highbd_12_dist_wtd_convolve_x_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * x_filter_ptr,const int offset)327 static INLINE void highbd_12_dist_wtd_convolve_x_neon(
328     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
329     int w, int h, const int16_t *x_filter_ptr, const int offset) {
330   const int32x4_t offset_vec = vdupq_n_s32(offset);
331 
332   if (w == 4) {
333     // 4-tap filters are used for blocks having width == 4.
334     const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
335     const int16_t *s = (const int16_t *)(src_ptr + 2);
336     uint16_t *d = dst_ptr;
337 
338     do {
339       int16x4_t s0[4], s1[4], s2[4], s3[4];
340       load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
341       load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
342       load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
343       load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
344 
345       uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
346       uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec);
347       uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec);
348       uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec);
349 
350       store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
351 
352       s += 4 * src_stride;
353       d += 4 * dst_stride;
354       h -= 4;
355     } while (h != 0);
356   } else {
357     const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
358     int height = h;
359 
360     do {
361       int width = w;
362       const int16_t *s = (const int16_t *)src_ptr;
363       uint16_t *d = dst_ptr;
364 
365       do {
366         int16x8_t s0[8], s1[8], s2[8], s3[8];
367         load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
368                      &s0[4], &s0[5], &s0[6], &s0[7]);
369         load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
370                      &s1[4], &s1[5], &s1[6], &s1[7]);
371         load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
372                      &s2[4], &s2[5], &s2[6], &s2[7]);
373         load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
374                      &s3[4], &s3[5], &s3[6], &s3[7]);
375 
376         uint16x8_t d0 =
377             highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
378                                   s0[6], s0[7], x_filter, offset_vec);
379         uint16x8_t d1 =
380             highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
381                                   s1[6], s1[7], x_filter, offset_vec);
382         uint16x8_t d2 =
383             highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
384                                   s2[6], s2[7], x_filter, offset_vec);
385         uint16x8_t d3 =
386             highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5],
387                                   s3[6], s3[7], x_filter, offset_vec);
388 
389         store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
390 
391         s += 8;
392         d += 8;
393         width -= 8;
394       } while (width != 0);
395       src_ptr += 4 * src_stride;
396       dst_ptr += 4 * dst_stride;
397       height -= 4;
398     } while (height != 0);
399   }
400 }
401 
highbd_dist_wtd_convolve_x_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * x_filter_ptr,const int offset)402 static INLINE void highbd_dist_wtd_convolve_x_neon(
403     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
404     int w, int h, const int16_t *x_filter_ptr, const int offset) {
405   const int32x4_t offset_vec = vdupq_n_s32(offset);
406 
407   if (w == 4) {
408     // 4-tap filters are used for blocks having width == 4.
409     const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
410     const int16_t *s = (const int16_t *)(src_ptr + 2);
411     uint16_t *d = dst_ptr;
412 
413     do {
414       int16x4_t s0[4], s1[4], s2[4], s3[4];
415       load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
416       load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
417       load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
418       load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
419 
420       uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
421       uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec);
422       uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec);
423       uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec);
424 
425       store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
426 
427       s += 4 * src_stride;
428       d += 4 * dst_stride;
429       h -= 4;
430     } while (h != 0);
431   } else {
432     const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
433     int height = h;
434 
435     do {
436       int width = w;
437       const int16_t *s = (const int16_t *)src_ptr;
438       uint16_t *d = dst_ptr;
439 
440       do {
441         int16x8_t s0[8], s1[8], s2[8], s3[8];
442         load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
443                      &s0[4], &s0[5], &s0[6], &s0[7]);
444         load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
445                      &s1[4], &s1[5], &s1[6], &s1[7]);
446         load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
447                      &s2[4], &s2[5], &s2[6], &s2[7]);
448         load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
449                      &s3[4], &s3[5], &s3[6], &s3[7]);
450 
451         uint16x8_t d0 =
452             highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
453                                s0[7], x_filter, offset_vec);
454         uint16x8_t d1 =
455             highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6],
456                                s1[7], x_filter, offset_vec);
457         uint16x8_t d2 =
458             highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6],
459                                s2[7], x_filter, offset_vec);
460         uint16x8_t d3 =
461             highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6],
462                                s3[7], x_filter, offset_vec);
463 
464         store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
465 
466         s += 8;
467         d += 8;
468         width -= 8;
469       } while (width != 0);
470       src_ptr += 4 * src_stride;
471       dst_ptr += 4 * dst_stride;
472       height -= 4;
473     } while (height != 0);
474   }
475 }
476 
// Entry point for the horizontal-only compound convolution.
//
// Steps:
//  1. Pick the kernel for subpel_x_qn and back `src` up by taps / 2 - 1
//     samples.
//  2. Run the horizontal filter. The 6-tap routine is used when
//     get_filter_tap() reports at most 6 taps and w != 4 (it is handed
//     src + 1); otherwise the general routine is used. bd == 12 selects the
//     highbd_12_* variants.
//  3. If conv_params->do_average is clear, the filter output goes straight to
//     conv_params->dst and the function is done. If it is set, the filter
//     output goes to a local intermediate block (stride MAX_SB_SIZE) which is
//     then combined into `dst` by the distance-weighted or plain averaging
//     routine, chosen by conv_params->use_dist_wtd_comp_avg.
void av1_highbd_dist_wtd_convolve_x_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  CONV_BUF_TYPE *dst16 = conv_params->dst;
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  int dst16_stride = conv_params->dst_stride;
  const int im_stride = MAX_SB_SIZE;
  const int horiz_offset = filter_params_x->taps / 2 - 1;
  assert(FILTER_BITS == COMPOUND_ROUND1_BITS);

  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  // Offset handed to the averaging stage.
  const int offset_avg =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  // Accumulator seed for the filter stage; its first term is half of the
  // first-stage rounding step (1 << (round_0 - 1)).
  const int offset_convolve = (1 << (round_0 - 1)) +
                              (1 << (bd + FILTER_BITS)) +
                              (1 << (bd + FILTER_BITS - 1));

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  src -= horiz_offset;

  // Decide once where the filter writes and which kernel width to run.
  const int do_average = conv_params->do_average;
  const int use_6tap = x_filter_taps <= 6 && w != 4;
  uint16_t *conv_dst;
  int conv_stride;
  if (do_average) {
    conv_dst = im_block;
    conv_stride = im_stride;
  } else {
    conv_dst = dst16;
    conv_stride = dst16_stride;
  }

  if (bd == 12) {
    // horizontal filter
    if (use_6tap) {
      highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, conv_dst,
                                              conv_stride, w, h, x_filter_ptr,
                                              offset_convolve);
    } else {
      highbd_12_dist_wtd_convolve_x_neon(src, src_stride, conv_dst,
                                         conv_stride, w, h, x_filter_ptr,
                                         offset_convolve);
    }

    if (!do_average) return;

    if (conv_params->use_dist_wtd_comp_avg) {
      highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
                                       h, conv_params, offset_avg, bd);
    } else {
      highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                              conv_params, offset_avg, bd);
    }
    return;
  }

  // horizontal filter
  if (use_6tap) {
    highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, conv_dst,
                                         conv_stride, w, h, x_filter_ptr,
                                         offset_convolve);
  } else {
    highbd_dist_wtd_convolve_x_neon(src, src_stride, conv_dst, conv_stride, w,
                                    h, x_filter_ptr, offset_convolve);
  }

  if (!do_average) return;

  if (conv_params->use_dist_wtd_comp_avg) {
    highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                                  conv_params, offset_avg, bd);
  } else {
    highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                         conv_params, offset_avg, bd);
  }
}
558 
highbd_12_dist_wtd_convolve_y_6tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * y_filter_ptr,const int offset)559 static INLINE void highbd_12_dist_wtd_convolve_y_6tap_neon(
560     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
561     int w, int h, const int16_t *y_filter_ptr, const int offset) {
562   const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
563   const int32x4_t offset_vec = vdupq_n_s32(offset);
564 
565   if (w == 4) {
566     const int16_t *s = (const int16_t *)src_ptr;
567     uint16_t *d = dst_ptr;
568 
569     int16x4_t s0, s1, s2, s3, s4;
570     load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
571     s += 5 * src_stride;
572 
573     do {
574       int16x4_t s5, s6, s7, s8;
575       load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
576 
577       uint16x4_t d0 =
578           highbd_12_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
579       uint16x4_t d1 =
580           highbd_12_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
581       uint16x4_t d2 =
582           highbd_12_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
583       uint16x4_t d3 =
584           highbd_12_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
585 
586       store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
587 
588       s0 = s4;
589       s1 = s5;
590       s2 = s6;
591       s3 = s7;
592       s4 = s8;
593       s += 4 * src_stride;
594       d += 4 * dst_stride;
595       h -= 4;
596     } while (h != 0);
597   } else {
598     do {
599       int height = h;
600       const int16_t *s = (const int16_t *)src_ptr;
601       uint16_t *d = dst_ptr;
602 
603       int16x8_t s0, s1, s2, s3, s4;
604       load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
605       s += 5 * src_stride;
606 
607       do {
608         int16x8_t s5, s6, s7, s8;
609         load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
610 
611         uint16x8_t d0 =
612             highbd_12_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
613         uint16x8_t d1 =
614             highbd_12_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
615         uint16x8_t d2 =
616             highbd_12_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
617         uint16x8_t d3 =
618             highbd_12_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
619 
620         store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
621 
622         s0 = s4;
623         s1 = s5;
624         s2 = s6;
625         s3 = s7;
626         s4 = s8;
627         s += 4 * src_stride;
628         d += 4 * dst_stride;
629         height -= 4;
630       } while (height != 0);
631       src_ptr += 8;
632       dst_ptr += 8;
633       w -= 8;
634     } while (w != 0);
635   }
636 }
637 
// Vertical 6-tap filter pass of the compound y-only convolution.
//
// A block with w == 4 is filtered as a single 4-lane column; any other width
// is walked in 8-lane column strips. Output rows are produced four at a time
// from a sliding window of source rows, so the loops only terminate when h is
// a multiple of 4 and, on the wide path, w is a multiple of 8.
//
// `offset` is broadcast and used as the initial accumulator value handed to
// the highbd_convolve6_* kernels.
static INLINE void highbd_dist_wtd_convolve_y_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w != 4) {
    // Wide path: one 8-lane column strip per outer iteration.
    do {
      const int16_t *src_col = (const int16_t *)src_ptr;
      uint16_t *dst_col = dst_ptr;
      int rows_left = h;

      // Prime the window with the first five source rows.
      int16x8_t r0, r1, r2, r3, r4;
      load_s16_8x5(src_col, src_stride, &r0, &r1, &r2, &r3, &r4);
      src_col += 5 * src_stride;

      do {
        int16x8_t r5, r6, r7, r8;
        load_s16_8x4(src_col, src_stride, &r5, &r6, &r7, &r8);

        const uint16x8_t out0 =
            highbd_convolve6_8(r0, r1, r2, r3, r4, r5, y_filter, offset_vec);
        const uint16x8_t out1 =
            highbd_convolve6_8(r1, r2, r3, r4, r5, r6, y_filter, offset_vec);
        const uint16x8_t out2 =
            highbd_convolve6_8(r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
        const uint16x8_t out3 =
            highbd_convolve6_8(r3, r4, r5, r6, r7, r8, y_filter, offset_vec);

        store_u16_8x4(dst_col, dst_stride, out0, out1, out2, out3);

        // Slide the window down by four rows; the last five rows are reused.
        r0 = r4;
        r1 = r5;
        r2 = r6;
        r3 = r7;
        r4 = r8;
        src_col += 4 * src_stride;
        dst_col += 4 * dst_stride;
        rows_left -= 4;
      } while (rows_left != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
    return;
  }

  // Narrow path: a single 4-lane column.
  const int16_t *src_col = (const int16_t *)src_ptr;
  uint16_t *dst_col = dst_ptr;

  int16x4_t r0, r1, r2, r3, r4;
  load_s16_4x5(src_col, src_stride, &r0, &r1, &r2, &r3, &r4);
  src_col += 5 * src_stride;

  do {
    int16x4_t r5, r6, r7, r8;
    load_s16_4x4(src_col, src_stride, &r5, &r6, &r7, &r8);

    const uint16x4_t out0 =
        highbd_convolve6_4(r0, r1, r2, r3, r4, r5, y_filter, offset_vec);
    const uint16x4_t out1 =
        highbd_convolve6_4(r1, r2, r3, r4, r5, r6, y_filter, offset_vec);
    const uint16x4_t out2 =
        highbd_convolve6_4(r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
    const uint16x4_t out3 =
        highbd_convolve6_4(r3, r4, r5, r6, r7, r8, y_filter, offset_vec);

    store_u16_4x4(dst_col, dst_stride, out0, out1, out2, out3);

    r0 = r4;
    r1 = r5;
    r2 = r6;
    r3 = r7;
    r4 = r8;
    src_col += 4 * src_stride;
    dst_col += 4 * dst_stride;
    h -= 4;
  } while (h != 0);
}
716 
// Vertical 8-tap filter pass of the compound y-only convolution, using the
// highbd_12_convolve8_* kernels (the 12-bit variants).
//
// A block with w == 4 is filtered as a single 4-lane column; any other width
// is walked in 8-lane column strips. Four output rows are produced per
// iteration from a sliding window of source rows, so the loops only terminate
// when h is a multiple of 4 and, on the wide path, w is a multiple of 8.
static INLINE void highbd_12_dist_wtd_convolve_y_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w != 4) {
    // Wide path: one 8-lane column strip per outer iteration.
    do {
      const int16_t *src_col = (const int16_t *)src_ptr;
      uint16_t *dst_col = dst_ptr;
      int rows_left = h;

      // Prime the window with the first seven source rows.
      int16x8_t r0, r1, r2, r3, r4, r5, r6;
      load_s16_8x7(src_col, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6);
      src_col += 7 * src_stride;

      do {
        int16x8_t r7, r8, r9, r10;
        load_s16_8x4(src_col, src_stride, &r7, &r8, &r9, &r10);

        const uint16x8_t out0 = highbd_12_convolve8_8(
            r0, r1, r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
        const uint16x8_t out1 = highbd_12_convolve8_8(
            r1, r2, r3, r4, r5, r6, r7, r8, y_filter, offset_vec);
        const uint16x8_t out2 = highbd_12_convolve8_8(
            r2, r3, r4, r5, r6, r7, r8, r9, y_filter, offset_vec);
        const uint16x8_t out3 = highbd_12_convolve8_8(
            r3, r4, r5, r6, r7, r8, r9, r10, y_filter, offset_vec);

        store_u16_8x4(dst_col, dst_stride, out0, out1, out2, out3);

        // Slide the window down by four rows; the last seven rows are reused.
        r0 = r4;
        r1 = r5;
        r2 = r6;
        r3 = r7;
        r4 = r8;
        r5 = r9;
        r6 = r10;
        src_col += 4 * src_stride;
        dst_col += 4 * dst_stride;
        rows_left -= 4;
      } while (rows_left != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
    return;
  }

  // Narrow path: a single 4-lane column.
  const int16_t *src_col = (const int16_t *)src_ptr;
  uint16_t *dst_col = dst_ptr;

  int16x4_t r0, r1, r2, r3, r4, r5, r6;
  load_s16_4x7(src_col, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6);
  src_col += 7 * src_stride;

  do {
    int16x4_t r7, r8, r9, r10;
    load_s16_4x4(src_col, src_stride, &r7, &r8, &r9, &r10);

    const uint16x4_t out0 = highbd_12_convolve8_4(r0, r1, r2, r3, r4, r5, r6,
                                                  r7, y_filter, offset_vec);
    const uint16x4_t out1 = highbd_12_convolve8_4(r1, r2, r3, r4, r5, r6, r7,
                                                  r8, y_filter, offset_vec);
    const uint16x4_t out2 = highbd_12_convolve8_4(r2, r3, r4, r5, r6, r7, r8,
                                                  r9, y_filter, offset_vec);
    const uint16x4_t out3 = highbd_12_convolve8_4(r3, r4, r5, r6, r7, r8, r9,
                                                  r10, y_filter, offset_vec);

    store_u16_4x4(dst_col, dst_stride, out0, out1, out2, out3);

    r0 = r4;
    r1 = r5;
    r2 = r6;
    r3 = r7;
    r4 = r8;
    r5 = r9;
    r6 = r10;
    src_col += 4 * src_stride;
    dst_col += 4 * dst_stride;
    h -= 4;
  } while (h != 0);
}
// Vertical 8-tap filter pass of the compound y-only convolution, using the
// generic highbd_convolve8_* kernels.
//
// A block with w == 4 is filtered as a single 4-lane column; any other width
// is walked in 8-lane column strips. Four output rows are produced per
// iteration from a sliding window of source rows, so the loops only terminate
// when h is a multiple of 4 and, on the wide path, w is a multiple of 8.
static INLINE void highbd_dist_wtd_convolve_y_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w != 4) {
    // Wide path: one 8-lane column strip per outer iteration.
    do {
      const int16_t *src_col = (const int16_t *)src_ptr;
      uint16_t *dst_col = dst_ptr;
      int rows_left = h;

      // Prime the window with the first seven source rows.
      int16x8_t r0, r1, r2, r3, r4, r5, r6;
      load_s16_8x7(src_col, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6);
      src_col += 7 * src_stride;

      do {
        int16x8_t r7, r8, r9, r10;
        load_s16_8x4(src_col, src_stride, &r7, &r8, &r9, &r10);

        const uint16x8_t out0 = highbd_convolve8_8(r0, r1, r2, r3, r4, r5, r6,
                                                   r7, y_filter, offset_vec);
        const uint16x8_t out1 = highbd_convolve8_8(r1, r2, r3, r4, r5, r6, r7,
                                                   r8, y_filter, offset_vec);
        const uint16x8_t out2 = highbd_convolve8_8(r2, r3, r4, r5, r6, r7, r8,
                                                   r9, y_filter, offset_vec);
        const uint16x8_t out3 = highbd_convolve8_8(r3, r4, r5, r6, r7, r8, r9,
                                                   r10, y_filter, offset_vec);

        store_u16_8x4(dst_col, dst_stride, out0, out1, out2, out3);

        // Slide the window down by four rows; the last seven rows are reused.
        r0 = r4;
        r1 = r5;
        r2 = r6;
        r3 = r7;
        r4 = r8;
        r5 = r9;
        r6 = r10;
        src_col += 4 * src_stride;
        dst_col += 4 * dst_stride;
        rows_left -= 4;
      } while (rows_left != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
    return;
  }

  // Narrow path: a single 4-lane column.
  const int16_t *src_col = (const int16_t *)src_ptr;
  uint16_t *dst_col = dst_ptr;

  int16x4_t r0, r1, r2, r3, r4, r5, r6;
  load_s16_4x7(src_col, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6);
  src_col += 7 * src_stride;

  do {
    int16x4_t r7, r8, r9, r10;
    load_s16_4x4(src_col, src_stride, &r7, &r8, &r9, &r10);

    const uint16x4_t out0 = highbd_convolve8_4(r0, r1, r2, r3, r4, r5, r6, r7,
                                               y_filter, offset_vec);
    const uint16x4_t out1 = highbd_convolve8_4(r1, r2, r3, r4, r5, r6, r7, r8,
                                               y_filter, offset_vec);
    const uint16x4_t out2 = highbd_convolve8_4(r2, r3, r4, r5, r6, r7, r8, r9,
                                               y_filter, offset_vec);
    const uint16x4_t out3 = highbd_convolve8_4(r3, r4, r5, r6, r7, r8, r9, r10,
                                               y_filter, offset_vec);

    store_u16_4x4(dst_col, dst_stride, out0, out1, out2, out3);

    r0 = r4;
    r1 = r5;
    r2 = r6;
    r3 = r7;
    r4 = r8;
    r5 = r9;
    r6 = r10;
    src_col += 4 * src_stride;
    dst_col += 4 * dst_stride;
    h -= 4;
  } while (h != 0);
}
881 
// Entry point for the high-bitdepth compound vertical-only convolution.
//
// The work is done in up to two stages:
//   1. A vertical filter pass. The 6-tap or 8-tap kernel is chosen from
//      get_filter_tap(), and the highbd_12_* variants are used when bd == 12.
//      The result goes to a local scratch block when conv_params->do_average
//      is set, otherwise to conv_params->dst.
//   2. Only when do_average is set: the scratch block is combined into `dst`
//      by one of the *_comp_avg_neon helpers, selected by
//      conv_params->use_dist_wtd_comp_avg and by bd == 12.
void av1_highbd_dist_wtd_convolve_y_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
    ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  const int im_stride = MAX_SB_SIZE;

  const int use_6tap = get_filter_tap(filter_params_y, subpel_y_qn) <= 6;
  const int vert_offset = filter_params_y->taps / 2 - 1;
  assert(FILTER_BITS == COMPOUND_ROUND1_BITS);

  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int avg_shift = offset_bits - conv_params->round_1;
  const int round_offset_avg = (1 << avg_shift) + (1 << (avg_shift - 1));
  const int round_offset_conv = (1 << (conv_params->round_0 - 1)) +
                                (1 << (bd + FILTER_BITS)) +
                                (1 << (bd + FILTER_BITS - 1));

  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  // Choose where the filter pass writes: scratch block when averaging,
  // the compound buffer otherwise.
  const int do_average = conv_params->do_average;
  uint16_t *conv_dst = conv_params->dst;
  int conv_stride = conv_params->dst_stride;
  if (do_average) {
    conv_dst = im_block;
    conv_stride = im_stride;
  }

  // Back up to the first row the filter needs. The 6-tap kernels start one
  // row later than the 8-tap ones.
  const uint16_t *conv_src = src - vert_offset * src_stride;
  if (use_6tap) {
    conv_src += src_stride;
  }

  if (bd == 12) {
    if (use_6tap) {
      highbd_12_dist_wtd_convolve_y_6tap_neon(conv_src, src_stride, conv_dst,
                                              conv_stride, w, h, y_filter_ptr,
                                              round_offset_conv);
    } else {
      highbd_12_dist_wtd_convolve_y_8tap_neon(conv_src, src_stride, conv_dst,
                                              conv_stride, w, h, y_filter_ptr,
                                              round_offset_conv);
    }
  } else {
    if (use_6tap) {
      highbd_dist_wtd_convolve_y_6tap_neon(conv_src, src_stride, conv_dst,
                                           conv_stride, w, h, y_filter_ptr,
                                           round_offset_conv);
    } else {
      highbd_dist_wtd_convolve_y_8tap_neon(conv_src, src_stride, conv_dst,
                                           conv_stride, w, h, y_filter_ptr,
                                           round_offset_conv);
    }
  }

  if (!do_average) {
    return;
  }

  if (bd == 12) {
    if (conv_params->use_dist_wtd_comp_avg) {
      highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
                                       h, conv_params, round_offset_avg, bd);
    } else {
      highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                              conv_params, round_offset_avg, bd);
    }
  } else {
    if (conv_params->use_dist_wtd_comp_avg) {
      highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                                    conv_params, round_offset_avg, bd);
    } else {
      highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                           conv_params, round_offset_avg, bd);
    }
  }
}
967 
// Copies a w x h block while applying `(sample << round_bits) + offset` to
// every 16-bit sample (vshl shifts left for a positive per-lane count).
//
// Blocks with w <= 4 are handled one 4-lane vector per row; a full 4-lane
// vector is always loaded, but only two lanes are stored when w == 2. Wider
// blocks are processed in steps of eight samples per row.
static INLINE void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride,
                                       uint16_t *dst_ptr, int dst_stride, int w,
                                       int h, const int round_bits,
                                       const int offset) {
  if (w > 4) {
    const uint16x8_t offset_u16 = vdupq_n_u16(offset);
    const int16x8_t round_shift_s16 = vdupq_n_s16(round_bits);

    for (int row = 0; row < h; ++row) {
      const uint16_t *src_row = src_ptr + row * src_stride;
      uint16_t *dst_row = dst_ptr + row * dst_stride;

      for (int col = 0; col < w; col += 8) {
        const uint16x8_t shifted =
            vshlq_u16(vld1q_u16(src_row + col), round_shift_s16);
        vst1q_u16(dst_row + col, vaddq_u16(shifted, offset_u16));
      }
    }
    return;
  }

  const uint16x4_t offset_u16 = vdup_n_u16(offset);
  const int16x4_t round_shift_s16 = vdup_n_s16(round_bits);
  const int two_wide = (w == 2);

  for (int row = 0; row < h; ++row) {
    const uint16x4_t shifted =
        vshl_u16(vld1_u16(src_ptr + row * src_stride), round_shift_s16);
    const uint16x4_t result = vadd_u16(shifted, offset_u16);
    uint16_t *dst_row = dst_ptr + row * dst_stride;

    if (two_wide) {
      store_u16_2x1(dst_row, result);
    } else {
      vst1_u16(dst_row, result);
    }
  }
}
1000 
// Entry point for the high-bitdepth compound "copy" convolution (no
// filtering, only scaling and offsetting of the source samples).
//
// Without averaging, the scaled samples are written straight to
// conv_params->dst. With conv_params->do_average set, they go to a local
// scratch block first and are then combined into `dst` by one of the
// *_comp_avg_neon helpers, selected by conv_params->use_dist_wtd_comp_avg and
// by bd == 12.
void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src,
                                               int src_stride, uint16_t *dst,
                                               int dst_stride, int w, int h,
                                               ConvolveParams *conv_params,
                                               int bd) {
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  const int im_stride = MAX_SB_SIZE;

  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int avg_shift = offset_bits - conv_params->round_1;
  const int round_offset = (1 << avg_shift) + (1 << (avg_shift - 1));
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  assert(round_bits >= 0);

  if (!conv_params->do_average) {
    // First prediction of the pair: just fill the compound buffer.
    CONV_BUF_TYPE *compound_buf = conv_params->dst;
    const int compound_stride = conv_params->dst_stride;
    highbd_2d_copy_neon(src, src_stride, compound_buf, compound_stride, w, h,
                        round_bits, round_offset);
    return;
  }

  highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits,
                      round_offset);

  const int is_12bit = (bd == 12);
  if (conv_params->use_dist_wtd_comp_avg) {
    if (is_12bit) {
      highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
                                       h, conv_params, round_offset, bd);
    } else {
      highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                                    conv_params, round_offset, bd);
    }
  } else {
    if (is_12bit) {
      highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                              conv_params, round_offset, bd);
    } else {
      highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
                           conv_params, round_offset, bd);
    }
  }
}
1047 
// 6-tap multiply-accumulate over four lanes for the vertical pass of the 2D
// compound convolution.
//
// Starts from `offset`, adds s0..s5 weighted by filter lanes 1..6 (lanes 0
// and 7 of y_filter are zero for these kernels and are skipped), then
// narrows to 16 bits with a saturating, rounding right shift by
// COMPOUND_ROUND1_BITS.
static INLINE uint16x4_t highbd_convolve6_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t y_filter, const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(y_filter);
  const int16x4_t taps_hi = vget_high_s16(y_filter);

  int32x4_t acc = offset;
  acc = vmlal_lane_s16(acc, s0, taps_lo, 1);
  acc = vmlal_lane_s16(acc, s1, taps_lo, 2);
  acc = vmlal_lane_s16(acc, s2, taps_lo, 3);
  acc = vmlal_lane_s16(acc, s3, taps_hi, 0);
  acc = vmlal_lane_s16(acc, s4, taps_hi, 1);
  acc = vmlal_lane_s16(acc, s5, taps_hi, 2);

  return vqrshrun_n_s32(acc, COMPOUND_ROUND1_BITS);
}
1065 
// 6-tap multiply-accumulate over eight lanes for the vertical pass of the 2D
// compound convolution.
//
// The low and high halves of each input row are accumulated separately in
// 32 bits, both starting from `offset`. Filter lanes 1..6 are used (lanes 0
// and 7 of y_filter are zero for these kernels). Each half is then narrowed
// with a saturating, rounding right shift by COMPOUND_ROUND1_BITS and the two
// halves are recombined.
static INLINE uint16x8_t highbd_convolve6_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t y_filter, const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(y_filter);
  const int16x4_t taps_hi = vget_high_s16(y_filter);

  int32x4_t acc_lo = offset;
  int32x4_t acc_hi = offset;

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_lo, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_lo, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 3);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_hi, 0);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 2);

  const uint16x4_t out_lo = vqrshrun_n_s32(acc_lo, COMPOUND_ROUND1_BITS);
  const uint16x4_t out_hi = vqrshrun_n_s32(acc_hi, COMPOUND_ROUND1_BITS);
  return vcombine_u16(out_lo, out_hi);
}
1091 
// Vertical 6-tap pass of the 2D compound convolution, built on the
// highbd_convolve6_*_2d_v kernels.
//
// A block with w == 4 is filtered as a single 4-lane column; any other width
// is walked in 8-lane column strips. Four output rows are produced per
// iteration from a sliding window of source rows, so the loops only terminate
// when h is a multiple of 4 and, on the wide path, w is a multiple of 8.
static INLINE void highbd_dist_wtd_convolve_2d_vert_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w != 4) {
    // Wide path: one 8-lane column strip per outer iteration.
    do {
      const int16_t *src_col = (const int16_t *)src_ptr;
      uint16_t *dst_col = dst_ptr;
      int rows_left = h;

      // Prime the window with the first five source rows.
      int16x8_t r0, r1, r2, r3, r4;
      load_s16_8x5(src_col, src_stride, &r0, &r1, &r2, &r3, &r4);
      src_col += 5 * src_stride;

      do {
        int16x8_t r5, r6, r7, r8;
        load_s16_8x4(src_col, src_stride, &r5, &r6, &r7, &r8);

        const uint16x8_t out0 = highbd_convolve6_8_2d_v(r0, r1, r2, r3, r4, r5,
                                                        y_filter, offset_vec);
        const uint16x8_t out1 = highbd_convolve6_8_2d_v(r1, r2, r3, r4, r5, r6,
                                                        y_filter, offset_vec);
        const uint16x8_t out2 = highbd_convolve6_8_2d_v(r2, r3, r4, r5, r6, r7,
                                                        y_filter, offset_vec);
        const uint16x8_t out3 = highbd_convolve6_8_2d_v(r3, r4, r5, r6, r7, r8,
                                                        y_filter, offset_vec);

        store_u16_8x4(dst_col, dst_stride, out0, out1, out2, out3);

        // Slide the window down by four rows; the last five rows are reused.
        r0 = r4;
        r1 = r5;
        r2 = r6;
        r3 = r7;
        r4 = r8;
        src_col += 4 * src_stride;
        dst_col += 4 * dst_stride;
        rows_left -= 4;
      } while (rows_left != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
    return;
  }

  // Narrow path: a single 4-lane column.
  const int16_t *src_col = (const int16_t *)src_ptr;
  uint16_t *dst_col = dst_ptr;

  int16x4_t r0, r1, r2, r3, r4;
  load_s16_4x5(src_col, src_stride, &r0, &r1, &r2, &r3, &r4);
  src_col += 5 * src_stride;

  do {
    int16x4_t r5, r6, r7, r8;
    load_s16_4x4(src_col, src_stride, &r5, &r6, &r7, &r8);

    const uint16x4_t out0 =
        highbd_convolve6_4_2d_v(r0, r1, r2, r3, r4, r5, y_filter, offset_vec);
    const uint16x4_t out1 =
        highbd_convolve6_4_2d_v(r1, r2, r3, r4, r5, r6, y_filter, offset_vec);
    const uint16x4_t out2 =
        highbd_convolve6_4_2d_v(r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
    const uint16x4_t out3 =
        highbd_convolve6_4_2d_v(r3, r4, r5, r6, r7, r8, y_filter, offset_vec);

    store_u16_4x4(dst_col, dst_stride, out0, out1, out2, out3);

    r0 = r4;
    r1 = r5;
    r2 = r6;
    r3 = r7;
    r4 = r8;
    src_col += 4 * src_stride;
    dst_col += 4 * dst_stride;
    h -= 4;
  } while (h != 0);
}
1170 
// 8-tap multiply-accumulate over four lanes for the vertical pass of the 2D
// compound convolution.
//
// Starts from `offset`, adds s0..s7 weighted by filter lanes 0..7, then
// narrows to 16 bits with a saturating, rounding right shift by
// COMPOUND_ROUND1_BITS.
static INLINE uint16x4_t highbd_convolve8_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
    const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(y_filter);
  const int16x4_t taps_hi = vget_high_s16(y_filter);

  int32x4_t acc = offset;
  acc = vmlal_lane_s16(acc, s0, taps_lo, 0);
  acc = vmlal_lane_s16(acc, s1, taps_lo, 1);
  acc = vmlal_lane_s16(acc, s2, taps_lo, 2);
  acc = vmlal_lane_s16(acc, s3, taps_lo, 3);
  acc = vmlal_lane_s16(acc, s4, taps_hi, 0);
  acc = vmlal_lane_s16(acc, s5, taps_hi, 1);
  acc = vmlal_lane_s16(acc, s6, taps_hi, 2);
  acc = vmlal_lane_s16(acc, s7, taps_hi, 3);

  return vqrshrun_n_s32(acc, COMPOUND_ROUND1_BITS);
}
1190 
// 8-tap multiply-accumulate over eight lanes for the vertical pass of the 2D
// compound convolution.
//
// The low and high halves of each input row are accumulated separately in
// 32 bits, both starting from `offset` and using filter lanes 0..7. Each half
// is then narrowed with a saturating, rounding right shift by
// COMPOUND_ROUND1_BITS and the two halves are recombined.
static INLINE uint16x8_t highbd_convolve8_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
    const int32x4_t offset) {
  const int16x4_t taps_lo = vget_low_s16(y_filter);
  const int16x4_t taps_hi = vget_high_s16(y_filter);

  int32x4_t acc_lo = offset;
  int32x4_t acc_hi = offset;

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s0), taps_lo, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s0), taps_lo, 0);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s1), taps_lo, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s1), taps_lo, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s2), taps_lo, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s2), taps_lo, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s3), taps_lo, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s3), taps_lo, 3);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s4), taps_hi, 0);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s4), taps_hi, 0);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s5), taps_hi, 1);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s5), taps_hi, 1);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s6), taps_hi, 2);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s6), taps_hi, 2);

  acc_lo = vmlal_lane_s16(acc_lo, vget_low_s16(s7), taps_hi, 3);
  acc_hi = vmlal_lane_s16(acc_hi, vget_high_s16(s7), taps_hi, 3);

  const uint16x4_t out_lo = vqrshrun_n_s32(acc_lo, COMPOUND_ROUND1_BITS);
  const uint16x4_t out_hi = vqrshrun_n_s32(acc_hi, COMPOUND_ROUND1_BITS);
  return vcombine_u16(out_lo, out_hi);
}
1220 
// Vertical 8-tap pass of the 2D compound convolution, built on the
// highbd_convolve8_*_2d_v kernels.
//
// Blocks with w <= 4 are filtered as a single 4-lane column; wider blocks are
// walked in 8-lane column strips. Four output rows are produced per iteration
// from a sliding window of source rows, so the loops only terminate when h is
// a multiple of 4 and, on the wide path, w is a multiple of 8.
static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int offset) {
  const int32x4_t offset_vec = vdupq_n_s32(offset);
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w > 4) {
    // Wide path: one 8-lane column strip per outer iteration.
    do {
      const int16_t *src_col = (const int16_t *)src_ptr;
      uint16_t *dst_col = dst_ptr;
      int rows_left = h;

      // Prime the window with the first seven source rows.
      int16x8_t r0, r1, r2, r3, r4, r5, r6;
      load_s16_8x7(src_col, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6);
      src_col += 7 * src_stride;

      do {
        int16x8_t r7, r8, r9, r10;
        load_s16_8x4(src_col, src_stride, &r7, &r8, &r9, &r10);

        const uint16x8_t out0 = highbd_convolve8_8_2d_v(
            r0, r1, r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
        const uint16x8_t out1 = highbd_convolve8_8_2d_v(
            r1, r2, r3, r4, r5, r6, r7, r8, y_filter, offset_vec);
        const uint16x8_t out2 = highbd_convolve8_8_2d_v(
            r2, r3, r4, r5, r6, r7, r8, r9, y_filter, offset_vec);
        const uint16x8_t out3 = highbd_convolve8_8_2d_v(
            r3, r4, r5, r6, r7, r8, r9, r10, y_filter, offset_vec);

        store_u16_8x4(dst_col, dst_stride, out0, out1, out2, out3);

        // Slide the window down by four rows; the last seven rows are reused.
        r0 = r4;
        r1 = r5;
        r2 = r6;
        r3 = r7;
        r4 = r8;
        r5 = r9;
        r6 = r10;
        src_col += 4 * src_stride;
        dst_col += 4 * dst_stride;
        rows_left -= 4;
      } while (rows_left != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
    return;
  }

  // Narrow path: a single 4-lane column.
  const int16_t *src_col = (const int16_t *)src_ptr;
  uint16_t *dst_col = dst_ptr;

  int16x4_t r0, r1, r2, r3, r4, r5, r6;
  load_s16_4x7(src_col, src_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6);
  src_col += 7 * src_stride;

  do {
    int16x4_t r7, r8, r9, r10;
    load_s16_4x4(src_col, src_stride, &r7, &r8, &r9, &r10);

    const uint16x4_t out0 = highbd_convolve8_4_2d_v(
        r0, r1, r2, r3, r4, r5, r6, r7, y_filter, offset_vec);
    const uint16x4_t out1 = highbd_convolve8_4_2d_v(
        r1, r2, r3, r4, r5, r6, r7, r8, y_filter, offset_vec);
    const uint16x4_t out2 = highbd_convolve8_4_2d_v(
        r2, r3, r4, r5, r6, r7, r8, r9, y_filter, offset_vec);
    const uint16x4_t out3 = highbd_convolve8_4_2d_v(
        r3, r4, r5, r6, r7, r8, r9, r10, y_filter, offset_vec);

    store_u16_4x4(dst_col, dst_stride, out0, out1, out2, out3);

    r0 = r4;
    r1 = r5;
    r2 = r6;
    r3 = r7;
    r4 = r8;
    r5 = r9;
    r6 = r10;
    src_col += 4 * src_stride;
    dst_col += 4 * dst_stride;
    h -= 4;
  } while (h != 0);
}
1303 
// Horizontal 6-tap pass of the 2D compound convolution, using the
// highbd_12_convolve6_8 kernel (the 12-bit variant).
//
// Rows are processed in two phases: groups of four rows while more than four
// remain, then the remaining one to four rows individually. Within a row,
// eight output samples are produced per step, so the column loops only
// terminate when w is a multiple of 8.
static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // Blocks are at least 4 rows tall and the horizontal pass also has to
  // produce the extra (filter_taps/2 - 1) rows consumed by the vertical
  // pass, hence at least 5 rows here.
  assert(h >= 5);
  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  int rows_left = h;

  // Phase 1: four rows per pass.
  do {
    const int16_t *src_row = (const int16_t *)src_ptr;
    uint16_t *dst_row = dst_ptr;
    int cols_left = w;

    do {
      // For each of the four rows, fetch six vectors with an element stride
      // of 1 (presumably six overlapping windows, one per filter tap).
      int16x8_t a0, a1, a2, a3, a4, a5;
      int16x8_t b0, b1, b2, b3, b4, b5;
      int16x8_t c0, c1, c2, c3, c4, c5;
      int16x8_t e0, e1, e2, e3, e4, e5;
      load_s16_8x6(src_row + 0 * src_stride, 1, &a0, &a1, &a2, &a3, &a4, &a5);
      load_s16_8x6(src_row + 1 * src_stride, 1, &b0, &b1, &b2, &b3, &b4, &b5);
      load_s16_8x6(src_row + 2 * src_stride, 1, &c0, &c1, &c2, &c3, &c4, &c5);
      load_s16_8x6(src_row + 3 * src_stride, 1, &e0, &e1, &e2, &e3, &e4, &e5);

      const uint16x8_t out0 =
          highbd_12_convolve6_8(a0, a1, a2, a3, a4, a5, x_filter, offset_vec);
      const uint16x8_t out1 =
          highbd_12_convolve6_8(b0, b1, b2, b3, b4, b5, x_filter, offset_vec);
      const uint16x8_t out2 =
          highbd_12_convolve6_8(c0, c1, c2, c3, c4, c5, x_filter, offset_vec);
      const uint16x8_t out3 =
          highbd_12_convolve6_8(e0, e1, e2, e3, e4, e5, x_filter, offset_vec);

      store_u16_8x4(dst_row, dst_stride, out0, out1, out2, out3);

      src_row += 8;
      dst_row += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    rows_left -= 4;
  } while (rows_left > 4);

  // Phase 2: the last one to four rows, one row per pass.
  do {
    const int16_t *src_row = (const int16_t *)src_ptr;
    uint16_t *dst_row = dst_ptr;
    int cols_left = w;

    do {
      int16x8_t a0, a1, a2, a3, a4, a5;
      load_s16_8x6(src_row, 1, &a0, &a1, &a2, &a3, &a4, &a5);

      const uint16x8_t out0 =
          highbd_12_convolve6_8(a0, a1, a2, a3, a4, a5, x_filter, offset_vec);
      vst1q_u16(dst_row, out0);

      src_row += 8;
      dst_row += 8;
      cols_left -= 8;
    } while (cols_left != 0);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
    --rows_left;
  } while (rows_left != 0);
}
1373 
highbd_dist_wtd_convolve_2d_horiz_6tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * x_filter_ptr,const int offset)1374 static INLINE void highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
1375     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
1376     int w, int h, const int16_t *x_filter_ptr, const int offset) {
1377   // The smallest block height is 4, and the horizontal convolution needs to
1378   // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
1379   assert(h >= 5);
1380   const int32x4_t offset_vec = vdupq_n_s32(offset);
1381 
1382   const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
1383 
1384   int height = h;
1385 
1386   do {
1387     int width = w;
1388     const int16_t *s = (const int16_t *)src_ptr;
1389     uint16_t *d = dst_ptr;
1390 
1391     do {
1392       int16x8_t s0[6], s1[6], s2[6], s3[6];
1393       load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
1394                    &s0[4], &s0[5]);
1395       load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
1396                    &s1[4], &s1[5]);
1397       load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
1398                    &s2[4], &s2[5]);
1399       load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
1400                    &s3[4], &s3[5]);
1401 
1402       uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
1403                                          s0[5], x_filter, offset_vec);
1404       uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
1405                                          s1[5], x_filter, offset_vec);
1406       uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
1407                                          s2[5], x_filter, offset_vec);
1408       uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
1409                                          s3[5], x_filter, offset_vec);
1410 
1411       store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
1412 
1413       s += 8;
1414       d += 8;
1415       width -= 8;
1416     } while (width != 0);
1417     src_ptr += 4 * src_stride;
1418     dst_ptr += 4 * dst_stride;
1419     height -= 4;
1420   } while (height > 4);
1421 
1422   do {
1423     int width = w;
1424     const int16_t *s = (const int16_t *)src_ptr;
1425     uint16_t *d = dst_ptr;
1426 
1427     do {
1428       int16x8_t s0[6];
1429       load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);
1430 
1431       uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
1432                                          s0[5], x_filter, offset_vec);
1433       vst1q_u16(d, d0);
1434 
1435       s += 8;
1436       d += 8;
1437       width -= 8;
1438     } while (width != 0);
1439     src_ptr += src_stride;
1440     dst_ptr += dst_stride;
1441   } while (--height != 0);
1442 }
1443 
highbd_12_dist_wtd_convolve_2d_horiz_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * x_filter_ptr,const int offset)1444 static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_neon(
1445     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
1446     int w, int h, const int16_t *x_filter_ptr, const int offset) {
1447   // The smallest block height is 4, and the horizontal convolution needs to
1448   // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
1449   assert(h >= 5);
1450   const int32x4_t offset_vec = vdupq_n_s32(offset);
1451 
1452   if (w == 4) {
1453     // 4-tap filters are used for blocks having width == 4.
1454     const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
1455     const int16_t *s = (const int16_t *)(src_ptr + 1);
1456     uint16_t *d = dst_ptr;
1457 
1458     do {
1459       int16x4_t s0[4], s1[4], s2[4], s3[4];
1460       load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
1461       load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
1462       load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
1463       load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
1464 
1465       uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
1466       uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec);
1467       uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec);
1468       uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec);
1469 
1470       store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
1471 
1472       s += 4 * src_stride;
1473       d += 4 * dst_stride;
1474       h -= 4;
1475     } while (h > 4);
1476 
1477     do {
1478       int16x4_t s0[4];
1479       load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
1480 
1481       uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
1482       vst1_u16(d, d0);
1483 
1484       s += src_stride;
1485       d += dst_stride;
1486     } while (--h != 0);
1487   } else {
1488     const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
1489     int height = h;
1490 
1491     do {
1492       int width = w;
1493       const int16_t *s = (const int16_t *)src_ptr;
1494       uint16_t *d = dst_ptr;
1495 
1496       do {
1497         int16x8_t s0[8], s1[8], s2[8], s3[8];
1498         load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
1499                      &s0[4], &s0[5], &s0[6], &s0[7]);
1500         load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
1501                      &s1[4], &s1[5], &s1[6], &s1[7]);
1502         load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
1503                      &s2[4], &s2[5], &s2[6], &s2[7]);
1504         load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
1505                      &s3[4], &s3[5], &s3[6], &s3[7]);
1506 
1507         uint16x8_t d0 =
1508             highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
1509                                   s0[6], s0[7], x_filter, offset_vec);
1510         uint16x8_t d1 =
1511             highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
1512                                   s1[6], s1[7], x_filter, offset_vec);
1513         uint16x8_t d2 =
1514             highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
1515                                   s2[6], s2[7], x_filter, offset_vec);
1516         uint16x8_t d3 =
1517             highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5],
1518                                   s3[6], s3[7], x_filter, offset_vec);
1519 
1520         store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
1521 
1522         s += 8;
1523         d += 8;
1524         width -= 8;
1525       } while (width != 0);
1526       src_ptr += 4 * src_stride;
1527       dst_ptr += 4 * dst_stride;
1528       height -= 4;
1529     } while (height > 4);
1530 
1531     do {
1532       int width = w;
1533       const int16_t *s = (const int16_t *)src_ptr;
1534       uint16_t *d = dst_ptr;
1535 
1536       do {
1537         int16x8_t s0[8];
1538         load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
1539                      &s0[4], &s0[5], &s0[6], &s0[7]);
1540 
1541         uint16x8_t d0 =
1542             highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
1543                                   s0[6], s0[7], x_filter, offset_vec);
1544         vst1q_u16(d, d0);
1545 
1546         s += 8;
1547         d += 8;
1548         width -= 8;
1549       } while (width != 0);
1550       src_ptr += src_stride;
1551       dst_ptr += dst_stride;
1552     } while (--height != 0);
1553   }
1554 }
1555 
highbd_dist_wtd_convolve_2d_horiz_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * x_filter_ptr,const int offset)1556 static INLINE void highbd_dist_wtd_convolve_2d_horiz_neon(
1557     const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
1558     int w, int h, const int16_t *x_filter_ptr, const int offset) {
1559   // The smallest block height is 4, and the horizontal convolution needs to
1560   // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
1561   assert(h >= 5);
1562   const int32x4_t offset_vec = vdupq_n_s32(offset);
1563 
1564   if (w == 4) {
1565     // 4-tap filters are used for blocks having width == 4.
1566     const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
1567     const int16_t *s = (const int16_t *)(src_ptr + 1);
1568     uint16_t *d = dst_ptr;
1569 
1570     do {
1571       int16x4_t s0[4], s1[4], s2[4], s3[4];
1572       load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
1573       load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
1574       load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
1575       load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
1576 
1577       uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
1578       uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec);
1579       uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec);
1580       uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec);
1581 
1582       store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
1583 
1584       s += 4 * src_stride;
1585       d += 4 * dst_stride;
1586       h -= 4;
1587     } while (h > 4);
1588 
1589     do {
1590       int16x4_t s0[4];
1591       load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
1592 
1593       uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
1594       vst1_u16(d, d0);
1595 
1596       s += src_stride;
1597       d += dst_stride;
1598     } while (--h != 0);
1599   } else {
1600     const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
1601     int height = h;
1602 
1603     do {
1604       int width = w;
1605       const int16_t *s = (const int16_t *)src_ptr;
1606       uint16_t *d = dst_ptr;
1607 
1608       do {
1609         int16x8_t s0[8], s1[8], s2[8], s3[8];
1610         load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
1611                      &s0[4], &s0[5], &s0[6], &s0[7]);
1612         load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
1613                      &s1[4], &s1[5], &s1[6], &s1[7]);
1614         load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
1615                      &s2[4], &s2[5], &s2[6], &s2[7]);
1616         load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
1617                      &s3[4], &s3[5], &s3[6], &s3[7]);
1618 
1619         uint16x8_t d0 =
1620             highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
1621                                s0[7], x_filter, offset_vec);
1622         uint16x8_t d1 =
1623             highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6],
1624                                s1[7], x_filter, offset_vec);
1625         uint16x8_t d2 =
1626             highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6],
1627                                s2[7], x_filter, offset_vec);
1628         uint16x8_t d3 =
1629             highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6],
1630                                s3[7], x_filter, offset_vec);
1631 
1632         store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
1633 
1634         s += 8;
1635         d += 8;
1636         width -= 8;
1637       } while (width != 0);
1638       src_ptr += 4 * src_stride;
1639       dst_ptr += 4 * dst_stride;
1640       height -= 4;
1641     } while (height > 4);
1642 
1643     do {
1644       int width = w;
1645       const int16_t *s = (const int16_t *)src_ptr;
1646       uint16_t *d = dst_ptr;
1647 
1648       do {
1649         int16x8_t s0[8];
1650         load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
1651                      &s0[4], &s0[5], &s0[6], &s0[7]);
1652 
1653         uint16x8_t d0 =
1654             highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
1655                                s0[7], x_filter, offset_vec);
1656         vst1q_u16(d, d0);
1657 
1658         s += 8;
1659         d += 8;
1660         width -= 8;
1661       } while (width != 0);
1662       src_ptr += src_stride;
1663       dst_ptr += dst_stride;
1664     } while (--height != 0);
1665   }
1666 }
1667 
// High bit-depth compound 2D convolution (NEON).
//
// Runs in up to three stages:
//   1. Horizontal filter of (h + clamped_y_taps - 1) rows from src into a
//      local intermediate buffer with stride MAX_SB_SIZE.
//   2. Vertical filter of that buffer into either conv_params->dst (when
//      conv_params->do_average is zero) or a second local buffer (when it is
//      non-zero).
//   3. Only when conv_params->do_average is non-zero: an averaging helper,
//      chosen by conv_params->use_dist_wtd_comp_avg and bd, takes the second
//      local buffer and dst. This is the only stage that is handed dst.
//
// Filters reporting fewer than 6 taps are treated as 6-tap when computing the
// source offsets and the intermediate height.
void av1_highbd_dist_wtd_convolve_2d_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  DECLARE_ALIGNED(16, uint16_t,
                  im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);

  CONV_BUF_TYPE *const comp_dst = conv_params->dst;
  const int comp_dst_stride = conv_params->dst_stride;
  const int im_stride = MAX_SB_SIZE;

  // Tap counts, with a floor of 6 for sizing and positioning purposes.
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;

  const int horiz_offset = clamped_x_taps / 2 - 1;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int im_h = h + clamped_y_taps - 1;

  // The horizontal offset carries an extra (1 << (round_0 - 1)) term so the
  // rounding is folded into the accumulator start value and the horizontal
  // kernels can use a cheaper non-rounding shift.
  const int round_offset_conv_x =
      (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1));
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset_conv_y = (1 << y_offset_bits);
  const int round_offset_avg =
      ((1 << (y_offset_bits - conv_params->round_1)) +
       (1 << (y_offset_bits - conv_params->round_1 - 1)));

  // Step back to the first row/column the filters read.
  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  // Stage 1: horizontal filter. The 6-tap kernels are not used for w == 4;
  // that width goes through the general kernels instead.
  const int use_6tap_x = x_filter_taps <= 6 && w != 4;
  if (bd == 12 && use_6tap_x) {
    highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
        src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
        round_offset_conv_x);
  } else if (bd == 12) {
    highbd_12_dist_wtd_convolve_2d_horiz_neon(
        src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
        round_offset_conv_x);
  } else if (use_6tap_x) {
    highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
        src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
        round_offset_conv_x);
  } else {
    highbd_dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block,
                                           im_stride, w, im_h, x_filter_ptr,
                                           round_offset_conv_x);
  }

  // Stage 2: vertical filter. When averaging, write to the second scratch
  // buffer; otherwise write straight to the compound destination.
  CONV_BUF_TYPE *vert_out = comp_dst;
  int vert_out_stride = comp_dst_stride;
  if (conv_params->do_average) {
    vert_out = im_block2;
    vert_out_stride = im_stride;
  }

  if (y_filter_taps <= 6) {
    highbd_dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, vert_out,
                                               vert_out_stride, w, h,
                                               y_filter_ptr,
                                               round_offset_conv_y);
  } else {
    highbd_dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, vert_out,
                                               vert_out_stride, w, h,
                                               y_filter_ptr,
                                               round_offset_conv_y);
  }

  // Stage 3: compound averaging, done as a separate pass so the filter loops
  // above contain no per-pixel branching on the averaging mode.
  if (!conv_params->do_average) {
    return;
  }

  const int dist_wtd = conv_params->use_dist_wtd_comp_avg;
  if (dist_wtd && bd == 12) {
    highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
                                     h, conv_params, round_offset_avg, bd);
  } else if (dist_wtd) {
    highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                                  conv_params, round_offset_avg, bd);
  } else if (bd == 12) {
    highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                            conv_params, round_offset_avg, bd);
  } else {
    highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                         conv_params, round_offset_avg, bd);
  }
}
1775