1 /*
2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <arm_neon.h>
14
15 #include "config/aom_config.h"
16 #include "config/av1_rtcd.h"
17
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_dsp/arm/mem_neon.h"
20 #include "aom_ports/mem.h"
21 #include "av1/common/convolve.h"
22 #include "av1/common/filter.h"
23 #include "av1/common/arm/highbd_compound_convolve_neon.h"
24 #include "av1/common/arm/highbd_convolve_neon.h"
25
highbd_12_convolve6_4(const int16x4_t s0,const int16x4_t s1,const int16x4_t s2,const int16x4_t s3,const int16x4_t s4,const int16x4_t s5,const int16x8_t filter,const int32x4_t offset)26 static INLINE uint16x4_t highbd_12_convolve6_4(
27 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
28 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
29 const int16x8_t filter, const int32x4_t offset) {
30 // Values at indices 0 and 7 of y_filter are zero.
31 const int16x4_t filter_0_3 = vget_low_s16(filter);
32 const int16x4_t filter_4_7 = vget_high_s16(filter);
33
34 int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1);
35 sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
36 sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
37 sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
38 sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
39 sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);
40
41 return vqshrun_n_s32(sum, ROUND0_BITS + 2);
42 }
43
44 static INLINE uint16x4_t
highbd_convolve6_4(const int16x4_t s0,const int16x4_t s1,const int16x4_t s2,const int16x4_t s3,const int16x4_t s4,const int16x4_t s5,const int16x8_t filter,const int32x4_t offset)45 highbd_convolve6_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
46 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
47 const int16x8_t filter, const int32x4_t offset) {
48 // Values at indices 0 and 7 of y_filter are zero.
49 const int16x4_t filter_0_3 = vget_low_s16(filter);
50 const int16x4_t filter_4_7 = vget_high_s16(filter);
51
52 int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1);
53 sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
54 sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
55 sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
56 sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
57 sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);
58
59 return vqshrun_n_s32(sum, ROUND0_BITS);
60 }
61
highbd_12_convolve6_8(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t filter,const int32x4_t offset)62 static INLINE uint16x8_t highbd_12_convolve6_8(
63 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
64 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
65 const int16x8_t filter, const int32x4_t offset) {
66 // Values at indices 0 and 7 of y_filter are zero.
67 const int16x4_t filter_0_3 = vget_low_s16(filter);
68 const int16x4_t filter_4_7 = vget_high_s16(filter);
69
70 int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1);
71 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
72 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
73 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
74 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
75 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);
76
77 int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1);
78 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
79 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
80 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
81 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
82 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);
83
84 return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2),
85 vqshrun_n_s32(sum1, ROUND0_BITS + 2));
86 }
87
88 static INLINE uint16x8_t
highbd_convolve6_8(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t filter,const int32x4_t offset)89 highbd_convolve6_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
90 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
91 const int16x8_t filter, const int32x4_t offset) {
92 // Values at indices 0 and 7 of y_filter are zero.
93 const int16x4_t filter_0_3 = vget_low_s16(filter);
94 const int16x4_t filter_4_7 = vget_high_s16(filter);
95
96 int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1);
97 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2);
98 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3);
99 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0);
100 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1);
101 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2);
102
103 int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1);
104 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2);
105 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3);
106 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0);
107 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1);
108 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2);
109
110 return vcombine_u16(vqshrun_n_s32(sum0, 3), vqshrun_n_s32(sum1, ROUND0_BITS));
111 }
112
highbd_12_dist_wtd_convolve_x_6tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * x_filter_ptr,const int offset)113 static INLINE void highbd_12_dist_wtd_convolve_x_6tap_neon(
114 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
115 int w, int h, const int16_t *x_filter_ptr, const int offset) {
116 const int32x4_t offset_vec = vdupq_n_s32(offset);
117
118 const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
119
120 int height = h;
121
122 do {
123 int width = w;
124 const int16_t *s = (const int16_t *)src_ptr;
125 uint16_t *d = dst_ptr;
126
127 do {
128 int16x8_t s0[6], s1[6], s2[6], s3[6];
129 load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
130 &s0[4], &s0[5]);
131 load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
132 &s1[4], &s1[5]);
133 load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
134 &s2[4], &s2[5]);
135 load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
136 &s3[4], &s3[5]);
137
138 uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
139 s0[5], x_filter, offset_vec);
140 uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
141 s1[5], x_filter, offset_vec);
142 uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
143 s2[5], x_filter, offset_vec);
144 uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
145 s3[5], x_filter, offset_vec);
146
147 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
148
149 s += 8;
150 d += 8;
151 width -= 8;
152 } while (width != 0);
153 src_ptr += 4 * src_stride;
154 dst_ptr += 4 * dst_stride;
155 height -= 4;
156 } while (height != 0);
157 }
158
highbd_dist_wtd_convolve_x_6tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * x_filter_ptr,const int offset)159 static INLINE void highbd_dist_wtd_convolve_x_6tap_neon(
160 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
161 int w, int h, const int16_t *x_filter_ptr, const int offset) {
162 const int32x4_t offset_vec = vdupq_n_s32(offset);
163
164 const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
165
166 int height = h;
167
168 do {
169 int width = w;
170 const int16_t *s = (const int16_t *)src_ptr;
171 uint16_t *d = dst_ptr;
172
173 do {
174 int16x8_t s0[6], s1[6], s2[6], s3[6];
175 load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
176 &s0[4], &s0[5]);
177 load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
178 &s1[4], &s1[5]);
179 load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
180 &s2[4], &s2[5]);
181 load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
182 &s3[4], &s3[5]);
183
184 uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
185 s0[5], x_filter, offset_vec);
186 uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
187 s1[5], x_filter, offset_vec);
188 uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
189 s2[5], x_filter, offset_vec);
190 uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
191 s3[5], x_filter, offset_vec);
192
193 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
194
195 s += 8;
196 d += 8;
197 width -= 8;
198 } while (width != 0);
199 src_ptr += 4 * src_stride;
200 dst_ptr += 4 * dst_stride;
201 height -= 4;
202 } while (height != 0);
203 }
204
highbd_12_convolve8_4(const int16x4_t s0,const int16x4_t s1,const int16x4_t s2,const int16x4_t s3,const int16x4_t s4,const int16x4_t s5,const int16x4_t s6,const int16x4_t s7,const int16x8_t filter,const int32x4_t offset)205 static INLINE uint16x4_t highbd_12_convolve8_4(
206 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
207 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
208 const int16x4_t s6, const int16x4_t s7, const int16x8_t filter,
209 const int32x4_t offset) {
210 const int16x4_t filter_0_3 = vget_low_s16(filter);
211 const int16x4_t filter_4_7 = vget_high_s16(filter);
212
213 int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0);
214 sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
215 sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
216 sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
217 sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
218 sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
219 sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
220 sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
221
222 return vqshrun_n_s32(sum, ROUND0_BITS + 2);
223 }
224
225 static INLINE uint16x4_t
highbd_convolve8_4(const int16x4_t s0,const int16x4_t s1,const int16x4_t s2,const int16x4_t s3,const int16x4_t s4,const int16x4_t s5,const int16x4_t s6,const int16x4_t s7,const int16x8_t filter,const int32x4_t offset)226 highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
227 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
228 const int16x4_t s6, const int16x4_t s7,
229 const int16x8_t filter, const int32x4_t offset) {
230 const int16x4_t filter_0_3 = vget_low_s16(filter);
231 const int16x4_t filter_4_7 = vget_high_s16(filter);
232
233 int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0);
234 sum = vmlal_lane_s16(sum, s1, filter_0_3, 1);
235 sum = vmlal_lane_s16(sum, s2, filter_0_3, 2);
236 sum = vmlal_lane_s16(sum, s3, filter_0_3, 3);
237 sum = vmlal_lane_s16(sum, s4, filter_4_7, 0);
238 sum = vmlal_lane_s16(sum, s5, filter_4_7, 1);
239 sum = vmlal_lane_s16(sum, s6, filter_4_7, 2);
240 sum = vmlal_lane_s16(sum, s7, filter_4_7, 3);
241
242 return vqshrun_n_s32(sum, ROUND0_BITS);
243 }
244
highbd_12_convolve8_8(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t s6,const int16x8_t s7,const int16x8_t filter,const int32x4_t offset)245 static INLINE uint16x8_t highbd_12_convolve8_8(
246 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
247 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
248 const int16x8_t s6, const int16x8_t s7, const int16x8_t filter,
249 const int32x4_t offset) {
250 const int16x4_t filter_0_3 = vget_low_s16(filter);
251 const int16x4_t filter_4_7 = vget_high_s16(filter);
252
253 int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0);
254 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
255 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
256 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
257 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
258 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
259 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
260 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
261
262 int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0);
263 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
264 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
265 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
266 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
267 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
268 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
269 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
270
271 return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2),
272 vqshrun_n_s32(sum1, ROUND0_BITS + 2));
273 }
274
275 static INLINE uint16x8_t
highbd_convolve8_8(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t s6,const int16x8_t s7,const int16x8_t filter,const int32x4_t offset)276 highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
277 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
278 const int16x8_t s6, const int16x8_t s7,
279 const int16x8_t filter, const int32x4_t offset) {
280 const int16x4_t filter_0_3 = vget_low_s16(filter);
281 const int16x4_t filter_4_7 = vget_high_s16(filter);
282
283 int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0);
284 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1);
285 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2);
286 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3);
287 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0);
288 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1);
289 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2);
290 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3);
291
292 int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0);
293 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1);
294 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2);
295 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3);
296 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0);
297 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1);
298 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2);
299 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3);
300
301 return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS),
302 vqshrun_n_s32(sum1, ROUND0_BITS));
303 }
304
highbd_12_convolve4_4_x(const int16x4_t s[4],const int16x4_t x_filter,const int32x4_t offset)305 static INLINE uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4],
306 const int16x4_t x_filter,
307 const int32x4_t offset) {
308 int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
309 sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
310 sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
311 sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
312
313 return vqshrun_n_s32(sum, 5);
314 }
315
highbd_convolve4_4_x(const int16x4_t s[4],const int16x4_t x_filter,const int32x4_t offset)316 static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
317 const int16x4_t x_filter,
318 const int32x4_t offset) {
319 int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
320 sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
321 sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
322 sum = vmlal_lane_s16(sum, s[3], x_filter, 3);
323
324 return vqshrun_n_s32(sum, ROUND0_BITS);
325 }
326
highbd_12_dist_wtd_convolve_x_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * x_filter_ptr,const int offset)327 static INLINE void highbd_12_dist_wtd_convolve_x_neon(
328 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
329 int w, int h, const int16_t *x_filter_ptr, const int offset) {
330 const int32x4_t offset_vec = vdupq_n_s32(offset);
331
332 if (w == 4) {
333 // 4-tap filters are used for blocks having width == 4.
334 const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
335 const int16_t *s = (const int16_t *)(src_ptr + 2);
336 uint16_t *d = dst_ptr;
337
338 do {
339 int16x4_t s0[4], s1[4], s2[4], s3[4];
340 load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
341 load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
342 load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
343 load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
344
345 uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
346 uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec);
347 uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec);
348 uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec);
349
350 store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
351
352 s += 4 * src_stride;
353 d += 4 * dst_stride;
354 h -= 4;
355 } while (h != 0);
356 } else {
357 const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
358 int height = h;
359
360 do {
361 int width = w;
362 const int16_t *s = (const int16_t *)src_ptr;
363 uint16_t *d = dst_ptr;
364
365 do {
366 int16x8_t s0[8], s1[8], s2[8], s3[8];
367 load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
368 &s0[4], &s0[5], &s0[6], &s0[7]);
369 load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
370 &s1[4], &s1[5], &s1[6], &s1[7]);
371 load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
372 &s2[4], &s2[5], &s2[6], &s2[7]);
373 load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
374 &s3[4], &s3[5], &s3[6], &s3[7]);
375
376 uint16x8_t d0 =
377 highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
378 s0[6], s0[7], x_filter, offset_vec);
379 uint16x8_t d1 =
380 highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
381 s1[6], s1[7], x_filter, offset_vec);
382 uint16x8_t d2 =
383 highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
384 s2[6], s2[7], x_filter, offset_vec);
385 uint16x8_t d3 =
386 highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5],
387 s3[6], s3[7], x_filter, offset_vec);
388
389 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
390
391 s += 8;
392 d += 8;
393 width -= 8;
394 } while (width != 0);
395 src_ptr += 4 * src_stride;
396 dst_ptr += 4 * dst_stride;
397 height -= 4;
398 } while (height != 0);
399 }
400 }
401
highbd_dist_wtd_convolve_x_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * x_filter_ptr,const int offset)402 static INLINE void highbd_dist_wtd_convolve_x_neon(
403 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
404 int w, int h, const int16_t *x_filter_ptr, const int offset) {
405 const int32x4_t offset_vec = vdupq_n_s32(offset);
406
407 if (w == 4) {
408 // 4-tap filters are used for blocks having width == 4.
409 const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
410 const int16_t *s = (const int16_t *)(src_ptr + 2);
411 uint16_t *d = dst_ptr;
412
413 do {
414 int16x4_t s0[4], s1[4], s2[4], s3[4];
415 load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
416 load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
417 load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
418 load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
419
420 uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
421 uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec);
422 uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec);
423 uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec);
424
425 store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
426
427 s += 4 * src_stride;
428 d += 4 * dst_stride;
429 h -= 4;
430 } while (h != 0);
431 } else {
432 const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
433 int height = h;
434
435 do {
436 int width = w;
437 const int16_t *s = (const int16_t *)src_ptr;
438 uint16_t *d = dst_ptr;
439
440 do {
441 int16x8_t s0[8], s1[8], s2[8], s3[8];
442 load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
443 &s0[4], &s0[5], &s0[6], &s0[7]);
444 load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
445 &s1[4], &s1[5], &s1[6], &s1[7]);
446 load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
447 &s2[4], &s2[5], &s2[6], &s2[7]);
448 load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
449 &s3[4], &s3[5], &s3[6], &s3[7]);
450
451 uint16x8_t d0 =
452 highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
453 s0[7], x_filter, offset_vec);
454 uint16x8_t d1 =
455 highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6],
456 s1[7], x_filter, offset_vec);
457 uint16x8_t d2 =
458 highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6],
459 s2[7], x_filter, offset_vec);
460 uint16x8_t d3 =
461 highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6],
462 s3[7], x_filter, offset_vec);
463
464 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
465
466 s += 8;
467 d += 8;
468 width -= 8;
469 } while (width != 0);
470 src_ptr += 4 * src_stride;
471 dst_ptr += 4 * dst_stride;
472 height -= 4;
473 } while (height != 0);
474 }
475 }
476
av1_highbd_dist_wtd_convolve_x_neon(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_x,const int subpel_x_qn,ConvolveParams * conv_params,int bd)477 void av1_highbd_dist_wtd_convolve_x_neon(
478 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
479 int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
480 ConvolveParams *conv_params, int bd) {
481 DECLARE_ALIGNED(16, uint16_t,
482 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
483 CONV_BUF_TYPE *dst16 = conv_params->dst;
484 const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
485 int dst16_stride = conv_params->dst_stride;
486 const int im_stride = MAX_SB_SIZE;
487 const int horiz_offset = filter_params_x->taps / 2 - 1;
488 assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
489 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
490 const int offset_avg = (1 << (offset_bits - conv_params->round_1)) +
491 (1 << (offset_bits - conv_params->round_1 - 1));
492 const int offset_convolve = (1 << (conv_params->round_0 - 1)) +
493 (1 << (bd + FILTER_BITS)) +
494 (1 << (bd + FILTER_BITS - 1));
495
496 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
497 filter_params_x, subpel_x_qn & SUBPEL_MASK);
498
499 src -= horiz_offset;
500
501 // horizontal filter
502 if (bd == 12) {
503 if (conv_params->do_average) {
504 if (x_filter_taps <= 6 && w != 4) {
505 highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block,
506 im_stride, w, h, x_filter_ptr,
507 offset_convolve);
508 } else {
509 highbd_12_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride,
510 w, h, x_filter_ptr, offset_convolve);
511 }
512 if (conv_params->use_dist_wtd_comp_avg) {
513 highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
514 w, h, conv_params, offset_avg, bd);
515 } else {
516 highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
517 conv_params, offset_avg, bd);
518 }
519 } else {
520 if (x_filter_taps <= 6 && w != 4) {
521 highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16,
522 dst16_stride, w, h,
523 x_filter_ptr, offset_convolve);
524 } else {
525 highbd_12_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride,
526 w, h, x_filter_ptr, offset_convolve);
527 }
528 }
529 } else {
530 if (conv_params->do_average) {
531 if (x_filter_taps <= 6 && w != 4) {
532 highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block,
533 im_stride, w, h, x_filter_ptr,
534 offset_convolve);
535 } else {
536 highbd_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride, w,
537 h, x_filter_ptr, offset_convolve);
538 }
539 if (conv_params->use_dist_wtd_comp_avg) {
540 highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
541 h, conv_params, offset_avg, bd);
542 } else {
543 highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
544 conv_params, offset_avg, bd);
545 }
546 } else {
547 if (x_filter_taps <= 6 && w != 4) {
548 highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16,
549 dst16_stride, w, h, x_filter_ptr,
550 offset_convolve);
551 } else {
552 highbd_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride, w,
553 h, x_filter_ptr, offset_convolve);
554 }
555 }
556 }
557 }
558
highbd_12_dist_wtd_convolve_y_6tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * y_filter_ptr,const int offset)559 static INLINE void highbd_12_dist_wtd_convolve_y_6tap_neon(
560 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
561 int w, int h, const int16_t *y_filter_ptr, const int offset) {
562 const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
563 const int32x4_t offset_vec = vdupq_n_s32(offset);
564
565 if (w == 4) {
566 const int16_t *s = (const int16_t *)src_ptr;
567 uint16_t *d = dst_ptr;
568
569 int16x4_t s0, s1, s2, s3, s4;
570 load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
571 s += 5 * src_stride;
572
573 do {
574 int16x4_t s5, s6, s7, s8;
575 load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
576
577 uint16x4_t d0 =
578 highbd_12_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
579 uint16x4_t d1 =
580 highbd_12_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
581 uint16x4_t d2 =
582 highbd_12_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
583 uint16x4_t d3 =
584 highbd_12_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
585
586 store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
587
588 s0 = s4;
589 s1 = s5;
590 s2 = s6;
591 s3 = s7;
592 s4 = s8;
593 s += 4 * src_stride;
594 d += 4 * dst_stride;
595 h -= 4;
596 } while (h != 0);
597 } else {
598 do {
599 int height = h;
600 const int16_t *s = (const int16_t *)src_ptr;
601 uint16_t *d = dst_ptr;
602
603 int16x8_t s0, s1, s2, s3, s4;
604 load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
605 s += 5 * src_stride;
606
607 do {
608 int16x8_t s5, s6, s7, s8;
609 load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
610
611 uint16x8_t d0 =
612 highbd_12_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
613 uint16x8_t d1 =
614 highbd_12_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
615 uint16x8_t d2 =
616 highbd_12_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
617 uint16x8_t d3 =
618 highbd_12_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
619
620 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
621
622 s0 = s4;
623 s1 = s5;
624 s2 = s6;
625 s3 = s7;
626 s4 = s8;
627 s += 4 * src_stride;
628 d += 4 * dst_stride;
629 height -= 4;
630 } while (height != 0);
631 src_ptr += 8;
632 dst_ptr += 8;
633 w -= 8;
634 } while (w != 0);
635 }
636 }
637
highbd_dist_wtd_convolve_y_6tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * y_filter_ptr,const int offset)638 static INLINE void highbd_dist_wtd_convolve_y_6tap_neon(
639 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
640 int w, int h, const int16_t *y_filter_ptr, const int offset) {
641 const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
642 const int32x4_t offset_vec = vdupq_n_s32(offset);
643
644 if (w == 4) {
645 const int16_t *s = (const int16_t *)src_ptr;
646 uint16_t *d = dst_ptr;
647
648 int16x4_t s0, s1, s2, s3, s4;
649 load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
650 s += 5 * src_stride;
651
652 do {
653 int16x4_t s5, s6, s7, s8;
654 load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
655
656 uint16x4_t d0 =
657 highbd_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
658 uint16x4_t d1 =
659 highbd_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
660 uint16x4_t d2 =
661 highbd_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
662 uint16x4_t d3 =
663 highbd_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
664
665 store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
666
667 s0 = s4;
668 s1 = s5;
669 s2 = s6;
670 s3 = s7;
671 s4 = s8;
672 s += 4 * src_stride;
673 d += 4 * dst_stride;
674 h -= 4;
675 } while (h != 0);
676 } else {
677 do {
678 int height = h;
679 const int16_t *s = (const int16_t *)src_ptr;
680 uint16_t *d = dst_ptr;
681
682 int16x8_t s0, s1, s2, s3, s4;
683 load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
684 s += 5 * src_stride;
685
686 do {
687 int16x8_t s5, s6, s7, s8;
688 load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
689
690 uint16x8_t d0 =
691 highbd_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
692 uint16x8_t d1 =
693 highbd_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
694 uint16x8_t d2 =
695 highbd_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
696 uint16x8_t d3 =
697 highbd_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
698
699 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
700
701 s0 = s4;
702 s1 = s5;
703 s2 = s6;
704 s3 = s7;
705 s4 = s8;
706 s += 4 * src_stride;
707 d += 4 * dst_stride;
708 height -= 4;
709 } while (height != 0);
710 src_ptr += 8;
711 dst_ptr += 8;
712 w -= 8;
713 } while (w != 0);
714 }
715 }
716
highbd_12_dist_wtd_convolve_y_8tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * y_filter_ptr,const int offset)717 static INLINE void highbd_12_dist_wtd_convolve_y_8tap_neon(
718 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
719 int w, int h, const int16_t *y_filter_ptr, const int offset) {
720 const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
721 const int32x4_t offset_vec = vdupq_n_s32(offset);
722
723 if (w == 4) {
724 const int16_t *s = (const int16_t *)src_ptr;
725 uint16_t *d = dst_ptr;
726
727 int16x4_t s0, s1, s2, s3, s4, s5, s6;
728 load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
729 s += 7 * src_stride;
730
731 do {
732 int16x4_t s7, s8, s9, s10;
733 load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
734
735 uint16x4_t d0 = highbd_12_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7,
736 y_filter, offset_vec);
737 uint16x4_t d1 = highbd_12_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8,
738 y_filter, offset_vec);
739 uint16x4_t d2 = highbd_12_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9,
740 y_filter, offset_vec);
741 uint16x4_t d3 = highbd_12_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10,
742 y_filter, offset_vec);
743
744 store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
745
746 s0 = s4;
747 s1 = s5;
748 s2 = s6;
749 s3 = s7;
750 s4 = s8;
751 s5 = s9;
752 s6 = s10;
753 s += 4 * src_stride;
754 d += 4 * dst_stride;
755 h -= 4;
756 } while (h != 0);
757 } else {
758 do {
759 int height = h;
760 const int16_t *s = (const int16_t *)src_ptr;
761 uint16_t *d = dst_ptr;
762
763 int16x8_t s0, s1, s2, s3, s4, s5, s6;
764 load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
765 s += 7 * src_stride;
766
767 do {
768 int16x8_t s7, s8, s9, s10;
769 load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
770
771 uint16x8_t d0 = highbd_12_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7,
772 y_filter, offset_vec);
773 uint16x8_t d1 = highbd_12_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8,
774 y_filter, offset_vec);
775 uint16x8_t d2 = highbd_12_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9,
776 y_filter, offset_vec);
777 uint16x8_t d3 = highbd_12_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10,
778 y_filter, offset_vec);
779
780 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
781
782 s0 = s4;
783 s1 = s5;
784 s2 = s6;
785 s3 = s7;
786 s4 = s8;
787 s5 = s9;
788 s6 = s10;
789 s += 4 * src_stride;
790 d += 4 * dst_stride;
791 height -= 4;
792 } while (height != 0);
793 src_ptr += 8;
794 dst_ptr += 8;
795 w -= 8;
796 } while (w != 0);
797 }
798 }
highbd_dist_wtd_convolve_y_8tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * y_filter_ptr,const int offset)799 static INLINE void highbd_dist_wtd_convolve_y_8tap_neon(
800 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
801 int w, int h, const int16_t *y_filter_ptr, const int offset) {
802 const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
803 const int32x4_t offset_vec = vdupq_n_s32(offset);
804
805 if (w == 4) {
806 const int16_t *s = (const int16_t *)src_ptr;
807 uint16_t *d = dst_ptr;
808
809 int16x4_t s0, s1, s2, s3, s4, s5, s6;
810 load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
811 s += 7 * src_stride;
812
813 do {
814 int16x4_t s7, s8, s9, s10;
815 load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
816
817 uint16x4_t d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7,
818 y_filter, offset_vec);
819 uint16x4_t d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8,
820 y_filter, offset_vec);
821 uint16x4_t d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9,
822 y_filter, offset_vec);
823 uint16x4_t d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10,
824 y_filter, offset_vec);
825
826 store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
827
828 s0 = s4;
829 s1 = s5;
830 s2 = s6;
831 s3 = s7;
832 s4 = s8;
833 s5 = s9;
834 s6 = s10;
835 s += 4 * src_stride;
836 d += 4 * dst_stride;
837 h -= 4;
838 } while (h != 0);
839 } else {
840 do {
841 int height = h;
842 const int16_t *s = (const int16_t *)src_ptr;
843 uint16_t *d = dst_ptr;
844
845 int16x8_t s0, s1, s2, s3, s4, s5, s6;
846 load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
847 s += 7 * src_stride;
848
849 do {
850 int16x8_t s7, s8, s9, s10;
851 load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
852
853 uint16x8_t d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7,
854 y_filter, offset_vec);
855 uint16x8_t d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8,
856 y_filter, offset_vec);
857 uint16x8_t d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9,
858 y_filter, offset_vec);
859 uint16x8_t d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10,
860 y_filter, offset_vec);
861
862 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
863
864 s0 = s4;
865 s1 = s5;
866 s2 = s6;
867 s3 = s7;
868 s4 = s8;
869 s5 = s9;
870 s6 = s10;
871 s += 4 * src_stride;
872 d += 4 * dst_stride;
873 height -= 4;
874 } while (height != 0);
875 src_ptr += 8;
876 dst_ptr += 8;
877 w -= 8;
878 } while (w != 0);
879 }
880 }
881
av1_highbd_dist_wtd_convolve_y_neon(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn,ConvolveParams * conv_params,int bd)882 void av1_highbd_dist_wtd_convolve_y_neon(
883 const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
884 int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
885 ConvolveParams *conv_params, int bd) {
886 DECLARE_ALIGNED(16, uint16_t,
887 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
888 CONV_BUF_TYPE *dst16 = conv_params->dst;
889 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
890 int dst16_stride = conv_params->dst_stride;
891 const int im_stride = MAX_SB_SIZE;
892 const int vert_offset = filter_params_y->taps / 2 - 1;
893 assert(FILTER_BITS == COMPOUND_ROUND1_BITS);
894 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
895 const int round_offset_avg = (1 << (offset_bits - conv_params->round_1)) +
896 (1 << (offset_bits - conv_params->round_1 - 1));
897 const int round_offset_conv = (1 << (conv_params->round_0 - 1)) +
898 (1 << (bd + FILTER_BITS)) +
899 (1 << (bd + FILTER_BITS - 1));
900
901 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
902 filter_params_y, subpel_y_qn & SUBPEL_MASK);
903
904 src -= vert_offset * src_stride;
905
906 if (bd == 12) {
907 if (conv_params->do_average) {
908 if (y_filter_taps <= 6) {
909 highbd_12_dist_wtd_convolve_y_6tap_neon(
910 src + src_stride, src_stride, im_block, im_stride, w, h,
911 y_filter_ptr, round_offset_conv);
912 } else {
913 highbd_12_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
914 im_stride, w, h, y_filter_ptr,
915 round_offset_conv);
916 }
917 if (conv_params->use_dist_wtd_comp_avg) {
918 highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
919 w, h, conv_params, round_offset_avg,
920 bd);
921 } else {
922 highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
923 conv_params, round_offset_avg, bd);
924 }
925 } else {
926 if (y_filter_taps <= 6) {
927 highbd_12_dist_wtd_convolve_y_6tap_neon(
928 src + src_stride, src_stride, dst16, dst16_stride, w, h,
929 y_filter_ptr, round_offset_conv);
930 } else {
931 highbd_12_dist_wtd_convolve_y_8tap_neon(
932 src, src_stride, dst16, dst16_stride, w, h, y_filter_ptr,
933 round_offset_conv);
934 }
935 }
936 } else {
937 if (conv_params->do_average) {
938 if (y_filter_taps <= 6) {
939 highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
940 im_block, im_stride, w, h,
941 y_filter_ptr, round_offset_conv);
942 } else {
943 highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block,
944 im_stride, w, h, y_filter_ptr,
945 round_offset_conv);
946 }
947 if (conv_params->use_dist_wtd_comp_avg) {
948 highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
949 h, conv_params, round_offset_avg, bd);
950 } else {
951 highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
952 conv_params, round_offset_avg, bd);
953 }
954 } else {
955 if (y_filter_taps <= 6) {
956 highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride,
957 dst16, dst16_stride, w, h,
958 y_filter_ptr, round_offset_conv);
959 } else {
960 highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, dst16,
961 dst16_stride, w, h, y_filter_ptr,
962 round_offset_conv);
963 }
964 }
965 }
966 }
967
highbd_2d_copy_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int round_bits,const int offset)968 static INLINE void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride,
969 uint16_t *dst_ptr, int dst_stride, int w,
970 int h, const int round_bits,
971 const int offset) {
972 if (w <= 4) {
973 const int16x4_t round_shift_s16 = vdup_n_s16(round_bits);
974 const uint16x4_t offset_u16 = vdup_n_u16(offset);
975
976 for (int y = 0; y < h; ++y) {
977 const uint16x4_t s = vld1_u16(src_ptr + y * src_stride);
978 uint16x4_t d = vshl_u16(s, round_shift_s16);
979 d = vadd_u16(d, offset_u16);
980 if (w == 2) {
981 store_u16_2x1(dst_ptr + y * dst_stride, d);
982 } else {
983 vst1_u16(dst_ptr + y * dst_stride, d);
984 }
985 }
986 } else {
987 const int16x8_t round_shift_s16 = vdupq_n_s16(round_bits);
988 const uint16x8_t offset_u16 = vdupq_n_u16(offset);
989
990 for (int y = 0; y < h; ++y) {
991 for (int x = 0; x < w; x += 8) {
992 const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x);
993 uint16x8_t d = vshlq_u16(s, round_shift_s16);
994 d = vaddq_u16(d, offset_u16);
995 vst1q_u16(dst_ptr + y * dst_stride + x, d);
996 }
997 }
998 }
999 }
1000
av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,ConvolveParams * conv_params,int bd)1001 void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src,
1002 int src_stride, uint16_t *dst,
1003 int dst_stride, int w, int h,
1004 ConvolveParams *conv_params,
1005 int bd) {
1006 DECLARE_ALIGNED(16, uint16_t,
1007 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
1008
1009 const int im_stride = MAX_SB_SIZE;
1010 CONV_BUF_TYPE *dst16 = conv_params->dst;
1011 int dst16_stride = conv_params->dst_stride;
1012 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
1013 const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
1014 (1 << (offset_bits - conv_params->round_1 - 1));
1015 const int round_bits =
1016 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
1017 assert(round_bits >= 0);
1018
1019 if (conv_params->do_average) {
1020 highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits,
1021 round_offset);
1022 } else {
1023 highbd_2d_copy_neon(src, src_stride, dst16, dst16_stride, w, h, round_bits,
1024 round_offset);
1025 }
1026
1027 if (conv_params->do_average) {
1028 if (conv_params->use_dist_wtd_comp_avg) {
1029 if (bd == 12) {
1030 highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride,
1031 w, h, conv_params, round_offset, bd);
1032 } else {
1033 highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w,
1034 h, conv_params, round_offset, bd);
1035 }
1036 } else {
1037 if (bd == 12) {
1038 highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
1039 conv_params, round_offset, bd);
1040 } else {
1041 highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h,
1042 conv_params, round_offset, bd);
1043 }
1044 }
1045 }
1046 }
1047
highbd_convolve6_4_2d_v(const int16x4_t s0,const int16x4_t s1,const int16x4_t s2,const int16x4_t s3,const int16x4_t s4,const int16x4_t s5,const int16x8_t y_filter,const int32x4_t offset)1048 static INLINE uint16x4_t highbd_convolve6_4_2d_v(
1049 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
1050 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
1051 const int16x8_t y_filter, const int32x4_t offset) {
1052 // Values at indices 0 and 7 of y_filter are zero.
1053 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
1054 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
1055
1056 int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1);
1057 sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
1058 sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
1059 sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
1060 sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
1061 sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);
1062
1063 return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
1064 }
1065
highbd_convolve6_8_2d_v(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t y_filter,const int32x4_t offset)1066 static INLINE uint16x8_t highbd_convolve6_8_2d_v(
1067 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
1068 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
1069 const int16x8_t y_filter, const int32x4_t offset) {
1070 // Values at indices 0 and 7 of y_filter are zero.
1071 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
1072 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
1073
1074 int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1);
1075 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
1076 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
1077 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
1078 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
1079 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);
1080
1081 int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1);
1082 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
1083 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
1084 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
1085 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
1086 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);
1087
1088 return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
1089 vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
1090 }
1091
highbd_dist_wtd_convolve_2d_vert_6tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * y_filter_ptr,int offset)1092 static INLINE void highbd_dist_wtd_convolve_2d_vert_6tap_neon(
1093 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
1094 int w, int h, const int16_t *y_filter_ptr, int offset) {
1095 const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
1096 const int32x4_t offset_vec = vdupq_n_s32(offset);
1097
1098 if (w == 4) {
1099 const int16_t *s = (const int16_t *)src_ptr;
1100 uint16_t *d = dst_ptr;
1101
1102 int16x4_t s0, s1, s2, s3, s4;
1103 load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
1104 s += 5 * src_stride;
1105
1106 do {
1107 int16x4_t s5, s6, s7, s8;
1108 load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);
1109
1110 uint16x4_t d0 =
1111 highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_vec);
1112 uint16x4_t d1 =
1113 highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_vec);
1114 uint16x4_t d2 =
1115 highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_vec);
1116 uint16x4_t d3 =
1117 highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_vec);
1118
1119 store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
1120
1121 s0 = s4;
1122 s1 = s5;
1123 s2 = s6;
1124 s3 = s7;
1125 s4 = s8;
1126 s += 4 * src_stride;
1127 d += 4 * dst_stride;
1128 h -= 4;
1129 } while (h != 0);
1130 } else {
1131 do {
1132 int height = h;
1133 const int16_t *s = (const int16_t *)src_ptr;
1134 uint16_t *d = dst_ptr;
1135
1136 int16x8_t s0, s1, s2, s3, s4;
1137 load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
1138 s += 5 * src_stride;
1139
1140 do {
1141 int16x8_t s5, s6, s7, s8;
1142 load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);
1143
1144 uint16x8_t d0 = highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5,
1145 y_filter, offset_vec);
1146 uint16x8_t d1 = highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6,
1147 y_filter, offset_vec);
1148 uint16x8_t d2 = highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7,
1149 y_filter, offset_vec);
1150 uint16x8_t d3 = highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8,
1151 y_filter, offset_vec);
1152
1153 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
1154
1155 s0 = s4;
1156 s1 = s5;
1157 s2 = s6;
1158 s3 = s7;
1159 s4 = s8;
1160 s += 4 * src_stride;
1161 d += 4 * dst_stride;
1162 height -= 4;
1163 } while (height != 0);
1164 src_ptr += 8;
1165 dst_ptr += 8;
1166 w -= 8;
1167 } while (w != 0);
1168 }
1169 }
1170
highbd_convolve8_4_2d_v(const int16x4_t s0,const int16x4_t s1,const int16x4_t s2,const int16x4_t s3,const int16x4_t s4,const int16x4_t s5,const int16x4_t s6,const int16x4_t s7,const int16x8_t y_filter,const int32x4_t offset)1171 static INLINE uint16x4_t highbd_convolve8_4_2d_v(
1172 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
1173 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
1174 const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
1175 const int32x4_t offset) {
1176 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
1177 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
1178
1179 int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0);
1180 sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
1181 sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
1182 sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
1183 sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
1184 sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
1185 sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
1186 sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
1187
1188 return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
1189 }
1190
highbd_convolve8_8_2d_v(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t s6,const int16x8_t s7,const int16x8_t y_filter,const int32x4_t offset)1191 static INLINE uint16x8_t highbd_convolve8_8_2d_v(
1192 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
1193 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
1194 const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
1195 const int32x4_t offset) {
1196 const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
1197 const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);
1198
1199 int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0);
1200 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
1201 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
1202 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
1203 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
1204 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
1205 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
1206 sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
1207
1208 int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0);
1209 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
1210 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
1211 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
1212 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
1213 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
1214 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
1215 sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
1216
1217 return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
1218 vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
1219 }
1220
highbd_dist_wtd_convolve_2d_vert_8tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * y_filter_ptr,int offset)1221 static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_neon(
1222 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
1223 int w, int h, const int16_t *y_filter_ptr, int offset) {
1224 const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
1225 const int32x4_t offset_vec = vdupq_n_s32(offset);
1226
1227 if (w <= 4) {
1228 const int16_t *s = (const int16_t *)src_ptr;
1229 uint16_t *d = dst_ptr;
1230
1231 int16x4_t s0, s1, s2, s3, s4, s5, s6;
1232 load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
1233 s += 7 * src_stride;
1234
1235 do {
1236 int16x4_t s7, s8, s9, s10;
1237 load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
1238
1239 uint16x4_t d0 = highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
1240 y_filter, offset_vec);
1241 uint16x4_t d1 = highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
1242 y_filter, offset_vec);
1243 uint16x4_t d2 = highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
1244 y_filter, offset_vec);
1245 uint16x4_t d3 = highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
1246 y_filter, offset_vec);
1247
1248 store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
1249
1250 s0 = s4;
1251 s1 = s5;
1252 s2 = s6;
1253 s3 = s7;
1254 s4 = s8;
1255 s5 = s9;
1256 s6 = s10;
1257 s += 4 * src_stride;
1258 d += 4 * dst_stride;
1259 h -= 4;
1260 } while (h != 0);
1261 } else {
1262 do {
1263 int height = h;
1264 const int16_t *s = (const int16_t *)src_ptr;
1265 uint16_t *d = dst_ptr;
1266
1267 int16x8_t s0, s1, s2, s3, s4, s5, s6;
1268 load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
1269 s += 7 * src_stride;
1270
1271 do {
1272 int16x8_t s7, s8, s9, s10;
1273 load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
1274
1275 uint16x8_t d0 = highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7,
1276 y_filter, offset_vec);
1277 uint16x8_t d1 = highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8,
1278 y_filter, offset_vec);
1279 uint16x8_t d2 = highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9,
1280 y_filter, offset_vec);
1281 uint16x8_t d3 = highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10,
1282 y_filter, offset_vec);
1283
1284 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
1285
1286 s0 = s4;
1287 s1 = s5;
1288 s2 = s6;
1289 s3 = s7;
1290 s4 = s8;
1291 s5 = s9;
1292 s6 = s10;
1293 s += 4 * src_stride;
1294 d += 4 * dst_stride;
1295 height -= 4;
1296 } while (height != 0);
1297 src_ptr += 8;
1298 dst_ptr += 8;
1299 w -= 8;
1300 } while (w != 0);
1301 }
1302 }
1303
highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * x_filter_ptr,const int offset)1304 static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
1305 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
1306 int w, int h, const int16_t *x_filter_ptr, const int offset) {
1307 // The smallest block height is 4, and the horizontal convolution needs to
1308 // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
1309 assert(h >= 5);
1310 const int32x4_t offset_vec = vdupq_n_s32(offset);
1311
1312 const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
1313
1314 int height = h;
1315
1316 do {
1317 int width = w;
1318 const int16_t *s = (const int16_t *)src_ptr;
1319 uint16_t *d = dst_ptr;
1320
1321 do {
1322 int16x8_t s0[6], s1[6], s2[6], s3[6];
1323 load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
1324 &s0[4], &s0[5]);
1325 load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
1326 &s1[4], &s1[5]);
1327 load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
1328 &s2[4], &s2[5]);
1329 load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
1330 &s3[4], &s3[5]);
1331
1332 uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
1333 s0[5], x_filter, offset_vec);
1334 uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
1335 s1[5], x_filter, offset_vec);
1336 uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
1337 s2[5], x_filter, offset_vec);
1338 uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
1339 s3[5], x_filter, offset_vec);
1340
1341 store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
1342
1343 s += 8;
1344 d += 8;
1345 width -= 8;
1346 } while (width != 0);
1347 src_ptr += 4 * src_stride;
1348 dst_ptr += 4 * dst_stride;
1349 height -= 4;
1350 } while (height > 4);
1351
1352 do {
1353 int width = w;
1354 const int16_t *s = (const int16_t *)src_ptr;
1355 uint16_t *d = dst_ptr;
1356
1357 do {
1358 int16x8_t s0[6];
1359 load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);
1360
1361 uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
1362 s0[5], x_filter, offset_vec);
1363 vst1q_u16(d, d0);
1364
1365 s += 8;
1366 d += 8;
1367 width -= 8;
1368 } while (width != 0);
1369 src_ptr += src_stride;
1370 dst_ptr += dst_stride;
1371 } while (--height != 0);
1372 }
1373
highbd_dist_wtd_convolve_2d_horiz_6tap_neon(const uint16_t * src_ptr,int src_stride,uint16_t * dst_ptr,int dst_stride,int w,int h,const int16_t * x_filter_ptr,const int offset)1374 static INLINE void highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
1375 const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
1376 int w, int h, const int16_t *x_filter_ptr, const int offset) {
1377 // The smallest block height is 4, and the horizontal convolution needs to
1378 // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
1379 assert(h >= 5);
1380 const int32x4_t offset_vec = vdupq_n_s32(offset);
1381
1382 const int16x8_t x_filter = vld1q_s16(x_filter_ptr);

  int height = h;

  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], x_filter, offset_vec);
      uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4],
                                         s1[5], x_filter, offset_vec);
      uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4],
                                         s2[5], x_filter, offset_vec);
      uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4],
                                         s3[5], x_filter, offset_vec);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);

  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6];
      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);

      uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4],
                                         s0[5], x_filter, offset_vec);
      vst1q_u16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}

static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // The smallest block height is 4, and the horizontal convolution needs to
  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 1);
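    // The 4-tap kernels have non-zero coefficients only in the middle four
    // lanes, hence x_filter_ptr + 2. The caller computes horiz_offset for a
    // filter clamped to at least 6 taps, so src_ptr + 1 re-centres the source
    // window on the 4-tap filter.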
    uint16_t *d = dst_ptr;

    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
      uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec);
      uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec);
      uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    do {
      int16x4_t s0[4];
      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);

      uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec);
      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
                                  s0[6], s0[7], x_filter, offset_vec);
        uint16x8_t d1 =
            highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
                                  s1[6], s1[7], x_filter, offset_vec);
        uint16x8_t d2 =
            highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
                                  s2[6], s2[7], x_filter, offset_vec);
        uint16x8_t d3 =
            highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5],
                                  s3[6], s3[7], x_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);

        uint16x8_t d0 =
            highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
                                  s0[6], s0[7], x_filter, offset_vec);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}

static INLINE void highbd_dist_wtd_convolve_2d_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, const int offset) {
  // The smallest block height is 4, and the horizontal convolution needs to
  // process an extra (filter_taps/2 - 1) lines for the vertical convolution.
  assert(h >= 5);
  const int32x4_t offset_vec = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width == 4.
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 1);
    uint16_t *d = dst_ptr;

    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
      uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec);
      uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec);
      uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    do {
      int16x4_t s0[4];
      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);

      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec);
      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
                               s0[7], x_filter, offset_vec);
        uint16x8_t d1 =
            highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6],
                               s1[7], x_filter, offset_vec);
        uint16x8_t d2 =
            highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6],
                               s2[7], x_filter, offset_vec);
        uint16x8_t d3 =
            highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6],
                               s3[7], x_filter, offset_vec);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);

        uint16x8_t d0 =
            highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6],
                               s0[7], x_filter, offset_vec);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}

void av1_highbd_dist_wtd_convolve_2d_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
  DECLARE_ALIGNED(16, uint16_t,
                  im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);

  CONV_BUF_TYPE *dst16 = conv_params->dst;
  int dst16_stride = conv_params->dst_stride;
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;
  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
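  // Only 6-tap and 8-tap kernels are implemented, so shorter filters (whose
  // outer coefficients are zero) are treated as 6-tap when computing the
  // offsets and intermediate block height below.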

  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = clamped_x_taps / 2 - 1;
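  // The source pointer is moved back by vert_offset rows and horiz_offset
  // columns so that each filter window is centred on its output position.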
  // The extra shim of (1 << (conv_params->round_0 - 1)) allows us to use a
  // faster non-rounding non-saturating right shift.
  const int round_offset_conv_x =
      (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1));
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int round_offset_conv_y = (1 << y_offset_bits);
  const int round_offset_avg =
      ((1 << (y_offset_bits - conv_params->round_1)) +
       (1 << (y_offset_bits - conv_params->round_1 - 1)));
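  // Worked example (values assumed for illustration): with bd == 8,
  // FILTER_BITS == 7, round_0 == 3 and round_1 == 7, this gives
  // round_offset_conv_x = (1 << 14) + (1 << 2) = 16388,
  // y_offset_bits = 8 + 14 - 3 = 19, round_offset_conv_y = 1 << 19, and
  // round_offset_avg = (1 << 12) + (1 << 11) = 6144.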

  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  // horizontal filter
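  // Filters with at most 6 taps leave the outer coefficients of the 8-element
  // kernel zero, so they can use the cheaper 6-tap horizontal path. Width-4
  // blocks always take the generic path, which contains the dedicated 4-tap
  // code.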
  if (bd == 12) {
    if (x_filter_taps <= 6 && w != 4) {
      highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    } else {
      highbd_12_dist_wtd_convolve_2d_horiz_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    }
  } else {
    if (x_filter_taps <= 6 && w != 4) {
      highbd_dist_wtd_convolve_2d_horiz_6tap_neon(
          src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr,
          round_offset_conv_x);
    } else {
      highbd_dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block,
                                             im_stride, w, im_h, x_filter_ptr,
                                             round_offset_conv_x);
    }
  }

  // vertical filter
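  // When compound averaging is required the vertical pass writes to the
  // scratch buffer im_block2; otherwise it writes directly to the convolve
  // buffer dst16.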
  if (y_filter_taps <= 6) {
    if (conv_params->do_average) {
      highbd_dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, im_block2,
                                                 im_stride, w, h, y_filter_ptr,
                                                 round_offset_conv_y);
    } else {
      highbd_dist_wtd_convolve_2d_vert_6tap_neon(
          im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
          round_offset_conv_y);
    }
  } else {
    if (conv_params->do_average) {
      highbd_dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, im_block2,
                                                 im_stride, w, h, y_filter_ptr,
                                                 round_offset_conv_y);
    } else {
      highbd_dist_wtd_convolve_2d_vert_8tap_neon(
          im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr,
          round_offset_conv_y);
    }
  }

  // Do the compound averaging outside the loop, which avoids branching within
  // the main loop.
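  // The averaging helpers combine the second prediction held in im_block2
  // with the first prediction already stored in the convolve buffer
  // (conv_params->dst), writing the final pixels to dst, as in the other
  // dist_wtd convolve paths.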
  if (conv_params->do_average) {
    if (conv_params->use_dist_wtd_comp_avg) {
      if (bd == 12) {
        highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride,
                                         w, h, conv_params, round_offset_avg,
                                         bd);
      } else {
        highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w,
                                      h, conv_params, round_offset_avg, bd);
      }
    } else {
      if (bd == 12) {
        highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                                conv_params, round_offset_avg, bd);
      } else {
        highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h,
                             conv_params, round_offset_avg, bd);
      }
    }
  }
}