/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"

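// Neon implementations of the high bit-depth single-reference ("sr")
// convolution paths: av1_highbd_convolve_y_sr, av1_highbd_convolve_x_sr and
// the horizontal/vertical helper passes for the 2D filter.
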
static INLINE uint16x4_t
highbd_convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
                     const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
                     const int16x8_t y_filter) {
  // Values at indices 0 and 7 of y_filter are zero.
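  // (6-tap kernels are stored zero-padded to 8 elements in the AV1 filter
  // tables, so only taps 1-6 of the 8-element array contribute to the sum.)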
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);

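  // vqrshrun_n_s32 rounds, narrows to u16 and saturates in one instruction.
  // COMPOUND_ROUND1_BITS equals FILTER_BITS (7), so this is the single
  // rounding shift needed by the vertical-only filter.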
  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
}

static INLINE uint16x8_t
highbd_convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
                     const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
                     const int16x8_t y_filter) {
  // Values at indices 0 and 7 of y_filter are zero.
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);

  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);

  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
}

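// Vertical 6-tap convolution for blocks of width 4 or a multiple of 8. Five
// source rows are loaded up front, then each iteration loads four more rows,
// computes four output rows and shifts the register window down the block.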
static INLINE void highbd_convolve_y_sr_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, const int bd) {
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);

  if (w == 4) {
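    // The caller has already rewound src by taps / 2 - 1 = 3 rows to suit the
    // zero-padded 8-tap filter array; the 6-tap kernel only needs 2 rows of
    // context above, so step forward one row again here.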
    const int16_t *s = (const int16_t *)(src_ptr + src_stride);
    uint16_t *d = dst_ptr;

    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 =
          highbd_convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter_0_7);
      uint16x4_t d1 =
          highbd_convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter_0_7);
      uint16x4_t d2 =
          highbd_convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter_0_7);
      uint16x4_t d3 =
          highbd_convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter_0_7);

      d0 = vmin_u16(d0, vget_low_u16(max));
      d1 = vmin_u16(d1, vget_low_u16(max));
      d2 = vmin_u16(d2, vget_low_u16(max));
      d3 = vmin_u16(d3, vget_low_u16(max));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    // Width is a multiple of 8 and height is a multiple of 4.
    do {
      int height = h;
      const int16_t *s = (const int16_t *)(src_ptr + src_stride);
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 =
            highbd_convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter_0_7);
        uint16x8_t d1 =
            highbd_convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter_0_7);
        uint16x8_t d2 =
            highbd_convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter_0_7);
        uint16x8_t d3 =
            highbd_convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter_0_7);

        d0 = vminq_u16(d0, max);
        d1 = vminq_u16(d1, max);
        d2 = vminq_u16(d2, max);
        d3 = vminq_u16(d3, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

static INLINE uint16x4_t highbd_convolve8_4_y(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);

  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
}

static INLINE uint16x8_t highbd_convolve8_8_y(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);

  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);

  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
}

static INLINE void highbd_convolve_y_sr_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int bd) {
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 =
          highbd_convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
      uint16x4_t d1 =
          highbd_convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
      uint16x4_t d2 =
          highbd_convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
      uint16x4_t d3 =
          highbd_convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);

      d0 = vmin_u16(d0, vget_low_u16(max));
      d1 = vmin_u16(d1, vget_low_u16(max));
      d2 = vmin_u16(d2, vget_low_u16(max));
      d3 = vmin_u16(d3, vget_low_u16(max));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 =
            highbd_convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
        uint16x8_t d1 =
            highbd_convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
        uint16x8_t d2 =
            highbd_convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
        uint16x8_t d3 =
            highbd_convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);

        d0 = vminq_u16(d0, max);
        d1 = vminq_u16(d1, max);
        d2 = vminq_u16(d2, max);
        d3 = vminq_u16(d3, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

static INLINE uint16x4_t highbd_convolve12_4_y(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);

  return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS);
}

static INLINE uint16x8_t highbd_convolve12_8_y(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);

  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);

  return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS),
                      vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS));
}

static INLINE void highbd_convolve_y_sr_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, int bd) {
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                  &s9, &s10);
    s += 11 * src_stride;

    do {
      int16x4_t s11, s12, s13, s14;
      load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);

      uint16x4_t d0 =
          highbd_convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                s11, y_filter_0_7, y_filter_8_11);
      uint16x4_t d1 =
          highbd_convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                s12, y_filter_0_7, y_filter_8_11);
      uint16x4_t d2 =
          highbd_convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                s13, y_filter_0_7, y_filter_8_11);
      uint16x4_t d3 =
          highbd_convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                                s14, y_filter_0_7, y_filter_8_11);

      d0 = vmin_u16(d0, vget_low_u16(max));
      d1 = vmin_u16(d1, vget_low_u16(max));
      d2 = vmin_u16(d2, vget_low_u16(max));
      d3 = vmin_u16(d3, vget_low_u16(max));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                    &s9, &s10);
      s += 11 * src_stride;

      do {
        int16x8_t s11, s12, s13, s14;
        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);

        uint16x8_t d0 =
            highbd_convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10,
                                  s11, y_filter_0_7, y_filter_8_11);
        uint16x8_t d1 =
            highbd_convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                  s12, y_filter_0_7, y_filter_8_11);
        uint16x8_t d2 =
            highbd_convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11,
                                  s12, s13, y_filter_0_7, y_filter_8_11);
        uint16x8_t d3 =
            highbd_convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                                  s13, s14, y_filter_0_7, y_filter_8_11);

        d0 = vminq_u16(d0, max);
        d1 = vminq_u16(d1, max);
        d2 = vminq_u16(d2, max);
        d3 = vminq_u16(d3, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_y_qn, int bd) {
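  // The Neon kernels below operate on blocks of at least 4x4, so fall back to
  // the scalar C implementation for 2-wide or 2-tall blocks.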
  if (w == 2 || h == 2) {
    av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
                               filter_params_y, subpel_y_qn, bd);
    return;
  }
  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int vert_offset = filter_params_y->taps / 2 - 1;
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  src -= vert_offset * src_stride;

  if (y_filter_taps > 8) {
    highbd_convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
                                    y_filter_ptr, bd);
    return;
  }
  if (y_filter_taps < 8) {
    highbd_convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h,
                                   y_filter_ptr, bd);
    return;
  }

  highbd_convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h,
                                 y_filter_ptr, bd);
}

static INLINE uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6],
                                              const int16x8_t x_filter,
                                              const int32x4_t offset) {
  // Values at indices 0 and 7 of x_filter are zero.
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  int32x4_t sum0 = offset;
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2);

  int32x4_t sum1 = offset;
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2);

  return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
                      vqrshrun_n_s32(sum1, FILTER_BITS));
}

static INLINE void highbd_convolve_x_sr_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    int bd) {
  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  // This shim allows us to do only one rounding shift instead of two.
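  // Pre-adding 1 << (round_0 - 1) makes the single vqrshrun_n_s32 by
  // FILTER_BITS equivalent to the two successive ROUND_POWER_OF_TWO() calls
  // (by round_0, then by FILTER_BITS - round_0) in av1_highbd_convolve_x_sr_c.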
  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));

  int height = h;

  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
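      // For each of the four rows, load six overlapping 8-sample windows (one
      // per filter tap) by stepping the load address a single sample at a
      // time: note the stride argument of 1.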
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 = highbd_convolve6_8_x(s0, x_filter, offset);
      uint16x8_t d1 = highbd_convolve6_8_x(s1, x_filter, offset);
      uint16x8_t d2 = highbd_convolve6_8_x(s2, x_filter, offset);
      uint16x8_t d3 = highbd_convolve6_8_x(s3, x_filter, offset);

      d0 = vminq_u16(d0, max);
      d1 = vminq_u16(d1, max);
      d2 = vminq_u16(d2, max);
      d3 = vminq_u16(d3, max);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);

    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height != 0);
}

static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4],
                                              const int16x4_t x_filter,
                                              const int32x4_t offset) {
  int32x4_t sum = offset;
  sum = vmlal_lane_s16(sum, s[0], x_filter, 0);
  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);

  return vqrshrun_n_s32(sum, FILTER_BITS);
}

static INLINE uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8],
                                              const int16x8_t x_filter,
                                              const int32x4_t offset) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  int32x4_t sum0 = offset;
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);

  int32x4_t sum1 = offset;
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);

  return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
                      vqrshrun_n_s32(sum1, FILTER_BITS));
}

static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr,
                                             int src_stride, uint16_t *dst_ptr,
                                             int dst_stride, int w, int h,
                                             const int16_t *x_filter_ptr,
                                             ConvolveParams *conv_params,
                                             int bd) {
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  // This shim allows us to do only one rounding shift instead of two.
  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));

  if (w == 4) {
    // 4-tap filters are used for blocks of width == 4.
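    // The 4 non-zero taps sit in the middle of the 8-tap array (hence
    // x_filter_ptr + 2), and the source pointer steps forward 2 samples to
    // compensate for horiz_offset having been computed for 8 taps.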
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 2);
    uint16_t *d = dst_ptr;

    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset);
      uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset);
      uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset);
      uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset);

      d0 = vmin_u16(d0, vget_low_u16(max));
      d1 = vmin_u16(d1, vget_low_u16(max));
      d2 = vmin_u16(d2, vget_low_u16(max));
      d3 = vmin_u16(d3, vget_low_u16(max));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 = highbd_convolve8_8_x(s0, x_filter, offset);
        uint16x8_t d1 = highbd_convolve8_8_x(s1, x_filter, offset);
        uint16x8_t d2 = highbd_convolve8_8_x(s2, x_filter, offset);
        uint16x8_t d3 = highbd_convolve8_8_x(s3, x_filter, offset);

        d0 = vminq_u16(d0, max);
        d1 = vminq_u16(d1, max);
        d2 = vminq_u16(d2, max);
        d3 = vminq_u16(d3, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}

static INLINE uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12],
                                               const int16x8_t x_filter_0_7,
                                               const int16x4_t x_filter_8_11,
                                               const int32x4_t offset) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);

  int32x4_t sum = offset;
  sum = vmlal_lane_s16(sum, s[0], x_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3);

  return vqrshrun_n_s32(sum, FILTER_BITS);
}

static INLINE uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12],
                                               const int16x8_t x_filter_0_7,
                                               const int16x4_t x_filter_8_11,
                                               const int32x4_t offset) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);

  int32x4_t sum0 = offset;
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3);

  int32x4_t sum1 = offset;
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3);

  return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
                      vqrshrun_n_s32(sum1, FILTER_BITS));
}

static INLINE void highbd_convolve_x_sr_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    int bd) {
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  // This shim allows us to do only one rounding shift instead of two.
  const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1));
  const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
  const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x4_t s0[12], s1[12], s2[12], s3[12];
      load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                    &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                    &s0[11]);
      load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                    &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                    &s1[11]);
      load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                    &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                    &s2[11]);
      load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                    &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                    &s3[11]);

      uint16x4_t d0 =
          highbd_convolve12_4_x(s0, x_filter_0_7, x_filter_8_11, offset);
      uint16x4_t d1 =
          highbd_convolve12_4_x(s1, x_filter_0_7, x_filter_8_11, offset);
      uint16x4_t d2 =
          highbd_convolve12_4_x(s2, x_filter_0_7, x_filter_8_11, offset);
      uint16x4_t d3 =
          highbd_convolve12_4_x(s3, x_filter_0_7, x_filter_8_11, offset);

      d0 = vmin_u16(d0, vget_low_u16(max));
      d1 = vmin_u16(d1, vget_low_u16(max));
      d2 = vmin_u16(d2, vget_low_u16(max));
      d3 = vmin_u16(d3, vget_low_u16(max));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[12], s1[12], s2[12], s3[12];
        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                      &s0[11]);
        load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                      &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                      &s1[11]);
        load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                      &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                      &s2[11]);
        load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                      &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                      &s3[11]);

        uint16x8_t d0 =
            highbd_convolve12_8_x(s0, x_filter_0_7, x_filter_8_11, offset);
        uint16x8_t d1 =
            highbd_convolve12_8_x(s1, x_filter_0_7, x_filter_8_11, offset);
        uint16x8_t d2 =
            highbd_convolve12_8_x(s2, x_filter_0_7, x_filter_8_11, offset);
        uint16x8_t d3 =
            highbd_convolve12_8_x(s3, x_filter_0_7, x_filter_8_11, offset);

        d0 = vminq_u16(d0, max);
        d1 = vminq_u16(d1, max);
        d2 = vminq_u16(d2, max);
        d3 = vminq_u16(d3, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  }
}

void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const int subpel_x_qn,
                                   ConvolveParams *conv_params, int bd) {
  if (w == 2 || h == 2) {
    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, subpel_x_qn, conv_params, bd);
    return;
  }
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int horiz_offset = filter_params_x->taps / 2 - 1;
  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);

  src -= horiz_offset;

  if (x_filter_taps > 8) {
    highbd_convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h,
                                    x_filter_ptr, conv_params, bd);
    return;
  }
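  // horiz_offset above assumed an 8-tap filter; the 6-tap kernel sits in the
  // middle of the 8-tap array, so start one sample later. Width-4 blocks use
  // the dedicated 4-tap path inside highbd_convolve_x_sr_neon instead.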
  if (x_filter_taps <= 6 && w != 4) {
    highbd_convolve_x_sr_6tap_neon(src + 1, src_stride, dst, dst_stride, w, h,
                                   x_filter_ptr, conv_params, bd);
    return;
  }

  highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h,
                            x_filter_ptr, conv_params, bd);
}

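// Vertical-pass helpers for the 2D filter. The rounding behaviour here
// depends on conv_params: the caller is responsible for folding the
// intermediate offset and the round_1 rounding bias into `offset`, while
// `round_shift` carries -round_1 so that vshlq_s32 performs the second-stage
// arithmetic shift right.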
static INLINE uint16x4_t highbd_convolve6_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t y_filter, const int32x4_t round_shift,
    const int32x4_t offset) {
  // Values at indices 0 and 7 of y_filter are zero.
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2);

  sum = vshlq_s32(sum, round_shift);
  return vqmovun_s32(sum);
}

static INLINE uint16x8_t highbd_convolve6_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t y_filter, const int32x4_t round_shift,
    const int32x4_t offset) {
  // Values at indices 0 and 7 of y_filter are zero.
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2);

  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2);

  sum0 = vshlq_s32(sum0, round_shift);
  sum1 = vshlq_s32(sum1, round_shift);

  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}

static INLINE void highbd_convolve_2d_sr_vert_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    int bd, const int offset) {
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;
    int16x4_t s0, s1, s2, s3, s4;
    load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
    s += 5 * src_stride;

    do {
      int16x4_t s5, s6, s7, s8;
      load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8);

      uint16x4_t d0 = highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter,
                                              round1_shift_s32, offset_s32);
      uint16x4_t d1 = highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter,
                                              round1_shift_s32, offset_s32);
      uint16x4_t d2 = highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter,
                                              round1_shift_s32, offset_s32);
      uint16x4_t d3 = highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter,
                                              round1_shift_s32, offset_s32);

      d0 = vmin_u16(d0, vget_low_u16(max));
      d1 = vmin_u16(d1, vget_low_u16(max));
      d2 = vmin_u16(d2, vget_low_u16(max));
      d3 = vmin_u16(d3, vget_low_u16(max));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;
      int16x8_t s0, s1, s2, s3, s4;
      load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4);
      s += 5 * src_stride;

      do {
        int16x8_t s5, s6, s7, s8;
        load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8);

        uint16x8_t d0 = highbd_convolve6_8_2d_v(
            s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32);
        uint16x8_t d1 = highbd_convolve6_8_2d_v(
            s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32);
        uint16x8_t d2 = highbd_convolve6_8_2d_v(
            s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32);
        uint16x8_t d3 = highbd_convolve6_8_2d_v(
            s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32);

        d0 = vminq_u16(d0, max);
        d1 = vminq_u16(d1, max);
        d2 = vminq_u16(d2, max);
        d3 = vminq_u16(d3, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

static INLINE uint16x4_t highbd_convolve8_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter,
    const int32x4_t round_shift, const int32x4_t offset) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_lo, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);

  sum = vshlq_s32(sum, round_shift);
  return vqmovun_s32(sum);
}

static INLINE uint16x8_t highbd_convolve8_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
    const int32x4_t round_shift, const int32x4_t offset) {
  const int16x4_t y_filter_lo = vget_low_s16(y_filter);
  const int16x4_t y_filter_hi = vget_high_s16(y_filter);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3);

  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3);

  sum0 = vshlq_s32(sum0, round_shift);
  sum1 = vshlq_s32(sum1, round_shift);

  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}

static INLINE void highbd_convolve_2d_sr_vert_8tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    int bd, const int offset) {
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      uint16x4_t d0 =
          highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                  round1_shift_s32, offset_s32);
      uint16x4_t d1 =
          highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                  round1_shift_s32, offset_s32);
      uint16x4_t d2 =
          highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                  round1_shift_s32, offset_s32);
      uint16x4_t d3 =
          highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                  round1_shift_s32, offset_s32);

      d0 = vmin_u16(d0, vget_low_u16(max));
      d1 = vmin_u16(d1, vget_low_u16(max));
      d2 = vmin_u16(d2, vget_low_u16(max));
      d3 = vmin_u16(d3, vget_low_u16(max));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
                                    round1_shift_s32, offset_s32);
        uint16x8_t d1 =
            highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
                                    round1_shift_s32, offset_s32);
        uint16x8_t d2 =
            highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
                                    round1_shift_s32, offset_s32);
        uint16x8_t d3 =
            highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
                                    round1_shift_s32, offset_s32);

        d0 = vminq_u16(d0, max);
        d1 = vminq_u16(d1, max);
        d2 = vminq_u16(d2, max);
        d3 = vminq_u16(d3, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

static INLINE uint16x4_t highbd_convolve12_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const int32x4_t round_shift, const int32x4_t offset) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);

  sum = vshlq_s32(sum, round_shift);
  return vqmovun_s32(sum);
}

static INLINE uint16x8_t highbd_convolve12_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const int32x4_t round_shift, const int32x4_t offset) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);

  int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);

  sum0 = vshlq_s32(sum0, round_shift);
  sum1 = vshlq_s32(sum1, round_shift);

  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}

static INLINE void highbd_convolve_2d_sr_vert_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params,
    const int bd, const int offset) {
  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
  const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
  const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);
  const int round1_shift = conv_params->round_1;
  const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                  &s9, &s10);
    s += 11 * src_stride;

    do {
      int16x4_t s11, s12, s13, s14;
      load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14);

      uint16x4_t d0 = highbd_convolve12_4_2d_v(
          s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32);
      uint16x4_t d1 = highbd_convolve12_4_2d_v(
          s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32);
      uint16x4_t d2 = highbd_convolve12_4_2d_v(
          s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32);
      uint16x4_t d3 = highbd_convolve12_4_2d_v(
          s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
          y_filter_8_11, round1_shift_s32, offset_s32);

      d0 = vmin_u16(d0, vget_low_u16(max));
      d1 = vmin_u16(d1, vget_low_u16(max));
      d2 = vmin_u16(d2, vget_low_u16(max));
      d3 = vmin_u16(d3, vget_low_u16(max));

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      s7 = s11;
      s8 = s12;
      s9 = s13;
      s10 = s14;
      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    do {
      int height = h;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                    &s9, &s10);
      s += 11 * src_stride;

      do {
        int16x8_t s11, s12, s13, s14;
        load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14);

        uint16x8_t d0 = highbd_convolve12_8_2d_v(
            s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32);
        uint16x8_t d1 = highbd_convolve12_8_2d_v(
            s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32);
        uint16x8_t d2 = highbd_convolve12_8_2d_v(
            s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32);
        uint16x8_t d3 = highbd_convolve12_8_2d_v(
            s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7,
            y_filter_8_11, round1_shift_s32, offset_s32);

        d0 = vminq_u16(d0, max);
        d1 = vminq_u16(d1, max);
        d2 = vminq_u16(d2, max);
        d3 = vminq_u16(d3, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s7 = s11;
        s8 = s12;
        s9 = s13;
        s10 = s14;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);

      src_ptr += 8;
      dst_ptr += 8;
      w -= 8;
    } while (w != 0);
  }
}

highbd_convolve6_8_2d_h(const int16x8_t s[6],const int16x8_t x_filter,const int32x4_t shift_s32,const int32x4_t offset)1387 static INLINE uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6],
1388 const int16x8_t x_filter,
1389 const int32x4_t shift_s32,
1390 const int32x4_t offset) {
  // Values at indices 0 and 7 of x_filter are zero.
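  // (6-tap kernels are stored centered in an 8-tap array: for taps
  // {a, b, c, d, e, f} the array is {0, a, b, c, d, e, f, 0}, so only lanes
  // 1..6 contribute below.)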
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2);

  int32x4_t sum1 =
      vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2);

  sum0 = vqrshlq_s32(sum0, shift_s32);
  sum1 = vqrshlq_s32(sum1, shift_s32);

  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}

static INLINE void highbd_convolve_2d_sr_horiz_6tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
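  // For instance, with an 8-tap vertical filter an output block of height h
  // needs h + 7 intermediate rows, so the caller passes
  // im_h = h + clamped_y_taps - 1 as the height here.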
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
  int height = h;

  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6], s1[6], s2[6], s3[6];
      load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                   &s0[4], &s0[5]);
      load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                   &s1[4], &s1[5]);
      load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                   &s2[4], &s2[5]);
      load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                   &s3[4], &s3[5]);

      uint16x8_t d0 =
          highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
      uint16x8_t d1 =
          highbd_convolve6_8_2d_h(s1, x_filter, shift_s32, offset_s32);
      uint16x8_t d2 =
          highbd_convolve6_8_2d_h(s2, x_filter, shift_s32, offset_s32);
      uint16x8_t d3 =
          highbd_convolve6_8_2d_h(s3, x_filter, shift_s32, offset_s32);

      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += 4 * src_stride;
    dst_ptr += 4 * dst_stride;
    height -= 4;
  } while (height > 4);
  do {
    int width = w;
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x8_t s0[6];
      load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]);

      uint16x8_t d0 =
          highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32);
      vst1q_u16(d, d0);

      s += 8;
      d += 8;
      width -= 8;
    } while (width != 0);
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  } while (--height != 0);
}

static INLINE uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4],
                                                 const int16x4_t x_filter,
                                                 const int32x4_t shift_s32,
                                                 const int32x4_t offset) {
  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0);
  sum = vmlal_lane_s16(sum, s[1], x_filter, 1);
  sum = vmlal_lane_s16(sum, s[2], x_filter, 2);
  sum = vmlal_lane_s16(sum, s[3], x_filter, 3);

  sum = vqrshlq_s32(sum, shift_s32);
  return vqmovun_s32(sum);
}

static INLINE uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8],
                                                 const int16x8_t x_filter,
                                                 const int32x4_t shift_s32,
                                                 const int32x4_t offset) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);

  int32x4_t sum1 =
      vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);

  sum0 = vqrshlq_s32(sum0, shift_s32);
  sum1 = vqrshlq_s32(sum1, shift_s32);

  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}

static INLINE void highbd_convolve_2d_sr_horiz_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  if (w == 4) {
    // 4-tap filters are used for blocks having width <= 4.
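    // The 4-tap kernel occupies indices 2..5 of the 8-tap filter array, hence
    // the +2 on the filter pointer. The caller moved src_ptr back by
    // clamped_x_taps / 2 - 1 = 2 pixels, one more than a 4-tap filter needs,
    // hence the +1 on the source pointer.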
    const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2);
    const int16_t *s = (const int16_t *)(src_ptr + 1);
    uint16_t *d = dst_ptr;

    do {
      int16x4_t s0[4], s1[4], s2[4], s3[4];
      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 =
          highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);
      uint16x4_t d1 =
          highbd_convolve4_4_2d_h(s1, x_filter, shift_s32, offset_s32);
      uint16x4_t d2 =
          highbd_convolve4_4_2d_h(s2, x_filter, shift_s32, offset_s32);
      uint16x4_t d3 =
          highbd_convolve4_4_2d_h(s3, x_filter, shift_s32, offset_s32);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    do {
      int16x4_t s0[4];
      load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]);

      uint16x4_t d0 =
          highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32);

      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
        uint16x8_t d1 =
            highbd_convolve8_8_2d_h(s1, x_filter, shift_s32, offset_s32);
        uint16x8_t d2 =
            highbd_convolve8_8_2d_h(s2, x_filter, shift_s32, offset_s32);
        uint16x8_t d3 =
            highbd_convolve8_8_2d_h(s3, x_filter, shift_s32, offset_s32);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);

        uint16x8_t d0 =
            highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}

static INLINE uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12],
                                                  const int16x8_t x_filter_0_7,
                                                  const int16x4_t x_filter_8_11,
                                                  const int32x4_t shift_s32,
                                                  const int32x4_t offset) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);

  int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3);

  sum = vqrshlq_s32(sum, shift_s32);
  return vqmovun_s32(sum);
}

static INLINE uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12],
                                                  const int16x8_t x_filter_0_7,
                                                  const int16x4_t x_filter_8_11,
                                                  const int32x4_t shift_s32,
                                                  const int32x4_t offset) {
  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);

  int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3);

  int32x4_t sum1 =
      vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3);

  sum0 = vqrshlq_s32(sum0, shift_s32);
  sum1 = vqrshlq_s32(sum1, shift_s32);

  return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1));
}

static INLINE void highbd_convolve_2d_sr_horiz_12tap_neon(
    const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride,
    int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params,
    const int offset) {
  // The smallest block height processed by the SIMD functions is 4, and the
  // horizontal convolution needs to process an extra (filter_taps - 1) lines
  // for the vertical convolution.
  assert(h >= 5);
  const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0);
  const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
  const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
  const int32x4_t offset_s32 = vdupq_n_s32(offset);

  if (w == 4) {
    const int16_t *s = (const int16_t *)src_ptr;
    uint16_t *d = dst_ptr;

    do {
      int16x4_t s0[12], s1[12], s2[12], s3[12];
      load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                    &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                    &s0[11]);
      load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                    &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                    &s1[11]);
      load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                    &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                    &s2[11]);
      load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                    &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                    &s3[11]);

      uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);
      uint16x4_t d1 = highbd_convolve12_4_2d_h(s1, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);
      uint16x4_t d2 = highbd_convolve12_4_2d_h(s2, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);
      uint16x4_t d3 = highbd_convolve12_4_2d_h(s3, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      h -= 4;
    } while (h > 4);

    do {
      int16x4_t s0[12];
      load_s16_4x12(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5],
                    &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]);

      uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11,
                                               shift_s32, offset_s32);

      vst1_u16(d, d0);

      s += src_stride;
      d += dst_stride;
    } while (--h != 0);
  } else {
    int height = h;

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[12], s1[12], s2[12], s3[12];
        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                      &s0[11]);
        load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                      &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
                      &s1[11]);
        load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                      &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10],
                      &s2[11]);
        load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                      &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10],
                      &s3[11]);

        uint16x8_t d0 = highbd_convolve12_8_2d_h(
            s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        uint16x8_t d1 = highbd_convolve12_8_2d_h(
            s1, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        uint16x8_t d2 = highbd_convolve12_8_2d_h(
            s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        uint16x8_t d3 = highbd_convolve12_8_2d_h(
            s3, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        width -= 8;
      } while (width != 0);
      src_ptr += 4 * src_stride;
      dst_ptr += 4 * dst_stride;
      height -= 4;
    } while (height > 4);

    do {
      int width = w;
      const int16_t *s = (const int16_t *)src_ptr;
      uint16_t *d = dst_ptr;

      do {
        int16x8_t s0[12];
        load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                      &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
                      &s0[11]);

        uint16x8_t d0 = highbd_convolve12_8_2d_h(
            s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32);
        vst1q_u16(d, d0);

        s += 8;
        d += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += src_stride;
      dst_ptr += dst_stride;
    } while (--height != 0);
  }
}

void av1_highbd_convolve_2d_sr_neon(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn,
                                    const int subpel_y_qn,
                                    ConvolveParams *conv_params, int bd) {
  if (w == 2 || h == 2) {
    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                                filter_params_x, filter_params_y, subpel_x_qn,
                                subpel_y_qn, conv_params, bd);
    return;
  }
  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
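  // Sub-6-tap kernels are stored zero-padded within 8-tap arrays, so the tap
  // counts can be clamped up to 6 and handled by the 6-tap paths, whose outer
  // taps multiply the padding zeros.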
  const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
  const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps;

  const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
  const int im_h = h + clamped_y_taps - 1;
  const int im_stride = MAX_SB_SIZE;
  const int vert_offset = clamped_y_taps / 2 - 1;
  const int horiz_offset = clamped_x_taps / 2 - 1;
  const int x_offset_initial = (1 << (bd + FILTER_BITS - 1));
  const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a
  // simple shift left instead of a rounding saturating shift left.
  const int y_offset =
      (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1));
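  // For instance, if conv_params->round_1 == 11 then
  // ROUND_POWER_OF_TWO(x, 11) == (x + (1 << 10)) >> 11; folding the (1 << 10)
  // term into the offset lets the vertical kernels apply a plain vshlq_s32 by
  // -11 instead of a rounding shift.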

  const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset;

  const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);

  if (x_filter_taps > 8) {
    highbd_convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block,
                                           im_stride, w, im_h, x_filter_ptr,
                                           conv_params, x_offset_initial);

    highbd_convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride,
                                          w, h, y_filter_ptr, conv_params, bd,
                                          y_offset);
    return;
  }
  if (x_filter_taps <= 6 && w != 4) {
    highbd_convolve_2d_sr_horiz_6tap_neon(src_ptr, src_stride, im_block,
                                          im_stride, w, im_h, x_filter_ptr,
                                          conv_params, x_offset_initial);
  } else {
    highbd_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride,
                                     w, im_h, x_filter_ptr, conv_params,
                                     x_offset_initial);
  }

  if (y_filter_taps <= 6) {
    highbd_convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, y_filter_ptr, conv_params, bd,
                                         y_offset);
  } else {
    highbd_convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride,
                                         w, h, y_filter_ptr, conv_params, bd,
                                         y_offset);
  }
}

// Filter used is [64, 64].
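// Applying it with the usual rounding right shift gives
// (64 * s0 + 64 * s1 + (1 << (FILTER_BITS - 1))) >> FILTER_BITS, which
// simplifies to (s0 + s1 + 1) >> 1, i.e. a single rounding halving add.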
void av1_highbd_convolve_x_sr_intrabc_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn,
    ConvolveParams *conv_params, int bd) {
  assert(subpel_x_qn == 8);
  assert(filter_params_x->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)conv_params;
  (void)bd;

  if (w <= 4) {
    do {
      uint16x4_t s0 = vld1_u16(src);
      uint16x4_t s1 = vld1_u16(src + 1);

      uint16x4_t d0 = vrhadd_u16(s0, s1);

      if (w == 2) {
        store_u16_2x1(dst, d0);
      } else {
        vst1_u16(dst, d0);
      }

      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  } else {
    do {
      const uint16_t *src_ptr = src;
      uint16_t *dst_ptr = dst;
      int width = w;

      do {
        uint16x8_t s0 = vld1q_u16(src_ptr);
        uint16x8_t s1 = vld1q_u16(src_ptr + 1);

        uint16x8_t d0 = vrhaddq_u16(s0, s1);

        vst1q_u16(dst_ptr, d0);

        src_ptr += 8;
        dst_ptr += 8;
        width -= 8;
      } while (width != 0);
      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  }
}

// Filter used is [64, 64].
void av1_highbd_convolve_y_sr_intrabc_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn,
    int bd) {
  assert(subpel_y_qn == 8);
  assert(filter_params_y->taps == 2);
  (void)filter_params_y;
  (void)subpel_y_qn;
  (void)bd;

  if (w <= 4) {
    do {
      uint16x4_t s0 = vld1_u16(src);
      uint16x4_t s1 = vld1_u16(src + src_stride);

      uint16x4_t d0 = vrhadd_u16(s0, s1);

      if (w == 2) {
        store_u16_2x1(dst, d0);
      } else {
        vst1_u16(dst, d0);
      }

      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  } else {
    do {
      const uint16_t *src_ptr = src;
      uint16_t *dst_ptr = dst;
      int height = h;

      do {
        uint16x8_t s0 = vld1q_u16(src_ptr);
        uint16x8_t s1 = vld1q_u16(src_ptr + src_stride);

        uint16x8_t d0 = vrhaddq_u16(s0, s1);

        vst1q_u16(dst_ptr, d0);

        src_ptr += src_stride;
        dst_ptr += dst_stride;
      } while (--height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

// Both horizontal and vertical passes use the same 2-tap filter: [64, 64].
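// The horizontal pass below keeps the full-precision sums h = s0 + s1; the
// vertical pass then computes (h0 + h1 + 2) >> 2 using two halving adds and a
// +1 offset, i.e. the correctly rounded average of the four source pixels.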
void av1_highbd_convolve_2d_sr_intrabc_neon(
    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
  assert(subpel_x_qn == 8);
  assert(subpel_y_qn == 8);
  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);
  (void)filter_params_x;
  (void)subpel_x_qn;
  (void)filter_params_y;
  (void)subpel_y_qn;
  (void)conv_params;
  (void)bd;

  DECLARE_ALIGNED(16, uint16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
  int im_h = h + 1;
  int im_stride = MAX_SB_SIZE;

  uint16x8_t vert_offset = vdupq_n_u16(1);

  uint16_t *im = im_block;

  // Horizontal filter.
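  // The unshifted sums s0 + s1 fit comfortably in 16 bits: even for 12-bit
  // input, 2 * 4095 < 65536.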
  if (w <= 4) {
    do {
      uint16x4_t s0 = vld1_u16(src);
      uint16x4_t s1 = vld1_u16(src + 1);

      uint16x4_t d0 = vadd_u16(s0, s1);

      // Safe to store the whole vector, the im buffer is big enough.
      vst1_u16(im, d0);

      src += src_stride;
      im += im_stride;
    } while (--im_h != 0);
  } else {
    do {
      const uint16_t *src_ptr = src;
      uint16_t *im_ptr = im;
      int width = w;

      do {
        uint16x8_t s0 = vld1q_u16(src_ptr);
        uint16x8_t s1 = vld1q_u16(src_ptr + 1);

        uint16x8_t d0 = vaddq_u16(s0, s1);

        vst1q_u16(im_ptr, d0);

        src_ptr += 8;
        im_ptr += 8;
        width -= 8;
      } while (width != 0);
      src += src_stride;
      im += im_stride;
    } while (--im_h != 0);
  }

  im = im_block;

  // Vertical filter.
  if (w <= 4) {
    do {
      uint16x4_t s0 = vld1_u16(im);
      uint16x4_t s1 = vld1_u16(im + im_stride);

      uint16x4_t d0 = vhadd_u16(s0, s1);
      d0 = vhadd_u16(d0, vget_low_u16(vert_offset));

      if (w == 2) {
        store_u16_2x1(dst, d0);
      } else {
        vst1_u16(dst, d0);
      }

      im += im_stride;
      dst += dst_stride;
    } while (--h != 0);
  } else {
    do {
      uint16_t *im_ptr = im;
      uint16_t *dst_ptr = dst;
      int height = h;

      do {
        uint16x8_t s0 = vld1q_u16(im_ptr);
        uint16x8_t s1 = vld1q_u16(im_ptr + im_stride);

        uint16x8_t d0 = vhaddq_u16(s0, s1);
        d0 = vhaddq_u16(d0, vert_offset);

        vst1q_u16(dst_ptr, d0);

        im_ptr += im_stride;
        dst_ptr += dst_stride;
      } while (--height != 0);
      im += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}