1 /*
2 *
3 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
4 *
5 * This source code is subject to the terms of the BSD 2 Clause License and
6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7 * was not distributed with this source code in the LICENSE file, you can
8 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
9 * Media Patent License 1.0 was not distributed with this source code in the
10 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
11 */
12
13 #include <assert.h>
14 #include <arm_neon.h>
15
16 #include "config/av1_rtcd.h"
17
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_ports/mem.h"
20 #include "av1/common/convolve.h"
21 #include "av1/common/filter.h"
22 #include "av1/common/arm/convolve_neon.h"
23 #include "av1/common/arm/mem_neon.h"
24 #include "av1/common/arm/transpose_neon.h"
25
26 static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
27 const int16x4_t s2, const int16x4_t s3,
28 const int16x4_t s4, const int16x4_t s5,
29 const int16x4_t s6, const int16x4_t s7,
30 const int16_t *filter) {
31 int16x4_t sum;
32
33 sum = vmul_n_s16(s0, filter[0]);
34 sum = vmla_n_s16(sum, s1, filter[1]);
35 sum = vmla_n_s16(sum, s2, filter[2]);
36 sum = vmla_n_s16(sum, s5, filter[5]);
37 sum = vmla_n_s16(sum, s6, filter[6]);
38 sum = vmla_n_s16(sum, s7, filter[7]);
39 /* filter[3] can take a max value of 128, so 128 * 255 + sum can exceed the
40 * int16_t range; hence the saturating adds below for the filter[3] and
41 * filter[4] taps. */
42 sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
43 sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
44
45 return sum;
46 }
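/* Illustrative bound, not part of the upstream code: with 8-bit samples
 * (0..255) and an AV1 8-tap kernel whose largest tap can reach 128, the
 * single product 128 * 255 = 32640 already sits close to INT16_MAX (32767),
 * so folding in the partial sum of the remaining taps can overflow int16_t.
 * That is why the filter[3] and filter[4] products are accumulated with the
 * saturating vqadd_s16 above instead of plain vmla_n_s16. */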
47
48 static INLINE uint8x8_t convolve8_horiz_8x8(
49 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
50 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
51 const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
52 const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
53 int16x8_t sum;
54
55 sum = vmulq_n_s16(s0, filter[0]);
56 sum = vmlaq_n_s16(sum, s1, filter[1]);
57 sum = vmlaq_n_s16(sum, s2, filter[2]);
58 sum = vmlaq_n_s16(sum, s5, filter[5]);
59 sum = vmlaq_n_s16(sum, s6, filter[6]);
60 sum = vmlaq_n_s16(sum, s7, filter[7]);
61 /* filter[3] can take a max value of 128, so 128 * 255 + sum can exceed the
62 * int16_t range; hence the saturating adds below for the filter[3] and
63 * filter[4] taps. */
64 sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
65 sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
66
67 sum = vqrshlq_s16(sum, shift_round_0);
68 sum = vqrshlq_s16(sum, shift_by_bits);
69
70 return vqmovun_s16(sum);
71 }
72
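/* A minimal scalar sketch of what convolve8_horiz_8x8 computes for one
 * output pixel, kept under "#if 0" so it is never compiled. Assumptions not
 * taken from the code above: the accumulator is widened to 32 bits here for
 * simplicity (the NEON path keeps saturating 16-bit accumulators), both
 * shift amounts are positive, and the helper name is hypothetical. */
#if 0
static uint8_t scalar_convolve8_horiz_pixel(const int16_t *s /* 8 samples */,
                                            const int16_t *filter,
                                            int round_0, int bits) {
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += s[k] * filter[k];
  // Two rounding right shifts, mirroring shift_round_0 then shift_by_bits
  // (vqrshlq_s16 with a negative shift is a saturating rounding shift right).
  sum = (sum + (1 << (round_0 - 1))) >> round_0;
  sum = (sum + (1 << (bits - 1))) >> bits;
  // vqmovun_s16: saturate to the unsigned 8-bit range.
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}
#endif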
73 #if !defined(__aarch64__)
74 static INLINE uint8x8_t convolve8_horiz_4x1(
75 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
76 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
77 const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
78 const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
79 int16x4_t sum;
80
81 sum = vmul_n_s16(s0, filter[0]);
82 sum = vmla_n_s16(sum, s1, filter[1]);
83 sum = vmla_n_s16(sum, s2, filter[2]);
84 sum = vmla_n_s16(sum, s5, filter[5]);
85 sum = vmla_n_s16(sum, s6, filter[6]);
86 sum = vmla_n_s16(sum, s7, filter[7]);
87 /* filter[3] can take a max value of 128, so 128 * 255 + sum can exceed the
88 * int16_t range; hence the saturating adds below for the filter[3] and
89 * filter[4] taps. */
90 sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
91 sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
92
93 sum = vqrshl_s16(sum, shift_round_0);
94 sum = vqrshl_s16(sum, shift_by_bits);
95
96 return vqmovun_s16(vcombine_s16(sum, sum));
97 }
98 #endif  // !defined(__aarch64__)
99
100 static INLINE uint8x8_t convolve8_vert_8x4(
101 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
102 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
103 const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
104 int16x8_t sum;
105
106 sum = vmulq_n_s16(s0, filter[0]);
107 sum = vmlaq_n_s16(sum, s1, filter[1]);
108 sum = vmlaq_n_s16(sum, s2, filter[2]);
109 sum = vmlaq_n_s16(sum, s5, filter[5]);
110 sum = vmlaq_n_s16(sum, s6, filter[6]);
111 sum = vmlaq_n_s16(sum, s7, filter[7]);
112 /* filter[3] can take a max value of 128, so 128 * 255 + sum can exceed the
113 * int16_t range; hence the saturating adds below for the filter[3] and
114 * filter[4] taps. */
115 sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
116 sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
117
118 return vqrshrun_n_s16(sum, FILTER_BITS);
119 }
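/* Unlike the 2-D path, this y-only helper applies no intermediate rounding:
 * a single vqrshrun_n_s16(sum, FILTER_BITS) performs the full 1/128
 * normalization, rounding and clamp to 0..255 in one step. */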
120
121 static INLINE uint16x4_t convolve8_vert_4x4_s32(
122 const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
123 const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
124 const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
125 const int32x4_t round_shift_vec, const int32x4_t offset_const,
126 const int32x4_t sub_const_vec) {
127 int32x4_t sum0;
128 uint16x4_t res;
129 const int32x4_t zero = vdupq_n_s32(0);
130
131 sum0 = vmull_n_s16(s0, y_filter[0]);
132 sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
133 sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
134 sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
135 sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
136 sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
137 sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
138 sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
139
140 sum0 = vaddq_s32(sum0, offset_const);
141 sum0 = vqrshlq_s32(sum0, round_shift_vec);
142 sum0 = vsubq_s32(sum0, sub_const_vec);
143 sum0 = vmaxq_s32(sum0, zero);
144
145 res = vmovn_u32(vreinterpretq_u32_s32(sum0));
146
147 return res;
148 }
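/* How this helper and the 8-wide variant below undo their offsets (a summary
 * derived from the constants passed in by av1_convolve_2d_sr_neon): the
 * accumulator carries (1 << offset_bits) so it stays non-negative through the
 * rounding shift by round_1; after that shift, the larger power-of-two term
 * in sub_const_vec removes this offset again, while the smaller, half-sized
 * term cancels the horiz_const offset that the horizontal pass baked into the
 * intermediate buffer (propagated through a vertical filter whose taps sum to
 * 1 << FILTER_BITS). vmaxq_s32 then clamps any negative residue before
 * narrowing. */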
149
150 static INLINE uint8x8_t convolve8_vert_8x4_s32(
151 const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
152 const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
153 const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
154 const int32x4_t round_shift_vec, const int32x4_t offset_const,
155 const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
156 int32x4_t sum0, sum1;
157 uint16x8_t res;
158 const int32x4_t zero = vdupq_n_s32(0);
159
160 sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
161 sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
162 sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
163 sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
164 sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
165 sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
166 sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
167 sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);
168
169 sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
170 sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
171 sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
172 sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
173 sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
174 sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
175 sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
176 sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);
177
178 sum0 = vaddq_s32(sum0, offset_const);
179 sum1 = vaddq_s32(sum1, offset_const);
180 sum0 = vqrshlq_s32(sum0, round_shift_vec);
181 sum1 = vqrshlq_s32(sum1, round_shift_vec);
182 sum0 = vsubq_s32(sum0, sub_const_vec);
183 sum1 = vsubq_s32(sum1, sub_const_vec);
184 sum0 = vmaxq_s32(sum0, zero);
185 sum1 = vmaxq_s32(sum1, zero);
186 res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
187 vqmovn_u32(vreinterpretq_u32_s32(sum1)));
188
189 res = vqrshlq_u16(res, vec_round_bits);
190
191 return vqmovn_u16(res);
192 }
193
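/* Dispatch summary for av1_convolve_x_sr_neon below: on AArch64, h == 4 uses
 * a transposed 4-row path and w <= 4 is produced eight rows at a time through
 * an 8x8 transpose, while wider blocks are filtered in 8x8 tiles; on 32-bit
 * Arm the narrow and wide cases are processed one output row per iteration. */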
194 void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
195 int dst_stride, int w, int h,
196 const InterpFilterParams *filter_params_x,
197 const InterpFilterParams *filter_params_y,
198 const int subpel_x_q4, const int subpel_y_q4,
199 ConvolveParams *conv_params) {
200 const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
201 const int8_t bits = FILTER_BITS - conv_params->round_0;
202
203 (void)subpel_y_q4;
204 (void)conv_params;
205 (void)filter_params_y;
206
207 uint8x8_t t0;
208 #if defined(__aarch64__)
209 uint8x8_t t1, t2, t3;
210 #endif
211
212 assert(bits >= 0);
213 assert((FILTER_BITS - conv_params->round_1) >= 0 ||
214 ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
215
216 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
217 filter_params_x, subpel_x_q4 & SUBPEL_MASK);
218
219 const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
220 const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
221
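// For the 8-tap kernels handled here, stepping back (taps / 2 - 1) = 3
// columns centers the filter so that filter[3] and filter[4] straddle the
// output position.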
222 src -= horiz_offset;
223 #if defined(__aarch64__)
224 if (h == 4) {
225 uint8x8_t d01, d23;
226 int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
227 int16x8_t d01_temp, d23_temp;
228
229 __builtin_prefetch(src + 0 * src_stride);
230 __builtin_prefetch(src + 1 * src_stride);
231 __builtin_prefetch(src + 2 * src_stride);
232 __builtin_prefetch(src + 3 * src_stride);
233
234 load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
235 transpose_u8_8x4(&t0, &t1, &t2, &t3);
236
237 s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
238 s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
239 s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
240 s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
241 s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
242 s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
243 s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
244 __builtin_prefetch(dst + 0 * dst_stride);
245 __builtin_prefetch(dst + 1 * dst_stride);
246 __builtin_prefetch(dst + 2 * dst_stride);
247 __builtin_prefetch(dst + 3 * dst_stride);
248 src += 7;
249
250 do {
251 load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
252 transpose_u8_8x4(&t0, &t1, &t2, &t3);
253
254 s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
255 s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
256 s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
257 s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
258
259 d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
260
261 d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
262
263 d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
264
265 d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
266
267 d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0);
268 d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0);
269
270 d01_temp = vqrshlq_s16(d01_temp, shift_by_bits);
271 d23_temp = vqrshlq_s16(d23_temp, shift_by_bits);
272
273 d01 = vqmovun_s16(d01_temp);
274 d23 = vqmovun_s16(d23_temp);
275
276 transpose_u8_4x4(&d01, &d23);
277
278 if (w != 2) {
279 vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), // 00 01 02 03
280 vreinterpret_u32_u8(d01), 0);
281 vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), // 10 11 12 13
282 vreinterpret_u32_u8(d23), 0);
283 vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), // 20 21 22 23
284 vreinterpret_u32_u8(d01), 1);
285 vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), // 30 31 32 33
286 vreinterpret_u32_u8(d23), 1);
287 } else {
288 vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride), // 00 01
289 vreinterpret_u16_u8(d01), 0);
290 vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride), // 10 11
291 vreinterpret_u16_u8(d23), 0);
292 vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride), // 20 21
293 vreinterpret_u16_u8(d01), 2);
294 vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride), // 30 31
295 vreinterpret_u16_u8(d23), 2);
296 }
297
298 s0 = s4;
299 s1 = s5;
300 s2 = s6;
301 s3 = s7;
302 s4 = s8;
303 s5 = s9;
304 s6 = s10;
305 src += 4;
306 dst += 4;
307 w -= 4;
308 } while (w > 0);
309 } else {
310 #endif
311 int width;
312 const uint8_t *s;
313 int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
314
315 #if defined(__aarch64__)
316 int16x8_t s8, s9, s10;
317 uint8x8_t t4, t5, t6, t7;
318 #endif
319
320 if (w <= 4) {
321 #if defined(__aarch64__)
322 do {
323 load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
324 transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
325 s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
326 s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
327 s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
328 s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
329 s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
330 s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
331 s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
332
333 load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
334 &t7);
335 src += 8 * src_stride;
336 __builtin_prefetch(dst + 0 * dst_stride);
337 __builtin_prefetch(dst + 1 * dst_stride);
338 __builtin_prefetch(dst + 2 * dst_stride);
339 __builtin_prefetch(dst + 3 * dst_stride);
340 __builtin_prefetch(dst + 4 * dst_stride);
341 __builtin_prefetch(dst + 5 * dst_stride);
342 __builtin_prefetch(dst + 6 * dst_stride);
343 __builtin_prefetch(dst + 7 * dst_stride);
344
345 transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
346
347 s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
348 s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
349 s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
350 s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
351
352 __builtin_prefetch(src + 0 * src_stride);
353 __builtin_prefetch(src + 1 * src_stride);
354 __builtin_prefetch(src + 2 * src_stride);
355 __builtin_prefetch(src + 3 * src_stride);
356 __builtin_prefetch(src + 4 * src_stride);
357 __builtin_prefetch(src + 5 * src_stride);
358 __builtin_prefetch(src + 6 * src_stride);
359 __builtin_prefetch(src + 7 * src_stride);
360 t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
361 shift_round_0, shift_by_bits);
362 t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
363 shift_round_0, shift_by_bits);
364 t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
365 shift_round_0, shift_by_bits);
366 t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
367 shift_round_0, shift_by_bits);
368
369 transpose_u8_8x4(&t0, &t1, &t2, &t3);
370
371 if ((w == 4) && (h > 4)) {
372 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
373 0); // 00 01 02 03
374 dst += dst_stride;
375 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
376 0); // 10 11 12 13
377 dst += dst_stride;
378 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
379 0); // 20 21 22 23
380 dst += dst_stride;
381 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
382 0); // 30 31 32 33
383 dst += dst_stride;
384 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
385 1); // 40 41 42 43
386 dst += dst_stride;
387 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
388 1); // 50 51 52 53
389 dst += dst_stride;
390 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
391 1); // 60 61 62 63
392 dst += dst_stride;
393 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
394 1); // 70 71 72 73
395 dst += dst_stride;
396 } else if ((w == 4) && (h == 2)) {
397 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
398 0); // 00 01 02 03
399 dst += dst_stride;
400 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
401 0); // 10 11 12 13
402 dst += dst_stride;
403 } else if ((w == 2) && (h > 4)) {
404 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01
405 dst += dst_stride;
406 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0); // 10 11
407 dst += dst_stride;
408 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0); // 20 21
409 dst += dst_stride;
410 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0); // 30 31
411 dst += dst_stride;
412 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2); // 40 41
413 dst += dst_stride;
414 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2); // 50 51
415 dst += dst_stride;
416 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2); // 60 61
417 dst += dst_stride;
418 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2); // 70 71
419 dst += dst_stride;
420 } else if ((w == 2) && (h == 2)) {
421 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01
422 dst += dst_stride;
423 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0); // 10 11
424 dst += dst_stride;
425 }
426 h -= 8;
427 } while (h > 0);
428 #else
429 int16x8_t tt0;
430 int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
431 const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
432 const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
433 do {
434 t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
435 tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
436 x0 = vget_low_s16(tt0); // a0 a1 a2 a3
437 x4 = vget_high_s16(tt0); // a4 a5 a6 a7
438
439 t0 = vld1_u8(src + 8); // a8 a9 a10 a11 a12 a13 a14 a15
440 tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
441 x7 = vget_low_s16(tt0); // a8 a9 a10 a11
442
443 x1 = vext_s16(x0, x4, 1); // a1 a2 a3 a4
444 x2 = vext_s16(x0, x4, 2); // a2 a3 a4 a5
445 x3 = vext_s16(x0, x4, 3); // a3 a4 a5 a6
446 x5 = vext_s16(x4, x7, 1); // a5 a6 a7 a8
447 x6 = vext_s16(x4, x7, 2); // a6 a7 a8 a9
448 x7 = vext_s16(x4, x7, 3); // a7 a8 a9 a10
449
450 src += src_stride;
451
452 t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
453 shift_round_0_low, shift_by_bits_low);
454
455 if (w == 4) {
456 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
457 0); // 00 01 02 03
458 dst += dst_stride;
459 } else if (w == 2) {
460 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01
461 dst += dst_stride;
462 }
463 h -= 1;
464 } while (h > 0);
465 #endif
466 } else {
467 uint8_t *d;
468 int16x8_t s11;
469 #if defined(__aarch64__)
470 int16x8_t s12, s13, s14;
471 do {
472 __builtin_prefetch(src + 0 * src_stride);
473 __builtin_prefetch(src + 1 * src_stride);
474 __builtin_prefetch(src + 2 * src_stride);
475 __builtin_prefetch(src + 3 * src_stride);
476 __builtin_prefetch(src + 4 * src_stride);
477 __builtin_prefetch(src + 5 * src_stride);
478 __builtin_prefetch(src + 6 * src_stride);
479 __builtin_prefetch(src + 7 * src_stride);
480 load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
481 transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
482 s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
483 s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
484 s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
485 s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
486 s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
487 s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
488 s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
489
490 width = w;
491 s = src + 7;
492 d = dst;
493 __builtin_prefetch(dst + 0 * dst_stride);
494 __builtin_prefetch(dst + 1 * dst_stride);
495 __builtin_prefetch(dst + 2 * dst_stride);
496 __builtin_prefetch(dst + 3 * dst_stride);
497 __builtin_prefetch(dst + 4 * dst_stride);
498 __builtin_prefetch(dst + 5 * dst_stride);
499 __builtin_prefetch(dst + 6 * dst_stride);
500 __builtin_prefetch(dst + 7 * dst_stride);
501
502 do {
503 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
504 transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
505 s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
506 s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
507 s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
508 s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
509 s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
510 s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
511 s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
512 s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
513
514 t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
515 shift_round_0, shift_by_bits);
516
517 t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
518 shift_round_0, shift_by_bits);
519
520 t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
521 shift_round_0, shift_by_bits);
522
523 t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
524 shift_round_0, shift_by_bits);
525
526 t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
527 shift_round_0, shift_by_bits);
528
529 t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
530 shift_round_0, shift_by_bits);
531
532 t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
533 shift_round_0, shift_by_bits);
534
535 t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
536 x_filter, shift_round_0, shift_by_bits);
537
538 transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
539 if (h != 2) {
540 store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
541 } else {
542 store_row2_u8_8x8(d, dst_stride, t0, t1);
543 }
544 s0 = s8;
545 s1 = s9;
546 s2 = s10;
547 s3 = s11;
548 s4 = s12;
549 s5 = s13;
550 s6 = s14;
551 s += 8;
552 d += 8;
553 width -= 8;
554 } while (width > 0);
555 src += 8 * src_stride;
556 dst += 8 * dst_stride;
557 h -= 8;
558 } while (h > 0);
559 #else
560 do {
561 t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
562 s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
563
564 width = w;
565 s = src + 8;
566 d = dst;
567 __builtin_prefetch(dst);
568
569 do {
570 t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
571 s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
572 s11 = s0;
573 s0 = s7;
574
575 s1 = vextq_s16(s11, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
576 s2 = vextq_s16(s11, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
577 s3 = vextq_s16(s11, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
578 s4 = vextq_s16(s11, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
579 s5 = vextq_s16(s11, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
580 s6 = vextq_s16(s11, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
581 s7 = vextq_s16(s11, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
582
583 t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
584 shift_round_0, shift_by_bits);
585 vst1_u8(d, t0);
586
587 s += 8;
588 d += 8;
589 width -= 8;
590 } while (width > 0);
591 src += src_stride;
592 dst += dst_stride;
593 h -= 1;
594 } while (h > 0);
595 #endif
596 }
597 #if defined(__aarch64__)
598 }
599 #endif
600 }
601
602 void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
603 int dst_stride, int w, int h,
604 const InterpFilterParams *filter_params_x,
605 const InterpFilterParams *filter_params_y,
606 const int subpel_x_q4, const int subpel_y_q4,
607 ConvolveParams *conv_params) {
608 const int vert_offset = filter_params_y->taps / 2 - 1;
609
610 src -= vert_offset * src_stride;
611
612 (void)filter_params_x;
613 (void)subpel_x_q4;
614 (void)conv_params;
615
616 assert(conv_params->round_0 <= FILTER_BITS);
617 assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
618 ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
619
620 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
621 filter_params_y, subpel_y_q4 & SUBPEL_MASK);
622
623 if (w <= 4) {
624 uint8x8_t d01;
625 int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
626 #if defined(__aarch64__)
627 uint8x8_t d23;
628 int16x4_t s8, s9, s10, d1, d2, d3;
629 #endif
630 s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
631 src += src_stride;
632 s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
633 src += src_stride;
634 s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
635 src += src_stride;
636 s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
637 src += src_stride;
638 s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
639 src += src_stride;
640 s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
641 src += src_stride;
642 s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
643 src += src_stride;
644
645 do {
646 s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
647 src += src_stride;
648 #if defined(__aarch64__)
649 s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
650 src += src_stride;
651 s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
652 src += src_stride;
653 s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
654 src += src_stride;
655
656 __builtin_prefetch(dst + 0 * dst_stride);
657 __builtin_prefetch(dst + 1 * dst_stride);
658 __builtin_prefetch(dst + 2 * dst_stride);
659 __builtin_prefetch(dst + 3 * dst_stride);
660 __builtin_prefetch(src + 0 * src_stride);
661 __builtin_prefetch(src + 1 * src_stride);
662 __builtin_prefetch(src + 2 * src_stride);
663 __builtin_prefetch(src + 3 * src_stride);
664 d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
665 d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
666 d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
667 d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
668
669 d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
670 d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
671 if ((w == 4) && (h != 2)) {
672 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
673 0); // 00 01 02 03
674 dst += dst_stride;
675 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
676 1); // 10 11 12 13
677 dst += dst_stride;
678 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
679 0); // 20 21 22 23
680 dst += dst_stride;
681 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
682 1); // 30 31 32 33
683 dst += dst_stride;
684 } else if ((w == 4) && (h == 2)) {
685 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
686 0); // 00 01 02 03
687 dst += dst_stride;
688 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
689 1); // 10 11 12 13
690 dst += dst_stride;
691 } else if ((w == 2) && (h != 2)) {
692 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); // 00 01
693 dst += dst_stride;
694 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2); // 10 11
695 dst += dst_stride;
696 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0); // 20 21
697 dst += dst_stride;
698 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2); // 30 31
699 dst += dst_stride;
700 } else if ((w == 2) && (h == 2)) {
701 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); // 00 01
702 dst += dst_stride;
703 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2); // 10 11
704 dst += dst_stride;
705 }
706 s0 = s4;
707 s1 = s5;
708 s2 = s6;
709 s3 = s7;
710 s4 = s8;
711 s5 = s9;
712 s6 = s10;
713 h -= 4;
714 #else
715 __builtin_prefetch(dst + 0 * dst_stride);
716 __builtin_prefetch(src + 0 * src_stride);
717
718 d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
719
720 d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);
721
722 if (w == 4) {
723 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
724 dst += dst_stride;
725 } else if (w == 2) {
726 vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
727 dst += dst_stride;
728 }
729 s0 = s1;
730 s1 = s2;
731 s2 = s3;
732 s3 = s4;
733 s4 = s5;
734 s5 = s6;
735 s6 = s7;
736 h -= 1;
737 #endif
738 } while (h > 0);
739 } else {
740 int height;
741 const uint8_t *s;
742 uint8_t *d;
743 uint8x8_t t0;
744 int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
745 #if defined(__aarch64__)
746 uint8x8_t t1, t2, t3;
747 int16x8_t s8, s9, s10;
748 #endif
749 do {
750 __builtin_prefetch(src + 0 * src_stride);
751 __builtin_prefetch(src + 1 * src_stride);
752 __builtin_prefetch(src + 2 * src_stride);
753 __builtin_prefetch(src + 3 * src_stride);
754 __builtin_prefetch(src + 4 * src_stride);
755 __builtin_prefetch(src + 5 * src_stride);
756 __builtin_prefetch(src + 6 * src_stride);
757 s = src;
758 s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
759 s += src_stride;
760 s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
761 s += src_stride;
762 s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
763 s += src_stride;
764 s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
765 s += src_stride;
766 s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
767 s += src_stride;
768 s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
769 s += src_stride;
770 s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
771 s += src_stride;
772 d = dst;
773 height = h;
774
775 do {
776 s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
777 s += src_stride;
778 #if defined(__aarch64__)
779 s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
780 s += src_stride;
781 s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
782 s += src_stride;
783 s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
784 s += src_stride;
785
786 __builtin_prefetch(d + 0 * dst_stride);
787 __builtin_prefetch(d + 1 * dst_stride);
788 __builtin_prefetch(d + 2 * dst_stride);
789 __builtin_prefetch(d + 3 * dst_stride);
790 __builtin_prefetch(s + 0 * src_stride);
791 __builtin_prefetch(s + 1 * src_stride);
792 __builtin_prefetch(s + 2 * src_stride);
793 __builtin_prefetch(s + 3 * src_stride);
794 t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
795 t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
796 t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
797 t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
798 if (h != 2) {
799 vst1_u8(d, t0);
800 d += dst_stride;
801 vst1_u8(d, t1);
802 d += dst_stride;
803 vst1_u8(d, t2);
804 d += dst_stride;
805 vst1_u8(d, t3);
806 d += dst_stride;
807 } else {
808 vst1_u8(d, t0);
809 d += dst_stride;
810 vst1_u8(d, t1);
811 d += dst_stride;
812 }
813 s0 = s4;
814 s1 = s5;
815 s2 = s6;
816 s3 = s7;
817 s4 = s8;
818 s5 = s9;
819 s6 = s10;
820 height -= 4;
821 #else
822 __builtin_prefetch(d);
823 __builtin_prefetch(s);
824
825 t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
826
827 vst1_u8(d, t0);
828 d += dst_stride;
829
830 s0 = s1;
831 s1 = s2;
832 s2 = s3;
833 s3 = s4;
834 s4 = s5;
835 s5 = s6;
836 s6 = s7;
837 height -= 1;
838 #endif
839 } while (height > 0);
840 src += 8;
841 dst += 8;
842 w -= 8;
843 } while (w > 0);
844 }
845 }
846
847 void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
848 int dst_stride, int w, int h,
849 const InterpFilterParams *filter_params_x,
850 const InterpFilterParams *filter_params_y,
851 const int subpel_x_q4, const int subpel_y_q4,
852 ConvolveParams *conv_params) {
853 int im_dst_stride;
854 int width, height;
855 uint8x8_t t0;
856 #if defined(__aarch64__)
857 uint8x8_t t1, t2, t3, t4, t5, t6, t7;
858 #endif
859
860 DECLARE_ALIGNED(16, int16_t,
861 im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
862
863 const int bd = 8;
864 const int im_h = h + filter_params_y->taps - 1;
865 const int im_stride = MAX_SB_SIZE;
866 const int vert_offset = filter_params_y->taps / 2 - 1;
867 const int horiz_offset = filter_params_x->taps / 2 - 1;
868
869 const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
870 const uint8_t *s;
871 int16_t *dst_ptr;
872
873 dst_ptr = im_block;
874 im_dst_stride = im_stride;
875 height = im_h;
876 width = w;
877
878 const int16_t round_bits =
879 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
880 const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
881 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
882 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
883 filter_params_x, subpel_x_q4 & SUBPEL_MASK);
884
885 int16_t x_filter_tmp[8];
886 int16x8_t filter_x_coef = vld1q_s16(x_filter);
887
888 // The filter coefficients are all even, so halve them (shift right by 1)
889 // to reduce the intermediate precision requirements.
890 filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
891 vst1q_s16(&x_filter_tmp[0], filter_x_coef);
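  /* Why this halving is exact: every tap is even, so (s * f) rounded and
   * shifted right by round_0 equals (s * (f / 2)) rounded and shifted right
   * by (round_0 - 1). The horizontal helpers below are therefore invoked with
   * shifts and rounding constants based on (conv_params->round_0 - 1). */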
892
893 assert(conv_params->round_0 > 0);
894
895 if (w <= 4) {
896 int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
897 #if defined(__aarch64__)
898 int16x4_t s8, s9, s10, d1, d2, d3;
899 #endif
900
901 const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
902 const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
903
904 do {
905 s = src_ptr;
906
907 #if defined(__aarch64__)
908 __builtin_prefetch(s + 0 * src_stride);
909 __builtin_prefetch(s + 1 * src_stride);
910 __builtin_prefetch(s + 2 * src_stride);
911 __builtin_prefetch(s + 3 * src_stride);
912
913 load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
914 transpose_u8_8x4(&t0, &t1, &t2, &t3);
915
916 s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
917 s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
918 s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
919 s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
920 s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
921 s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
922 s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
923
924 __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
925 __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
926 __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
927 __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
928 s += 7;
929
930 load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
931 transpose_u8_8x4(&t0, &t1, &t2, &t3);
932
933 s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
934 s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
935 s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
936 s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
937
938 d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
939 horiz_const, shift_round_0);
940 d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
941 horiz_const, shift_round_0);
942 d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
943 horiz_const, shift_round_0);
944 d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
945 horiz_const, shift_round_0);
946
947 transpose_s16_4x4d(&d0, &d1, &d2, &d3);
948 if (w == 4) {
949 vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
950 vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
951 vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
952 vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
953 } else if (w == 2) {
954 vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
955 vreinterpret_u32_s16(d0), 0);
956 vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
957 vreinterpret_u32_s16(d1), 0);
958 vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
959 vreinterpret_u32_s16(d2), 0);
960 vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
961 vreinterpret_u32_s16(d3), 0);
962 }
963 src_ptr += 4 * src_stride;
964 dst_ptr += 4 * im_dst_stride;
965 height -= 4;
966 #else
967 int16x8_t tt0;
968
969 __builtin_prefetch(s);
970
971 t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
972 tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
973 s0 = vget_low_s16(tt0);
974 s4 = vget_high_s16(tt0);
975
976 __builtin_prefetch(dst_ptr);
977 s += 8;
978
979 t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
980 s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
981
982 s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
983 s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
984 s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
985 s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
986 s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
987 s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
988
989 d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
990 horiz_const, shift_round_0);
991
992 if (w == 4) {
993 vst1_s16(dst_ptr, d0);
994 dst_ptr += im_dst_stride;
995 } else if (w == 2) {
996 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
997 dst_ptr += im_dst_stride;
998 }
999
1000 src_ptr += src_stride;
1001 height -= 1;
1002 #endif
1003 } while (height > 0);
1004 } else {
1005 int16_t *d_tmp;
1006 int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0;
1007 #if defined(__aarch64__)
1008 int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7;
1009 int16x8_t s11, s12, s13, s14;
1010 #endif
1011
1012 const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
1013 const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
1014
1015 #if defined(__aarch64__)
1016 do {
1017 __builtin_prefetch(src_ptr + 0 * src_stride);
1018 __builtin_prefetch(src_ptr + 1 * src_stride);
1019 __builtin_prefetch(src_ptr + 2 * src_stride);
1020 __builtin_prefetch(src_ptr + 3 * src_stride);
1021 __builtin_prefetch(src_ptr + 4 * src_stride);
1022 __builtin_prefetch(src_ptr + 5 * src_stride);
1023 __builtin_prefetch(src_ptr + 6 * src_stride);
1024 __builtin_prefetch(src_ptr + 7 * src_stride);
1025
1026 load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1027
1028 transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1029
1030 s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
1031 s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
1032 s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
1033 s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
1034 s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
1035 s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
1036 s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
1037
1038 width = w;
1039 s = src_ptr + 7;
1040 d_tmp = dst_ptr;
1041
1042 __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
1043 __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
1044 __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
1045 __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
1046 __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
1047 __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
1048 __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
1049 __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
1050
1051 do {
1052 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1053
1054 transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
1055
1056 s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
1057 s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
1058 s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
1059 s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
1060 s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
1061 s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
1062 s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
1063 s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
1064
1065 res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
1066 horiz_const, shift_round_0);
1067 res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
1068 horiz_const, shift_round_0);
1069 res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
1070 horiz_const, shift_round_0);
1071 res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
1072 horiz_const, shift_round_0);
1073 res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
1074 horiz_const, shift_round_0);
1075 res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
1076 x_filter_tmp, horiz_const, shift_round_0);
1077 res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
1078 x_filter_tmp, horiz_const, shift_round_0);
1079 res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
1080 x_filter_tmp, horiz_const, shift_round_0);
1081
1082 transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
1083 &res7);
1084
1085 store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
1086 res6, res7);
1087
1088 s0 = s8;
1089 s1 = s9;
1090 s2 = s10;
1091 s3 = s11;
1092 s4 = s12;
1093 s5 = s13;
1094 s6 = s14;
1095 s += 8;
1096 d_tmp += 8;
1097 width -= 8;
1098 } while (width > 0);
1099 src_ptr += 8 * src_stride;
1100 dst_ptr += 8 * im_dst_stride;
1101 height -= 8;
1102 } while (height > 0);
1103 #else
1104 do {
1105 t0 = vld1_u8(src_ptr);
1106 s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
1107
1108 width = w;
1109 s = src_ptr + 8;
1110 d_tmp = dst_ptr;
1111
1112 __builtin_prefetch(dst_ptr);
1113
1114 do {
1115 t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
1116 s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
1117 int16x8_t sum = s0;
1118 s0 = s7;
1119
1120 s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
1121 s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
1122 s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
1123 s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
1124 s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
1125 s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
1126 s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
1127
1128 res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
1129 horiz_const, shift_round_0);
1130
1131 vst1q_s16(d_tmp, res0);
1132
1133 s += 8;
1134 d_tmp += 8;
1135 width -= 8;
1136 } while (width > 0);
1137 src_ptr += src_stride;
1138 dst_ptr += im_dst_stride;
1139 height -= 1;
1140 } while (height > 0);
1141 #endif
1142 }
1143
1144 // vertical
1145 {
1146 uint8_t *dst_u8_ptr, *d_u8;
1147 int16_t *v_src_ptr, *v_s;
1148
1149 const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
1150 (1 << (offset_bits - conv_params->round_1 - 1));
1151 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
1152 filter_params_y, subpel_y_q4 & SUBPEL_MASK);
1153
1154 const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
1155 const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
1156 const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
1157
1158 src_stride = im_stride;
1159 v_src_ptr = im_block;
1160 dst_u8_ptr = dst;
1161
1162 height = h;
1163 width = w;
1164
1165 if (width <= 4) {
1166 int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
1167 uint16x4_t d0;
1168 uint16x8_t dd0;
1169 uint8x8_t d01;
1170
1171 #if defined(__aarch64__)
1172 int16x4_t s8, s9, s10;
1173 uint16x4_t d1, d2, d3;
1174 uint16x8_t dd1;
1175 uint8x8_t d23;
1176 #endif
1177
1178 d_u8 = dst_u8_ptr;
1179 v_s = v_src_ptr;
1180
1181 __builtin_prefetch(v_s + 0 * im_stride);
1182 __builtin_prefetch(v_s + 1 * im_stride);
1183 __builtin_prefetch(v_s + 2 * im_stride);
1184 __builtin_prefetch(v_s + 3 * im_stride);
1185 __builtin_prefetch(v_s + 4 * im_stride);
1186 __builtin_prefetch(v_s + 5 * im_stride);
1187 __builtin_prefetch(v_s + 6 * im_stride);
1188 __builtin_prefetch(v_s + 7 * im_stride);
1189
1190 load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
1191 v_s += (7 * im_stride);
1192
1193 do {
1194 #if defined(__aarch64__)
1195 load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
1196 v_s += (im_stride << 2);
1197
1198 __builtin_prefetch(d_u8 + 0 * dst_stride);
1199 __builtin_prefetch(d_u8 + 1 * dst_stride);
1200 __builtin_prefetch(d_u8 + 2 * dst_stride);
1201 __builtin_prefetch(d_u8 + 3 * dst_stride);
1202
1203 d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
1204 round_shift_vec, offset_const,
1205 sub_const_vec);
1206 d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
1207 round_shift_vec, offset_const,
1208 sub_const_vec);
1209 d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
1210 round_shift_vec, offset_const,
1211 sub_const_vec);
1212 d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
1213 round_shift_vec, offset_const,
1214 sub_const_vec);
1215
1216 dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
1217 dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
1218
1219 d01 = vqmovn_u16(dd0);
1220 d23 = vqmovn_u16(dd1);
1221
1222 if ((w == 4) && (h != 2)) {
1223 vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1224 0); // 00 01 02 03
1225 d_u8 += dst_stride;
1226 vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1227 1); // 10 11 12 13
1228 d_u8 += dst_stride;
1229 vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
1230 0); // 20 21 22 23
1231 d_u8 += dst_stride;
1232 vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
1233 1); // 30 31 32 33
1234 d_u8 += dst_stride;
1235 } else if ((w == 2) && (h != 2)) {
1236 vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1237 0); // 00 01
1238 d_u8 += dst_stride;
1239 vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1240 2); // 10 11
1241 d_u8 += dst_stride;
1242 vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
1243 0); // 20 21
1244 d_u8 += dst_stride;
1245 vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
1246 2); // 30 31
1247 d_u8 += dst_stride;
1248 } else if ((w == 4) && (h == 2)) {
1249 vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1250 0); // 00 01 02 03
1251 d_u8 += dst_stride;
1252 vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1253 1); // 10 11 12 13
1254 d_u8 += dst_stride;
1255 } else if ((w == 2) && (h == 2)) {
1256 vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1257 0); // 00 01
1258 d_u8 += dst_stride;
1259 vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1260 2); // 10 11
1261 d_u8 += dst_stride;
1262 }
1263
1264 s0 = s4;
1265 s1 = s5;
1266 s2 = s6;
1267 s3 = s7;
1268 s4 = s8;
1269 s5 = s9;
1270 s6 = s10;
1271 height -= 4;
1272 #else
1273 s7 = vld1_s16(v_s);
1274 v_s += im_stride;
1275
1276 __builtin_prefetch(d_u8 + 0 * dst_stride);
1277
1278 d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
1279 round_shift_vec, offset_const,
1280 sub_const_vec);
1281
1282 dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
1283 d01 = vqmovn_u16(dd0);
1284
1285 if (w == 4) {
1286 vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
1287 0); // 00 01 02 03
1288 d_u8 += dst_stride;
1289
1290 } else if (w == 2) {
1291 vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
1292 0); // 00 01
1293 d_u8 += dst_stride;
1294 }
1295
1296 s0 = s1;
1297 s1 = s2;
1298 s2 = s3;
1299 s3 = s4;
1300 s4 = s5;
1301 s5 = s6;
1302 s6 = s7;
1303 height -= 1;
1304 #endif
1305 } while (height > 0);
1306 } else {
1307 // Width is a multiple of 8; rows are processed 4 at a time on AArch64, 1 at a time otherwise.
1308 int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
1309 uint8x8_t res0;
1310 #if defined(__aarch64__)
1311 int16x8_t s8, s9, s10;
1312 uint8x8_t res1, res2, res3;
1313 #endif
1314
1315 do {
1316 __builtin_prefetch(v_src_ptr + 0 * im_stride);
1317 __builtin_prefetch(v_src_ptr + 1 * im_stride);
1318 __builtin_prefetch(v_src_ptr + 2 * im_stride);
1319 __builtin_prefetch(v_src_ptr + 3 * im_stride);
1320 __builtin_prefetch(v_src_ptr + 4 * im_stride);
1321 __builtin_prefetch(v_src_ptr + 5 * im_stride);
1322 __builtin_prefetch(v_src_ptr + 6 * im_stride);
1323 __builtin_prefetch(v_src_ptr + 7 * im_stride);
1324
1325 v_s = v_src_ptr;
1326 load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
1327 v_s += (7 * im_stride);
1328
1329 d_u8 = dst_u8_ptr;
1330 height = h;
1331
1332 do {
1333 #if defined(__aarch64__)
1334 load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
1335 v_s += (im_stride << 2);
1336
1337 __builtin_prefetch(d_u8 + 4 * dst_stride);
1338 __builtin_prefetch(d_u8 + 5 * dst_stride);
1339 __builtin_prefetch(d_u8 + 6 * dst_stride);
1340 __builtin_prefetch(d_u8 + 7 * dst_stride);
1341
1342 res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
1343 y_filter, round_shift_vec, offset_const,
1344 sub_const_vec, vec_round_bits);
1345 res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
1346 y_filter, round_shift_vec, offset_const,
1347 sub_const_vec, vec_round_bits);
1348 res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
1349 y_filter, round_shift_vec, offset_const,
1350 sub_const_vec, vec_round_bits);
1351 res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
1352 y_filter, round_shift_vec, offset_const,
1353 sub_const_vec, vec_round_bits);
1354
1355 if (h != 2) {
1356 vst1_u8(d_u8, res0);
1357 d_u8 += dst_stride;
1358 vst1_u8(d_u8, res1);
1359 d_u8 += dst_stride;
1360 vst1_u8(d_u8, res2);
1361 d_u8 += dst_stride;
1362 vst1_u8(d_u8, res3);
1363 d_u8 += dst_stride;
1364 } else {
1365 vst1_u8(d_u8, res0);
1366 d_u8 += dst_stride;
1367 vst1_u8(d_u8, res1);
1368 d_u8 += dst_stride;
1369 }
1370 s0 = s4;
1371 s1 = s5;
1372 s2 = s6;
1373 s3 = s7;
1374 s4 = s8;
1375 s5 = s9;
1376 s6 = s10;
1377 height -= 4;
1378 #else
1379 s7 = vld1q_s16(v_s);
1380 v_s += im_stride;
1381
1382 __builtin_prefetch(d_u8 + 0 * dst_stride);
1383
1384 res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
1385 y_filter, round_shift_vec, offset_const,
1386 sub_const_vec, vec_round_bits);
1387
1388 vst1_u8(d_u8, res0);
1389 d_u8 += dst_stride;
1390
1391 s0 = s1;
1392 s1 = s2;
1393 s2 = s3;
1394 s3 = s4;
1395 s4 = s5;
1396 s5 = s6;
1397 s6 = s7;
1398 height -= 1;
1399 #endif
1400 } while (height > 0);
1401 v_src_ptr += 8;
1402 dst_u8_ptr += 8;
1403 w -= 8;
1404 } while (w > 0);
1405 }
1406 }
1407 }
1408 void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
1409 uint8_t *dst, int dst_stride, int w, int h,
1410 const InterpFilterParams *filter_params_x,
1411 const InterpFilterParams *filter_params_y,
1412 const int subpel_x_q4, const int subpel_y_q4,
1413 ConvolveParams *conv_params) {
1414 (void)filter_params_x;
1415 (void)filter_params_y;
1416 (void)subpel_x_q4;
1417 (void)subpel_y_q4;
1418 (void)conv_params;
1419
1420 const uint8_t *src1;
1421 uint8_t *dst1;
1422 int y;
1423
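  /* Convolve block widths in AV1 are powers of two (2 through 128), so each
   * branch below sees only its exact width: the 16-byte loop covers
   * 16/32/64/128, while the 8-, 4- and 2-wide branches copy a single vector
   * (or lane) per row. */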
1424 if (!(w & 0x0F)) {
1425 for (y = 0; y < h; ++y) {
1426 src1 = src;
1427 dst1 = dst;
1428 for (int x = 0; x < (w >> 4); ++x) {
1429 vst1q_u8(dst1, vld1q_u8(src1));
1430 src1 += 16;
1431 dst1 += 16;
1432 }
1433 src += src_stride;
1434 dst += dst_stride;
1435 }
1436 } else if (!(w & 0x07)) {
1437 for (y = 0; y < h; ++y) {
1438 vst1_u8(dst, vld1_u8(src));
1439 src += src_stride;
1440 dst += dst_stride;
1441 }
1442 } else if (!(w & 0x03)) {
1443 for (y = 0; y < h; ++y) {
1444 vst1_lane_u32((uint32_t *)(dst), vreinterpret_u32_u8(vld1_u8(src)), 0);
1445 src += src_stride;
1446 dst += dst_stride;
1447 }
1448 } else if (!(w & 0x01)) {
1449 for (y = 0; y < h; ++y) {
1450 vst1_lane_u16((uint16_t *)(dst), vreinterpret_u16_u8(vld1_u8(src)), 0);
1451 src += src_stride;
1452 dst += dst_stride;
1453 }
1454 }
1455 }
1456