/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"

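// 8-tap filter for four output pixels: s0..s7 each hold four input samples,
// one per output lane. The two centre taps (indices 3 and 4) carry the
// largest filter weights, so their products are accumulated last with
// saturating adds to guard against intermediate 16-bit overflow.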
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                    const int16x4_t s2, const int16x4_t s3,
                                    const int16x4_t s4, const int16x4_t s5,
                                    const int16x4_t s6, const int16x4_t s7,
                                    const int16x8_t filter) {
  const int16x4_t filter_lo = vget_low_s16(filter);
  const int16x4_t filter_hi = vget_high_s16(filter);
  int16x4_t sum;

  sum = vmul_lane_s16(s0, filter_lo, 0);
  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
  sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
  sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
  return sum;
}

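// As convolve8_4, but for eight output pixels per call. The saturated sum is
// narrowed to u8 with a saturating, rounding right shift by FILTER_BITS.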
static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
                                    const int16x8_t s2, const int16x8_t s3,
                                    const int16x8_t s4, const int16x8_t s5,
                                    const int16x8_t s6, const int16x8_t s7,
                                    const int16x8_t filter) {
  const int16x4_t filter_lo = vget_low_s16(filter);
  const int16x4_t filter_hi = vget_high_s16(filter);
  int16x8_t sum;

  sum = vmulq_lane_s16(s0, filter_lo, 0);
  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
  sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
  sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
  return vqrshrun_n_s16(sum, FILTER_BITS);
}

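// Horizontal 8-tap convolution. The step parameters and the vertical filter
// are unused here, so this path effectively assumes the unscaled case.
// Source tiles are transposed so the horizontal filter can be applied across
// vector registers with the same helpers a vertical filter would use, and
// the results are transposed back before storing.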
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
  const int16x8_t filter = vld1q_s16(filter_x);

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)x_step_q4;
  (void)filter_y;
  (void)y_step_q4;

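  // Centre the filter: back the source pointer up by (SUBPEL_TAPS / 2 - 1)
  // pixels so each output sees the taps to its left.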
  src -= ((SUBPEL_TAPS / 2) - 1);

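  // Short blocks (h == 4): step along the row four output columns at a time,
  // transposing 8x4 source tiles in and 4x4 result tiles back out.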
  if (h == 4) {
    uint8x8_t t0, t1, t2, t3, d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

    src += 7;

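    // Filter four output columns per iteration, carrying the last seven
    // columns of source context forward between iterations.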
    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      transpose_elems_inplace_u8_4x4(&d01, &d23);

      store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01);
      store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w != 0);
  } else {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, d0, d1, d2, d3;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

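    // Narrow blocks (w == 4): filter eight rows at a time, transposing the
    // source so the filter runs along the rows of the transposed data.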
    if (w == 4) {
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
                               &t3);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);

        transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3);

        store_u8x4_strided_x2(dst + 0 * dst_stride, 4 * dst_stride, d0);
        store_u8x4_strided_x2(dst + 1 * dst_stride, 4 * dst_stride, d1);
        store_u8x4_strided_x2(dst + 2 * dst_stride, 4 * dst_stride, d2);
        store_u8x4_strided_x2(dst + 3 * dst_stride, 4 * dst_stride, d3);

        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
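      // General case: walk the image in 8x8 tiles, filtering eight rows of
      // eight output pixels per inner iteration.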
      uint8x8_t d4, d5, d6, d7;
      int16x8_t s11, s12, s13, s14;
      int width;
      const uint8_t *s;
      uint8_t *d;

      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6,
                                         &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);

          transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6,
                                         &d7);

          store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width != 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}

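// Vertical 8-tap convolution. The horizontal filter and the step parameters
// are unused here, so as with the horizontal path only the unscaled case is
// handled. No transposes are needed because the filter already runs down
// successive rows.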
void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  const int16x8_t filter = vld1q_s16(filter_y);

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)filter_x;
  (void)x_step_q4;
  (void)y_step_q4;

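  // Back the source up by (SUBPEL_TAPS / 2 - 1) rows so the first output row
  // sees the taps above it.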
  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;

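  // w == 4: filter four rows of four pixels per iteration, carrying seven
  // rows of source context forward between iterations.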
  if (w == 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
    s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
    s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));

    src += 7 * src_stride;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    int height;
    const uint8_t *s;
    uint8_t *d;

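    // General case: process eight-pixel-wide columns, four rows at a time,
    // reloading the seven rows of context at the top of each column pass.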
    do {
      load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      height = h;
      s = src + 7 * src_stride;
      d = dst;

      do {
        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}