/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"

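// Applies an 8-tap filter across eight 4-lane input vectors (one vector per
// tap) and returns the un-rounded sum. The two centre taps (s3, s4) are
// typically the largest coefficients, so they are accumulated last with
// saturating adds, presumably to guard against intermediate int16 overflow;
// the caller narrows the result with vqrshrun_n_s16(..., FILTER_BITS).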
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                    const int16x4_t s2, const int16x4_t s3,
                                    const int16x4_t s4, const int16x4_t s5,
                                    const int16x4_t s6, const int16x4_t s7,
                                    const int16x8_t filter) {
  const int16x4_t filter_lo = vget_low_s16(filter);
  const int16x4_t filter_hi = vget_high_s16(filter);
  int16x4_t sum;

  sum = vmul_lane_s16(s0, filter_lo, 0);
  sum = vmla_lane_s16(sum, s1, filter_lo, 1);
  sum = vmla_lane_s16(sum, s2, filter_lo, 2);
  sum = vmla_lane_s16(sum, s5, filter_hi, 1);
  sum = vmla_lane_s16(sum, s6, filter_hi, 2);
  sum = vmla_lane_s16(sum, s7, filter_hi, 3);
  sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
  sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
  return sum;
}

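// 8-lane variant of convolve8_4 that also narrows the accumulator to
// unsigned 8-bit with rounding and saturation (vqrshrun_n_s16).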
static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
                                    const int16x8_t s2, const int16x8_t s3,
                                    const int16x8_t s4, const int16x8_t s5,
                                    const int16x8_t s6, const int16x8_t s7,
                                    const int16x8_t filter) {
  const int16x4_t filter_lo = vget_low_s16(filter);
  const int16x4_t filter_hi = vget_high_s16(filter);
  int16x8_t sum;

  sum = vmulq_lane_s16(s0, filter_lo, 0);
  sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
  sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
  sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
  sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
  sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
  sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
  sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
  return vqrshrun_n_s16(sum, FILTER_BITS);
}

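// Horizontal 8-tap convolution for a fixed (non-scaling) filter phase:
// x_step_q4 and the vertical filter arguments are ignored. Rows are loaded
// in blocks and transposed so that the horizontal filter can be applied with
// the same per-lane multiply-accumulate pattern as a vertical pass, then the
// results are transposed back before being stored.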
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
  const int16x8_t filter = vld1q_s16(filter_x);

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)x_step_q4;
  (void)filter_y;
  (void)y_step_q4;

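  // Step back (SUBPEL_TAPS / 2 - 1) = 3 pixels so that the 8-tap window is
  // centred on the output position.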
  src -= ((SUBPEL_TAPS / 2) - 1);

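  // For 4-row blocks, work on 8x4 tiles: transpose so each vector holds one
  // source column, filter four output columns at a time, then transpose the
  // 4x4 result back to row order before storing.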
  if (h == 4) {
    uint8x8_t t0, t1, t2, t3, d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

    src += 7;

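    // Each iteration loads and transposes the next four source columns
    // (s7..s10), produces a 4x4 block of outputs, and then slides the
    // seven-column context window along by four.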
    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      transpose_elems_inplace_u8_4x4(&d01, &d23);

      store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01);
      store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w != 0);
  } else {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, d0, d1, d2, d3;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

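    // Narrow (4-wide) blocks: process eight rows per iteration, using 8x8
    // and 4x8 transposes so each vector spans eight rows of one column.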
    if (w == 4) {
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
                               &t3);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);

        transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3);

        store_u8x4_strided_x2(dst + 0 * dst_stride, 4 * dst_stride, d0);
        store_u8x4_strided_x2(dst + 1 * dst_stride, 4 * dst_stride, d1);
        store_u8x4_strided_x2(dst + 2 * dst_stride, 4 * dst_stride, d2);
        store_u8x4_strided_x2(dst + 3 * dst_stride, 4 * dst_stride, d3);

        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      uint8x8_t d4, d5, d6, d7;
      int16x8_t s11, s12, s13, s14;
      int width;
      const uint8_t *s;
      uint8_t *d;

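      // General case: the outer loop walks down the image eight rows at a
      // time, and the inner loop filters 8x8 tiles across the row, carrying
      // seven columns of context (s0..s6) between tiles.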
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6,
                                         &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);

          transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6,
                                         &d7);

          store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width != 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}

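// Vertical 8-tap convolution for a fixed (non-scaling) filter phase:
// y_step_q4 and the horizontal filter arguments are ignored. No transposes
// are needed here; the filter is applied directly down each column, reusing
// seven rows of context between iterations.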
void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  const int16x8_t filter = vld1q_s16(filter_y);

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);

  (void)filter_x;
  (void)x_step_q4;
  (void)y_step_q4;

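  // Start (SUBPEL_TAPS / 2 - 1) = 3 rows above the destination row so the
  // 8-tap window is centred vertically.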
  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;

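  // 4-wide blocks: filter four rows per iteration, keeping a sliding window
  // of the last seven source rows in s0..s6.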
  if (w == 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
    s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
    s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));

    src += 7 * src_stride;

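    // Each iteration loads four new source rows (s7..s10), produces four
    // output rows, then shifts the seven-row context window down by four.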
    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    int height;
    const uint8_t *s;
    uint8_t *d;

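    // Wider blocks: the outer loop steps across the image eight columns at a
    // time, and the inner loop filters four rows of that 8-wide column strip
    // per iteration.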
    do {
      load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      height = h;
      s = src + 7 * src_stride;
      d = dst;

      do {
        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}