• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "aom_dsp/arm/sum_neon.h"
16 
// Upper bound on the `sz` argument of av1_highbd_upsample_intra_edge_neon();
// that function's scratch buffer holds MAX_UPSAMPLE_SZ + 3 samples.
#define MAX_UPSAMPLE_SZ 16
18 
// Computes eight outputs of the 3-tap {4, 8, 4} / 16 smoothing kernel.
// Output lane i is built from src[i], src[i + 1] and src[i + 2], so ten
// elements starting at src are read.
static inline uint16x8_t highbd_edge_kernel_4_8_4(const uint16_t *src) {
  const uint16x8_t s0 = vld1q_u16(src);
  const uint16x8_t s1 = vld1q_u16(src + 1);
  const uint16x8_t s2 = vld1q_u16(src + 2);

  // Make use of the identity:
  // (4*a + 8*b + 4*c + 8) >> 4 == (a + (b << 1) + c + 2) >> 2
  const uint16x8_t outer = vaddq_u16(s0, s2);
  const uint16x8_t centre = vaddq_u16(s1, s1);
  return vrshrq_n_u16(vaddq_u16(outer, centre), 2);
}

// Computes eight outputs of the 3-tap {5, 6, 5} / 16 smoothing kernel.
// Output lane i is built from src[i], src[i + 1] and src[i + 2].
// The weighted sum is accumulated in 16 bits; 16 * 4095 == 65520 still fits,
// so this is exact for samples of up to 12 bits (assumed to be the largest
// bit depth passed in -- TODO confirm against callers).
static inline uint16x8_t highbd_edge_kernel_5_6_5(const uint16_t *src) {
  const uint16x8_t s0 = vld1q_u16(src);
  const uint16x8_t s1 = vld1q_u16(src + 1);
  const uint16x8_t s2 = vld1q_u16(src + 2);

  uint16x8_t acc = vmulq_n_u16(s0, 5);
  acc = vmlaq_n_u16(acc, s1, 6);
  acc = vmlaq_n_u16(acc, s2, 5);
  return vrshrq_n_u16(acc, 4);
}

// Computes eight outputs of the 5-tap {2, 4, 4, 4, 2} / 16 smoothing kernel.
// Output lane i is built from src[i] .. src[i + 4], so twelve elements
// starting at src are read.
static inline uint16x8_t highbd_edge_kernel_2_4_4_4_2(const uint16_t *src) {
  const uint16x8_t s0 = vld1q_u16(src);
  const uint16x8_t s1 = vld1q_u16(src + 1);
  const uint16x8_t s2 = vld1q_u16(src + 2);
  const uint16x8_t s3 = vld1q_u16(src + 3);
  const uint16x8_t s4 = vld1q_u16(src + 4);

  // Make use of the identity:
  // (2*a + 4*b + 4*c + 4*d + 2*e + 8) >> 4
  //     == (a + ((b + c + d) << 1) + e + 4) >> 3
  const uint16x8_t outer = vaddq_u16(s0, s4);
  uint16x8_t inner = vaddq_u16(vaddq_u16(s1, s2), s3);
  inner = vaddq_u16(inner, inner);
  return vrshrq_n_u16(vaddq_u16(outer, inner), 3);
}

// Replaces the first `count` (1..7) elements at dst with the matching lanes
// of `res` and leaves the remaining lanes at their current values.
// The update is a full 8-lane load/blend/store, so dst[0..7] must all be
// addressable even though only the first `count` values change.
static inline void highbd_edge_store_partial(uint16_t *dst, uint16x8_t res,
                                             int count) {
  static const uint16_t lane_idx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  // Lane i is selected from `res` when i < count.
  const uint16x8_t keep =
      vcgtq_u16(vdupq_n_u16((uint16_t)count), vld1q_u16(lane_idx));
  vst1q_u16(dst, vbslq_u16(keep, res, vld1q_u16(dst)));
}

// Smooths the intra prediction edge p[0..sz-1] in place.
//
//   p         Edge samples. p[0] is never modified; p[1..sz-1] are replaced
//             by their filtered values. Taps that fall outside the edge use
//             the nearest end sample (p[0] or p[sz - 1]).
//   sz        Number of samples in the edge, 0..129 (asserted).
//   strength  0: no filtering, the function returns immediately.
//             1: {4, 8, 4} / 16 kernel.
//             2: {5, 6, 5} / 16 kernel.
//             any other value: {2, 4, 4, 4, 2} / 16 kernel.
//
// Results are rounded to nearest ((sum + 8) >> 4).
//
// When (sz - 1) is not a multiple of 8 the final partial vector is written
// with a read-modify-write of eight elements, so up to seven elements past
// p[sz - 1] are loaded and stored back unchanged; they must be addressable.
void av1_highbd_filter_intra_edge_neon(uint16_t *p, int sz, int strength) {
  if (!strength) return;
  assert(sz >= 0 && sz <= 129);

  // Only p[1..sz-1] are ever written, so an edge shorter than two samples
  // has nothing to filter. Returning here also avoids building the padded
  // copy from an uninitialised sample when sz == 0.
  if (sz < 2) return;

  // Padded working copy: edge[1 + i] == p[i], with one replicated sample in
  // front and two behind so every tap of the 5-tap kernel has a source.
  // The array is larger than the maximum sz + 3 so that whole-vector loads
  // near the end stay inside it; lanes loaded from the uninitialised slack
  // are discarded by the masked store.
  uint16_t edge[160];
  memcpy(edge + 1, p, sz * sizeof(*p));
  edge[0] = edge[1];
  edge[sz + 1] = edge[sz];
  edge[sz + 2] = edge[sz];

  // Don't overwrite first pixel.
  uint16_t *dst = p + 1;
  int remaining = sz - 1;

  if (strength == 1) {
    // 3-tap window for p[1] starts at p[0], i.e. edge[1].
    const uint16_t *src = edge + 1;
    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
      vst1q_u16(dst, highbd_edge_kernel_4_8_4(src));
    }
    if (remaining > 0) {
      highbd_edge_store_partial(dst, highbd_edge_kernel_4_8_4(src), remaining);
    }
  } else if (strength == 2) {
    const uint16_t *src = edge + 1;
    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
      vst1q_u16(dst, highbd_edge_kernel_5_6_5(src));
    }
    if (remaining > 0) {
      highbd_edge_store_partial(dst, highbd_edge_kernel_5_6_5(src), remaining);
    }
  } else {
    // 5-tap window for p[1] starts one sample before p[0], i.e. edge[0].
    const uint16_t *src = edge;
    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
      vst1q_u16(dst, highbd_edge_kernel_2_4_4_4_2(src));
    }
    if (remaining > 0) {
      highbd_edge_store_partial(dst, highbd_edge_kernel_2_4_4_4_2(src),
                                remaining);
    }
  }
}
170 
// One pass of the 12-bit path: interpolates eight half-sample values from
// in[0..10] with the {-1, 9, 9, -1} / 16 kernel and stores them interleaved
// with in[2..9] as sixteen elements at out.
// 9 * (b + c) can exceed 16 bits at this depth, so the products are widened
// to 32 bits before the subtraction.
static inline void highbd_upsample_pass_wide(const uint16_t *in, uint16_t *out,
                                             uint16x8_t max_pixel) {
  const uint16x8_t a = vld1q_u16(in);
  const uint16x8_t b = vld1q_u16(in + 1);
  const uint16x8_t c = vld1q_u16(in + 2);
  const uint16x8_t d = vld1q_u16(in + 3);

  const uint16x8_t inner = vaddq_u16(b, c);
  const uint16x8_t outer = vaddq_u16(a, d);

  // Saturating subtraction clamps negative results to zero.
  const uint32x4_t lo = vqsubq_u32(vmull_n_u16(vget_low_u16(inner), 9),
                                   vmovl_u16(vget_low_u16(outer)));
  const uint32x4_t hi = vqsubq_u32(vmull_n_u16(vget_high_u16(inner), 9),
                                   vmovl_u16(vget_high_u16(outer)));

  uint16x8x2_t pair;
  // Round, narrow back to 16 bits, then clamp at the bit-depth maximum.
  pair.val[0] = vminq_u16(
      vcombine_u16(vrshrn_n_u32(lo, 4), vrshrn_n_u32(hi, 4)), max_pixel);
  pair.val[1] = c;

  // Interleaved store: out[2i] = interpolated, out[2i + 1] = original.
  vst2q_u16(out, pair);
}

// One pass of the 8/10-bit path: same result layout as the wide variant,
// but the whole computation stays in 16-bit lanes.
static inline void highbd_upsample_pass_narrow(const uint16_t *in,
                                               uint16_t *out,
                                               uint16x8_t max_pixel) {
  const uint16x8_t a = vld1q_u16(in);
  const uint16x8_t b = vld1q_u16(in + 1);
  const uint16x8_t c = vld1q_u16(in + 2);
  const uint16x8_t d = vld1q_u16(in + 3);

  const uint16x8_t outer = vaddq_u16(a, d);
  const uint16x8_t inner9 = vmulq_n_u16(vaddq_u16(b, c), 9);
  // Saturating subtraction clamps negative results to zero.
  const uint16x8_t filtered = vqsubq_u16(inner9, outer);

  uint16x8x2_t pair;
  // Round, then clamp at the bit-depth maximum.
  pair.val[0] = vminq_u16(vrshrq_n_u16(filtered, 4), max_pixel);
  pair.val[1] = c;

  // Interleaved store: out[2i] = interpolated, out[2i + 1] = original.
  vst2q_u16(out, pair);
}

// Doubles the resolution of an intra prediction edge in place.
//
//   p   Edge samples; p[-1] is read and p[-2] onwards is written.
//   sz  Number of input samples at p[0..sz-1]; at most MAX_UPSAMPLE_SZ
//       (asserted). Nothing is done when sz is 0.
//   bd  Bit depth. 12 selects the 32-bit accumulation path, every other
//       value the 16-bit path; results are clamped to (1 << bd) - 1.
//
// On return p[-2] holds the old p[-1], and for each i in [0, sz):
//   p[2*i - 1] = interpolated half-sample value, using the {-1, 9, 9, -1} / 16
//                kernel over the edge extended by p[-1] on the left and
//                p[sz - 1] on the right
//   p[2*i]     = the original p[i]
//
// NOTE: every pass stores sixteen elements, so when sz is not a multiple of
// eight, elements beyond p[2*sz - 2] are also overwritten, with values
// derived from uninitialised scratch space. Presumably callers provide
// enough slack after the edge -- confirm.
void av1_highbd_upsample_intra_edge_neon(uint16_t *p, int sz, int bd) {
  if (!sz) return;

  assert(sz <= MAX_UPSAMPLE_SZ);

  // Scratch copy of p[-1..sz-1] with the first and last samples repeated so
  // the 4-tap kernel always has valid neighbours.
  uint16_t edge[MAX_UPSAMPLE_SZ + 3];
  const uint16_t corner = p[-1];
  edge[0] = corner;
  edge[1] = corner;
  memcpy(edge + 2, p, sz * sizeof(*p));
  edge[sz + 2] = p[sz - 1];
  p[-2] = corner;

  const uint16x8_t max_pixel = vdupq_n_u16((uint16_t)((1 << bd) - 1));

  const uint16_t *in = edge;
  uint16_t *out = p - 1;
  int remaining = sz;

  if (bd == 12) {
    do {
      highbd_upsample_pass_wide(in, out, max_pixel);
      in += 8;
      out += 16;
      remaining -= 8;
    } while (remaining > 0);
  } else {  // Bit depth is 8 or 10.
    do {
      highbd_upsample_pass_narrow(in, out, max_pixel);
      in += 8;
      out += 16;
      remaining -= 8;
    } while (remaining > 0);
  }
}
242