/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/transpose_neon.h"

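// Scale the 8-bit filter thresholds up to the working bit depth: widen each
// scalar to 16 bits and shift left by bd - 8 (a no-op for bd == 8).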
static INLINE void load_thresh(const uint8_t *blimit, const uint8_t *limit,
                               const uint8_t *thresh, uint16x8_t *blimit_vec,
                               uint16x8_t *limit_vec, uint16x8_t *thresh_vec,
                               const int bd) {
  const int16x8_t shift = vdupq_n_s16(bd - 8);
  *blimit_vec = vmovl_u8(vld1_dup_u8(blimit));
  *limit_vec = vmovl_u8(vld1_dup_u8(limit));
  *thresh_vec = vmovl_u8(vld1_dup_u8(thresh));
  *blimit_vec = vshlq_u16(*blimit_vec, shift);
  *limit_vec = vshlq_u16(*limit_vec, shift);
  *thresh_vec = vshlq_u16(*thresh_vec, shift);
}

// Here flat is 128-bit long, with each 16-bit chunk being a mask of
// a pixel. When used to control filter branches, we only detect whether it is
// all 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status.
// flat equals 0 if and only if flat_status equals 0.
// flat equals -1 (all 1s) if and only if flat_status equals -4. (This is true
// because each mask occupies more than 1 bit.)
static INLINE uint32_t calc_flat_status(const uint16x8_t flat) {
  const uint64x1_t t0 = vadd_u64(vreinterpret_u64_u16(vget_low_u16(flat)),
                                 vreinterpret_u64_u16(vget_high_u16(flat)));
  const uint64x1_t t1 = vpaddl_u32(vreinterpret_u32_u64(t0));
  return vget_lane_u32(vreinterpret_u32_u64(t1), 0);
}

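// Compute the high edge variance mask (*hev) and the base filter mask
// (*mask) for an 8-pixel section. hev is set where |p1 - p0| or |q1 - q0|
// exceeds thresh. mask is set where every adjacent-pixel difference is
// within limit and |p0 - q0| * 2 + |p1 - q1| / 2 is within blimit.
// Returns max(|p1 - p0|, |q1 - q0|) so the caller can reuse it when
// computing the flat mask.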
static INLINE uint16x8_t
filter_hev_mask4(const uint16x8_t limit, const uint16x8_t blimit,
                 const uint16x8_t thresh, const uint16x8_t p3,
                 const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0,
                 const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2,
                 const uint16x8_t q3, uint16x8_t *hev, uint16x8_t *mask) {
  uint16x8_t max, t0, t1;

  max = vabdq_u16(p1, p0);
  max = vmaxq_u16(max, vabdq_u16(q1, q0));
  *hev = vcgtq_u16(max, thresh);
  *mask = vmaxq_u16(max, vabdq_u16(p3, p2));
  *mask = vmaxq_u16(*mask, vabdq_u16(p2, p1));
  *mask = vmaxq_u16(*mask, vabdq_u16(q2, q1));
  *mask = vmaxq_u16(*mask, vabdq_u16(q3, q2));
  t0 = vabdq_u16(p0, q0);
  t1 = vabdq_u16(p1, q1);
  t0 = vaddq_u16(t0, t0);
  t1 = vshrq_n_u16(t1, 1);
  t0 = vaddq_u16(t0, t1);
  *mask = vcleq_u16(*mask, limit);
  t0 = vcleq_u16(t0, blimit);
  *mask = vandq_u16(*mask, t0);

  return max;
}

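// As filter_hev_mask4(), but additionally computes the flat mask (*flat is
// set where p3..q3 all lie within 1 << (bd - 8) of p0/q0) and reduces it to
// *flat_status for scalar branching.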
static INLINE uint16x8_t filter_flat_hev_mask(
    const uint16x8_t limit, const uint16x8_t blimit, const uint16x8_t thresh,
    const uint16x8_t p3, const uint16x8_t p2, const uint16x8_t p1,
    const uint16x8_t p0, const uint16x8_t q0, const uint16x8_t q1,
    const uint16x8_t q2, const uint16x8_t q3, uint16x8_t *flat,
    uint32_t *flat_status, uint16x8_t *hev, const int bd) {
  uint16x8_t mask;
  const uint16x8_t max = filter_hev_mask4(limit, blimit, thresh, p3, p2, p1, p0,
                                          q0, q1, q2, q3, hev, &mask);
  *flat = vmaxq_u16(max, vabdq_u16(p2, p0));
  *flat = vmaxq_u16(*flat, vabdq_u16(q2, q0));
  *flat = vmaxq_u16(*flat, vabdq_u16(p3, p0));
  *flat = vmaxq_u16(*flat, vabdq_u16(q3, q0));
  *flat = vcleq_u16(*flat, vdupq_n_u16(1 << (bd - 8))); /* flat_mask4() */
  *flat = vandq_u16(*flat, mask);
  *flat_status = calc_flat_status(*flat);

  return mask;
}

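// Wide flat mask: set where all ten input pixels lie within 1 << (bd - 8) of
// p0/q0, ANDed with the narrower flat mask, and reduced to *flat2_status.
// Note the callers pass the outer pixels p7..p4 and q4..q7 in the p4..p1 and
// q1..q4 argument slots.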
static INLINE uint16x8_t flat_mask5(const uint16x8_t p4, const uint16x8_t p3,
                                    const uint16x8_t p2, const uint16x8_t p1,
                                    const uint16x8_t p0, const uint16x8_t q0,
                                    const uint16x8_t q1, const uint16x8_t q2,
                                    const uint16x8_t q3, const uint16x8_t q4,
                                    const uint16x8_t flat,
                                    uint32_t *flat2_status, const int bd) {
  uint16x8_t flat2 = vabdq_u16(p4, p0);
  flat2 = vmaxq_u16(flat2, vabdq_u16(p3, p0));
  flat2 = vmaxq_u16(flat2, vabdq_u16(p2, p0));
  flat2 = vmaxq_u16(flat2, vabdq_u16(p1, p0));
  flat2 = vmaxq_u16(flat2, vabdq_u16(q1, q0));
  flat2 = vmaxq_u16(flat2, vabdq_u16(q2, q0));
  flat2 = vmaxq_u16(flat2, vabdq_u16(q3, q0));
  flat2 = vmaxq_u16(flat2, vabdq_u16(q4, q0));
  flat2 = vcleq_u16(flat2, vdupq_n_u16(1 << (bd - 8)));
  flat2 = vandq_u16(flat2, flat);
  *flat2_status = calc_flat_status(flat2);

  return flat2;
}

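// Map pixels to a signed representation (and back) by subtracting the
// mid-point 0x80 << (bd - 8), so filter4() can use signed arithmetic.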
static INLINE int16x8_t flip_sign(const uint16x8_t v, const int bd) {
  const uint16x8_t offset = vdupq_n_u16(0x80 << (bd - 8));
  return vreinterpretq_s16_u16(vsubq_u16(v, offset));
}

static INLINE uint16x8_t flip_sign_back(const int16x8_t v, const int bd) {
  const int16x8_t offset = vdupq_n_s16(0x80 << (bd - 8));
  return vreinterpretq_u16_s16(vaddq_s16(v, offset));
}

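// Running-sum update for the moving-average filters below: drop the two
// oldest taps and add the two newest. The 7-tap kernel follows the update
// with a rounding shift right by 3; the 15-tap kernel shifts by 4 and
// selects the filtered value only where the flat mask is set.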
static INLINE void filter_update(const uint16x8_t sub0, const uint16x8_t sub1,
                                 const uint16x8_t add0, const uint16x8_t add1,
                                 uint16x8_t *sum) {
  *sum = vsubq_u16(*sum, sub0);
  *sum = vsubq_u16(*sum, sub1);
  *sum = vaddq_u16(*sum, add0);
  *sum = vaddq_u16(*sum, add1);
}

static INLINE uint16x8_t calc_7_tap_filter_kernel(const uint16x8_t sub0,
                                                  const uint16x8_t sub1,
                                                  const uint16x8_t add0,
                                                  const uint16x8_t add1,
                                                  uint16x8_t *sum) {
  filter_update(sub0, sub1, add0, add1, sum);
  return vrshrq_n_u16(*sum, 3);
}

static INLINE uint16x8_t apply_15_tap_filter_kernel(
    const uint16x8_t flat, const uint16x8_t sub0, const uint16x8_t sub1,
    const uint16x8_t add0, const uint16x8_t add1, const uint16x8_t in,
    uint16x8_t *sum) {
  filter_update(sub0, sub1, add0, add1, sum);
  return vbslq_u16(flat, vrshrq_n_u16(*sum, 4), in);
}

// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
static INLINE void calc_7_tap_filter(const uint16x8_t p3, const uint16x8_t p2,
                                     const uint16x8_t p1, const uint16x8_t p0,
                                     const uint16x8_t q0, const uint16x8_t q1,
                                     const uint16x8_t q2, const uint16x8_t q3,
                                     uint16x8_t *op2, uint16x8_t *op1,
                                     uint16x8_t *op0, uint16x8_t *oq0,
                                     uint16x8_t *oq1, uint16x8_t *oq2) {
  uint16x8_t sum;
  sum = vaddq_u16(p3, p3);  // 2*p3
  sum = vaddq_u16(sum, p3); // 3*p3
  sum = vaddq_u16(sum, p2); // 3*p3+p2
  sum = vaddq_u16(sum, p2); // 3*p3+2*p2
  sum = vaddq_u16(sum, p1); // 3*p3+2*p2+p1
  sum = vaddq_u16(sum, p0); // 3*p3+2*p2+p1+p0
  sum = vaddq_u16(sum, q0); // 3*p3+2*p2+p1+p0+q0
  *op2 = vrshrq_n_u16(sum, 3);
  *op1 = calc_7_tap_filter_kernel(p3, p2, p1, q1, &sum);
  *op0 = calc_7_tap_filter_kernel(p3, p1, p0, q2, &sum);
  *oq0 = calc_7_tap_filter_kernel(p3, p0, q0, q3, &sum);
  *oq1 = calc_7_tap_filter_kernel(p2, q0, q1, q3, &sum);
  *oq2 = calc_7_tap_filter_kernel(p1, q1, q2, q3, &sum);
}

static INLINE void apply_7_tap_filter(const uint16x8_t flat,
                                      const uint16x8_t p3, const uint16x8_t p2,
                                      const uint16x8_t p1, const uint16x8_t p0,
                                      const uint16x8_t q0, const uint16x8_t q1,
                                      const uint16x8_t q2, const uint16x8_t q3,
                                      uint16x8_t *op2, uint16x8_t *op1,
                                      uint16x8_t *op0, uint16x8_t *oq0,
                                      uint16x8_t *oq1, uint16x8_t *oq2) {
  uint16x8_t tp1, tp0, tq0, tq1;
  calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, &tq0, &tq1,
                    oq2);
  *op2 = vbslq_u16(flat, *op2, p2);
  *op1 = vbslq_u16(flat, tp1, *op1);
  *op0 = vbslq_u16(flat, tp0, *op0);
  *oq0 = vbslq_u16(flat, tq0, *oq0);
  *oq1 = vbslq_u16(flat, tq1, *oq1);
  *oq2 = vbslq_u16(flat, *oq2, q2);
}

// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
static INLINE void apply_15_tap_filter(
    const uint16x8_t flat2, const uint16x8_t p7, const uint16x8_t p6,
    const uint16x8_t p5, const uint16x8_t p4, const uint16x8_t p3,
    const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0,
    const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2,
    const uint16x8_t q3, const uint16x8_t q4, const uint16x8_t q5,
    const uint16x8_t q6, const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5,
    uint16x8_t *op4, uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1,
    uint16x8_t *op0, uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2,
    uint16x8_t *oq3, uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6) {
  uint16x8_t sum;
  sum = vshlq_n_u16(p7, 3);  // 8*p7
  sum = vsubq_u16(sum, p7);  // 7*p7
  sum = vaddq_u16(sum, p6);  // 7*p7+p6
  sum = vaddq_u16(sum, p6);  // 7*p7+2*p6
  sum = vaddq_u16(sum, p5);  // 7*p7+2*p6+p5
  sum = vaddq_u16(sum, p4);  // 7*p7+2*p6+p5+p4
  sum = vaddq_u16(sum, p3);  // 7*p7+2*p6+p5+p4+p3
  sum = vaddq_u16(sum, p2);  // 7*p7+2*p6+p5+p4+p3+p2
  sum = vaddq_u16(sum, p1);  // 7*p7+2*p6+p5+p4+p3+p2+p1
  sum = vaddq_u16(sum, p0);  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
  sum = vaddq_u16(sum, q0);  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
  *op6 = vbslq_u16(flat2, vrshrq_n_u16(sum, 4), p6);
  *op5 = apply_15_tap_filter_kernel(flat2, p7, p6, p5, q1, p5, &sum);
  *op4 = apply_15_tap_filter_kernel(flat2, p7, p5, p4, q2, p4, &sum);
  *op3 = apply_15_tap_filter_kernel(flat2, p7, p4, p3, q3, p3, &sum);
  *op2 = apply_15_tap_filter_kernel(flat2, p7, p3, p2, q4, *op2, &sum);
  *op1 = apply_15_tap_filter_kernel(flat2, p7, p2, p1, q5, *op1, &sum);
  *op0 = apply_15_tap_filter_kernel(flat2, p7, p1, p0, q6, *op0, &sum);
  *oq0 = apply_15_tap_filter_kernel(flat2, p7, p0, q0, q7, *oq0, &sum);
  *oq1 = apply_15_tap_filter_kernel(flat2, p6, q0, q1, q7, *oq1, &sum);
  *oq2 = apply_15_tap_filter_kernel(flat2, p5, q1, q2, q7, *oq2, &sum);
  *oq3 = apply_15_tap_filter_kernel(flat2, p4, q2, q3, q7, q3, &sum);
  *oq4 = apply_15_tap_filter_kernel(flat2, p3, q3, q4, q7, q4, &sum);
  *oq5 = apply_15_tap_filter_kernel(flat2, p2, q4, q5, q7, q5, &sum);
  *oq6 = apply_15_tap_filter_kernel(flat2, p1, q5, q6, q7, q6, &sum);
}

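// filter4: the narrow filter. The outer tap delta (p1 - q1) is applied only
// where hev is set; the combined delta is split into filter1 (+4 rounding,
// subtracted from q0) and filter2 (+3 rounding, added to p0); half of
// filter1 adjusts p1/q1 where hev is not set. All intermediates are clamped
// to the signed range of the current bit depth.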
static INLINE void filter4(const uint16x8_t mask, const uint16x8_t hev,
                           const uint16x8_t p1, const uint16x8_t p0,
                           const uint16x8_t q0, const uint16x8_t q1,
                           uint16x8_t *op1, uint16x8_t *op0, uint16x8_t *oq0,
                           uint16x8_t *oq1, const int bd) {
  const int16x8_t max = vdupq_n_s16((1 << (bd - 1)) - 1);
  const int16x8_t min = vdupq_n_s16((int16_t)(((uint32_t)-1) << (bd - 1)));
  int16x8_t filter, filter1, filter2, t;
  int16x8_t ps1 = flip_sign(p1, bd);
  int16x8_t ps0 = flip_sign(p0, bd);
  int16x8_t qs0 = flip_sign(q0, bd);
  int16x8_t qs1 = flip_sign(q1, bd);

  /* add outer taps if we have high edge variance */
  filter = vsubq_s16(ps1, qs1);
  filter = vmaxq_s16(filter, min);
  filter = vminq_s16(filter, max);
  filter = vandq_s16(filter, vreinterpretq_s16_u16(hev));
  t = vsubq_s16(qs0, ps0);

  /* inner taps */
  filter = vaddq_s16(filter, t);
  filter = vaddq_s16(filter, t);
  filter = vaddq_s16(filter, t);
  filter = vmaxq_s16(filter, min);
  filter = vminq_s16(filter, max);
  filter = vandq_s16(filter, vreinterpretq_s16_u16(mask));

  /* save bottom 3 bits so that we round one side +4 and the other +3 */
  /* if it equals 4 we'll set it to adjust by -1 to account for the fact */
  /* we'd round it by 3 the other way */
  t = vaddq_s16(filter, vdupq_n_s16(4));
  t = vminq_s16(t, max);
  filter1 = vshrq_n_s16(t, 3);
  t = vaddq_s16(filter, vdupq_n_s16(3));
  t = vminq_s16(t, max);
  filter2 = vshrq_n_s16(t, 3);

  qs0 = vsubq_s16(qs0, filter1);
  qs0 = vmaxq_s16(qs0, min);
  qs0 = vminq_s16(qs0, max);
  ps0 = vaddq_s16(ps0, filter2);
  ps0 = vmaxq_s16(ps0, min);
  ps0 = vminq_s16(ps0, max);
  *oq0 = flip_sign_back(qs0, bd);
  *op0 = flip_sign_back(ps0, bd);

  /* outer tap adjustments */
  filter = vrshrq_n_s16(filter1, 1);
  filter = vbicq_s16(filter, vreinterpretq_s16_u16(hev));

  qs1 = vsubq_s16(qs1, filter);
  qs1 = vmaxq_s16(qs1, min);
  qs1 = vminq_s16(qs1, max);
  ps1 = vaddq_s16(ps1, filter);
  ps1 = vmaxq_s16(ps1, min);
  ps1 = vminq_s16(ps1, max);
  *oq1 = flip_sign_back(qs1, bd);
  *op1 = flip_sign_back(ps1, bd);
}

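// filter8: unless flat_status reports all-flat, run filter4() and then, if
// any pixel is flat, blend in the 7-tap filter output under the flat mask.
// In the all-flat case the 7-tap output is used directly and both filter4()
// and the blend are skipped.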
static INLINE void filter8(const uint16x8_t mask, const uint16x8_t flat,
                           const uint32_t flat_status, const uint16x8_t hev,
                           const uint16x8_t p3, const uint16x8_t p2,
                           const uint16x8_t p1, const uint16x8_t p0,
                           const uint16x8_t q0, const uint16x8_t q1,
                           const uint16x8_t q2, const uint16x8_t q3,
                           uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0,
                           uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2,
                           const int bd) {
  if (flat_status != (uint32_t)-4) {
    filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd);
    *op2 = p2;
    *oq2 = q2;
    if (flat_status) {
      apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,
                         oq0, oq1, oq2);
    }
  } else {
    calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, oq0, oq1,
                      oq2);
  }
}

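// filter16: as filter8(), but additionally blends in the 15-tap filter
// output under flat2 where flat2_status reports flat pixels. The all-flat2
// case skips the intermediate 7-tap blend.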
static INLINE void filter16(
    const uint16x8_t mask, const uint16x8_t flat, const uint32_t flat_status,
    const uint16x8_t flat2, const uint32_t flat2_status, const uint16x8_t hev,
    const uint16x8_t p7, const uint16x8_t p6, const uint16x8_t p5,
    const uint16x8_t p4, const uint16x8_t p3, const uint16x8_t p2,
    const uint16x8_t p1, const uint16x8_t p0, const uint16x8_t q0,
    const uint16x8_t q1, const uint16x8_t q2, const uint16x8_t q3,
    const uint16x8_t q4, const uint16x8_t q5, const uint16x8_t q6,
    const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5, uint16x8_t *op4,
    uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0,
    uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, uint16x8_t *oq3,
    uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6, const int bd) {
  if (flat_status != (uint32_t)-4) {
    filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd);
  }

  if (flat_status) {
    *op2 = p2;
    *oq2 = q2;
    if (flat2_status != (uint32_t)-4) {
      apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,
                         oq0, oq1, oq2);
    }
    if (flat2_status) {
      apply_15_tap_filter(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3,
                          q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0,
                          oq0, oq1, oq2, oq3, oq4, oq5, oq6);
    }
  }
}

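// Row-based load/store helpers (stride p, 8 pixels per row). The narrower
// store helpers further below write only the pixels a filter may have
// modified.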
static INLINE void load_8x8(const uint16_t *s, const int p, uint16x8_t *p3,
                            uint16x8_t *p2, uint16x8_t *p1, uint16x8_t *p0,
                            uint16x8_t *q0, uint16x8_t *q1, uint16x8_t *q2,
                            uint16x8_t *q3) {
  *p3 = vld1q_u16(s);
  s += p;
  *p2 = vld1q_u16(s);
  s += p;
  *p1 = vld1q_u16(s);
  s += p;
  *p0 = vld1q_u16(s);
  s += p;
  *q0 = vld1q_u16(s);
  s += p;
  *q1 = vld1q_u16(s);
  s += p;
  *q2 = vld1q_u16(s);
  s += p;
  *q3 = vld1q_u16(s);
}

static INLINE void load_8x16(const uint16_t *s, const int p, uint16x8_t *s0,
                             uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3,
                             uint16x8_t *s4, uint16x8_t *s5, uint16x8_t *s6,
                             uint16x8_t *s7, uint16x8_t *s8, uint16x8_t *s9,
                             uint16x8_t *s10, uint16x8_t *s11, uint16x8_t *s12,
                             uint16x8_t *s13, uint16x8_t *s14,
                             uint16x8_t *s15) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
  *s4 = vld1q_u16(s);
  s += p;
  *s5 = vld1q_u16(s);
  s += p;
  *s6 = vld1q_u16(s);
  s += p;
  *s7 = vld1q_u16(s);
  s += p;
  *s8 = vld1q_u16(s);
  s += p;
  *s9 = vld1q_u16(s);
  s += p;
  *s10 = vld1q_u16(s);
  s += p;
  *s11 = vld1q_u16(s);
  s += p;
  *s12 = vld1q_u16(s);
  s += p;
  *s13 = vld1q_u16(s);
  s += p;
  *s14 = vld1q_u16(s);
  s += p;
  *s15 = vld1q_u16(s);
}

static INLINE void store_8x4(uint16_t *s, const int p, const uint16x8_t s0,
                             const uint16x8_t s1, const uint16x8_t s2,
                             const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += p;
  vst1q_u16(s, s1);
  s += p;
  vst1q_u16(s, s2);
  s += p;
  vst1q_u16(s, s3);
}

static INLINE void store_8x6(uint16_t *s, const int p, const uint16x8_t s0,
                             const uint16x8_t s1, const uint16x8_t s2,
                             const uint16x8_t s3, const uint16x8_t s4,
                             const uint16x8_t s5) {
  vst1q_u16(s, s0);
  s += p;
  vst1q_u16(s, s1);
  s += p;
  vst1q_u16(s, s2);
  s += p;
  vst1q_u16(s, s3);
  s += p;
  vst1q_u16(s, s4);
  s += p;
  vst1q_u16(s, s5);
}

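// Store a 4x8 block of output columns for the vertical filters:
// vst4q_lane_u16() writes the i-th lane of all four vectors as 4 contiguous
// pixels of row i, transposing on the fly.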
static INLINE void store_4x8(uint16_t *s, const int p, const uint16x8_t p1,
                             const uint16x8_t p0, const uint16x8_t q0,
                             const uint16x8_t q1) {
  uint16x8x4_t o;

  o.val[0] = p1;
  o.val[1] = p0;
  o.val[2] = q0;
  o.val[3] = q1;
  vst4q_lane_u16(s, o, 0);
  s += p;
  vst4q_lane_u16(s, o, 1);
  s += p;
  vst4q_lane_u16(s, o, 2);
  s += p;
  vst4q_lane_u16(s, o, 3);
  s += p;
  vst4q_lane_u16(s, o, 4);
  s += p;
  vst4q_lane_u16(s, o, 5);
  s += p;
  vst4q_lane_u16(s, o, 6);
  s += p;
  vst4q_lane_u16(s, o, 7);
}

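// Store a 6x8 block as two interleaved 3-element stores per row: 3 pixels to
// the left of s and 3 pixels starting at s.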
static INLINE void store_6x8(uint16_t *s, const int p, const uint16x8_t s0,
                             const uint16x8_t s1, const uint16x8_t s2,
                             const uint16x8_t s3, const uint16x8_t s4,
                             const uint16x8_t s5) {
  uint16x8x3_t o0, o1;

  o0.val[0] = s0;
  o0.val[1] = s1;
  o0.val[2] = s2;
  o1.val[0] = s3;
  o1.val[1] = s4;
  o1.val[2] = s5;
  vst3q_lane_u16(s - 3, o0, 0);
  vst3q_lane_u16(s + 0, o1, 0);
  s += p;
  vst3q_lane_u16(s - 3, o0, 1);
  vst3q_lane_u16(s + 0, o1, 1);
  s += p;
  vst3q_lane_u16(s - 3, o0, 2);
  vst3q_lane_u16(s + 0, o1, 2);
  s += p;
  vst3q_lane_u16(s - 3, o0, 3);
  vst3q_lane_u16(s + 0, o1, 3);
  s += p;
  vst3q_lane_u16(s - 3, o0, 4);
  vst3q_lane_u16(s + 0, o1, 4);
  s += p;
  vst3q_lane_u16(s - 3, o0, 5);
  vst3q_lane_u16(s + 0, o1, 5);
  s += p;
  vst3q_lane_u16(s - 3, o0, 6);
  vst3q_lane_u16(s + 0, o1, 6);
  s += p;
  vst3q_lane_u16(s - 3, o0, 7);
  vst3q_lane_u16(s + 0, o1, 7);
}

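// Store a 7x8 block: 4 interleaved pixels to the left of s and 3 starting at
// s. Used for one side of the edge at a time in the vertical 16 filter.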
static INLINE void store_7x8(uint16_t *s, const int p, const uint16x8_t s0,
                             const uint16x8_t s1, const uint16x8_t s2,
                             const uint16x8_t s3, const uint16x8_t s4,
                             const uint16x8_t s5, const uint16x8_t s6) {
  uint16x8x4_t o0;
  uint16x8x3_t o1;

  o0.val[0] = s0;
  o0.val[1] = s1;
  o0.val[2] = s2;
  o0.val[3] = s3;
  o1.val[0] = s4;
  o1.val[1] = s5;
  o1.val[2] = s6;
  vst4q_lane_u16(s - 4, o0, 0);
  vst3q_lane_u16(s + 0, o1, 0);
  s += p;
  vst4q_lane_u16(s - 4, o0, 1);
  vst3q_lane_u16(s + 0, o1, 1);
  s += p;
  vst4q_lane_u16(s - 4, o0, 2);
  vst3q_lane_u16(s + 0, o1, 2);
  s += p;
  vst4q_lane_u16(s - 4, o0, 3);
  vst3q_lane_u16(s + 0, o1, 3);
  s += p;
  vst4q_lane_u16(s - 4, o0, 4);
  vst3q_lane_u16(s + 0, o1, 4);
  s += p;
  vst4q_lane_u16(s - 4, o0, 5);
  vst3q_lane_u16(s + 0, o1, 5);
  s += p;
  vst4q_lane_u16(s - 4, o0, 6);
  vst3q_lane_u16(s + 0, o1, 6);
  s += p;
  vst4q_lane_u16(s - 4, o0, 7);
  vst3q_lane_u16(s + 0, o1, 7);
}

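// Store the rows a horizontal 16 filter may have modified: p1..q1 always,
// p2/q2 only if some pixels were flat, and the outer rows p6..p3/q3..q6 only
// if some pixels were flat2.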
static INLINE void store_8x14(uint16_t *s, const int p, const uint16x8_t p6,
                              const uint16x8_t p5, const uint16x8_t p4,
                              const uint16x8_t p3, const uint16x8_t p2,
                              const uint16x8_t p1, const uint16x8_t p0,
                              const uint16x8_t q0, const uint16x8_t q1,
                              const uint16x8_t q2, const uint16x8_t q3,
                              const uint16x8_t q4, const uint16x8_t q5,
                              const uint16x8_t q6, const uint32_t flat_status,
                              const uint32_t flat2_status) {
  if (flat_status) {
    if (flat2_status) {
      vst1q_u16(s - 7 * p, p6);
      vst1q_u16(s - 6 * p, p5);
      vst1q_u16(s - 5 * p, p4);
      vst1q_u16(s - 4 * p, p3);
      vst1q_u16(s + 3 * p, q3);
      vst1q_u16(s + 4 * p, q4);
      vst1q_u16(s + 5 * p, q5);
      vst1q_u16(s + 6 * p, q6);
    }
    vst1q_u16(s - 3 * p, p2);
    vst1q_u16(s + 2 * p, q2);
  }
  vst1q_u16(s - 2 * p, p1);
  vst1q_u16(s - 1 * p, p0);
  vst1q_u16(s + 0 * p, q0);
  vst1q_u16(s + 1 * p, q1);
}

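// Each public *_neon() function below filters one 8-pixel section of an
// edge; the *_dual_neon() variants filter two adjacent sections. Vertical
// edges are transposed on load and written back with interleaving stores.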
void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int p, const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      mask, hev;

  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                   q2, q3, &hev, &mask);
  filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd);
  store_8x4(s - 2 * p, p, p1, p0, q0, q1);
}

void vpx_highbd_lpf_horizontal_4_dual_neon(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_horizontal_4_neon(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_4_neon(s + 8, p, blimit1, limit1, thresh1, bd);
}

void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      mask, hev;

  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1,
                    (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1,
                    (int16x8_t *)&q2, (int16x8_t *)&q3);
  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                   q2, q3, &hev, &mask);
  filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd);
  store_4x8(s - 2, p, p1, p0, q0, q1);
}

void vpx_highbd_lpf_vertical_4_dual_neon(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd);
}

void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int p, const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint32_t flat_status;

  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
                              q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
  filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
          &op1, &op0, &oq0, &oq1, &oq2, bd);
  store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
}

void vpx_highbd_lpf_horizontal_8_dual_neon(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, bd);
}

void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint32_t flat_status;

  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1,
                    (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1,
                    (int16x8_t *)&q2, (int16x8_t *)&q3);
  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
                              q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
  filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
          &op1, &op0, &oq0, &oq1, &oq2, bd);
  // Note: store_6x8() is faster than transpose + store_8x8().
  store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2);
}

void vpx_highbd_lpf_vertical_8_dual_neon(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  vpx_highbd_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd);
}

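// Filter an 8-pixel section of a horizontal 16 edge: load 16 rows, build the
// filter/flat/flat2 masks, run filter16() and store only what was modified.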
static void lpf_horizontal_16_kernel(uint16_t *s, int p,
                                     const uint16x8_t blimit_vec,
                                     const uint16x8_t limit_vec,
                                     const uint16x8_t thresh_vec,
                                     const int bd) {
  uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2,
      q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
      oq4, oq5, oq6;
  uint32_t flat_status, flat2_status;

  load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2,
            &q3, &q4, &q5, &q6, &q7);
  mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
                              q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
  flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,
                     &flat2_status, bd);
  filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4,
           p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4,
           &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
           bd);
  store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
             oq5, oq6, flat_status, flat2_status);
}

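// Vertical counterpart: load two 8x8 halves and transpose each so rows
// become columns, filter, then pick the cheapest store for the set of
// columns that actually changed.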
static void lpf_vertical_16_kernel(uint16_t *s, int p,
                                   const uint16x8_t blimit_vec,
                                   const uint16x8_t limit_vec,
                                   const uint16x8_t thresh_vec, const int bd) {
  uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2,
      q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
      oq4, oq5, oq6;
  uint32_t flat_status, flat2_status;

  load_8x8(s - 8, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0);
  transpose_s16_8x8((int16x8_t *)&p7, (int16x8_t *)&p6, (int16x8_t *)&p5,
                    (int16x8_t *)&p4, (int16x8_t *)&p3, (int16x8_t *)&p2,
                    (int16x8_t *)&p1, (int16x8_t *)&p0);
  load_8x8(s, p, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
  transpose_s16_8x8((int16x8_t *)&q0, (int16x8_t *)&q1, (int16x8_t *)&q2,
                    (int16x8_t *)&q3, (int16x8_t *)&q4, (int16x8_t *)&q5,
                    (int16x8_t *)&q6, (int16x8_t *)&q7);
  mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0,
                              q0, q1, q2, q3, &flat, &flat_status, &hev, bd);
  flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,
                     &flat2_status, bd);
  filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4,
           p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4,
           &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
           bd);
  if (flat_status) {
    if (flat2_status) {
      store_7x8(s - 3, p, op6, op5, op4, op3, op2, op1, op0);
      store_7x8(s + 4, p, oq0, oq1, oq2, oq3, oq4, oq5, oq6);
    } else {
      // Note: store_6x8() is faster than transpose + store_8x8().
      store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2);
    }
  } else {
    store_4x8(s - 2, p, op1, op0, oq0, oq1);
  }
}

void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh, int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec;
  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
}

void vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int p,
                                            const uint8_t *blimit,
                                            const uint8_t *limit,
                                            const uint8_t *thresh, int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec;
  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
  lpf_horizontal_16_kernel(s + 8, p, blimit_vec, limit_vec, thresh_vec, bd);
}

void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec;
  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
}

void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int p,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int bd) {
  uint16x8_t blimit_vec, limit_vec, thresh_vec;
  load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd);
  lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd);
  lpf_vertical_16_kernel(s + 8 * p, p, blimit_vec, limit_vec, thresh_vec, bd);
}