/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/transpose_neon.h"

// For all the static inline functions, the functions ending with '_8' process
// 8 samples at a time, and the functions ending with '_16' process 16 samples
// at a time.

#define FUN_LOAD_THRESH(w, r)                                                \
  static INLINE void load_thresh_##w(                                        \
      const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh,    \
      uint8x##w##_t *blimit_vec, uint8x##w##_t *limit_vec,                   \
      uint8x##w##_t *thresh_vec) {                                           \
    *blimit_vec = vld1##r##dup_u8(blimit);                                   \
    *limit_vec = vld1##r##dup_u8(limit);                                     \
    *thresh_vec = vld1##r##dup_u8(thresh);                                   \
  }

FUN_LOAD_THRESH(8, _)    // load_thresh_8
FUN_LOAD_THRESH(16, q_)  // load_thresh_16
#undef FUN_LOAD_THRESH

static INLINE void load_thresh_8_dual(
    const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
    const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,
    uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) {
  *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1));
  *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1));
  *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1));
}

// Here flat is 64 bits long, with each 8-bit (or 4-bit) chunk being a mask of
// a pixel. When used to control filter branches, we only detect whether it is
// all 0s or all 1s. We pairwise add the two 32-bit halves of flat and keep the
// low 32 bits as flat_status.
// flat equals 0 if and only if flat_status equals 0.
// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true
// because each mask occupies more than 1 bit.)
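// For example, if all 8 masks are set, flat is 0xFFFFFFFFFFFFFFFF; adding its
// two 32-bit halves gives 0x1FFFFFFFE, whose low 32 bits are 0xFFFFFFFE, i.e.
// (uint32_t)-2.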
static INLINE uint32_t calc_flat_status_8(uint8x8_t flat) {
  return vget_lane_u32(
      vreinterpret_u32_u64(vpaddl_u32(vreinterpret_u32_u8(flat))), 0);
}

// Here flat is 128 bits long, with each 8-bit chunk being a mask of a pixel.
// When used to control filter branches, we only detect whether it is all 0s or
// all 1s. We narrow each 16-bit chunk with an arithmetic shift right by 4, so
// we get a 64-bit number with each 4-bit chunk being a mask of a pixel. Then
// we pairwise add flat into a 32-bit number flat_status.
// flat equals 0 if and only if flat_status equals 0.
// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true
// because each mask occupies more than 1 bit.)
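// For example, a 16-bit chunk whose two bytes are 0xFF and 0x00 narrows to a
// byte whose two nibbles are 0xF and 0x0, so each original byte contributes
// exactly one nibble to the narrowed result.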
static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) {
  const uint8x8_t flat_4bit =
      vreinterpret_u8_s8(vshrn_n_s16(vreinterpretq_s16_u8(flat), 4));
  return calc_flat_status_8(flat_4bit);
}

#define FUN_FILTER_HEV_MASK4(w, r)                                            \
  static INLINE uint8x##w##_t filter_hev_mask4_##w(                           \
      const uint8x##w##_t limit, const uint8x##w##_t blimit,                  \
      const uint8x##w##_t thresh, const uint8x##w##_t p3,                     \
      const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
      const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
      const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) {      \
    uint8x##w##_t max, t0, t1;                                                \
                                                                              \
    max = vabd##r##u8(p1, p0);                                                \
    max = vmax##r##u8(max, vabd##r##u8(q1, q0));                              \
    *hev = vcgt##r##u8(max, thresh);                                          \
    *mask = vmax##r##u8(max, vabd##r##u8(p3, p2));                            \
    *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1));                          \
    *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1));                          \
    *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2));                          \
    t0 = vabd##r##u8(p0, q0);                                                 \
    t1 = vabd##r##u8(p1, q1);                                                 \
    t0 = vqadd##r##u8(t0, t0);                                                \
    t1 = vshr##r##n_u8(t1, 1);                                                \
    t0 = vqadd##r##u8(t0, t1);                                                \
    *mask = vcle##r##u8(*mask, limit);                                        \
    t0 = vcle##r##u8(t0, blimit);                                             \
    *mask = vand##r##u8(*mask, t0);                                           \
                                                                              \
    return max;                                                               \
  }

FUN_FILTER_HEV_MASK4(8, _)    // filter_hev_mask4_8
FUN_FILTER_HEV_MASK4(16, q_)  // filter_hev_mask4_16
#undef FUN_FILTER_HEV_MASK4

#define FUN_FILTER_FLAT_HEV_MASK(w, r)                                        \
  static INLINE uint8x##w##_t filter_flat_hev_mask_##w(                       \
      const uint8x##w##_t limit, const uint8x##w##_t blimit,                  \
      const uint8x##w##_t thresh, const uint8x##w##_t p3,                     \
      const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
      const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
      const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status,     \
      uint8x##w##_t *hev) {                                                   \
    uint8x##w##_t max, mask;                                                  \
                                                                              \
    max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \
                               q2, q3, hev, &mask);                           \
    *flat = vmax##r##u8(max, vabd##r##u8(p2, p0));                            \
    *flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0));                          \
    *flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0));                          \
    *flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0));                          \
    *flat = vcle##r##u8(*flat, vdup##r##n_u8(1)); /* flat_mask4() */          \
    *flat = vand##r##u8(*flat, mask);                                         \
    *flat_status = calc_flat_status_##w(*flat);                               \
                                                                              \
    return mask;                                                              \
  }

FUN_FILTER_FLAT_HEV_MASK(8, _)    // filter_flat_hev_mask_8
FUN_FILTER_FLAT_HEV_MASK(16, q_)  // filter_flat_hev_mask_16
#undef FUN_FILTER_FLAT_HEV_MASK

#define FUN_FLAT_MASK5(w, r)                                                  \
  static INLINE uint8x##w##_t flat_mask5_##w(                                 \
      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
      const uint8x##w##_t q4, const uint8x##w##_t flat,                       \
      uint32_t *flat2_status) {                                               \
    uint8x##w##_t flat2 = vabd##r##u8(p4, p0);                                \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(p3, p0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(p2, p0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(p1, p0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(q1, q0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(q2, q0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(q3, q0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(q4, q0));                          \
    flat2 = vcle##r##u8(flat2, vdup##r##n_u8(1));                             \
    flat2 = vand##r##u8(flat2, flat);                                         \
    *flat2_status = calc_flat_status_##w(flat2);                              \
                                                                              \
    return flat2;                                                             \
  }

FUN_FLAT_MASK5(8, _)    // flat_mask5_8
FUN_FLAT_MASK5(16, q_)  // flat_mask5_16
#undef FUN_FLAT_MASK5

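// flip_sign_*() converts pixels from the unsigned [0, 255] range to signed
// values centered at 0 by XORing with 0x80 (equivalent to subtracting 128),
// so filter4_*() below can use saturating signed arithmetic;
// flip_sign_back_*() undoes the conversion.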
#define FUN_FLIP_SIGN(w, r)                                          \
  static INLINE int8x##w##_t flip_sign_##w(const uint8x##w##_t v) {  \
    const uint8x##w##_t sign_bit = vdup##r##n_u8(0x80);              \
    return vreinterpret##r##s8_u8(veor##r##u8(v, sign_bit));         \
  }

FUN_FLIP_SIGN(8, _)    // flip_sign_8
FUN_FLIP_SIGN(16, q_)  // flip_sign_16
#undef FUN_FLIP_SIGN

#define FUN_FLIP_SIGN_BACK(w, r)                                         \
  static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \
    const int8x##w##_t sign_bit = vdup##r##n_s8(0x80);                   \
    return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit));             \
  }

FUN_FLIP_SIGN_BACK(8, _)    // flip_sign_back_8
FUN_FLIP_SIGN_BACK(16, q_)  // flip_sign_back_16
#undef FUN_FLIP_SIGN_BACK

static INLINE void filter_update_8(const uint8x8_t sub0, const uint8x8_t sub1,
                                   const uint8x8_t add0, const uint8x8_t add1,
                                   uint16x8_t *sum) {
  *sum = vsubw_u8(*sum, sub0);
  *sum = vsubw_u8(*sum, sub1);
  *sum = vaddw_u8(*sum, add0);
  *sum = vaddw_u8(*sum, add1);
}

static INLINE void filter_update_16(const uint8x16_t sub0,
                                    const uint8x16_t sub1,
                                    const uint8x16_t add0,
                                    const uint8x16_t add1, uint16x8_t *sum0,
                                    uint16x8_t *sum1) {
  *sum0 = vsubw_u8(*sum0, vget_low_u8(sub0));
  *sum1 = vsubw_u8(*sum1, vget_high_u8(sub0));
  *sum0 = vsubw_u8(*sum0, vget_low_u8(sub1));
  *sum1 = vsubw_u8(*sum1, vget_high_u8(sub1));
  *sum0 = vaddw_u8(*sum0, vget_low_u8(add0));
  *sum1 = vaddw_u8(*sum1, vget_high_u8(add0));
  *sum0 = vaddw_u8(*sum0, vget_low_u8(add1));
  *sum1 = vaddw_u8(*sum1, vget_high_u8(add1));
}

static INLINE uint8x8_t calc_7_tap_filter_8_kernel(const uint8x8_t sub0,
                                                   const uint8x8_t sub1,
                                                   const uint8x8_t add0,
                                                   const uint8x8_t add1,
                                                   uint16x8_t *sum) {
  filter_update_8(sub0, sub1, add0, add1, sum);
  return vrshrn_n_u16(*sum, 3);
}

static INLINE uint8x16_t calc_7_tap_filter_16_kernel(
    const uint8x16_t sub0, const uint8x16_t sub1, const uint8x16_t add0,
    const uint8x16_t add1, uint16x8_t *sum0, uint16x8_t *sum1) {
  filter_update_16(sub0, sub1, add0, add1, sum0, sum1);
  return vcombine_u8(vrshrn_n_u16(*sum0, 3), vrshrn_n_u16(*sum1, 3));
}

static INLINE uint8x8_t apply_15_tap_filter_8_kernel(
    const uint8x8_t flat, const uint8x8_t sub0, const uint8x8_t sub1,
    const uint8x8_t add0, const uint8x8_t add1, const uint8x8_t in,
    uint16x8_t *sum) {
  filter_update_8(sub0, sub1, add0, add1, sum);
  return vbsl_u8(flat, vrshrn_n_u16(*sum, 4), in);
}

static INLINE uint8x16_t apply_15_tap_filter_16_kernel(
    const uint8x16_t flat, const uint8x16_t sub0, const uint8x16_t sub1,
    const uint8x16_t add0, const uint8x16_t add1, const uint8x16_t in,
    uint16x8_t *sum0, uint16x8_t *sum1) {
  uint8x16_t t;
  filter_update_16(sub0, sub1, add0, add1, sum0, sum1);
  t = vcombine_u8(vrshrn_n_u16(*sum0, 4), vrshrn_n_u16(*sum1, 4));
  return vbslq_u8(flat, t, in);
}

// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
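// A running sum is kept: it starts at 3*p3 + 2*p2 + p1 + p0 + q0 (the 7-tap
// window centered at p2, with p3 replicated at the edge), so in scalar terms
// op2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3. Each following output slides
// the window by subtracting two samples and adding two via filter_update_*().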
static INLINE void calc_7_tap_filter_8(const uint8x8_t p3, const uint8x8_t p2,
                                       const uint8x8_t p1, const uint8x8_t p0,
                                       const uint8x8_t q0, const uint8x8_t q1,
                                       const uint8x8_t q2, const uint8x8_t q3,
                                       uint8x8_t *op2, uint8x8_t *op1,
                                       uint8x8_t *op0, uint8x8_t *oq0,
                                       uint8x8_t *oq1, uint8x8_t *oq2) {
  uint16x8_t sum;
  sum = vaddl_u8(p3, p3);   // 2*p3
  sum = vaddw_u8(sum, p3);  // 3*p3
  sum = vaddw_u8(sum, p2);  // 3*p3+p2
  sum = vaddw_u8(sum, p2);  // 3*p3+2*p2
  sum = vaddw_u8(sum, p1);  // 3*p3+2*p2+p1
  sum = vaddw_u8(sum, p0);  // 3*p3+2*p2+p1+p0
  sum = vaddw_u8(sum, q0);  // 3*p3+2*p2+p1+p0+q0
  *op2 = vrshrn_n_u16(sum, 3);
  *op1 = calc_7_tap_filter_8_kernel(p3, p2, p1, q1, &sum);
  *op0 = calc_7_tap_filter_8_kernel(p3, p1, p0, q2, &sum);
  *oq0 = calc_7_tap_filter_8_kernel(p3, p0, q0, q3, &sum);
  *oq1 = calc_7_tap_filter_8_kernel(p2, q0, q1, q3, &sum);
  *oq2 = calc_7_tap_filter_8_kernel(p1, q1, q2, q3, &sum);
}

static INLINE void calc_7_tap_filter_16(
    const uint8x16_t p3, const uint8x16_t p2, const uint8x16_t p1,
    const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1,
    const uint8x16_t q2, const uint8x16_t q3, uint8x16_t *op2, uint8x16_t *op1,
    uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2) {
  uint16x8_t sum0, sum1;
  sum0 = vaddl_u8(vget_low_u8(p3), vget_low_u8(p3));    // 2*p3
  sum1 = vaddl_u8(vget_high_u8(p3), vget_high_u8(p3));  // 2*p3
  sum0 = vaddw_u8(sum0, vget_low_u8(p3));               // 3*p3
  sum1 = vaddw_u8(sum1, vget_high_u8(p3));              // 3*p3
  sum0 = vaddw_u8(sum0, vget_low_u8(p2));               // 3*p3+p2
  sum1 = vaddw_u8(sum1, vget_high_u8(p2));              // 3*p3+p2
  sum0 = vaddw_u8(sum0, vget_low_u8(p2));               // 3*p3+2*p2
  sum1 = vaddw_u8(sum1, vget_high_u8(p2));              // 3*p3+2*p2
  sum0 = vaddw_u8(sum0, vget_low_u8(p1));               // 3*p3+2*p2+p1
  sum1 = vaddw_u8(sum1, vget_high_u8(p1));              // 3*p3+2*p2+p1
  sum0 = vaddw_u8(sum0, vget_low_u8(p0));               // 3*p3+2*p2+p1+p0
  sum1 = vaddw_u8(sum1, vget_high_u8(p0));              // 3*p3+2*p2+p1+p0
  sum0 = vaddw_u8(sum0, vget_low_u8(q0));               // 3*p3+2*p2+p1+p0+q0
  sum1 = vaddw_u8(sum1, vget_high_u8(q0));              // 3*p3+2*p2+p1+p0+q0
  *op2 = vcombine_u8(vrshrn_n_u16(sum0, 3), vrshrn_n_u16(sum1, 3));
  *op1 = calc_7_tap_filter_16_kernel(p3, p2, p1, q1, &sum0, &sum1);
  *op0 = calc_7_tap_filter_16_kernel(p3, p1, p0, q2, &sum0, &sum1);
  *oq0 = calc_7_tap_filter_16_kernel(p3, p0, q0, q3, &sum0, &sum1);
  *oq1 = calc_7_tap_filter_16_kernel(p2, q0, q1, q3, &sum0, &sum1);
  *oq2 = calc_7_tap_filter_16_kernel(p1, q1, q2, q3, &sum0, &sum1);
}

#define FUN_APPLY_7_TAP_FILTER(w, r)                                          \
  static INLINE void apply_7_tap_filter_##w(                                  \
      const uint8x##w##_t flat, const uint8x##w##_t p3,                       \
      const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
      const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
      const uint8x##w##_t q3, uint8x##w##_t *op2, uint8x##w##_t *op1,         \
      uint8x##w##_t *op0, uint8x##w##_t *oq0, uint8x##w##_t *oq1,             \
      uint8x##w##_t *oq2) {                                                   \
    uint8x##w##_t tp1, tp0, tq0, tq1;                                         \
    calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0,    \
                          &tq0, &tq1, oq2);                                   \
    *op2 = vbsl##r##u8(flat, *op2, p2);                                       \
    *op1 = vbsl##r##u8(flat, tp1, *op1);                                      \
    *op0 = vbsl##r##u8(flat, tp0, *op0);                                      \
    *oq0 = vbsl##r##u8(flat, tq0, *oq0);                                      \
    *oq1 = vbsl##r##u8(flat, tq1, *oq1);                                      \
    *oq2 = vbsl##r##u8(flat, *oq2, q2);                                       \
  }

FUN_APPLY_7_TAP_FILTER(8, _)    // apply_7_tap_filter_8
FUN_APPLY_7_TAP_FILTER(16, q_)  // apply_7_tap_filter_16
#undef FUN_APPLY_7_TAP_FILTER

// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
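// As in the 7-tap case, a running sum is kept: it starts at
// 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 (p7 replicated at the edge),
// so op6 = (that sum + 8) >> 4 in scalar terms, and each later output updates
// the sum with two subtractions and two additions before blending with the
// unfiltered pixel under the flat2 mask.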
static INLINE void apply_15_tap_filter_8(
    const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6,
    const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3,
    const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0,
    const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2,
    const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5,
    const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5,
    uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1,
    uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2,
    uint8x8_t *oq3, uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) {
  uint16x8_t sum;
  sum = vshll_n_u8(p7, 3);  // 8*p7
  sum = vsubw_u8(sum, p7);  // 7*p7
  sum = vaddw_u8(sum, p6);  // 7*p7+p6
  sum = vaddw_u8(sum, p6);  // 7*p7+2*p6
  sum = vaddw_u8(sum, p5);  // 7*p7+2*p6+p5
  sum = vaddw_u8(sum, p4);  // 7*p7+2*p6+p5+p4
  sum = vaddw_u8(sum, p3);  // 7*p7+2*p6+p5+p4+p3
  sum = vaddw_u8(sum, p2);  // 7*p7+2*p6+p5+p4+p3+p2
  sum = vaddw_u8(sum, p1);  // 7*p7+2*p6+p5+p4+p3+p2+p1
  sum = vaddw_u8(sum, p0);  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
  sum = vaddw_u8(sum, q0);  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
  *op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6);
  *op5 = apply_15_tap_filter_8_kernel(flat2, p7, p6, p5, q1, p5, &sum);
  *op4 = apply_15_tap_filter_8_kernel(flat2, p7, p5, p4, q2, p4, &sum);
  *op3 = apply_15_tap_filter_8_kernel(flat2, p7, p4, p3, q3, p3, &sum);
  *op2 = apply_15_tap_filter_8_kernel(flat2, p7, p3, p2, q4, *op2, &sum);
  *op1 = apply_15_tap_filter_8_kernel(flat2, p7, p2, p1, q5, *op1, &sum);
  *op0 = apply_15_tap_filter_8_kernel(flat2, p7, p1, p0, q6, *op0, &sum);
  *oq0 = apply_15_tap_filter_8_kernel(flat2, p7, p0, q0, q7, *oq0, &sum);
  *oq1 = apply_15_tap_filter_8_kernel(flat2, p6, q0, q1, q7, *oq1, &sum);
  *oq2 = apply_15_tap_filter_8_kernel(flat2, p5, q1, q2, q7, *oq2, &sum);
  *oq3 = apply_15_tap_filter_8_kernel(flat2, p4, q2, q3, q7, q3, &sum);
  *oq4 = apply_15_tap_filter_8_kernel(flat2, p3, q3, q4, q7, q4, &sum);
  *oq5 = apply_15_tap_filter_8_kernel(flat2, p2, q4, q5, q7, q5, &sum);
  *oq6 = apply_15_tap_filter_8_kernel(flat2, p1, q5, q6, q7, q6, &sum);
}

static INLINE void apply_15_tap_filter_16(
    const uint8x16_t flat2, const uint8x16_t p7, const uint8x16_t p6,
    const uint8x16_t p5, const uint8x16_t p4, const uint8x16_t p3,
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t q3, const uint8x16_t q4, const uint8x16_t q5,
    const uint8x16_t q6, const uint8x16_t q7, uint8x16_t *op6, uint8x16_t *op5,
    uint8x16_t *op4, uint8x16_t *op3, uint8x16_t *op2, uint8x16_t *op1,
    uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2,
    uint8x16_t *oq3, uint8x16_t *oq4, uint8x16_t *oq5, uint8x16_t *oq6) {
  uint16x8_t sum0, sum1;
  uint8x16_t t;
  sum0 = vshll_n_u8(vget_low_u8(p7), 3);    // 8*p7
  sum1 = vshll_n_u8(vget_high_u8(p7), 3);   // 8*p7
  sum0 = vsubw_u8(sum0, vget_low_u8(p7));   // 7*p7
  sum1 = vsubw_u8(sum1, vget_high_u8(p7));  // 7*p7
  sum0 = vaddw_u8(sum0, vget_low_u8(p6));   // 7*p7+p6
  sum1 = vaddw_u8(sum1, vget_high_u8(p6));  // 7*p7+p6
  sum0 = vaddw_u8(sum0, vget_low_u8(p6));   // 7*p7+2*p6
  sum1 = vaddw_u8(sum1, vget_high_u8(p6));  // 7*p7+2*p6
  sum0 = vaddw_u8(sum0, vget_low_u8(p5));   // 7*p7+2*p6+p5
  sum1 = vaddw_u8(sum1, vget_high_u8(p5));  // 7*p7+2*p6+p5
  sum0 = vaddw_u8(sum0, vget_low_u8(p4));   // 7*p7+2*p6+p5+p4
  sum1 = vaddw_u8(sum1, vget_high_u8(p4));  // 7*p7+2*p6+p5+p4
  sum0 = vaddw_u8(sum0, vget_low_u8(p3));   // 7*p7+2*p6+p5+p4+p3
  sum1 = vaddw_u8(sum1, vget_high_u8(p3));  // 7*p7+2*p6+p5+p4+p3
  sum0 = vaddw_u8(sum0, vget_low_u8(p2));   // 7*p7+2*p6+p5+p4+p3+p2
  sum1 = vaddw_u8(sum1, vget_high_u8(p2));  // 7*p7+2*p6+p5+p4+p3+p2
  sum0 = vaddw_u8(sum0, vget_low_u8(p1));   // 7*p7+2*p6+p5+p4+p3+p2+p1
  sum1 = vaddw_u8(sum1, vget_high_u8(p1));  // 7*p7+2*p6+p5+p4+p3+p2+p1
  sum0 = vaddw_u8(sum0, vget_low_u8(p0));   // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
  sum1 = vaddw_u8(sum1, vget_high_u8(p0));  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
  sum0 = vaddw_u8(sum0, vget_low_u8(q0));   // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
  sum1 = vaddw_u8(sum1, vget_high_u8(q0));  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
  t = vcombine_u8(vrshrn_n_u16(sum0, 4), vrshrn_n_u16(sum1, 4));
  *op6 = vbslq_u8(flat2, t, p6);
  *op5 = apply_15_tap_filter_16_kernel(flat2, p7, p6, p5, q1, p5, &sum0, &sum1);
  *op4 = apply_15_tap_filter_16_kernel(flat2, p7, p5, p4, q2, p4, &sum0, &sum1);
  *op3 = apply_15_tap_filter_16_kernel(flat2, p7, p4, p3, q3, p3, &sum0, &sum1);
  *op2 =
      apply_15_tap_filter_16_kernel(flat2, p7, p3, p2, q4, *op2, &sum0, &sum1);
  *op1 =
      apply_15_tap_filter_16_kernel(flat2, p7, p2, p1, q5, *op1, &sum0, &sum1);
  *op0 =
      apply_15_tap_filter_16_kernel(flat2, p7, p1, p0, q6, *op0, &sum0, &sum1);
  *oq0 =
      apply_15_tap_filter_16_kernel(flat2, p7, p0, q0, q7, *oq0, &sum0, &sum1);
  *oq1 =
      apply_15_tap_filter_16_kernel(flat2, p6, q0, q1, q7, *oq1, &sum0, &sum1);
  *oq2 =
      apply_15_tap_filter_16_kernel(flat2, p5, q1, q2, q7, *oq2, &sum0, &sum1);
  *oq3 = apply_15_tap_filter_16_kernel(flat2, p4, q2, q3, q7, q3, &sum0, &sum1);
  *oq4 = apply_15_tap_filter_16_kernel(flat2, p3, q3, q4, q7, q4, &sum0, &sum1);
  *oq5 = apply_15_tap_filter_16_kernel(flat2, p2, q4, q5, q7, q5, &sum0, &sum1);
  *oq6 = apply_15_tap_filter_16_kernel(flat2, p1, q5, q6, q7, q6, &sum0, &sum1);
}

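// filter4_*() below implements the 4-tap filter path: pixels are mapped to
// signed values with flip_sign_*(), the filter amount is accumulated with
// saturating signed adds and masked by 'mask' and 'hev', the +4/+3 rounding
// trick (see the comments inside) splits the adjustment between q0 and p0, and
// flip_sign_back_*() returns the result to the unsigned domain.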
#define FUN_FILTER4(w, r)                                                      \
  static INLINE void filter4_##w(                                              \
      const uint8x##w##_t mask, const uint8x##w##_t hev,                       \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
      const uint8x##w##_t q1, uint8x##w##_t *op1, uint8x##w##_t *op0,          \
      uint8x##w##_t *oq0, uint8x##w##_t *oq1) {                                \
    int8x##w##_t filter, filter1, filter2, t;                                  \
    int8x##w##_t ps1 = flip_sign_##w(p1);                                      \
    int8x##w##_t ps0 = flip_sign_##w(p0);                                      \
    int8x##w##_t qs0 = flip_sign_##w(q0);                                      \
    int8x##w##_t qs1 = flip_sign_##w(q1);                                      \
                                                                               \
    /* add outer taps if we have high edge variance */                         \
    filter = vqsub##r##s8(ps1, qs1);                                           \
    filter = vand##r##s8(filter, vreinterpret##r##s8_u8(hev));                 \
    t = vqsub##r##s8(qs0, ps0);                                                \
                                                                               \
    /* inner taps */                                                           \
    filter = vqadd##r##s8(filter, t);                                          \
    filter = vqadd##r##s8(filter, t);                                          \
    filter = vqadd##r##s8(filter, t);                                          \
    filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask));                \
                                                                               \
    /* save bottom 3 bits so that we round one side +4 and the other +3 */     \
    /* if it equals 4 we'll set it to adjust by -1 to account for the fact */  \
    /* we'd round it by 3 the other way */                                     \
    filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3);        \
    filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3);        \
                                                                               \
    qs0 = vqsub##r##s8(qs0, filter1);                                          \
    ps0 = vqadd##r##s8(ps0, filter2);                                          \
    *oq0 = flip_sign_back_##w(qs0);                                            \
    *op0 = flip_sign_back_##w(ps0);                                            \
                                                                               \
    /* outer tap adjustments */                                                \
    filter = vrshr##r##n_s8(filter1, 1);                                       \
    filter = vbic##r##s8(filter, vreinterpret##r##s8_u8(hev));                 \
                                                                               \
    qs1 = vqsub##r##s8(qs1, filter);                                           \
    ps1 = vqadd##r##s8(ps1, filter);                                           \
    *oq1 = flip_sign_back_##w(qs1);                                            \
    *op1 = flip_sign_back_##w(ps1);                                            \
  }

FUN_FILTER4(8, _)    // filter4_8
FUN_FILTER4(16, q_)  // filter4_16
#undef FUN_FILTER4

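// In filter8_*() and filter16_*() the scalar flat_status / flat2_status values
// (see calc_flat_status_*()) act as fast paths: 0 means no pixel takes the
// wide-filter branch, and (uint32_t)-2 means every pixel does, so the other
// branch can be skipped entirely.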
#define FUN_FILTER8(w)                                                         \
  static INLINE void filter8_##w(                                              \
      const uint8x##w##_t mask, const uint8x##w##_t flat,                      \
      const uint32_t flat_status, const uint8x##w##_t hev,                     \
      const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1,  \
      const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1,  \
      const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2,      \
      uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,              \
      uint8x##w##_t *oq1, uint8x##w##_t *oq2) {                                \
    if (flat_status != (uint32_t)-2) {                                         \
      filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1);              \
      *op2 = p2;                                                               \
      *oq2 = q2;                                                               \
      if (flat_status) {                                                       \
        apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \
                               op0, oq0, oq1, oq2);                            \
      }                                                                        \
    } else {                                                                   \
      calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,     \
                            oq0, oq1, oq2);                                    \
    }                                                                          \
  }

FUN_FILTER8(8)   // filter8_8
FUN_FILTER8(16)  // filter8_16
#undef FUN_FILTER8

#define FUN_FILTER16(w)                                                        \
  static INLINE void filter16_##w(                                             \
      const uint8x##w##_t mask, const uint8x##w##_t flat,                      \
      const uint32_t flat_status, const uint8x##w##_t flat2,                   \
      const uint32_t flat2_status, const uint8x##w##_t hev,                    \
      const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5,  \
      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2,  \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3,  \
      const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6,  \
      const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5,          \
      uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2,              \
      uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,              \
      uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3,              \
      uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6) {            \
    if (flat_status != (uint32_t)-2) {                                         \
      filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1);              \
    }                                                                          \
                                                                               \
    if (flat_status) {                                                         \
      *op2 = p2;                                                               \
      *oq2 = q2;                                                               \
      if (flat2_status != (uint32_t)-2) {                                      \
        apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, \
                               op0, oq0, oq1, oq2);                            \
      }                                                                        \
      if (flat2_status) {                                                      \
        apply_15_tap_filter_##w(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, \
                                q2, q3, q4, q5, q6, q7, op6, op5, op4, op3,    \
                                op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5,   \
                                oq6);                                          \
      }                                                                        \
    }                                                                          \
  }

FUN_FILTER16(8)   // filter16_8
FUN_FILTER16(16)  // filter16_16
#undef FUN_FILTER16

#define FUN_LOAD8(w, r)                                                    \
  static INLINE void load_##w##x8(                                         \
      const uint8_t *s, const int p, uint8x##w##_t *p3, uint8x##w##_t *p2, \
      uint8x##w##_t *p1, uint8x##w##_t *p0, uint8x##w##_t *q0,             \
      uint8x##w##_t *q1, uint8x##w##_t *q2, uint8x##w##_t *q3) {           \
    *p3 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *p2 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *p1 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *p0 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *q0 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *q1 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *q2 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *q3 = vld1##r##u8(s);                                                  \
  }

FUN_LOAD8(8, _)    // load_8x8
FUN_LOAD8(16, q_)  // load_16x8
#undef FUN_LOAD8

#define FUN_LOAD16(w, r)                                                   \
  static INLINE void load_##w##x16(                                        \
      const uint8_t *s, const int p, uint8x##w##_t *s0, uint8x##w##_t *s1, \
      uint8x##w##_t *s2, uint8x##w##_t *s3, uint8x##w##_t *s4,             \
      uint8x##w##_t *s5, uint8x##w##_t *s6, uint8x##w##_t *s7,             \
      uint8x##w##_t *s8, uint8x##w##_t *s9, uint8x##w##_t *s10,            \
      uint8x##w##_t *s11, uint8x##w##_t *s12, uint8x##w##_t *s13,          \
      uint8x##w##_t *s14, uint8x##w##_t *s15) {                            \
    *s0 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s1 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s2 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s3 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s4 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s5 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s6 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s7 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s8 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s9 = vld1##r##u8(s);                                                  \
    s += p;                                                                \
    *s10 = vld1##r##u8(s);                                                 \
    s += p;                                                                \
    *s11 = vld1##r##u8(s);                                                 \
    s += p;                                                                \
    *s12 = vld1##r##u8(s);                                                 \
    s += p;                                                                \
    *s13 = vld1##r##u8(s);                                                 \
    s += p;                                                                \
    *s14 = vld1##r##u8(s);                                                 \
    s += p;                                                                \
    *s15 = vld1##r##u8(s);                                                 \
  }

FUN_LOAD16(8, _)    // load_8x16
FUN_LOAD16(16, q_)  // load_16x16
#undef FUN_LOAD16

#define FUN_STORE4(w, r)                                                       \
  static INLINE void store_##w##x4(                                            \
      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
      const uint8x##w##_t s2, const uint8x##w##_t s3) {                        \
    vst1##r##u8(s, s0);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s1);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s2);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s3);                                                        \
  }

FUN_STORE4(8, _)    // store_8x4
FUN_STORE4(16, q_)  // store_16x4
#undef FUN_STORE4

#define FUN_STORE6(w, r)                                                       \
  static INLINE void store_##w##x6(                                            \
      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
      const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4,  \
      const uint8x##w##_t s5) {                                                \
    vst1##r##u8(s, s0);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s1);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s2);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s3);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s4);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s5);                                                        \
  }

FUN_STORE6(8, _)    // store_8x6
FUN_STORE6(16, q_)  // store_16x6
#undef FUN_STORE6

static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
                             const uint8x8_t p0, const uint8x8_t q0,
                             const uint8x8_t q1) {
  uint8x8x4_t o;

  o.val[0] = p1;
  o.val[1] = p0;
  o.val[2] = q0;
  o.val[3] = q1;
  vst4_lane_u8(s, o, 0);
  s += p;
  vst4_lane_u8(s, o, 1);
  s += p;
  vst4_lane_u8(s, o, 2);
  s += p;
  vst4_lane_u8(s, o, 3);
  s += p;
  vst4_lane_u8(s, o, 4);
  s += p;
  vst4_lane_u8(s, o, 5);
  s += p;
  vst4_lane_u8(s, o, 6);
  s += p;
  vst4_lane_u8(s, o, 7);
}

static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
                             const uint8x8_t s1, const uint8x8_t s2,
                             const uint8x8_t s3, const uint8x8_t s4,
                             const uint8x8_t s5) {
  uint8x8x3_t o0, o1;

  o0.val[0] = s0;
  o0.val[1] = s1;
  o0.val[2] = s2;
  o1.val[0] = s3;
  o1.val[1] = s4;
  o1.val[2] = s5;
  vst3_lane_u8(s - 3, o0, 0);
  vst3_lane_u8(s + 0, o1, 0);
  s += p;
  vst3_lane_u8(s - 3, o0, 1);
  vst3_lane_u8(s + 0, o1, 1);
  s += p;
  vst3_lane_u8(s - 3, o0, 2);
  vst3_lane_u8(s + 0, o1, 2);
  s += p;
  vst3_lane_u8(s - 3, o0, 3);
  vst3_lane_u8(s + 0, o1, 3);
  s += p;
  vst3_lane_u8(s - 3, o0, 4);
  vst3_lane_u8(s + 0, o1, 4);
  s += p;
  vst3_lane_u8(s - 3, o0, 5);
  vst3_lane_u8(s + 0, o1, 5);
  s += p;
  vst3_lane_u8(s - 3, o0, 6);
  vst3_lane_u8(s + 0, o1, 6);
  s += p;
  vst3_lane_u8(s - 3, o0, 7);
  vst3_lane_u8(s + 0, o1, 7);
}

#define FUN_STORE8(w, r)                                                       \
  static INLINE void store_##w##x8(                                            \
      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
      const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4,  \
      const uint8x##w##_t s5, const uint8x##w##_t s6,                          \
      const uint8x##w##_t s7) {                                                \
    vst1##r##u8(s, s0);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s1);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s2);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s3);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s4);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s5);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s6);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s7);                                                        \
  }

FUN_STORE8(8, _)    // store_8x8
FUN_STORE8(16, q_)  // store_16x8
#undef FUN_STORE8

#define FUN_STORE14(w, r)                                                      \
  static INLINE void store_##w##x14(                                           \
      uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2,  \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3,  \
      const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6,  \
      const uint32_t flat_status, const uint32_t flat2_status) {               \
    if (flat_status) {                                                         \
      if (flat2_status) {                                                      \
        vst1##r##u8(s - 7 * p, p6);                                            \
        vst1##r##u8(s - 6 * p, p5);                                            \
        vst1##r##u8(s - 5 * p, p4);                                            \
        vst1##r##u8(s - 4 * p, p3);                                            \
        vst1##r##u8(s + 3 * p, q3);                                            \
        vst1##r##u8(s + 4 * p, q4);                                            \
        vst1##r##u8(s + 5 * p, q5);                                            \
        vst1##r##u8(s + 6 * p, q6);                                            \
      }                                                                        \
      vst1##r##u8(s - 3 * p, p2);                                              \
      vst1##r##u8(s + 2 * p, q2);                                              \
    }                                                                          \
    vst1##r##u8(s - 2 * p, p1);                                                \
    vst1##r##u8(s - 1 * p, p0);                                                \
    vst1##r##u8(s + 0 * p, q0);                                                \
    vst1##r##u8(s + 1 * p, q1);                                                \
  }

FUN_STORE14(8, _)    // store_8x14
FUN_STORE14(16, q_)  // store_16x14
#undef FUN_STORE14

static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
                               const uint8x16_t s1, const uint8x16_t s2,
                               const uint8x16_t s3, const uint8x16_t s4,
                               const uint8x16_t s5, const uint8x16_t s6,
                               const uint8x16_t s7, const uint8x16_t s8,
                               const uint8x16_t s9, const uint8x16_t s10,
                               const uint8x16_t s11, const uint8x16_t s12,
                               const uint8x16_t s13, const uint8x16_t s14,
                               const uint8x16_t s15) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
  s += p;
  vst1q_u8(s, s4);
  s += p;
  vst1q_u8(s, s5);
  s += p;
  vst1q_u8(s, s6);
  s += p;
  vst1q_u8(s, s7);
  s += p;
  vst1q_u8(s, s8);
  s += p;
  vst1q_u8(s, s9);
  s += p;
  vst1q_u8(s, s10);
  s += p;
  vst1q_u8(s, s11);
  s += p;
  vst1q_u8(s, s12);
  s += p;
  vst1q_u8(s, s13);
  s += p;
  vst1q_u8(s, s14);
  s += p;
  vst1q_u8(s, s15);
}

#define FUN_HOR_4_KERNEL(name, w)                                            \
  static INLINE void lpf_horizontal_4##name##kernel(                         \
      uint8_t *s, const int p, const uint8x##w##_t blimit,                   \
      const uint8x##w##_t limit, const uint8x##w##_t thresh) {               \
    uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev;                 \
                                                                             \
    load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);      \
    filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2,  \
                         q3, &hev, &mask);                                   \
    filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);              \
    store_##w##x4(s - 2 * p, p, p1, p0, q0, q1);                             \
  }

FUN_HOR_4_KERNEL(_, 8)        // lpf_horizontal_4_kernel
FUN_HOR_4_KERNEL(_dual_, 16)  // lpf_horizontal_4_dual_kernel
#undef FUN_HOR_4_KERNEL

void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t blimit_vec, limit_vec, thresh_vec;
  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
  lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
}

void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
  uint8x16_t blimit_vec, limit_vec, thresh_vec;
  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &blimit_vec, &limit_vec, &thresh_vec);
  lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
}

void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      mask, hev;
  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                     q2, q3, &hev, &mask);
  filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
  store_4x8(s - 2, p, p1, p0, q0, q1);
}

void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      mask, hev;
  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
      s15;

  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &blimit_vec, &limit_vec, &thresh_vec);
  load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
            &s11, &s12, &s13, &s14, &s15);
  transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                    s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                      q2, q3, &hev, &mask);
  filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
  s -= 2;
  store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0),
            vget_low_u8(q1));
  store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0),
            vget_high_u8(q1));
}

void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint32_t flat_status;

  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
  load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
                                p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
  filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
            &op1, &op0, &oq0, &oq1, &oq2);
  store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
}

void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint32_t flat_status;

  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &blimit_vec, &limit_vec, &thresh_vec);
  load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
                                 p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
  filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
             &op1, &op0, &oq0, &oq1, &oq2);
  store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
}

void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint32_t flat_status;

  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
                                p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
  filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
            &op1, &op0, &oq0, &oq1, &oq2);
  // Note: transpose + store_8x8() is faster than store_6x8().
  transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
  store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
}

void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
      s15;
  uint32_t flat_status;

  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &blimit_vec, &limit_vec, &thresh_vec);
  load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
            &s11, &s12, &s13, &s14, &s15);
  transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                    s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
                                 p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
  filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
             &op1, &op0, &oq0, &oq1, &oq2);
  // Note: store_6x8() twice is faster than transpose + store_8x16().
  store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
            vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
  store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
            vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
            vget_high_u8(oq2));
}

#define FUN_LPF_16_KERNEL(name, w)                                            \
  static INLINE void lpf_16##name##kernel(                                    \
      const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh,     \
      const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \
      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
      const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \
      const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5,         \
      uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2,             \
      uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,             \
      uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3,             \
      uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6,             \
      uint32_t *flat_status, uint32_t *flat2_status) {                        \
    uint8x##w##_t blimit_vec, limit_vec, thresh_vec, mask, flat, flat2, hev;  \
                                                                              \
    load_thresh_##w(blimit, limit, thresh, &blimit_vec, &limit_vec,           \
                    &thresh_vec);                                             \
    mask = filter_flat_hev_mask_##w(limit_vec, blimit_vec, thresh_vec, p3,    \
                                    p2, p1, p0, q0, q1, q2, q3, &flat,        \
                                    flat_status, &hev);                       \
    flat2 = flat_mask5_##w(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat,      \
                           flat2_status);                                     \
    filter16_##w(mask, flat, *flat_status, flat2, *flat2_status, hev, p7, p6, \
                 p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, \
                 op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5,  \
                 oq6);                                                        \
  }

FUN_LPF_16_KERNEL(_, 8)        // lpf_16_kernel
FUN_LPF_16_KERNEL(_dual_, 16)  // lpf_16_dual_kernel
#undef FUN_LPF_16_KERNEL

void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
      op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
  uint32_t flat_status, flat2_status;

  load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2,
            &q3, &q4, &q5, &q6, &q7);
  lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1,
                q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1,
                &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status,
                &flat2_status);
  store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
             oq5, oq6, flat_status, flat2_status);
}

void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh) {
  uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7,
      op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
  uint32_t flat_status, flat2_status;

  load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  p7 = vld1q_u8(s - 8 * p);
  p6 = vld1q_u8(s - 7 * p);
  p5 = vld1q_u8(s - 6 * p);
  p4 = vld1q_u8(s - 5 * p);
  q4 = vld1q_u8(s + 4 * p);
  q5 = vld1q_u8(s + 5 * p);
  q6 = vld1q_u8(s + 6 * p);
  q7 = vld1q_u8(s + 7 * p);
  lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0,
                     q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2,
                     &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
                     &flat_status, &flat2_status);
  store_16x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4,
              oq5, oq6, flat_status, flat2_status);
}

void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
      op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
  uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
  uint32_t flat_status, flat2_status;

  s -= 8;
  load_16x8(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
  transpose_u8_16x8(s0, s1, s2, s3, s4, s5, s6, s7, &p7, &p6, &p5, &p4, &p3,
                    &p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
  lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1,
                q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1,
                &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status,
                &flat2_status);
  if (flat_status) {
    if (flat2_status) {
      transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
                        oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5,
                        &s6, &s7);
      store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
    } else {
      // Note: transpose + store_8x8() is faster than store_6x8().
      transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
      store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
    }
  } else {
    store_4x8(s + 6, p, op1, op0, oq0, oq1);
  }
}

void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh) {
  uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7,
      op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6;
  uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
      s15;
  uint32_t flat_status, flat2_status;

  s -= 8;
  load_16x16(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, &s11,
             &s12, &s13, &s14, &s15);
  transpose_u8_16x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                     s14, s15, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1,
                     &q2, &q3, &q4, &q5, &q6, &q7);
  lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0,
                     q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2,
                     &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6,
                     &flat_status, &flat2_status);
  if (flat_status) {
    if (flat2_status) {
      transpose_u8_16x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2,
                         oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5,
                         &s6, &s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14,
                         &s15);
      store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12,
                  s13, s14, s15);
    } else {
      // Note: store_6x8() twice is faster than transpose + store_8x16().
      s += 8;
      store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0),
                vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2));
      store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1),
                vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1),
                vget_high_u8(oq2));
    }
  } else {
    s += 6;
    store_4x8(s, p, vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0),
              vget_low_u8(oq1));
    store_4x8(s + 8 * p, p, vget_high_u8(op1), vget_high_u8(op0),
              vget_high_u8(oq0), vget_high_u8(oq1));
  }
}