/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/reinterpret_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/intrapred_common.h"

//------------------------------------------------------------------------------
// DC 4x4

static INLINE uint16x8_t dc_load_sum_4(const uint8_t *in) {
  const uint8x8_t a = load_u8_4x1(in);
  const uint16x4_t p0 = vpaddl_u8(a);
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  return vcombine_u16(p1, vdup_n_u16(0));
}

static INLINE void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x8_t dc) {
  for (int i = 0; i < h; ++i) {
    store_u8_4x1(dst + i * stride, dc);
  }
}

void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_4(above);
  const uint16x8_t sum_left = dc_load_sum_4(left);
  const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);
  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}

void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_4(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 2);
  (void)above;
  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}

void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_4(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 2);
  (void)left;
  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}

void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x8_t dc0 = vdup_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_4xh(dst, stride, 4, dc0);
}

//------------------------------------------------------------------------------
// DC 8x8

static INLINE uint16x8_t dc_load_sum_8(const uint8_t *in) {
  // This isn't used in the case where we want to load both above and left
  // vectors, since we want to avoid performing the reduction twice.
  const uint8x8_t a = vld1_u8(in);
  const uint16x4_t p0 = vpaddl_u8(a);
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  const uint16x4_t p2 = vpadd_u16(p1, p1);
  return vcombine_u16(p2, vdup_n_u16(0));
}

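// Sums the eight u16 lanes of 'a' and broadcasts the total to every lane of
// the result, e.g. (illustrative values only)
// { 1, 2, 3, 4, 5, 6, 7, 8 } -> { 36, 36, 36, 36, 36, 36, 36, 36 }.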
static INLINE uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) {
#if AOM_ARCH_AARCH64
  // On AArch64 we could also use vdupq_n_u16(vaddvq_u16(a)) here to save an
  // instruction, however the addv instruction is usually slightly more
  // expensive than a pairwise addition, so the need for immediately
  // broadcasting the result again seems to negate any benefit.
  const uint16x8_t b = vpaddq_u16(a, a);
  const uint16x8_t c = vpaddq_u16(b, b);
  return vpaddq_u16(c, c);
#else
  const uint16x4_t b = vadd_u16(vget_low_u16(a), vget_high_u16(a));
  const uint16x4_t c = vpadd_u16(b, b);
  const uint16x4_t d = vpadd_u16(c, c);
  return vcombine_u16(d, d);
#endif
}

static INLINE void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x8_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1_u8(dst + i * stride, dc);
  }
}

void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t sum_top = vld1_u8(above);
  const uint8x8_t sum_left = vld1_u8(left);
  uint16x8_t sum = vaddl_u8(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 4);
  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}

void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_8(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 3);
  (void)above;
  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}

void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_8(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 3);
  (void)left;
  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}

void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x8_t dc0 = vdup_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_8xh(dst, stride, 8, dc0);
}

//------------------------------------------------------------------------------
// DC 16x16

static INLINE uint16x8_t dc_load_partial_sum_16(const uint8_t *in) {
  const uint8x16_t a = vld1q_u8(in);
  // delay the remainder of the reduction until
  // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
  // than twice in the case we are loading both above and left.
  return vpaddlq_u8(a);
}

static INLINE uint16x8_t dc_load_sum_16(const uint8_t *in) {
  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_16(in));
}

static INLINE void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, dc);
  }
}

void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_partial_sum_16(above);
  const uint16x8_t sum_left = dc_load_partial_sum_16(left);
  uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 5);
  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}

void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_16(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 4);
  (void)above;
  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}

void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_16(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 4);
  (void)left;
  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}

void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc0 = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_16xh(dst, stride, 16, dc0);
}

//------------------------------------------------------------------------------
// DC 32x32

static INLINE uint16x8_t dc_load_partial_sum_32(const uint8_t *in) {
  const uint8x16_t a0 = vld1q_u8(in);
  const uint8x16_t a1 = vld1q_u8(in + 16);
  // delay the remainder of the reduction until
  // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
  // than twice in the case we are loading both above and left.
  return vpadalq_u8(vpaddlq_u8(a0), a1);
}

static INLINE uint16x8_t dc_load_sum_32(const uint8_t *in) {
  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_32(in));
}

static INLINE void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, dc);
    vst1q_u8(dst + i * stride + 16, dc);
  }
}

void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_partial_sum_32(above);
  const uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 6);
  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}

void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_32(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 5);
  (void)above;
  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}

void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_32(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 5);
  (void)left;
  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}

void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc0 = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_32xh(dst, stride, 32, dc0);
}

//------------------------------------------------------------------------------
// DC 64x64

static INLINE uint16x8_t dc_load_partial_sum_64(const uint8_t *in) {
  const uint8x16_t a0 = vld1q_u8(in);
  const uint8x16_t a1 = vld1q_u8(in + 16);
  const uint8x16_t a2 = vld1q_u8(in + 32);
  const uint8x16_t a3 = vld1q_u8(in + 48);
  const uint16x8_t p01 = vpadalq_u8(vpaddlq_u8(a0), a1);
  const uint16x8_t p23 = vpadalq_u8(vpaddlq_u8(a2), a3);
  // delay the remainder of the reduction until
  // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
  // than twice in the case we are loading both above and left.
  return vaddq_u16(p01, p23);
}

static INLINE uint16x8_t dc_load_sum_64(const uint8_t *in) {
  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_64(in));
}

static INLINE void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, dc);
    vst1q_u8(dst + i * stride + 16, dc);
    vst1q_u8(dst + i * stride + 32, dc);
    vst1q_u8(dst + i * stride + 48, dc);
  }
}

void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_partial_sum_64(above);
  const uint16x8_t sum_left = dc_load_partial_sum_64(left);
  uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 7);
  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}

void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_64(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 6);
  (void)above;
  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}

void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_64(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 6);
  (void)left;
  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}

void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc0 = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_64xh(dst, stride, 64, dc0);
}

//------------------------------------------------------------------------------
// DC rectangular cases

#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

#define DC_SHIFT2 16

static INLINE int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier, int shift2) {
  const int interm = num >> shift1;
  return interm * multiplier >> shift2;
}

static INLINE int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
                                        int shift1, int multiplier) {
  const int expected_dc = divide_using_multiply_shift(
      sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
  assert(expected_dc < (1 << 8));
  return expected_dc;
}

#undef DC_SHIFT2
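
// Note on the rectangular DC math: divide_using_multiply_shift() performs the
// division by the odd factor of (bw + bh) that remains after shift1 removes
// the power-of-two part, using a fixed-point reciprocal.
// DC_MULTIPLIER_1X2 == 0x5556 == ceil(2^16 / 3) and
// DC_MULTIPLIER_1X4 == 0x3334 == ceil(2^16 / 5).
// As an illustrative example, for a 4x8 block (12 edge samples, shift1 == 2)
// calculate_dc_from_sum() evaluates ((sum + 6) >> 2) * 0x5556 >> 16, which
// equals (sum + 6) / 12 for any sum of twelve 8-bit samples.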

void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = load_u8_4x1(above);
  uint8x8_t l = vld1_u8(left);
  uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
  uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
  dc_store_4xh(dst, stride, 8, vdup_n_u8(dc));
}

void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = vld1_u8(above);
  uint8x8_t l = load_u8_4x1(left);
  uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
  uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
  dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
}

void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = load_u8_4x1(above);
  uint8x16_t l = vld1q_u8(left);
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(4, 16, sum, 2, DC_MULTIPLIER_1X4);
  dc_store_4xh(dst, stride, 16, vdup_n_u8(dc));
}

void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x16_t a = vld1q_u8(above);
  uint8x8_t l = load_u8_4x1(left);
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
  dc_store_16xh(dst, stride, 4, vdupq_n_u8(dc));
}

void aom_dc_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = vld1_u8(above);
  uint8x16_t l = vld1q_u8(left);
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(8, 16, sum, 3, DC_MULTIPLIER_1X2);
  dc_store_8xh(dst, stride, 16, vdup_n_u8(dc));
}

void aom_dc_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x16_t a = vld1q_u8(above);
  uint8x8_t l = vld1_u8(left);
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 8, sum, 3, DC_MULTIPLIER_1X2);
  dc_store_16xh(dst, stride, 8, vdupq_n_u8(dc));
}

void aom_dc_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = vld1_u8(above);
  uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum_al = vaddw_u8(sum_left, a);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(8, 32, sum, 3, DC_MULTIPLIER_1X4);
  dc_store_8xh(dst, stride, 32, vdup_n_u8(dc));
}

void aom_dc_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_top = dc_load_partial_sum_32(above);
  uint8x8_t l = vld1_u8(left);
  uint16x8_t sum_al = vaddw_u8(sum_top, l);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(32, 8, sum, 3, DC_MULTIPLIER_1X4);
  dc_store_32xh(dst, stride, 8, vdupq_n_u8(dc));
}

void aom_dc_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_16(above);
  uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 32, sum, 4, DC_MULTIPLIER_1X2);
  dc_store_16xh(dst, stride, 32, vdupq_n_u8(dc));
}

void aom_dc_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_32(above);
  uint16x8_t sum_left = dc_load_partial_sum_16(left);
  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(32, 16, sum, 4, DC_MULTIPLIER_1X2);
  dc_store_32xh(dst, stride, 16, vdupq_n_u8(dc));
}

void aom_dc_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_16(above);
  uint16x8_t sum_left = dc_load_partial_sum_64(left);
  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 64, sum, 4, DC_MULTIPLIER_1X4);
  dc_store_16xh(dst, stride, 64, vdupq_n_u8(dc));
}

void aom_dc_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_64(above);
  uint16x8_t sum_left = dc_load_partial_sum_16(left);
  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(64, 16, sum, 4, DC_MULTIPLIER_1X4);
  dc_store_64xh(dst, stride, 16, vdupq_n_u8(dc));
}

void aom_dc_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_32(above);
  uint16x8_t sum_left = dc_load_partial_sum_64(left);
  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(32, 64, sum, 5, DC_MULTIPLIER_1X2);
  dc_store_32xh(dst, stride, 64, vdupq_n_u8(dc));
}

void aom_dc_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_64(above);
  uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(64, 32, sum, 5, DC_MULTIPLIER_1X2);
  dc_store_64xh(dst, stride, 32, vdupq_n_u8(dc));
}

#undef DC_MULTIPLIER_1X2
#undef DC_MULTIPLIER_1X4

#define DC_PREDICTOR_128(w, h, q)                                             \
  void aom_dc_128_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride,  \
                                             const uint8_t *above,            \
                                             const uint8_t *left) {           \
    (void)above;                                                              \
    (void)left;                                                               \
    dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u8(0x80));                 \
  }

DC_PREDICTOR_128(4, 8, )
DC_PREDICTOR_128(4, 16, )
DC_PREDICTOR_128(8, 4, )
DC_PREDICTOR_128(8, 16, )
DC_PREDICTOR_128(8, 32, )
DC_PREDICTOR_128(16, 4, q)
DC_PREDICTOR_128(16, 8, q)
DC_PREDICTOR_128(16, 32, q)
DC_PREDICTOR_128(16, 64, q)
DC_PREDICTOR_128(32, 8, q)
DC_PREDICTOR_128(32, 16, q)
DC_PREDICTOR_128(32, 64, q)
DC_PREDICTOR_128(64, 32, q)
DC_PREDICTOR_128(64, 16, q)

#undef DC_PREDICTOR_128

#define DC_PREDICTOR_LEFT(w, h, shift, q)                                     \
  void aom_dc_left_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
                                              const uint8_t *above,           \
                                              const uint8_t *left) {          \
    (void)above;                                                              \
    const uint16x8_t sum = dc_load_sum_##h(left);                             \
    const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift));                         \
    dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0));            \
  }

DC_PREDICTOR_LEFT(4, 8, 3, )
DC_PREDICTOR_LEFT(8, 4, 2, )
DC_PREDICTOR_LEFT(8, 16, 4, )
DC_PREDICTOR_LEFT(16, 8, 3, q)
DC_PREDICTOR_LEFT(16, 32, 5, q)
DC_PREDICTOR_LEFT(32, 16, 4, q)
DC_PREDICTOR_LEFT(32, 64, 6, q)
DC_PREDICTOR_LEFT(64, 32, 5, q)
DC_PREDICTOR_LEFT(4, 16, 4, )
DC_PREDICTOR_LEFT(16, 4, 2, q)
DC_PREDICTOR_LEFT(8, 32, 5, )
DC_PREDICTOR_LEFT(32, 8, 3, q)
DC_PREDICTOR_LEFT(16, 64, 6, q)
DC_PREDICTOR_LEFT(64, 16, 4, q)

#undef DC_PREDICTOR_LEFT

#define DC_PREDICTOR_TOP(w, h, shift, q)                                      \
  void aom_dc_top_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride,  \
                                             const uint8_t *above,            \
                                             const uint8_t *left) {           \
    (void)left;                                                               \
    const uint16x8_t sum = dc_load_sum_##w(above);                            \
    const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift));                         \
    dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0));            \
  }

DC_PREDICTOR_TOP(4, 8, 2, )
DC_PREDICTOR_TOP(4, 16, 2, )
DC_PREDICTOR_TOP(8, 4, 3, )
DC_PREDICTOR_TOP(8, 16, 3, )
DC_PREDICTOR_TOP(8, 32, 3, )
DC_PREDICTOR_TOP(16, 4, 4, q)
DC_PREDICTOR_TOP(16, 8, 4, q)
DC_PREDICTOR_TOP(16, 32, 4, q)
DC_PREDICTOR_TOP(16, 64, 4, q)
DC_PREDICTOR_TOP(32, 8, 5, q)
DC_PREDICTOR_TOP(32, 16, 5, q)
DC_PREDICTOR_TOP(32, 64, 5, q)
DC_PREDICTOR_TOP(64, 16, 6, q)
DC_PREDICTOR_TOP(64, 32, 6, q)

#undef DC_PREDICTOR_TOP

// -----------------------------------------------------------------------------

static INLINE void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
                               uint8x8_t d0) {
  for (int i = 0; i < h; ++i) {
    store_u8_4x1(dst + i * stride, d0);
  }
}

static INLINE void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
                               uint8x8_t d0) {
  for (int i = 0; i < h; ++i) {
    vst1_u8(dst + i * stride, d0);
  }
}

static INLINE void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x16_t d0) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, d0);
  }
}

static INLINE void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x16_t d0, uint8x16_t d1) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + 0, d0);
    vst1q_u8(dst + 16, d1);
    dst += stride;
  }
}

static INLINE void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x16_t d0, uint8x16_t d1, uint8x16_t d2,
                                uint8x16_t d3) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + 0, d0);
    vst1q_u8(dst + 16, d1);
    vst1q_u8(dst + 32, d2);
    vst1q_u8(dst + 48, d3);
    dst += stride;
  }
}

void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_4xh(dst, stride, 4, load_u8_4x1(above));
}

void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 8, vld1_u8(above));
}

void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 16, vld1q_u8(above));
}

void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 32, d0, d1);
}

void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_4xh(dst, stride, 8, load_u8_4x1(above));
}

void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_4xh(dst, stride, 16, load_u8_4x1(above));
}

void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 4, vld1_u8(above));
}

void aom_v_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 16, vld1_u8(above));
}

void aom_v_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 32, vld1_u8(above));
}

void aom_v_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 4, vld1q_u8(above));
}

void aom_v_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 8, vld1q_u8(above));
}

void aom_v_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 32, vld1q_u8(above));
}

void aom_v_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 64, vld1q_u8(above));
}

void aom_v_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 8, d0, d1);
}

void aom_v_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 16, d0, d1);
}

void aom_v_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 64, d0, d1);
}

void aom_v_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  const uint8x16_t d2 = vld1q_u8(above + 32);
  const uint8x16_t d3 = vld1q_u8(above + 48);
  (void)left;
  v_store_64xh(dst, stride, 16, d0, d1, d2, d3);
}

void aom_v_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  const uint8x16_t d2 = vld1q_u8(above + 32);
  const uint8x16_t d3 = vld1q_u8(above + 48);
  (void)left;
  v_store_64xh(dst, stride, 32, d0, d1, d2, d3);
}

void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  const uint8x16_t d2 = vld1q_u8(above + 32);
  const uint8x16_t d3 = vld1q_u8(above + 48);
  (void)left;
  v_store_64xh(dst, stride, 64, d0, d1, d2, d3);
}

// -----------------------------------------------------------------------------

static INLINE void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
  store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
  store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
  store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
  store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4));
  store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5));
  store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6));
  store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7));
}

static INLINE void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
  vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
  vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
  vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
  vst1_u8(dst + 4 * stride, vdup_lane_u8(d0, 4));
  vst1_u8(dst + 5 * stride, vdup_lane_u8(d0, 5));
  vst1_u8(dst + 6 * stride, vdup_lane_u8(d0, 6));
  vst1_u8(dst + 7 * stride, vdup_lane_u8(d0, 7));
}

static INLINE void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 4 * stride, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 5 * stride, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 6 * stride, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 7 * stride, vdupq_lane_u8(d0, 7));
}

static INLINE void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
}

static INLINE void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 0));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 1));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 2));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 3));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 4));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 5));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 6));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 7));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 7));
}

void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = load_u8_4x1(left);
  (void)above;
  store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
  store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
  store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
  store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
}

void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = vld1_u8(left);
  (void)above;
  h_store_8x8(dst, stride, d0);
}

void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_16x8(dst, stride, vget_low_u8(d0));
  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
}

void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  (void)above;
  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
}

void aom_h_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = vld1_u8(left);
  (void)above;
  h_store_4x8(dst, stride, d0);
}

void aom_h_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_4x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_4x8(dst + 8 * stride, stride, vget_high_u8(d0));
}

void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = load_u8_4x1(left);
  (void)above;
  vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
  vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
  vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
  vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
}

void aom_h_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
}

void aom_h_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  (void)above;
  h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_8x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_8x8(dst + 24 * stride, stride, vget_high_u8(d1));
}

void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = load_u8_4x1(left);
  (void)above;
  vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
}

void aom_h_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = vld1_u8(left);
  (void)above;
  h_store_16x8(dst, stride, d0);
}

void aom_h_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  (void)above;
  h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
}

void aom_h_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  const uint8x16_t d2 = vld1q_u8(left + 32);
  const uint8x16_t d3 = vld1q_u8(left + 48);
  (void)above;
  h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
  h_store_16x8(dst + 32 * stride, stride, vget_low_u8(d2));
  h_store_16x8(dst + 40 * stride, stride, vget_high_u8(d2));
  h_store_16x8(dst + 48 * stride, stride, vget_low_u8(d3));
  h_store_16x8(dst + 56 * stride, stride, vget_high_u8(d3));
}

void aom_h_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = vld1_u8(left);
  (void)above;
  h_store_32x8(dst, stride, d0);
}

void aom_h_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
}

void aom_h_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left + 0);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  const uint8x16_t d2 = vld1q_u8(left + 32);
  const uint8x16_t d3 = vld1q_u8(left + 48);
  (void)above;
  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
  h_store_32x8(dst + 32 * stride, stride, vget_low_u8(d2));
  h_store_32x8(dst + 40 * stride, stride, vget_high_u8(d2));
  h_store_32x8(dst + 48 * stride, stride, vget_low_u8(d3));
  h_store_32x8(dst + 56 * stride, stride, vget_high_u8(d3));
}

void aom_h_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
}

void aom_h_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  for (int i = 0; i < 2; ++i) {
    const uint8x16_t d0 = vld1q_u8(left);
    h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
    h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
    left += 16;
    dst += 16 * stride;
  }
}

void aom_h_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  for (int i = 0; i < 4; ++i) {
    const uint8x16_t d0 = vld1q_u8(left);
    h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
    h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
    left += 16;
    dst += 16 * stride;
  }
}

/* ---------------------P R E D I C T I O N Z 1--------------------------- */

// Low bit depth functions
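// BaseMask[i] has its first i bytes set to 0xff and the remainder set to 0.
// It is used as a blend mask (via vbsl) to keep the interpolated prediction in
// lanes that still fall inside the valid above[] range and to substitute the
// replicated above[max_base_x] sample in lanes that run past it.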
static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
  { 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
};

static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
    int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
    int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((W + H) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
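  // Equivalently, each output pixel is the rounded blend
  //   (above[x] * (32 - shift) + above[x+1] * shift + 16) >> 5,
  // with shift being the fractional position of the prediction ray in 1/32
  // units.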

  const uint8x8_t a_mbase_x = vdup_n_u8(above[max_base_x]);

  int x = dx;
  for (int r = 0; r < W; r++) {
    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base) >> upsample_above;
    if (base_max_diff <= 0) {
      for (int i = r; i < W; ++i) {
        dst[i] = a_mbase_x;  // fill the remaining rows with above[max_base_x]
      }
      return;
    }

    if (base_max_diff > H) base_max_diff = H;

    uint8x8x2_t a01_128;
    uint16x8_t shift;
    if (upsample_above) {
      a01_128 = vld2_u8(above + base);
      shift = vdupq_n_u16(((x << upsample_above) & 0x3f) >> 1);
    } else {
      a01_128.val[0] = vld1_u8(above + base);
      a01_128.val[1] = vld1_u8(above + base + 1);
      shift = vdupq_n_u16((x & 0x3f) >> 1);
    }
    uint16x8_t diff = vsubl_u8(a01_128.val[1], a01_128.val[0]);
    uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a01_128.val[0], vdup_n_u8(32));
    uint16x8_t res = vmlaq_u16(a32, diff, shift);

    uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
    dst[r] = vbsl_u8(mask, vshrn_n_u16(res, 5), a_mbase_x);

    x += dx;
  }
}

static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
  uint8x8_t dstvec[16];

  dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
                                        dx);
  for (int i = 0; i < N; i++) {
    vst1_lane_u32((uint32_t *)(dst + stride * i),
                  vreinterpret_u32_u8(dstvec[i]), 0);
  }
}

static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
  uint8x8_t dstvec[32];

  dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
                                        dx);
  for (int i = 0; i < N; i++) {
    vst1_u8(dst + stride * i, dstvec[i]);
  }
}

static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
    int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
    int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((W + H) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

  const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);

  int x = dx;
  for (int r = 0; r < W; r++) {
    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base) >> upsample_above;
    if (base_max_diff <= 0) {
      for (int i = r; i < W; ++i) {
1241         dst[i] = a_mbase_x;  // Fill the remaining rows with above[max_base_x].
1242 }
1243 return;
1244 }
1245
1246 if (base_max_diff > H) base_max_diff = H;
1247
1248 uint16x8_t shift;
1249 uint8x16_t a0_128, a1_128;
1250 if (upsample_above) {
1251 uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
1252 a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
1253 a1_128 = vextq_u8(a0_128, vdupq_n_u8(0), 8);
1254 shift = vdupq_n_u16(x & 0x1f);
1255 } else {
1256 a0_128 = vld1q_u8(above + base);
1257 a1_128 = vld1q_u8(above + base + 1);
1258 shift = vdupq_n_u16((x & 0x3f) >> 1);
1259 }
1260 uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
1261 uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
1262 uint16x8_t a32_lo =
1263 vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
1264 uint16x8_t a32_hi =
1265 vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
1266 uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
1267 uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
1268 uint8x16_t v_temp =
1269 vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
1270
1271 uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
1272 dst[r] = vbslq_u8(mask, v_temp, a_mbase_x);
1273
1274 x += dx;
1275 }
1276 }
1277
1278 static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1279 const uint8_t *above, int upsample_above,
1280 int dx) {
1281 uint8x16_t dstvec[64];
1282
1283 dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
1284 for (int i = 0; i < N; i++) {
1285 vst1q_u8(dst + stride * i, dstvec[i]);
1286 }
1287 }
1288
1289 static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
1290 int N, uint8x16x2_t *dstvec, const uint8_t *above, int dx) {
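  // Wide-block variant: upsample_above is always 0 here since
  // av1_use_intra_edge_upsample only enables upsampling for small blocks, so
  // frac_bits is fixed at 6.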
1291 const int frac_bits = 6;
1292 const int max_base_x = ((32 + N) - 1);
1293
1294 // pre-filter above pixels
1295 // store in temp buffers:
1296 // above[x] * 32 + 16
1297 // above[x+1] - above[x]
1298 // final pixels will be calculated as:
1299 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1300
1301 const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
1302
1303 int x = dx;
1304 for (int r = 0; r < N; r++) {
1305 int base = x >> frac_bits;
1306 int base_max_diff = (max_base_x - base);
1307 if (base_max_diff <= 0) {
1308 for (int i = r; i < N; ++i) {
1309 dstvec[i].val[0] = a_mbase_x; // save 32 values
1310 dstvec[i].val[1] = a_mbase_x;
1311 }
1312 return;
1313 }
1314 if (base_max_diff > 32) base_max_diff = 32;
1315
1316 uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
1317
1318 uint8x16_t res16[2];
1319 for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
1320 int mdiff = base_max_diff - j;
1321 if (mdiff <= 0) {
1322 res16[jj] = a_mbase_x;
1323 } else {
1324 uint8x16_t a0_128 = vld1q_u8(above + base + j);
1325 uint8x16_t a1_128 = vld1q_u8(above + base + j + 1);
1326 uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
1327 uint16x8_t diff_hi =
1328 vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
1329 uint16x8_t a32_lo =
1330 vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
1331 uint16x8_t a32_hi =
1332 vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
1333 uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
1334 uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
1335
1336 res16[jj] = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
1337 }
1338 }
1339
1340 uint8x16_t mask_lo = vld1q_u8(BaseMask[base_max_diff]);
1341 uint8x16_t mask_hi = vld1q_u8(BaseMask[base_max_diff] + 16);
1342 dstvec[r].val[0] = vbslq_u8(mask_lo, res16[0], a_mbase_x);
1343 dstvec[r].val[1] = vbslq_u8(mask_hi, res16[1], a_mbase_x);
1344 x += dx;
1345 }
1346 }
1347
1348 static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1349 const uint8_t *above, int dx) {
1350 uint8x16x2_t dstvec[64];
1351
1352 dr_prediction_z1_32xN_internal_neon(N, dstvec, above, dx);
1353 for (int i = 0; i < N; i++) {
1354 vst1q_u8(dst + stride * i, dstvec[i].val[0]);
1355 vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
1356 }
1357 }
1358
1359 // clang-format off
1360 static const uint8_t kLoadMaxShuffles[] = {
1361 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1362 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1363 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1364 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1365 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1366 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1367 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1368 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1369 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15,
1370 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15,
1371 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15,
1372 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15,
1373 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15,
1374 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15,
1375 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15,
1376 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1377 };
1378 // clang-format on
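// Row i of this table, used as a TBL index vector on the 16 bytes ending at
// above[max_base_x], keeps the i samples immediately before above[max_base_x]
// in its low lanes and replicates above[max_base_x] into every remaining
// lane. This lets the 64-wide zone-1 predictor clamp its loads without
// reading past the end of the above buffer.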
1379
1380 static INLINE uint8x16_t z1_load_masked_neon(const uint8_t *ptr,
1381 int shuffle_idx) {
1382 uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
1383 uint8x16_t src = vld1q_u8(ptr);
1384 #if AOM_ARCH_AARCH64
1385 return vqtbl1q_u8(src, shuffle);
1386 #else
1387 uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
1388 uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
1389 uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
1390 return vcombine_u8(lo, hi);
1391 #endif
1392 }
1393
1394 static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1395 const uint8_t *above, int dx) {
1396 const int frac_bits = 6;
1397 const int max_base_x = ((64 + N) - 1);
1398
1399 // pre-filter above pixels
1400 // store in temp buffers:
1401 // above[x] * 32 + 16
1402 // above[x+1] - above[x]
1403 // final pixels will be calculated as:
1404 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1405
1406 const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
1407
1408 int x = dx;
1409 for (int r = 0; r < N; r++, dst += stride) {
1410 int base = x >> frac_bits;
1411 if (base >= max_base_x) {
1412 for (int i = r; i < N; ++i) {
1413 vst1q_u8(dst, a_mbase_x);
1414 vst1q_u8(dst + 16, a_mbase_x);
1415 vst1q_u8(dst + 32, a_mbase_x);
1416 vst1q_u8(dst + 48, a_mbase_x);
1417 dst += stride;
1418 }
1419 return;
1420 }
1421
1422 uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
1423 uint8x16_t base_inc128 =
1424 vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
1425 vcreate_u8(0x0F0E0D0C0B0A0908)));
1426
1427 for (int j = 0; j < 64; j += 16) {
1428 if (base + j >= max_base_x) {
1429 vst1q_u8(dst + j, a_mbase_x);
1430 } else {
1431 uint8x16_t a0_128;
1432 uint8x16_t a1_128;
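        // Near the end of the above array, load the 16 bytes ending at
        // above[max_base_x] and shuffle them into place, so lanes past
        // max_base_x replicate above[max_base_x] and the load never reads
        // beyond the buffer.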
1433 if (base + j + 15 >= max_base_x) {
1434 int shuffle_idx = max_base_x - base - j;
1435 a0_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
1436 } else {
1437 a0_128 = vld1q_u8(above + base + j);
1438 }
1439 if (base + j + 16 >= max_base_x) {
1440 int shuffle_idx = max_base_x - base - j - 1;
1441 a1_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
1442 } else {
1443 a1_128 = vld1q_u8(above + base + j + 1);
1444 }
1445
1446 uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
1447 uint16x8_t diff_hi =
1448 vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
1449 uint16x8_t a32_lo =
1450 vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
1451 uint16x8_t a32_hi =
1452 vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
1453 uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
1454 uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
1455 vst1q_u8(dst + j,
1456 vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)));
1457
1458 base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16));
1459 }
1460 }
1461 x += dx;
1462 }
1463 }
1464
1465 // Directional prediction, zone 1: 0 < angle < 90
1466 void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
1467 const uint8_t *above, const uint8_t *left,
1468 int upsample_above, int dx, int dy) {
1469 (void)left;
1470 (void)dy;
1471
1472 switch (bw) {
1473 case 4:
1474 dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
1475 break;
1476 case 8:
1477 dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
1478 break;
1479 case 16:
1480 dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
1481 break;
1482 case 32: dr_prediction_z1_32xN_neon(bh, dst, stride, above, dx); break;
1483 case 64: dr_prediction_z1_64xN_neon(bh, dst, stride, above, dx); break;
1484 default: break;
1485 }
1486 }
1487
1488 /* ---------------------P R E D I C T I O N Z 2--------------------------- */
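// Zone 2 interpolates from both edges: columns whose projected source
// position falls inside the above row are predicted from above[], the
// remaining columns from left[], and each output row is assembled from the
// two partial results with a BaseMask select.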
1489
1490 // TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed.
1491 #if AOM_ARCH_AARCH64
1492 #if !AOM_ARCH_AARCH64
1493 static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = {
1494 { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1495 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
1496 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
1497 0, 0, 0 },
1498 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1499 0xff, 0xff, 0xff, 0xff }
1500 };
1501 #endif // !AOM_ARCH_AARCH64
1502
1503 static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon(
1504 const uint8_t *above, int upsample_above, int dx, int base_x, int y,
1505 uint8x8_t *a0_x, uint8x8_t *a1_x, uint16x4_t *shift0) {
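  // r6 holds c << 6 for columns c = 0..3; subtracting y * dx gives each
  // column's position along the above edge in 1/64-pel units, from which the
  // interpolation shift is derived.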
1506 uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
1507 uint16x4_t ydx = vdup_n_u16(y * dx);
1508 if (upsample_above) {
1509 // Cannot use LD2 here since we only want to load eight bytes, but LD2 can
1510     // only load either 16 or 32 bytes.
1511 uint8x8_t v_tmp = vld1_u8(above + base_x);
1512 *a0_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[0];
1513 *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1];
1514 *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f));
1515 } else {
1516 *a0_x = load_u8_4x1(above + base_x);
1517 *a1_x = load_u8_4x1(above + base_x + 1);
1518 *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f));
1519 }
1520 }
1521
1522 static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_left_neon(
1523 #if AOM_ARCH_AARCH64
1524 uint8x16x2_t left_vals,
1525 #else
1526 const uint8_t *left,
1527 #endif
1528 int upsample_left, int dy, int r, int min_base_y, int frac_bits_y,
1529 uint16x4_t *a0_y, uint16x4_t *a1_y, uint16x4_t *shift1) {
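  // For each of the four output pixels compute y_c = (r << 6) - c * dy
  // (c = 1..4), take base_y = y_c >> frac_bits_y clamped to min_base_y, and
  // gather left[base_y] and left[base_y + 1]: via TBL on preloaded vectors on
  // AArch64, or scalar lane loads otherwise.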
1530 int16x4_t dy64 = vdup_n_s16(dy);
1531 int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
1532 int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
1533 int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
1534 int16x4_t v_r6 = vdup_n_s16(r << 6);
1535 int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
1536 int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
1537
1538 // Values in base_y_c64 range from -2 through 14 inclusive.
1539 base_y_c64 = vmax_s16(base_y_c64, min_base_y64);
1540
1541 #if AOM_ARCH_AARCH64
1542 uint8x8_t left_idx0 =
1543 vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2))); // [0, 16]
1544 uint8x8_t left_idx1 =
1545 vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3))); // [1, 17]
1546
1547 *a0_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx0));
1548 *a1_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx1));
1549 #else // !AOM_ARCH_AARCH64
1550 DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
1551
1552 vst1_s16(base_y_c, base_y_c64);
1553 uint8x8_t a0_y_u8 = vdup_n_u8(0);
1554 a0_y_u8 = vld1_lane_u8(left + base_y_c[0], a0_y_u8, 0);
1555 a0_y_u8 = vld1_lane_u8(left + base_y_c[1], a0_y_u8, 2);
1556 a0_y_u8 = vld1_lane_u8(left + base_y_c[2], a0_y_u8, 4);
1557 a0_y_u8 = vld1_lane_u8(left + base_y_c[3], a0_y_u8, 6);
1558
1559 base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1));
1560 vst1_s16(base_y_c, base_y_c64);
1561 uint8x8_t a1_y_u8 = vdup_n_u8(0);
1562 a1_y_u8 = vld1_lane_u8(left + base_y_c[0], a1_y_u8, 0);
1563 a1_y_u8 = vld1_lane_u8(left + base_y_c[1], a1_y_u8, 2);
1564 a1_y_u8 = vld1_lane_u8(left + base_y_c[2], a1_y_u8, 4);
1565 a1_y_u8 = vld1_lane_u8(left + base_y_c[3], a1_y_u8, 6);
1566
1567 *a0_y = vreinterpret_u16_u8(a0_y_u8);
1568 *a1_y = vreinterpret_u16_u8(a1_y_u8);
1569 #endif // AOM_ARCH_AARCH64
1570
1571 if (upsample_left) {
1572 *shift1 = vand_u16(vreinterpret_u16_s16(y_c64), vdup_n_u16(0x1f));
1573 } else {
1574 *shift1 =
1575 vand_u16(vshr_n_u16(vreinterpret_u16_s16(y_c64), 1), vdup_n_u16(0x1f));
1576 }
1577 }
1578
1579 static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_above_neon(
1580 const uint8_t *above, int upsample_above, int dx, int base_x, int y) {
1581 uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
1582 vcreate_u16(0x0008000700060005));
1583 uint16x8_t ydx = vdupq_n_u16(y * dx);
1584 uint16x8_t r6 = vshlq_n_u16(vextq_u16(c1234, vdupq_n_u16(0), 2), 6);
1585
1586 uint16x8_t shift0;
1587 uint8x8_t a0_x0;
1588 uint8x8_t a1_x0;
1589 if (upsample_above) {
1590 uint8x8x2_t v_tmp = vld2_u8(above + base_x);
1591 a0_x0 = v_tmp.val[0];
1592 a1_x0 = v_tmp.val[1];
1593 shift0 = vandq_u16(vsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
1594 } else {
1595 a0_x0 = vld1_u8(above + base_x);
1596 a1_x0 = vld1_u8(above + base_x + 1);
1597 shift0 = vandq_u16(vhsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
1598 }
1599
1600 uint16x8_t diff0 = vsubl_u8(a1_x0, a0_x0); // a[x+1] - a[x]
1601 uint16x8_t a32 =
1602 vmlal_u8(vdupq_n_u16(16), a0_x0, vdup_n_u8(32)); // a[x] * 32 + 16
1603 uint16x8_t res = vmlaq_u16(a32, diff0, shift0);
1604 return vshrn_n_u16(res, 5);
1605 }
1606
1607 static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_left_neon(
1608 #if AOM_ARCH_AARCH64
1609 uint8x16x3_t left_vals,
1610 #else
1611 const uint8_t *left,
1612 #endif
1613 int upsample_left, int dy, int r, int min_base_y, int frac_bits_y) {
1614 int16x8_t v_r6 = vdupq_n_s16(r << 6);
1615 int16x8_t dy128 = vdupq_n_s16(dy);
1616 int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
1617 int16x8_t min_base_y128 = vdupq_n_s16(min_base_y);
1618
1619 uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
1620 vcreate_u16(0x0008000700060005));
1621 int16x8_t y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
1622 int16x8_t base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
1623
1624 // Values in base_y_c128 range from -2 through 31 inclusive.
1625 base_y_c128 = vmaxq_s16(base_y_c128, min_base_y128);
1626
1627 #if AOM_ARCH_AARCH64
1628 uint8x16_t left_idx0 =
1629 vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33]
1630 uint8x16_t left_idx1 =
1631 vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34]
1632 uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
1633
1634 uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
1635 uint8x8_t a0_x1 = vget_low_u8(a01_x);
1636 uint8x8_t a1_x1 = vget_high_u8(a01_x);
1637 #else // !AOM_ARCH_AARCH64
1638 uint8x8_t a0_x1 = load_u8_gather_s16_x8(left, base_y_c128);
1639 uint8x8_t a1_x1 = load_u8_gather_s16_x8(left + 1, base_y_c128);
1640 #endif // AOM_ARCH_AARCH64
1641
1642 uint16x8_t shift1;
1643 if (upsample_left) {
1644 shift1 = vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x1f));
1645 } else {
1646 shift1 = vshrq_n_u16(
1647 vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x3f)), 1);
1648 }
1649
1650 uint16x8_t diff1 = vsubl_u8(a1_x1, a0_x1);
1651 uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x1, vdup_n_u8(32));
1652 uint16x8_t res = vmlaq_u16(a32, diff1, shift1);
1653 return vshrn_n_u16(res, 5);
1654 }
1655
1656 static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_above_neon(
1657 const uint8_t *above, int dx, int base_x, int y, int j) {
1658 uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
1659 vcreate_u16(0x0007000600050004)),
1660 vcombine_u16(vcreate_u16(0x000B000A00090008),
1661 vcreate_u16(0x000F000E000D000C)) } };
1662 uint16x8_t j256 = vdupq_n_u16(j);
1663 uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx));
1664
1665 const uint8x16_t a0_x128 = vld1q_u8(above + base_x + j);
1666 const uint8x16_t a1_x128 = vld1q_u8(above + base_x + j + 1);
1667 uint16x8_t res6_0 = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
1668 uint16x8_t res6_1 = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
1669 uint16x8_t shift0 =
1670 vshrq_n_u16(vandq_u16(vsubq_u16(res6_0, ydx), vdupq_n_u16(0x3f)), 1);
1671 uint16x8_t shift1 =
1672 vshrq_n_u16(vandq_u16(vsubq_u16(res6_1, ydx), vdupq_n_u16(0x3f)), 1);
1673 // a[x+1] - a[x]
1674 uint16x8_t diff0 = vsubl_u8(vget_low_u8(a1_x128), vget_low_u8(a0_x128));
1675 uint16x8_t diff1 = vsubl_u8(vget_high_u8(a1_x128), vget_high_u8(a0_x128));
1676 // a[x] * 32 + 16
1677 uint16x8_t a32_0 =
1678 vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_x128), vdup_n_u8(32));
1679 uint16x8_t a32_1 =
1680 vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_x128), vdup_n_u8(32));
1681 uint16x8_t res0 = vmlaq_u16(a32_0, diff0, shift0);
1682 uint16x8_t res1 = vmlaq_u16(a32_1, diff1, shift1);
1683 return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
1684 }
1685
1686 static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_left_neon(
1687 #if AOM_ARCH_AARCH64
1688 uint8x16x4_t left_vals0, uint8x16x4_t left_vals1,
1689 #else
1690 const uint8_t *left,
1691 #endif
1692 int dy, int r, int j) {
1693 // here upsample_above and upsample_left are 0 by design of
1694 // av1_use_intra_edge_upsample
1695 const int min_base_y = -1;
1696
1697 int16x8_t min_base_y256 = vdupq_n_s16(min_base_y);
1698 int16x8_t half_min_base_y256 = vdupq_n_s16(min_base_y >> 1);
1699 int16x8_t dy256 = vdupq_n_s16(dy);
1700 uint16x8_t j256 = vdupq_n_u16(j);
1701
1702 uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
1703 vcreate_u16(0x0007000600050004)),
1704 vcombine_u16(vcreate_u16(0x000B000A00090008),
1705 vcreate_u16(0x000F000E000D000C)) } };
1706 uint16x8x2_t c1234 = { { vaddq_u16(c0123.val[0], vdupq_n_u16(1)),
1707 vaddq_u16(c0123.val[1], vdupq_n_u16(1)) } };
1708
1709 int16x8_t v_r6 = vdupq_n_s16(r << 6);
1710
1711 int16x8_t c256_0 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[0]));
1712 int16x8_t c256_1 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[1]));
1713 int16x8_t mul16_lo = vreinterpretq_s16_u16(
1714 vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_0, dy256)),
1715 vreinterpretq_u16_s16(half_min_base_y256)));
1716 int16x8_t mul16_hi = vreinterpretq_s16_u16(
1717 vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_1, dy256)),
1718 vreinterpretq_u16_s16(half_min_base_y256)));
1719 int16x8_t y_c256_lo = vsubq_s16(v_r6, mul16_lo);
1720 int16x8_t y_c256_hi = vsubq_s16(v_r6, mul16_hi);
1721
1722 int16x8_t base_y_c256_lo = vshrq_n_s16(y_c256_lo, 6);
1723 int16x8_t base_y_c256_hi = vshrq_n_s16(y_c256_hi, 6);
1724
1725 base_y_c256_lo = vmaxq_s16(min_base_y256, base_y_c256_lo);
1726 base_y_c256_hi = vmaxq_s16(min_base_y256, base_y_c256_hi);
1727
1728 #if !AOM_ARCH_AARCH64
1729 int16_t min_y = vgetq_lane_s16(base_y_c256_hi, 7);
1730 int16_t max_y = vgetq_lane_s16(base_y_c256_lo, 0);
1731 int16_t offset_diff = max_y - min_y;
1732
1733 uint8x8_t a0_y0;
1734 uint8x8_t a0_y1;
1735 uint8x8_t a1_y0;
1736 uint8x8_t a1_y1;
1737 if (offset_diff < 16) {
1738 // Avoid gathers where the data we want is close together in memory.
1739 // We don't need this for AArch64 since we can already use TBL to cover the
1740 // full range of possible values.
1741 assert(offset_diff >= 0);
1742 int16x8_t min_y256 = vdupq_lane_s16(vget_high_s16(base_y_c256_hi), 3);
1743
1744 int16x8x2_t base_y_offset;
1745 base_y_offset.val[0] = vsubq_s16(base_y_c256_lo, min_y256);
1746 base_y_offset.val[1] = vsubq_s16(base_y_c256_hi, min_y256);
1747
1748 int8x16_t base_y_offset128 = vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
1749 vqmovn_s16(base_y_offset.val[1]));
1750
1751 uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
1752 uint8x16_t a0_y128 = vld1q_u8(left + min_y);
1753 uint8x16_t a1_y128 = vld1q_u8(left + min_y + 1);
1754 a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
1755 a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
1756
1757 uint8x8_t v_index_low = vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
1758 uint8x8_t v_index_high =
1759 vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
1760 uint8x8x2_t v_tmp, v_res;
1761 v_tmp.val[0] = vget_low_u8(a0_y128);
1762 v_tmp.val[1] = vget_high_u8(a0_y128);
1763 v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1764 v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1765 a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1766 v_tmp.val[0] = vget_low_u8(a1_y128);
1767 v_tmp.val[1] = vget_high_u8(a1_y128);
1768 v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1769 v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1770 a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1771
1772 a0_y0 = vget_low_u8(a0_y128);
1773 a0_y1 = vget_high_u8(a0_y128);
1774 a1_y0 = vget_low_u8(a1_y128);
1775 a1_y1 = vget_high_u8(a1_y128);
1776 } else {
1777 a0_y0 = load_u8_gather_s16_x8(left, base_y_c256_lo);
1778 a0_y1 = load_u8_gather_s16_x8(left, base_y_c256_hi);
1779 a1_y0 = load_u8_gather_s16_x8(left + 1, base_y_c256_lo);
1780 a1_y1 = load_u8_gather_s16_x8(left + 1, base_y_c256_hi);
1781 }
1782 #else
1783 // Values in left_idx{0,1} range from 0 through 63 inclusive.
1784 uint8x16_t left_idx0 =
1785 vreinterpretq_u8_s16(vaddq_s16(base_y_c256_lo, vdupq_n_s16(1)));
1786 uint8x16_t left_idx1 =
1787 vreinterpretq_u8_s16(vaddq_s16(base_y_c256_hi, vdupq_n_s16(1)));
1788 uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
1789
1790 uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01);
1791 uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01);
1792
1793 uint8x8_t a0_y0 = vget_low_u8(a0_y01);
1794 uint8x8_t a0_y1 = vget_high_u8(a0_y01);
1795 uint8x8_t a1_y0 = vget_low_u8(a1_y01);
1796 uint8x8_t a1_y1 = vget_high_u8(a1_y01);
1797 #endif // !AOM_ARCH_AARCH64
1798
1799 uint16x8_t shifty_lo = vshrq_n_u16(
1800 vandq_u16(vreinterpretq_u16_s16(y_c256_lo), vdupq_n_u16(0x3f)), 1);
1801 uint16x8_t shifty_hi = vshrq_n_u16(
1802 vandq_u16(vreinterpretq_u16_s16(y_c256_hi), vdupq_n_u16(0x3f)), 1);
1803
1804 // a[x+1] - a[x]
1805 uint16x8_t diff_lo = vsubl_u8(a1_y0, a0_y0);
1806 uint16x8_t diff_hi = vsubl_u8(a1_y1, a0_y1);
1807 // a[x] * 32 + 16
1808 uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), a0_y0, vdup_n_u8(32));
1809 uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), a0_y1, vdup_n_u8(32));
1810
1811 uint16x8_t res0 = vmlaq_u16(a32_lo, diff_lo, shifty_lo);
1812 uint16x8_t res1 = vmlaq_u16(a32_hi, diff_hi, shifty_hi);
1813
1814 return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
1815 }
1816
1817 static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
1818 const uint8_t *above, const uint8_t *left,
1819 int upsample_above, int upsample_left,
1820 int dx, int dy) {
1821 const int min_base_x = -(1 << upsample_above);
1822 const int min_base_y = -(1 << upsample_left);
1823 const int frac_bits_x = 6 - upsample_above;
1824 const int frac_bits_y = 6 - upsample_left;
1825
1826 assert(dx > 0);
1827 // pre-filter above pixels
1828 // store in temp buffers:
1829 // above[x] * 32 + 16
1830 // above[x+1] - above[x]
1831 // final pixels will be calculated as:
1832 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1833
1834 #if AOM_ARCH_AARCH64
1835 // Use ext rather than loading left + 14 directly to avoid over-read.
1836 const uint8x16_t left_m2 = vld1q_u8(left - 2);
1837 const uint8x16_t left_0 = vld1q_u8(left);
1838 const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14);
1839 const uint8x16x2_t left_vals = { { left_m2, left_14 } };
1840 #define LEFT left_vals
1841 #else // !AOM_ARCH_AARCH64
1842 #define LEFT left
1843 #endif // AOM_ARCH_AARCH64
1844
1845 for (int r = 0; r < N; r++) {
1846 int y = r + 1;
1847 int base_x = (-y * dx) >> frac_bits_x;
1848 const int base_min_diff =
1849 (min_base_x - ((-y * dx) >> frac_bits_x) + upsample_above) >>
1850 upsample_above;
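    // base_min_diff is the number of pixels in this row that project to the
    // left of the above edge and must therefore be predicted from the left
    // column; the rest come from the above row.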
1851
1852 if (base_min_diff <= 0) {
1853 uint8x8_t a0_x_u8, a1_x_u8;
1854 uint16x4_t shift0;
1855 dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
1856 &a0_x_u8, &a1_x_u8, &shift0);
1857 uint8x8_t a0_x = a0_x_u8;
1858 uint8x8_t a1_x = a1_x_u8;
1859
1860 uint16x8_t diff = vsubl_u8(a1_x, a0_x); // a[x+1] - a[x]
1861 uint16x8_t a32 =
1862 vmlal_u8(vdupq_n_u16(16), a0_x, vdup_n_u8(32)); // a[x] * 32 + 16
1863 uint16x8_t res =
1864 vmlaq_u16(a32, diff, vcombine_u16(shift0, vdup_n_u16(0)));
1865 uint8x8_t resx = vshrn_n_u16(res, 5);
1866 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resx), 0);
1867 } else if (base_min_diff < 4) {
1868 uint8x8_t a0_x_u8, a1_x_u8;
1869 uint16x4_t shift0;
1870 dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
1871 &a0_x_u8, &a1_x_u8, &shift0);
1872 uint16x8_t a0_x = vmovl_u8(a0_x_u8);
1873 uint16x8_t a1_x = vmovl_u8(a1_x_u8);
1874
1875 uint16x4_t a0_y;
1876 uint16x4_t a1_y;
1877 uint16x4_t shift1;
1878 dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
1879 frac_bits_y, &a0_y, &a1_y, &shift1);
1880 a0_x = vcombine_u16(vget_low_u16(a0_x), a0_y);
1881 a1_x = vcombine_u16(vget_low_u16(a1_x), a1_y);
1882
1883 uint16x8_t shift = vcombine_u16(shift0, shift1);
1884 uint16x8_t diff = vsubq_u16(a1_x, a0_x); // a[x+1] - a[x]
1885 uint16x8_t a32 =
1886 vmlaq_n_u16(vdupq_n_u16(16), a0_x, 32); // a[x] * 32 + 16
1887 uint16x8_t res = vmlaq_u16(a32, diff, shift);
1888 uint8x8_t resx = vshrn_n_u16(res, 5);
1889 uint8x8_t resy = vext_u8(resx, vdup_n_u8(0), 4);
1890
1891 uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1892 uint8x8_t v_resxy = vbsl_u8(mask, resy, resx);
1893 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
1894 } else {
1895 uint16x4_t a0_y, a1_y;
1896 uint16x4_t shift1;
1897 dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
1898 frac_bits_y, &a0_y, &a1_y, &shift1);
1899 uint16x4_t diff = vsub_u16(a1_y, a0_y); // a[x+1] - a[x]
1900 uint16x4_t a32 = vmla_n_u16(vdup_n_u16(16), a0_y, 32); // a[x] * 32 + 16
1901 uint16x4_t res = vmla_u16(a32, diff, shift1);
1902 uint8x8_t resy = vshrn_n_u16(vcombine_u16(res, vdup_n_u16(0)), 5);
1903
1904 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resy), 0);
1905 }
1906
1907 dst += stride;
1908 }
1909 #undef LEFT
1910 }
1911
1912 static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
1913 const uint8_t *above, const uint8_t *left,
1914 int upsample_above, int upsample_left,
1915 int dx, int dy) {
1916 const int min_base_x = -(1 << upsample_above);
1917 const int min_base_y = -(1 << upsample_left);
1918 const int frac_bits_x = 6 - upsample_above;
1919 const int frac_bits_y = 6 - upsample_left;
1920
1921 // pre-filter above pixels
1922 // store in temp buffers:
1923 // above[x] * 32 + 16
1924 // above[x+1] - above[x]
1925 // final pixels will be calculated as:
1926 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1927
1928 #if AOM_ARCH_AARCH64
1929 // Use ext rather than loading left + 30 directly to avoid over-read.
1930 const uint8x16_t left_m2 = vld1q_u8(left - 2);
1931 const uint8x16_t left_0 = vld1q_u8(left + 0);
1932 const uint8x16_t left_16 = vld1q_u8(left + 16);
1933 const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14);
1934 const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14);
1935 const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } };
1936 #define LEFT left_vals
1937 #else // !AOM_ARCH_AARCH64
1938 #define LEFT left
1939 #endif // AOM_ARCH_AARCH64
1940
1941 for (int r = 0; r < N; r++) {
1942 int y = r + 1;
1943 int base_x = (-y * dx) >> frac_bits_x;
1944 int base_min_diff =
1945 (min_base_x - base_x + upsample_above) >> upsample_above;
1946
1947 if (base_min_diff <= 0) {
1948 uint8x8_t resx =
1949 dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
1950 vst1_u8(dst, resx);
1951 } else if (base_min_diff < 8) {
1952 uint8x8_t resx =
1953 dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
1954 uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
1955 LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
1956 uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1957 uint8x8_t resxy = vbsl_u8(mask, resy, resx);
1958 vst1_u8(dst, resxy);
1959 } else {
1960 uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
1961 LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
1962 vst1_u8(dst, resy);
1963 }
1964
1965 dst += stride;
1966 }
1967 #undef LEFT
1968 }
1969
1970 static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
1971 ptrdiff_t stride, const uint8_t *above,
1972 const uint8_t *left, int dx, int dy) {
1973 // here upsample_above and upsample_left are 0 by design of
1974 // av1_use_intra_edge_upsample
1975 const int min_base_x = -1;
1976
1977 #if AOM_ARCH_AARCH64
1978 const uint8x16_t left_m1 = vld1q_u8(left - 1);
1979 const uint8x16_t left_0 = vld1q_u8(left + 0);
1980 const uint8x16_t left_16 = vld1q_u8(left + 16);
1981 const uint8x16_t left_32 = vld1q_u8(left + 32);
1982 const uint8x16_t left_48 = vld1q_u8(left + 48);
1983 const uint8x16_t left_15 = vextq_u8(left_0, left_16, 15);
1984 const uint8x16_t left_31 = vextq_u8(left_16, left_32, 15);
1985 const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15);
1986 const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } };
1987 const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } };
1988 #define LEFT left_vals0, left_vals1
1989 #else // !AOM_ARCH_AARCH64
1990 #define LEFT left
1991 #endif // AOM_ARCH_AARCH64
1992
1993 for (int r = 0; r < H; r++) {
1994 int y = r + 1;
1995 int base_x = (-y * dx) >> 6;
1996 for (int j = 0; j < W; j += 16) {
1997 const int base_min_diff = min_base_x - base_x - j;
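      // Number of lanes in this 16-wide chunk that must be predicted from the
      // left column rather than the above row.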
1998
1999 if (base_min_diff <= 0) {
2000 uint8x16_t resx =
2001 dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
2002 vst1q_u8(dst + j, resx);
2003 } else if (base_min_diff < 16) {
2004 uint8x16_t resx =
2005 dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
2006 uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
2007 uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
2008 uint8x16_t resxy = vbslq_u8(mask, resy, resx);
2009 vst1q_u8(dst + j, resxy);
2010 } else {
2011 uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
2012 vst1q_u8(dst + j, resy);
2013 }
2014 } // for j
2015 dst += stride;
2016 }
2017 #undef LEFT
2018 }
2019
2020 // Directional prediction, zone 2: 90 < angle < 180
2021 void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
2022 const uint8_t *above, const uint8_t *left,
2023 int upsample_above, int upsample_left, int dx,
2024 int dy) {
2025 assert(dx > 0);
2026 assert(dy > 0);
2027
2028 switch (bw) {
2029 case 4:
2030 dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
2031 upsample_left, dx, dy);
2032 break;
2033 case 8:
2034 dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
2035 upsample_left, dx, dy);
2036 break;
2037 default:
2038 dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, dx, dy);
2039 break;
2040 }
2041 }
2042 #endif // AOM_ARCH_AARCH64
2043
2044 /* ---------------------P R E D I C T I O N Z 3--------------------------- */
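// Zone 3 (180 < angle < 270) is implemented by running the zone-1 predictor
// along the left edge into a temporary (transposed) block and then
// transposing the result into the destination.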
2045
2046 static AOM_FORCE_INLINE void z3_transpose_arrays_u8_16x4(const uint8x16_t *x,
2047 uint8x16x2_t *d) {
2048 uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
2049 uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
2050
2051 d[0] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
2052 vreinterpretq_u16_u8(w1.val[0])));
2053 d[1] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
2054 vreinterpretq_u16_u8(w1.val[1])));
2055 }
2056
2057 static AOM_FORCE_INLINE void z3_transpose_arrays_u8_4x4(const uint8x8_t *x,
2058 uint8x8x2_t *d) {
2059 uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
2060 uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
2061
2062 *d = aom_reinterpret_u8_u16_x2(
2063 vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
2064 }
2065
2066 static AOM_FORCE_INLINE void z3_transpose_arrays_u8_8x4(const uint8x8_t *x,
2067 uint8x8x2_t *d) {
2068 uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
2069 uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
2070
2071 d[0] = aom_reinterpret_u8_u16_x2(
2072 vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
2073 d[1] = aom_reinterpret_u8_u16_x2(
2074 vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])));
2075 }
2076
2077 static void z3_transpose_arrays_u8_16x16(const uint8_t *src, ptrdiff_t pitchSrc,
2078 uint8_t *dst, ptrdiff_t pitchDst) {
2079 // The same as the normal transposes in transpose_neon.h, but with a stride
2080 // between consecutive vectors of elements.
2081 uint8x16_t r[16];
2082 uint8x16_t d[16];
2083 for (int i = 0; i < 16; i++) {
2084 r[i] = vld1q_u8(src + i * pitchSrc);
2085 }
2086 transpose_arrays_u8_16x16(r, d);
2087 for (int i = 0; i < 16; i++) {
2088 vst1q_u8(dst + i * pitchDst, d[i]);
2089 }
2090 }
2091
2092 static void z3_transpose_arrays_u8_16nx16n(const uint8_t *src,
2093 ptrdiff_t pitchSrc, uint8_t *dst,
2094 ptrdiff_t pitchDst, int width,
2095 int height) {
2096 for (int j = 0; j < height; j += 16) {
2097 for (int i = 0; i < width; i += 16) {
2098 z3_transpose_arrays_u8_16x16(src + i * pitchSrc + j, pitchSrc,
2099 dst + j * pitchDst + i, pitchDst);
2100 }
2101 }
2102 }
2103
2104 static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
2105 const uint8_t *left, int upsample_left,
2106 int dy) {
2107 uint8x8_t dstvec[4];
2108 uint8x8x2_t dest;
2109
2110 dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
2111 z3_transpose_arrays_u8_4x4(dstvec, &dest);
2112 store_u8x4_strided_x2(dst + stride * 0, stride, dest.val[0]);
2113 store_u8x4_strided_x2(dst + stride * 2, stride, dest.val[1]);
2114 }
2115
2116 static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
2117 const uint8_t *left, int upsample_left,
2118 int dy) {
2119 uint8x8_t dstvec[8];
2120 uint8x8_t d[8];
2121
2122 dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
2123 transpose_arrays_u8_8x8(dstvec, d);
2124 store_u8_8x8(dst, stride, d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
2125 }
2126
2127 static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
2128 const uint8_t *left, int upsample_left,
2129 int dy) {
2130 uint8x8_t dstvec[4];
2131 uint8x8x2_t d[2];
2132
2133 dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
2134 z3_transpose_arrays_u8_8x4(dstvec, d);
2135 store_u8x4_strided_x2(dst + stride * 0, stride, d[0].val[0]);
2136 store_u8x4_strided_x2(dst + stride * 2, stride, d[0].val[1]);
2137 store_u8x4_strided_x2(dst + stride * 4, stride, d[1].val[0]);
2138 store_u8x4_strided_x2(dst + stride * 6, stride, d[1].val[1]);
2139 }
2140
2141 static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
2142 const uint8_t *left, int upsample_left,
2143 int dy) {
2144 uint8x8_t dstvec[8];
2145 uint8x8_t d[8];
2146
2147 dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
2148 transpose_arrays_u8_8x8(dstvec, d);
2149 store_u8_8x4(dst, stride, d[0], d[1], d[2], d[3]);
2150 }
2151
2152 static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
2153 const uint8_t *left, int upsample_left,
2154 int dy) {
2155 uint8x16_t dstvec[8];
2156 uint8x8_t d[16];
2157
2158 dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
2159 transpose_arrays_u8_16x8(dstvec, d);
2160 for (int i = 0; i < 16; i++) {
2161 vst1_u8(dst + i * stride, d[i]);
2162 }
2163 }
2164
2165 static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
2166 const uint8_t *left, int upsample_left,
2167 int dy) {
2168 uint8x8_t dstvec[16];
2169 uint8x16_t d[8];
2170
2171 dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
2172 transpose_arrays_u8_8x16(dstvec, d);
2173 for (int i = 0; i < 8; i++) {
2174 vst1q_u8(dst + i * stride, d[i]);
2175 }
2176 }
2177
2178 static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
2179 const uint8_t *left, int upsample_left,
2180 int dy) {
2181 uint8x16_t dstvec[4];
2182 uint8x16x2_t d[2];
2183
2184 dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
2185 z3_transpose_arrays_u8_16x4(dstvec, d);
2186 store_u8x4_strided_x4(dst + stride * 0, stride, d[0].val[0]);
2187 store_u8x4_strided_x4(dst + stride * 4, stride, d[0].val[1]);
2188 store_u8x4_strided_x4(dst + stride * 8, stride, d[1].val[0]);
2189 store_u8x4_strided_x4(dst + stride * 12, stride, d[1].val[1]);
2190 }
2191
2192 static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
2193 const uint8_t *left, int upsample_left,
2194 int dy) {
2195 uint8x8_t dstvec[16];
2196 uint8x16_t d[8];
2197
2198 dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
2199 transpose_arrays_u8_8x16(dstvec, d);
2200 for (int i = 0; i < 4; i++) {
2201 vst1q_u8(dst + i * stride, d[i]);
2202 }
2203 }
2204
2205 static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
2206 const uint8_t *left, int upsample_left,
2207 int dy) {
2208 (void)upsample_left;
2209 uint8x16x2_t dstvec[16];
2210 uint8x16_t d[32];
2211 uint8x16_t v_zero = vdupq_n_u8(0);
2212
2213 dr_prediction_z1_32xN_internal_neon(8, dstvec, left, dy);
2214 for (int i = 8; i < 16; i++) {
2215 dstvec[i].val[0] = v_zero;
2216 dstvec[i].val[1] = v_zero;
2217 }
2218 transpose_arrays_u8_32x16(dstvec, d);
2219 for (int i = 0; i < 32; i++) {
2220 vst1_u8(dst + i * stride, vget_low_u8(d[i]));
2221 }
2222 }
2223
2224 static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
2225 const uint8_t *left, int upsample_left,
2226 int dy) {
2227 uint8x8_t dstvec[32];
2228 uint8x16_t d[16];
2229
2230 dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
2231 transpose_arrays_u8_8x16(dstvec, d);
2232 transpose_arrays_u8_8x16(dstvec + 16, d + 8);
2233 for (int i = 0; i < 8; i++) {
2234 vst1q_u8(dst + i * stride, d[i]);
2235 vst1q_u8(dst + i * stride + 16, d[i + 8]);
2236 }
2237 }
2238
2239 static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
2240 const uint8_t *left, int upsample_left,
2241 int dy) {
2242 uint8x16_t dstvec[16];
2243 uint8x16_t d[16];
2244
2245 dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
2246 transpose_arrays_u8_16x16(dstvec, d);
2247 for (int i = 0; i < 16; i++) {
2248 vst1q_u8(dst + i * stride, d[i]);
2249 }
2250 }
2251
2252 static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
2253 const uint8_t *left, int upsample_left,
2254 int dy) {
2255 (void)upsample_left;
2256 uint8x16x2_t dstvec[32];
2257 uint8x16_t d[64];
2258
2259 dr_prediction_z1_32xN_internal_neon(32, dstvec, left, dy);
2260 transpose_arrays_u8_32x16(dstvec, d);
2261 transpose_arrays_u8_32x16(dstvec + 16, d + 32);
2262 for (int i = 0; i < 32; i++) {
2263 vst1q_u8(dst + i * stride, d[i]);
2264 vst1q_u8(dst + i * stride + 16, d[i + 32]);
2265 }
2266 }
2267
2268 static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
2269 const uint8_t *left, int upsample_left,
2270 int dy) {
2271 (void)upsample_left;
2272 DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
2273
2274 dr_prediction_z1_64xN_neon(64, dstT, 64, left, dy);
2275 z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 64, 64);
2276 }
2277
2278 static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
2279 const uint8_t *left, int upsample_left,
2280 int dy) {
2281 (void)upsample_left;
2282 uint8x16x2_t dstvec[16];
2283 uint8x16_t d[32];
2284
2285 dr_prediction_z1_32xN_internal_neon(16, dstvec, left, dy);
2286 transpose_arrays_u8_32x16(dstvec, d);
2287 for (int i = 0; i < 16; i++) {
2288 vst1q_u8(dst + 2 * i * stride, d[2 * i + 0]);
2289 vst1q_u8(dst + (2 * i + 1) * stride, d[2 * i + 1]);
2290 }
2291 }
2292
2293 static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
2294 const uint8_t *left, int upsample_left,
2295 int dy) {
2296 uint8x16_t dstvec[32];
2297
2298 dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
2299 for (int i = 0; i < 32; i += 16) {
2300 uint8x16_t d[16];
2301 transpose_arrays_u8_16x16(dstvec + i, d);
2302 for (int j = 0; j < 16; j++) {
2303 vst1q_u8(dst + j * stride + i, d[j]);
2304 }
2305 }
2306 }
2307
2308 static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
2309 const uint8_t *left, int upsample_left,
2310 int dy) {
2311 (void)upsample_left;
2312 uint8_t dstT[64 * 32];
2313
2314 dr_prediction_z1_64xN_neon(32, dstT, 64, left, dy);
2315 z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 32, 64);
2316 }
2317
2318 static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
2319 const uint8_t *left, int upsample_left,
2320 int dy) {
2321 (void)upsample_left;
2322 uint8_t dstT[32 * 64];
2323
2324 dr_prediction_z1_32xN_neon(64, dstT, 32, left, dy);
2325 z3_transpose_arrays_u8_16nx16n(dstT, 32, dst, stride, 64, 32);
2326 }
2327
2328 static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
2329 const uint8_t *left, int upsample_left,
2330 int dy) {
2331 (void)upsample_left;
2332 uint8_t dstT[64 * 16];
2333
2334 dr_prediction_z1_64xN_neon(16, dstT, 64, left, dy);
2335 z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 16, 64);
2336 }
2337
2338 static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
2339 const uint8_t *left, int upsample_left,
2340 int dy) {
2341 uint8x16_t dstvec[64];
2342
2343 dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
2344 for (int i = 0; i < 64; i += 16) {
2345 uint8x16_t d[16];
2346 transpose_arrays_u8_16x16(dstvec + i, d);
2347 for (int j = 0; j < 16; ++j) {
2348 vst1q_u8(dst + j * stride + i, d[j]);
2349 }
2350 }
2351 }
2352
2353 typedef void (*dr_prediction_z3_fn)(uint8_t *dst, ptrdiff_t stride,
2354 const uint8_t *left, int upsample_left,
2355 int dy);
2356
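// Dispatch table indexed by [get_msb(bw)][get_msb(bh)], i.e. log2 of the
// block dimensions (4 -> 2 ... 64 -> 6); entries for width/height
// combinations that do not occur are NULL.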
2357 static dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = {
2358 { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2359 { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2360 { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon,
2361 dr_prediction_z3_4x16_neon, NULL, NULL },
2362 { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon,
2363 dr_prediction_z3_8x16_neon, dr_prediction_z3_8x32_neon, NULL },
2364 { NULL, NULL, dr_prediction_z3_16x4_neon, dr_prediction_z3_16x8_neon,
2365 dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon,
2366 dr_prediction_z3_16x64_neon },
2367 { NULL, NULL, NULL, dr_prediction_z3_32x8_neon, dr_prediction_z3_32x16_neon,
2368 dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon },
2369 { NULL, NULL, NULL, NULL, dr_prediction_z3_64x16_neon,
2370 dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon },
2371 };
2372
2373 void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
2374 const uint8_t *above, const uint8_t *left,
2375 int upsample_left, int dx, int dy) {
2376 (void)above;
2377 (void)dx;
2378 assert(dx == 1);
2379 assert(dy > 0);
2380
2381 dr_prediction_z3_fn f = dr_prediction_z3_arr[get_msb(bw)][get_msb(bh)];
2382 assert(f != NULL);
2383 f(dst, stride, left, upsample_left, dy);
2384 }
2385
2386 // -----------------------------------------------------------------------------
2387 // SMOOTH_PRED
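// SMOOTH blends top[x] against bottom_left with the y weight and left[y]
// against top_right with the x weight (the paired weights sum to 256), then
// combines the two 16-bit blends with a halving add and a rounding narrowing
// shift by SMOOTH_WEIGHT_LOG2_SCALE.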
2388
2389 // 256 - v = vneg_s8(v)
2390 static INLINE uint8x8_t negate_s8(const uint8x8_t v) {
2391 return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
2392 }
2393
2394 static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride,
2395 const uint8_t *const top_row,
2396 const uint8_t *const left_column,
2397 const int height) {
2398 const uint8_t top_right = top_row[3];
2399 const uint8_t bottom_left = left_column[height - 1];
2400 const uint8_t *const weights_y = smooth_weights + height - 4;
2401
2402 uint8x8_t top_v = load_u8_4x1(top_row);
2403 const uint8x8_t top_right_v = vdup_n_u8(top_right);
2404 const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
2405 uint8x8_t weights_x_v = load_u8_4x1(smooth_weights);
2406 const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
2407 const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
2408
2409 assert(height > 0);
2410 int y = 0;
2411 do {
2412 const uint8x8_t left_v = vdup_n_u8(left_column[y]);
2413 const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
2414 const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
2415 const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
2416 const uint16x8_t weighted_top_bl =
2417 vmlal_u8(weighted_bl, weights_y_v, top_v);
2418 const uint16x8_t weighted_left_tr =
2419 vmlal_u8(weighted_tr, weights_x_v, left_v);
2420 // Maximum value of each parameter: 0xFF00
2421 const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
2422 const uint8x8_t result = vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
2423
2424 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(result), 0);
2425 dst += stride;
2426 } while (++y != height);
2427 }
2428
2429 static INLINE uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl,
2430 const uint16x8_t weighted_left_tr) {
2431 // Maximum value of each parameter: 0xFF00
2432 const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
2433 return vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
2434 }
2435
2436 static INLINE uint8x8_t calculate_weights_and_pred(
2437 const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
2438 const uint8x8_t bottom_left, const uint8x8_t weights_x,
2439 const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
2440 const uint16x8_t weighted_top = vmull_u8(weights_y, top);
2441 const uint16x8_t weighted_top_bl =
2442 vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
2443 const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
2444 return calculate_pred(weighted_top_bl, weighted_left_tr);
2445 }
2446
2447 static void smooth_8xh_neon(uint8_t *dst, ptrdiff_t stride,
2448 const uint8_t *const top_row,
2449 const uint8_t *const left_column,
2450 const int height) {
2451 const uint8_t top_right = top_row[7];
2452 const uint8_t bottom_left = left_column[height - 1];
2453 const uint8_t *const weights_y = smooth_weights + height - 4;
2454
2455 const uint8x8_t top_v = vld1_u8(top_row);
2456 const uint8x8_t top_right_v = vdup_n_u8(top_right);
2457 const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
2458 const uint8x8_t weights_x_v = vld1_u8(smooth_weights + 4);
2459 const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
2460 const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
2461
2462 assert(height > 0);
2463 int y = 0;
2464 do {
2465 const uint8x8_t left_v = vdup_n_u8(left_column[y]);
2466 const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
2467 const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
2468 const uint8x8_t result =
2469 calculate_weights_and_pred(top_v, left_v, weighted_tr, bottom_left_v,
2470 weights_x_v, scaled_weights_y, weights_y_v);
2471
2472 vst1_u8(dst, result);
2473 dst += stride;
2474 } while (++y != height);
2475 }
2476
2477 #define SMOOTH_NXM(W, H) \
2478 void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
2479 const uint8_t *above, \
2480 const uint8_t *left) { \
2481 smooth_##W##xh_neon(dst, y_stride, above, left, H); \
2482 }
2483
2484 SMOOTH_NXM(4, 4)
2485 SMOOTH_NXM(4, 8)
2486 SMOOTH_NXM(8, 4)
2487 SMOOTH_NXM(8, 8)
2488 SMOOTH_NXM(4, 16)
2489 SMOOTH_NXM(8, 16)
2490 SMOOTH_NXM(8, 32)
2491
2492 #undef SMOOTH_NXM
2493
2494 static INLINE uint8x16_t calculate_weights_and_predq(
2495 const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
2496 const uint8x8_t weights_y, const uint8x16_t weights_x,
2497 const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
2498 const uint16x8_t weighted_top_bl_low =
2499 vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
2500 const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
2501 const uint16x8_t weighted_left_tr_low =
2502 vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
2503 const uint8x8_t result_low =
2504 calculate_pred(weighted_top_bl_low, weighted_left_tr_low);
2505
2506 const uint16x8_t weighted_top_bl_high =
2507 vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
2508 const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
2509 const uint16x8_t weighted_left_tr_high =
2510 vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
2511 const uint8x8_t result_high =
2512 calculate_pred(weighted_top_bl_high, weighted_left_tr_high);
2513
2514 return vcombine_u8(result_low, result_high);
2515 }
2516
2517 // 256 - v = vneg_s8(v)
2518 static INLINE uint8x16_t negate_s8q(const uint8x16_t v) {
2519 return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
2520 }
2521
2522 // For width 16 and above.
2523 #define SMOOTH_PREDICTOR(W) \
2524 static void smooth_##W##xh_neon( \
2525 uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
2526 const uint8_t *const left_column, const int height) { \
2527 const uint8_t top_right = top_row[(W)-1]; \
2528 const uint8_t bottom_left = left_column[height - 1]; \
2529 const uint8_t *const weights_y = smooth_weights + height - 4; \
2530 \
2531 uint8x16_t top_v[4]; \
2532 top_v[0] = vld1q_u8(top_row); \
2533 if ((W) > 16) { \
2534 top_v[1] = vld1q_u8(top_row + 16); \
2535 if ((W) == 64) { \
2536 top_v[2] = vld1q_u8(top_row + 32); \
2537 top_v[3] = vld1q_u8(top_row + 48); \
2538 } \
2539 } \
2540 \
2541 const uint8x8_t top_right_v = vdup_n_u8(top_right); \
2542 const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
2543 \
2544 uint8x16_t weights_x_v[4]; \
2545 weights_x_v[0] = vld1q_u8(smooth_weights + (W)-4); \
2546 if ((W) > 16) { \
2547 weights_x_v[1] = vld1q_u8(smooth_weights + (W) + 16 - 4); \
2548 if ((W) == 64) { \
2549 weights_x_v[2] = vld1q_u8(smooth_weights + (W) + 32 - 4); \
2550 weights_x_v[3] = vld1q_u8(smooth_weights + (W) + 48 - 4); \
2551 } \
2552 } \
2553 \
2554 uint8x16_t scaled_weights_x[4]; \
2555 scaled_weights_x[0] = negate_s8q(weights_x_v[0]); \
2556 if ((W) > 16) { \
2557 scaled_weights_x[1] = negate_s8q(weights_x_v[1]); \
2558 if ((W) == 64) { \
2559 scaled_weights_x[2] = negate_s8q(weights_x_v[2]); \
2560 scaled_weights_x[3] = negate_s8q(weights_x_v[3]); \
2561 } \
2562 } \
2563 \
2564 for (int y = 0; y < height; ++y) { \
2565 const uint8x8_t left_v = vdup_n_u8(left_column[y]); \
2566 const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
2567 const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
2568 const uint16x8_t weighted_bl = \
2569 vmull_u8(scaled_weights_y, bottom_left_v); \
2570 \
2571 vst1q_u8(dst, calculate_weights_and_predq( \
2572 top_v[0], left_v, top_right_v, weights_y_v, \
2573 weights_x_v[0], scaled_weights_x[0], weighted_bl)); \
2574 \
2575 if ((W) > 16) { \
2576 vst1q_u8(dst + 16, \
2577 calculate_weights_and_predq( \
2578 top_v[1], left_v, top_right_v, weights_y_v, \
2579 weights_x_v[1], scaled_weights_x[1], weighted_bl)); \
2580 if ((W) == 64) { \
2581 vst1q_u8(dst + 32, \
2582 calculate_weights_and_predq( \
2583 top_v[2], left_v, top_right_v, weights_y_v, \
2584 weights_x_v[2], scaled_weights_x[2], weighted_bl)); \
2585 vst1q_u8(dst + 48, \
2586 calculate_weights_and_predq( \
2587 top_v[3], left_v, top_right_v, weights_y_v, \
2588 weights_x_v[3], scaled_weights_x[3], weighted_bl)); \
2589 } \
2590 } \
2591 \
2592 dst += stride; \
2593 } \
2594 }
2595
2596 SMOOTH_PREDICTOR(16)
2597 SMOOTH_PREDICTOR(32)
2598 SMOOTH_PREDICTOR(64)
2599
2600 #undef SMOOTH_PREDICTOR
2601
2602 #define SMOOTH_NXM_WIDE(W, H) \
2603 void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
2604 const uint8_t *above, \
2605 const uint8_t *left) { \
2606 smooth_##W##xh_neon(dst, y_stride, above, left, H); \
2607 }
2608
2609 SMOOTH_NXM_WIDE(16, 4)
2610 SMOOTH_NXM_WIDE(16, 8)
2611 SMOOTH_NXM_WIDE(16, 16)
2612 SMOOTH_NXM_WIDE(16, 32)
2613 SMOOTH_NXM_WIDE(16, 64)
2614 SMOOTH_NXM_WIDE(32, 8)
2615 SMOOTH_NXM_WIDE(32, 16)
2616 SMOOTH_NXM_WIDE(32, 32)
2617 SMOOTH_NXM_WIDE(32, 64)
2618 SMOOTH_NXM_WIDE(64, 16)
2619 SMOOTH_NXM_WIDE(64, 32)
2620 SMOOTH_NXM_WIDE(64, 64)
2621
2622 #undef SMOOTH_NXM_WIDE
2623
2624 // -----------------------------------------------------------------------------
2625 // SMOOTH_V_PRED
2626
2627 // For widths 4 and 8.
2628 #define SMOOTH_V_PREDICTOR(W) \
2629 static void smooth_v_##W##xh_neon( \
2630 uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
2631 const uint8_t *const left_column, const int height) { \
2632 const uint8_t bottom_left = left_column[height - 1]; \
2633 const uint8_t *const weights_y = smooth_weights + height - 4; \
2634 \
2635 uint8x8_t top_v; \
2636 if ((W) == 4) { \
2637 top_v = load_u8_4x1(top_row); \
2638 } else { /* width == 8 */ \
2639 top_v = vld1_u8(top_row); \
2640 } \
2641 \
2642 const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
2643 \
2644 assert(height > 0); \
2645 int y = 0; \
2646 do { \
2647 const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
2648 const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
2649 \
2650 const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); \
2651 const uint16x8_t weighted_top_bl = \
2652 vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v); \
2653 const uint8x8_t pred = \
2654 vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE); \
2655 \
2656 if ((W) == 4) { \
2657 store_u8_4x1(dst, pred);                                            \
2658 } else { /* width == 8 */ \
2659 vst1_u8(dst, pred); \
2660 } \
2661 dst += stride; \
2662 } while (++y != height); \
2663 }
2664
2665 SMOOTH_V_PREDICTOR(4)
2666 SMOOTH_V_PREDICTOR(8)
2667
2668 #undef SMOOTH_V_PREDICTOR
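
// A scalar sketch of the SMOOTH_V blend above (illustration only; this helper
// is not part of libaom): each row mixes the top row with the bottom-left
// pixel using the y weight, with SMOOTH_WEIGHT_LOG2_SCALE == 8:
//   pred = (w_y * top[x] + (256 - w_y) * bottom_left + 128) >> 8.
static INLINE uint8_t smooth_v_pred_scalar_sketch(uint8_t top,
                                                  uint8_t bottom_left,
                                                  uint16_t wy) {
  const uint32_t sum = wy * top + (256 - wy) * bottom_left;
  return (uint8_t)((sum + 128) >> 8);  // Round2(sum, 8).
}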
2669
2670 #define SMOOTH_V_NXM(W, H) \
2671 void aom_smooth_v_predictor_##W##x##H##_neon( \
2672 uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2673 const uint8_t *left) { \
2674 smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
2675 }
2676
2677 SMOOTH_V_NXM(4, 4)
2678 SMOOTH_V_NXM(4, 8)
2679 SMOOTH_V_NXM(4, 16)
2680 SMOOTH_V_NXM(8, 4)
2681 SMOOTH_V_NXM(8, 8)
2682 SMOOTH_V_NXM(8, 16)
2683 SMOOTH_V_NXM(8, 32)
2684
2685 #undef SMOOTH_V_NXM
2686
2687 static INLINE uint8x16_t calculate_vertical_weights_and_pred(
2688 const uint8x16_t top, const uint8x8_t weights_y,
2689 const uint16x8_t weighted_bl) {
2690 const uint16x8_t pred_low =
2691 vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
2692 const uint16x8_t pred_high =
2693 vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
2694 const uint8x8_t pred_scaled_low =
2695 vrshrn_n_u16(pred_low, SMOOTH_WEIGHT_LOG2_SCALE);
2696 const uint8x8_t pred_scaled_high =
2697 vrshrn_n_u16(pred_high, SMOOTH_WEIGHT_LOG2_SCALE);
2698 return vcombine_u8(pred_scaled_low, pred_scaled_high);
2699 }
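// Note the scaling difference versus the full SMOOTH path: here a single
// 256-scaled blend is narrowed with a rounding shift by
// SMOOTH_WEIGHT_LOG2_SCALE (8), whereas calculate_weights_and_predq() combines
// two such blends via calculate_pred() and so effectively divides by 512.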
2700
2701 // For width 16 and above.
2702 #define SMOOTH_V_PREDICTOR(W) \
2703 static void smooth_v_##W##xh_neon( \
2704 uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
2705 const uint8_t *const left_column, const int height) { \
2706 const uint8_t bottom_left = left_column[height - 1]; \
2707 const uint8_t *const weights_y = smooth_weights + height - 4; \
2708 \
2709 uint8x16_t top_v[4]; \
2710 top_v[0] = vld1q_u8(top_row); \
2711 if ((W) > 16) { \
2712 top_v[1] = vld1q_u8(top_row + 16); \
2713 if ((W) == 64) { \
2714 top_v[2] = vld1q_u8(top_row + 32); \
2715 top_v[3] = vld1q_u8(top_row + 48); \
2716 } \
2717 } \
2718 \
2719 const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
2720 \
2721 assert(height > 0); \
2722 int y = 0; \
2723 do { \
2724 const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
2725 const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
2726 const uint16x8_t weighted_bl = \
2727 vmull_u8(scaled_weights_y, bottom_left_v); \
2728 \
2729 const uint8x16_t pred_0 = calculate_vertical_weights_and_pred( \
2730 top_v[0], weights_y_v, weighted_bl); \
2731 vst1q_u8(dst, pred_0); \
2732 \
2733 if ((W) > 16) { \
2734 const uint8x16_t pred_1 = calculate_vertical_weights_and_pred( \
2735 top_v[1], weights_y_v, weighted_bl); \
2736 vst1q_u8(dst + 16, pred_1); \
2737 \
2738 if ((W) == 64) { \
2739 const uint8x16_t pred_2 = calculate_vertical_weights_and_pred( \
2740 top_v[2], weights_y_v, weighted_bl); \
2741 vst1q_u8(dst + 32, pred_2); \
2742 \
2743 const uint8x16_t pred_3 = calculate_vertical_weights_and_pred( \
2744 top_v[3], weights_y_v, weighted_bl); \
2745 vst1q_u8(dst + 48, pred_3); \
2746 } \
2747 } \
2748 \
2749 dst += stride; \
2750 } while (++y != height); \
2751 }
2752
2753 SMOOTH_V_PREDICTOR(16)
2754 SMOOTH_V_PREDICTOR(32)
2755 SMOOTH_V_PREDICTOR(64)
2756
2757 #undef SMOOTH_V_PREDICTOR
2758
2759 #define SMOOTH_V_NXM_WIDE(W, H) \
2760 void aom_smooth_v_predictor_##W##x##H##_neon( \
2761 uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2762 const uint8_t *left) { \
2763 smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
2764 }
2765
2766 SMOOTH_V_NXM_WIDE(16, 4)
2767 SMOOTH_V_NXM_WIDE(16, 8)
2768 SMOOTH_V_NXM_WIDE(16, 16)
2769 SMOOTH_V_NXM_WIDE(16, 32)
2770 SMOOTH_V_NXM_WIDE(16, 64)
2771 SMOOTH_V_NXM_WIDE(32, 8)
2772 SMOOTH_V_NXM_WIDE(32, 16)
2773 SMOOTH_V_NXM_WIDE(32, 32)
2774 SMOOTH_V_NXM_WIDE(32, 64)
2775 SMOOTH_V_NXM_WIDE(64, 16)
2776 SMOOTH_V_NXM_WIDE(64, 32)
2777 SMOOTH_V_NXM_WIDE(64, 64)
2778
2779 #undef SMOOTH_V_NXM_WIDE
2780
2781 // -----------------------------------------------------------------------------
2782 // SMOOTH_H_PRED
2783
2784 // For widths 4 and 8.
2785 #define SMOOTH_H_PREDICTOR(W) \
2786 static void smooth_h_##W##xh_neon( \
2787 uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
2788 const uint8_t *const left_column, const int height) { \
2789 const uint8_t top_right = top_row[(W)-1]; \
2790 \
2791 const uint8x8_t top_right_v = vdup_n_u8(top_right); \
2792 /* Over-reads for 4xN but still within the array. */ \
2793 const uint8x8_t weights_x = vld1_u8(smooth_weights + (W)-4); \
2794 const uint8x8_t scaled_weights_x = negate_s8(weights_x); \
2795 const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); \
2796 \
2797 assert(height > 0); \
2798 int y = 0; \
2799 do { \
2800 const uint8x8_t left_v = vdup_n_u8(left_column[y]); \
2801 const uint16x8_t weighted_left_tr = \
2802 vmlal_u8(weighted_tr, weights_x, left_v); \
2803 const uint8x8_t pred = \
2804 vrshrn_n_u16(weighted_left_tr, SMOOTH_WEIGHT_LOG2_SCALE); \
2805 \
2806 if ((W) == 4) { \
2807 store_u8_4x1(dst, pred);                                            \
2808 } else { /* width == 8 */ \
2809 vst1_u8(dst, pred); \
2810 } \
2811 dst += stride; \
2812 } while (++y != height); \
2813 }
2814
2815 SMOOTH_H_PREDICTOR(4)
2816 SMOOTH_H_PREDICTOR(8)
2817
2818 #undef SMOOTH_H_PREDICTOR
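
// The corresponding scalar sketch for SMOOTH_H (illustration only; this
// helper is not part of libaom): each column mixes the left pixel with the
// top-right pixel using the x weight:
//   pred = (w_x * left[y] + (256 - w_x) * top_right + 128) >> 8.
static INLINE uint8_t smooth_h_pred_scalar_sketch(uint8_t left,
                                                  uint8_t top_right,
                                                  uint16_t wx) {
  const uint32_t sum = wx * left + (256 - wx) * top_right;
  return (uint8_t)((sum + 128) >> 8);  // Round2(sum, 8).
}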
2819
2820 #define SMOOTH_H_NXM(W, H) \
2821 void aom_smooth_h_predictor_##W##x##H##_neon( \
2822 uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2823 const uint8_t *left) { \
2824 smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
2825 }
2826
2827 SMOOTH_H_NXM(4, 4)
2828 SMOOTH_H_NXM(4, 8)
2829 SMOOTH_H_NXM(4, 16)
2830 SMOOTH_H_NXM(8, 4)
2831 SMOOTH_H_NXM(8, 8)
2832 SMOOTH_H_NXM(8, 16)
2833 SMOOTH_H_NXM(8, 32)
2834
2835 #undef SMOOTH_H_NXM
2836
2837 static INLINE uint8x16_t calculate_horizontal_weights_and_pred(
2838 const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
2839 const uint8x16_t scaled_weights_x) {
2840 const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
2841 const uint16x8_t weighted_left_tr_low =
2842 vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
2843 const uint8x8_t pred_scaled_low =
2844 vrshrn_n_u16(weighted_left_tr_low, SMOOTH_WEIGHT_LOG2_SCALE);
2845
2846 const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
2847 const uint16x8_t weighted_left_tr_high =
2848 vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
2849 const uint8x8_t pred_scaled_high =
2850 vrshrn_n_u16(weighted_left_tr_high, SMOOTH_WEIGHT_LOG2_SCALE);
2851
2852 return vcombine_u8(pred_scaled_low, pred_scaled_high);
2853 }
2854
2855 // For width 16 and above.
2856 #define SMOOTH_H_PREDICTOR(W) \
2857 static void smooth_h_##W##xh_neon( \
2858 uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
2859 const uint8_t *const left_column, const int height) { \
2860 const uint8_t top_right = top_row[(W)-1]; \
2861 \
2862 const uint8x8_t top_right_v = vdup_n_u8(top_right); \
2863 \
2864 uint8x16_t weights_x[4]; \
2865 weights_x[0] = vld1q_u8(smooth_weights + (W)-4); \
2866 if ((W) > 16) { \
2867 weights_x[1] = vld1q_u8(smooth_weights + (W) + 16 - 4); \
2868 if ((W) == 64) { \
2869 weights_x[2] = vld1q_u8(smooth_weights + (W) + 32 - 4); \
2870 weights_x[3] = vld1q_u8(smooth_weights + (W) + 48 - 4); \
2871 } \
2872 } \
2873 \
2874 uint8x16_t scaled_weights_x[4]; \
2875 scaled_weights_x[0] = negate_s8q(weights_x[0]); \
2876 if ((W) > 16) { \
2877 scaled_weights_x[1] = negate_s8q(weights_x[1]); \
2878 if ((W) == 64) { \
2879 scaled_weights_x[2] = negate_s8q(weights_x[2]); \
2880 scaled_weights_x[3] = negate_s8q(weights_x[3]); \
2881 } \
2882 } \
2883 \
2884 assert(height > 0); \
2885 int y = 0; \
2886 do { \
2887 const uint8x8_t left_v = vdup_n_u8(left_column[y]); \
2888 \
2889 const uint8x16_t pred_0 = calculate_horizontal_weights_and_pred( \
2890 left_v, top_right_v, weights_x[0], scaled_weights_x[0]); \
2891 vst1q_u8(dst, pred_0); \
2892 \
2893 if ((W) > 16) { \
2894 const uint8x16_t pred_1 = calculate_horizontal_weights_and_pred( \
2895 left_v, top_right_v, weights_x[1], scaled_weights_x[1]); \
2896 vst1q_u8(dst + 16, pred_1); \
2897 \
2898 if ((W) == 64) { \
2899 const uint8x16_t pred_2 = calculate_horizontal_weights_and_pred( \
2900 left_v, top_right_v, weights_x[2], scaled_weights_x[2]); \
2901 vst1q_u8(dst + 32, pred_2); \
2902 \
2903 const uint8x16_t pred_3 = calculate_horizontal_weights_and_pred( \
2904 left_v, top_right_v, weights_x[3], scaled_weights_x[3]); \
2905 vst1q_u8(dst + 48, pred_3); \
2906 } \
2907 } \
2908 dst += stride; \
2909 } while (++y != height); \
2910 }
2911
2912 SMOOTH_H_PREDICTOR(16)
2913 SMOOTH_H_PREDICTOR(32)
2914 SMOOTH_H_PREDICTOR(64)
2915
2916 #undef SMOOTH_H_PREDICTOR
2917
2918 #define SMOOTH_H_NXM_WIDE(W, H) \
2919 void aom_smooth_h_predictor_##W##x##H##_neon( \
2920 uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2921 const uint8_t *left) { \
2922 smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
2923 }
2924
2925 SMOOTH_H_NXM_WIDE(16, 4)
2926 SMOOTH_H_NXM_WIDE(16, 8)
2927 SMOOTH_H_NXM_WIDE(16, 16)
2928 SMOOTH_H_NXM_WIDE(16, 32)
2929 SMOOTH_H_NXM_WIDE(16, 64)
2930 SMOOTH_H_NXM_WIDE(32, 8)
2931 SMOOTH_H_NXM_WIDE(32, 16)
2932 SMOOTH_H_NXM_WIDE(32, 32)
2933 SMOOTH_H_NXM_WIDE(32, 64)
2934 SMOOTH_H_NXM_WIDE(64, 16)
2935 SMOOTH_H_NXM_WIDE(64, 32)
2936 SMOOTH_H_NXM_WIDE(64, 64)
2937
2938 #undef SMOOTH_H_NXM_WIDE
2939
2940 // -----------------------------------------------------------------------------
2941 // PAETH
2942
2943 static INLINE void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
2944 const uint8_t *const top_row,
2945 const uint8_t *const left_column,
2946 int width, int height) {
2947 const uint8x8_t top_left = vdup_n_u8(top_row[-1]);
2948 const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
2949 uint8x8_t top;
2950 if (width == 4) {
2951 top = load_u8_4x1(top_row);
2952 } else { // width == 8
2953 top = vld1_u8(top_row);
2954 }
2955
2956 assert(height > 0);
2957 int y = 0;
2958 do {
2959 const uint8x8_t left = vdup_n_u8(left_column[y]);
2960
2961 const uint8x8_t left_dist = vabd_u8(top, top_left);
2962 const uint8x8_t top_dist = vabd_u8(left, top_left);
2963 const uint16x8_t top_left_dist =
2964 vabdq_u16(vaddl_u8(top, left), top_left_x2);
2965
2966 const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
2967 const uint8x8_t left_le_top_left =
2968 vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
2969 const uint8x8_t top_le_top_left =
2970 vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
2971
2972 // if (left_dist <= top_dist && left_dist <= top_left_dist)
2973 const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
2974 // dest[x] = left_column[y];
2975 // Provisionally fill the lanes not selected here with 'top'; they are
2976 // overwritten below once the positions that take top_left are known.
2977 uint8x8_t result = vbsl_u8(left_mask, left, top);
2978 // else if (top_dist <= top_left_dist)
2979 // dest[x] = top_row[x];
2980 // Add these values to the mask. They were already set.
2981 const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
2982 // else
2983 // dest[x] = top_left;
2984 result = vbsl_u8(left_or_top_mask, result, top_left);
2985
2986 if (width == 4) {
2987 store_u8_4x1(dest, result);
2988 } else { // width == 8
2989 vst1_u8(dest, result);
2990 }
2991 dest += stride;
2992 } while (++y != height);
2993 }
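
// A scalar sketch of the selection that the masks above implement
// (illustration only; this helper is not part of libaom). Paeth picks
// whichever of left, top and top_left is closest to the gradient estimate
// top + left - top_left, breaking ties in that order; select_paeth() below
// applies the same rule 16 lanes at a time.
static INLINE uint8_t paeth_pred_scalar_sketch(uint8_t top, uint8_t left,
                                               uint8_t top_left) {
  const int base = top + left - top_left;
  const int left_dist = base > left ? base - left : left - base;
  const int top_dist = base > top ? base - top : top - base;
  const int top_left_dist =
      base > top_left ? base - top_left : top_left - base;
  if (left_dist <= top_dist && left_dist <= top_left_dist) return left;
  if (top_dist <= top_left_dist) return top;
  return top_left;
}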
2994
2995 #define PAETH_NXM(W, H) \
2996 void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
2997 const uint8_t *above, \
2998 const uint8_t *left) { \
2999 paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
3000 }
3001
3002 PAETH_NXM(4, 4)
3003 PAETH_NXM(4, 8)
3004 PAETH_NXM(8, 4)
3005 PAETH_NXM(8, 8)
3006 PAETH_NXM(8, 16)
3007
3008 PAETH_NXM(4, 16)
3009 PAETH_NXM(8, 32)
3010
3011 // Calculate X distance <= TopLeft distance and pack the resulting mask into
3012 // uint8x16_t. Saturating top_left_dist to 255 is safe because x_dist <= 255.
3013 static INLINE uint8x16_t x_le_top_left(const uint8x16_t x_dist,
3014 const uint16x8_t top_left_dist_low,
3015 const uint16x8_t top_left_dist_high) {
3016 const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
3017 vqmovn_u16(top_left_dist_high));
3018 return vcleq_u8(x_dist, top_left_dist);
3019 }
3020
3021 // Select the closest values and collect them.
3022 static INLINE uint8x16_t select_paeth(const uint8x16_t top,
3023 const uint8x16_t left,
3024 const uint8x16_t top_left,
3025 const uint8x16_t left_le_top,
3026 const uint8x16_t left_le_top_left,
3027 const uint8x16_t top_le_top_left) {
3028 // if (left_dist <= top_dist && left_dist <= top_left_dist)
3029 const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
3030 // dest[x] = left_column[y];
3031 // Provisionally fill the lanes not selected here with 'top'; they are
3032 // overwritten below once the positions that take top_left are known.
3033 uint8x16_t result = vbslq_u8(left_mask, left, top);
3034 // else if (top_dist <= top_left_dist)
3035 // dest[x] = top_row[x];
3036 // Add these values to the mask. They were already set.
3037 const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
3038 // else
3039 // dest[x] = top_left;
3040 return vbslq_u8(left_or_top_mask, result, top_left);
3041 }
3042
3043 // Generate numbered and high/low versions of top_left_dist.
3044 #define TOP_LEFT_DIST(num) \
3045 const uint16x8_t top_left_##num##_dist_low = vabdq_u16( \
3046 vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
3047 const uint16x8_t top_left_##num##_dist_high = vabdq_u16( \
3048 vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)
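
// Note: top_left_dist is kept in 16-bit lanes because top + left can reach
// 510, while left_dist and top_dist always fit in eight bits.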
3049
3050 // Generate numbered versions of x_le_top_left() with x = left.
3051 #define LEFT_LE_TOP_LEFT(num) \
3052 const uint8x16_t left_le_top_left_##num = \
3053 x_le_top_left(left_##num##_dist, top_left_##num##_dist_low, \
3054 top_left_##num##_dist_high)
3055
3056 // Generate numbered versions of x_le_top_left() with x = top.
3057 #define TOP_LE_TOP_LEFT(num) \
3058 const uint8x16_t top_le_top_left_##num = x_le_top_left( \
3059 top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
3060
3061 static INLINE void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride,
3062 const uint8_t *const top_row,
3063 const uint8_t *const left_column,
3064 int width, int height) {
3065 const uint8x16_t top_left = vdupq_n_u8(top_row[-1]);
3066 const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
3067 uint8x16_t top[4];
3068 top[0] = vld1q_u8(top_row);
3069 if (width > 16) {
3070 top[1] = vld1q_u8(top_row + 16);
3071 if (width == 64) {
3072 top[2] = vld1q_u8(top_row + 32);
3073 top[3] = vld1q_u8(top_row + 48);
3074 }
3075 }
3076
3077 assert(height > 0);
3078 int y = 0;
3079 do {
3080 const uint8x16_t left = vdupq_n_u8(left_column[y]);
3081
3082 const uint8x16_t top_dist = vabdq_u8(left, top_left);
3083
3084 const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
3085 TOP_LEFT_DIST(0);
3086 const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
3087 LEFT_LE_TOP_LEFT(0);
3088 TOP_LE_TOP_LEFT(0);
3089
3090 const uint8x16_t result_0 =
3091 select_paeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
3092 top_le_top_left_0);
3093 vst1q_u8(dest, result_0);
3094
3095 if (width > 16) {
3096 const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
3097 TOP_LEFT_DIST(1);
3098 const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
3099 LEFT_LE_TOP_LEFT(1);
3100 TOP_LE_TOP_LEFT(1);
3101
3102 const uint8x16_t result_1 =
3103 select_paeth(top[1], left, top_left, left_1_le_top,
3104 left_le_top_left_1, top_le_top_left_1);
3105 vst1q_u8(dest + 16, result_1);
3106
3107 if (width == 64) {
3108 const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
3109 TOP_LEFT_DIST(2);
3110 const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
3111 LEFT_LE_TOP_LEFT(2);
3112 TOP_LE_TOP_LEFT(2);
3113
3114 const uint8x16_t result_2 =
3115 select_paeth(top[2], left, top_left, left_2_le_top,
3116 left_le_top_left_2, top_le_top_left_2);
3117 vst1q_u8(dest + 32, result_2);
3118
3119 const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
3120 TOP_LEFT_DIST(3);
3121 const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
3122 LEFT_LE_TOP_LEFT(3);
3123 TOP_LE_TOP_LEFT(3);
3124
3125 const uint8x16_t result_3 =
3126 select_paeth(top[3], left, top_left, left_3_le_top,
3127 left_le_top_left_3, top_le_top_left_3);
3128 vst1q_u8(dest + 48, result_3);
3129 }
3130 }
3131
3132 dest += stride;
3133 } while (++y != height);
3134 }
3135
3136 #define PAETH_NXM_WIDE(W, H) \
3137 void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
3138 const uint8_t *above, \
3139 const uint8_t *left) { \
3140 paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
3141 }
3142
3143 PAETH_NXM_WIDE(16, 8)
3144 PAETH_NXM_WIDE(16, 16)
3145 PAETH_NXM_WIDE(16, 32)
3146 PAETH_NXM_WIDE(32, 16)
3147 PAETH_NXM_WIDE(32, 32)
3148 PAETH_NXM_WIDE(32, 64)
3149 PAETH_NXM_WIDE(64, 32)
3150 PAETH_NXM_WIDE(64, 64)
3151
3152 PAETH_NXM_WIDE(16, 4)
3153 PAETH_NXM_WIDE(16, 64)
3154 PAETH_NXM_WIDE(32, 8)
3155 PAETH_NXM_WIDE(64, 16)
3156
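// Usage sketch (illustration only; this helper is not part of libaom). The
// generated predictors take the reconstructed neighbours as plain byte
// arrays. The Paeth kernels also read above[-1] (the top-left pixel), so the
// 'above' pointer must have a valid byte immediately before it; the SMOOTH
// variants only read above[0..W-1] and left[0..H-1].
static INLINE void intrapred_neon_usage_sketch(uint8_t *dst) {
  // dst must hold a 16x16 block stored with a stride of 16 bytes.
  uint8_t neighbours[1 + 16];  // neighbours[0] is the top-left pixel.
  uint8_t left[16];
  for (int i = 0; i < 1 + 16; ++i) neighbours[i] = 128;
  for (int i = 0; i < 16; ++i) left[i] = 128;
  aom_smooth_predictor_16x16_neon(dst, /*y_stride=*/16,
                                  /*above=*/neighbours + 1, left);
  aom_paeth_predictor_16x16_neon(dst, /*stride=*/16, /*above=*/neighbours + 1,
                                 left);
}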