1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <arm_neon.h>
13 #include <assert.h>
14 #include <stdint.h>
15 
16 #include "config/aom_config.h"
17 #include "config/aom_dsp_rtcd.h"
18 
19 #include "aom/aom_integer.h"
20 #include "aom_dsp/arm/mem_neon.h"
21 #include "aom_dsp/arm/reinterpret_neon.h"
22 #include "aom_dsp/arm/sum_neon.h"
23 #include "aom_dsp/arm/transpose_neon.h"
24 #include "aom_dsp/intrapred_common.h"
25 
26 //------------------------------------------------------------------------------
27 // DC 4x4
28 
29 static INLINE uint16x8_t dc_load_sum_4(const uint8_t *in) {
30   const uint8x8_t a = load_u8_4x1(in);
31   const uint16x4_t p0 = vpaddl_u8(a);
32   const uint16x4_t p1 = vpadd_u16(p0, p0);
33   return vcombine_u16(p1, vdup_n_u16(0));
34 }
35 
36 static INLINE void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
37                                 uint8x8_t dc) {
38   for (int i = 0; i < h; ++i) {
39     store_u8_4x1(dst + i * stride, dc);
40   }
41 }
42 
43 void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
44                                const uint8_t *above, const uint8_t *left) {
45   const uint16x8_t sum_top = dc_load_sum_4(above);
46   const uint16x8_t sum_left = dc_load_sum_4(left);
47   const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
48   const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);
49   dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
50 }
51 
52 void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
53                                     const uint8_t *above, const uint8_t *left) {
54   const uint16x8_t sum_left = dc_load_sum_4(left);
55   const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 2);
56   (void)above;
57   dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
58 }
59 
60 void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
61                                    const uint8_t *above, const uint8_t *left) {
62   const uint16x8_t sum_top = dc_load_sum_4(above);
63   const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 2);
64   (void)left;
65   dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
66 }
67 
68 void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
69                                    const uint8_t *above, const uint8_t *left) {
70   const uint8x8_t dc0 = vdup_n_u8(0x80);
71   (void)above;
72   (void)left;
73   dc_store_4xh(dst, stride, 4, dc0);
74 }
75 
76 //------------------------------------------------------------------------------
77 // DC 8x8
78 
79 static INLINE uint16x8_t dc_load_sum_8(const uint8_t *in) {
80   // This isn't used in the case where we want to load both above and left
81   // vectors, since we want to avoid performing the reduction twice.
82   const uint8x8_t a = vld1_u8(in);
83   const uint16x4_t p0 = vpaddl_u8(a);
84   const uint16x4_t p1 = vpadd_u16(p0, p0);
85   const uint16x4_t p2 = vpadd_u16(p1, p1);
86   return vcombine_u16(p2, vdup_n_u16(0));
87 }
88 
89 static INLINE uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) {
90 #if AOM_ARCH_AARCH64
91   // On AArch64 we could also use vdupq_n_u16(vaddvq_u16(a)) here to save an
92   // instruction, however the addv instruction is usually slightly more
93   // expensive than a pairwise addition, so the need for immediately
94   // broadcasting the result again seems to negate any benefit.
95   const uint16x8_t b = vpaddq_u16(a, a);
96   const uint16x8_t c = vpaddq_u16(b, b);
97   return vpaddq_u16(c, c);
98 #else
99   const uint16x4_t b = vadd_u16(vget_low_u16(a), vget_high_u16(a));
100   const uint16x4_t c = vpadd_u16(b, b);
101   const uint16x4_t d = vpadd_u16(c, c);
102   return vcombine_u16(d, d);
103 #endif
104 }
105 
106 static INLINE void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
107                                 uint8x8_t dc) {
108   for (int i = 0; i < h; ++i) {
109     vst1_u8(dst + i * stride, dc);
110   }
111 }
112 
113 void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
114                                const uint8_t *above, const uint8_t *left) {
115   const uint8x8_t sum_top = vld1_u8(above);
116   const uint8x8_t sum_left = vld1_u8(left);
117   uint16x8_t sum = vaddl_u8(sum_left, sum_top);
118   sum = horizontal_add_and_broadcast_u16x8(sum);
119   const uint8x8_t dc0 = vrshrn_n_u16(sum, 4);
120   dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
121 }
122 
123 void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
124                                     const uint8_t *above, const uint8_t *left) {
125   const uint16x8_t sum_left = dc_load_sum_8(left);
126   const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 3);
127   (void)above;
128   dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
129 }
130 
131 void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
132                                    const uint8_t *above, const uint8_t *left) {
133   const uint16x8_t sum_top = dc_load_sum_8(above);
134   const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 3);
135   (void)left;
136   dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
137 }
138 
139 void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
140                                    const uint8_t *above, const uint8_t *left) {
141   const uint8x8_t dc0 = vdup_n_u8(0x80);
142   (void)above;
143   (void)left;
144   dc_store_8xh(dst, stride, 8, dc0);
145 }
146 
147 //------------------------------------------------------------------------------
148 // DC 16x16
149 
150 static INLINE uint16x8_t dc_load_partial_sum_16(const uint8_t *in) {
151   const uint8x16_t a = vld1q_u8(in);
152   // delay the remainder of the reduction until
153   // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
154   // than twice in the case we are loading both above and left.
155   return vpaddlq_u8(a);
156 }
157 
158 static INLINE uint16x8_t dc_load_sum_16(const uint8_t *in) {
159   return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_16(in));
160 }
161 
162 static INLINE void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
163                                  uint8x16_t dc) {
164   for (int i = 0; i < h; ++i) {
165     vst1q_u8(dst + i * stride, dc);
166   }
167 }
168 
169 void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
170                                  const uint8_t *above, const uint8_t *left) {
171   const uint16x8_t sum_top = dc_load_partial_sum_16(above);
172   const uint16x8_t sum_left = dc_load_partial_sum_16(left);
173   uint16x8_t sum = vaddq_u16(sum_left, sum_top);
174   sum = horizontal_add_and_broadcast_u16x8(sum);
175   const uint8x8_t dc0 = vrshrn_n_u16(sum, 5);
176   dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
177 }
178 
179 void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
180                                       const uint8_t *above,
181                                       const uint8_t *left) {
182   const uint16x8_t sum_left = dc_load_sum_16(left);
183   const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 4);
184   (void)above;
185   dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
186 }
187 
188 void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
189                                      const uint8_t *above,
190                                      const uint8_t *left) {
191   const uint16x8_t sum_top = dc_load_sum_16(above);
192   const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 4);
193   (void)left;
194   dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
195 }
196 
197 void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
198                                      const uint8_t *above,
199                                      const uint8_t *left) {
200   const uint8x16_t dc0 = vdupq_n_u8(0x80);
201   (void)above;
202   (void)left;
203   dc_store_16xh(dst, stride, 16, dc0);
204 }
205 
206 //------------------------------------------------------------------------------
207 // DC 32x32
208 
209 static INLINE uint16x8_t dc_load_partial_sum_32(const uint8_t *in) {
210   const uint8x16_t a0 = vld1q_u8(in);
211   const uint8x16_t a1 = vld1q_u8(in + 16);
212   // delay the remainder of the reduction until
213   // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
214   // than twice in the case we are loading both above and left.
215   return vpadalq_u8(vpaddlq_u8(a0), a1);
216 }
217 
218 static INLINE uint16x8_t dc_load_sum_32(const uint8_t *in) {
219   return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_32(in));
220 }
221 
222 static INLINE void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
223                                  uint8x16_t dc) {
224   for (int i = 0; i < h; ++i) {
225     vst1q_u8(dst + i * stride, dc);
226     vst1q_u8(dst + i * stride + 16, dc);
227   }
228 }
229 
230 void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
231                                  const uint8_t *above, const uint8_t *left) {
232   const uint16x8_t sum_top = dc_load_partial_sum_32(above);
233   const uint16x8_t sum_left = dc_load_partial_sum_32(left);
234   uint16x8_t sum = vaddq_u16(sum_left, sum_top);
235   sum = horizontal_add_and_broadcast_u16x8(sum);
236   const uint8x8_t dc0 = vrshrn_n_u16(sum, 6);
237   dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
238 }
239 
240 void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
241                                       const uint8_t *above,
242                                       const uint8_t *left) {
243   const uint16x8_t sum_left = dc_load_sum_32(left);
244   const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 5);
245   (void)above;
246   dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
247 }
248 
249 void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
250                                      const uint8_t *above,
251                                      const uint8_t *left) {
252   const uint16x8_t sum_top = dc_load_sum_32(above);
253   const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 5);
254   (void)left;
255   dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
256 }
257 
258 void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
259                                      const uint8_t *above,
260                                      const uint8_t *left) {
261   const uint8x16_t dc0 = vdupq_n_u8(0x80);
262   (void)above;
263   (void)left;
264   dc_store_32xh(dst, stride, 32, dc0);
265 }
266 
267 //------------------------------------------------------------------------------
268 // DC 64x64
269 
270 static INLINE uint16x8_t dc_load_partial_sum_64(const uint8_t *in) {
271   const uint8x16_t a0 = vld1q_u8(in);
272   const uint8x16_t a1 = vld1q_u8(in + 16);
273   const uint8x16_t a2 = vld1q_u8(in + 32);
274   const uint8x16_t a3 = vld1q_u8(in + 48);
275   const uint16x8_t p01 = vpadalq_u8(vpaddlq_u8(a0), a1);
276   const uint16x8_t p23 = vpadalq_u8(vpaddlq_u8(a2), a3);
277   // delay the remainder of the reduction until
278   // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
279   // than twice in the case we are loading both above and left.
280   return vaddq_u16(p01, p23);
281 }
282 
283 static INLINE uint16x8_t dc_load_sum_64(const uint8_t *in) {
284   return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_64(in));
285 }
286 
287 static INLINE void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
288                                  uint8x16_t dc) {
289   for (int i = 0; i < h; ++i) {
290     vst1q_u8(dst + i * stride, dc);
291     vst1q_u8(dst + i * stride + 16, dc);
292     vst1q_u8(dst + i * stride + 32, dc);
293     vst1q_u8(dst + i * stride + 48, dc);
294   }
295 }
296 
297 void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
298                                  const uint8_t *above, const uint8_t *left) {
299   const uint16x8_t sum_top = dc_load_partial_sum_64(above);
300   const uint16x8_t sum_left = dc_load_partial_sum_64(left);
301   uint16x8_t sum = vaddq_u16(sum_left, sum_top);
302   sum = horizontal_add_and_broadcast_u16x8(sum);
303   const uint8x8_t dc0 = vrshrn_n_u16(sum, 7);
304   dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
305 }
306 
307 void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
308                                       const uint8_t *above,
309                                       const uint8_t *left) {
310   const uint16x8_t sum_left = dc_load_sum_64(left);
311   const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 6);
312   (void)above;
313   dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
314 }
315 
316 void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
317                                      const uint8_t *above,
318                                      const uint8_t *left) {
319   const uint16x8_t sum_top = dc_load_sum_64(above);
320   const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 6);
321   (void)left;
322   dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
323 }
324 
325 void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
326                                      const uint8_t *above,
327                                      const uint8_t *left) {
328   const uint8x16_t dc0 = vdupq_n_u8(0x80);
329   (void)above;
330   (void)left;
331   dc_store_64xh(dst, stride, 64, dc0);
332 }
333 
334 //------------------------------------------------------------------------------
335 // DC rectangular cases
336 
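// The rectangular DC predictors need to divide by (w + h), which is not a
// power of two. The division is performed as a shift by log2(min(w, h))
// followed by a fixed-point multiply and a final shift by DC_SHIFT2 (16):
// 0x5556 ~= 2^16 / 3 covers the 1:2 blocks (w + h == 3 * min(w, h)) and
// 0x3334 ~= 2^16 / 5 covers the 1:4 blocks (w + h == 5 * min(w, h)).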
337 #define DC_MULTIPLIER_1X2 0x5556
338 #define DC_MULTIPLIER_1X4 0x3334
339 
340 #define DC_SHIFT2 16
341 
342 static INLINE int divide_using_multiply_shift(int num, int shift1,
343                                               int multiplier, int shift2) {
344   const int interm = num >> shift1;
345   return interm * multiplier >> shift2;
346 }
347 
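// Approximates round(sum / (bw + bh)) without an integer division. For
// example, aom_dc_predictor_4x8_neon sums 4 + 8 = 12 border pixels and calls
// calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2), which evaluates
// ((sum + 6) >> 2) * 0x5556 >> 16, i.e. approximately (sum + 6) / 12.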
348 static INLINE int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
349                                         int shift1, int multiplier) {
350   const int expected_dc = divide_using_multiply_shift(
351       sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
352   assert(expected_dc < (1 << 8));
353   return expected_dc;
354 }
355 
356 #undef DC_SHIFT2
357 
358 void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
359                                const uint8_t *above, const uint8_t *left) {
360   uint8x8_t a = load_u8_4x1(above);
361   uint8x8_t l = vld1_u8(left);
362   uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
363   uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
364   dc_store_4xh(dst, stride, 8, vdup_n_u8(dc));
365 }
366 
367 void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
368                                const uint8_t *above, const uint8_t *left) {
369   uint8x8_t a = vld1_u8(above);
370   uint8x8_t l = load_u8_4x1(left);
371   uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
372   uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
373   dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
374 }
375 
376 void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
377                                 const uint8_t *above, const uint8_t *left) {
378   uint8x8_t a = load_u8_4x1(above);
379   uint8x16_t l = vld1q_u8(left);
380   uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
381   uint32_t sum = horizontal_add_u16x8(sum_al);
382   uint32_t dc = calculate_dc_from_sum(4, 16, sum, 2, DC_MULTIPLIER_1X4);
383   dc_store_4xh(dst, stride, 16, vdup_n_u8(dc));
384 }
385 
386 void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
387                                 const uint8_t *above, const uint8_t *left) {
388   uint8x16_t a = vld1q_u8(above);
389   uint8x8_t l = load_u8_4x1(left);
390   uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
391   uint32_t sum = horizontal_add_u16x8(sum_al);
392   uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
393   dc_store_16xh(dst, stride, 4, vdupq_n_u8(dc));
394 }
395 
396 void aom_dc_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
397                                 const uint8_t *above, const uint8_t *left) {
398   uint8x8_t a = vld1_u8(above);
399   uint8x16_t l = vld1q_u8(left);
400   uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
401   uint32_t sum = horizontal_add_u16x8(sum_al);
402   uint32_t dc = calculate_dc_from_sum(8, 16, sum, 3, DC_MULTIPLIER_1X2);
403   dc_store_8xh(dst, stride, 16, vdup_n_u8(dc));
404 }
405 
406 void aom_dc_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
407                                 const uint8_t *above, const uint8_t *left) {
408   uint8x16_t a = vld1q_u8(above);
409   uint8x8_t l = vld1_u8(left);
410   uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
411   uint32_t sum = horizontal_add_u16x8(sum_al);
412   uint32_t dc = calculate_dc_from_sum(16, 8, sum, 3, DC_MULTIPLIER_1X2);
413   dc_store_16xh(dst, stride, 8, vdupq_n_u8(dc));
414 }
415 
416 void aom_dc_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
417                                 const uint8_t *above, const uint8_t *left) {
418   uint8x8_t a = vld1_u8(above);
419   uint16x8_t sum_left = dc_load_partial_sum_32(left);
420   uint16x8_t sum_al = vaddw_u8(sum_left, a);
421   uint32_t sum = horizontal_add_u16x8(sum_al);
422   uint32_t dc = calculate_dc_from_sum(8, 32, sum, 3, DC_MULTIPLIER_1X4);
423   dc_store_8xh(dst, stride, 32, vdup_n_u8(dc));
424 }
425 
426 void aom_dc_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
427                                 const uint8_t *above, const uint8_t *left) {
428   uint16x8_t sum_top = dc_load_partial_sum_32(above);
429   uint8x8_t l = vld1_u8(left);
430   uint16x8_t sum_al = vaddw_u8(sum_top, l);
431   uint32_t sum = horizontal_add_u16x8(sum_al);
432   uint32_t dc = calculate_dc_from_sum(32, 8, sum, 3, DC_MULTIPLIER_1X4);
433   dc_store_32xh(dst, stride, 8, vdupq_n_u8(dc));
434 }
435 
436 void aom_dc_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
437                                  const uint8_t *above, const uint8_t *left) {
438   uint16x8_t sum_above = dc_load_partial_sum_16(above);
439   uint16x8_t sum_left = dc_load_partial_sum_32(left);
440   uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
441   uint32_t sum = horizontal_add_u16x8(sum_al);
442   uint32_t dc = calculate_dc_from_sum(16, 32, sum, 4, DC_MULTIPLIER_1X2);
443   dc_store_16xh(dst, stride, 32, vdupq_n_u8(dc));
444 }
445 
446 void aom_dc_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
447                                  const uint8_t *above, const uint8_t *left) {
448   uint16x8_t sum_above = dc_load_partial_sum_32(above);
449   uint16x8_t sum_left = dc_load_partial_sum_16(left);
450   uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
451   uint32_t sum = horizontal_add_u16x8(sum_al);
452   uint32_t dc = calculate_dc_from_sum(32, 16, sum, 4, DC_MULTIPLIER_1X2);
453   dc_store_32xh(dst, stride, 16, vdupq_n_u8(dc));
454 }
455 
456 void aom_dc_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
457                                  const uint8_t *above, const uint8_t *left) {
458   uint16x8_t sum_above = dc_load_partial_sum_16(above);
459   uint16x8_t sum_left = dc_load_partial_sum_64(left);
460   uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
461   uint32_t sum = horizontal_add_u16x8(sum_al);
462   uint32_t dc = calculate_dc_from_sum(16, 64, sum, 4, DC_MULTIPLIER_1X4);
463   dc_store_16xh(dst, stride, 64, vdupq_n_u8(dc));
464 }
465 
466 void aom_dc_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
467                                  const uint8_t *above, const uint8_t *left) {
468   uint16x8_t sum_above = dc_load_partial_sum_64(above);
469   uint16x8_t sum_left = dc_load_partial_sum_16(left);
470   uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
471   uint32_t sum = horizontal_add_u16x8(sum_al);
472   uint32_t dc = calculate_dc_from_sum(64, 16, sum, 4, DC_MULTIPLIER_1X4);
473   dc_store_64xh(dst, stride, 16, vdupq_n_u8(dc));
474 }
475 
476 void aom_dc_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
477                                  const uint8_t *above, const uint8_t *left) {
478   uint16x8_t sum_above = dc_load_partial_sum_32(above);
479   uint16x8_t sum_left = dc_load_partial_sum_64(left);
480   uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
481   uint32_t sum = horizontal_add_u16x8(sum_al);
482   uint32_t dc = calculate_dc_from_sum(32, 64, sum, 5, DC_MULTIPLIER_1X2);
483   dc_store_32xh(dst, stride, 64, vdupq_n_u8(dc));
484 }
485 
486 void aom_dc_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
487                                  const uint8_t *above, const uint8_t *left) {
488   uint16x8_t sum_above = dc_load_partial_sum_64(above);
489   uint16x8_t sum_left = dc_load_partial_sum_32(left);
490   uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
491   uint32_t sum = horizontal_add_u16x8(sum_al);
492   uint32_t dc = calculate_dc_from_sum(64, 32, sum, 5, DC_MULTIPLIER_1X2);
493   dc_store_64xh(dst, stride, 32, vdupq_n_u8(dc));
494 }
495 
496 #undef DC_MULTIPLIER_1X2
497 #undef DC_MULTIPLIER_1X4
498 
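// DC_128 predictors for the rectangular sizes: both borders are ignored and
// the block is filled with the 8-bit mid-grey value 0x80.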
499 #define DC_PREDICTOR_128(w, h, q)                                            \
500   void aom_dc_128_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
501                                              const uint8_t *above,           \
502                                              const uint8_t *left) {          \
503     (void)above;                                                             \
504     (void)left;                                                              \
505     dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u8(0x80));                \
506   }
507 
508 DC_PREDICTOR_128(4, 8, )
509 DC_PREDICTOR_128(4, 16, )
510 DC_PREDICTOR_128(8, 4, )
511 DC_PREDICTOR_128(8, 16, )
512 DC_PREDICTOR_128(8, 32, )
513 DC_PREDICTOR_128(16, 4, q)
514 DC_PREDICTOR_128(16, 8, q)
515 DC_PREDICTOR_128(16, 32, q)
516 DC_PREDICTOR_128(16, 64, q)
517 DC_PREDICTOR_128(32, 8, q)
518 DC_PREDICTOR_128(32, 16, q)
519 DC_PREDICTOR_128(32, 64, q)
520 DC_PREDICTOR_128(64, 32, q)
521 DC_PREDICTOR_128(64, 16, q)
522 
523 #undef DC_PREDICTOR_128
524 
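// DC_LEFT predictors: only the h left neighbours are averaged. The shift
// argument is log2(h), so vrshrn_n_u16 performs the rounded division by h.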
525 #define DC_PREDICTOR_LEFT(w, h, shift, q)                                     \
526   void aom_dc_left_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
527                                               const uint8_t *above,           \
528                                               const uint8_t *left) {          \
529     (void)above;                                                              \
530     const uint16x8_t sum = dc_load_sum_##h(left);                             \
531     const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift));                         \
532     dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0));            \
533   }
534 
535 DC_PREDICTOR_LEFT(4, 8, 3, )
536 DC_PREDICTOR_LEFT(8, 4, 2, )
537 DC_PREDICTOR_LEFT(8, 16, 4, )
538 DC_PREDICTOR_LEFT(16, 8, 3, q)
539 DC_PREDICTOR_LEFT(16, 32, 5, q)
540 DC_PREDICTOR_LEFT(32, 16, 4, q)
541 DC_PREDICTOR_LEFT(32, 64, 6, q)
542 DC_PREDICTOR_LEFT(64, 32, 5, q)
543 DC_PREDICTOR_LEFT(4, 16, 4, )
544 DC_PREDICTOR_LEFT(16, 4, 2, q)
545 DC_PREDICTOR_LEFT(8, 32, 5, )
546 DC_PREDICTOR_LEFT(32, 8, 3, q)
547 DC_PREDICTOR_LEFT(16, 64, 6, q)
548 DC_PREDICTOR_LEFT(64, 16, 4, q)
549 
550 #undef DC_PREDICTOR_LEFT
551 
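// DC_TOP predictors: only the w above neighbours are averaged. The shift
// argument is log2(w).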
552 #define DC_PREDICTOR_TOP(w, h, shift, q)                                     \
553   void aom_dc_top_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
554                                              const uint8_t *above,           \
555                                              const uint8_t *left) {          \
556     (void)left;                                                              \
557     const uint16x8_t sum = dc_load_sum_##w(above);                           \
558     const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift));                        \
559     dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0));           \
560   }
561 
562 DC_PREDICTOR_TOP(4, 8, 2, )
563 DC_PREDICTOR_TOP(4, 16, 2, )
564 DC_PREDICTOR_TOP(8, 4, 3, )
565 DC_PREDICTOR_TOP(8, 16, 3, )
566 DC_PREDICTOR_TOP(8, 32, 3, )
567 DC_PREDICTOR_TOP(16, 4, 4, q)
568 DC_PREDICTOR_TOP(16, 8, 4, q)
569 DC_PREDICTOR_TOP(16, 32, 4, q)
570 DC_PREDICTOR_TOP(16, 64, 4, q)
571 DC_PREDICTOR_TOP(32, 8, 5, q)
572 DC_PREDICTOR_TOP(32, 16, 5, q)
573 DC_PREDICTOR_TOP(32, 64, 5, q)
574 DC_PREDICTOR_TOP(64, 16, 6, q)
575 DC_PREDICTOR_TOP(64, 32, 6, q)
576 
577 #undef DC_PREDICTOR_TOP
578 
579 // -----------------------------------------------------------------------------
580 
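// Vertical ('V') prediction: every row of the block is a copy of the row of
// samples directly above the block.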
581 static INLINE void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
582                                uint8x8_t d0) {
583   for (int i = 0; i < h; ++i) {
584     store_u8_4x1(dst + i * stride, d0);
585   }
586 }
587 
588 static INLINE void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
589                                uint8x8_t d0) {
590   for (int i = 0; i < h; ++i) {
591     vst1_u8(dst + i * stride, d0);
592   }
593 }
594 
595 static INLINE void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
596                                 uint8x16_t d0) {
597   for (int i = 0; i < h; ++i) {
598     vst1q_u8(dst + i * stride, d0);
599   }
600 }
601 
602 static INLINE void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
603                                 uint8x16_t d0, uint8x16_t d1) {
604   for (int i = 0; i < h; ++i) {
605     vst1q_u8(dst + 0, d0);
606     vst1q_u8(dst + 16, d1);
607     dst += stride;
608   }
609 }
610 
611 static INLINE void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
612                                 uint8x16_t d0, uint8x16_t d1, uint8x16_t d2,
613                                 uint8x16_t d3) {
614   for (int i = 0; i < h; ++i) {
615     vst1q_u8(dst + 0, d0);
616     vst1q_u8(dst + 16, d1);
617     vst1q_u8(dst + 32, d2);
618     vst1q_u8(dst + 48, d3);
619     dst += stride;
620   }
621 }
622 
623 void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
624                               const uint8_t *above, const uint8_t *left) {
625   (void)left;
626   v_store_4xh(dst, stride, 4, load_u8_4x1(above));
627 }
628 
629 void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
630                               const uint8_t *above, const uint8_t *left) {
631   (void)left;
632   v_store_8xh(dst, stride, 8, vld1_u8(above));
633 }
634 
635 void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
636                                 const uint8_t *above, const uint8_t *left) {
637   (void)left;
638   v_store_16xh(dst, stride, 16, vld1q_u8(above));
639 }
640 
641 void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
642                                 const uint8_t *above, const uint8_t *left) {
643   const uint8x16_t d0 = vld1q_u8(above);
644   const uint8x16_t d1 = vld1q_u8(above + 16);
645   (void)left;
646   v_store_32xh(dst, stride, 32, d0, d1);
647 }
648 
649 void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
650                               const uint8_t *above, const uint8_t *left) {
651   (void)left;
652   v_store_4xh(dst, stride, 8, load_u8_4x1(above));
653 }
654 
655 void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
656                                const uint8_t *above, const uint8_t *left) {
657   (void)left;
658   v_store_4xh(dst, stride, 16, load_u8_4x1(above));
659 }
660 
661 void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
662                               const uint8_t *above, const uint8_t *left) {
663   (void)left;
664   v_store_8xh(dst, stride, 4, vld1_u8(above));
665 }
666 
667 void aom_v_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
668                                const uint8_t *above, const uint8_t *left) {
669   (void)left;
670   v_store_8xh(dst, stride, 16, vld1_u8(above));
671 }
672 
673 void aom_v_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
674                                const uint8_t *above, const uint8_t *left) {
675   (void)left;
676   v_store_8xh(dst, stride, 32, vld1_u8(above));
677 }
678 
679 void aom_v_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
680                                const uint8_t *above, const uint8_t *left) {
681   (void)left;
682   v_store_16xh(dst, stride, 4, vld1q_u8(above));
683 }
684 
685 void aom_v_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
686                                const uint8_t *above, const uint8_t *left) {
687   (void)left;
688   v_store_16xh(dst, stride, 8, vld1q_u8(above));
689 }
690 
691 void aom_v_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
692                                 const uint8_t *above, const uint8_t *left) {
693   (void)left;
694   v_store_16xh(dst, stride, 32, vld1q_u8(above));
695 }
696 
697 void aom_v_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
698                                 const uint8_t *above, const uint8_t *left) {
699   (void)left;
700   v_store_16xh(dst, stride, 64, vld1q_u8(above));
701 }
702 
703 void aom_v_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
704                                const uint8_t *above, const uint8_t *left) {
705   const uint8x16_t d0 = vld1q_u8(above);
706   const uint8x16_t d1 = vld1q_u8(above + 16);
707   (void)left;
708   v_store_32xh(dst, stride, 8, d0, d1);
709 }
710 
711 void aom_v_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
712                                 const uint8_t *above, const uint8_t *left) {
713   const uint8x16_t d0 = vld1q_u8(above);
714   const uint8x16_t d1 = vld1q_u8(above + 16);
715   (void)left;
716   v_store_32xh(dst, stride, 16, d0, d1);
717 }
718 
719 void aom_v_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
720                                 const uint8_t *above, const uint8_t *left) {
721   const uint8x16_t d0 = vld1q_u8(above);
722   const uint8x16_t d1 = vld1q_u8(above + 16);
723   (void)left;
724   v_store_32xh(dst, stride, 64, d0, d1);
725 }
726 
727 void aom_v_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
728                                 const uint8_t *above, const uint8_t *left) {
729   const uint8x16_t d0 = vld1q_u8(above);
730   const uint8x16_t d1 = vld1q_u8(above + 16);
731   const uint8x16_t d2 = vld1q_u8(above + 32);
732   const uint8x16_t d3 = vld1q_u8(above + 48);
733   (void)left;
734   v_store_64xh(dst, stride, 16, d0, d1, d2, d3);
735 }
736 
737 void aom_v_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
738                                 const uint8_t *above, const uint8_t *left) {
739   const uint8x16_t d0 = vld1q_u8(above);
740   const uint8x16_t d1 = vld1q_u8(above + 16);
741   const uint8x16_t d2 = vld1q_u8(above + 32);
742   const uint8x16_t d3 = vld1q_u8(above + 48);
743   (void)left;
744   v_store_64xh(dst, stride, 32, d0, d1, d2, d3);
745 }
746 
747 void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
748                                 const uint8_t *above, const uint8_t *left) {
749   const uint8x16_t d0 = vld1q_u8(above);
750   const uint8x16_t d1 = vld1q_u8(above + 16);
751   const uint8x16_t d2 = vld1q_u8(above + 32);
752   const uint8x16_t d3 = vld1q_u8(above + 48);
753   (void)left;
754   v_store_64xh(dst, stride, 64, d0, d1, d2, d3);
755 }
756 
757 // -----------------------------------------------------------------------------
758 
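// Horizontal ('H') prediction: row i of the block is filled with left[i].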
759 static INLINE void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
760   store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
761   store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
762   store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
763   store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
764   store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4));
765   store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5));
766   store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6));
767   store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7));
768 }
769 
770 static INLINE void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
771   vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
772   vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
773   vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
774   vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
775   vst1_u8(dst + 4 * stride, vdup_lane_u8(d0, 4));
776   vst1_u8(dst + 5 * stride, vdup_lane_u8(d0, 5));
777   vst1_u8(dst + 6 * stride, vdup_lane_u8(d0, 6));
778   vst1_u8(dst + 7 * stride, vdup_lane_u8(d0, 7));
779 }
780 
781 static INLINE void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
782   vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
783   vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
784   vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
785   vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
786   vst1q_u8(dst + 4 * stride, vdupq_lane_u8(d0, 4));
787   vst1q_u8(dst + 5 * stride, vdupq_lane_u8(d0, 5));
788   vst1q_u8(dst + 6 * stride, vdupq_lane_u8(d0, 6));
789   vst1q_u8(dst + 7 * stride, vdupq_lane_u8(d0, 7));
790 }
791 
792 static INLINE void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
793   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
794   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
795   dst += stride;
796   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
797   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
798   dst += stride;
799   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
800   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
801   dst += stride;
802   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
803   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
804   dst += stride;
805   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
806   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
807   dst += stride;
808   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
809   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
810   dst += stride;
811   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
812   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
813   dst += stride;
814   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
815   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
816 }
817 
818 static INLINE void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
819   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
820   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
821   vst1q_u8(dst + 32, vdupq_lane_u8(d0, 0));
822   vst1q_u8(dst + 48, vdupq_lane_u8(d0, 0));
823   dst += stride;
824   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
825   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
826   vst1q_u8(dst + 32, vdupq_lane_u8(d0, 1));
827   vst1q_u8(dst + 48, vdupq_lane_u8(d0, 1));
828   dst += stride;
829   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
830   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
831   vst1q_u8(dst + 32, vdupq_lane_u8(d0, 2));
832   vst1q_u8(dst + 48, vdupq_lane_u8(d0, 2));
833   dst += stride;
834   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
835   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
836   vst1q_u8(dst + 32, vdupq_lane_u8(d0, 3));
837   vst1q_u8(dst + 48, vdupq_lane_u8(d0, 3));
838   dst += stride;
839   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
840   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
841   vst1q_u8(dst + 32, vdupq_lane_u8(d0, 4));
842   vst1q_u8(dst + 48, vdupq_lane_u8(d0, 4));
843   dst += stride;
844   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
845   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
846   vst1q_u8(dst + 32, vdupq_lane_u8(d0, 5));
847   vst1q_u8(dst + 48, vdupq_lane_u8(d0, 5));
848   dst += stride;
849   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
850   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
851   vst1q_u8(dst + 32, vdupq_lane_u8(d0, 6));
852   vst1q_u8(dst + 48, vdupq_lane_u8(d0, 6));
853   dst += stride;
854   vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
855   vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
856   vst1q_u8(dst + 32, vdupq_lane_u8(d0, 7));
857   vst1q_u8(dst + 48, vdupq_lane_u8(d0, 7));
858 }
859 
860 void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
861                               const uint8_t *above, const uint8_t *left) {
862   const uint8x8_t d0 = load_u8_4x1(left);
863   (void)above;
864   store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
865   store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
866   store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
867   store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
868 }
869 
870 void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
871                               const uint8_t *above, const uint8_t *left) {
872   const uint8x8_t d0 = vld1_u8(left);
873   (void)above;
874   h_store_8x8(dst, stride, d0);
875 }
876 
877 void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
878                                 const uint8_t *above, const uint8_t *left) {
879   const uint8x16_t d0 = vld1q_u8(left);
880   (void)above;
881   h_store_16x8(dst, stride, vget_low_u8(d0));
882   h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
883 }
884 
885 void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
886                                 const uint8_t *above, const uint8_t *left) {
887   const uint8x16_t d0 = vld1q_u8(left);
888   const uint8x16_t d1 = vld1q_u8(left + 16);
889   (void)above;
890   h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
891   h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
892   h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
893   h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
894 }
895 
896 void aom_h_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
897                               const uint8_t *above, const uint8_t *left) {
898   const uint8x8_t d0 = vld1_u8(left);
899   (void)above;
900   h_store_4x8(dst, stride, d0);
901 }
902 
903 void aom_h_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
904                                const uint8_t *above, const uint8_t *left) {
905   const uint8x16_t d0 = vld1q_u8(left);
906   (void)above;
907   h_store_4x8(dst + 0 * stride, stride, vget_low_u8(d0));
908   h_store_4x8(dst + 8 * stride, stride, vget_high_u8(d0));
909 }
910 
911 void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
912                               const uint8_t *above, const uint8_t *left) {
913   const uint8x8_t d0 = load_u8_4x1(left);
914   (void)above;
915   vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
916   vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
917   vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
918   vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
919 }
920 
921 void aom_h_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
922                                const uint8_t *above, const uint8_t *left) {
923   const uint8x16_t d0 = vld1q_u8(left);
924   (void)above;
925   h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
926   h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
927 }
928 
929 void aom_h_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
930                                const uint8_t *above, const uint8_t *left) {
931   const uint8x16_t d0 = vld1q_u8(left);
932   const uint8x16_t d1 = vld1q_u8(left + 16);
933   (void)above;
934   h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
935   h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
936   h_store_8x8(dst + 16 * stride, stride, vget_low_u8(d1));
937   h_store_8x8(dst + 24 * stride, stride, vget_high_u8(d1));
938 }
939 
940 void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
941                                const uint8_t *above, const uint8_t *left) {
942   const uint8x8_t d0 = load_u8_4x1(left);
943   (void)above;
944   vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
945   vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
946   vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
947   vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
948 }
949 
950 void aom_h_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
951                                const uint8_t *above, const uint8_t *left) {
952   const uint8x8_t d0 = vld1_u8(left);
953   (void)above;
954   h_store_16x8(dst, stride, d0);
955 }
956 
957 void aom_h_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
958                                 const uint8_t *above, const uint8_t *left) {
959   const uint8x16_t d0 = vld1q_u8(left);
960   const uint8x16_t d1 = vld1q_u8(left + 16);
961   (void)above;
962   h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
963   h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
964   h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
965   h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
966 }
967 
968 void aom_h_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
969                                 const uint8_t *above, const uint8_t *left) {
970   const uint8x16_t d0 = vld1q_u8(left);
971   const uint8x16_t d1 = vld1q_u8(left + 16);
972   const uint8x16_t d2 = vld1q_u8(left + 32);
973   const uint8x16_t d3 = vld1q_u8(left + 48);
974   (void)above;
975   h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
976   h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
977   h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
978   h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
979   h_store_16x8(dst + 32 * stride, stride, vget_low_u8(d2));
980   h_store_16x8(dst + 40 * stride, stride, vget_high_u8(d2));
981   h_store_16x8(dst + 48 * stride, stride, vget_low_u8(d3));
982   h_store_16x8(dst + 56 * stride, stride, vget_high_u8(d3));
983 }
984 
985 void aom_h_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
986                                const uint8_t *above, const uint8_t *left) {
987   const uint8x8_t d0 = vld1_u8(left);
988   (void)above;
989   h_store_32x8(dst, stride, d0);
990 }
991 
992 void aom_h_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
993                                 const uint8_t *above, const uint8_t *left) {
994   const uint8x16_t d0 = vld1q_u8(left);
995   (void)above;
996   h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
997   h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
998 }
999 
1000 void aom_h_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
1001                                 const uint8_t *above, const uint8_t *left) {
1002   const uint8x16_t d0 = vld1q_u8(left + 0);
1003   const uint8x16_t d1 = vld1q_u8(left + 16);
1004   const uint8x16_t d2 = vld1q_u8(left + 32);
1005   const uint8x16_t d3 = vld1q_u8(left + 48);
1006   (void)above;
1007   h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
1008   h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
1009   h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
1010   h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
1011   h_store_32x8(dst + 32 * stride, stride, vget_low_u8(d2));
1012   h_store_32x8(dst + 40 * stride, stride, vget_high_u8(d2));
1013   h_store_32x8(dst + 48 * stride, stride, vget_low_u8(d3));
1014   h_store_32x8(dst + 56 * stride, stride, vget_high_u8(d3));
1015 }
1016 
1017 void aom_h_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
1018                                 const uint8_t *above, const uint8_t *left) {
1019   const uint8x16_t d0 = vld1q_u8(left);
1020   (void)above;
1021   h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
1022   h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
1023 }
1024 
1025 void aom_h_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
1026                                 const uint8_t *above, const uint8_t *left) {
1027   (void)above;
1028   for (int i = 0; i < 2; ++i) {
1029     const uint8x16_t d0 = vld1q_u8(left);
1030     h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
1031     h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
1032     left += 16;
1033     dst += 16 * stride;
1034   }
1035 }
1036 
1037 void aom_h_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
1038                                 const uint8_t *above, const uint8_t *left) {
1039   (void)above;
1040   for (int i = 0; i < 4; ++i) {
1041     const uint8x16_t d0 = vld1q_u8(left);
1042     h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
1043     h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
1044     left += 16;
1045     dst += 16 * stride;
1046   }
1047 }
1048 
1049 /* ---------------------P R E D I C T I O N   Z 1--------------------------- */
1050 
1051 // Low bit depth functions
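// BaseMask[i] consists of i leading 0xff bytes followed by zeros. The
// directional (Z1) code below uses it as a byte-select mask so that lanes
// beyond the last valid reference sample keep the value at max_base_x.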
1052 static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
1053   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1054     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1055   { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1056     0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1057   { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1058     0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1059   { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1060     0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1061   { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1062     0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1063   { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1064     0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1065   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1066     0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1067   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1068     0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
1069   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
1070     0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
1071   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
1072     0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
1073   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
1074     0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
1075     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
1076   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1077     0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
1078     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
1079   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1080     0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
1081     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
1082   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1083     0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
1084     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
1085   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1086     0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
1087     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
1088   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1089     0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
1090     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
1091   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1092     0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
1093     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
1094   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1095     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
1096     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
1097   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1098     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
1099     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
1100   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1101     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
1102     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
1103   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1104     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
1105     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
1106   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1107     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
1108     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
1109   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1110     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1111     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
1112   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1113     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1114     0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
1115   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1116     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1117     0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
1118   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1119     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1120     0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
1121   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1122     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1123     0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
1124   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1125     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1126     0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
1127   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1128     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1129     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
1130   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1131     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1132     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
1133   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1134     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1135     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
1136   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1137     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1138     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
1139   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1140     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1141     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
1142 };
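// BaseMask[n] has its first n bytes set to 0xff and the remaining 32 - n bytes
// cleared. It is used with BSL to keep interpolated pixels whose reference is
// still inside the above array and to substitute the replicated last above
// pixel (a_mbase_x) elsewhere. As an illustrative example:
//   vbsl_u8(vld1_u8(BaseMask[3]), interpolated, a_mbase_x)
// keeps bytes 0..2 of `interpolated` and fills the rest with a_mbase_x.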
1143 
1144 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
1145     int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
1146     int dx) {
1147   const int frac_bits = 6 - upsample_above;
1148   const int max_base_x = ((W + H) - 1) << upsample_above;
1149 
1150   assert(dx > 0);
1151   // pre-filter above pixels
1152   // store in temp buffers:
1153   //   above[x] * 32 + 16
1154   //   above[x+1] - above[x]
1155   // final pixels will be calculated as:
1156   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
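  //
  // As a scalar sketch per output element (non-upsampled path, same formula as
  // above):
  //   int base  = x >> 6;           // integer sample position
  //   int shift = (x & 0x3f) >> 1;  // 5-bit interpolation fraction
  //   out = (above[base] * 32 + 16 +
  //          (above[base + 1] - above[base]) * shift) >> 5;
  // e.g. with dx == 32, row 0 has x == 32 (base 0, shift 16: the midpoint of
  // above[0] and above[1]) and row 1 has x == 64, which lands exactly on
  // above[1].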
1157 
1158   const uint8x8_t a_mbase_x = vdup_n_u8(above[max_base_x]);
1159 
1160   int x = dx;
1161   for (int r = 0; r < W; r++) {
1162     int base = x >> frac_bits;
1163     int base_max_diff = (max_base_x - base) >> upsample_above;
1164     if (base_max_diff <= 0) {
1165       for (int i = r; i < W; ++i) {
1166         dst[i] = a_mbase_x;  // fill the remaining rows with the last above pixel
1167       }
1168       return;
1169     }
1170 
1171     if (base_max_diff > H) base_max_diff = H;
1172 
1173     uint8x8x2_t a01_128;
1174     uint16x8_t shift;
1175     if (upsample_above) {
1176       a01_128 = vld2_u8(above + base);
1177       shift = vdupq_n_u16(((x << upsample_above) & 0x3f) >> 1);
1178     } else {
1179       a01_128.val[0] = vld1_u8(above + base);
1180       a01_128.val[1] = vld1_u8(above + base + 1);
1181       shift = vdupq_n_u16((x & 0x3f) >> 1);
1182     }
1183     uint16x8_t diff = vsubl_u8(a01_128.val[1], a01_128.val[0]);
1184     uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a01_128.val[0], vdup_n_u8(32));
1185     uint16x8_t res = vmlaq_u16(a32, diff, shift);
1186 
1187     uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
1188     dst[r] = vbsl_u8(mask, vshrn_n_u16(res, 5), a_mbase_x);
1189 
1190     x += dx;
1191   }
1192 }
1193 
1194 static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1195                                       const uint8_t *above, int upsample_above,
1196                                       int dx) {
1197   uint8x8_t dstvec[16];
1198 
1199   dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
1200                                         dx);
1201   for (int i = 0; i < N; i++) {
1202     vst1_lane_u32((uint32_t *)(dst + stride * i),
1203                   vreinterpret_u32_u8(dstvec[i]), 0);
1204   }
1205 }
1206 
1207 static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1208                                       const uint8_t *above, int upsample_above,
1209                                       int dx) {
1210   uint8x8_t dstvec[32];
1211 
1212   dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
1213                                         dx);
1214   for (int i = 0; i < N; i++) {
1215     vst1_u8(dst + stride * i, dstvec[i]);
1216   }
1217 }
1218 
1219 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
1220     int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
1221     int dx) {
1222   const int frac_bits = 6 - upsample_above;
1223   const int max_base_x = ((W + H) - 1) << upsample_above;
1224 
1225   assert(dx > 0);
1226   // pre-filter above pixels
1227   // store in temp buffers:
1228   //   above[x] * 32 + 16
1229   //   above[x+1] - above[x]
1230   // final pixels will be calculated as:
1231   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1232 
1233   const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
1234 
1235   int x = dx;
1236   for (int r = 0; r < W; r++) {
1237     int base = x >> frac_bits;
1238     int base_max_diff = (max_base_x - base) >> upsample_above;
1239     if (base_max_diff <= 0) {
1240       for (int i = r; i < W; ++i) {
1241         dst[i] = a_mbase_x;  // fill the remaining rows with the last above pixel
1242       }
1243       return;
1244     }
1245 
1246     if (base_max_diff > H) base_max_diff = H;
1247 
1248     uint16x8_t shift;
1249     uint8x16_t a0_128, a1_128;
1250     if (upsample_above) {
1251       uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
1252       a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
1253       a1_128 = vextq_u8(a0_128, vdupq_n_u8(0), 8);
1254       shift = vdupq_n_u16(x & 0x1f);
1255     } else {
1256       a0_128 = vld1q_u8(above + base);
1257       a1_128 = vld1q_u8(above + base + 1);
1258       shift = vdupq_n_u16((x & 0x3f) >> 1);
1259     }
1260     uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
1261     uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
1262     uint16x8_t a32_lo =
1263         vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
1264     uint16x8_t a32_hi =
1265         vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
1266     uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
1267     uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
1268     uint8x16_t v_temp =
1269         vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
1270 
1271     uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
1272     dst[r] = vbslq_u8(mask, v_temp, a_mbase_x);
1273 
1274     x += dx;
1275   }
1276 }
1277 
1278 static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1279                                        const uint8_t *above, int upsample_above,
1280                                        int dx) {
1281   uint8x16_t dstvec[64];
1282 
1283   dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
1284   for (int i = 0; i < N; i++) {
1285     vst1q_u8(dst + stride * i, dstvec[i]);
1286   }
1287 }
1288 
1289 static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
1290     int N, uint8x16x2_t *dstvec, const uint8_t *above, int dx) {
1291   const int frac_bits = 6;
1292   const int max_base_x = ((32 + N) - 1);
1293 
1294   // pre-filter above pixels
1295   // store in temp buffers:
1296   //   above[x] * 32 + 16
1297   //   above[x+1] - above[x]
1298   // final pixels will be calculated as:
1299   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1300 
1301   const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
1302 
1303   int x = dx;
1304   for (int r = 0; r < N; r++) {
1305     int base = x >> frac_bits;
1306     int base_max_diff = (max_base_x - base);
1307     if (base_max_diff <= 0) {
1308       for (int i = r; i < N; ++i) {
1309         dstvec[i].val[0] = a_mbase_x;  // save 32 values
1310         dstvec[i].val[1] = a_mbase_x;
1311       }
1312       return;
1313     }
1314     if (base_max_diff > 32) base_max_diff = 32;
1315 
1316     uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
1317 
1318     uint8x16_t res16[2];
1319     for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
1320       int mdiff = base_max_diff - j;
1321       if (mdiff <= 0) {
1322         res16[jj] = a_mbase_x;
1323       } else {
1324         uint8x16_t a0_128 = vld1q_u8(above + base + j);
1325         uint8x16_t a1_128 = vld1q_u8(above + base + j + 1);
1326         uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
1327         uint16x8_t diff_hi =
1328             vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
1329         uint16x8_t a32_lo =
1330             vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
1331         uint16x8_t a32_hi =
1332             vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
1333         uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
1334         uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
1335 
1336         res16[jj] = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
1337       }
1338     }
1339 
1340     uint8x16_t mask_lo = vld1q_u8(BaseMask[base_max_diff]);
1341     uint8x16_t mask_hi = vld1q_u8(BaseMask[base_max_diff] + 16);
1342     dstvec[r].val[0] = vbslq_u8(mask_lo, res16[0], a_mbase_x);
1343     dstvec[r].val[1] = vbslq_u8(mask_hi, res16[1], a_mbase_x);
1344     x += dx;
1345   }
1346 }
1347 
1348 static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1349                                        const uint8_t *above, int dx) {
1350   uint8x16x2_t dstvec[64];
1351 
1352   dr_prediction_z1_32xN_internal_neon(N, dstvec, above, dx);
1353   for (int i = 0; i < N; i++) {
1354     vst1q_u8(dst + stride * i, dstvec[i].val[0]);
1355     vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
1356   }
1357 }
1358 
1359 // clang-format off
1360 static const uint8_t kLoadMaxShuffles[] = {
1361   15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1362   14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1363   13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1364   12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1365   11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1366   10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1367    9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1368    8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1369    7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15,
1370    6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15,
1371    5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15,
1372    4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15,
1373    3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15,
1374    2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15,
1375    1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15,
1376    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
1377 };
1378 // clang-format on
1379 
1380 static INLINE uint8x16_t z1_load_masked_neon(const uint8_t *ptr,
1381                                              int shuffle_idx) {
1382   uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
1383   uint8x16_t src = vld1q_u8(ptr);
1384 #if AOM_ARCH_AARCH64
1385   return vqtbl1q_u8(src, shuffle);
1386 #else
1387   uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
1388   uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
1389   uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
1390   return vcombine_u8(lo, hi);
1391 #endif
1392 }
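// A note on the masked loads above: z1_load_masked_neon is always called with
// ptr == above + (max_base_x - 15), so src byte 15 is above[max_base_x]. Row
// `shuffle_idx` of kLoadMaxShuffles starts at byte 15 - shuffle_idx, which
// makes the result equivalent to a plain 16-byte load beginning shuffle_idx
// elements before above[max_base_x], with every element past above[max_base_x]
// replaced by above[max_base_x] rather than read from memory. For instance,
// shuffle_idx == 2 yields { above[max_base_x - 2], above[max_base_x - 1],
// above[max_base_x], above[max_base_x], ... }.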
1393 
1394 static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1395                                        const uint8_t *above, int dx) {
1396   const int frac_bits = 6;
1397   const int max_base_x = ((64 + N) - 1);
1398 
1399   // pre-filter above pixels
1400   // store in temp buffers:
1401   //   above[x] * 32 + 16
1402   //   above[x+1] - above[x]
1403   // final pixels will be calculated as:
1404   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1405 
1406   const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
1407 
1408   int x = dx;
1409   for (int r = 0; r < N; r++, dst += stride) {
1410     int base = x >> frac_bits;
1411     if (base >= max_base_x) {
1412       for (int i = r; i < N; ++i) {
1413         vst1q_u8(dst, a_mbase_x);
1414         vst1q_u8(dst + 16, a_mbase_x);
1415         vst1q_u8(dst + 32, a_mbase_x);
1416         vst1q_u8(dst + 48, a_mbase_x);
1417         dst += stride;
1418       }
1419       return;
1420     }
1421 
1422     uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
1423     uint8x16_t base_inc128 =
1424         vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
1425                                                vcreate_u8(0x0F0E0D0C0B0A0908)));
1426 
1427     for (int j = 0; j < 64; j += 16) {
1428       if (base + j >= max_base_x) {
1429         vst1q_u8(dst + j, a_mbase_x);
1430       } else {
1431         uint8x16_t a0_128;
1432         uint8x16_t a1_128;
1433         if (base + j + 15 >= max_base_x) {
1434           int shuffle_idx = max_base_x - base - j;
1435           a0_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
1436         } else {
1437           a0_128 = vld1q_u8(above + base + j);
1438         }
1439         if (base + j + 16 >= max_base_x) {
1440           int shuffle_idx = max_base_x - base - j - 1;
1441           a1_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
1442         } else {
1443           a1_128 = vld1q_u8(above + base + j + 1);
1444         }
1445 
1446         uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
1447         uint16x8_t diff_hi =
1448             vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
1449         uint16x8_t a32_lo =
1450             vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
1451         uint16x8_t a32_hi =
1452             vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
1453         uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
1454         uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
1455         vst1q_u8(dst + j,
1456                  vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)));
1457 
1458         base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16));
1459       }
1460     }
1461     x += dx;
1462   }
1463 }
1464 
1465 // Directional prediction, zone 1: 0 < angle < 90
1466 void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
1467                                const uint8_t *above, const uint8_t *left,
1468                                int upsample_above, int dx, int dy) {
1469   (void)left;
1470   (void)dy;
1471 
1472   switch (bw) {
1473     case 4:
1474       dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
1475       break;
1476     case 8:
1477       dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
1478       break;
1479     case 16:
1480       dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
1481       break;
1482     case 32: dr_prediction_z1_32xN_neon(bh, dst, stride, above, dx); break;
1483     case 64: dr_prediction_z1_64xN_neon(bh, dst, stride, above, dx); break;
1484     default: break;
1485   }
1486 }
1487 
1488 /* ---------------------P R E D I C T I O N   Z 2--------------------------- */
1489 
1490 // TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed.
1491 #if AOM_ARCH_AARCH64
1492 #if !AOM_ARCH_AARCH64
1493 static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = {
1494   { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1495   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
1496   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
1497     0, 0, 0 },
1498   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1499     0xff, 0xff, 0xff, 0xff }
1500 };
1501 #endif  // !AOM_ARCH_AARCH64
1502 
1503 static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon(
1504     const uint8_t *above, int upsample_above, int dx, int base_x, int y,
1505     uint8x8_t *a0_x, uint8x8_t *a1_x, uint16x4_t *shift0) {
1506   uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
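  // r6 holds the per-column positions { 0, 1, 2, 3 } << 6 in 1/64-pel units,
  // so r6 - ydx is the sampling position of each of the four columns in this
  // row; the masked (and, without upsampling, halved) result below is the
  // 5-bit interpolation fraction.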
1507   uint16x4_t ydx = vdup_n_u16(y * dx);
1508   if (upsample_above) {
1509     // Cannot use LD2 here since we only want to load eight bytes, but LD2 can
1510     // only load either 16 or 32.
1511     uint8x8_t v_tmp = vld1_u8(above + base_x);
1512     *a0_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[0];
1513     *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1];
1514     *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f));
1515   } else {
1516     *a0_x = load_u8_4x1(above + base_x);
1517     *a1_x = load_u8_4x1(above + base_x + 1);
1518     *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f));
1519   }
1520 }
1521 
1522 static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_left_neon(
1523 #if AOM_ARCH_AARCH64
1524     uint8x16x2_t left_vals,
1525 #else
1526     const uint8_t *left,
1527 #endif
1528     int upsample_left, int dy, int r, int min_base_y, int frac_bits_y,
1529     uint16x4_t *a0_y, uint16x4_t *a1_y, uint16x4_t *shift1) {
1530   int16x4_t dy64 = vdup_n_s16(dy);
1531   int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
1532   int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
1533   int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
1534   int16x4_t v_r6 = vdup_n_s16(r << 6);
1535   int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
1536   int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
1537 
1538   // Values in base_y_c64 range from -2 through 14 inclusive.
1539   base_y_c64 = vmax_s16(base_y_c64, min_base_y64);
1540 
1541 #if AOM_ARCH_AARCH64
1542   uint8x8_t left_idx0 =
1543       vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2)));  // [0, 16]
1544   uint8x8_t left_idx1 =
1545       vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3)));  // [1, 17]
1546 
1547   *a0_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx0));
1548   *a1_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx1));
1549 #else   // !AOM_ARCH_AARCH64
1550   DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
1551 
1552   vst1_s16(base_y_c, base_y_c64);
1553   uint8x8_t a0_y_u8 = vdup_n_u8(0);
1554   a0_y_u8 = vld1_lane_u8(left + base_y_c[0], a0_y_u8, 0);
1555   a0_y_u8 = vld1_lane_u8(left + base_y_c[1], a0_y_u8, 2);
1556   a0_y_u8 = vld1_lane_u8(left + base_y_c[2], a0_y_u8, 4);
1557   a0_y_u8 = vld1_lane_u8(left + base_y_c[3], a0_y_u8, 6);
1558 
1559   base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1));
1560   vst1_s16(base_y_c, base_y_c64);
1561   uint8x8_t a1_y_u8 = vdup_n_u8(0);
1562   a1_y_u8 = vld1_lane_u8(left + base_y_c[0], a1_y_u8, 0);
1563   a1_y_u8 = vld1_lane_u8(left + base_y_c[1], a1_y_u8, 2);
1564   a1_y_u8 = vld1_lane_u8(left + base_y_c[2], a1_y_u8, 4);
1565   a1_y_u8 = vld1_lane_u8(left + base_y_c[3], a1_y_u8, 6);
1566 
1567   *a0_y = vreinterpret_u16_u8(a0_y_u8);
1568   *a1_y = vreinterpret_u16_u8(a1_y_u8);
1569 #endif  // AOM_ARCH_AARCH64
1570 
1571   if (upsample_left) {
1572     *shift1 = vand_u16(vreinterpret_u16_s16(y_c64), vdup_n_u16(0x1f));
1573   } else {
1574     *shift1 =
1575         vand_u16(vshr_n_u16(vreinterpret_u16_s16(y_c64), 1), vdup_n_u16(0x1f));
1576   }
1577 }
1578 
1579 static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_above_neon(
1580     const uint8_t *above, int upsample_above, int dx, int base_x, int y) {
1581   uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
1582                                   vcreate_u16(0x0008000700060005));
1583   uint16x8_t ydx = vdupq_n_u16(y * dx);
1584   uint16x8_t r6 = vshlq_n_u16(vextq_u16(c1234, vdupq_n_u16(0), 2), 6);
1585 
1586   uint16x8_t shift0;
1587   uint8x8_t a0_x0;
1588   uint8x8_t a1_x0;
1589   if (upsample_above) {
1590     uint8x8x2_t v_tmp = vld2_u8(above + base_x);
1591     a0_x0 = v_tmp.val[0];
1592     a1_x0 = v_tmp.val[1];
1593     shift0 = vandq_u16(vsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
1594   } else {
1595     a0_x0 = vld1_u8(above + base_x);
1596     a1_x0 = vld1_u8(above + base_x + 1);
1597     shift0 = vandq_u16(vhsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
1598   }
1599 
1600   uint16x8_t diff0 = vsubl_u8(a1_x0, a0_x0);  // a[x+1] - a[x]
1601   uint16x8_t a32 =
1602       vmlal_u8(vdupq_n_u16(16), a0_x0, vdup_n_u8(32));  // a[x] * 32 + 16
1603   uint16x8_t res = vmlaq_u16(a32, diff0, shift0);
1604   return vshrn_n_u16(res, 5);
1605 }
1606 
1607 static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_left_neon(
1608 #if AOM_ARCH_AARCH64
1609     uint8x16x3_t left_vals,
1610 #else
1611     const uint8_t *left,
1612 #endif
1613     int upsample_left, int dy, int r, int min_base_y, int frac_bits_y) {
1614   int16x8_t v_r6 = vdupq_n_s16(r << 6);
1615   int16x8_t dy128 = vdupq_n_s16(dy);
1616   int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
1617   int16x8_t min_base_y128 = vdupq_n_s16(min_base_y);
1618 
1619   uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
1620                                   vcreate_u16(0x0008000700060005));
1621   int16x8_t y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
1622   int16x8_t base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
1623 
1624   // Values in base_y_c128 range from -2 through 31 inclusive.
1625   base_y_c128 = vmaxq_s16(base_y_c128, min_base_y128);
1626 
1627 #if AOM_ARCH_AARCH64
1628   uint8x16_t left_idx0 =
1629       vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2)));  // [0, 33]
1630   uint8x16_t left_idx1 =
1631       vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3)));  // [1, 34]
1632   uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
1633 
1634   uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
1635   uint8x8_t a0_x1 = vget_low_u8(a01_x);
1636   uint8x8_t a1_x1 = vget_high_u8(a01_x);
1637 #else   // !AOM_ARCH_AARCH64
1638   uint8x8_t a0_x1 = load_u8_gather_s16_x8(left, base_y_c128);
1639   uint8x8_t a1_x1 = load_u8_gather_s16_x8(left + 1, base_y_c128);
1640 #endif  // AOM_ARCH_AARCH64
1641 
1642   uint16x8_t shift1;
1643   if (upsample_left) {
1644     shift1 = vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x1f));
1645   } else {
1646     shift1 = vshrq_n_u16(
1647         vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x3f)), 1);
1648   }
1649 
1650   uint16x8_t diff1 = vsubl_u8(a1_x1, a0_x1);
1651   uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x1, vdup_n_u8(32));
1652   uint16x8_t res = vmlaq_u16(a32, diff1, shift1);
1653   return vshrn_n_u16(res, 5);
1654 }
1655 
1656 static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_above_neon(
1657     const uint8_t *above, int dx, int base_x, int y, int j) {
1658   uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
1659                                         vcreate_u16(0x0007000600050004)),
1660                            vcombine_u16(vcreate_u16(0x000B000A00090008),
1661                                         vcreate_u16(0x000F000E000D000C)) } };
1662   uint16x8_t j256 = vdupq_n_u16(j);
1663   uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx));
1664 
1665   const uint8x16_t a0_x128 = vld1q_u8(above + base_x + j);
1666   const uint8x16_t a1_x128 = vld1q_u8(above + base_x + j + 1);
1667   uint16x8_t res6_0 = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
1668   uint16x8_t res6_1 = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
1669   uint16x8_t shift0 =
1670       vshrq_n_u16(vandq_u16(vsubq_u16(res6_0, ydx), vdupq_n_u16(0x3f)), 1);
1671   uint16x8_t shift1 =
1672       vshrq_n_u16(vandq_u16(vsubq_u16(res6_1, ydx), vdupq_n_u16(0x3f)), 1);
1673   // a[x+1] - a[x]
1674   uint16x8_t diff0 = vsubl_u8(vget_low_u8(a1_x128), vget_low_u8(a0_x128));
1675   uint16x8_t diff1 = vsubl_u8(vget_high_u8(a1_x128), vget_high_u8(a0_x128));
1676   // a[x] * 32 + 16
1677   uint16x8_t a32_0 =
1678       vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_x128), vdup_n_u8(32));
1679   uint16x8_t a32_1 =
1680       vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_x128), vdup_n_u8(32));
1681   uint16x8_t res0 = vmlaq_u16(a32_0, diff0, shift0);
1682   uint16x8_t res1 = vmlaq_u16(a32_1, diff1, shift1);
1683   return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
1684 }
1685 
1686 static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_left_neon(
1687 #if AOM_ARCH_AARCH64
1688     uint8x16x4_t left_vals0, uint8x16x4_t left_vals1,
1689 #else
1690     const uint8_t *left,
1691 #endif
1692     int dy, int r, int j) {
1693   // here upsample_above and upsample_left are 0 by design of
1694   // av1_use_intra_edge_upsample
1695   const int min_base_y = -1;
1696 
1697   int16x8_t min_base_y256 = vdupq_n_s16(min_base_y);
1698   int16x8_t half_min_base_y256 = vdupq_n_s16(min_base_y >> 1);
1699   int16x8_t dy256 = vdupq_n_s16(dy);
1700   uint16x8_t j256 = vdupq_n_u16(j);
1701 
1702   uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
1703                                         vcreate_u16(0x0007000600050004)),
1704                            vcombine_u16(vcreate_u16(0x000B000A00090008),
1705                                         vcreate_u16(0x000F000E000D000C)) } };
1706   uint16x8x2_t c1234 = { { vaddq_u16(c0123.val[0], vdupq_n_u16(1)),
1707                            vaddq_u16(c0123.val[1], vdupq_n_u16(1)) } };
1708 
1709   int16x8_t v_r6 = vdupq_n_s16(r << 6);
1710 
1711   int16x8_t c256_0 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[0]));
1712   int16x8_t c256_1 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[1]));
1713   int16x8_t mul16_lo = vreinterpretq_s16_u16(
1714       vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_0, dy256)),
1715                 vreinterpretq_u16_s16(half_min_base_y256)));
1716   int16x8_t mul16_hi = vreinterpretq_s16_u16(
1717       vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_1, dy256)),
1718                 vreinterpretq_u16_s16(half_min_base_y256)));
1719   int16x8_t y_c256_lo = vsubq_s16(v_r6, mul16_lo);
1720   int16x8_t y_c256_hi = vsubq_s16(v_r6, mul16_hi);
1721 
1722   int16x8_t base_y_c256_lo = vshrq_n_s16(y_c256_lo, 6);
1723   int16x8_t base_y_c256_hi = vshrq_n_s16(y_c256_hi, 6);
1724 
1725   base_y_c256_lo = vmaxq_s16(min_base_y256, base_y_c256_lo);
1726   base_y_c256_hi = vmaxq_s16(min_base_y256, base_y_c256_hi);
1727 
1728 #if !AOM_ARCH_AARCH64
1729   int16_t min_y = vgetq_lane_s16(base_y_c256_hi, 7);
1730   int16_t max_y = vgetq_lane_s16(base_y_c256_lo, 0);
1731   int16_t offset_diff = max_y - min_y;
1732 
1733   uint8x8_t a0_y0;
1734   uint8x8_t a0_y1;
1735   uint8x8_t a1_y0;
1736   uint8x8_t a1_y1;
1737   if (offset_diff < 16) {
1738     // Avoid gathers where the data we want is close together in memory.
1739     // We don't need this for AArch64 since we can already use TBL to cover the
1740     // full range of possible values.
1741     assert(offset_diff >= 0);
1742     int16x8_t min_y256 = vdupq_lane_s16(vget_high_s16(base_y_c256_hi), 3);
1743 
1744     int16x8x2_t base_y_offset;
1745     base_y_offset.val[0] = vsubq_s16(base_y_c256_lo, min_y256);
1746     base_y_offset.val[1] = vsubq_s16(base_y_c256_hi, min_y256);
1747 
1748     int8x16_t base_y_offset128 = vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
1749                                              vqmovn_s16(base_y_offset.val[1]));
1750 
1751     uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
1752     uint8x16_t a0_y128 = vld1q_u8(left + min_y);
1753     uint8x16_t a1_y128 = vld1q_u8(left + min_y + 1);
1754     a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
1755     a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
1756 
1757     uint8x8_t v_index_low = vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
1758     uint8x8_t v_index_high =
1759         vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
1760     uint8x8x2_t v_tmp, v_res;
1761     v_tmp.val[0] = vget_low_u8(a0_y128);
1762     v_tmp.val[1] = vget_high_u8(a0_y128);
1763     v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1764     v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1765     a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1766     v_tmp.val[0] = vget_low_u8(a1_y128);
1767     v_tmp.val[1] = vget_high_u8(a1_y128);
1768     v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1769     v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1770     a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1771 
1772     a0_y0 = vget_low_u8(a0_y128);
1773     a0_y1 = vget_high_u8(a0_y128);
1774     a1_y0 = vget_low_u8(a1_y128);
1775     a1_y1 = vget_high_u8(a1_y128);
1776   } else {
1777     a0_y0 = load_u8_gather_s16_x8(left, base_y_c256_lo);
1778     a0_y1 = load_u8_gather_s16_x8(left, base_y_c256_hi);
1779     a1_y0 = load_u8_gather_s16_x8(left + 1, base_y_c256_lo);
1780     a1_y1 = load_u8_gather_s16_x8(left + 1, base_y_c256_hi);
1781   }
1782 #else
1783   // Values in left_idx{0,1} range from 0 through 63 inclusive.
1784   uint8x16_t left_idx0 =
1785       vreinterpretq_u8_s16(vaddq_s16(base_y_c256_lo, vdupq_n_s16(1)));
1786   uint8x16_t left_idx1 =
1787       vreinterpretq_u8_s16(vaddq_s16(base_y_c256_hi, vdupq_n_s16(1)));
1788   uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
1789 
1790   uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01);
1791   uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01);
1792 
1793   uint8x8_t a0_y0 = vget_low_u8(a0_y01);
1794   uint8x8_t a0_y1 = vget_high_u8(a0_y01);
1795   uint8x8_t a1_y0 = vget_low_u8(a1_y01);
1796   uint8x8_t a1_y1 = vget_high_u8(a1_y01);
1797 #endif  // !AOM_ARCH_AARCH64
1798 
1799   uint16x8_t shifty_lo = vshrq_n_u16(
1800       vandq_u16(vreinterpretq_u16_s16(y_c256_lo), vdupq_n_u16(0x3f)), 1);
1801   uint16x8_t shifty_hi = vshrq_n_u16(
1802       vandq_u16(vreinterpretq_u16_s16(y_c256_hi), vdupq_n_u16(0x3f)), 1);
1803 
1804   // a[x+1] - a[x]
1805   uint16x8_t diff_lo = vsubl_u8(a1_y0, a0_y0);
1806   uint16x8_t diff_hi = vsubl_u8(a1_y1, a0_y1);
1807   // a[x] * 32 + 16
1808   uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), a0_y0, vdup_n_u8(32));
1809   uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), a0_y1, vdup_n_u8(32));
1810 
1811   uint16x8_t res0 = vmlaq_u16(a32_lo, diff_lo, shifty_lo);
1812   uint16x8_t res1 = vmlaq_u16(a32_hi, diff_hi, shifty_hi);
1813 
1814   return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
1815 }
1816 
1817 static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
1818                                       const uint8_t *above, const uint8_t *left,
1819                                       int upsample_above, int upsample_left,
1820                                       int dx, int dy) {
1821   const int min_base_x = -(1 << upsample_above);
1822   const int min_base_y = -(1 << upsample_left);
1823   const int frac_bits_x = 6 - upsample_above;
1824   const int frac_bits_y = 6 - upsample_left;
1825 
1826   assert(dx > 0);
1827   // pre-filter above pixels
1828   // store in temp buffers:
1829   //   above[x] * 32 + 16
1830   //   above[x+1] - above[x]
1831   // final pixels will be calculated as:
1832   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1833 
1834 #if AOM_ARCH_AARCH64
1835   // Use ext rather than loading left + 14 directly to avoid over-read.
1836   const uint8x16_t left_m2 = vld1q_u8(left - 2);
1837   const uint8x16_t left_0 = vld1q_u8(left);
1838   const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14);
1839   const uint8x16x2_t left_vals = { { left_m2, left_14 } };
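  // left_m2 and left_14 form a 32-byte TBL table covering left[-2] .. left[15];
  // dr_prediction_z2_Nx4_left_neon indexes it with base_y + 2, so the smallest
  // base (-2) maps to table entry 0.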
1840 #define LEFT left_vals
1841 #else  // !AOM_ARCH_AARCH64
1842 #define LEFT left
1843 #endif  // AOM_ARCH_AARCH64
1844 
1845   for (int r = 0; r < N; r++) {
1846     int y = r + 1;
1847     int base_x = (-y * dx) >> frac_bits_x;
1848     const int base_min_diff =
1849         (min_base_x - ((-y * dx) >> frac_bits_x) + upsample_above) >>
1850         upsample_above;
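    // base_min_diff is the number of leading columns in this row whose
    // reference position falls to the left of the above array; those columns
    // take the left-derived prediction (resy), the rest the above-derived one
    // (resx).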
1851 
1852     if (base_min_diff <= 0) {
1853       uint8x8_t a0_x_u8, a1_x_u8;
1854       uint16x4_t shift0;
1855       dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
1856                                       &a0_x_u8, &a1_x_u8, &shift0);
1857       uint8x8_t a0_x = a0_x_u8;
1858       uint8x8_t a1_x = a1_x_u8;
1859 
1860       uint16x8_t diff = vsubl_u8(a1_x, a0_x);  // a[x+1] - a[x]
1861       uint16x8_t a32 =
1862           vmlal_u8(vdupq_n_u16(16), a0_x, vdup_n_u8(32));  // a[x] * 32 + 16
1863       uint16x8_t res =
1864           vmlaq_u16(a32, diff, vcombine_u16(shift0, vdup_n_u16(0)));
1865       uint8x8_t resx = vshrn_n_u16(res, 5);
1866       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resx), 0);
1867     } else if (base_min_diff < 4) {
1868       uint8x8_t a0_x_u8, a1_x_u8;
1869       uint16x4_t shift0;
1870       dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
1871                                       &a0_x_u8, &a1_x_u8, &shift0);
1872       uint16x8_t a0_x = vmovl_u8(a0_x_u8);
1873       uint16x8_t a1_x = vmovl_u8(a1_x_u8);
1874 
1875       uint16x4_t a0_y;
1876       uint16x4_t a1_y;
1877       uint16x4_t shift1;
1878       dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
1879                                      frac_bits_y, &a0_y, &a1_y, &shift1);
1880       a0_x = vcombine_u16(vget_low_u16(a0_x), a0_y);
1881       a1_x = vcombine_u16(vget_low_u16(a1_x), a1_y);
1882 
1883       uint16x8_t shift = vcombine_u16(shift0, shift1);
1884       uint16x8_t diff = vsubq_u16(a1_x, a0_x);  // a[x+1] - a[x]
1885       uint16x8_t a32 =
1886           vmlaq_n_u16(vdupq_n_u16(16), a0_x, 32);  // a[x] * 32 + 16
1887       uint16x8_t res = vmlaq_u16(a32, diff, shift);
1888       uint8x8_t resx = vshrn_n_u16(res, 5);
1889       uint8x8_t resy = vext_u8(resx, vdup_n_u8(0), 4);
1890 
1891       uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1892       uint8x8_t v_resxy = vbsl_u8(mask, resy, resx);
1893       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
1894     } else {
1895       uint16x4_t a0_y, a1_y;
1896       uint16x4_t shift1;
1897       dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
1898                                      frac_bits_y, &a0_y, &a1_y, &shift1);
1899       uint16x4_t diff = vsub_u16(a1_y, a0_y);                 // a[x+1] - a[x]
1900       uint16x4_t a32 = vmla_n_u16(vdup_n_u16(16), a0_y, 32);  // a[x] * 32 + 16
1901       uint16x4_t res = vmla_u16(a32, diff, shift1);
1902       uint8x8_t resy = vshrn_n_u16(vcombine_u16(res, vdup_n_u16(0)), 5);
1903 
1904       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resy), 0);
1905     }
1906 
1907     dst += stride;
1908   }
1909 #undef LEFT
1910 }
1911 
1912 static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
1913                                       const uint8_t *above, const uint8_t *left,
1914                                       int upsample_above, int upsample_left,
1915                                       int dx, int dy) {
1916   const int min_base_x = -(1 << upsample_above);
1917   const int min_base_y = -(1 << upsample_left);
1918   const int frac_bits_x = 6 - upsample_above;
1919   const int frac_bits_y = 6 - upsample_left;
1920 
1921   // pre-filter above pixels
1922   // store in temp buffers:
1923   //   above[x] * 32 + 16
1924   //   above[x+1] - above[x]
1925   // final pixels will be calculated as:
1926   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1927 
1928 #if AOM_ARCH_AARCH64
1929   // Use ext rather than loading left + 30 directly to avoid over-read.
1930   const uint8x16_t left_m2 = vld1q_u8(left - 2);
1931   const uint8x16_t left_0 = vld1q_u8(left + 0);
1932   const uint8x16_t left_16 = vld1q_u8(left + 16);
1933   const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14);
1934   const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14);
1935   const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } };
1936 #define LEFT left_vals
1937 #else  // !AOM_ARCH_AARCH64
1938 #define LEFT left
1939 #endif  // AOM_ARCH_AARCH64
1940 
1941   for (int r = 0; r < N; r++) {
1942     int y = r + 1;
1943     int base_x = (-y * dx) >> frac_bits_x;
1944     int base_min_diff =
1945         (min_base_x - base_x + upsample_above) >> upsample_above;
1946 
1947     if (base_min_diff <= 0) {
1948       uint8x8_t resx =
1949           dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
1950       vst1_u8(dst, resx);
1951     } else if (base_min_diff < 8) {
1952       uint8x8_t resx =
1953           dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
1954       uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
1955           LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
1956       uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1957       uint8x8_t resxy = vbsl_u8(mask, resy, resx);
1958       vst1_u8(dst, resxy);
1959     } else {
1960       uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
1961           LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
1962       vst1_u8(dst, resy);
1963     }
1964 
1965     dst += stride;
1966   }
1967 #undef LEFT
1968 }
1969 
1970 static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
1971                                       ptrdiff_t stride, const uint8_t *above,
1972                                       const uint8_t *left, int dx, int dy) {
1973   // here upsample_above and upsample_left are 0 by design of
1974   // av1_use_intra_edge_upsample
1975   const int min_base_x = -1;
1976 
1977 #if AOM_ARCH_AARCH64
1978   const uint8x16_t left_m1 = vld1q_u8(left - 1);
1979   const uint8x16_t left_0 = vld1q_u8(left + 0);
1980   const uint8x16_t left_16 = vld1q_u8(left + 16);
1981   const uint8x16_t left_32 = vld1q_u8(left + 32);
1982   const uint8x16_t left_48 = vld1q_u8(left + 48);
1983   const uint8x16_t left_15 = vextq_u8(left_0, left_16, 15);
1984   const uint8x16_t left_31 = vextq_u8(left_16, left_32, 15);
1985   const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15);
1986   const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } };
1987   const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } };
1988 #define LEFT left_vals0, left_vals1
1989 #else  // !AOM_ARCH_AARCH64
1990 #define LEFT left
1991 #endif  // AOM_ARCH_AARCH64
1992 
1993   for (int r = 0; r < H; r++) {
1994     int y = r + 1;
1995     int base_x = (-y * dx) >> 6;
1996     for (int j = 0; j < W; j += 16) {
1997       const int base_min_diff = min_base_x - base_x - j;
1998 
1999       if (base_min_diff <= 0) {
2000         uint8x16_t resx =
2001             dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
2002         vst1q_u8(dst + j, resx);
2003       } else if (base_min_diff < 16) {
2004         uint8x16_t resx =
2005             dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
2006         uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
2007         uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
2008         uint8x16_t resxy = vbslq_u8(mask, resy, resx);
2009         vst1q_u8(dst + j, resxy);
2010       } else {
2011         uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
2012         vst1q_u8(dst + j, resy);
2013       }
2014     }  // for j
2015     dst += stride;
2016   }
2017 #undef LEFT
2018 }
2019 
2020 // Directional prediction, zone 2: 90 < angle < 180
2021 void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
2022                                const uint8_t *above, const uint8_t *left,
2023                                int upsample_above, int upsample_left, int dx,
2024                                int dy) {
2025   assert(dx > 0);
2026   assert(dy > 0);
2027 
2028   switch (bw) {
2029     case 4:
2030       dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
2031                                 upsample_left, dx, dy);
2032       break;
2033     case 8:
2034       dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
2035                                 upsample_left, dx, dy);
2036       break;
2037     default:
2038       dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, dx, dy);
2039       break;
2040   }
2041 }
2042 #endif  // AOM_ARCH_AARCH64
2043 
2044 /* ---------------------P R E D I C T I O N   Z 3--------------------------- */
2045 
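// Zone 3 uses only the left column: each block size below runs the
// corresponding zone 1 kernel on `left` (treating it as the row above and
// stepping by dy) and then transposes the result into the destination.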
2046 static AOM_FORCE_INLINE void z3_transpose_arrays_u8_16x4(const uint8x16_t *x,
2047                                                          uint8x16x2_t *d) {
2048   uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
2049   uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
2050 
2051   d[0] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
2052                                               vreinterpretq_u16_u8(w1.val[0])));
2053   d[1] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
2054                                               vreinterpretq_u16_u8(w1.val[1])));
2055 }
2056 
2057 static AOM_FORCE_INLINE void z3_transpose_arrays_u8_4x4(const uint8x8_t *x,
2058                                                         uint8x8x2_t *d) {
2059   uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
2060   uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
2061 
2062   *d = aom_reinterpret_u8_u16_x2(
2063       vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
2064 }
2065 
2066 static AOM_FORCE_INLINE void z3_transpose_arrays_u8_8x4(const uint8x8_t *x,
2067                                                         uint8x8x2_t *d) {
2068   uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
2069   uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
2070 
2071   d[0] = aom_reinterpret_u8_u16_x2(
2072       vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
2073   d[1] = aom_reinterpret_u8_u16_x2(
2074       vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])));
2075 }
2076 
2077 static void z3_transpose_arrays_u8_16x16(const uint8_t *src, ptrdiff_t pitchSrc,
2078                                          uint8_t *dst, ptrdiff_t pitchDst) {
2079   // The same as the normal transposes in transpose_neon.h, but with a stride
2080   // between consecutive vectors of elements.
2081   uint8x16_t r[16];
2082   uint8x16_t d[16];
2083   for (int i = 0; i < 16; i++) {
2084     r[i] = vld1q_u8(src + i * pitchSrc);
2085   }
2086   transpose_arrays_u8_16x16(r, d);
2087   for (int i = 0; i < 16; i++) {
2088     vst1q_u8(dst + i * pitchDst, d[i]);
2089   }
2090 }
2091 
2092 static void z3_transpose_arrays_u8_16nx16n(const uint8_t *src,
2093                                            ptrdiff_t pitchSrc, uint8_t *dst,
2094                                            ptrdiff_t pitchDst, int width,
2095                                            int height) {
2096   for (int j = 0; j < height; j += 16) {
2097     for (int i = 0; i < width; i += 16) {
2098       z3_transpose_arrays_u8_16x16(src + i * pitchSrc + j, pitchSrc,
2099                                    dst + j * pitchDst + i, pitchDst);
2100     }
2101   }
2102 }
2103 
2104 static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
2105                                       const uint8_t *left, int upsample_left,
2106                                       int dy) {
2107   uint8x8_t dstvec[4];
2108   uint8x8x2_t dest;
2109 
2110   dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
2111   z3_transpose_arrays_u8_4x4(dstvec, &dest);
2112   store_u8x4_strided_x2(dst + stride * 0, stride, dest.val[0]);
2113   store_u8x4_strided_x2(dst + stride * 2, stride, dest.val[1]);
2114 }
2115 
2116 static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
2117                                       const uint8_t *left, int upsample_left,
2118                                       int dy) {
2119   uint8x8_t dstvec[8];
2120   uint8x8_t d[8];
2121 
2122   dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
2123   transpose_arrays_u8_8x8(dstvec, d);
2124   store_u8_8x8(dst, stride, d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
2125 }
2126 
2127 static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
2128                                       const uint8_t *left, int upsample_left,
2129                                       int dy) {
2130   uint8x8_t dstvec[4];
2131   uint8x8x2_t d[2];
2132 
2133   dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
2134   z3_transpose_arrays_u8_8x4(dstvec, d);
2135   store_u8x4_strided_x2(dst + stride * 0, stride, d[0].val[0]);
2136   store_u8x4_strided_x2(dst + stride * 2, stride, d[0].val[1]);
2137   store_u8x4_strided_x2(dst + stride * 4, stride, d[1].val[0]);
2138   store_u8x4_strided_x2(dst + stride * 6, stride, d[1].val[1]);
2139 }
2140 
2141 static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
2142                                       const uint8_t *left, int upsample_left,
2143                                       int dy) {
2144   uint8x8_t dstvec[8];
2145   uint8x8_t d[8];
2146 
2147   dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
2148   transpose_arrays_u8_8x8(dstvec, d);
2149   store_u8_8x4(dst, stride, d[0], d[1], d[2], d[3]);
2150 }
2151 
2152 static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
2153                                        const uint8_t *left, int upsample_left,
2154                                        int dy) {
2155   uint8x16_t dstvec[8];
2156   uint8x8_t d[16];
2157 
2158   dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
2159   transpose_arrays_u8_16x8(dstvec, d);
2160   for (int i = 0; i < 16; i++) {
2161     vst1_u8(dst + i * stride, d[i]);
2162   }
2163 }
2164 
2165 static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
2166                                        const uint8_t *left, int upsample_left,
2167                                        int dy) {
2168   uint8x8_t dstvec[16];
2169   uint8x16_t d[8];
2170 
2171   dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
2172   transpose_arrays_u8_8x16(dstvec, d);
2173   for (int i = 0; i < 8; i++) {
2174     vst1q_u8(dst + i * stride, d[i]);
2175   }
2176 }
2177 
2178 static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
2179                                        const uint8_t *left, int upsample_left,
2180                                        int dy) {
2181   uint8x16_t dstvec[4];
2182   uint8x16x2_t d[2];
2183 
2184   dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
2185   z3_transpose_arrays_u8_16x4(dstvec, d);
2186   store_u8x4_strided_x4(dst + stride * 0, stride, d[0].val[0]);
2187   store_u8x4_strided_x4(dst + stride * 4, stride, d[0].val[1]);
2188   store_u8x4_strided_x4(dst + stride * 8, stride, d[1].val[0]);
2189   store_u8x4_strided_x4(dst + stride * 12, stride, d[1].val[1]);
2190 }
2191 
2192 static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
2193                                        const uint8_t *left, int upsample_left,
2194                                        int dy) {
2195   uint8x8_t dstvec[16];
2196   uint8x16_t d[8];
2197 
2198   dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
2199   transpose_arrays_u8_8x16(dstvec, d);
2200   for (int i = 0; i < 4; i++) {
2201     vst1q_u8(dst + i * stride, d[i]);
2202   }
2203 }
2204 
2205 static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
2206                                        const uint8_t *left, int upsample_left,
2207                                        int dy) {
2208   (void)upsample_left;
2209   uint8x16x2_t dstvec[16];
2210   uint8x16_t d[32];
2211   uint8x16_t v_zero = vdupq_n_u8(0);
2212 
2213   dr_prediction_z1_32xN_internal_neon(8, dstvec, left, dy);
2214   for (int i = 8; i < 16; i++) {
2215     dstvec[i].val[0] = v_zero;
2216     dstvec[i].val[1] = v_zero;
2217   }
2218   transpose_arrays_u8_32x16(dstvec, d);
2219   for (int i = 0; i < 32; i++) {
2220     vst1_u8(dst + i * stride, vget_low_u8(d[i]));
2221   }
2222 }
2223 
2224 static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
2225                                        const uint8_t *left, int upsample_left,
2226                                        int dy) {
2227   uint8x8_t dstvec[32];
2228   uint8x16_t d[16];
2229 
2230   dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
2231   transpose_arrays_u8_8x16(dstvec, d);
2232   transpose_arrays_u8_8x16(dstvec + 16, d + 8);
2233   for (int i = 0; i < 8; i++) {
2234     vst1q_u8(dst + i * stride, d[i]);
2235     vst1q_u8(dst + i * stride + 16, d[i + 8]);
2236   }
2237 }
2238 
2239 static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
2240                                         const uint8_t *left, int upsample_left,
2241                                         int dy) {
2242   uint8x16_t dstvec[16];
2243   uint8x16_t d[16];
2244 
2245   dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
2246   transpose_arrays_u8_16x16(dstvec, d);
2247   for (int i = 0; i < 16; i++) {
2248     vst1q_u8(dst + i * stride, d[i]);
2249   }
2250 }
2251 
2252 static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
2253                                         const uint8_t *left, int upsample_left,
2254                                         int dy) {
2255   (void)upsample_left;
2256   uint8x16x2_t dstvec[32];
2257   uint8x16_t d[64];
2258 
2259   dr_prediction_z1_32xN_internal_neon(32, dstvec, left, dy);
2260   transpose_arrays_u8_32x16(dstvec, d);
2261   transpose_arrays_u8_32x16(dstvec + 16, d + 32);
2262   for (int i = 0; i < 32; i++) {
2263     vst1q_u8(dst + i * stride, d[i]);
2264     vst1q_u8(dst + i * stride + 16, d[i + 32]);
2265   }
2266 }
2267 
2268 static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
2269                                         const uint8_t *left, int upsample_left,
2270                                         int dy) {
2271   (void)upsample_left;
2272   DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
2273 
2274   dr_prediction_z1_64xN_neon(64, dstT, 64, left, dy);
2275   z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 64, 64);
2276 }
2277 
2278 static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
2279                                         const uint8_t *left, int upsample_left,
2280                                         int dy) {
2281   (void)upsample_left;
2282   uint8x16x2_t dstvec[16];
2283   uint8x16_t d[32];
2284 
2285   dr_prediction_z1_32xN_internal_neon(16, dstvec, left, dy);
2286   transpose_arrays_u8_32x16(dstvec, d);
2287   for (int i = 0; i < 16; i++) {
2288     vst1q_u8(dst + 2 * i * stride, d[2 * i + 0]);
2289     vst1q_u8(dst + (2 * i + 1) * stride, d[2 * i + 1]);
2290   }
2291 }
2292 
2293 static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
2294                                         const uint8_t *left, int upsample_left,
2295                                         int dy) {
2296   uint8x16_t dstvec[32];
2297 
2298   dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
2299   for (int i = 0; i < 32; i += 16) {
2300     uint8x16_t d[16];
2301     transpose_arrays_u8_16x16(dstvec + i, d);
2302     for (int j = 0; j < 16; j++) {
2303       vst1q_u8(dst + j * stride + i, d[j]);
2304     }
2305   }
2306 }
2307 
2308 static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
2309                                         const uint8_t *left, int upsample_left,
2310                                         int dy) {
2311   (void)upsample_left;
2312   uint8_t dstT[64 * 32];
2313 
2314   dr_prediction_z1_64xN_neon(32, dstT, 64, left, dy);
2315   z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 32, 64);
2316 }
2317 
2318 static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
2319                                         const uint8_t *left, int upsample_left,
2320                                         int dy) {
2321   (void)upsample_left;
2322   uint8_t dstT[32 * 64];
2323 
2324   dr_prediction_z1_32xN_neon(64, dstT, 32, left, dy);
2325   z3_transpose_arrays_u8_16nx16n(dstT, 32, dst, stride, 64, 32);
2326 }
2327 
2328 static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
2329                                         const uint8_t *left, int upsample_left,
2330                                         int dy) {
2331   (void)upsample_left;
2332   uint8_t dstT[64 * 16];
2333 
2334   dr_prediction_z1_64xN_neon(16, dstT, 64, left, dy);
2335   z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 16, 64);
2336 }
2337 
2338 static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
2339                                         const uint8_t *left, int upsample_left,
2340                                         int dy) {
2341   uint8x16_t dstvec[64];
2342 
2343   dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
2344   for (int i = 0; i < 64; i += 16) {
2345     uint8x16_t d[16];
2346     transpose_arrays_u8_16x16(dstvec + i, d);
2347     for (int j = 0; j < 16; ++j) {
2348       vst1q_u8(dst + j * stride + i, d[j]);
2349     }
2350   }
2351 }
2352 
2353 typedef void (*dr_prediction_z3_fn)(uint8_t *dst, ptrdiff_t stride,
2354                                     const uint8_t *left, int upsample_left,
2355                                     int dy);
2356 
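// Indexed by [get_msb(bw)][get_msb(bh)], i.e. log2 of the block dimensions,
// so e.g. a 16x32 block resolves to dr_prediction_z3_arr[4][5]. Each handler
// computes a z1-style prediction from the left edge and transposes it into
// place; a scalar sketch of the idea (illustrative only, z1_scalar is a
// hypothetical stand-in for the z1 helpers above):
//
//   uint8_t tmp[bh * bw];  // bh wide, bw tall
//   z1_scalar(tmp, /*stride=*/bh, /*width=*/bh, /*height=*/bw, left,
//             upsample_left, dy);
//   for (int r = 0; r < bh; ++r)
//     for (int c = 0; c < bw; ++c) dst[r * stride + c] = tmp[c * bh + r];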
2357 static dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = {
2358   { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2359   { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2360   { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon,
2361     dr_prediction_z3_4x16_neon, NULL, NULL },
2362   { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon,
2363     dr_prediction_z3_8x16_neon, dr_prediction_z3_8x32_neon, NULL },
2364   { NULL, NULL, dr_prediction_z3_16x4_neon, dr_prediction_z3_16x8_neon,
2365     dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon,
2366     dr_prediction_z3_16x64_neon },
2367   { NULL, NULL, NULL, dr_prediction_z3_32x8_neon, dr_prediction_z3_32x16_neon,
2368     dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon },
2369   { NULL, NULL, NULL, NULL, dr_prediction_z3_64x16_neon,
2370     dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon },
2371 };
2372 
2373 void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
2374                                const uint8_t *above, const uint8_t *left,
2375                                int upsample_left, int dx, int dy) {
2376   (void)above;
2377   (void)dx;
2378   assert(dx == 1);
2379   assert(dy > 0);
2380 
2381   dr_prediction_z3_fn f = dr_prediction_z3_arr[get_msb(bw)][get_msb(bh)];
2382   assert(f != NULL);
2383   f(dst, stride, left, upsample_left, dy);
2384 }
2385 
2386 // -----------------------------------------------------------------------------
2387 // SMOOTH_PRED
2388 
2389 // 256 - v = vneg_s8(v): the nonzero weights are < 256, so the complement
// weight is simply the byte-wise (mod-256) negation of v.
2390 static INLINE uint8x8_t negate_s8(const uint8x8_t v) {
2391   return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
2392 }
2393 
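// Scalar sketch of the SMOOTH predictor computed by the functions below
// (illustrative only; the NEON code splits the sum in two and uses a halving
// add so that intermediates stay within 16 bits):
//
//   // smooth_weights[] stores the per-size weight tables back to back
//   // (4, 8, 16, 32, 64 entries), so the weights for an N-point edge start
//   // at offset N - 4. w_h/w_w below are the height/width slices.
//   sum = w_h[r] * top[c] + (256 - w_h[r]) * bottom_left +
//         w_w[c] * left[r] + (256 - w_w[c]) * top_right;
//   dst[r * stride + c] = (sum + (1 << SMOOTH_WEIGHT_LOG2_SCALE)) >>
//                         (SMOOTH_WEIGHT_LOG2_SCALE + 1);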
2394 static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride,
2395                             const uint8_t *const top_row,
2396                             const uint8_t *const left_column,
2397                             const int height) {
2398   const uint8_t top_right = top_row[3];
2399   const uint8_t bottom_left = left_column[height - 1];
2400   const uint8_t *const weights_y = smooth_weights + height - 4;
2401 
2402   uint8x8_t top_v = load_u8_4x1(top_row);
2403   const uint8x8_t top_right_v = vdup_n_u8(top_right);
2404   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
2405   uint8x8_t weights_x_v = load_u8_4x1(smooth_weights);
2406   const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
2407   const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
2408 
2409   assert(height > 0);
2410   int y = 0;
2411   do {
2412     const uint8x8_t left_v = vdup_n_u8(left_column[y]);
2413     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
2414     const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
2415     const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
2416     const uint16x8_t weighted_top_bl =
2417         vmlal_u8(weighted_bl, weights_y_v, top_v);
2418     const uint16x8_t weighted_left_tr =
2419         vmlal_u8(weighted_tr, weights_x_v, left_v);
2420     // Maximum value of each parameter: 0xFF00
2421     const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
2422     const uint8x8_t result = vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
2423 
2424     vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(result), 0);
2425     dst += stride;
2426   } while (++y != height);
2427 }
2428 
2429 static INLINE uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl,
2430                                        const uint16x8_t weighted_left_tr) {
2431   // Maximum value of each parameter: 0xFF00. The halving add plus rounding
  // narrow below amounts to a rounding shift of the full sum by
  // (SMOOTH_WEIGHT_LOG2_SCALE + 1) while keeping intermediates in 16 bits.
2432   const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
2433   return vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
2434 }
2435 
2436 static INLINE uint8x8_t calculate_weights_and_pred(
2437     const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
2438     const uint8x8_t bottom_left, const uint8x8_t weights_x,
2439     const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
2440   const uint16x8_t weighted_top = vmull_u8(weights_y, top);
2441   const uint16x8_t weighted_top_bl =
2442       vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
2443   const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
2444   return calculate_pred(weighted_top_bl, weighted_left_tr);
2445 }
2446 
2447 static void smooth_8xh_neon(uint8_t *dst, ptrdiff_t stride,
2448                             const uint8_t *const top_row,
2449                             const uint8_t *const left_column,
2450                             const int height) {
2451   const uint8_t top_right = top_row[7];
2452   const uint8_t bottom_left = left_column[height - 1];
2453   const uint8_t *const weights_y = smooth_weights + height - 4;
2454 
2455   const uint8x8_t top_v = vld1_u8(top_row);
2456   const uint8x8_t top_right_v = vdup_n_u8(top_right);
2457   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
2458   const uint8x8_t weights_x_v = vld1_u8(smooth_weights + 4);
2459   const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
2460   const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
2461 
2462   assert(height > 0);
2463   int y = 0;
2464   do {
2465     const uint8x8_t left_v = vdup_n_u8(left_column[y]);
2466     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
2467     const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
2468     const uint8x8_t result =
2469         calculate_weights_and_pred(top_v, left_v, weighted_tr, bottom_left_v,
2470                                    weights_x_v, scaled_weights_y, weights_y_v);
2471 
2472     vst1_u8(dst, result);
2473     dst += stride;
2474   } while (++y != height);
2475 }
2476 
2477 #define SMOOTH_NXM(W, H)                                                       \
2478   void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
2479                                              const uint8_t *above,             \
2480                                              const uint8_t *left) {            \
2481     smooth_##W##xh_neon(dst, y_stride, above, left, H);                        \
2482   }
2483 
2484 SMOOTH_NXM(4, 4)
2485 SMOOTH_NXM(4, 8)
2486 SMOOTH_NXM(8, 4)
2487 SMOOTH_NXM(8, 8)
2488 SMOOTH_NXM(4, 16)
2489 SMOOTH_NXM(8, 16)
2490 SMOOTH_NXM(8, 32)
2491 
2492 #undef SMOOTH_NXM
2493 
2494 static INLINE uint8x16_t calculate_weights_and_predq(
2495     const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
2496     const uint8x8_t weights_y, const uint8x16_t weights_x,
2497     const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
2498   const uint16x8_t weighted_top_bl_low =
2499       vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
2500   const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
2501   const uint16x8_t weighted_left_tr_low =
2502       vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
2503   const uint8x8_t result_low =
2504       calculate_pred(weighted_top_bl_low, weighted_left_tr_low);
2505 
2506   const uint16x8_t weighted_top_bl_high =
2507       vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
2508   const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
2509   const uint16x8_t weighted_left_tr_high =
2510       vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
2511   const uint8x8_t result_high =
2512       calculate_pred(weighted_top_bl_high, weighted_left_tr_high);
2513 
2514   return vcombine_u8(result_low, result_high);
2515 }
2516 
2517 // 256 - v = vneg_s8(v), as in negate_s8() above: the nonzero weights are
// < 256, so the complement is the byte-wise negation.
2518 static INLINE uint8x16_t negate_s8q(const uint8x16_t v) {
2519   return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
2520 }
2521 
2522 // For width 16 and above.
2523 #define SMOOTH_PREDICTOR(W)                                                 \
2524   static void smooth_##W##xh_neon(                                          \
2525       uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,         \
2526       const uint8_t *const left_column, const int height) {                 \
2527     const uint8_t top_right = top_row[(W)-1];                               \
2528     const uint8_t bottom_left = left_column[height - 1];                    \
2529     const uint8_t *const weights_y = smooth_weights + height - 4;           \
2530                                                                             \
2531     uint8x16_t top_v[4];                                                    \
2532     top_v[0] = vld1q_u8(top_row);                                           \
2533     if ((W) > 16) {                                                         \
2534       top_v[1] = vld1q_u8(top_row + 16);                                    \
2535       if ((W) == 64) {                                                      \
2536         top_v[2] = vld1q_u8(top_row + 32);                                  \
2537         top_v[3] = vld1q_u8(top_row + 48);                                  \
2538       }                                                                     \
2539     }                                                                       \
2540                                                                             \
2541     const uint8x8_t top_right_v = vdup_n_u8(top_right);                     \
2542     const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);                 \
2543                                                                             \
2544     uint8x16_t weights_x_v[4];                                              \
2545     weights_x_v[0] = vld1q_u8(smooth_weights + (W)-4);                      \
2546     if ((W) > 16) {                                                         \
2547       weights_x_v[1] = vld1q_u8(smooth_weights + (W) + 16 - 4);             \
2548       if ((W) == 64) {                                                      \
2549         weights_x_v[2] = vld1q_u8(smooth_weights + (W) + 32 - 4);           \
2550         weights_x_v[3] = vld1q_u8(smooth_weights + (W) + 48 - 4);           \
2551       }                                                                     \
2552     }                                                                       \
2553                                                                             \
2554     uint8x16_t scaled_weights_x[4];                                         \
2555     scaled_weights_x[0] = negate_s8q(weights_x_v[0]);                       \
2556     if ((W) > 16) {                                                         \
2557       scaled_weights_x[1] = negate_s8q(weights_x_v[1]);                     \
2558       if ((W) == 64) {                                                      \
2559         scaled_weights_x[2] = negate_s8q(weights_x_v[2]);                   \
2560         scaled_weights_x[3] = negate_s8q(weights_x_v[3]);                   \
2561       }                                                                     \
2562     }                                                                       \
2563                                                                             \
2564     for (int y = 0; y < height; ++y) {                                      \
2565       const uint8x8_t left_v = vdup_n_u8(left_column[y]);                   \
2566       const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);                \
2567       const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);            \
2568       const uint16x8_t weighted_bl =                                        \
2569           vmull_u8(scaled_weights_y, bottom_left_v);                        \
2570                                                                             \
2571       vst1q_u8(dst, calculate_weights_and_predq(                            \
2572                         top_v[0], left_v, top_right_v, weights_y_v,         \
2573                         weights_x_v[0], scaled_weights_x[0], weighted_bl)); \
2574                                                                             \
2575       if ((W) > 16) {                                                       \
2576         vst1q_u8(dst + 16,                                                  \
2577                  calculate_weights_and_predq(                               \
2578                      top_v[1], left_v, top_right_v, weights_y_v,            \
2579                      weights_x_v[1], scaled_weights_x[1], weighted_bl));    \
2580         if ((W) == 64) {                                                    \
2581           vst1q_u8(dst + 32,                                                \
2582                    calculate_weights_and_predq(                             \
2583                        top_v[2], left_v, top_right_v, weights_y_v,          \
2584                        weights_x_v[2], scaled_weights_x[2], weighted_bl));  \
2585           vst1q_u8(dst + 48,                                                \
2586                    calculate_weights_and_predq(                             \
2587                        top_v[3], left_v, top_right_v, weights_y_v,          \
2588                        weights_x_v[3], scaled_weights_x[3], weighted_bl));  \
2589         }                                                                   \
2590       }                                                                     \
2591                                                                             \
2592       dst += stride;                                                        \
2593     }                                                                       \
2594   }
2595 
2596 SMOOTH_PREDICTOR(16)
2597 SMOOTH_PREDICTOR(32)
2598 SMOOTH_PREDICTOR(64)
2599 
2600 #undef SMOOTH_PREDICTOR
2601 
2602 #define SMOOTH_NXM_WIDE(W, H)                                                  \
2603   void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
2604                                              const uint8_t *above,             \
2605                                              const uint8_t *left) {            \
2606     smooth_##W##xh_neon(dst, y_stride, above, left, H);                        \
2607   }
2608 
2609 SMOOTH_NXM_WIDE(16, 4)
2610 SMOOTH_NXM_WIDE(16, 8)
2611 SMOOTH_NXM_WIDE(16, 16)
2612 SMOOTH_NXM_WIDE(16, 32)
2613 SMOOTH_NXM_WIDE(16, 64)
2614 SMOOTH_NXM_WIDE(32, 8)
2615 SMOOTH_NXM_WIDE(32, 16)
2616 SMOOTH_NXM_WIDE(32, 32)
2617 SMOOTH_NXM_WIDE(32, 64)
2618 SMOOTH_NXM_WIDE(64, 16)
2619 SMOOTH_NXM_WIDE(64, 32)
2620 SMOOTH_NXM_WIDE(64, 64)
2621 
2622 #undef SMOOTH_NXM_WIDE
2623 
2624 // -----------------------------------------------------------------------------
2625 // SMOOTH_V_PRED
2626 
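// Scalar sketch of the SMOOTH_V predictor (illustrative only): each output is
// a weighted blend of the pixel directly above and the bottom-left pixel,
//
//   dst[r * stride + c] =
//       (w[r] * top[c] + (256 - w[r]) * bottom_left +
//        (1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1))) >> SMOOTH_WEIGHT_LOG2_SCALE;
//
// where w[] is the height-dependent slice of smooth_weights[].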
2627 // For widths 4 and 8.
2628 #define SMOOTH_V_PREDICTOR(W)                                         \
2629   static void smooth_v_##W##xh_neon(                                  \
2630       uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,   \
2631       const uint8_t *const left_column, const int height) {           \
2632     const uint8_t bottom_left = left_column[height - 1];              \
2633     const uint8_t *const weights_y = smooth_weights + height - 4;     \
2634                                                                       \
2635     uint8x8_t top_v;                                                  \
2636     if ((W) == 4) {                                                   \
2637       top_v = load_u8_4x1(top_row);                                   \
2638     } else { /* width == 8 */                                         \
2639       top_v = vld1_u8(top_row);                                       \
2640     }                                                                 \
2641                                                                       \
2642     const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);           \
2643                                                                       \
2644     assert(height > 0);                                               \
2645     int y = 0;                                                        \
2646     do {                                                              \
2647       const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);          \
2648       const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);      \
2649                                                                       \
2650       const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);   \
2651       const uint16x8_t weighted_top_bl =                              \
2652           vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v);    \
2653       const uint8x8_t pred =                                          \
2654           vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE);    \
2655                                                                       \
2656       if ((W) == 4) {                                                 \
2657         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \
2658       } else { /* width == 8 */                                       \
2659         vst1_u8(dst, pred);                                           \
2660       }                                                               \
2661       dst += stride;                                                  \
2662     } while (++y != height);                                          \
2663   }
2664 
2665 SMOOTH_V_PREDICTOR(4)
2666 SMOOTH_V_PREDICTOR(8)
2667 
2668 #undef SMOOTH_V_PREDICTOR
2669 
2670 #define SMOOTH_V_NXM(W, H)                                    \
2671   void aom_smooth_v_predictor_##W##x##H##_neon(               \
2672       uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2673       const uint8_t *left) {                                  \
2674     smooth_v_##W##xh_neon(dst, y_stride, above, left, H);     \
2675   }
2676 
2677 SMOOTH_V_NXM(4, 4)
2678 SMOOTH_V_NXM(4, 8)
2679 SMOOTH_V_NXM(4, 16)
2680 SMOOTH_V_NXM(8, 4)
2681 SMOOTH_V_NXM(8, 8)
2682 SMOOTH_V_NXM(8, 16)
2683 SMOOTH_V_NXM(8, 32)
2684 
2685 #undef SMOOTH_V_NXM
2686 
2687 static INLINE uint8x16_t calculate_vertical_weights_and_pred(
2688     const uint8x16_t top, const uint8x8_t weights_y,
2689     const uint16x8_t weighted_bl) {
2690   const uint16x8_t pred_low =
2691       vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
2692   const uint16x8_t pred_high =
2693       vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
2694   const uint8x8_t pred_scaled_low =
2695       vrshrn_n_u16(pred_low, SMOOTH_WEIGHT_LOG2_SCALE);
2696   const uint8x8_t pred_scaled_high =
2697       vrshrn_n_u16(pred_high, SMOOTH_WEIGHT_LOG2_SCALE);
2698   return vcombine_u8(pred_scaled_low, pred_scaled_high);
2699 }
2700 
2701 // For width 16 and above.
2702 #define SMOOTH_V_PREDICTOR(W)                                            \
2703   static void smooth_v_##W##xh_neon(                                     \
2704       uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,      \
2705       const uint8_t *const left_column, const int height) {              \
2706     const uint8_t bottom_left = left_column[height - 1];                 \
2707     const uint8_t *const weights_y = smooth_weights + height - 4;        \
2708                                                                          \
2709     uint8x16_t top_v[4];                                                 \
2710     top_v[0] = vld1q_u8(top_row);                                        \
2711     if ((W) > 16) {                                                      \
2712       top_v[1] = vld1q_u8(top_row + 16);                                 \
2713       if ((W) == 64) {                                                   \
2714         top_v[2] = vld1q_u8(top_row + 32);                               \
2715         top_v[3] = vld1q_u8(top_row + 48);                               \
2716       }                                                                  \
2717     }                                                                    \
2718                                                                          \
2719     const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);              \
2720                                                                          \
2721     assert(height > 0);                                                  \
2722     int y = 0;                                                           \
2723     do {                                                                 \
2724       const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);             \
2725       const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);         \
2726       const uint16x8_t weighted_bl =                                     \
2727           vmull_u8(scaled_weights_y, bottom_left_v);                     \
2728                                                                          \
2729       const uint8x16_t pred_0 = calculate_vertical_weights_and_pred(     \
2730           top_v[0], weights_y_v, weighted_bl);                           \
2731       vst1q_u8(dst, pred_0);                                             \
2732                                                                          \
2733       if ((W) > 16) {                                                    \
2734         const uint8x16_t pred_1 = calculate_vertical_weights_and_pred(   \
2735             top_v[1], weights_y_v, weighted_bl);                         \
2736         vst1q_u8(dst + 16, pred_1);                                      \
2737                                                                          \
2738         if ((W) == 64) {                                                 \
2739           const uint8x16_t pred_2 = calculate_vertical_weights_and_pred( \
2740               top_v[2], weights_y_v, weighted_bl);                       \
2741           vst1q_u8(dst + 32, pred_2);                                    \
2742                                                                          \
2743           const uint8x16_t pred_3 = calculate_vertical_weights_and_pred( \
2744               top_v[3], weights_y_v, weighted_bl);                       \
2745           vst1q_u8(dst + 48, pred_3);                                    \
2746         }                                                                \
2747       }                                                                  \
2748                                                                          \
2749       dst += stride;                                                     \
2750     } while (++y != height);                                             \
2751   }
2752 
2753 SMOOTH_V_PREDICTOR(16)
2754 SMOOTH_V_PREDICTOR(32)
2755 SMOOTH_V_PREDICTOR(64)
2756 
2757 #undef SMOOTH_V_PREDICTOR
2758 
2759 #define SMOOTH_V_NXM_WIDE(W, H)                               \
2760   void aom_smooth_v_predictor_##W##x##H##_neon(               \
2761       uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2762       const uint8_t *left) {                                  \
2763     smooth_v_##W##xh_neon(dst, y_stride, above, left, H);     \
2764   }
2765 
2766 SMOOTH_V_NXM_WIDE(16, 4)
2767 SMOOTH_V_NXM_WIDE(16, 8)
2768 SMOOTH_V_NXM_WIDE(16, 16)
2769 SMOOTH_V_NXM_WIDE(16, 32)
2770 SMOOTH_V_NXM_WIDE(16, 64)
2771 SMOOTH_V_NXM_WIDE(32, 8)
2772 SMOOTH_V_NXM_WIDE(32, 16)
2773 SMOOTH_V_NXM_WIDE(32, 32)
2774 SMOOTH_V_NXM_WIDE(32, 64)
2775 SMOOTH_V_NXM_WIDE(64, 16)
2776 SMOOTH_V_NXM_WIDE(64, 32)
2777 SMOOTH_V_NXM_WIDE(64, 64)
2778 
2779 #undef SMOOTH_V_NXM_WIDE
2780 
2781 // -----------------------------------------------------------------------------
2782 // SMOOTH_H_PRED
2783 
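// Scalar sketch of the SMOOTH_H predictor (illustrative only): each output is
// a weighted blend of the pixel directly to the left and the top-right pixel,
//
//   dst[r * stride + c] =
//       (w[c] * left[r] + (256 - w[c]) * top_right +
//        (1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1))) >> SMOOTH_WEIGHT_LOG2_SCALE;
//
// where w[] is the width-dependent slice of smooth_weights[].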
2784 // For widths 4 and 8.
2785 #define SMOOTH_H_PREDICTOR(W)                                               \
2786   static void smooth_h_##W##xh_neon(                                        \
2787       uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,         \
2788       const uint8_t *const left_column, const int height) {                 \
2789     const uint8_t top_right = top_row[(W)-1];                               \
2790                                                                             \
2791     const uint8x8_t top_right_v = vdup_n_u8(top_right);                     \
2792     /* Over-reads for 4xN but still within the array. */                    \
2793     const uint8x8_t weights_x = vld1_u8(smooth_weights + (W)-4);            \
2794     const uint8x8_t scaled_weights_x = negate_s8(weights_x);                \
2795     const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); \
2796                                                                             \
2797     assert(height > 0);                                                     \
2798     int y = 0;                                                              \
2799     do {                                                                    \
2800       const uint8x8_t left_v = vdup_n_u8(left_column[y]);                   \
2801       const uint16x8_t weighted_left_tr =                                   \
2802           vmlal_u8(weighted_tr, weights_x, left_v);                         \
2803       const uint8x8_t pred =                                                \
2804           vrshrn_n_u16(weighted_left_tr, SMOOTH_WEIGHT_LOG2_SCALE);         \
2805                                                                             \
2806       if ((W) == 4) {                                                       \
2807         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0);       \
2808       } else { /* width == 8 */                                             \
2809         vst1_u8(dst, pred);                                                 \
2810       }                                                                     \
2811       dst += stride;                                                        \
2812     } while (++y != height);                                                \
2813   }
2814 
2815 SMOOTH_H_PREDICTOR(4)
2816 SMOOTH_H_PREDICTOR(8)
2817 
2818 #undef SMOOTH_H_PREDICTOR
2819 
2820 #define SMOOTH_H_NXM(W, H)                                    \
2821   void aom_smooth_h_predictor_##W##x##H##_neon(               \
2822       uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2823       const uint8_t *left) {                                  \
2824     smooth_h_##W##xh_neon(dst, y_stride, above, left, H);     \
2825   }
2826 
2827 SMOOTH_H_NXM(4, 4)
2828 SMOOTH_H_NXM(4, 8)
2829 SMOOTH_H_NXM(4, 16)
2830 SMOOTH_H_NXM(8, 4)
2831 SMOOTH_H_NXM(8, 8)
2832 SMOOTH_H_NXM(8, 16)
2833 SMOOTH_H_NXM(8, 32)
2834 
2835 #undef SMOOTH_H_NXM
2836 
2837 static INLINE uint8x16_t calculate_horizontal_weights_and_pred(
2838     const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
2839     const uint8x16_t scaled_weights_x) {
2840   const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
2841   const uint16x8_t weighted_left_tr_low =
2842       vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
2843   const uint8x8_t pred_scaled_low =
2844       vrshrn_n_u16(weighted_left_tr_low, SMOOTH_WEIGHT_LOG2_SCALE);
2845 
2846   const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
2847   const uint16x8_t weighted_left_tr_high =
2848       vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
2849   const uint8x8_t pred_scaled_high =
2850       vrshrn_n_u16(weighted_left_tr_high, SMOOTH_WEIGHT_LOG2_SCALE);
2851 
2852   return vcombine_u8(pred_scaled_low, pred_scaled_high);
2853 }
2854 
2855 // For width 16 and above.
2856 #define SMOOTH_H_PREDICTOR(W)                                              \
2857   static void smooth_h_##W##xh_neon(                                       \
2858       uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,        \
2859       const uint8_t *const left_column, const int height) {                \
2860     const uint8_t top_right = top_row[(W)-1];                              \
2861                                                                            \
2862     const uint8x8_t top_right_v = vdup_n_u8(top_right);                    \
2863                                                                            \
2864     uint8x16_t weights_x[4];                                               \
2865     weights_x[0] = vld1q_u8(smooth_weights + (W)-4);                       \
2866     if ((W) > 16) {                                                        \
2867       weights_x[1] = vld1q_u8(smooth_weights + (W) + 16 - 4);              \
2868       if ((W) == 64) {                                                     \
2869         weights_x[2] = vld1q_u8(smooth_weights + (W) + 32 - 4);            \
2870         weights_x[3] = vld1q_u8(smooth_weights + (W) + 48 - 4);            \
2871       }                                                                    \
2872     }                                                                      \
2873                                                                            \
2874     uint8x16_t scaled_weights_x[4];                                        \
2875     scaled_weights_x[0] = negate_s8q(weights_x[0]);                        \
2876     if ((W) > 16) {                                                        \
2877       scaled_weights_x[1] = negate_s8q(weights_x[1]);                      \
2878       if ((W) == 64) {                                                     \
2879         scaled_weights_x[2] = negate_s8q(weights_x[2]);                    \
2880         scaled_weights_x[3] = negate_s8q(weights_x[3]);                    \
2881       }                                                                    \
2882     }                                                                      \
2883                                                                            \
2884     assert(height > 0);                                                    \
2885     int y = 0;                                                             \
2886     do {                                                                   \
2887       const uint8x8_t left_v = vdup_n_u8(left_column[y]);                  \
2888                                                                            \
2889       const uint8x16_t pred_0 = calculate_horizontal_weights_and_pred(     \
2890           left_v, top_right_v, weights_x[0], scaled_weights_x[0]);         \
2891       vst1q_u8(dst, pred_0);                                               \
2892                                                                            \
2893       if ((W) > 16) {                                                      \
2894         const uint8x16_t pred_1 = calculate_horizontal_weights_and_pred(   \
2895             left_v, top_right_v, weights_x[1], scaled_weights_x[1]);       \
2896         vst1q_u8(dst + 16, pred_1);                                        \
2897                                                                            \
2898         if ((W) == 64) {                                                   \
2899           const uint8x16_t pred_2 = calculate_horizontal_weights_and_pred( \
2900               left_v, top_right_v, weights_x[2], scaled_weights_x[2]);     \
2901           vst1q_u8(dst + 32, pred_2);                                      \
2902                                                                            \
2903           const uint8x16_t pred_3 = calculate_horizontal_weights_and_pred( \
2904               left_v, top_right_v, weights_x[3], scaled_weights_x[3]);     \
2905           vst1q_u8(dst + 48, pred_3);                                      \
2906         }                                                                  \
2907       }                                                                    \
2908       dst += stride;                                                       \
2909     } while (++y != height);                                               \
2910   }
2911 
2912 SMOOTH_H_PREDICTOR(16)
2913 SMOOTH_H_PREDICTOR(32)
2914 SMOOTH_H_PREDICTOR(64)
2915 
2916 #undef SMOOTH_H_PREDICTOR
2917 
2918 #define SMOOTH_H_NXM_WIDE(W, H)                               \
2919   void aom_smooth_h_predictor_##W##x##H##_neon(               \
2920       uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2921       const uint8_t *left) {                                  \
2922     smooth_h_##W##xh_neon(dst, y_stride, above, left, H);     \
2923   }
2924 
2925 SMOOTH_H_NXM_WIDE(16, 4)
2926 SMOOTH_H_NXM_WIDE(16, 8)
2927 SMOOTH_H_NXM_WIDE(16, 16)
2928 SMOOTH_H_NXM_WIDE(16, 32)
2929 SMOOTH_H_NXM_WIDE(16, 64)
2930 SMOOTH_H_NXM_WIDE(32, 8)
2931 SMOOTH_H_NXM_WIDE(32, 16)
2932 SMOOTH_H_NXM_WIDE(32, 32)
2933 SMOOTH_H_NXM_WIDE(32, 64)
2934 SMOOTH_H_NXM_WIDE(64, 16)
2935 SMOOTH_H_NXM_WIDE(64, 32)
2936 SMOOTH_H_NXM_WIDE(64, 64)
2937 
2938 #undef SMOOTH_H_NXM_WIDE
2939 
2940 // -----------------------------------------------------------------------------
2941 // PAETH
2942 
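// Scalar sketch of the Paeth selection implemented below (illustrative only):
//
//   base = top[c] + left[r] - top_left;
//   left_dist = abs(base - left[r]);       // == abs(top[c] - top_left)
//   top_dist = abs(base - top[c]);         // == abs(left[r] - top_left)
//   top_left_dist = abs(base - top_left);  // == abs(top[c]+left[r]-2*top_left)
//   if (left_dist <= top_dist && left_dist <= top_left_dist) dst[c] = left[r];
//   else if (top_dist <= top_left_dist)                      dst[c] = top[c];
//   else                                                     dst[c] = top_left;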
2943 static INLINE void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
2944                                        const uint8_t *const top_row,
2945                                        const uint8_t *const left_column,
2946                                        int width, int height) {
2947   const uint8x8_t top_left = vdup_n_u8(top_row[-1]);
2948   const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
2949   uint8x8_t top;
2950   if (width == 4) {
2951     top = load_u8_4x1(top_row);
2952   } else {  // width == 8
2953     top = vld1_u8(top_row);
2954   }
2955 
2956   assert(height > 0);
2957   int y = 0;
2958   do {
2959     const uint8x8_t left = vdup_n_u8(left_column[y]);
2960 
2961     const uint8x8_t left_dist = vabd_u8(top, top_left);
2962     const uint8x8_t top_dist = vabd_u8(left, top_left);
2963     const uint16x8_t top_left_dist =
2964         vabdq_u16(vaddl_u8(top, left), top_left_x2);
2965 
2966     const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
2967     const uint8x8_t left_le_top_left =
2968         vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
2969     const uint8x8_t top_le_top_left =
2970         vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
2971 
2972     // if (left_dist <= top_dist && left_dist <= top_left_dist)
2973     const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
2974     //   dest[x] = left_column[y];
2975     // Fill all the unused spaces with 'top'. They will be overwritten when
2976     // the positions for top_left are known.
2977     uint8x8_t result = vbsl_u8(left_mask, left, top);
2978     // else if (top_dist <= top_left_dist)
2979     //   dest[x] = top_row[x];
2980     // Add these values to the mask. They were already set.
2981     const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
2982     // else
2983     //   dest[x] = top_left;
2984     result = vbsl_u8(left_or_top_mask, result, top_left);
2985 
2986     if (width == 4) {
2987       store_u8_4x1(dest, result);
2988     } else {  // width == 8
2989       vst1_u8(dest, result);
2990     }
2991     dest += stride;
2992   } while (++y != height);
2993 }
2994 
2995 #define PAETH_NXM(W, H)                                                     \
2996   void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
2997                                             const uint8_t *above,           \
2998                                             const uint8_t *left) {          \
2999     paeth_4or8_x_h_neon(dst, stride, above, left, W, H);                    \
3000   }
3001 
3002 PAETH_NXM(4, 4)
3003 PAETH_NXM(4, 8)
3004 PAETH_NXM(8, 4)
3005 PAETH_NXM(8, 8)
3006 PAETH_NXM(8, 16)
3007 
3008 PAETH_NXM(4, 16)
3009 PAETH_NXM(8, 32)
3010 
3011 // Calculate X distance <= TopLeft distance and pack the resulting mask into
3012 // uint8x16_t.
3013 static INLINE uint8x16_t x_le_top_left(const uint8x16_t x_dist,
3014                                        const uint16x8_t top_left_dist_low,
3015                                        const uint16x8_t top_left_dist_high) {
3016   const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
3017                                                vqmovn_u16(top_left_dist_high));
3018   return vcleq_u8(x_dist, top_left_dist);
3019 }
3020 
3021 // Select the closest values and collect them.
3022 static INLINE uint8x16_t select_paeth(const uint8x16_t top,
3023                                       const uint8x16_t left,
3024                                       const uint8x16_t top_left,
3025                                       const uint8x16_t left_le_top,
3026                                       const uint8x16_t left_le_top_left,
3027                                       const uint8x16_t top_le_top_left) {
3028   // if (left_dist <= top_dist && left_dist <= top_left_dist)
3029   const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
3030   //   dest[x] = left_column[y];
3031   // Fill all the unused spaces with 'top'. They will be overwritten when
3032   // the positions for top_left are known.
3033   uint8x16_t result = vbslq_u8(left_mask, left, top);
3034   // else if (top_dist <= top_left_dist)
3035   //   dest[x] = top_row[x];
3036   // Add these values to the mask. They were already set.
3037   const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
3038   // else
3039   //   dest[x] = top_left;
3040   return vbslq_u8(left_or_top_mask, result, top_left);
3041 }
3042 
3043 // Generate numbered and high/low versions of top_left_dist.
3044 #define TOP_LEFT_DIST(num)                                              \
3045   const uint16x8_t top_left_##num##_dist_low = vabdq_u16(               \
3046       vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
3047   const uint16x8_t top_left_##num##_dist_high = vabdq_u16(              \
3048       vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)
3049 
3050 // Generate numbered versions of XLeTopLeft with x = left.
3051 #define LEFT_LE_TOP_LEFT(num)                                     \
3052   const uint8x16_t left_le_top_left_##num =                       \
3053       x_le_top_left(left_##num##_dist, top_left_##num##_dist_low, \
3054                     top_left_##num##_dist_high)
3055 
3056 // Generate numbered versions of XLeTopLeft with x = top.
3057 #define TOP_LE_TOP_LEFT(num)                              \
3058   const uint8x16_t top_le_top_left_##num = x_le_top_left( \
3059       top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
3060 
3061 static INLINE void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride,
3062                                          const uint8_t *const top_row,
3063                                          const uint8_t *const left_column,
3064                                          int width, int height) {
3065   const uint8x16_t top_left = vdupq_n_u8(top_row[-1]);
3066   const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
3067   uint8x16_t top[4];
3068   top[0] = vld1q_u8(top_row);
3069   if (width > 16) {
3070     top[1] = vld1q_u8(top_row + 16);
3071     if (width == 64) {
3072       top[2] = vld1q_u8(top_row + 32);
3073       top[3] = vld1q_u8(top_row + 48);
3074     }
3075   }
3076 
3077   assert(height > 0);
3078   int y = 0;
3079   do {
3080     const uint8x16_t left = vdupq_n_u8(left_column[y]);
3081 
3082     const uint8x16_t top_dist = vabdq_u8(left, top_left);
3083 
3084     const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
3085     TOP_LEFT_DIST(0);
3086     const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
3087     LEFT_LE_TOP_LEFT(0);
3088     TOP_LE_TOP_LEFT(0);
3089 
3090     const uint8x16_t result_0 =
3091         select_paeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
3092                      top_le_top_left_0);
3093     vst1q_u8(dest, result_0);
3094 
3095     if (width > 16) {
3096       const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
3097       TOP_LEFT_DIST(1);
3098       const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
3099       LEFT_LE_TOP_LEFT(1);
3100       TOP_LE_TOP_LEFT(1);
3101 
3102       const uint8x16_t result_1 =
3103           select_paeth(top[1], left, top_left, left_1_le_top,
3104                        left_le_top_left_1, top_le_top_left_1);
3105       vst1q_u8(dest + 16, result_1);
3106 
3107       if (width == 64) {
3108         const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
3109         TOP_LEFT_DIST(2);
3110         const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
3111         LEFT_LE_TOP_LEFT(2);
3112         TOP_LE_TOP_LEFT(2);
3113 
3114         const uint8x16_t result_2 =
3115             select_paeth(top[2], left, top_left, left_2_le_top,
3116                          left_le_top_left_2, top_le_top_left_2);
3117         vst1q_u8(dest + 32, result_2);
3118 
3119         const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
3120         TOP_LEFT_DIST(3);
3121         const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
3122         LEFT_LE_TOP_LEFT(3);
3123         TOP_LE_TOP_LEFT(3);
3124 
3125         const uint8x16_t result_3 =
3126             select_paeth(top[3], left, top_left, left_3_le_top,
3127                          left_le_top_left_3, top_le_top_left_3);
3128         vst1q_u8(dest + 48, result_3);
3129       }
3130     }
3131 
3132     dest += stride;
3133   } while (++y != height);
3134 }
3135 
3136 #define PAETH_NXM_WIDE(W, H)                                                \
3137   void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
3138                                             const uint8_t *above,           \
3139                                             const uint8_t *left) {          \
3140     paeth16_plus_x_h_neon(dst, stride, above, left, W, H);                  \
3141   }
3142 
3143 PAETH_NXM_WIDE(16, 8)
3144 PAETH_NXM_WIDE(16, 16)
3145 PAETH_NXM_WIDE(16, 32)
3146 PAETH_NXM_WIDE(32, 16)
3147 PAETH_NXM_WIDE(32, 32)
3148 PAETH_NXM_WIDE(32, 64)
3149 PAETH_NXM_WIDE(64, 32)
3150 PAETH_NXM_WIDE(64, 64)
3151 
3152 PAETH_NXM_WIDE(16, 4)
3153 PAETH_NXM_WIDE(16, 64)
3154 PAETH_NXM_WIDE(32, 8)
3155 PAETH_NXM_WIDE(64, 16)
3156