• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (c) 2022, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
12 #include <arm_neon.h>
13 
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16 
17 #include "aom/aom_integer.h"
18 #include "aom_dsp/intrapred_common.h"
19 
// -----------------------------------------------------------------------------
// DC
22 
// Fills the bw x bw block at |dst| with a single DC value: the rounded mean of
// the |bw| samples in |above| and the |bw| samples in |left|. |bw| must be a
// power of two that is at least 4; |stride| is counted in uint16_t elements.
static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                       const uint16_t *above,
                                       const uint16_t *left) {
  assert(bw >= 4);
  assert(IS_POWER_OF_TWO(bw));
  const int num_samples = 2 * bw;
  const int rounding = num_samples >> 1;

  if (bw < 8) {  // 4x4
    // One widening add covers all eight edge samples.
    const uint32x4_t acc = vaddl_u16(vld1_u16(above), vld1_u16(left));
    const uint32x2_t halves = vadd_u32(vget_low_u32(acc), vget_high_u32(acc));
    const int total =
        vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(halves)), 0);
    const int dc_value = (total + rounding) / num_samples;
    const uint16x4_t fill = vdup_n_u16((uint16_t)dc_value);
    for (int row = 0; row < bw; ++row, dst += stride) {
      vst1_u16(dst, fill);
    }
    return;
  }

  // Widths of 8 and above: accumulate both edges eight samples at a time.
  uint32x4_t acc = vdupq_n_u32(0);
  for (int x = 0; x < bw; x += 8) {
    acc = vpadalq_u16(acc, vld1q_u16(above + x));
    acc = vpadalq_u16(acc, vld1q_u16(left + x));
  }
  const uint32x2_t halves = vadd_u32(vget_low_u32(acc), vget_high_u32(acc));
  const int total = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(halves)), 0);
  const int dc_value = (total + rounding) / num_samples;
  const uint16x8_t fill = vdupq_n_u16((uint16_t)dc_value);
  for (int row = 0; row < bw; ++row) {
    for (int x = 0; x < bw; x += 8) {
      vst1q_u16(dst + x, fill);
    }
    dst += stride;
  }
}
64 
// Defines aom_highbd_<type>_predictor_<width>x<width>_neon(), the square-block
// entry point that forwards to highbd_<type>_predictor(). |bd| is ignored.
#define INTRA_PRED_HIGHBD_SIZED_NEON(type, width)               \
  void aom_highbd_##type##_predictor_##width##x##width##_neon(  \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
      const uint16_t *left, int bd) {                           \
    highbd_##type##_predictor(dst, stride, width, above, left); \
    (void)bd;                                                   \
  }

// Square DC predictors, 4x4 through 64x64.
INTRA_PRED_HIGHBD_SIZED_NEON(dc, 4)
INTRA_PRED_HIGHBD_SIZED_NEON(dc, 8)
INTRA_PRED_HIGHBD_SIZED_NEON(dc, 16)
INTRA_PRED_HIGHBD_SIZED_NEON(dc, 32)
INTRA_PRED_HIGHBD_SIZED_NEON(dc, 64)
83 
// -----------------------------------------------------------------------------
// V_PRED
86 
// Defines aom_highbd_v_predictor_<W>x<H>_neon(), which forwards to
// vertical<W>xh_neon() with a height of H. |left| and |bd| are ignored.
#define HIGHBD_V_NXM(W, H)                                    \
  void aom_highbd_v_predictor_##W##x##H##_neon(               \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                         \
    vertical##W##xh_neon(dst, stride, above, H);              \
    (void)bd;                                                 \
    (void)left;                                               \
  }
95 
// Loads the 16 consecutive uint16_t values starting at |ptr| into a pair of
// 8-lane vectors.
static INLINE uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) {
  // Clang/gcc uses ldp here.
  const uint16x8x2_t pair = { { vld1q_u16(ptr), vld1q_u16(ptr + 8) } };
  return pair;
}
103 
// Writes the two 8-lane vectors of |x| to the 16 consecutive uint16_t values
// starting at |ptr|.
static INLINE void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) {
  for (int i = 0; i < 2; ++i) {
    vst1q_u16(ptr + 8 * i, x.val[i]);
  }
}
108 
// Copies the 4 samples at |above| into each of |height| rows of |dst|.
// Rows are written in pairs and the loop ends only when the counter reaches
// exactly zero, so |height| must be a positive even number.
static INLINE void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *const above, int height) {
  const uint16x4_t top = vld1_u16(above);
  int rows_left = height;
  do {
    uint16_t *const next_row = dst + stride;
    vst1_u16(dst, top);
    vst1_u16(next_row, top);
    dst = next_row + stride;
    rows_left -= 2;
  } while (rows_left != 0);
}
120 
// Copies the 8 samples at |above| into each of |height| rows of |dst|.
// Rows are written in pairs and the loop ends only when the counter reaches
// exactly zero, so |height| must be a positive even number.
static INLINE void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *const above, int height) {
  const uint16x8_t top = vld1q_u16(above);
  int rows_left = height;
  do {
    uint16_t *const next_row = dst + stride;
    vst1q_u16(dst, top);
    vst1q_u16(next_row, top);
    dst = next_row + stride;
    rows_left -= 2;
  } while (rows_left != 0);
}
132 
// Copies the 16 samples at |above| into each of |height| rows of |dst|.
// Rows are written in pairs and the loop ends only when the counter reaches
// exactly zero, so |height| must be a positive even number.
static INLINE void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  const uint16x8x2_t top = load_uint16x8x2(above);
  int rows_left = height;
  do {
    uint16_t *const next_row = dst + stride;
    store_uint16x8x2(dst, top);
    store_uint16x8x2(next_row, top);
    dst = next_row + stride;
    rows_left -= 2;
  } while (rows_left != 0);
}
144 
// Loads the 32 consecutive uint16_t values starting at |ptr| into four 8-lane
// vectors.
static INLINE uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) {
  // Clang/gcc uses ldp here.
  const uint16x8x4_t quad = { { vld1q_u16(ptr), vld1q_u16(ptr + 8),
                                vld1q_u16(ptr + 16), vld1q_u16(ptr + 24) } };
  return quad;
}
154 
// Writes the four 8-lane vectors of |x| to the 32 consecutive uint16_t values
// starting at |ptr|.
static INLINE void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) {
  for (int i = 0; i < 4; ++i) {
    vst1q_u16(ptr + 8 * i, x.val[i]);
  }
}
161 
// Copies the 32 samples at |above| into each of |height| rows of |dst|.
// Rows are written in pairs and the loop ends only when the counter reaches
// exactly zero, so |height| must be a positive even number.
static INLINE void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  const uint16x8x4_t top = load_uint16x8x4(above);
  int rows_left = height;
  do {
    uint16_t *const next_row = dst + stride;
    store_uint16x8x4(dst, top);
    store_uint16x8x4(next_row, top);
    dst = next_row + stride;
    rows_left -= 2;
  } while (rows_left != 0);
}
173 
// Copies the 64 samples at |above| into each of |height| rows of |dst|, as
// two 32-sample halves per row. Rows are written in pairs and the loop ends
// only when the counter reaches exactly zero, so |height| must be a positive
// even number.
static INLINE void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  const uint16x8x4_t top_lo = load_uint16x8x4(above);
  const uint16x8x4_t top_hi = load_uint16x8x4(above + 32);
  int rows_left = height;
  do {
    uint16_t *const next_row = dst + stride;
    store_uint16x8x4(dst, top_lo);
    store_uint16x8x4(dst + 32, top_hi);
    store_uint16x8x4(next_row, top_lo);
    store_uint16x8x4(next_row + 32, top_hi);
    dst = next_row + stride;
    rows_left -= 2;
  } while (rows_left != 0);
}
190 
// Vertical predictors, 4 samples wide.
HIGHBD_V_NXM(4, 4)
HIGHBD_V_NXM(4, 8)
HIGHBD_V_NXM(4, 16)

// Vertical predictors, 8 samples wide.
HIGHBD_V_NXM(8, 4)
HIGHBD_V_NXM(8, 8)
HIGHBD_V_NXM(8, 16)
HIGHBD_V_NXM(8, 32)

// Vertical predictors, 16 samples wide.
HIGHBD_V_NXM(16, 4)
HIGHBD_V_NXM(16, 8)
HIGHBD_V_NXM(16, 16)
HIGHBD_V_NXM(16, 32)
HIGHBD_V_NXM(16, 64)

// Vertical predictors, 32 samples wide.
HIGHBD_V_NXM(32, 8)
HIGHBD_V_NXM(32, 16)
HIGHBD_V_NXM(32, 32)
HIGHBD_V_NXM(32, 64)

// Vertical predictors, 64 samples wide.
HIGHBD_V_NXM(64, 16)
HIGHBD_V_NXM(64, 32)
HIGHBD_V_NXM(64, 64)
214 
// -----------------------------------------------------------------------------
// PAETH
217 
// Paeth predictor for blocks that are 4 or 8 samples wide.
//
// For every output sample the candidate (left, top or top-left) is chosen as:
//   if (left_dist <= top_dist && left_dist <= top_left_dist) -> left_column[y]
//   else if (top_dist <= top_left_dist)                      -> top_row[x]
//   else                                                     -> top_row[-1]
// where left_dist = |top - top_left|, top_dist = |left - top_left| and
// top_left_dist = |top + left - 2 * top_left|.
//
// |width| selects between the 4-wide path (any value equal to 4) and the
// 8-wide path (every other value). |top_row[-1]| must be readable.
static INLINE void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride,
                                              const uint16_t *const top_row,
                                              const uint16_t *const left_column,
                                              int width, int height) {
  const uint16_t corner = top_row[-1];
  const uint16x8_t top_left = vdupq_n_u16(corner);
  const uint16x8_t top_left_x2 = vdupq_n_u16(corner + corner);
  // For width 4 the upper four lanes are zero padding; they are computed but
  // never stored.
  const uint16x8_t top = (width == 4)
                             ? vcombine_u16(vld1_u16(top_row), vdup_n_u16(0))
                             : vld1q_u16(top_row);
  // |top - top_left| does not depend on the row.
  const uint16x8_t left_dist = vabdq_u16(top, top_left);

  for (int y = 0; y < height; ++y, dest += stride) {
    const uint16x8_t left = vdupq_n_u16(left_column[y]);
    const uint16x8_t top_dist = vabdq_u16(left, top_left);
    const uint16x8_t top_left_dist =
        vabdq_u16(vaddq_u16(top, left), top_left_x2);

    // Lanes where left wins outright.
    const uint16x8_t pick_left = vandq_u16(
        vcleq_u16(left_dist, top_dist), vcleq_u16(left_dist, top_left_dist));
    // Lanes where either left or top wins; the rest fall back to top_left.
    const uint16x8_t pick_left_or_top =
        vorrq_u16(pick_left, vcleq_u16(top_dist, top_left_dist));

    // Start from 'top' everywhere left did not win; lanes that belong to
    // top_left are overwritten by the second select.
    const uint16x8_t left_or_top = vbslq_u16(pick_left, left, top);
    const uint16x8_t pred = vbslq_u16(pick_left_or_top, left_or_top, top_left);

    if (width == 4) {
      vst1_u16(dest, vget_low_u16(pred));
    } else {  // width == 8
      vst1q_u16(dest, pred);
    }
  }
}
265 
// Defines aom_highbd_paeth_predictor_<W>x<H>_neon() for the 4- and 8-wide
// block sizes by forwarding to highbd_paeth_4or8_x_h_neon(). |bd| is ignored.
#define HIGHBD_PAETH_NXM(W, H)                                  \
  void aom_highbd_paeth_predictor_##W##x##H##_neon(             \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
      const uint16_t *left, int bd) {                           \
    highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
    (void)bd;                                                   \
  }

// Width 4.
HIGHBD_PAETH_NXM(4, 4)
HIGHBD_PAETH_NXM(4, 8)
HIGHBD_PAETH_NXM(4, 16)
// Width 8.
HIGHBD_PAETH_NXM(8, 4)
HIGHBD_PAETH_NXM(8, 8)
HIGHBD_PAETH_NXM(8, 16)
HIGHBD_PAETH_NXM(8, 32)
281 
// Picks, per lane, the Paeth candidate from |left|, |top| and |top_left|
// using three precomputed comparison masks:
//   left_le_top && left_le_top_left -> left
//   else top_le_top_left            -> top
//   else                            -> top_left
static INLINE uint16x8_t select_paeth(const uint16x8_t top,
                                      const uint16x8_t left,
                                      const uint16x8_t top_left,
                                      const uint16x8_t left_le_top,
                                      const uint16x8_t left_le_top_left,
                                      const uint16x8_t top_le_top_left) {
  // Lanes where left wins outright.
  const uint16x8_t pick_left = vandq_u16(left_le_top, left_le_top_left);
  // Lanes where either left or top wins; every remaining lane is top_left.
  const uint16x8_t pick_left_or_top = vorrq_u16(pick_left, top_le_top_left);
  // Lanes not claimed by left are provisionally filled with top; the ones that
  // belong to top_left are replaced by the final select.
  const uint16x8_t left_or_top = vbslq_u16(pick_left, left, top);
  return vbslq_u16(pick_left_or_top, left_or_top, top_left);
}
303 
// Computes the Paeth prediction for the |num|-th group of 8 samples of the
// current row and stores it at dest + num * 8.
//
// Must be expanded in a scope that provides: the vector array |top|, the
// vectors |top_left|, |top_left_x2|, |left| and |top_dist|, and the row
// pointer |dest|. |num| is parenthesized wherever it takes part in arithmetic
// so that expression arguments (e.g. i + 1) index the intended group.
#define PAETH_PREDICTOR(num)                                                  \
  do {                                                                        \
    const uint16x8_t left_dist = vabdq_u16(top[num], top_left);               \
    const uint16x8_t top_left_dist =                                          \
        vabdq_u16(vaddq_u16(top[num], left), top_left_x2);                    \
    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);            \
    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);  \
    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);    \
    const uint16x8_t result =                                                 \
        select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \
                     top_le_top_left);                                        \
    vst1q_u16(dest + ((num) * 8), result);                                    \
  } while (0)

// Loads the |num|-th group of 8 samples from |top_row|, which must be in scope
// at the point of expansion.
#define LOAD_TOP_ROW(num) vld1q_u16(top_row + ((num) * 8))
319 
// Paeth predictor for blocks that are 16, 32 or 64 samples wide. Each row is
// produced in groups of 8 samples: two groups for widths up to 16, four for
// wider blocks, and eight when |width| is exactly 64. |top_row[-1]| must be
// readable.
static INLINE void highbd_paeth16_plus_x_h_neon(
    uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row,
    const uint16_t *const left_column, int width, int height) {
  const uint16_t corner = top_row[-1];
  const uint16x8_t top_left = vdupq_n_u16(corner);
  const uint16x8_t top_left_x2 = vdupq_n_u16(corner + corner);
  const int beyond_16 = width > 16;
  const int is_64 = beyond_16 && width == 64;

  // Only the groups that the block width actually covers are loaded.
  uint16x8_t top[8];
  top[0] = LOAD_TOP_ROW(0);
  top[1] = LOAD_TOP_ROW(1);
  if (beyond_16) {
    top[2] = LOAD_TOP_ROW(2);
    top[3] = LOAD_TOP_ROW(3);
  }
  if (is_64) {
    top[4] = LOAD_TOP_ROW(4);
    top[5] = LOAD_TOP_ROW(5);
    top[6] = LOAD_TOP_ROW(6);
    top[7] = LOAD_TOP_ROW(7);
  }

  for (int y = 0; y < height; ++y, dest += stride) {
    const uint16x8_t left = vdupq_n_u16(left_column[y]);
    // |left - top_left| is shared by every group in this row.
    const uint16x8_t top_dist = vabdq_u16(left, top_left);
    PAETH_PREDICTOR(0);
    PAETH_PREDICTOR(1);
    if (!beyond_16) {
      continue;
    }
    PAETH_PREDICTOR(2);
    PAETH_PREDICTOR(3);
    if (!is_64) {
      continue;
    }
    PAETH_PREDICTOR(4);
    PAETH_PREDICTOR(5);
    PAETH_PREDICTOR(6);
    PAETH_PREDICTOR(7);
  }
}
357 
// Defines aom_highbd_paeth_predictor_<W>x<H>_neon() for block widths of 16 and
// above by forwarding to highbd_paeth16_plus_x_h_neon(). |bd| is ignored.
#define HIGHBD_PAETH_NXM_WIDE(W, H)                               \
  void aom_highbd_paeth_predictor_##W##x##H##_neon(               \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,     \
      const uint16_t *left, int bd) {                             \
    highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
    (void)bd;                                                     \
  }

// Width 16.
HIGHBD_PAETH_NXM_WIDE(16, 4)
HIGHBD_PAETH_NXM_WIDE(16, 8)
HIGHBD_PAETH_NXM_WIDE(16, 16)
HIGHBD_PAETH_NXM_WIDE(16, 32)
HIGHBD_PAETH_NXM_WIDE(16, 64)
// Width 32.
HIGHBD_PAETH_NXM_WIDE(32, 8)
HIGHBD_PAETH_NXM_WIDE(32, 16)
HIGHBD_PAETH_NXM_WIDE(32, 32)
HIGHBD_PAETH_NXM_WIDE(32, 64)
// Width 64.
HIGHBD_PAETH_NXM_WIDE(64, 16)
HIGHBD_PAETH_NXM_WIDE(64, 32)
HIGHBD_PAETH_NXM_WIDE(64, 64)
378 
// -----------------------------------------------------------------------------
// SMOOTH
381 
// 256 - v = vneg_s8(v)
// Negates every 8-bit lane of |v|. For a 16-bit lane holding a value in
// [1, 255], the low byte wraps to 256 - v and the zero high byte stays zero,
// so the lane as a whole becomes 256 - v.
static INLINE uint16x4_t negate_s8(const uint16x4_t v) {
  const int8x8_t as_bytes = vreinterpret_s8_u16(v);
  const int8x8_t negated = vneg_s8(as_bytes);
  return vreinterpret_u16_s8(negated);
}
386 
// SMOOTH predictor for 4-wide blocks. Each output sample is the rounded sum
//   w_y * top[x] + (256 - w_y) * bottom_left +
//   w_x * left[y] + (256 - w_x) * top_right
// shifted right by SMOOTH_WEIGHT_LOG2_SCALE + 1, where bottom_left is
// left_column[height - 1], top_right is top_row[3], w_x comes from the first
// four entries of smooth_weights_u16 and w_y from the entries starting at
// offset height - 4.
static INLINE void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *const top_row,
                                          const uint16_t *const left_column,
                                          const int height) {
  const uint16_t *const row_weights = smooth_weights_u16 + height - 4;
  const uint16x4_t col_weights = vld1_u16(smooth_weights_u16);
  const uint16x4_t above = vld1_u16(top_row);
  const uint16x4_t below = vdup_n_u16(left_column[height - 1]);
  // The top-right term depends only on the column, so compute it once.
  const uint32x4_t tr_term = vmull_n_u16(negate_s8(col_weights), top_row[3]);

  for (int y = 0; y < height; ++y, dst += stride) {
    const uint16_t w_y = row_weights[y];
    // Accumulate the top, left and bottom-left terms onto the top-right term.
    uint32x4_t acc = vmlal_n_u16(tr_term, above, w_y);
    acc = vmlal_n_u16(acc, col_weights, left_column[y]);
    acc = vmlal_n_u16(acc, below, 256 - w_y);
    vst1_u16(dst, vrshrn_n_u32(acc, SMOOTH_WEIGHT_LOG2_SCALE + 1));
  }
}
417 
// Common code between 8xH and [16|32|64]xH.
// Produces 8 SMOOTH samples at |dst|. For each half (low = samples 0..3,
// high = samples 4..7) the caller supplies the already-summed corner terms;
// this adds top * weight_y and weights_x * left_y, then rounds, narrows with a
// shift of SMOOTH_WEIGHT_LOG2_SCALE + 1, and stores.
static INLINE void highbd_calculate_pred8(
    uint16_t *dst, const uint32x4_t weighted_corners_low,
    const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals,
    const uint16x4x2_t weights_x, const uint16_t left_y,
    const uint16_t weight_y) {
  const uint32x4_t corners[2] = { weighted_corners_low,
                                  weighted_corners_high };
  for (int half = 0; half < 2; ++half) {
    uint32x4_t acc = vmlal_n_u16(corners[half], top_vals.val[half], weight_y);
    acc = vmlal_n_u16(acc, weights_x.val[half], left_y);
    vst1_u16(dst + 4 * half, vrshrn_n_u32(acc, SMOOTH_WEIGHT_LOG2_SCALE + 1));
  }
}
444 
// SMOOTH predictor for 8-wide blocks. Column weights are the eight entries of
// smooth_weights_u16 starting at offset 4; row weights start at offset
// height - 4. The corner terms are summed here and the edge terms are added by
// highbd_calculate_pred8().
static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride,
                                   const uint16_t *const top_row,
                                   const uint16_t *const left_column,
                                   const int height) {
  const uint16_t *const row_weights = smooth_weights_u16 + height - 4;
  const uint16x4_t below = vdup_n_u16(left_column[height - 1]);
  const uint16_t top_right = top_row[7];

  const uint16x4x2_t above = { { vld1_u16(top_row), vld1_u16(top_row + 4) } };
  const uint16x4x2_t col_weights = { { vld1_u16(smooth_weights_u16 + 4),
                                       vld1_u16(smooth_weights_u16 + 8) } };
  // The top-right terms depend only on the column, so compute them once.
  const uint32x4_t tr_lo =
      vmull_n_u16(negate_s8(col_weights.val[0]), top_right);
  const uint32x4_t tr_hi =
      vmull_n_u16(negate_s8(col_weights.val[1]), top_right);

  for (int y = 0; y < height; ++y, dst += stride) {
    const uint16_t w_y = row_weights[y];
    const uint32x4_t bl_term = vmull_n_u16(below, 256 - w_y);
    highbd_calculate_pred8(dst, vaddq_u32(bl_term, tr_lo),
                           vaddq_u32(bl_term, tr_hi), above, col_weights,
                           left_column[y], w_y);
  }
}
475 
// Defines aom_highbd_smooth_predictor_<W>x<H>_neon() for the 4- and 8-wide
// block sizes by forwarding to highbd_smooth_<W>xh_neon(). |bd| is ignored.
#define HIGHBD_SMOOTH_NXM(W, H)                                 \
  void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                           \
    highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
    (void)bd;                                                   \
  }

// Width 4.
HIGHBD_SMOOTH_NXM(4, 4)
HIGHBD_SMOOTH_NXM(4, 8)
HIGHBD_SMOOTH_NXM(4, 16)
// Width 8.
HIGHBD_SMOOTH_NXM(8, 4)
HIGHBD_SMOOTH_NXM(8, 8)
HIGHBD_SMOOTH_NXM(8, 16)
HIGHBD_SMOOTH_NXM(8, 32)

#undef HIGHBD_SMOOTH_NXM
493 
// For width 16 and above.
// Defines highbd_smooth_<W>xh_neon(). Each row is produced in W / 8 groups of
// eight samples through highbd_calculate_pred8(). Column weights are the W
// entries of smooth_weights_u16 starting at offset W - 4; row weights start at
// offset height - 4.
#define HIGHBD_SMOOTH_PREDICTOR(W)                                            \
  static void highbd_smooth_##W##xh_neon(                                     \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,         \
      const uint16_t *const left_column, const int height) {                  \
    const uint16_t *const row_weights = smooth_weights_u16 + height - 4;      \
    const uint16_t *const col_weights = smooth_weights_u16 + (W)-4;           \
    const uint16x4_t below = vdup_n_u16(left_column[height - 1]);             \
    const uint16_t top_right = top_row[(W)-1];                                \
                                                                              \
    /* The top-right terms depend only on the column: compute them once. */   \
    uint32x4_t tr_lo[(W) >> 3];                                               \
    uint32x4_t tr_hi[(W) >> 3];                                               \
    for (int x = 0; x < (W); x += 8) {                                        \
      const uint16x4_t w_lo = vld1_u16(col_weights + x);                      \
      const uint16x4_t w_hi = vld1_u16(col_weights + x + 4);                  \
      tr_lo[x >> 3] = vmull_n_u16(negate_s8(w_lo), top_right);                \
      tr_hi[x >> 3] = vmull_n_u16(negate_s8(w_hi), top_right);                \
    }                                                                         \
                                                                              \
    for (int y = 0; y < height; ++y, dst += stride) {                         \
      const uint16_t w_y = row_weights[y];                                    \
      const uint32x4_t bl_term = vmull_n_u16(below, 256 - w_y);               \
      for (int x = 0; x < (W); x += 8) {                                      \
        const uint16x4x2_t above = { { vld1_u16(top_row + x),                 \
                                       vld1_u16(top_row + x + 4) } };         \
        const uint16x4x2_t w_x = { { vld1_u16(col_weights + x),               \
                                     vld1_u16(col_weights + x + 4) } };       \
        /* Add the edge terms to the summed corners and store 8 samples. */   \
        highbd_calculate_pred8(dst + x, vaddq_u32(bl_term, tr_lo[x >> 3]),    \
                               vaddq_u32(bl_term, tr_hi[x >> 3]), above, w_x, \
                               left_column[y], w_y);                          \
      }                                                                       \
    }                                                                         \
  }

HIGHBD_SMOOTH_PREDICTOR(16)
HIGHBD_SMOOTH_PREDICTOR(32)
HIGHBD_SMOOTH_PREDICTOR(64)

#undef HIGHBD_SMOOTH_PREDICTOR
548 
// Defines aom_highbd_smooth_predictor_<W>x<H>_neon() for block widths of 16
// and above by forwarding to highbd_smooth_<W>xh_neon(). |bd| is ignored.
#define HIGHBD_SMOOTH_NXM_WIDE(W, H)                            \
  void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                           \
    highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
    (void)bd;                                                   \
  }

// Width 16.
HIGHBD_SMOOTH_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_NXM_WIDE(16, 64)
// Width 32.
HIGHBD_SMOOTH_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_NXM_WIDE(32, 64)
// Width 64.
HIGHBD_SMOOTH_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_NXM_WIDE(64, 64)

#undef HIGHBD_SMOOTH_NXM_WIDE
571 
// SMOOTH_V predictor for 4-wide blocks. Each output sample is the rounded sum
//   w_y * top[x] + (256 - w_y) * bottom_left
// shifted right by SMOOTH_WEIGHT_LOG2_SCALE, where bottom_left is
// left_column[height - 1] and w_y comes from smooth_weights_u16 starting at
// offset height - 4. No other entry of |left_column| is read.
static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const top_row,
                                     const uint16_t *const left_column,
                                     const int height) {
  const uint16_t *const row_weights = smooth_weights_u16 + height - 4;
  const uint16x4_t above = vld1_u16(top_row);
  const uint16x4_t below = vdup_n_u16(left_column[height - 1]);

  for (int y = 0; y < height; ++y, dst += stride) {
    const uint16_t w_y = row_weights[y];
    const uint32x4_t bl_term = vmull_n_u16(below, 256 - w_y);
    const uint32x4_t acc = vmlal_n_u16(bl_term, above, w_y);
    vst1_u16(dst, vrshrn_n_u32(acc, SMOOTH_WEIGHT_LOG2_SCALE));
  }
}
592 
// SMOOTH_V predictor for 8-wide blocks: the same per-sample blend as the
// 4-wide version, applied to two groups of four samples per row. Only
// left_column[height - 1] is read from |left_column|.
static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride,
                                     const uint16_t *const top_row,
                                     const uint16_t *const left_column,
                                     const int height) {
  const uint16_t *const row_weights = smooth_weights_u16 + height - 4;
  const uint16x4_t above[2] = { vld1_u16(top_row), vld1_u16(top_row + 4) };
  const uint16x4_t below = vdup_n_u16(left_column[height - 1]);

  for (int y = 0; y < height; ++y, dst += stride) {
    const uint16_t w_y = row_weights[y];
    // The bottom-left term is shared by both halves of the row.
    const uint32x4_t bl_term = vmull_n_u16(below, 256 - w_y);
    for (int half = 0; half < 2; ++half) {
      const uint32x4_t acc = vmlal_n_u16(bl_term, above[half], w_y);
      vst1_u16(dst + 4 * half, vrshrn_n_u32(acc, SMOOTH_WEIGHT_LOG2_SCALE));
    }
  }
}
619 
// Defines aom_highbd_smooth_v_predictor_<W>x<H>_neon() for the 4- and 8-wide
// block sizes by forwarding to highbd_smooth_v_<W>xh_neon(). |bd| is ignored.
#define HIGHBD_SMOOTH_V_NXM(W, H)                                \
  void aom_highbd_smooth_v_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
    (void)bd;                                                    \
  }

// Width 4.
HIGHBD_SMOOTH_V_NXM(4, 4)
HIGHBD_SMOOTH_V_NXM(4, 8)
HIGHBD_SMOOTH_V_NXM(4, 16)
// Width 8.
HIGHBD_SMOOTH_V_NXM(8, 4)
HIGHBD_SMOOTH_V_NXM(8, 8)
HIGHBD_SMOOTH_V_NXM(8, 16)
HIGHBD_SMOOTH_V_NXM(8, 32)

#undef HIGHBD_SMOOTH_V_NXM
637 
// For width 16 and above.
// Defines highbd_smooth_v_<W>xh_neon(). The top row is cached once as W / 4
// vectors of four samples; each row then blends every cached vector with the
// bottom-left sample (left_column[height - 1]) using the row weight taken from
// smooth_weights_u16 starting at offset height - 4.
#define HIGHBD_SMOOTH_V_PREDICTOR(W)                                        \
  static void highbd_smooth_v_##W##xh_neon(                                 \
      uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row, \
      const uint16_t *const left_column, const int height) {                \
    const uint16_t *const row_weights = smooth_weights_u16 + height - 4;    \
    const uint16x4_t below = vdup_n_u16(left_column[height - 1]);           \
                                                                            \
    uint16x4_t above[(W) >> 2];                                             \
    for (int x = 0; x < (W); x += 4) {                                      \
      above[x >> 2] = vld1_u16(top_row + x);                                \
    }                                                                       \
                                                                            \
    for (int y = 0; y < height; ++y) {                                      \
      const uint16_t w_y = row_weights[y];                                  \
      /* The bottom-left term is shared by the whole row. */                \
      const uint32x4_t bl_term = vmull_n_u16(below, 256 - w_y);             \
      for (int x = 0; x < (W); x += 4) {                                    \
        const uint32x4_t acc = vmlal_n_u16(bl_term, above[x >> 2], w_y);    \
        vst1_u16(dst + x, vrshrn_n_u32(acc, SMOOTH_WEIGHT_LOG2_SCALE));     \
      }                                                                     \
      dst += stride;                                                        \
    }                                                                       \
  }

HIGHBD_SMOOTH_V_PREDICTOR(16)
HIGHBD_SMOOTH_V_PREDICTOR(32)
HIGHBD_SMOOTH_V_PREDICTOR(64)

#undef HIGHBD_SMOOTH_V_PREDICTOR
680 
// Public entry points for the vertical smooth predictor, widths 16 and up.
//
// HIGHBD_SMOOTH_V_NXM_WIDE(BW, BH) defines
// aom_highbd_smooth_v_predictor_<BW>x<BH>_neon(), which forwards its
// arguments to highbd_smooth_v_<BW>xh_neon() with the block height fixed to
// BH. The bit depth argument `bd` is accepted but not used.
#define HIGHBD_SMOOTH_V_NXM_WIDE(BW, BH)                           \
  void aom_highbd_smooth_v_predictor_##BW##x##BH##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,    \
      const uint16_t *left, int bd) {                              \
    highbd_smooth_v_##BW##xh_neon(dst, y_stride, above, left, BH); \
    (void)bd;                                                      \
  }

// 16-wide blocks.
HIGHBD_SMOOTH_V_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 64)

// 32-wide blocks.
HIGHBD_SMOOTH_V_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)

// 64-wide blocks.
HIGHBD_SMOOTH_V_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)

#undef HIGHBD_SMOOTH_V_NXM_WIDE
703 
highbd_smooth_h_4xh_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * const top_row,const uint16_t * const left_column,const int height)704 static INLINE void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride,
705                                             const uint16_t *const top_row,
706                                             const uint16_t *const left_column,
707                                             const int height) {
708   const uint16_t top_right = top_row[3];
709 
710   const uint16x4_t weights_x = vld1_u16(smooth_weights_u16);
711   const uint16x4_t scaled_weights_x = negate_s8(weights_x);
712 
713   const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
714   for (int y = 0; y < height; ++y) {
715     const uint32x4_t weighted_left =
716         vmlal_n_u16(weighted_tr, weights_x, left_column[y]);
717     vst1_u16(dst, vrshrn_n_u32(weighted_left, SMOOTH_WEIGHT_LOG2_SCALE));
718     dst += stride;
719   }
720 }
721 
highbd_smooth_h_8xh_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * const top_row,const uint16_t * const left_column,const int height)722 static INLINE void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride,
723                                             const uint16_t *const top_row,
724                                             const uint16_t *const left_column,
725                                             const int height) {
726   const uint16_t top_right = top_row[7];
727 
728   const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
729                                      vld1_u16(smooth_weights_u16 + 8) } };
730 
731   const uint32x4_t weighted_tr_low =
732       vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
733   const uint32x4_t weighted_tr_high =
734       vmull_n_u16(negate_s8(weights_x.val[1]), top_right);
735 
736   for (int y = 0; y < height; ++y) {
737     const uint16_t left_y = left_column[y];
738     const uint32x4_t weighted_left_low =
739         vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
740     vst1_u16(dst, vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));
741 
742     const uint32x4_t weighted_left_high =
743         vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
744     vst1_u16(dst + 4,
745              vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE));
746     dst += stride;
747   }
748 }
749 
// Public entry points for the horizontal smooth predictor, widths 4 and 8.
//
// HIGHBD_SMOOTH_H_NXM(BW, BH) defines
// aom_highbd_smooth_h_predictor_<BW>x<BH>_neon(), which forwards its
// arguments to highbd_smooth_h_<BW>xh_neon() with the block height fixed to
// BH. The bit depth argument `bd` is accepted but not used.
#define HIGHBD_SMOOTH_H_NXM(BW, BH)                                \
  void aom_highbd_smooth_h_predictor_##BW##x##BH##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,    \
      const uint16_t *left, int bd) {                              \
    highbd_smooth_h_##BW##xh_neon(dst, y_stride, above, left, BH); \
    (void)bd;                                                      \
  }

// 4-wide blocks.
HIGHBD_SMOOTH_H_NXM(4, 4)
HIGHBD_SMOOTH_H_NXM(4, 8)
HIGHBD_SMOOTH_H_NXM(4, 16)

// 8-wide blocks.
HIGHBD_SMOOTH_H_NXM(8, 4)
HIGHBD_SMOOTH_H_NXM(8, 8)
HIGHBD_SMOOTH_H_NXM(8, 16)
HIGHBD_SMOOTH_H_NXM(8, 32)

#undef HIGHBD_SMOOTH_H_NXM
767 
768 // For width 16 and above.
769 #define HIGHBD_SMOOTH_H_PREDICTOR(W)                                          \
770   void highbd_smooth_h_##W##xh_neon(                                          \
771       uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,         \
772       const uint16_t *const left_column, const int height) {                  \
773     const uint16_t top_right = top_row[(W)-1];                                \
774                                                                               \
775     uint16x4_t weights_x_low[(W) >> 3];                                       \
776     uint16x4_t weights_x_high[(W) >> 3];                                      \
777     uint32x4_t weighted_tr_low[(W) >> 3];                                     \
778     uint32x4_t weighted_tr_high[(W) >> 3];                                    \
779     for (int i = 0; i < (W) >> 3; ++i) {                                      \
780       const int x = i << 3;                                                   \
781       weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x);            \
782       weighted_tr_low[i] =                                                    \
783           vmull_n_u16(negate_s8(weights_x_low[i]), top_right);                \
784       weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x);             \
785       weighted_tr_high[i] =                                                   \
786           vmull_n_u16(negate_s8(weights_x_high[i]), top_right);               \
787     }                                                                         \
788                                                                               \
789     for (int y = 0; y < height; ++y) {                                        \
790       uint16_t *dst_x = dst;                                                  \
791       const uint16_t left_y = left_column[y];                                 \
792       for (int i = 0; i < (W) >> 3; ++i) {                                    \
793         const uint32x4_t weighted_left_low =                                  \
794             vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y);        \
795         vst1_u16(dst_x,                                                       \
796                  vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));  \
797                                                                               \
798         const uint32x4_t weighted_left_high =                                 \
799             vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y);      \
800         vst1_u16(dst_x + 4,                                                   \
801                  vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
802         dst_x += 8;                                                           \
803       }                                                                       \
804       dst += stride;                                                          \
805     }                                                                         \
806   }
807 
808 HIGHBD_SMOOTH_H_PREDICTOR(16)
809 HIGHBD_SMOOTH_H_PREDICTOR(32)
810 HIGHBD_SMOOTH_H_PREDICTOR(64)
811 
812 #undef HIGHBD_SMOOTH_H_PREDICTOR
813 
// Public entry points for the horizontal smooth predictor, widths 16 and up.
//
// HIGHBD_SMOOTH_H_NXM_WIDE(BW, BH) defines
// aom_highbd_smooth_h_predictor_<BW>x<BH>_neon(), which forwards its
// arguments to highbd_smooth_h_<BW>xh_neon() with the block height fixed to
// BH. The bit depth argument `bd` is accepted but not used.
#define HIGHBD_SMOOTH_H_NXM_WIDE(BW, BH)                           \
  void aom_highbd_smooth_h_predictor_##BW##x##BH##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,    \
      const uint16_t *left, int bd) {                              \
    highbd_smooth_h_##BW##xh_neon(dst, y_stride, above, left, BH); \
    (void)bd;                                                      \
  }

// 16-wide blocks.
HIGHBD_SMOOTH_H_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 64)

// 32-wide blocks.
HIGHBD_SMOOTH_H_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)

// 64-wide blocks.
HIGHBD_SMOOTH_H_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)

#undef HIGHBD_SMOOTH_H_NXM_WIDE
836