/*
 * Copyright (c) 2022, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/intrapred_common.h"

// -----------------------------------------------------------------------------
// DC

static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                       const uint16_t *above,
                                       const uint16_t *left) {
  assert(bw >= 4);
  assert(IS_POWER_OF_TWO(bw));
  int expected_dc, sum = 0;
  const int count = bw * 2;
  uint32x4_t sum_q = vdupq_n_u32(0);
  uint32x2_t sum_d;
  uint16_t *dst_1;
  if (bw >= 8) {
    for (int i = 0; i < bw; i += 8) {
      sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
      sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
      above += 8;
      left += 8;
    }
    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
    expected_dc = (sum + (count >> 1)) / count;
    const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
    for (int r = 0; r < bw; r++) {
      dst_1 = dst;
      for (int i = 0; i < bw; i += 8) {
        vst1q_u16(dst_1, dc);
        dst_1 += 8;
      }
      dst += stride;
    }
  } else {  // 4x4
    sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
    expected_dc = (sum + (count >> 1)) / count;
    const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
    for (int r = 0; r < bw; r++) {
      vst1_u16(dst, dc);
      dst += stride;
    }
  }
}
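// For reference, a minimal scalar sketch of the computation above (an
// illustration added here, not part of the upstream file and not wired into
// any dispatch table): the DC value is the rounded average of the 2 * bw edge
// samples, then broadcast to every pixel of the bw x bw block. The function
// name is hypothetical.
static INLINE void highbd_dc_predictor_scalar_sketch(uint16_t *dst,
                                                     ptrdiff_t stride, int bw,
                                                     const uint16_t *above,
                                                     const uint16_t *left) {
  const int count = bw * 2;
  int sum = 0;
  for (int i = 0; i < bw; ++i) sum += above[i] + left[i];
  // Same rounding as the NEON path: add half the divisor before dividing.
  const uint16_t dc = (uint16_t)((sum + (count >> 1)) / count);
  for (int r = 0; r < bw; ++r) {
    for (int c = 0; c < bw; ++c) dst[c] = dc;
    dst += stride;
  }
}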

#define INTRA_PRED_HIGHBD_SIZED_NEON(type, width)               \
  void aom_highbd_##type##_predictor_##width##x##width##_neon(  \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
      const uint16_t *left, int bd) {                           \
    (void)bd;                                                   \
    highbd_##type##_predictor(dst, stride, width, above, left); \
  }

#define INTRA_PRED_SQUARE(type)          \
  INTRA_PRED_HIGHBD_SIZED_NEON(type, 4)  \
  INTRA_PRED_HIGHBD_SIZED_NEON(type, 8)  \
  INTRA_PRED_HIGHBD_SIZED_NEON(type, 16) \
  INTRA_PRED_HIGHBD_SIZED_NEON(type, 32) \
  INTRA_PRED_HIGHBD_SIZED_NEON(type, 64)

INTRA_PRED_SQUARE(dc)

#undef INTRA_PRED_SQUARE
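
// Illustrative usage only (added here, not part of the upstream file): one of
// the entry points generated by the macros above, called on stack buffers.
// The buffer contents and the wrapper name are assumptions for the example;
// real callers obtain these functions through config/aom_dsp_rtcd.h.
static INLINE void example_highbd_dc_4x4_call_sketch(void) {
  const uint16_t above[4] = { 100, 120, 140, 160 };
  const uint16_t left[4] = { 90, 110, 130, 150 };
  uint16_t dst[4 * 4];
  // Expansion of INTRA_PRED_HIGHBD_SIZED_NEON(dc, 4) via INTRA_PRED_SQUARE(dc).
  aom_highbd_dc_predictor_4x4_neon(dst, /*stride=*/4, above, left, /*bd=*/10);
}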

// -----------------------------------------------------------------------------
// V_PRED

#define HIGHBD_V_NXM(W, H)                                      \
  void aom_highbd_v_predictor_##W##x##H##_neon(                 \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
      const uint16_t *left, int bd) {                           \
    (void)left;                                                 \
    (void)bd;                                                   \
    vertical##W##xh_neon(dst, stride, above, H);                \
  }

static INLINE uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) {
  uint16x8x2_t x;
  // Clang/gcc uses ldp here.
  x.val[0] = vld1q_u16(ptr);
  x.val[1] = vld1q_u16(ptr + 8);
  return x;
}

static INLINE void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) {
  vst1q_u16(ptr, x.val[0]);
  vst1q_u16(ptr + 8, x.val[1]);
}

static INLINE void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *const above, int height) {
  const uint16x4_t row = vld1_u16(above);
  int y = height;
  do {
    vst1_u16(dst, row);
    vst1_u16(dst + stride, row);
    dst += stride << 1;
    y -= 2;
  } while (y != 0);
}

static INLINE void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *const above, int height) {
  const uint16x8_t row = vld1q_u16(above);
  int y = height;
  do {
    vst1q_u16(dst, row);
    vst1q_u16(dst + stride, row);
    dst += stride << 1;
    y -= 2;
  } while (y != 0);
}

static INLINE void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  const uint16x8x2_t row = load_uint16x8x2(above);
  int y = height;
  do {
    store_uint16x8x2(dst, row);
    store_uint16x8x2(dst + stride, row);
    dst += stride << 1;
    y -= 2;
  } while (y != 0);
}

static INLINE uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) {
  uint16x8x4_t x;
  // Clang/gcc uses ldp here.
  x.val[0] = vld1q_u16(ptr);
  x.val[1] = vld1q_u16(ptr + 8);
  x.val[2] = vld1q_u16(ptr + 16);
  x.val[3] = vld1q_u16(ptr + 24);
  return x;
}

static INLINE void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) {
  vst1q_u16(ptr, x.val[0]);
  vst1q_u16(ptr + 8, x.val[1]);
  vst1q_u16(ptr + 16, x.val[2]);
  vst1q_u16(ptr + 24, x.val[3]);
}

static INLINE void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  const uint16x8x4_t row = load_uint16x8x4(above);
  int y = height;
  do {
    store_uint16x8x4(dst, row);
    store_uint16x8x4(dst + stride, row);
    dst += stride << 1;
    y -= 2;
  } while (y != 0);
}

static INLINE void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  uint16_t *dst32 = dst + 32;
  const uint16x8x4_t row = load_uint16x8x4(above);
  const uint16x8x4_t row32 = load_uint16x8x4(above + 32);
  int y = height;
  do {
    store_uint16x8x4(dst, row);
    store_uint16x8x4(dst32, row32);
    store_uint16x8x4(dst + stride, row);
    store_uint16x8x4(dst32 + stride, row32);
    dst += stride << 1;
    dst32 += stride << 1;
    y -= 2;
  } while (y != 0);
}
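
// For reference, a minimal scalar sketch (added here, not part of the
// upstream file) of what every vertical##W##xh_neon helper above does: each
// output row is a straight copy of the `above` row. The name is hypothetical.
static INLINE void highbd_v_predictor_scalar_sketch(uint16_t *dst,
                                                    ptrdiff_t stride, int bw,
                                                    int bh,
                                                    const uint16_t *above) {
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) dst[c] = above[c];
    dst += stride;
  }
}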

HIGHBD_V_NXM(4, 4)
HIGHBD_V_NXM(4, 8)
HIGHBD_V_NXM(4, 16)

HIGHBD_V_NXM(8, 4)
HIGHBD_V_NXM(8, 8)
HIGHBD_V_NXM(8, 16)
HIGHBD_V_NXM(8, 32)

HIGHBD_V_NXM(16, 4)
HIGHBD_V_NXM(16, 8)
HIGHBD_V_NXM(16, 16)
HIGHBD_V_NXM(16, 32)
HIGHBD_V_NXM(16, 64)

HIGHBD_V_NXM(32, 8)
HIGHBD_V_NXM(32, 16)
HIGHBD_V_NXM(32, 32)
HIGHBD_V_NXM(32, 64)

HIGHBD_V_NXM(64, 16)
HIGHBD_V_NXM(64, 32)
HIGHBD_V_NXM(64, 64)

// -----------------------------------------------------------------------------
// PAETH

static INLINE void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride,
                                              const uint16_t *const top_row,
                                              const uint16_t *const left_column,
                                              int width, int height) {
  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
  uint16x8_t top;
  if (width == 4) {
    top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0));
  } else {  // width == 8
    top = vld1q_u16(top_row);
  }

  for (int y = 0; y < height; ++y) {
    const uint16x8_t left = vdupq_n_u16(left_column[y]);

    const uint16x8_t left_dist = vabdq_u16(top, top_left);
    const uint16x8_t top_dist = vabdq_u16(left, top_left);
    const uint16x8_t top_left_dist =
        vabdq_u16(vaddq_u16(top, left), top_left_x2);

    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);

    // if (left_dist <= top_dist && left_dist <= top_left_dist)
    const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
    // dest[x] = left_column[y];
    // Fill all the unused spaces with 'top'. They will be overwritten when
    // the positions for top_left are known.
    uint16x8_t result = vbslq_u16(left_mask, left, top);
    // else if (top_dist <= top_left_dist)
    // dest[x] = top_row[x];
    // Add these values to the mask. They were already set.
    const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
    // else
    // dest[x] = top_left;
    result = vbslq_u16(left_or_top_mask, result, top_left);

    if (width == 4) {
      vst1_u16(dest, vget_low_u16(result));
    } else {  // width == 8
      vst1q_u16(dest, result);
    }
    dest += stride;
  }
}
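
// A minimal scalar sketch (added here, not part of the upstream file) of the
// per-pixel Paeth selection the vector code above implements. The helper
// names are hypothetical; in the callers, top_left is above[-1].
static INLINE int paeth_abs_diff_sketch(int a, int b) {
  return a > b ? a - b : b - a;
}

static INLINE uint16_t paeth_scalar_sketch(uint16_t top, uint16_t left,
                                           uint16_t top_left) {
  // Distances of each candidate to the gradient estimate top + left - top_left.
  const int left_dist = paeth_abs_diff_sketch(top, top_left);
  const int top_dist = paeth_abs_diff_sketch(left, top_left);
  const int top_left_dist =
      paeth_abs_diff_sketch((int)top + (int)left, 2 * (int)top_left);
  // Prefer left, then top, then top_left -- the same tie-breaking order as
  // the masks built with vcleq_u16/vbslq_u16 above.
  if (left_dist <= top_dist && left_dist <= top_left_dist) return left;
  if (top_dist <= top_left_dist) return top;
  return top_left;
}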

#define HIGHBD_PAETH_NXM(W, H)                                  \
  void aom_highbd_paeth_predictor_##W##x##H##_neon(             \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
      const uint16_t *left, int bd) {                           \
    (void)bd;                                                   \
    highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
  }

HIGHBD_PAETH_NXM(4, 4)
HIGHBD_PAETH_NXM(4, 8)
HIGHBD_PAETH_NXM(4, 16)
HIGHBD_PAETH_NXM(8, 4)
HIGHBD_PAETH_NXM(8, 8)
HIGHBD_PAETH_NXM(8, 16)
HIGHBD_PAETH_NXM(8, 32)

// Select the closest values and collect them.
static INLINE uint16x8_t select_paeth(const uint16x8_t top,
                                      const uint16x8_t left,
                                      const uint16x8_t top_left,
                                      const uint16x8_t left_le_top,
                                      const uint16x8_t left_le_top_left,
                                      const uint16x8_t top_le_top_left) {
  // if (left_dist <= top_dist && left_dist <= top_left_dist)
  const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
  // dest[x] = left_column[y];
  // Fill all the unused spaces with 'top'. They will be overwritten when
  // the positions for top_left are known.
  const uint16x8_t result = vbslq_u16(left_mask, left, top);
  // else if (top_dist <= top_left_dist)
  // dest[x] = top_row[x];
  // Add these values to the mask. They were already set.
  const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
  // else
  // dest[x] = top_left;
  return vbslq_u16(left_or_top_mask, result, top_left);
}

#define PAETH_PREDICTOR(num)                                                  \
  do {                                                                        \
    const uint16x8_t left_dist = vabdq_u16(top[num], top_left);               \
    const uint16x8_t top_left_dist =                                          \
        vabdq_u16(vaddq_u16(top[num], left), top_left_x2);                    \
    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);            \
    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);  \
    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);    \
    const uint16x8_t result =                                                 \
        select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \
                     top_le_top_left);                                        \
    vst1q_u16(dest + (num * 8), result);                                      \
  } while (0)

#define LOAD_TOP_ROW(num) vld1q_u16(top_row + (num * 8))

static INLINE void highbd_paeth16_plus_x_h_neon(
    uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row,
    const uint16_t *const left_column, int width, int height) {
  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
  uint16x8_t top[8];
  top[0] = LOAD_TOP_ROW(0);
  top[1] = LOAD_TOP_ROW(1);
  if (width > 16) {
    top[2] = LOAD_TOP_ROW(2);
    top[3] = LOAD_TOP_ROW(3);
    if (width == 64) {
      top[4] = LOAD_TOP_ROW(4);
      top[5] = LOAD_TOP_ROW(5);
      top[6] = LOAD_TOP_ROW(6);
      top[7] = LOAD_TOP_ROW(7);
    }
  }

  for (int y = 0; y < height; ++y) {
    const uint16x8_t left = vdupq_n_u16(left_column[y]);
    const uint16x8_t top_dist = vabdq_u16(left, top_left);
    PAETH_PREDICTOR(0);
    PAETH_PREDICTOR(1);
    if (width > 16) {
      PAETH_PREDICTOR(2);
      PAETH_PREDICTOR(3);
      if (width == 64) {
        PAETH_PREDICTOR(4);
        PAETH_PREDICTOR(5);
        PAETH_PREDICTOR(6);
        PAETH_PREDICTOR(7);
      }
    }
    dest += stride;
  }
}

#define HIGHBD_PAETH_NXM_WIDE(W, H)                               \
  void aom_highbd_paeth_predictor_##W##x##H##_neon(               \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,     \
      const uint16_t *left, int bd) {                             \
    (void)bd;                                                     \
    highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
  }

HIGHBD_PAETH_NXM_WIDE(16, 4)
HIGHBD_PAETH_NXM_WIDE(16, 8)
HIGHBD_PAETH_NXM_WIDE(16, 16)
HIGHBD_PAETH_NXM_WIDE(16, 32)
HIGHBD_PAETH_NXM_WIDE(16, 64)
HIGHBD_PAETH_NXM_WIDE(32, 8)
HIGHBD_PAETH_NXM_WIDE(32, 16)
HIGHBD_PAETH_NXM_WIDE(32, 32)
HIGHBD_PAETH_NXM_WIDE(32, 64)
HIGHBD_PAETH_NXM_WIDE(64, 16)
HIGHBD_PAETH_NXM_WIDE(64, 32)
HIGHBD_PAETH_NXM_WIDE(64, 64)

// -----------------------------------------------------------------------------
// SMOOTH

// 256 - v = vneg_s8(v)
static INLINE uint16x4_t negate_s8(const uint16x4_t v) {
  return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
}
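
// A minimal scalar sketch (added here, not part of the upstream file) of why
// the reinterpret-and-negate trick above equals 256 - w for the smooth
// weights: every weight w satisfies 1 <= w <= 255, so its high byte is 0 and
// negating the low byte as an int8_t yields (256 - w) mod 256 == 256 - w.
// For example, w = 72: low byte 72 -> -72 as a u8 is 184 = 256 - 72, and the
// high byte stays 0. The helper name is hypothetical.
static INLINE uint16_t negate_weight_scalar_sketch(uint16_t w) {
  const uint8_t lo = (uint8_t)(-(int8_t)(w & 0xff));
  const uint8_t hi = (uint8_t)(-(int8_t)(w >> 8));  // Stays 0 for w <= 255.
  return (uint16_t)(lo | (hi << 8));
}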

static INLINE void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *const top_row,
                                          const uint16_t *const left_column,
                                          const int height) {
  const uint16_t top_right = top_row[3];
  const uint16_t bottom_left = left_column[height - 1];
  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;

  const uint16x4_t top_v = vld1_u16(top_row);
  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
  const uint16x4_t weights_x_v = vld1_u16(smooth_weights_u16);
  const uint16x4_t scaled_weights_x = negate_s8(weights_x_v);
  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);

  for (int y = 0; y < height; ++y) {
    // Each variable in the running summation is named for the last item to be
    // accumulated.
    const uint32x4_t weighted_top =
        vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
    const uint32x4_t weighted_left =
        vmlal_n_u16(weighted_top, weights_x_v, left_column[y]);
    const uint32x4_t weighted_bl =
        vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);

    const uint16x4_t pred =
        vrshrn_n_u32(weighted_bl, SMOOTH_WEIGHT_LOG2_SCALE + 1);
    vst1_u16(dst, pred);
    dst += stride;
  }
}
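
// A minimal scalar sketch (added here, not part of the upstream file) of the
// per-pixel SMOOTH blend computed above. weights_x/weights_y come from
// smooth_weights_u16 exactly as in the vector code; the helper name is
// hypothetical. With the 256-based weights, vrshrn_n_u32(x, S + 1) is
// (x + (1 << S)) >> (S + 1), mirrored below.
static INLINE uint16_t smooth_pixel_scalar_sketch(uint16_t top, uint16_t left,
                                                  uint16_t top_right,
                                                  uint16_t bottom_left,
                                                  uint16_t weight_x,
                                                  uint16_t weight_y) {
  const uint32_t sum = (uint32_t)weight_y * top +
                       (uint32_t)(256 - weight_y) * bottom_left +
                       (uint32_t)weight_x * left +
                       (uint32_t)(256 - weight_x) * top_right;
  // Two weighted averages are summed, so round and shift by one extra bit.
  return (uint16_t)((sum + (1 << SMOOTH_WEIGHT_LOG2_SCALE)) >>
                    (SMOOTH_WEIGHT_LOG2_SCALE + 1));
}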

// Common code between 8xH and [16|32|64]xH.
static INLINE void highbd_calculate_pred8(
    uint16_t *dst, const uint32x4_t weighted_corners_low,
    const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals,
    const uint16x4x2_t weights_x, const uint16_t left_y,
    const uint16_t weight_y) {
  // Each variable in the running summation is named for the last item to be
  // accumulated.
  const uint32x4_t weighted_top_low =
      vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
  const uint32x4_t weighted_edges_low =
      vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);

  const uint16x4_t pred_low =
      vrshrn_n_u32(weighted_edges_low, SMOOTH_WEIGHT_LOG2_SCALE + 1);
  vst1_u16(dst, pred_low);

  const uint32x4_t weighted_top_high =
      vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
  const uint32x4_t weighted_edges_high =
      vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);

  const uint16x4_t pred_high =
      vrshrn_n_u32(weighted_edges_high, SMOOTH_WEIGHT_LOG2_SCALE + 1);
  vst1_u16(dst + 4, pred_high);
}

static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride,
                                   const uint16_t *const top_row,
                                   const uint16_t *const left_column,
                                   const int height) {
  const uint16_t top_right = top_row[7];
  const uint16_t bottom_left = left_column[height - 1];
  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;

  const uint16x4x2_t top_vals = { { vld1_u16(top_row),
                                    vld1_u16(top_row + 4) } };
  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
  const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
                                     vld1_u16(smooth_weights_u16 + 8) } };
  const uint32x4_t weighted_tr_low =
      vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
  const uint32x4_t weighted_tr_high =
      vmull_n_u16(negate_s8(weights_x.val[1]), top_right);

  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_bl =
        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
    const uint32x4_t weighted_corners_low =
        vaddq_u32(weighted_bl, weighted_tr_low);
    const uint32x4_t weighted_corners_high =
        vaddq_u32(weighted_bl, weighted_tr_high);
    highbd_calculate_pred8(dst, weighted_corners_low, weighted_corners_high,
                           top_vals, weights_x, left_column[y], weights_y[y]);
    dst += stride;
  }
}

#define HIGHBD_SMOOTH_NXM(W, H)                                 \
  void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                           \
    (void)bd;                                                   \
    highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
  }

HIGHBD_SMOOTH_NXM(4, 4)
HIGHBD_SMOOTH_NXM(4, 8)
HIGHBD_SMOOTH_NXM(8, 4)
HIGHBD_SMOOTH_NXM(8, 8)
HIGHBD_SMOOTH_NXM(4, 16)
HIGHBD_SMOOTH_NXM(8, 16)
HIGHBD_SMOOTH_NXM(8, 32)

#undef HIGHBD_SMOOTH_NXM

// For width 16 and above.
#define HIGHBD_SMOOTH_PREDICTOR(W)                                             \
  static void highbd_smooth_##W##xh_neon(                                      \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,          \
      const uint16_t *const left_column, const int height) {                   \
    const uint16_t top_right = top_row[(W)-1];                                 \
    const uint16_t bottom_left = left_column[height - 1];                      \
    const uint16_t *const weights_y = smooth_weights_u16 + height - 4;         \
                                                                               \
    /* Precompute weighted values that don't vary with |y|. */                 \
    uint32x4_t weighted_tr_low[(W) >> 3];                                      \
    uint32x4_t weighted_tr_high[(W) >> 3];                                     \
    for (int i = 0; i < (W) >> 3; ++i) {                                       \
      const int x = i << 3;                                                    \
      const uint16x4_t weights_x_low =                                         \
          vld1_u16(smooth_weights_u16 + (W)-4 + x);                            \
      weighted_tr_low[i] = vmull_n_u16(negate_s8(weights_x_low), top_right);   \
      const uint16x4_t weights_x_high =                                        \
          vld1_u16(smooth_weights_u16 + (W) + x);                              \
      weighted_tr_high[i] = vmull_n_u16(negate_s8(weights_x_high), top_right); \
    }                                                                          \
                                                                               \
    const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);                  \
    for (int y = 0; y < height; ++y) {                                         \
      const uint32x4_t weighted_bl =                                           \
          vmull_n_u16(bottom_left_v, 256 - weights_y[y]);                      \
      uint16_t *dst_x = dst;                                                   \
      for (int i = 0; i < (W) >> 3; ++i) {                                     \
        const int x = i << 3;                                                  \
        const uint16x4x2_t top_vals = { { vld1_u16(top_row + x),               \
                                          vld1_u16(top_row + x + 4) } };       \
        const uint32x4_t weighted_corners_low =                                \
            vaddq_u32(weighted_bl, weighted_tr_low[i]);                        \
        const uint32x4_t weighted_corners_high =                               \
            vaddq_u32(weighted_bl, weighted_tr_high[i]);                       \
        /* Accumulate weighted edge values and store. */                       \
        const uint16x4x2_t weights_x = {                                       \
          { vld1_u16(smooth_weights_u16 + (W)-4 + x),                          \
            vld1_u16(smooth_weights_u16 + (W) + x) }                           \
        };                                                                     \
        highbd_calculate_pred8(dst_x, weighted_corners_low,                    \
                               weighted_corners_high, top_vals, weights_x,     \
                               left_column[y], weights_y[y]);                  \
        dst_x += 8;                                                            \
      }                                                                        \
      dst += stride;                                                           \
    }                                                                          \
  }

HIGHBD_SMOOTH_PREDICTOR(16)
HIGHBD_SMOOTH_PREDICTOR(32)
HIGHBD_SMOOTH_PREDICTOR(64)

#undef HIGHBD_SMOOTH_PREDICTOR

#define HIGHBD_SMOOTH_NXM_WIDE(W, H)                            \
  void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                           \
    (void)bd;                                                   \
    highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
  }

HIGHBD_SMOOTH_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_NXM_WIDE(16, 64)
HIGHBD_SMOOTH_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_NXM_WIDE(64, 64)

#undef HIGHBD_SMOOTH_NXM_WIDE

static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const top_row,
                                     const uint16_t *const left_column,
                                     const int height) {
  const uint16_t bottom_left = left_column[height - 1];
  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;

  const uint16x4_t top_v = vld1_u16(top_row);
  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);

  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_bl =
        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
    const uint32x4_t weighted_top =
        vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
    vst1_u16(dst, vrshrn_n_u32(weighted_top, SMOOTH_WEIGHT_LOG2_SCALE));

    dst += stride;
  }
}
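
// A minimal scalar sketch (added here, not part of the upstream file) of the
// per-pixel SMOOTH_V blend above: only the vertical pair (top sample,
// bottom_left) is mixed, so the rounding shift is SMOOTH_WEIGHT_LOG2_SCALE
// rather than SMOOTH_WEIGHT_LOG2_SCALE + 1. The helper name is hypothetical.
static INLINE uint16_t smooth_v_pixel_scalar_sketch(uint16_t top,
                                                    uint16_t bottom_left,
                                                    uint16_t weight_y) {
  const uint32_t sum =
      (uint32_t)weight_y * top + (uint32_t)(256 - weight_y) * bottom_left;
  return (uint16_t)((sum + (1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1))) >>
                    SMOOTH_WEIGHT_LOG2_SCALE);
}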

static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride,
                                     const uint16_t *const top_row,
                                     const uint16_t *const left_column,
                                     const int height) {
  const uint16_t bottom_left = left_column[height - 1];
  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;

  const uint16x4_t top_low = vld1_u16(top_row);
  const uint16x4_t top_high = vld1_u16(top_row + 4);
  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);

  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_bl =
        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);

    const uint32x4_t weighted_top_low =
        vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
    vst1_u16(dst, vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));

    const uint32x4_t weighted_top_high =
        vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
    vst1_u16(dst + 4,
             vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE));
    dst += stride;
  }
}

#define HIGHBD_SMOOTH_V_NXM(W, H)                                \
  void aom_highbd_smooth_v_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
  }

HIGHBD_SMOOTH_V_NXM(4, 4)
HIGHBD_SMOOTH_V_NXM(4, 8)
HIGHBD_SMOOTH_V_NXM(4, 16)
HIGHBD_SMOOTH_V_NXM(8, 4)
HIGHBD_SMOOTH_V_NXM(8, 8)
HIGHBD_SMOOTH_V_NXM(8, 16)
HIGHBD_SMOOTH_V_NXM(8, 32)

#undef HIGHBD_SMOOTH_V_NXM

// For width 16 and above.
#define HIGHBD_SMOOTH_V_PREDICTOR(W)                                          \
  static void highbd_smooth_v_##W##xh_neon(                                   \
      uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row,   \
      const uint16_t *const left_column, const int height) {                  \
    const uint16_t bottom_left = left_column[height - 1];                     \
    const uint16_t *const weights_y = smooth_weights_u16 + height - 4;        \
                                                                              \
    uint16x4x2_t top_vals[(W) >> 3];                                          \
    for (int i = 0; i < (W) >> 3; ++i) {                                      \
      const int x = i << 3;                                                   \
      top_vals[i].val[0] = vld1_u16(top_row + x);                             \
      top_vals[i].val[1] = vld1_u16(top_row + x + 4);                         \
    }                                                                         \
                                                                              \
    const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);                 \
    for (int y = 0; y < height; ++y) {                                        \
      const uint32x4_t weighted_bl =                                          \
          vmull_n_u16(bottom_left_v, 256 - weights_y[y]);                     \
                                                                              \
      uint16_t *dst_x = dst;                                                  \
      for (int i = 0; i < (W) >> 3; ++i) {                                    \
        const uint32x4_t weighted_top_low =                                   \
            vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]);       \
        vst1_u16(dst_x,                                                       \
                 vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));   \
                                                                              \
        const uint32x4_t weighted_top_high =                                  \
            vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]);       \
        vst1_u16(dst_x + 4,                                                   \
                 vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE));  \
        dst_x += 8;                                                           \
      }                                                                       \
      dst += stride;                                                          \
    }                                                                         \
  }

HIGHBD_SMOOTH_V_PREDICTOR(16)
HIGHBD_SMOOTH_V_PREDICTOR(32)
HIGHBD_SMOOTH_V_PREDICTOR(64)

#undef HIGHBD_SMOOTH_V_PREDICTOR

#define HIGHBD_SMOOTH_V_NXM_WIDE(W, H)                           \
  void aom_highbd_smooth_v_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
  }

HIGHBD_SMOOTH_V_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 64)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)

#undef HIGHBD_SMOOTH_V_NXM_WIDE

static INLINE void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *const top_row,
                                            const uint16_t *const left_column,
                                            const int height) {
  const uint16_t top_right = top_row[3];

  const uint16x4_t weights_x = vld1_u16(smooth_weights_u16);
  const uint16x4_t scaled_weights_x = negate_s8(weights_x);

  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_left =
        vmlal_n_u16(weighted_tr, weights_x, left_column[y]);
    vst1_u16(dst, vrshrn_n_u32(weighted_left, SMOOTH_WEIGHT_LOG2_SCALE));
    dst += stride;
  }
}
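
// A minimal scalar sketch (added here, not part of the upstream file) of the
// per-pixel SMOOTH_H blend above: the horizontal pair (left sample,
// top_right) is mixed with the x weights. The helper name is hypothetical.
static INLINE uint16_t smooth_h_pixel_scalar_sketch(uint16_t left,
                                                    uint16_t top_right,
                                                    uint16_t weight_x) {
  const uint32_t sum =
      (uint32_t)weight_x * left + (uint32_t)(256 - weight_x) * top_right;
  return (uint16_t)((sum + (1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1))) >>
                    SMOOTH_WEIGHT_LOG2_SCALE);
}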

static INLINE void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *const top_row,
                                            const uint16_t *const left_column,
                                            const int height) {
  const uint16_t top_right = top_row[7];

  const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
                                     vld1_u16(smooth_weights_u16 + 8) } };

  const uint32x4_t weighted_tr_low =
      vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
  const uint32x4_t weighted_tr_high =
      vmull_n_u16(negate_s8(weights_x.val[1]), top_right);

  for (int y = 0; y < height; ++y) {
    const uint16_t left_y = left_column[y];
    const uint32x4_t weighted_left_low =
        vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
    vst1_u16(dst, vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));

    const uint32x4_t weighted_left_high =
        vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
    vst1_u16(dst + 4,
             vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE));
    dst += stride;
  }
}

#define HIGHBD_SMOOTH_H_NXM(W, H)                                \
  void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
  }

HIGHBD_SMOOTH_H_NXM(4, 4)
HIGHBD_SMOOTH_H_NXM(4, 8)
HIGHBD_SMOOTH_H_NXM(4, 16)
HIGHBD_SMOOTH_H_NXM(8, 4)
HIGHBD_SMOOTH_H_NXM(8, 8)
HIGHBD_SMOOTH_H_NXM(8, 16)
HIGHBD_SMOOTH_H_NXM(8, 32)

#undef HIGHBD_SMOOTH_H_NXM

// For width 16 and above.
#define HIGHBD_SMOOTH_H_PREDICTOR(W)                                          \
  static void highbd_smooth_h_##W##xh_neon(                                   \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,         \
      const uint16_t *const left_column, const int height) {                  \
    const uint16_t top_right = top_row[(W)-1];                                \
                                                                              \
    uint16x4_t weights_x_low[(W) >> 3];                                       \
    uint16x4_t weights_x_high[(W) >> 3];                                      \
    uint32x4_t weighted_tr_low[(W) >> 3];                                     \
    uint32x4_t weighted_tr_high[(W) >> 3];                                    \
    for (int i = 0; i < (W) >> 3; ++i) {                                      \
      const int x = i << 3;                                                   \
      weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x);            \
      weighted_tr_low[i] =                                                    \
          vmull_n_u16(negate_s8(weights_x_low[i]), top_right);                \
      weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x);             \
      weighted_tr_high[i] =                                                   \
          vmull_n_u16(negate_s8(weights_x_high[i]), top_right);               \
    }                                                                         \
                                                                              \
    for (int y = 0; y < height; ++y) {                                        \
      uint16_t *dst_x = dst;                                                  \
      const uint16_t left_y = left_column[y];                                 \
      for (int i = 0; i < (W) >> 3; ++i) {                                    \
        const uint32x4_t weighted_left_low =                                  \
            vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y);        \
        vst1_u16(dst_x,                                                       \
                 vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));  \
                                                                              \
        const uint32x4_t weighted_left_high =                                 \
            vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y);      \
        vst1_u16(dst_x + 4,                                                   \
                 vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
        dst_x += 8;                                                           \
      }                                                                       \
      dst += stride;                                                          \
    }                                                                         \
  }

HIGHBD_SMOOTH_H_PREDICTOR(16)
HIGHBD_SMOOTH_H_PREDICTOR(32)
HIGHBD_SMOOTH_H_PREDICTOR(64)

#undef HIGHBD_SMOOTH_H_PREDICTOR

#define HIGHBD_SMOOTH_H_NXM_WIDE(W, H)                           \
  void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
  }

HIGHBD_SMOOTH_H_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 64)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)

#undef HIGHBD_SMOOTH_H_NXM_WIDE