1 /*
2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "aom/aom_integer.h"
#include "aom_dsp/arm/sum_neon.h"
17
18 DECLARE_ALIGNED(16, const int8_t,
19 av1_filter_intra_taps_neon[FILTER_INTRA_MODES][8][8]) = {
20 {
21 { -6, 0, 0, 0, -5, 10, 0, 0 },
22 { 10, 0, 12, 0, 2, 0, 9, 0 },
23 { -3, 1, 0, 0, -3, 1, 10, 0 },
24 { 1, 10, 7, 0, 1, 2, 5, 0 },
25 { -4, 0, 0, 12, -3, 6, 0, 9 },
26 { 6, 0, 2, 0, 2, 0, 2, 0 },
27 { -3, 2, 0, 7, -3, 2, 6, 5 },
28 { 2, 6, 2, 0, 1, 2, 3, 0 },
29 },
30 {
31 { -10, 0, 0, 0, -6, 16, 0, 0 },
32 { 16, 0, 10, 0, 0, 0, 6, 0 },
33 { -4, 0, 0, 0, -2, 0, 16, 0 },
34 { 0, 16, 4, 0, 0, 0, 2, 0 },
35 { -10, 0, 0, 10, -6, 16, 0, 6 },
36 { 16, 0, 0, 0, 0, 0, 0, 0 },
37 { -4, 0, 0, 4, -2, 0, 16, 2 },
38 { 0, 16, 0, 0, 0, 0, 0, 0 },
39 },
40 {
41 { -8, 0, 0, 0, -8, 8, 0, 0 },
42 { 8, 0, 16, 0, 0, 0, 16, 0 },
43 { -8, 0, 0, 0, -8, 0, 8, 0 },
44 { 0, 8, 16, 0, 0, 0, 16, 0 },
45 { -4, 0, 0, 16, -4, 4, 0, 16 },
46 { 4, 0, 0, 0, 0, 0, 0, 0 },
47 { -4, 0, 0, 16, -4, 0, 4, 16 },
48 { 0, 4, 0, 0, 0, 0, 0, 0 },
49 },
50 {
51 { -2, 0, 0, 0, -1, 8, 0, 0 },
52 { 8, 0, 10, 0, 3, 0, 6, 0 },
53 { -1, 3, 0, 0, 0, 2, 8, 0 },
54 { 2, 8, 4, 0, 1, 3, 2, 0 },
55 { -1, 0, 0, 10, -1, 4, 0, 6 },
56 { 4, 0, 3, 0, 3, 0, 4, 0 },
57 { -1, 3, 0, 4, -1, 2, 4, 3 },
58 { 2, 4, 4, 0, 2, 3, 3, 0 },
59 },
60 {
61 { -12, 0, 0, 0, -10, 14, 0, 0 },
62 { 14, 0, 14, 0, 0, 0, 12, 0 },
63 { -9, 0, 0, 0, -8, 0, 14, 0 },
64 { 0, 14, 11, 0, 0, 0, 10, 0 },
65 { -10, 0, 0, 14, -9, 12, 0, 12 },
66 { 12, 0, 0, 0, 1, 0, 0, 0 },
67 { -8, 0, 0, 11, -7, 0, 12, 9 },
68 { 0, 12, 1, 0, 0, 1, 1, 0 },
69 },
70 };
71
// Number of fractional bits carried by the filter taps. This equals the
// rounding right shift (4) applied to the accumulated sums in
// av1_filter_intra_predictor_neon().
#define FILTER_INTRA_SCALE_BITS 4
// Parenthesized so the expansion stays a single operand when the macro is
// used inside a larger expression (e.g. `2 * SHIFT_INTRA_SCALE_BITS`).
#define SHIFT_INTRA_SCALE_BITS (15 - FILTER_INTRA_SCALE_BITS)

// Byte-index vectors for vtbl1_u8(); the least significant byte is lane 0.
// MASK_LOW selects the even-indexed source bytes {0, 2, 4, 6} twice:
//   (0 | (2 << 8) | (4 << 16) | (6 << 24)) x 2
#define MASK_LOW 0x604020006040200
// MASK_HIGH selects the odd-indexed source bytes {1, 3, 5, 7} twice:
//   (1 | (3 << 8) | (5 << 16) | (7 << 24)) x 2
#define MASK_HIGH 0x705030107050301
79
// NEON filter-intra predictor.
//
// The block is predicted in 4x2 tiles, walking rows two at a time and columns
// four at a time. Each tile is computed from seven neighbouring pixels (the
// five pixels directly above it, starting one column to the left, plus the
// two pixels immediately to its left) and is written back into a scratch
// buffer so that it can serve as a neighbour for the tiles that follow.
//
//   dst     - destination; bh rows of bw bytes are written, `stride` apart.
//   stride  - distance in bytes between consecutive rows of dst.
//   tx_size - index into tx_size_wide / tx_size_high giving bw and bh
//             (both must be <= 32).
//   above   - row above the block; above[-1] .. above[bw - 1] are read.
//   left    - column left of the block; left[0] .. left[bh - 1] are read.
//   mode    - first index into av1_filter_intra_taps_neon.
void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride,
                                     TX_SIZE tx_size, const uint8_t *above,
                                     const uint8_t *left, int mode) {
  // Row 0 holds the above neighbours (including the top-left corner) and
  // column 0 holds the left neighbours; predicted pixels start at [1][1].
  uint8_t buffer[33][33];
  const int bw = tx_size_wide[tx_size];
  const int bh = tx_size_high[tx_size];

  assert(bw <= 32 && bh <= 32);
  assert(mode >= 0 && mode < (int)(sizeof(av1_filter_intra_taps_neon) /
                                   sizeof(av1_filter_intra_taps_neon[0])));

  // Each 16-byte load covers two consecutive tap rows: the low half holds the
  // taps for the even-indexed neighbours, the high half those for the
  // odd-indexed neighbours. Widen to 16 bits for the multiply-accumulate.
  const int8x16_t f1f0 = vld1q_s8(av1_filter_intra_taps_neon[mode][0]);
  const int8x16_t f3f2 = vld1q_s8(av1_filter_intra_taps_neon[mode][2]);
  const int8x16_t f5f4 = vld1q_s8(av1_filter_intra_taps_neon[mode][4]);
  const int8x16_t f7f6 = vld1q_s8(av1_filter_intra_taps_neon[mode][6]);
  const int16x8_t f1f0_lo = vmovl_s8(vget_low_s8(f1f0));
  const int16x8_t f1f0_hi = vmovl_s8(vget_high_s8(f1f0));
  const int16x8_t f3f2_lo = vmovl_s8(vget_low_s8(f3f2));
  const int16x8_t f3f2_hi = vmovl_s8(vget_high_s8(f3f2));
  const int16x8_t f5f4_lo = vmovl_s8(vget_low_s8(f5f4));
  const int16x8_t f5f4_hi = vmovl_s8(vget_high_s8(f5f4));
  const int16x8_t f7f6_lo = vmovl_s8(vget_low_s8(f7f6));
  const int16x8_t f7f6_hi = vmovl_s8(vget_high_s8(f7f6));
  const uint8x8_t vmask_low = vcreate_u8(MASK_LOW);
  const uint8x8_t vmask_high = vcreate_u8(MASK_HIGH);

  // Seed the scratch buffer with the block's neighbours.
  for (int r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));

  for (int r = 1; r < bh + 1; r += 2) {
    for (int c = 1; c < bw + 1; c += 4) {
      // Gather the seven neighbours of this 4x2 tile:
      //   p[0..4] = row above, from the top-left corner to the fourth column
      //   p[5]    = pixel left of the tile's first row
      //   p[6]    = pixel left of the tile's second row
      //   p[7]    = zero padding
      DECLARE_ALIGNED(16, uint8_t, p[8]);
      memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t));
      p[5] = buffer[r][c - 1];
      p[6] = buffer[r + 1][c - 1];
      p[7] = 0;

      const uint8x8_t p_b = vld1_u8(p);

      // Split into even-indexed and odd-indexed neighbours, each duplicated
      // into both halves so one vector serves two output pixels at once.
      const uint16x8_t p_b_lo = vmovl_u8(vtbl1_u8(p_b, vmask_low));
      const uint16x8_t p_b_hi = vmovl_u8(vtbl1_u8(p_b, vmask_high));

      // Lanes 0..3 of out_XY hold the partial products of output pixel X,
      // lanes 4..7 those of output pixel Y.
      int16x8_t out_01 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f1f0_lo);
      out_01 = vmlaq_s16(out_01, vreinterpretq_s16_u16(p_b_hi), f1f0_hi);
      int16x8_t out_23 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f3f2_lo);
      out_23 = vmlaq_s16(out_23, vreinterpretq_s16_u16(p_b_hi), f3f2_hi);
      int16x8_t out_45 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f5f4_lo);
      out_45 = vmlaq_s16(out_45, vreinterpretq_s16_u16(p_b_hi), f5f4_hi);
      int16x8_t out_67 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f7f6_lo);
      out_67 = vmlaq_s16(out_67, vreinterpretq_s16_u16(p_b_hi), f7f6_hi);

      // Two rounds of pairwise addition collapse each group of four partial
      // products into one sum, leaving output pixels 0..7 in lanes 0..7.
#if defined(__aarch64__)
      const int16x8_t out_0123 = vpaddq_s16(out_01, out_23);
      const int16x8_t out_4567 = vpaddq_s16(out_45, out_67);
      const int16x8_t out_01234567 = vpaddq_s16(out_0123, out_4567);
#else
      // No full-width pairwise add here: use a widening pairwise add followed
      // by a saturating narrow back to 16 bits.
      const int16x8_t out_0123 = vcombine_s16(vqmovn_s32(vpaddlq_s16(out_01)),
                                              vqmovn_s32(vpaddlq_s16(out_23)));
      const int16x8_t out_4567 = vcombine_s16(vqmovn_s32(vpaddlq_s16(out_45)),
                                              vqmovn_s32(vpaddlq_s16(out_67)));
      const int16x8_t out_01234567 = vcombine_s16(
          vqmovn_s32(vpaddlq_s16(out_0123)), vqmovn_s32(vpaddlq_s16(out_4567)));
#endif  // (__aarch64__)

      // Rounding shift by 4 (the value of FILTER_INTRA_SCALE_BITS), then
      // saturate to the 8-bit pixel range.
      const uint32x2_t out_r =
          vreinterpret_u32_u8(vqmovun_s16(vrshrq_n_s16(out_01234567, 4)));

      // Lane 0 is the tile's first row, lane 1 its second row. The
      // destination addresses are only byte-aligned (odd column offset,
      // 33-byte rows), so copy the lanes out with memcpy instead of storing
      // through a uint32_t pointer, which would be a misaligned access.
      const uint32_t row0 = vget_lane_u32(out_r, 0);
      const uint32_t row1 = vget_lane_u32(out_r, 1);
      memcpy(&buffer[r][c], &row0, sizeof(row0));
      memcpy(&buffer[r + 1][c], &row1, sizeof(row1));
    }
  }

  // Copy the predicted pixels (everything but the neighbour row/column) out.
  for (int r = 0; r < bh; ++r) {
    memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
    dst += stride;
  }
}
154