/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"

static inline unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *ref_ptr,
                                               int ref_stride, int w, int h) {
  // Only two accumulators are required for optimal instruction throughput of
  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

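  // In the loops below, vdotq_u32 against a vector of all-ones adds each group
  // of four byte-wide absolute differences into one 32-bit accumulator lane,
  // folding the widening and horizontal add into a single UDOT instruction.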
  int i = h;
  do {
    int j = 0;
    do {
      uint8x16_t s0, s1, r0, r1, diff0, diff1;

      s0 = vld1q_u8(src_ptr + j);
      r0 = vld1q_u8(ref_ptr + j);
      diff0 = vabdq_u8(s0, r0);
      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));

      s1 = vld1q_u8(src_ptr + j + 16);
      r1 = vld1q_u8(ref_ptr + j + 16);
      diff1 = vabdq_u8(s1, r1);
      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));

      j += 32;
    } while (j < w);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}

static inline unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 const uint8_t *ref_ptr,
                                                 int ref_stride, int h) {
  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
}

static inline unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
}

static inline unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
}

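// For 16-pixel-wide blocks each row fits in a single 16-byte vector, so the
// loop below processes two rows per iteration to keep both accumulators busy.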
static inline unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = h / 2;
  do {
    uint8x16_t s0, s1, r0, r1, diff0, diff1;

    s0 = vld1q_u8(src_ptr);
    r0 = vld1q_u8(ref_ptr);
    diff0 = vabdq_u8(s0, r0);
    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));

    src_ptr += src_stride;
    ref_ptr += ref_stride;

    s1 = vld1q_u8(src_ptr);
    r1 = vld1q_u8(ref_ptr);
    diff1 = vabdq_u8(s1, r1);
    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}

#define SAD_WXH_NEON_DOTPROD(w, h)                                         \
  unsigned int aom_sad##w##x##h##_neon_dotprod(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,              \
      int ref_stride) {                                                    \
    return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \
  }

SAD_WXH_NEON_DOTPROD(16, 8)
SAD_WXH_NEON_DOTPROD(16, 16)
SAD_WXH_NEON_DOTPROD(16, 32)

SAD_WXH_NEON_DOTPROD(32, 16)
SAD_WXH_NEON_DOTPROD(32, 32)
SAD_WXH_NEON_DOTPROD(32, 64)

SAD_WXH_NEON_DOTPROD(64, 32)
SAD_WXH_NEON_DOTPROD(64, 64)
SAD_WXH_NEON_DOTPROD(64, 128)

SAD_WXH_NEON_DOTPROD(128, 64)
SAD_WXH_NEON_DOTPROD(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_WXH_NEON_DOTPROD(16, 4)
SAD_WXH_NEON_DOTPROD(16, 64)
SAD_WXH_NEON_DOTPROD(32, 8)
SAD_WXH_NEON_DOTPROD(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_NEON_DOTPROD

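// The "skip" variants estimate the SAD from every other row: the strides are
// doubled and the height halved, then the result is doubled so it stays on
// roughly the same scale as the full SAD.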
#define SAD_SKIP_WXH_NEON_DOTPROD(w, h)                            \
  unsigned int aom_sad_skip_##w##x##h##_neon_dotprod(              \
      const uint8_t *src, int src_stride, const uint8_t *ref,      \
      int ref_stride) {                                            \
    return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref,   \
                                       2 * ref_stride, (h) / 2);   \
  }

SAD_SKIP_WXH_NEON_DOTPROD(16, 16)
SAD_SKIP_WXH_NEON_DOTPROD(16, 32)

SAD_SKIP_WXH_NEON_DOTPROD(32, 16)
SAD_SKIP_WXH_NEON_DOTPROD(32, 32)
SAD_SKIP_WXH_NEON_DOTPROD(32, 64)

SAD_SKIP_WXH_NEON_DOTPROD(64, 32)
SAD_SKIP_WXH_NEON_DOTPROD(64, 64)
SAD_SKIP_WXH_NEON_DOTPROD(64, 128)

SAD_SKIP_WXH_NEON_DOTPROD(128, 64)
SAD_SKIP_WXH_NEON_DOTPROD(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_SKIP_WXH_NEON_DOTPROD(16, 64)
SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_SKIP_WXH_NEON_DOTPROD

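// The "avg" variants compute the SAD between the source block and the rounded
// average (vrhaddq_u8) of the reference block and a second predictor.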
static inline unsigned int sadwxh_avg_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int w, int h, const uint8_t *second_pred) {
  // Only two accumulators are required for optimal instruction throughput of
  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = h;
  do {
    int j = 0;
    do {
      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;

      s0 = vld1q_u8(src_ptr + j);
      r0 = vld1q_u8(ref_ptr + j);
      p0 = vld1q_u8(second_pred);
      avg0 = vrhaddq_u8(r0, p0);
      diff0 = vabdq_u8(s0, avg0);
      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));

      s1 = vld1q_u8(src_ptr + j + 16);
      r1 = vld1q_u8(ref_ptr + j + 16);
      p1 = vld1q_u8(second_pred + 16);
      avg1 = vrhaddq_u8(r1, p1);
      diff1 = vabdq_u8(s1, avg1);
      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));

      j += 32;
      second_pred += 32;
    } while (j < w);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}

static inline unsigned int sad128xh_avg_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128,
                                 h, second_pred);
}

static inline unsigned int sad64xh_avg_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
                                 h, second_pred);
}

static inline unsigned int sad32xh_avg_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
                                 h, second_pred);
}

static inline unsigned int sad16xh_avg_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = h / 2;
  do {
    uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;

    s0 = vld1q_u8(src_ptr);
    r0 = vld1q_u8(ref_ptr);
    p0 = vld1q_u8(second_pred);
    avg0 = vrhaddq_u8(r0, p0);
    diff0 = vabdq_u8(s0, avg0);
    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 16;

    s1 = vld1q_u8(src_ptr);
    r1 = vld1q_u8(ref_ptr);
    p1 = vld1q_u8(second_pred);
    avg1 = vrhaddq_u8(r1, p1);
    diff1 = vabdq_u8(s1, avg1);
    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 16;
  } while (--i != 0);

  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}

#define SAD_WXH_AVG_NEON_DOTPROD(w, h)                                        \
  unsigned int aom_sad##w##x##h##_avg_neon_dotprod(                           \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred) {                                           \
    return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
                                       second_pred);                          \
  }

SAD_WXH_AVG_NEON_DOTPROD(16, 8)
SAD_WXH_AVG_NEON_DOTPROD(16, 16)
SAD_WXH_AVG_NEON_DOTPROD(16, 32)

SAD_WXH_AVG_NEON_DOTPROD(32, 16)
SAD_WXH_AVG_NEON_DOTPROD(32, 32)
SAD_WXH_AVG_NEON_DOTPROD(32, 64)

SAD_WXH_AVG_NEON_DOTPROD(64, 32)
SAD_WXH_AVG_NEON_DOTPROD(64, 64)
SAD_WXH_AVG_NEON_DOTPROD(64, 128)

SAD_WXH_AVG_NEON_DOTPROD(128, 64)
SAD_WXH_AVG_NEON_DOTPROD(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_WXH_AVG_NEON_DOTPROD(16, 64)
SAD_WXH_AVG_NEON_DOTPROD(32, 8)
SAD_WXH_AVG_NEON_DOTPROD(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_AVG_NEON_DOTPROD