/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"

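// Each row is processed 32 bytes at a time: ABD (vabdq_u8) computes per-byte
// absolute differences, then UDOT (vdotq_u32) against a vector of ones sums
// each group of four bytes straight into a 32-bit accumulator lane, so no
// intermediate widening adds are needed.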
static inline unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *ref_ptr,
                                               int ref_stride, int w, int h) {
  // Only two accumulators are required for optimal instruction throughput of
  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = h;
  do {
    int j = 0;
    do {
      uint8x16_t s0, s1, r0, r1, diff0, diff1;

      s0 = vld1q_u8(src_ptr + j);
      r0 = vld1q_u8(ref_ptr + j);
      diff0 = vabdq_u8(s0, r0);
      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));

      s1 = vld1q_u8(src_ptr + j + 16);
      r1 = vld1q_u8(ref_ptr + j + 16);
      diff1 = vabdq_u8(s1, r1);
      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));

      j += 32;
    } while (j < w);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}

static inline unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr,
                                                 int src_stride,
                                                 const uint8_t *ref_ptr,
                                                 int ref_stride, int h) {
  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
}

static inline unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
}

static inline unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
}

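// For 16-pixel-wide blocks a whole row fits in a single vector, so two rows
// are processed per loop iteration to keep both accumulators busy.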
static inline unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
                                                int src_stride,
                                                const uint8_t *ref_ptr,
                                                int ref_stride, int h) {
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = h / 2;
  do {
    uint8x16_t s0, s1, r0, r1, diff0, diff1;

    s0 = vld1q_u8(src_ptr);
    r0 = vld1q_u8(ref_ptr);
    diff0 = vabdq_u8(s0, r0);
    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));

    src_ptr += src_stride;
    ref_ptr += ref_stride;

    s1 = vld1q_u8(src_ptr);
    r1 = vld1q_u8(ref_ptr);
    diff1 = vabdq_u8(s1, r1);
    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}

#define SAD_WXH_NEON_DOTPROD(w, h)                                         \
  unsigned int aom_sad##w##x##h##_neon_dotprod(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,              \
      int ref_stride) {                                                    \
    return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \
  }
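// For example, SAD_WXH_NEON_DOTPROD(16, 8) defines
// aom_sad16x8_neon_dotprod(src, src_stride, ref, ref_stride), which simply
// calls sad16xh_neon_dotprod(src, src_stride, ref, ref_stride, 8).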

SAD_WXH_NEON_DOTPROD(16, 8)
SAD_WXH_NEON_DOTPROD(16, 16)
SAD_WXH_NEON_DOTPROD(16, 32)

SAD_WXH_NEON_DOTPROD(32, 16)
SAD_WXH_NEON_DOTPROD(32, 32)
SAD_WXH_NEON_DOTPROD(32, 64)

SAD_WXH_NEON_DOTPROD(64, 32)
SAD_WXH_NEON_DOTPROD(64, 64)
SAD_WXH_NEON_DOTPROD(64, 128)

SAD_WXH_NEON_DOTPROD(128, 64)
SAD_WXH_NEON_DOTPROD(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_WXH_NEON_DOTPROD(16, 4)
SAD_WXH_NEON_DOTPROD(16, 64)
SAD_WXH_NEON_DOTPROD(32, 8)
SAD_WXH_NEON_DOTPROD(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_NEON_DOTPROD

#define SAD_SKIP_WXH_NEON_DOTPROD(w, h)                          \
  unsigned int aom_sad_skip_##w##x##h##_neon_dotprod(            \
      const uint8_t *src, int src_stride, const uint8_t *ref,    \
      int ref_stride) {                                          \
    return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \
                                       2 * ref_stride, (h) / 2); \
  }
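// The skip variants estimate the SAD from every other row: both strides are
// doubled and h is halved, then the partial sum is scaled by 2 to
// approximate the full-block SAD.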

SAD_SKIP_WXH_NEON_DOTPROD(16, 16)
SAD_SKIP_WXH_NEON_DOTPROD(16, 32)

SAD_SKIP_WXH_NEON_DOTPROD(32, 16)
SAD_SKIP_WXH_NEON_DOTPROD(32, 32)
SAD_SKIP_WXH_NEON_DOTPROD(32, 64)

SAD_SKIP_WXH_NEON_DOTPROD(64, 32)
SAD_SKIP_WXH_NEON_DOTPROD(64, 64)
SAD_SKIP_WXH_NEON_DOTPROD(64, 128)

SAD_SKIP_WXH_NEON_DOTPROD(128, 64)
SAD_SKIP_WXH_NEON_DOTPROD(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_SKIP_WXH_NEON_DOTPROD(16, 64)
SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_SKIP_WXH_NEON_DOTPROD

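// The _avg variants compute the SAD against the rounding average of the
// reference block and a second prediction: vrhaddq_u8 yields
// (ref + pred + 1) >> 1 per byte, and the same ABD, UDOT accumulation is
// then applied to the averaged block. second_pred has no stride parameter;
// it is read as a contiguous block, so its pointer simply advances past each
// chunk consumed.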
static inline unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
                                                   int src_stride,
                                                   const uint8_t *ref_ptr,
                                                   int ref_stride, int w, int h,
                                                   const uint8_t *second_pred) {
  // Only two accumulators are required for optimal instruction throughput of
  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = h;
  do {
    int j = 0;
    do {
      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;

      s0 = vld1q_u8(src_ptr + j);
      r0 = vld1q_u8(ref_ptr + j);
      p0 = vld1q_u8(second_pred);
      avg0 = vrhaddq_u8(r0, p0);
      diff0 = vabdq_u8(s0, avg0);
      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));

      s1 = vld1q_u8(src_ptr + j + 16);
      r1 = vld1q_u8(ref_ptr + j + 16);
      p1 = vld1q_u8(second_pred + 16);
      avg1 = vrhaddq_u8(r1, p1);
      diff1 = vabdq_u8(s1, avg1);
      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));

      j += 32;
      second_pred += 32;
    } while (j < w);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--i != 0);

  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}

static inline unsigned int sad128xh_avg_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128,
                                 h, second_pred);
}

static inline unsigned int sad64xh_avg_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
                                 h, second_pred);
}

static inline unsigned int sad32xh_avg_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
                                 h, second_pred);
}

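// As in sad16xh_neon_dotprod, two 16-byte rows are handled per iteration;
// second_pred advances by 16 bytes per row since it is stored contiguously.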
static inline unsigned int sad16xh_avg_neon_dotprod(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred) {
  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  int i = h / 2;
  do {
    uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;

    s0 = vld1q_u8(src_ptr);
    r0 = vld1q_u8(ref_ptr);
    p0 = vld1q_u8(second_pred);
    avg0 = vrhaddq_u8(r0, p0);
    diff0 = vabdq_u8(s0, avg0);
    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 16;

    s1 = vld1q_u8(src_ptr);
    r1 = vld1q_u8(ref_ptr);
    p1 = vld1q_u8(second_pred);
    avg1 = vrhaddq_u8(r1, p1);
    diff1 = vabdq_u8(s1, avg1);
    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 16;
  } while (--i != 0);

  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
}

#define SAD_WXH_AVG_NEON_DOTPROD(w, h)                                        \
  unsigned int aom_sad##w##x##h##_avg_neon_dotprod(                           \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred) {                                           \
    return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
                                       second_pred);                          \
  }

SAD_WXH_AVG_NEON_DOTPROD(16, 8)
SAD_WXH_AVG_NEON_DOTPROD(16, 16)
SAD_WXH_AVG_NEON_DOTPROD(16, 32)

SAD_WXH_AVG_NEON_DOTPROD(32, 16)
SAD_WXH_AVG_NEON_DOTPROD(32, 32)
SAD_WXH_AVG_NEON_DOTPROD(32, 64)

SAD_WXH_AVG_NEON_DOTPROD(64, 32)
SAD_WXH_AVG_NEON_DOTPROD(64, 64)
SAD_WXH_AVG_NEON_DOTPROD(64, 128)

SAD_WXH_AVG_NEON_DOTPROD(128, 64)
SAD_WXH_AVG_NEON_DOTPROD(128, 128)

#if !CONFIG_REALTIME_ONLY
SAD_WXH_AVG_NEON_DOTPROD(16, 64)
SAD_WXH_AVG_NEON_DOTPROD(32, 8)
SAD_WXH_AVG_NEON_DOTPROD(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_AVG_NEON_DOTPROD
292