1 /*
2 * Copyright 2020 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
18 #define ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
19
20 #include <array> // std::size
21 #include <type_traits>
22
23 /*
The intrinsics utility library contains helper functions for wide width DSP support.
25 We use templated types to allow testing from scalar to vector values.
26
27 See the Eigen project for general abstracted linear algebra acceleration.
28 http://eigen.tuxfamily.org/
29 */
30
31 // We conditionally include neon optimizations for ARM devices
32 #pragma push_macro("USE_NEON")
33 #undef USE_NEON
34
35 #if defined(__ARM_NEON__) || defined(__aarch64__)
36 #include <arm_neon.h>
37 #define USE_NEON
38 #endif
39
40 namespace android::audio_utils::intrinsics {
41
// A `false` whose value nominally depends on a template parameter.
// Writing static_assert(dependent_false_v<T>) inside a template defers the
// failure until that template (or if-constexpr branch) is instantiated,
// instead of failing as soon as the template is parsed.
// See: https://stackoverflow.com/questions/51523965/template-dependent-false
template <typename /* unused */>
inline constexpr bool dependent_false_v{false};
46
// Aggregate wrapping a fixed-length built-in array so that it can be passed to
// and returned from the Neon template functions below by value.
// Its single member `v` satisfies std::is_array_v<>, which is what those
// functions test to select their element-wise array path.
template <class T, size_t N>
struct internal_array_t {
    T v[N];  // the wrapped elements
};
53
54 /*
55 Generalized template functions for the Neon instruction set.
56
57 See here for some general comments from ARM.
58 https://developer.arm.com/documentation/dht0004/a/neon-support-in-compilation-tools/automatic-vectorization/floating-point-vectorization
59
60 Notes:
61 1) We provide scalar equivalents which are compilable even on non-ARM processors.
62 2) We use recursive calls to decompose array types, e.g. float32x4x4_t -> float32x4_t
63 3) NEON double SIMD acceleration is only available on 64 bit architectures.
64 On Pixel 3XL, NEON double x 2 SIMD is actually slightly slower than the FP unit.
65
66 We create a generic Neon acceleration to be applied to a composite type.
67
For simplicity, the type must be one of the following (applied recursively):
1) a primitive floating point type (float or double).
2) a NEON data type.
3) a struct with exactly one member, which is either
a) an array of types 1-3.
b) a cons-pair struct of 2 possibly different members of types 1-3.
74
75 Examples of possible struct definitions:
76 using alternative_2_t = struct { struct { float a; float b; } s; };
77 using alternative_9_t = struct { struct { float32x4x2_t a; float b; } s; };
78 using alternative_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
79 */
80
// Element-wise sum: returns a + b.
//
// T is float/double, a NEON vector type (only when USE_NEON is defined), or a
// single-member struct wrapping either an array or a two-member struct.
// Composite types are summed member by member through recursive calls.
template<typename T>
static inline T vadd(T a, T b) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return a + b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vadd_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vaddq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vaddq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T sum;
        auto& [out] = sum;  // T wraps exactly one member
        const auto& [lhs] = a;
        const auto& [rhs] = b;
        if constexpr (std::is_array_v<decltype(out)>) {
            // Array member: add lane by lane.
#pragma unroll
            for (size_t lane = 0; lane != std::size(lhs); ++lane) {
                out[lane] = vadd(lhs[lane], rhs[lane]);
            }
        } else /* constexpr */ {
            // Pair member: the two halves may have different types.
            auto& [out_first, out_second] = out;
            const auto& [lhs_first, lhs_second] = lhs;
            const auto& [rhs_first, rhs_second] = rhs;
            out_first = vadd(lhs_first, rhs_first);
            out_second = vadd(lhs_second, rhs_second);
        }
        return sum;
    }
}
119
// Broadcast: returns a T in which every element is set to f.
//
// T must be given explicitly. It is float/double, a NEON vector type (only
// when USE_NEON is defined), or a single-member struct wrapping either an
// array or a two-member struct; composite types are filled recursively.
template<typename T, typename F>
static inline T vdupn(F f) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vdup_n_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vdupq_n_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vdupq_n_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T filled;
        auto& [out] = filled;  // T wraps exactly one member
        if constexpr (std::is_array_v<decltype(out)>) {
            // Array member: every lane has the same type, so name it once.
            using lane_type = std::decay_t<decltype(out[0])>;
#pragma unroll
            for (size_t lane = 0; lane != std::size(out); ++lane) {
                out[lane] = vdupn<lane_type>(f);
            }
        } else /* constexpr */ {
            // Pair member: broadcast into each half using that half's own type.
            auto& [first, second] = out;
            first = vdupn<std::decay_t<decltype(first)>>(f);
            second = vdupn<std::decay_t<decltype(second)>>(f);
        }
        return filled;
    }
}
156
// Load: reads consecutive values of type F starting at f and returns them as a T.
//
// T must be given explicitly. It is float/double, a NEON vector type (only
// when USE_NEON is defined), or a single-member struct wrapping either an
// array or a two-member struct. For composite types the read position advances
// by sizeof(member) / sizeof(F) values after each member is loaded.
template<typename T, typename F>
static inline T vld1(const F *f) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return *f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vld1_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vld1q_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vld1q_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T loaded;
        auto& [out] = loaded;  // T wraps exactly one member
        if constexpr (std::is_array_v<decltype(out)>) {
            using lane_type = std::decay_t<decltype(out[0])>;
            // Number of F values consumed by one array element.
            constexpr size_t kStride = sizeof(lane_type) / sizeof(F);
            const F *cursor = f;
#pragma unroll
            for (auto& lane : out) {
                lane = vld1<lane_type>(cursor);
                cursor += kStride;
            }
        } else /* constexpr */ {
            auto& [first, second] = out;
            using first_type = std::decay_t<decltype(first)>;
            using second_type = std::decay_t<decltype(second)>;
            first = vld1<first_type>(f);
            // The second half starts right after the values taken by the first.
            second = vld1<second_type>(f + sizeof(first) / sizeof(F));
        }
        return loaded;
    }
}
197
198 /**
199 * Returns c as follows:
200 * c_i = a_i * b_i if a and b are the same vector type or
201 * c_i = a_i * b if a is a vector and b is scalar or
202 * c_i = a * b_i if a is scalar and b is a vector.
203 */
204 template<typename T, typename S, typename F>
vmla(T a,S b,F c)205 static inline T vmla(T a, S b, F c) {
206 // Both types T and S are non-primitive and they are not equal. T == S handled below.
207 (void) a;
208 (void) b;
209 (void) c;
210 static_assert(dependent_false_v<T>);
211 }
212
213 template<typename T, typename F>
vmla(T a,T b,F c)214 static inline T vmla(T a, T b, F c) {
215 if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
216 if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
217 return a + b * c;
218 } else {
219 static_assert(dependent_false_v<T>);
220 }
221 } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
222 // handle the lane variant
223 #ifdef USE_NEON
224 if constexpr (std::is_same_v<T, float32x2_t>) {
225 return vmla_n_f32(a, b, c);
226 } else if constexpr (std::is_same_v<T, float32x4_t>) {
227 return vmlaq_n_f32(a, b,c);
228 #if defined(__aarch64__)
229 } else if constexpr (std::is_same_v<T, float64x2_t>) {
230 return vmlaq_n_f64(a, b);
231 #endif
232 } else
233 #endif // USE_NEON
234 {
235 T ret;
236 auto &[retval] = ret; // single-member struct
237 const auto &[aval] = a;
238 const auto &[bval] = b;
239 if constexpr (std::is_array_v<decltype(retval)>) {
240 #pragma unroll
241 for (size_t i = 0; i < std::size(aval); ++i) {
242 retval[i] = vmla(aval[i], bval[i], c);
243 }
244 return ret;
245 } else /* constexpr */ {
246 auto &[r1, r2] = retval;
247 const auto &[a1, a2] = aval;
248 const auto &[b1, b2] = bval;
249 r1 = vmla(a1, b1, c);
250 r2 = vmla(a2, b2, c);
251 return ret;
252 }
253 }
254 } else {
255 // Both types T and F are non-primitive and they are not equal.
256 static_assert(dependent_false_v<T>);
257 }
258 }
259
// Multiply-accumulate where the middle operand b has a different type than
// a and c (intended for a scalar b): returns a + b * c.
template<typename T, typename F>
static inline T vmla(T a, F b, T c) {
    // Swap the multiplicands so the differently-typed operand comes last,
    // then let the (T, T, F) overload do the work.
    const T accumulated = vmla(a, c, b);
    return accumulated;
}
264
// Multiply-accumulate: returns a + b * c, element by element.
//
// All three operands share the type T: float/double, a NEON vector type (only
// when USE_NEON is defined), or a single-member struct wrapping either an
// array or a two-member struct. Composite types are processed member by
// member through recursive calls.
template<typename T>
static inline T vmla(T a, T b, T c) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return a + b * c;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmla_f32(a, b, c);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmlaq_f32(a, b, c);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmlaq_f64(a, b, c);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T acc;
        auto& [out] = acc;  // T wraps exactly one member
        const auto& [addend] = a;
        const auto& [lhs] = b;
        const auto& [rhs] = c;
        if constexpr (std::is_array_v<decltype(out)>) {
            // Array member: accumulate lane by lane.
#pragma unroll
            for (size_t lane = 0; lane != std::size(addend); ++lane) {
                out[lane] = vmla(addend[lane], lhs[lane], rhs[lane]);
            }
        } else /* constexpr */ {
            // Pair member: the two halves may have different types.
            auto& [out_first, out_second] = out;
            const auto& [addend_first, addend_second] = addend;
            const auto& [lhs_first, lhs_second] = lhs;
            const auto& [rhs_first, rhs_second] = rhs;
            out_first = vmla(addend_first, lhs_first, rhs_first);
            out_second = vmla(addend_second, lhs_second, rhs_second);
        }
        return acc;
    }
}
305
306 /**
307 * Returns c as follows:
308 * c_i = a_i * b_i if a and b are the same vector type or
309 * c_i = a_i * b if a is a vector and b is scalar or
310 * c_i = a * b_i if a is scalar and b is a vector.
311 */
312 template<typename T, typename F>
vmul(T a,F b)313 static inline auto vmul(T a, F b) {
314 if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
315 if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
316 return a * b;
317 } else /* constexpr */ {
318 return vmul(b, a); // we prefer T to be the vector/struct form.
319 }
320 } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
321 // handle the lane variant
322 #ifdef USE_NEON
323 if constexpr (std::is_same_v<T, float32x2_t>) {
324 return vmul_n_f32(a, b);
325 } else if constexpr (std::is_same_v<T, float32x4_t>) {
326 return vmulq_n_f32(a, b);
327 #if defined(__aarch64__)
328 } else if constexpr (std::is_same_v<T, float64x2_t>) {
329 return vmulq_n_f64(a, b);
330 #endif
331 } else
332 #endif // USE_NEON
333 {
334 T ret;
335 auto &[retval] = ret; // single-member struct
336 const auto &[aval] = a;
337 if constexpr (std::is_array_v<decltype(retval)>) {
338 #pragma unroll
339 for (size_t i = 0; i < std::size(aval); ++i) {
340 retval[i] = vmul(aval[i], b);
341 }
342 return ret;
343 } else /* constexpr */ {
344 auto &[r1, r2] = retval;
345 const auto &[a1, a2] = aval;
346 r1 = vmul(a1, b);
347 r2 = vmul(a2, b);
348 return ret;
349 }
350 }
351 } else {
352 // Both types T and F are non-primitive and they are not equal.
353 static_assert(dependent_false_v<T>);
354 }
355 }
356
// Element-wise product of two operands of the same type: returns a * b.
//
// T is float/double, a NEON vector type (only when USE_NEON is defined), or a
// single-member struct wrapping either an array or a two-member struct.
// Composite types are multiplied member by member through recursive calls.
template<typename T>
static inline T vmul(T a, T b) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return a * b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmul_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmulq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmulq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T product;
        auto& [out] = product;  // T wraps exactly one member
        const auto& [lhs] = a;
        const auto& [rhs] = b;
        if constexpr (std::is_array_v<decltype(out)>) {
            // Array member: multiply lane by lane.
#pragma unroll
            for (size_t lane = 0; lane != std::size(lhs); ++lane) {
                out[lane] = vmul(lhs[lane], rhs[lane]);
            }
        } else /* constexpr */ {
            // Pair member: the two halves may have different types.
            auto& [out_first, out_second] = out;
            const auto& [lhs_first, lhs_second] = lhs;
            const auto& [rhs_first, rhs_second] = rhs;
            out_first = vmul(lhs_first, rhs_first);
            out_second = vmul(lhs_second, rhs_second);
        }
        return product;
    }
}
394
// Element-wise negation: returns -f.
//
// T is float/double, a NEON vector type (only when USE_NEON is defined), or a
// single-member struct wrapping either an array or a two-member struct.
// Composite types are negated member by member through recursive calls.
template<typename T>
static inline T vneg(T f) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return -f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vneg_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vnegq_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vnegq_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T negated;
        auto& [out] = negated;  // T wraps exactly one member
        const auto& [in] = f;
        if constexpr (std::is_array_v<decltype(out)>) {
            // Array member: negate lane by lane.
#pragma unroll
            for (size_t lane = 0; lane != std::size(in); ++lane) {
                out[lane] = vneg(in[lane]);
            }
        } else /* constexpr */ {
            // Pair member: the two halves may have different types.
            auto& [out_first, out_second] = out;
            const auto& [in_first, in_second] = in;
            out_first = vneg(in_first);
            out_second = vneg(in_second);
        }
        return negated;
    }
}
431
// Store: writes the elements of a to consecutive values of type F starting at f.
//
// T is float/double, a NEON vector type (only when USE_NEON is defined), or a
// single-member struct wrapping either an array or a two-member struct. For
// composite types the write position advances by sizeof(member) / sizeof(F)
// values after each member is stored.
template<typename T, typename F>
static inline void vst1(F *f, T a) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        *f = a;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vst1_f32(f, a);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vst1q_f32(f, a);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vst1q_f64(f, a);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        const auto& [src] = a;  // T wraps exactly one member
        if constexpr (std::is_array_v<decltype(src)>) {
            using lane_type = std::decay_t<decltype(src[0])>;
            // Number of F values produced by one array element.
            constexpr size_t kStride = sizeof(lane_type) / sizeof(F);
            F *cursor = f;
#pragma unroll
            for (const auto& lane : src) {
                vst1(cursor, lane);
                cursor += kStride;
            }
        } else /* constexpr */ {
            const auto& [first, second] = src;
            vst1(f, first);
            // The second half goes right after the values written by the first.
            vst1(f + sizeof(std::decay_t<decltype(first)>) / sizeof(F), second);
        }
    }
}
466
// Element-wise difference: returns a - b.
//
// T is float/double, a NEON vector type (only when USE_NEON is defined), or a
// single-member struct wrapping either an array or a two-member struct.
// Composite types are subtracted member by member through recursive calls.
template<typename T>
static inline T vsub(T a, T b) {
    constexpr bool kIsScalar = std::is_same_v<T, float> || std::is_same_v<T, double>;
    if constexpr (kIsScalar) {
        return a - b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vsub_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vsubq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vsubq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T diff;
        auto& [out] = diff;  // T wraps exactly one member
        const auto& [lhs] = a;
        const auto& [rhs] = b;
        if constexpr (std::is_array_v<decltype(out)>) {
            // Array member: subtract lane by lane.
#pragma unroll
            for (size_t lane = 0; lane != std::size(lhs); ++lane) {
                out[lane] = vsub(lhs[lane], rhs[lane]);
            }
        } else /* constexpr */ {
            // Pair member: the two halves may have different types.
            auto& [out_first, out_second] = out;
            const auto& [lhs_first, lhs_second] = lhs;
            const auto& [rhs_first, rhs_second] = rhs;
            out_first = vsub(lhs_first, rhs_first);
            out_second = vsub(lhs_second, rhs_second);
        }
        return diff;
    }
}
505
506 } // namespace android::audio_utils::intrinsics
507
508 #pragma pop_macro("USE_NEON")
509
510 #endif // !ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
511