• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2020 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
18 #define ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
19 
20 #include <array>  // std::size
21 #include <type_traits>
22 
23 /*
  The intrinsics utility library contains helper functions for wide-width DSP support.
25   We use templated types to allow testing from scalar to vector values.
26 
27   See the Eigen project for general abstracted linear algebra acceleration.
28   http://eigen.tuxfamily.org/
29 */
30 
31 // We conditionally include neon optimizations for ARM devices
32 #pragma push_macro("USE_NEON")
33 #undef USE_NEON
34 
35 #if defined(__ARM_NEON__) || defined(__aarch64__)
36 #include <arm_neon.h>
37 #define USE_NEON
38 #endif
39 
40 namespace android::audio_utils::intrinsics {
41 
// A constant that is always false but nominally depends on T.
// Writing static_assert(dependent_false_v<T>) inside a template defers the
// failure until that code is actually instantiated, instead of failing early.
// See: https://stackoverflow.com/questions/51523965/template-dependent-false
template <class T>
inline constexpr bool dependent_false_v{false};
46 
// Aggregate that wraps a fixed-size array of N elements of type T as its only
// member, making it usable with the Neon template functions below.
// The member `v` (not the struct itself) is what satisfies std::is_array_v<>.
template <class T, size_t N>
struct internal_array_t {
    T v[N];
};
53 
54 /*
55   Generalized template functions for the Neon instruction set.
56 
57   See here for some general comments from ARM.
58   https://developer.arm.com/documentation/dht0004/a/neon-support-in-compilation-tools/automatic-vectorization/floating-point-vectorization
59 
60   Notes:
61   1) We provide scalar equivalents which are compilable even on non-ARM processors.
62   2) We use recursive calls to decompose array types, e.g. float32x4x4_t -> float32x4_t
63   3) NEON double SIMD acceleration is only available on 64 bit architectures.
64      On Pixel 3XL, NEON double x 2 SIMD is actually slightly slower than the FP unit.
65 
66   We create a generic Neon acceleration to be applied to a composite type.
67 
  The type must satisfy one of the following compositional rules:
      1) a primitive floating point type (float or double).
      2) a NEON data type.
      3) a struct with one member, which is either
           a) an array of types 1-3.
           b) a cons-pair struct of 2 possibly different members of types 1-3.
74 
75   Examples of possible struct definitions:
76   using alternative_2_t = struct { struct { float a; float b; } s; };
77   using alternative_9_t = struct { struct { float32x4x2_t a; float b; } s; };
78   using alternative_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
79 */
80 
// Element-wise sum: returns a + b.
//
// T may be float or double, a supported NEON vector type (only when USE_NEON
// is defined), or a single-member struct whose member is either an array or a
// two-member struct; struct forms are added recursively, member by member.
template<typename T>
static inline T vadd(T a, T b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a + b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vadd_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vaddq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vaddq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        // Struct form: unwrap the single member of the result and of each operand.
        T sum;
        auto &[out] = sum;
        const auto &[lhs] = a;
        const auto &[rhs] = b;
        if constexpr (std::is_array_v<decltype(out)>) {
            // Array member: recurse on each pair of elements.
            constexpr size_t kCount = std::extent_v<decltype(out)>;
#pragma unroll
            for (size_t idx = 0; idx < kCount; ++idx) {
                out[idx] = vadd(lhs[idx], rhs[idx]);
            }
        } else /* constexpr */ {
            // Cons-pair member: recurse on each half (the halves may differ in type).
            auto &[out1, out2] = out;
            const auto &[lhs1, lhs2] = lhs;
            const auto &[rhs1, rhs2] = rhs;
            out1 = vadd(lhs1, rhs1);
            out2 = vadd(lhs2, rhs2);
        }
        return sum;
    }
}
119 
// Broadcast: returns a T in which every element is set from the scalar f.
//
// T must be given explicitly; F is deduced from the argument. For float or
// double T the value is returned after implicit conversion to T. Struct forms
// (single member holding an array or a two-member struct) are filled
// recursively, one member at a time.
template<typename T, typename F>
static inline T vdupn(F f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vdup_n_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vdupq_n_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vdupq_n_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T filled;
        auto &[out] = filled;  // the struct's only member
        if constexpr (std::is_array_v<decltype(out)>) {
            // Array member: broadcast into every element.
            using elem_t = std::decay_t<decltype(out[0])>;
            constexpr size_t kCount = std::extent_v<decltype(out)>;
#pragma unroll
            for (size_t idx = 0; idx < kCount; ++idx) {
                out[idx] = vdupn<elem_t>(f);
            }
        } else /* constexpr */ {
            // Cons-pair member: each half is broadcast with its own type.
            auto &[first, second] = out;
            first = vdupn<std::decay_t<decltype(first)>>(f);
            second = vdupn<std::decay_t<decltype(second)>>(f);
        }
        return filled;
    }
}
156 
// Load: reads a T from consecutive F values starting at f.
//
// T must be given explicitly; F is deduced from the pointer. For struct forms
// the members are loaded in order, and each member consumes
// sizeof(member) / sizeof(F) values from the input.
template<typename T, typename F>
static inline T vld1(const F *f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return *f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vld1_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vld1q_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vld1q_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T loaded;
        auto &[out] = loaded;  // the struct's only member
        if constexpr (std::is_array_v<decltype(out)>) {
            // Array member: element idx starts kStride * idx values into the input.
            using elem_t = std::decay_t<decltype(out[0])>;
            constexpr size_t kStride = sizeof(elem_t) / sizeof(F);
            constexpr size_t kCount = std::extent_v<decltype(out)>;
#pragma unroll
            for (size_t idx = 0; idx < kCount; ++idx) {
                out[idx] = vld1<elem_t>(f + idx * kStride);
            }
        } else /* constexpr */ {
            // Cons-pair member: the second half follows directly after the first.
            auto &[first, second] = out;
            using first_t = std::decay_t<decltype(first)>;
            using second_t = std::decay_t<decltype(second)>;
            first = vld1<first_t>(f);
            second = vld1<second_t>(f + sizeof(first_t) / sizeof(F));
        }
        return loaded;
    }
}
197 
198 /**
199  * Returns c as follows:
200  * c_i = a_i * b_i if a and b are the same vector type or
201  * c_i = a_i * b if a is a vector and b is scalar or
202  * c_i = a * b_i if a is scalar and b is a vector.
203  */
204 template<typename T, typename S, typename F>
vmla(T a,S b,F c)205 static inline T vmla(T a, S b, F c) {
206     // Both types T and S are non-primitive and they are not equal.  T == S handled below.
207     (void) a;
208     (void) b;
209     (void) c;
210     static_assert(dependent_false_v<T>);
211 }
212 
213 template<typename T, typename F>
vmla(T a,T b,F c)214 static inline T vmla(T a, T b, F c) {
215     if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
216         if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
217             return a + b * c;
218         } else {
219             static_assert(dependent_false_v<T>);
220         }
221     } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
222         // handle the lane variant
223 #ifdef USE_NEON
224         if constexpr (std::is_same_v<T, float32x2_t>) {
225             return vmla_n_f32(a, b, c);
226         } else if constexpr (std::is_same_v<T, float32x4_t>) {
227             return vmlaq_n_f32(a, b,c);
228 #if defined(__aarch64__)
229         } else if constexpr (std::is_same_v<T, float64x2_t>) {
230             return vmlaq_n_f64(a, b);
231 #endif
232         } else
233 #endif // USE_NEON
234         {
235         T ret;
236         auto &[retval] = ret;  // single-member struct
237         const auto &[aval] = a;
238         const auto &[bval] = b;
239         if constexpr (std::is_array_v<decltype(retval)>) {
240 #pragma unroll
241             for (size_t i = 0; i < std::size(aval); ++i) {
242                 retval[i] = vmla(aval[i], bval[i], c);
243             }
244             return ret;
245         } else /* constexpr */ {
246              auto &[r1, r2] = retval;
247              const auto &[a1, a2] = aval;
248              const auto &[b1, b2] = bval;
249              r1 = vmla(a1, b1, c);
250              r2 = vmla(a2, b2, c);
251              return ret;
252         }
253         }
254     } else {
255         // Both types T and F are non-primitive and they are not equal.
256         static_assert(dependent_false_v<T>);
257     }
258 }
259 
// Multiply-accumulate with the differently-typed operand in the middle:
// returns the result of vmla(a, c, b), i.e. the last two operands are swapped
// before forwarding. Since b and c are the two factors of a + b * c, the swap
// only reorders the product.
template<typename T, typename F>
static inline T vmla(T a, F b, T c) {
    return vmla(a, c, b);
}
264 
// Multiply-accumulate: returns a + b * c, element-wise, for three operands of
// the same type.
//
// T may be float or double, a supported NEON vector type (only when USE_NEON
// is defined), or a single-member struct holding an array or a two-member
// struct; struct forms are processed recursively, member by member.
template<typename T>
static inline T vmla(T a, T b, T c) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a + b * c;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmla_f32(a, b, c);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmlaq_f32(a, b, c);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmlaq_f64(a, b, c);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        // Struct form: unwrap the single member of the result and of each operand.
        T acc;
        auto &[out] = acc;
        const auto &[addend] = a;
        const auto &[lhs] = b;
        const auto &[rhs] = c;
        if constexpr (std::is_array_v<decltype(out)>) {
            // Array member: recurse on each triple of elements.
            constexpr size_t kCount = std::extent_v<decltype(out)>;
#pragma unroll
            for (size_t idx = 0; idx < kCount; ++idx) {
                out[idx] = vmla(addend[idx], lhs[idx], rhs[idx]);
            }
        } else /* constexpr */ {
            // Cons-pair member: recurse on each half (the halves may differ in type).
            auto &[out1, out2] = out;
            const auto &[addend1, addend2] = addend;
            const auto &[lhs1, lhs2] = lhs;
            const auto &[rhs1, rhs2] = rhs;
            out1 = vmla(addend1, lhs1, rhs1);
            out2 = vmla(addend2, lhs2, rhs2);
        }
        return acc;
    }
}
305 
306 /**
307  * Returns c as follows:
308  * c_i = a_i * b_i if a and b are the same vector type or
309  * c_i = a_i * b if a is a vector and b is scalar or
310  * c_i = a * b_i if a is scalar and b is a vector.
311  */
312 template<typename T, typename F>
vmul(T a,F b)313 static inline auto vmul(T a, F b) {
314     if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
315         if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
316             return a * b;
317         } else /* constexpr */ {
318             return vmul(b, a); // we prefer T to be the vector/struct form.
319         }
320     } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
321         // handle the lane variant
322 #ifdef USE_NEON
323         if constexpr (std::is_same_v<T, float32x2_t>) {
324             return vmul_n_f32(a, b);
325         } else if constexpr (std::is_same_v<T, float32x4_t>) {
326             return vmulq_n_f32(a, b);
327 #if defined(__aarch64__)
328         } else if constexpr (std::is_same_v<T, float64x2_t>) {
329             return vmulq_n_f64(a, b);
330 #endif
331         } else
332 #endif // USE_NEON
333         {
334         T ret;
335         auto &[retval] = ret;  // single-member struct
336         const auto &[aval] = a;
337         if constexpr (std::is_array_v<decltype(retval)>) {
338 #pragma unroll
339             for (size_t i = 0; i < std::size(aval); ++i) {
340                 retval[i] = vmul(aval[i], b);
341             }
342             return ret;
343         } else /* constexpr */ {
344              auto &[r1, r2] = retval;
345              const auto &[a1, a2] = aval;
346              r1 = vmul(a1, b);
347              r2 = vmul(a2, b);
348              return ret;
349         }
350         }
351     } else {
352         // Both types T and F are non-primitive and they are not equal.
353         static_assert(dependent_false_v<T>);
354     }
355 }
356 
// Element-wise product: returns a * b for two operands of the same type.
//
// T may be float or double, a supported NEON vector type (only when USE_NEON
// is defined), or a single-member struct holding an array or a two-member
// struct; struct forms are multiplied recursively, member by member.
template<typename T>
static inline T vmul(T a, T b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a * b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vmul_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmulq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmulq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        // Struct form: unwrap the single member of the result and of each operand.
        T product;
        auto &[out] = product;
        const auto &[lhs] = a;
        const auto &[rhs] = b;
        if constexpr (std::is_array_v<decltype(out)>) {
            // Array member: recurse on each pair of elements.
            constexpr size_t kCount = std::extent_v<decltype(out)>;
#pragma unroll
            for (size_t idx = 0; idx < kCount; ++idx) {
                out[idx] = vmul(lhs[idx], rhs[idx]);
            }
        } else /* constexpr */ {
            // Cons-pair member: recurse on each half (the halves may differ in type).
            auto &[out1, out2] = out;
            const auto &[lhs1, lhs2] = lhs;
            const auto &[rhs1, rhs2] = rhs;
            out1 = vmul(lhs1, rhs1);
            out2 = vmul(lhs2, rhs2);
        }
        return product;
    }
}
394 
// Element-wise negation: returns -f.
//
// T may be float or double, a supported NEON vector type (only when USE_NEON
// is defined), or a single-member struct holding an array or a two-member
// struct; struct forms are negated recursively, member by member.
template<typename T>
static inline T vneg(T f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return -f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vneg_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vnegq_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vnegq_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        // Struct form: unwrap the single member of the result and of the operand.
        T negated;
        auto &[out] = negated;
        const auto &[in] = f;
        if constexpr (std::is_array_v<decltype(out)>) {
            // Array member: negate each element.
            constexpr size_t kCount = std::extent_v<decltype(out)>;
#pragma unroll
            for (size_t idx = 0; idx < kCount; ++idx) {
                out[idx] = vneg(in[idx]);
            }
        } else /* constexpr */ {
            // Cons-pair member: negate each half (the halves may differ in type).
            auto &[out1, out2] = out;
            const auto &[in1, in2] = in;
            out1 = vneg(in1);
            out2 = vneg(in2);
        }
        return negated;
    }
}
431 
// Store: writes the elements of a to consecutive F values starting at f.
//
// For struct forms the members are stored in order, and each member occupies
// sizeof(member) / sizeof(F) values of the output.
template<typename T, typename F>
static inline void vst1(F *f, T a) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        *f = a;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        vst1_f32(f, a);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        vst1q_f32(f, a);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        vst1q_f64(f, a);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        const auto &[src] = a;  // the struct's only member
        if constexpr (std::is_array_v<decltype(src)>) {
            // Array member: element idx lands kStride * idx values into the output.
            using elem_t = std::decay_t<decltype(src[0])>;
            constexpr size_t kStride = sizeof(elem_t) / sizeof(F);
            constexpr size_t kCount = std::extent_v<decltype(src)>;
#pragma unroll
            for (size_t idx = 0; idx < kCount; ++idx) {
                vst1(f + idx * kStride, src[idx]);
            }
        } else /* constexpr */ {
            // Cons-pair member: the second half is stored directly after the first.
            const auto &[first, second] = src;
            using first_t = std::decay_t<decltype(first)>;
            vst1(f, first);
            vst1(f + sizeof(first_t) / sizeof(F), second);
        }
    }
}
466 
// Element-wise difference: returns a - b.
//
// T may be float or double, a supported NEON vector type (only when USE_NEON
// is defined), or a single-member struct holding an array or a two-member
// struct; struct forms are subtracted recursively, member by member.
template<typename T>
static inline T vsub(T a, T b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a - b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vsub_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vsubq_f32(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vsubq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        // Struct form: unwrap the single member of the result and of each operand.
        T diff;
        auto &[out] = diff;
        const auto &[lhs] = a;
        const auto &[rhs] = b;
        if constexpr (std::is_array_v<decltype(out)>) {
            // Array member: recurse on each pair of elements.
            constexpr size_t kCount = std::extent_v<decltype(out)>;
#pragma unroll
            for (size_t idx = 0; idx < kCount; ++idx) {
                out[idx] = vsub(lhs[idx], rhs[idx]);
            }
        } else /* constexpr */ {
            // Cons-pair member: recurse on each half (the halves may differ in type).
            auto &[out1, out2] = out;
            const auto &[lhs1, lhs2] = lhs;
            const auto &[rhs1, rhs2] = rhs;
            out1 = vsub(lhs1, rhs1);
            out2 = vsub(lhs2, rhs2);
        }
        return diff;
    }
}
505 
506 } // namespace android::audio_utils::intrinsics
507 
508 #pragma pop_macro("USE_NEON")
509 
510 #endif // !ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
511