1 /*
2 * Copyright 2020 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
18 #define ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
19
#include <algorithm>         // std::min
#include <array>             // std::size
#include <cstddef>           // size_t
#include <cstring>           // memcpy
#include <initializer_list>  // std::initializer_list
#include <tuple>             // std::apply, std::tie
#include <type_traits>
#include <utility>           // std::index_sequence, std::make_index_sequence
#include "template_utils.h"
23
24 /*
25 The intrinsics utility library contain helper functions for wide width DSP support.
26 We use templated types to allow testing from scalar to vector values.
27
28 See the Eigen project for general abstracted linear algebra acceleration.
29 http://eigen.tuxfamily.org/
30 */
31
32 // We conditionally include neon optimizations for ARM devices
33 #pragma push_macro("USE_NEON")
34 #undef USE_NEON
35
36 #if defined(__ARM_NEON__) || defined(__aarch64__)
37 #include <arm_neon.h>
38 #define USE_NEON
39 #endif
40
41 // We use macros to hide intrinsic methods that do not exist for
42 // incompatible target architectures; otherwise we have a
43 // "use of undeclared identifier" compilation error when
44 // we invoke our templated method.
45 //
46 // For example, we pass in DN_(vadd_f32) into implement_arg2().
47 // For ARM compilation, this works as expected, vadd_f32 is used.
48 // For x64 compilation, the macro converts vadd_f32 to a nullptr
49 // (so there is no undeclared identifier) and the calling site is safely
50 // ifdef'ed out in implement_arg2() for non ARM architectures.
51 //
52 // DN_(x) replaces x with nullptr for non-ARM arch
53 // DN64_(x) replaces x with nullptr for non-ARM64 arch
54 #pragma push_macro("DN_")
55 #pragma push_macro("DN64_")
56 #undef DN_
57 #undef DN64_
58
59 #ifdef USE_NEON
60 #if defined(__aarch64__)
61 #define DN_(x) x
62 #define DN64_(x) x
63 #else
64 #define DN_(x) x
65 #define DN64_(x) nullptr
66 #endif
67 #else
68 #define DN_(x) nullptr
69 #define DN64_(x) nullptr
70 #endif // USE_NEON
71
72 namespace android::audio_utils::intrinsics {
73
// For static assert(false) we need a template version to avoid early failure.
// A plain static_assert(false) in a discarded if-constexpr branch is
// ill-formed before C++23 (P2593); dependent_false_v<T> defers evaluation
// until the enclosing template is actually instantiated.
// See: https://stackoverflow.com/questions/51523965/template-dependent-false
template <typename T>
inline constexpr bool dependent_false_v = false;
78
// Detect if the value is directly addressable as an array.
// This is more advanced than std::is_array and works with neon intrinsics:
// neon vector types (e.g. float32x4_t) are indexable with operator[] even
// though std::is_array_v is false for them.
template<typename T>
concept is_array_like = requires(T a) {
    a[0]; // can index first element
};
85
// Detect whether callable F can be invoked with two arguments of the same
// type T, i.e. f(a, a) is well-formed.
template<typename F, typename T>
concept takes_identical_parameter_pair_v = requires(F f, T a) {
    f(a, a);
};
90
91 /**
92 * Applies a functional or a constant to an intrinsic struct.
93 *
94 * The vapply method has no return value, but can modify an input intrinsic struct
95 * through element-wise application of a functional.
96 * Compare the behavior with veval which returns a struct result.
97 *
98 * Using vector terminology:
99 * if f is a constant: v[i] = f;
100 * if f is a void method that takes an element value: f(v[i]);
101 * if f returns an element value but takes no arg: v[i] = f();
102 * if f returns an element value but takes an element value: v[i] = f(v[i]);
103 */
104 template <typename V, typename F>
vapply(const F & f,V & v)105 constexpr void vapply(const F& f, V& v) {
106 if constexpr (std::is_same_v<V, float> || std::is_same_v<V, double>) {
107 using E = std::decay_t<decltype(v)>;
108 if constexpr (std::is_invocable_r_v<void, F, E>) {
109 f(v);
110 } else if constexpr (std::is_invocable_r_v<E, F, E>) {
111 v = f(v);
112 } else if constexpr (std::is_invocable_r_v<E, F>) {
113 v = f();
114 } else /* constexpr */ {
115 v = f;
116 }
117 } else if constexpr (is_array_like<V>) {
118 // this vector access within a neon object prevents constexpr.
119 using E = std::decay_t<decltype(v[0])>;
120 #pragma unroll
121 for (size_t i = 0; i < sizeof(v) / sizeof(v[0]); ++i) {
122 if constexpr (std::is_invocable_r_v<void, F, E>) {
123 f(v[i]);
124 } else if constexpr (std::is_invocable_r_v<E, F, E>) {
125 v[i] = f(v[i]);
126 } else if constexpr (std::is_invocable_r_v<E, F>) {
127 v[i] = f();
128 } else /* constexpr */ {
129 v[i] = f;
130 }
131 }
132 } else /* constexpr */ {
133 auto& [vv] = v;
134 // for constexpr purposes, non-const references can't bind to array elements.
135 using VT = decltype(vv);
136 // automatically generated from tests/generate_constexpr_constructible.cpp
137 if constexpr (is_braces_constructible<VT,
138 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
139 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
140 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
141 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type
142 >()) {
143 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
144 v9, v10, v11, v12, v13, v14, v15, v16,
145 v17, v18, v19, v20, v21, v22, v23, v24,
146 v25, v26, v27, v28, v29, v30, v31, v32] = vv;
147 vapply(f, v1);
148 vapply(f, v2);
149 vapply(f, v3);
150 vapply(f, v4);
151 vapply(f, v5);
152 vapply(f, v6);
153 vapply(f, v7);
154 vapply(f, v8);
155 vapply(f, v9);
156 vapply(f, v10);
157 vapply(f, v11);
158 vapply(f, v12);
159 vapply(f, v13);
160 vapply(f, v14);
161 vapply(f, v15);
162 vapply(f, v16);
163 vapply(f, v17);
164 vapply(f, v18);
165 vapply(f, v19);
166 vapply(f, v20);
167 vapply(f, v21);
168 vapply(f, v22);
169 vapply(f, v23);
170 vapply(f, v24);
171 vapply(f, v25);
172 vapply(f, v26);
173 vapply(f, v27);
174 vapply(f, v28);
175 vapply(f, v29);
176 vapply(f, v30);
177 vapply(f, v31);
178 vapply(f, v32);
179 } else if constexpr (is_braces_constructible<VT,
180 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
181 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
182 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
183 any_type, any_type, any_type, any_type, any_type, any_type, any_type>()) {
184 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
185 v9, v10, v11, v12, v13, v14, v15, v16,
186 v17, v18, v19, v20, v21, v22, v23, v24,
187 v25, v26, v27, v28, v29, v30, v31] = vv;
188 vapply(f, v1);
189 vapply(f, v2);
190 vapply(f, v3);
191 vapply(f, v4);
192 vapply(f, v5);
193 vapply(f, v6);
194 vapply(f, v7);
195 vapply(f, v8);
196 vapply(f, v9);
197 vapply(f, v10);
198 vapply(f, v11);
199 vapply(f, v12);
200 vapply(f, v13);
201 vapply(f, v14);
202 vapply(f, v15);
203 vapply(f, v16);
204 vapply(f, v17);
205 vapply(f, v18);
206 vapply(f, v19);
207 vapply(f, v20);
208 vapply(f, v21);
209 vapply(f, v22);
210 vapply(f, v23);
211 vapply(f, v24);
212 vapply(f, v25);
213 vapply(f, v26);
214 vapply(f, v27);
215 vapply(f, v28);
216 vapply(f, v29);
217 vapply(f, v30);
218 vapply(f, v31);
219 } else if constexpr (is_braces_constructible<VT,
220 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
221 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
222 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
223 any_type, any_type, any_type, any_type, any_type, any_type>()) {
224 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
225 v9, v10, v11, v12, v13, v14, v15, v16,
226 v17, v18, v19, v20, v21, v22, v23, v24,
227 v25, v26, v27, v28, v29, v30] = vv;
228 vapply(f, v1);
229 vapply(f, v2);
230 vapply(f, v3);
231 vapply(f, v4);
232 vapply(f, v5);
233 vapply(f, v6);
234 vapply(f, v7);
235 vapply(f, v8);
236 vapply(f, v9);
237 vapply(f, v10);
238 vapply(f, v11);
239 vapply(f, v12);
240 vapply(f, v13);
241 vapply(f, v14);
242 vapply(f, v15);
243 vapply(f, v16);
244 vapply(f, v17);
245 vapply(f, v18);
246 vapply(f, v19);
247 vapply(f, v20);
248 vapply(f, v21);
249 vapply(f, v22);
250 vapply(f, v23);
251 vapply(f, v24);
252 vapply(f, v25);
253 vapply(f, v26);
254 vapply(f, v27);
255 vapply(f, v28);
256 vapply(f, v29);
257 vapply(f, v30);
258 } else if constexpr (is_braces_constructible<VT,
259 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
260 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
261 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
262 any_type, any_type, any_type, any_type, any_type>()) {
263 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
264 v9, v10, v11, v12, v13, v14, v15, v16,
265 v17, v18, v19, v20, v21, v22, v23, v24,
266 v25, v26, v27, v28, v29] = vv;
267 vapply(f, v1);
268 vapply(f, v2);
269 vapply(f, v3);
270 vapply(f, v4);
271 vapply(f, v5);
272 vapply(f, v6);
273 vapply(f, v7);
274 vapply(f, v8);
275 vapply(f, v9);
276 vapply(f, v10);
277 vapply(f, v11);
278 vapply(f, v12);
279 vapply(f, v13);
280 vapply(f, v14);
281 vapply(f, v15);
282 vapply(f, v16);
283 vapply(f, v17);
284 vapply(f, v18);
285 vapply(f, v19);
286 vapply(f, v20);
287 vapply(f, v21);
288 vapply(f, v22);
289 vapply(f, v23);
290 vapply(f, v24);
291 vapply(f, v25);
292 vapply(f, v26);
293 vapply(f, v27);
294 vapply(f, v28);
295 vapply(f, v29);
296 } else if constexpr (is_braces_constructible<VT,
297 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
298 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
299 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
300 any_type, any_type, any_type, any_type>()) {
301 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
302 v9, v10, v11, v12, v13, v14, v15, v16,
303 v17, v18, v19, v20, v21, v22, v23, v24,
304 v25, v26, v27, v28] = vv;
305 vapply(f, v1);
306 vapply(f, v2);
307 vapply(f, v3);
308 vapply(f, v4);
309 vapply(f, v5);
310 vapply(f, v6);
311 vapply(f, v7);
312 vapply(f, v8);
313 vapply(f, v9);
314 vapply(f, v10);
315 vapply(f, v11);
316 vapply(f, v12);
317 vapply(f, v13);
318 vapply(f, v14);
319 vapply(f, v15);
320 vapply(f, v16);
321 vapply(f, v17);
322 vapply(f, v18);
323 vapply(f, v19);
324 vapply(f, v20);
325 vapply(f, v21);
326 vapply(f, v22);
327 vapply(f, v23);
328 vapply(f, v24);
329 vapply(f, v25);
330 vapply(f, v26);
331 vapply(f, v27);
332 vapply(f, v28);
333 } else if constexpr (is_braces_constructible<VT,
334 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
335 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
336 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
337 any_type, any_type, any_type>()) {
338 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
339 v9, v10, v11, v12, v13, v14, v15, v16,
340 v17, v18, v19, v20, v21, v22, v23, v24,
341 v25, v26, v27] = vv;
342 vapply(f, v1);
343 vapply(f, v2);
344 vapply(f, v3);
345 vapply(f, v4);
346 vapply(f, v5);
347 vapply(f, v6);
348 vapply(f, v7);
349 vapply(f, v8);
350 vapply(f, v9);
351 vapply(f, v10);
352 vapply(f, v11);
353 vapply(f, v12);
354 vapply(f, v13);
355 vapply(f, v14);
356 vapply(f, v15);
357 vapply(f, v16);
358 vapply(f, v17);
359 vapply(f, v18);
360 vapply(f, v19);
361 vapply(f, v20);
362 vapply(f, v21);
363 vapply(f, v22);
364 vapply(f, v23);
365 vapply(f, v24);
366 vapply(f, v25);
367 vapply(f, v26);
368 vapply(f, v27);
369 } else if constexpr (is_braces_constructible<VT,
370 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
371 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
372 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
373 any_type, any_type>()) {
374 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
375 v9, v10, v11, v12, v13, v14, v15, v16,
376 v17, v18, v19, v20, v21, v22, v23, v24,
377 v25, v26] = vv;
378 vapply(f, v1);
379 vapply(f, v2);
380 vapply(f, v3);
381 vapply(f, v4);
382 vapply(f, v5);
383 vapply(f, v6);
384 vapply(f, v7);
385 vapply(f, v8);
386 vapply(f, v9);
387 vapply(f, v10);
388 vapply(f, v11);
389 vapply(f, v12);
390 vapply(f, v13);
391 vapply(f, v14);
392 vapply(f, v15);
393 vapply(f, v16);
394 vapply(f, v17);
395 vapply(f, v18);
396 vapply(f, v19);
397 vapply(f, v20);
398 vapply(f, v21);
399 vapply(f, v22);
400 vapply(f, v23);
401 vapply(f, v24);
402 vapply(f, v25);
403 vapply(f, v26);
404 } else if constexpr (is_braces_constructible<VT,
405 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
406 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
407 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
408 any_type>()) {
409 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
410 v9, v10, v11, v12, v13, v14, v15, v16,
411 v17, v18, v19, v20, v21, v22, v23, v24,
412 v25] = vv;
413 vapply(f, v1);
414 vapply(f, v2);
415 vapply(f, v3);
416 vapply(f, v4);
417 vapply(f, v5);
418 vapply(f, v6);
419 vapply(f, v7);
420 vapply(f, v8);
421 vapply(f, v9);
422 vapply(f, v10);
423 vapply(f, v11);
424 vapply(f, v12);
425 vapply(f, v13);
426 vapply(f, v14);
427 vapply(f, v15);
428 vapply(f, v16);
429 vapply(f, v17);
430 vapply(f, v18);
431 vapply(f, v19);
432 vapply(f, v20);
433 vapply(f, v21);
434 vapply(f, v22);
435 vapply(f, v23);
436 vapply(f, v24);
437 vapply(f, v25);
438 } else if constexpr (is_braces_constructible<VT,
439 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
440 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
441 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type
442 >()) {
443 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
444 v9, v10, v11, v12, v13, v14, v15, v16,
445 v17, v18, v19, v20, v21, v22, v23, v24] = vv;
446 vapply(f, v1);
447 vapply(f, v2);
448 vapply(f, v3);
449 vapply(f, v4);
450 vapply(f, v5);
451 vapply(f, v6);
452 vapply(f, v7);
453 vapply(f, v8);
454 vapply(f, v9);
455 vapply(f, v10);
456 vapply(f, v11);
457 vapply(f, v12);
458 vapply(f, v13);
459 vapply(f, v14);
460 vapply(f, v15);
461 vapply(f, v16);
462 vapply(f, v17);
463 vapply(f, v18);
464 vapply(f, v19);
465 vapply(f, v20);
466 vapply(f, v21);
467 vapply(f, v22);
468 vapply(f, v23);
469 vapply(f, v24);
470 } else if constexpr (is_braces_constructible<VT,
471 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
472 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
473 any_type, any_type, any_type, any_type, any_type, any_type, any_type>()) {
474 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
475 v9, v10, v11, v12, v13, v14, v15, v16,
476 v17, v18, v19, v20, v21, v22, v23] = vv;
477 vapply(f, v1);
478 vapply(f, v2);
479 vapply(f, v3);
480 vapply(f, v4);
481 vapply(f, v5);
482 vapply(f, v6);
483 vapply(f, v7);
484 vapply(f, v8);
485 vapply(f, v9);
486 vapply(f, v10);
487 vapply(f, v11);
488 vapply(f, v12);
489 vapply(f, v13);
490 vapply(f, v14);
491 vapply(f, v15);
492 vapply(f, v16);
493 vapply(f, v17);
494 vapply(f, v18);
495 vapply(f, v19);
496 vapply(f, v20);
497 vapply(f, v21);
498 vapply(f, v22);
499 vapply(f, v23);
500 } else if constexpr (is_braces_constructible<VT,
501 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
502 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
503 any_type, any_type, any_type, any_type, any_type, any_type>()) {
504 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
505 v9, v10, v11, v12, v13, v14, v15, v16,
506 v17, v18, v19, v20, v21, v22] = vv;
507 vapply(f, v1);
508 vapply(f, v2);
509 vapply(f, v3);
510 vapply(f, v4);
511 vapply(f, v5);
512 vapply(f, v6);
513 vapply(f, v7);
514 vapply(f, v8);
515 vapply(f, v9);
516 vapply(f, v10);
517 vapply(f, v11);
518 vapply(f, v12);
519 vapply(f, v13);
520 vapply(f, v14);
521 vapply(f, v15);
522 vapply(f, v16);
523 vapply(f, v17);
524 vapply(f, v18);
525 vapply(f, v19);
526 vapply(f, v20);
527 vapply(f, v21);
528 vapply(f, v22);
529 } else if constexpr (is_braces_constructible<VT,
530 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
531 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
532 any_type, any_type, any_type, any_type, any_type>()) {
533 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
534 v9, v10, v11, v12, v13, v14, v15, v16,
535 v17, v18, v19, v20, v21] = vv;
536 vapply(f, v1);
537 vapply(f, v2);
538 vapply(f, v3);
539 vapply(f, v4);
540 vapply(f, v5);
541 vapply(f, v6);
542 vapply(f, v7);
543 vapply(f, v8);
544 vapply(f, v9);
545 vapply(f, v10);
546 vapply(f, v11);
547 vapply(f, v12);
548 vapply(f, v13);
549 vapply(f, v14);
550 vapply(f, v15);
551 vapply(f, v16);
552 vapply(f, v17);
553 vapply(f, v18);
554 vapply(f, v19);
555 vapply(f, v20);
556 vapply(f, v21);
557 } else if constexpr (is_braces_constructible<VT,
558 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
559 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
560 any_type, any_type, any_type, any_type>()) {
561 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
562 v9, v10, v11, v12, v13, v14, v15, v16,
563 v17, v18, v19, v20] = vv;
564 vapply(f, v1);
565 vapply(f, v2);
566 vapply(f, v3);
567 vapply(f, v4);
568 vapply(f, v5);
569 vapply(f, v6);
570 vapply(f, v7);
571 vapply(f, v8);
572 vapply(f, v9);
573 vapply(f, v10);
574 vapply(f, v11);
575 vapply(f, v12);
576 vapply(f, v13);
577 vapply(f, v14);
578 vapply(f, v15);
579 vapply(f, v16);
580 vapply(f, v17);
581 vapply(f, v18);
582 vapply(f, v19);
583 vapply(f, v20);
584 } else if constexpr (is_braces_constructible<VT,
585 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
586 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
587 any_type, any_type, any_type>()) {
588 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
589 v9, v10, v11, v12, v13, v14, v15, v16,
590 v17, v18, v19] = vv;
591 vapply(f, v1);
592 vapply(f, v2);
593 vapply(f, v3);
594 vapply(f, v4);
595 vapply(f, v5);
596 vapply(f, v6);
597 vapply(f, v7);
598 vapply(f, v8);
599 vapply(f, v9);
600 vapply(f, v10);
601 vapply(f, v11);
602 vapply(f, v12);
603 vapply(f, v13);
604 vapply(f, v14);
605 vapply(f, v15);
606 vapply(f, v16);
607 vapply(f, v17);
608 vapply(f, v18);
609 vapply(f, v19);
610 } else if constexpr (is_braces_constructible<VT,
611 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
612 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
613 any_type, any_type>()) {
614 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
615 v9, v10, v11, v12, v13, v14, v15, v16,
616 v17, v18] = vv;
617 vapply(f, v1);
618 vapply(f, v2);
619 vapply(f, v3);
620 vapply(f, v4);
621 vapply(f, v5);
622 vapply(f, v6);
623 vapply(f, v7);
624 vapply(f, v8);
625 vapply(f, v9);
626 vapply(f, v10);
627 vapply(f, v11);
628 vapply(f, v12);
629 vapply(f, v13);
630 vapply(f, v14);
631 vapply(f, v15);
632 vapply(f, v16);
633 vapply(f, v17);
634 vapply(f, v18);
635 } else if constexpr (is_braces_constructible<VT,
636 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
637 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
638 any_type>()) {
639 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
640 v9, v10, v11, v12, v13, v14, v15, v16,
641 v17] = vv;
642 vapply(f, v1);
643 vapply(f, v2);
644 vapply(f, v3);
645 vapply(f, v4);
646 vapply(f, v5);
647 vapply(f, v6);
648 vapply(f, v7);
649 vapply(f, v8);
650 vapply(f, v9);
651 vapply(f, v10);
652 vapply(f, v11);
653 vapply(f, v12);
654 vapply(f, v13);
655 vapply(f, v14);
656 vapply(f, v15);
657 vapply(f, v16);
658 vapply(f, v17);
659 } else if constexpr (is_braces_constructible<VT,
660 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
661 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type
662 >()) {
663 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
664 v9, v10, v11, v12, v13, v14, v15, v16] = vv;
665 vapply(f, v1);
666 vapply(f, v2);
667 vapply(f, v3);
668 vapply(f, v4);
669 vapply(f, v5);
670 vapply(f, v6);
671 vapply(f, v7);
672 vapply(f, v8);
673 vapply(f, v9);
674 vapply(f, v10);
675 vapply(f, v11);
676 vapply(f, v12);
677 vapply(f, v13);
678 vapply(f, v14);
679 vapply(f, v15);
680 vapply(f, v16);
681 } else if constexpr (is_braces_constructible<VT,
682 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
683 any_type, any_type, any_type, any_type, any_type, any_type, any_type>()) {
684 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
685 v9, v10, v11, v12, v13, v14, v15] = vv;
686 vapply(f, v1);
687 vapply(f, v2);
688 vapply(f, v3);
689 vapply(f, v4);
690 vapply(f, v5);
691 vapply(f, v6);
692 vapply(f, v7);
693 vapply(f, v8);
694 vapply(f, v9);
695 vapply(f, v10);
696 vapply(f, v11);
697 vapply(f, v12);
698 vapply(f, v13);
699 vapply(f, v14);
700 vapply(f, v15);
701 } else if constexpr (is_braces_constructible<VT,
702 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
703 any_type, any_type, any_type, any_type, any_type, any_type>()) {
704 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
705 v9, v10, v11, v12, v13, v14] = vv;
706 vapply(f, v1);
707 vapply(f, v2);
708 vapply(f, v3);
709 vapply(f, v4);
710 vapply(f, v5);
711 vapply(f, v6);
712 vapply(f, v7);
713 vapply(f, v8);
714 vapply(f, v9);
715 vapply(f, v10);
716 vapply(f, v11);
717 vapply(f, v12);
718 vapply(f, v13);
719 vapply(f, v14);
720 } else if constexpr (is_braces_constructible<VT,
721 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
722 any_type, any_type, any_type, any_type, any_type>()) {
723 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
724 v9, v10, v11, v12, v13] = vv;
725 vapply(f, v1);
726 vapply(f, v2);
727 vapply(f, v3);
728 vapply(f, v4);
729 vapply(f, v5);
730 vapply(f, v6);
731 vapply(f, v7);
732 vapply(f, v8);
733 vapply(f, v9);
734 vapply(f, v10);
735 vapply(f, v11);
736 vapply(f, v12);
737 vapply(f, v13);
738 } else if constexpr (is_braces_constructible<VT,
739 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
740 any_type, any_type, any_type, any_type>()) {
741 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
742 v9, v10, v11, v12] = vv;
743 vapply(f, v1);
744 vapply(f, v2);
745 vapply(f, v3);
746 vapply(f, v4);
747 vapply(f, v5);
748 vapply(f, v6);
749 vapply(f, v7);
750 vapply(f, v8);
751 vapply(f, v9);
752 vapply(f, v10);
753 vapply(f, v11);
754 vapply(f, v12);
755 } else if constexpr (is_braces_constructible<VT,
756 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
757 any_type, any_type, any_type>()) {
758 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
759 v9, v10, v11] = vv;
760 vapply(f, v1);
761 vapply(f, v2);
762 vapply(f, v3);
763 vapply(f, v4);
764 vapply(f, v5);
765 vapply(f, v6);
766 vapply(f, v7);
767 vapply(f, v8);
768 vapply(f, v9);
769 vapply(f, v10);
770 vapply(f, v11);
771 } else if constexpr (is_braces_constructible<VT,
772 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
773 any_type, any_type>()) {
774 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
775 v9, v10] = vv;
776 vapply(f, v1);
777 vapply(f, v2);
778 vapply(f, v3);
779 vapply(f, v4);
780 vapply(f, v5);
781 vapply(f, v6);
782 vapply(f, v7);
783 vapply(f, v8);
784 vapply(f, v9);
785 vapply(f, v10);
786 } else if constexpr (is_braces_constructible<VT,
787 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type,
788 any_type>()) {
789 auto& [v1, v2, v3, v4, v5, v6, v7, v8,
790 v9] = vv;
791 vapply(f, v1);
792 vapply(f, v2);
793 vapply(f, v3);
794 vapply(f, v4);
795 vapply(f, v5);
796 vapply(f, v6);
797 vapply(f, v7);
798 vapply(f, v8);
799 vapply(f, v9);
800 } else if constexpr (is_braces_constructible<VT,
801 any_type, any_type, any_type, any_type, any_type, any_type, any_type, any_type
802 >()) {
803 auto& [v1, v2, v3, v4, v5, v6, v7, v8] = vv;
804 vapply(f, v1);
805 vapply(f, v2);
806 vapply(f, v3);
807 vapply(f, v4);
808 vapply(f, v5);
809 vapply(f, v6);
810 vapply(f, v7);
811 vapply(f, v8);
812 } else if constexpr (is_braces_constructible<VT,
813 any_type, any_type, any_type, any_type, any_type, any_type, any_type>()) {
814 auto& [v1, v2, v3, v4, v5, v6, v7] = vv;
815 vapply(f, v1);
816 vapply(f, v2);
817 vapply(f, v3);
818 vapply(f, v4);
819 vapply(f, v5);
820 vapply(f, v6);
821 vapply(f, v7);
822 } else if constexpr (is_braces_constructible<VT,
823 any_type, any_type, any_type, any_type, any_type, any_type>()) {
824 auto& [v1, v2, v3, v4, v5, v6] = vv;
825 vapply(f, v1);
826 vapply(f, v2);
827 vapply(f, v3);
828 vapply(f, v4);
829 vapply(f, v5);
830 vapply(f, v6);
831 } else if constexpr (is_braces_constructible<VT,
832 any_type, any_type, any_type, any_type, any_type>()) {
833 auto& [v1, v2, v3, v4, v5] = vv;
834 vapply(f, v1);
835 vapply(f, v2);
836 vapply(f, v3);
837 vapply(f, v4);
838 vapply(f, v5);
839 } else if constexpr (is_braces_constructible<VT,
840 any_type, any_type, any_type, any_type>()) {
841 auto& [v1, v2, v3, v4] = vv;
842 vapply(f, v1);
843 vapply(f, v2);
844 vapply(f, v3);
845 vapply(f, v4);
846 } else if constexpr (is_braces_constructible<VT,
847 any_type, any_type, any_type>()) {
848 auto& [v1, v2, v3] = vv;
849 vapply(f, v1);
850 vapply(f, v2);
851 vapply(f, v3);
852 } else if constexpr (is_braces_constructible<VT,
853 any_type, any_type>()) {
854 auto& [v1, v2] = vv;
855 vapply(f, v1);
856 vapply(f, v2);
857 } else if constexpr (is_braces_constructible<VT,
858 any_type>()) {
859 auto& [v1] = vv;
860 vapply(f, v1);
861 } else {
862 static_assert(false, "Currently supports up to 32 members only.");
863 }
864 }
865 }
866
867 // Type of array embedded in a struct that is usable in the Neon template functions below.
868 // This type must satisfy std::is_array_v<>.
869 template<typename T, size_t N>
870 struct internal_array_t {
871 T v[N];
sizeinternal_array_t872 static constexpr size_t size() { return N; }
873 using element_t = T;
874 constexpr bool operator==(const internal_array_t<T, N> other) const {
875 for (size_t i = 0; i < N; ++i) {
876 if (v[i] != other.v[i]) return false;
877 }
878 return true;
879 }
880 constexpr internal_array_t<T, N>& operator=(T value) {
881 for (size_t i = 0; i < N; ++i) {
882 v[i] = value;
883 }
884 return *this;
885 }
886 constexpr internal_array_t() = default;
887 // explicit: disallow internal_array_t<float, 3> x = 10.f;
internal_array_tinternal_array_t888 constexpr explicit internal_array_t(T value) {
889 *this = value;
890 }
891 // allow internal_array_t<float, 3> x = { 10.f };
internal_array_tinternal_array_t892 constexpr internal_array_t(std::initializer_list<T> value) {
893 size_t i = 0;
894 auto vptr = value.begin();
895 for (; i < std::min(N, value.size()); ++i) {
896 v[i] = *vptr++;
897 }
898 for (; i < N; ++i) {
899 v[i] = {};
900 }
901 }
902 };
903
// assert our structs are trivially copyable so we can use memcpy freely.
// (N == 31 is an arbitrary representative size for the spot check.)
static_assert(std::is_trivially_copyable_v<internal_array_t<float, 31>>);
static_assert(std::is_trivially_copyable_v<internal_array_t<double, 31>>);
907
908 // Vector convert between type T to type S.
909 template <typename S, typename T>
vconvert(const T & in)910 constexpr inline S vconvert(const T& in) {
911 S out;
912
913 if constexpr (is_array_like<S>) {
914 if constexpr (is_array_like<T>) {
915 #pragma unroll
916 // neon intrinsics need sizeof.
917 for (size_t i = 0; i < sizeof(in) / sizeof(in[0]); ++i) {
918 out[i] = in[i];
919 }
920 } else { /* constexpr */
921 const auto& [inv] = in;
922 #pragma unroll
923 for (size_t i = 0; i < T::size(); ++i) {
924 out[i] = inv[i];
925 }
926 }
927 } else { /* constexpr */
928 auto& [outv] = out;
929 if constexpr (is_array_like<T>) {
930 #pragma unroll
931 // neon intrinsics need sizeof.
932 for (size_t i = 0; i < sizeof(in) / sizeof(in[0]); ++i) {
933 outv[i] = in[i];
934 }
935 } else { /* constexpr */
936 const auto& [inv] = in;
937 #pragma unroll
938 for (size_t i = 0; i < T::size(); ++i) {
939 outv[i] = inv[i];
940 }
941 }
942 }
943 return out;
944 }
945
946 /*
947 Generalized template functions for the Neon instruction set.
948
949 See here for some general comments from ARM.
950 https://developer.arm.com/documentation/dht0004/a/neon-support-in-compilation-tools/automatic-vectorization/floating-point-vectorization
951
952 Notes:
953 1) We provide scalar equivalents which are compilable even on non-ARM processors.
954 2) We use recursive calls to decompose array types, e.g. float32x4x4_t -> float32x4_t
955 3) NEON double SIMD acceleration is only available on 64 bit architectures.
956 On Pixel 3XL, NEON double x 2 SIMD is actually slightly slower than the FP unit.
957
958 We create a generic Neon acceleration to be applied to a composite type.
959
960 The type follows the following compositional rules for simplicity:
961 1) must be a primitive floating point type.
962 2) must be a NEON data type.
963 3) must be a struct with one member, either
964 a) an array of types 1-3.
965 b) a cons-pair struct of 2 possibly different members of types 1-3.
966
967 Examples of possible struct definitions:
968 using alternative_2_t = struct { struct { float a; float b; } s; };
969 using alternative_9_t = struct { struct { float32x4x2_t a; float b; } s; };
970 using alternative_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
971 */
972
973 #ifdef USE_NEON
974
// This will be specialized later to hold different types.
// The primary template is intentionally empty: an unsupported N has no
// nested ::t, so misuse fails at compile time.
template<int N>
struct vfloat_struct {};
978
// Helper alias to extract the type contained in the struct,
// i.e. the composite holding N floats for a given vector size N.
template<int N>
using vfloat_t = typename vfloat_struct<N>::t;
982
// Create vfloat_extended_t to add helper methods.
//
// It is preferable to use vector_hw_t instead, which
// chooses between vfloat_extended_t and internal_array_t
// based on type support.
//
// Note: Adding helper methods will not affect std::is_trivially_copyable_v.
template<size_t N>
struct vfloat_extended_t : public vfloat_t<N> {
    // Number of float lanes held by this vector type.
    static constexpr size_t size() { return N; }
    using element_t = float;
    // Element-wise equality, delegated to veq() (declared later in this header).
    constexpr bool operator==(const vfloat_extended_t<N>& other) const {
        return veq(*this, other);
    }
    // Broadcast assignment: sets all N lanes to value via vapply().
    vfloat_extended_t<N>& operator=(float value) {
        vapply(value, *this);
        return *this;
    }
    constexpr vfloat_extended_t(const vfloat_extended_t<N>& other) = default;
    vfloat_extended_t() = default;
    // explicit: disallow vfloat_extended_t<3> x = 10.f;
    explicit vfloat_extended_t(float value) {
        *this = value;
    }
    // allow vfloat_extended_t<3> x = { 10.f };
    // Missing trailing values are zero-initialized, matching internal_array_t.
    vfloat_extended_t(std::initializer_list<float> value) {
        size_t i = 0;
        auto vptr = value.begin();
        // Stage into a plain float array, then memcpy into the neon composite.
        float v[N];
        for (; i < std::min(N, value.size()); ++i) {
            v[i] = *vptr++;
        }
        for (; i < N; ++i) {
            v[i] = {};
        }
        // Layout checks: the composite must be exactly N packed floats so
        // the memcpy below is well-defined.
        static_assert(sizeof(*this) == sizeof(v));
        static_assert(sizeof(*this) == N * sizeof(float));
        memcpy(this, v, sizeof(*this));
    }
    // Conversion from the portable internal_array_t representation,
    // which has the identical layout of N contiguous floats.
    vfloat_extended_t(internal_array_t<float, N> value) {
        static_assert(sizeof(*this) == sizeof(value.v));
        static_assert(sizeof(*this) == N * sizeof(float));
        memcpy(this, value.v, sizeof(*this));
    }
};
1028
// Create type alias vector_hw_t as platform independent SIMD intrinsic
// type for hardware support.
//
// float selects the neon-backed vfloat_extended_t; any other element
// type falls back to the portable internal_array_t.
template<typename F, size_t N>
using vector_hw_t = std::conditional_t<
    std::is_same_v<F, float>, vfloat_extended_t<N>, internal_array_t<F, N>>;
1035
1036 // Recursively define structs containing the NEON intrinsic types for a given vector size.
1037 // intrinsic_utils.h allows structurally recursive type definitions based on
1038 // pairs of types (much like Lisp list cons pairs).
1039 //
1040 // For unpacking these type pairs, we use structured binding, so the naming of the
1041 // element members is irrelevant. Hence, it is possible to use pragma pack and
1042 // std::pair<> to define these structs as follows:
1043 //
1044 // #pragma pack(push, 1)
1045 // struct vfloat_struct<3> { using t = struct {
1046 // std::pair<vfloat_t<2>, vfloat_t<1>> p; }; };
1047 // #pragma pack(pop)
1048 //
1049 // But due to ctor requirements, the resulting struct composed of std::pair is
1050 // no longer considered trivially copyable.
1051 //
// Sizes 1, 2, 4 use a scalar or a single NEON register; 8 and 16 use the
// NEON multi-register types. Every other size N is a packed pair
// (largest supported size below N, remainder), so that the overall layout
// matches a contiguous float[N].
template<>
struct vfloat_struct<1> { using t = struct { float v[1]; }; };
template<>
struct vfloat_struct<2> { using t = struct { float32x2_t v[1]; }; };
template<>
struct vfloat_struct<3> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<2> a; vfloat_t<1> b; } s; }; };
template<>
struct vfloat_struct<4> { using t = struct { float32x4_t v[1]; }; };
template<>
struct vfloat_struct<5> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<4> a; vfloat_t<1> b; } s; }; };
template<>
struct vfloat_struct<6> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<4> a; vfloat_t<2> b; } s; }; };
template<>
struct vfloat_struct<7> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<4> a; vfloat_t<3> b; } s; }; };
template<>
struct vfloat_struct<8> { using t = float32x4x2_t; };
template<>
struct vfloat_struct<9> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<8> a; vfloat_t<1> b; } s; }; };
template<>
struct vfloat_struct<10> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<8> a; vfloat_t<2> b; } s; }; };
template<>
struct vfloat_struct<11> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<8> a; vfloat_t<3> b; } s; }; };
template<>
struct vfloat_struct<12> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<8> a; vfloat_t<4> b; } s; }; };
template<>
struct vfloat_struct<13> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<8> a; vfloat_t<5> b; } s; }; };
template<>
struct vfloat_struct<14> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<8> a; vfloat_t<6> b; } s; }; };
template<>
struct vfloat_struct<15> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<8> a; vfloat_t<7> b; } s; }; };
template<>
struct vfloat_struct<16> { using t = float32x4x4_t; };
template<>
struct vfloat_struct<17> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<1> b; } s; }; };
template<>
struct vfloat_struct<18> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<2> b; } s; }; };
template<>
struct vfloat_struct<19> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<3> b; } s; }; };
template<>
struct vfloat_struct<20> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<4> b; } s; }; };
template<>
struct vfloat_struct<21> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<5> b; } s; }; };
template<>
struct vfloat_struct<22> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<6> b; } s; }; };
template<>
struct vfloat_struct<23> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<7> b; } s; }; };
template<>
struct vfloat_struct<24> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<8> b; } s; }; };
template<>
struct vfloat_struct<25> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<9> b; } s; }; };
template<>
struct vfloat_struct<26> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<10> b; } s; }; };
template<>
struct vfloat_struct<27> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<11> b; } s; }; };
template<>
struct vfloat_struct<28> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<12> b; } s; }; };
template<>
struct vfloat_struct<29> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<13> b; } s; }; };
template<>
struct vfloat_struct<30> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<14> b; } s; }; };
template<>
struct vfloat_struct<31> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<15> b; } s; }; };
template<>
struct vfloat_struct<32> { using t = struct { struct __attribute__((packed)) {
    vfloat_t<16> a; vfloat_t<16> b; } s; }; };

// assert our structs are trivially copyable so we can use memcpy freely.
// (Only N == 31, the most deeply nested composite layout, is spot-checked.)
static_assert(std::is_trivially_copyable_v<vfloat_struct<31>>);
static_assert(std::is_trivially_copyable_v<vfloat_t<31>>);
1147
1148 #else
1149
1150 // x64 or risc-v, use loop vectorization if no HW type exists.
1151 template<typename F, int N>
1152 using vector_hw_t = internal_array_t<F, N>;
1153
1154 #endif // USE_NEON
1155
1156 /**
1157 * Returns the first element of the intrinsic struct.
1158 */
1159 template <typename T>
1160 constexpr auto first_element_of(const T& t) {
1161 if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
1162 return t;
1163 } else if constexpr (is_array_like<T>) {
1164 return first_element_of(t[0]);
1165 } else /* constexpr */ {
1166 const auto& [tval] = t; // single-member struct
1167 if constexpr (std::is_array_v<decltype(tval)>) {
1168 return first_element_of(tval[0]);
1169 } else /* constexpr */ {
1170 const auto& [p1, p2] = tval;
1171 return first_element_of(p1);
1172 }
1173 }
1174 }
1175
1176 /**
1177 * Evaluate f(v1 [, v2 [, v3]]) and return an intrinsic struct result.
1178 *
1179 * The veval method returns the vector result by element-wise
 * evaluating a functional f to one or more intrinsic struct inputs.
1181 * Compare this method with the single argument vapply,
1182 * which can modify a single struct argument in-place.
1183 */
1184 template <typename F, typename V>
1185 constexpr V veval(const F& f, const V& v1) {
1186 if constexpr (std::is_same_v<V, float> || std::is_same_v<V, double>) {
1187 return f(v1);
1188 } else if constexpr (is_array_like<V>) {
1189 V out;
1190 #pragma unroll
1191 // neon intrinsics need sizeof.
1192 for (size_t i = 0; i < sizeof(v1) / sizeof(v1[0]); ++i) {
1193 out[i] = f(v1[i]);
1194 }
1195 return out;
1196 } else /* constexpr */ {
1197 V ret;
1198 auto& [retval] = ret; // single-member struct
1199 const auto& [v1val] = v1;
1200 if constexpr (std::is_array_v<decltype(v1val)>) {
1201 #pragma unroll
1202 for (size_t i = 0; i < std::size(v1val); ++i) {
1203 retval[i] = veval(f, v1val[i]);
1204 }
1205 return ret;
1206 } else /* constexpr */ {
1207 auto& [r1, r2] = retval;
1208 const auto& [p1, p2] = v1val;
1209 r1 = veval(f, p1);
1210 r2 = veval(f, p2);
1211 return ret;
1212 }
1213 }
1214 }
1215
1216 template <typename F, typename V>
1217 constexpr V veval(const F& f, const V& v1, const V& v2) {
1218 if constexpr (std::is_same_v<V, float> || std::is_same_v<V, double>) {
1219 return f(v1, v2);
1220 } else if constexpr (is_array_like<V>) {
1221 V out;
1222 #pragma unroll
1223 // neon intrinsics need sizeof.
1224 for (size_t i = 0; i < sizeof(v1) / sizeof(v1[0]); ++i) {
1225 out[i] = f(v1[i], v2[i]);
1226 }
1227 return out;
1228 } else /* constexpr */ {
1229 V ret;
1230 auto& [retval] = ret; // single-member struct
1231 const auto& [v1val] = v1;
1232 const auto& [v2val] = v2;
1233 if constexpr (std::is_array_v<decltype(v1val)>) {
1234 #pragma unroll
1235 for (size_t i = 0; i < std::size(v1val); ++i) {
1236 retval[i] = veval(f, v1val[i], v2val[i]);
1237 }
1238 return ret;
1239 } else /* constexpr */ {
1240 auto& [r1, r2] = retval;
1241 const auto& [p11, p12] = v1val;
1242 const auto& [p21, p22] = v2val;
1243 r1 = veval(f, p11, p21);
1244 r2 = veval(f, p12, p22);
1245 return ret;
1246 }
1247 }
1248 }
1249
1250 template <typename F, typename V>
1251 constexpr V veval(const F& f, const V& v1, const V& v2, const V& v3) {
1252 if constexpr (std::is_same_v<V, float> || std::is_same_v<V, double>) {
1253 return f(v1, v2, v3);
1254 } else if constexpr (is_array_like<V>) {
1255 V out;
1256 #pragma unroll
1257 // neon intrinsics need sizeof.
1258 for (size_t i = 0; i < sizeof(v1) / sizeof(v1[0]); ++i) {
1259 out[i] = f(v1[i], v2[i], v3[i]);
1260 }
1261 return out;
1262 } else /* constexpr */ {
1263 V ret;
1264 auto& [retval] = ret; // single-member struct
1265 const auto& [v1val] = v1;
1266 const auto& [v2val] = v2;
1267 const auto& [v3val] = v3;
1268 if constexpr (std::is_array_v<decltype(v1val)>) {
1269 #pragma unroll
1270 for (size_t i = 0; i < std::size(v1val); ++i) {
1271 retval[i] = veval(f, v1val[i], v2val[i], v3val[i]);
1272 }
1273 return ret;
1274 } else /* constexpr */ {
1275 auto& [r1, r2] = retval;
1276 const auto& [p11, p12] = v1val;
1277 const auto& [p21, p22] = v2val;
1278 const auto& [p31, p32] = v3val;
1279 r1 = veval(f, p11, p21, p31);
1280 r2 = veval(f, p12, p22, p32);
1281 return ret;
1282 }
1283 }
1284 }
1285
1286 /**
1287 * Compare two intrinsic structs and return true iff equal.
1288 *
1289 * As opposed to memcmp, this handles floating point equality
1290 * which is different due to signed 0 and NaN, etc.
1291 */
1292 template<typename T>
1293 inline bool veq(T a, T b) {
1294 if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
1295 return a == b;
1296 } else if constexpr (is_array_like<T>) {
1297 #pragma unroll
1298 for (size_t i = 0; i < sizeof(a) / sizeof(a[0]); ++i) {
1299 if (!veq(a[i], b[i])) return false;
1300 }
1301 return true;
1302 } else /* constexpr */ {
1303 const auto& [aval] = a;
1304 const auto& [bval] = b;
1305 if constexpr (std::is_array_v<decltype(aval)>) {
1306 #pragma unroll
1307 for (size_t i = 0; i < std::size(aval); ++i) {
1308 if (!veq(aval[i], bval[i])) return false;
1309 }
1310 return true;
1311 } else /* constexpr */ {
1312 const auto& [a1, a2] = aval;
1313 const auto& [b1, b2] = bval;
1314 return veq(a1, b1) && veq(a2, b2);
1315 }
1316 }
1317 }
1318
1319 // --------------------------------------------------------------------
1320
// Dispatch a one-argument operation over T:
// - scalar: apply the generic functional f directly,
// - NEON primitive vector: call the matching intrinsic
//   (fn1: float32x2_t, fn2: float32x4_t, fn3: float64x2_t),
// - composite struct: recurse over the array elements or pair halves.
template<typename F, typename FN1, typename FN2, typename FN3, typename T>
inline T implement_arg1(const F& f, const FN1& fn1, const FN2& fn2, const FN3& fn3, T a) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return f(a);
#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return fn1(a);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return fn2(a);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return fn3(a);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto& [retval] = ret; // single-member struct
        const auto& [aval] = a;
        if constexpr (std::is_array_v<decltype(retval)>) {
            // Array of subvectors: recurse per element.
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                retval[i] = implement_arg1(f, fn1, fn2, fn3, aval[i]);
            }
            return ret;
        } else /* constexpr */ {
            // Pair-like struct: recurse on both halves.
            auto& [r1, r2] = retval;
            const auto& [a1, a2] = aval;
            r1 = implement_arg1(f, fn1, fn2, fn3, a1);
            r2 = implement_arg1(f, fn1, fn2, fn3, a2);
            return ret;
        }
    }
}
1355
// Horizontal (across-lanes) reduction of T with the binary functional f:
// - scalar: the value itself is the reduction,
// - NEON primitive vector (ARM64 only): use the across-lanes intrinsic,
// - composite struct: reduce each part, then fold the partial results with f.
// Returns the scalar element type of T.
template<typename F, typename FN1, typename FN2, typename FN3, typename T>
inline auto implement_arg1v(const F& f, const FN1& fn1, const FN2& fn2, const FN3& fn3, T a) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a;

#if defined(USE_NEON) && defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return fn1(a);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return fn2(a);
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return fn3(a);
#endif // defined(USE_NEON) && defined(__aarch64__)
    } else if constexpr (is_array_like<T>) {
        using ret_t = std::decay_t<decltype(a[0])>;

        ret_t ret = a[0];
        // array_like is not the same as an array, so we use sizeof here
        // to handle neon intrinsics.
#pragma unroll
        for (size_t i = 1; i < sizeof(a) / sizeof(a[0]); ++i) {
            ret = f(ret, a[i]);
        }
        return ret;
    } else /* constexpr */ {
        const auto &[aval] = a;
        if constexpr (std::is_array_v<decltype(aval)>) {
            // Array of subvectors: reduce each, then fold the partials with f.
            using ret_t = std::decay_t<decltype(first_element_of(aval[0]))>;
            ret_t ret = implement_arg1v(f, fn1, fn2, fn3, aval[0]);
#pragma unroll
            for (size_t i = 1; i < std::size(aval); ++i) {
                ret = f(ret, implement_arg1v(f, fn1, fn2, fn3, aval[i]));
            }
            return ret;
        } else /* constexpr */ {
            // Pair-like struct: reduce both halves and combine with f.
            using ret_t = std::decay_t<decltype(first_element_of(a))>;
            const auto& [a1, a2] = aval;
            ret_t ret = implement_arg1v(f, fn1, fn2, fn3, a1);
            ret = f(ret, implement_arg1v(f, fn1, fn2, fn3, a2));
            return ret;
        }
    }
}
1399
// Forward declaration: vdupn<T>(f) broadcasts scalar f into every element of T.
// Defined below; needed here by invoke_intrinsic_with_dup_as_needed().
template<typename T, typename F>
inline T vdupn(F f);
1402
1403 /**
1404 * Invoke vector intrinsic with a vector argument T and a scalar argument S.
1405 *
1406 * If the vector intrinsic does not support vector-scalar operation, we dup the scalar
1407 * argument.
1408 */
1409 template <typename F, typename T, typename S>
1410 auto invoke_intrinsic_with_dup_as_needed(const F& f, T a, S b) {
1411 if constexpr (takes_identical_parameter_pair_v<F, T>) {
1412 return f(a, vdupn<T>(b));
1413 } else /* constexpr */ {
1414 return f(a, b);
1415 }
1416 }
1417
// arg2 with a vector and scalar parameter.
// If a is the scalar and b the vector, the call is re-issued with operands
// swapped so that T is always the vector/struct type. (Note: the swap
// reverses the operand order seen by f; a non-commutative f would observe
// f(b, a) in that case.)
template<typename F, typename FN1, typename FN2, typename FN3, typename T, typename S>
inline auto implement_arg2(const F& f, const FN1& fn1, const FN2& fn2, const FN3& fn3, T a, S b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        if constexpr (std::is_same_v<S, float> || std::is_same_v<S, double>) {
            return f(a, b);
        } else /* constexpr */ {
            return implement_arg2(f, fn1, fn2, fn3, b, a); // we prefer T to be the vector/struct.
        }
    } else if constexpr (std::is_same_v<S, float> || std::is_same_v<S, double>) {
        // handle the lane variant
#ifdef USE_NEON
        if constexpr (std::is_same_v<T, float32x2_t>) {
            return invoke_intrinsic_with_dup_as_needed(fn1, a, b);
        } else if constexpr (std::is_same_v<T, float32x4_t>) {
            return invoke_intrinsic_with_dup_as_needed(fn2, a, b);
#if defined(__aarch64__)
        } else if constexpr (std::is_same_v<T, float64x2_t>) {
            return invoke_intrinsic_with_dup_as_needed(fn3, a, b);
#endif
        } else
#endif // USE_NEON
        {
            // Composite struct: recurse, broadcasting scalar b to every part.
            T ret;
            auto &[retval] = ret; // single-member struct
            const auto &[aval] = a;
            if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
                for (size_t i = 0; i < std::size(aval); ++i) {
                    retval[i] = implement_arg2(f, fn1, fn2, fn3, aval[i], b);
                }
                return ret;
            } else /* constexpr */ {
                auto& [r1, r2] = retval;
                const auto& [a1, a2] = aval;
                r1 = implement_arg2(f, fn1, fn2, fn3, a1, b);
                r2 = implement_arg2(f, fn1, fn2, fn3, a2, b);
                return ret;
            }
        }
    } else {
        // Both types T and S are non-primitive and they are not equal.
        static_assert(dependent_false_v<T>);
    }
}
1463
// arg2 where both operands have the same type T:
// - scalar: f(a, b),
// - NEON primitive vector: matching intrinsic fn1/fn2/fn3,
// - composite struct: recurse element-wise / pairwise.
template<typename F, typename FN1, typename FN2, typename FN3, typename T>
inline T implement_arg2(const F& f, const FN1& fn1, const FN2& fn2, const FN3& fn3, T a, T b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return f(a, b);

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return fn1(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return fn2(a, b);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return fn3(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto& [retval] = ret; // single-member struct
        const auto& [aval] = a;
        const auto& [bval] = b;
        if constexpr (std::is_array_v<decltype(retval)>) {
            // Array of subvectors: recurse per element.
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                retval[i] = implement_arg2(f, fn1, fn2, fn3, aval[i], bval[i]);
            }
            return ret;
        } else /* constexpr */ {
            // Pair-like struct: recurse on both halves.
            auto& [r1, r2] = retval;
            const auto& [a1, a2] = aval;
            const auto& [b1, b2] = bval;
            r1 = implement_arg2(f, fn1, fn2, fn3, a1, b1);
            r2 = implement_arg2(f, fn1, fn2, fn3, a2, b2);
            return ret;
        }
    }
}
1501
1502 template<typename F, typename FN1, typename FN2, typename FN3, typename T, typename S, typename R>
1503 inline auto implement_arg3(
1504 const F& f, const FN1& fn1, const FN2& fn2, const FN3& fn3, R a, T b, S c) {
1505 // Arbitrary support is not allowed.
1506 (void) f;
1507 (void) fn1;
1508 (void) fn2;
1509 (void) fn3;
1510 (void) a;
1511 (void) b;
1512 (void) c;
1513 static_assert(dependent_false_v<T>);
1514 }
1515
// arg3 where a and b share type T and c is a scalar (lane/broadcast variant).
template<typename F, typename FN1, typename FN2, typename FN3, typename T, typename S>
inline auto implement_arg3(
        const F& f, const FN1& fn1, const FN2& fn2, const FN3& fn3, T a, T b, S c) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        if constexpr (std::is_same_v<S, float> || std::is_same_v<S, double>) {
            return f(a, b, c);
        } else {
            // Scalar a/b with a non-scalar c is not supported.
            static_assert(dependent_false_v<T>);
        }
    } else if constexpr (std::is_same_v<S, float> || std::is_same_v<S, double>) {
        // handle the lane variant
#ifdef USE_NEON
        if constexpr (std::is_same_v<T, float32x2_t>) {
            return fn1(a, b, c);
        } else if constexpr (std::is_same_v<T, float32x4_t>) {
            return fn2(a, b, c);
#if defined(__aarch64__)
        } else if constexpr (std::is_same_v<T, float64x2_t>) {
            return fn3(a, b, c);
#endif
        } else
#endif // USE_NEON
        {
            // Composite struct: recurse, broadcasting scalar c to every part.
            T ret;
            auto &[retval] = ret; // single-member struct
            const auto &[aval] = a;
            const auto &[bval] = b;
            if constexpr (std::is_array_v<decltype(retval)>) {
#pragma unroll
                for (size_t i = 0; i < std::size(aval); ++i) {
                    retval[i] = implement_arg3(f, fn1, fn2, fn3, aval[i], bval[i], c);
                }
                return ret;
            } else /* constexpr */ {
                auto &[r1, r2] = retval;
                const auto &[a1, a2] = aval;
                const auto &[b1, b2] = bval;
                r1 = implement_arg3(f, fn1, fn2, fn3, a1, b1, c);
                r2 = implement_arg3(f, fn1, fn2, fn3, a2, b2, c);
                return ret;
            }
        }
    } else {
        // Both types T and S are non-primitive and they are not equal.
        static_assert(dependent_false_v<T>);
    }
}
1563
// arg3 where all three operands have the same type T:
// - scalar: f(a, b, c),
// - NEON primitive vector: matching intrinsic fn1/fn2/fn3,
// - composite struct: recurse element-wise / pairwise.
template<typename F, typename FN1, typename FN2, typename FN3, typename T>
inline T implement_arg3(
        const F& f, const FN1& fn1, const FN2& fn2, const FN3& fn3, T a, T b, T c) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return f(a, b, c);

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return fn1(a, b, c);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return fn2(a, b, c);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return fn3(a, b, c);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto& [retval] = ret; // single-member struct
        const auto& [aval] = a;
        const auto& [bval] = b;
        const auto& [cval] = c;
        if constexpr (std::is_array_v<decltype(retval)>) {
            // Array of subvectors: recurse per element.
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                retval[i] = implement_arg3(f, fn1, fn2, fn3, aval[i], bval[i], cval[i]);
            }
            return ret;
        } else /* constexpr */ {
            // Pair-like struct: recurse on both halves.
            auto& [r1, r2] = retval;
            const auto& [a1, a2] = aval;
            const auto& [b1, b2] = bval;
            const auto& [c1, c2] = cval;
            r1 = implement_arg3(f, fn1, fn2, fn3, a1, b1, c1);
            r2 = implement_arg3(f, fn1, fn2, fn3, a2, b2, c2);
            return ret;
        }
    }
}
1604
1605 // absolute value
1606 template<typename T>
1607 static inline T vabs(T a) {
1608 return implement_arg1([](const auto& x) { return std::abs(x); },
1609 DN_(vabs_f32), DN_(vabsq_f32), DN64_(vabsq_f64), a);
1610 }
1611
1612 template<typename T>
1613 inline T vadd(T a, T b) {
1614 return implement_arg2([](const auto& x, const auto& y) { return x + y; },
1615 DN_(vadd_f32), DN_(vaddq_f32), DN64_(vaddq_f64), a, b);
1616 }
1617
1618 // add internally
1619 template<typename T>
1620 inline auto vaddv(const T& a) {
1621 return implement_arg1v([](const auto& x, const auto& y) { return x + y; },
1622 DN64_(vaddv_f32), DN64_(vaddvq_f32), DN64_(vaddvq_f64), a);
1623 }
1624
// duplicate float into all elements.
// Broadcasts scalar f into every element of T:
// - scalar T: return f itself,
// - NEON primitive vector: use the vdup intrinsic,
// - composite struct: recurse into each part.
template<typename T, typename F>
inline T vdupn(F f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vdup_n_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vdupq_n_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vdupq_n_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret; // single-member struct
        if constexpr (std::is_array_v<decltype(retval)>) {
            // Array of subvectors: broadcast into each element.
#pragma unroll
            for (auto& val : retval) {
                val = vdupn<std::decay_t<decltype(val)>>(f);
            }
            return ret;
        } else /* constexpr */ {
            // Pair-like struct: broadcast into both halves.
            auto &[r1, r2] = retval;
            using r1_type = std::decay_t<decltype(r1)>;
            using r2_type = std::decay_t<decltype(r2)>;
            r1 = vdupn<r1_type>(f);
            r2 = vdupn<r2_type>(f);
            return ret;
        }
    }
}
1661
// load from float pointer.
// Loads sizeof(T)/sizeof(F) consecutive scalars starting at f into a T.
// f must point to at least that many readable elements.
template<typename T, typename F>
static inline T vld1(const F *f) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return *f;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vld1_f32(f);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vld1q_f32(f);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vld1q_f64(f);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        T ret;
        auto &[retval] = ret; // single-member struct
        if constexpr (std::is_array_v<decltype(retval)>) {
            // Array of subvectors: load each, advancing f by the number of
            // scalars a subvector occupies.
            using element_type = std::decay_t<decltype(retval[0])>;
            constexpr size_t subelements = sizeof(element_type) / sizeof(F);
#pragma unroll
            for (size_t i = 0; i < std::size(retval); ++i) {
                retval[i] = vld1<element_type>(f);
                f += subelements;
            }
            return ret;
        } else /* constexpr */ {
            // Pair-like struct: load the first half, advance f, load the second.
            auto &[r1, r2] = retval;
            using r1_type = std::decay_t<decltype(r1)>;
            using r2_type = std::decay_t<decltype(r2)>;
            r1 = vld1<r1_type>(f);
            f += sizeof(r1) / sizeof(F);
            r2 = vld1<r2_type>(f);
            return ret;
        }
    }
}
1702
1703 template<typename T, typename F>
1704 inline auto vmax(T a, F b) {
1705 return implement_arg2([](const auto& x, const auto& y) { return std::max(x, y); },
1706 DN_(vmax_f32), DN_(vmaxq_f32), DN64_(vmaxq_f64), a, b);
1707 }
1708
1709 template<typename T>
1710 inline T vmax(T a, T b) {
1711 return implement_arg2([](const auto& x, const auto& y) { return std::max(x, y); },
1712 DN_(vmax_f32), DN_(vmaxq_f32), DN64_(vmaxq_f64), a, b);
1713 }
1714
1715 template<typename T>
1716 inline auto vmaxv(const T& a) {
1717 return implement_arg1v([](const auto& x, const auto& y) { return std::max(x, y); },
1718 DN64_(vmaxv_f32), DN64_(vmaxvq_f32), DN64_(vmaxvq_f64), a);
1719 }
1720
1721 template<typename T, typename F>
1722 inline auto vmin(T a, F b) {
1723 return implement_arg2([](const auto& x, const auto& y) { return std::min(x, y); },
1724 DN_(vmin_f32), DN_(vminq_f32), DN64_(vminq_f64), a, b);
1725 }
1726
1727 template<typename T>
1728 inline T vmin(T a, T b) {
1729 return implement_arg2([](const auto& x, const auto& y) { return std::min(x, y); },
1730 DN_(vmin_f32), DN_(vminq_f32), DN64_(vminq_f64), a, b);
1731 }
1732
1733 template<typename T>
1734 inline auto vminv(const T& a) {
1735 return implement_arg1v([](const auto& x, const auto& y) { return std::min(x, y); },
1736 DN64_(vminv_f32), DN64_(vminvq_f32), DN64_(vminvq_f64), a);
1737 }
1738
1739 /**
1740 * Returns c as follows:
1741 * c_i = a_i * b_i if a and b are the same vector type or
1742 * c_i = a_i * b if a is a vector and b is scalar or
1743 * c_i = a * b_i if a is scalar and b is a vector.
1744 */
1745
// Workaround for missing method.
// vmlaq_n_f64 is not provided by arm_neon.h; declare it here so the
// DN64_(vmlaq_n_f64) reference below resolves.
// NOTE(review): the definition is not visible in this chunk — confirm one is
// supplied elsewhere, otherwise ARM64 callers would fail to link.
#if defined(USE_NEON) && defined(__aarch64__)
float64x2_t vmlaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2);
#endif
1750
1751 template<typename T, typename F>
1752 static inline T vmla(T a, T b, F c) {
1753 return implement_arg3([](const auto& x, const auto& y, const auto& z) { return x + y * z; },
1754 DN_(vmla_n_f32), DN_(vmlaq_n_f32), DN64_(vmlaq_n_f64), a, b, c);
1755 }
1756
// Multiply-accumulate with the scalar in the middle position: a + b * c.
// Forwards with operands swapped since multiplication commutes.
template<typename T, typename F>
static inline T vmla(T a, F b, T c) {
    return vmla(a, c, b);
}
1761
1762 // fused multiply-add a + b * c
1763 template<typename T>
1764 inline T vmla(const T& a, const T& b, const T& c) {
1765 return implement_arg3([](const auto& x, const auto& y, const auto& z) { return x + y * z; },
1766 DN_(vmla_f32), DN_(vmlaq_f32), DN64_(vmlaq_f64), a, b, c);
1767 }
1768
1769 /**
1770 * Returns c as follows:
1771 * c_i = a_i * b_i if a and b are the same vector type or
1772 * c_i = a_i * b if a is a vector and b is scalar or
1773 * c_i = a * b_i if a is scalar and b is a vector.
1774 */
1775 template<typename T, typename F>
1776 static inline auto vmul(T a, F b) {
1777 return implement_arg2([](const auto& x, const auto& y) { return x * y; },
1778 DN_(vmul_n_f32), DN_(vmulq_n_f32), DN64_(vmulq_n_f64), a, b);
1779 }
1780
1781 template<typename T>
1782 inline T vmul(T a, T b) {
1783 return implement_arg2([](const auto& x, const auto& y) { return x * y; },
1784 DN_(vmul_f32), DN_(vmulq_f32), DN64_(vmulq_f64), a, b);
1785 }
1786
1787 // negate
1788 template<typename T>
1789 inline T vneg(T a) {
1790 return implement_arg1([](const auto& x) { return -x; },
1791 DN_(vneg_f32), DN_(vnegq_f32), DN64_(vnegq_f64), a);
1792 }
1793
// store to float pointer.
// Stores all elements of a into sizeof(T)/sizeof(F) consecutive scalars at f.
// f must point to at least that many writable elements.
template<typename T, typename F>
static inline void vst1(F *f, T a) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        *f = a;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
        return vst1_f32(f, a);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vst1q_f32(f, a);
#if defined(__aarch64__)
    } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vst1q_f64(f, a);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        const auto &[aval] = a;
        if constexpr (std::is_array_v<decltype(aval)>) {
            // Array of subvectors: store each, advancing f by the number of
            // scalars a subvector occupies.
            constexpr size_t subelements = sizeof(std::decay_t<decltype(aval[0])>) / sizeof(F);
#pragma unroll
            for (size_t i = 0; i < std::size(aval); ++i) {
                vst1(f, aval[i]);
                f += subelements;
            }
        } else /* constexpr */ {
            // Pair-like struct: store the first half, advance f, store the second.
            const auto &[a1, a2] = aval;
            vst1(f, a1);
            f += sizeof(std::decay_t<decltype(a1)>) / sizeof(F);
            vst1(f, a2);
        }
    }
}
1828
1829 // subtract a - b
1830 template<typename T>
1831 inline T vsub(T a, T b) {
1832 return implement_arg2([](const auto& x, const auto& y) { return x - y; },
1833 DN_(vsub_f32), DN_(vsubq_f32), DN64_(vsubq_f64), a, b);
1834 }
1835
1836 // Derived methods
1837
/**
 * Clamps a value to the range [min_value, max_value], element-wise.
 */
template<typename T, typename S, typename R>
static inline T vclamp(const T& value, const S& min_value, const R& max_value) {
    const auto lower_bounded = vmax(value, min_value);
    return vmin(lower_bounded, max_value);
}
1845
1846 } // namespace android::audio_utils::intrinsics
1847
1848 #pragma pop_macro("DN64_")
1849 #pragma pop_macro("DN_")
1850 #pragma pop_macro("USE_NEON")
1851
1852 #endif // !ANDROID_AUDIO_UTILS_INTRINSIC_UTILS_H
1853