1 /*
2 * Vector math abstractions.
3 *
4 * Copyright (c) 2019-2023, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8 #ifndef _V_MATH_H
9 #define _V_MATH_H
10
11 #ifndef WANT_VMATH
12 /* Enable the build of vector math code. */
13 # define WANT_VMATH 1
14 #endif
15 #if WANT_VMATH
16
17 /* The goal of this header is to allow vector (only Neon for now)
18 and scalar build of the same algorithm. */
19
/* Routine-name mangling: scalar builds get an __s_ prefix, vector-PCS Neon
   builds get __vn_, and plain Neon builds get __v_.  */
#if SCALAR
#define V_NAME(x) __s_##x
#elif VPCS && __aarch64__
#define V_NAME(x) __vn_##x
/* Mark functions as using the AArch64 vector procedure call standard.  */
#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
#endif

/* Default the vector-PCS attribute/alias macros to empty when unused.  */
#ifndef VPCS_ATTR
#define VPCS_ATTR
#endif
#ifndef VPCS_ALIAS
#define VPCS_ALIAS
#endif
35
36 #include <stdint.h>
37 #include "math_config.h"
38
/* Short names for the scalar element types used throughout this header.  */
typedef float f32_t;
typedef uint32_t u32_t;
typedef int32_t s32_t;
typedef double f64_t;
typedef uint64_t u64_t;
typedef int64_t s64_t;
45
46 /* reinterpret as type1 from type2. */
/* Bit-level reinterpretation goes through a union, which is well defined in
   C (a pointer cast would violate strict aliasing).  */
static inline u32_t
as_u32_f32 (f32_t x)
{
  union { f32_t f; u32_t u; } r = {x};
  return r.u;
}
/* Bits of a u32 viewed as a float.  */
static inline f32_t
as_f32_u32 (u32_t x)
{
  union { u32_t u; f32_t f; } r = {x};
  return r.f;
}
/* Bits of a u32 viewed as a signed s32.  */
static inline s32_t
as_s32_u32 (u32_t x)
{
  union { u32_t u; s32_t i; } r = {x};
  return r.i;
}
/* Bits of an s32 viewed as an unsigned u32.  */
static inline u32_t
as_u32_s32 (s32_t x)
{
  union { s32_t i; u32_t u; } r = {x};
  return r.u;
}
/* Bits of a double viewed as a u64.  */
static inline u64_t
as_u64_f64 (f64_t x)
{
  union { f64_t f; u64_t u; } r = {x};
  return r.u;
}
/* Bits of a u64 viewed as a double.  */
static inline f64_t
as_f64_u64 (u64_t x)
{
  union { u64_t u; f64_t f; } r = {x};
  return r.f;
}
/* Bits of a u64 viewed as a signed s64.  */
static inline s64_t
as_s64_u64 (u64_t x)
{
  union { u64_t u; s64_t i; } r = {x};
  return r.i;
}
/* Bits of an s64 viewed as an unsigned u64.  */
static inline u64_t
as_u64_s64 (s64_t x)
{
  union { s64_t i; u64_t u; } r = {x};
  return r.u;
}
95
96 #if SCALAR
#define V_SUPPORTED 1
/* Scalar build: a "vector" is a single element, so the v_* types are just
   the scalar element types.  */
typedef f32_t v_f32_t;
typedef u32_t v_u32_t;
typedef s32_t v_s32_t;
typedef f64_t v_f64_t;
typedef u64_t v_u64_t;
typedef s64_t v_s64_t;

/* Number of 32-bit lanes in a vector (1 in the scalar build).  */
static inline int
v_lanes32 (void)
{
  return 1;
}

/* Broadcast a scalar into a vector (identity in the scalar build).  */
static inline v_f32_t
v_f32 (f32_t x)
{
  return x;
}
static inline v_u32_t
v_u32 (u32_t x)
{
  return x;
}
static inline v_s32_t
v_s32 (s32_t x)
{
  return x;
}

/* Extract lane i; i is ignored since there is only one lane.  */
static inline f32_t
v_get_f32 (v_f32_t x, int i)
{
  return x;
}
static inline u32_t
v_get_u32 (v_u32_t x, int i)
{
  return x;
}
static inline s32_t
v_get_s32 (v_s32_t x, int i)
{
  return x;
}

/* Set lane i; i is ignored since there is only one lane.  */
static inline void
v_set_f32 (v_f32_t *x, int i, f32_t v)
{
  *x = v;
}
static inline void
v_set_u32 (v_u32_t *x, int i, u32_t v)
{
  *x = v;
}
static inline void
v_set_s32 (v_s32_t *x, int i, s32_t v)
{
  *x = v;
}
158
/* True if any element of a v_cond result is non-zero.  */
static inline int
v_any_u32 (v_u32_t x)
{
  return x != 0;
}
/* Wrap the result of relational operators: normalise to the Neon
   convention of all-ones for true, zero for false.  */
static inline v_u32_t
v_cond_u32 (v_u32_t x)
{
  return x ? -1 : 0;
}
/* Lane-wise absolute value.  */
static inline v_f32_t
v_abs_f32 (v_f32_t x)
{
  return __builtin_fabsf (x);
}
176 static inline v_u32_t
v_bsl_u32(v_u32_t m,v_u32_t x,v_u32_t y)177 v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y)
178 {
179 return (y & ~m) | (x & m);
180 }
/* |x| > |y| (scalar counterpart of Neon FACGT).  */
static inline v_u32_t
v_cagt_f32 (v_f32_t x, v_f32_t y)
{
  return fabsf (x) > fabsf (y);
}
/* to wrap |x| >= |y|. */
static inline v_u32_t
v_cage_f32 (v_f32_t x, v_f32_t y)
{
  return fabsf (x) >= fabsf (y);
}
/* |x| < |y|.  */
static inline v_u32_t
v_calt_f32 (v_f32_t x, v_f32_t y)
{
  return fabsf (x) < fabsf (y);
}
/* Lane-wise division.  */
static inline v_f32_t
v_div_f32 (v_f32_t x, v_f32_t y)
{
  return x / y;
}
/* Fused multiply-add: x * y + z with a single rounding.  */
static inline v_f32_t
v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
{
  return __builtin_fmaf (x, y, z);
}
/* Round to nearest integer, ties away from zero.  */
static inline v_f32_t
v_round_f32 (v_f32_t x)
{
  return __builtin_roundf (x);
}
/* Round to nearest and convert to signed integer.  */
static inline v_s32_t
v_round_s32 (v_f32_t x)
{
  return __builtin_lroundf (x); /* relies on -fno-math-errno. */
}
/* Lane-wise select: p ? x : y.  */
static inline v_f32_t
v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
{
  return p ? x : y;
}
static inline v_u32_t
v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y)
{
  return p ? x : y;
}
static inline v_f32_t
v_sqrt_f32 (v_f32_t x)
{
  return __builtin_sqrtf (x);
}
/* convert to type1 from type2. */
static inline v_f32_t
v_to_f32_s32 (v_s32_t x)
{
  return x;
}
static inline v_s32_t
v_to_s32_f32 (v_f32_t x)
{
  return x;
}
static inline v_f32_t
v_to_f32_u32 (v_u32_t x)
{
  return x;
}
/* reinterpret as type1 from type2. */
static inline v_u32_t
v_as_u32_f32 (v_f32_t x)
{
  union { v_f32_t f; v_u32_t u; } r = {x};
  return r.u;
}
static inline v_s32_t
v_as_s32_f32 (v_f32_t x)
{
  union
  {
    v_f32_t f;
    v_s32_t u;
  } r = {x};
  return r.u;
}
static inline v_f32_t
v_as_f32_u32 (v_u32_t x)
{
  union { v_u32_t u; v_f32_t f; } r = {x};
  return r.f;
}
static inline v_s32_t
v_as_s32_u32 (v_u32_t x)
{
  union { v_u32_t u; v_s32_t i; } r = {x};
  return r.i;
}
static inline v_u32_t
v_as_u32_s32 (v_s32_t x)
{
  union { v_s32_t i; v_u32_t u; } r = {x};
  return r.u;
}
/* Gather table elements at the lane indices in idx.  */
static inline v_f32_t
v_lookup_f32 (const f32_t *tab, v_u32_t idx)
{
  return tab[idx];
}
static inline v_u32_t
v_lookup_u32 (const u32_t *tab, v_u32_t idx)
{
  return tab[idx];
}
/* Apply the scalar fallback f; the predicate p and fallback value y are
   unused in the scalar build (f is simply called on x).  */
static inline v_f32_t
v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
{
  return f (x);
}
static inline v_f32_t
v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
	     v_u32_t p)
{
  return f (x1, x2);
}
304
/* Number of 64-bit lanes in a vector (1 in the scalar build).  */
static inline int
v_lanes64 (void)
{
  return 1;
}
/* Broadcast a scalar into a vector (identity in the scalar build).  */
static inline v_f64_t
v_f64 (f64_t x)
{
  return x;
}
static inline v_u64_t
v_u64 (u64_t x)
{
  return x;
}
static inline v_s64_t
v_s64 (s64_t x)
{
  return x;
}
/* Extract lane i; i is ignored since there is only one lane.  */
static inline f64_t
v_get_f64 (v_f64_t x, int i)
{
  return x;
}
/* Set lane i; i is ignored since there is only one lane.  */
static inline void
v_set_f64 (v_f64_t *x, int i, f64_t v)
{
  *x = v;
}
/* true if any elements of a v_cond result is non-zero. */
static inline int
v_any_u64 (v_u64_t x)
{
  return x != 0;
}
341 /* true if all elements of a v_cond result is non-zero. */
342 static inline int
v_all_u64(v_u64_t x)343 v_all_u64 (v_u64_t x)
344 {
345 return x;
346 }
/* Wrap the result of relational operators: normalise to all-ones for true,
   zero for false.  */
static inline v_u64_t
v_cond_u64 (v_u64_t x)
{
  return x ? -1 : 0;
}
/* Lane-wise absolute value.  */
static inline v_f64_t
v_abs_f64 (v_f64_t x)
{
  return __builtin_fabs (x);
}
/* Bitwise select: bits of x where m is set, bits of y elsewhere.  */
static inline v_u64_t
v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y)
{
  return (y & ~m) | (x & m);
}
/* |x| > |y|.  */
static inline v_u64_t
v_cagt_f64 (v_f64_t x, v_f64_t y)
{
  return fabs (x) > fabs (y);
}
static inline v_f64_t
v_div_f64 (v_f64_t x, v_f64_t y)
{
  return x / y;
}
/* Fused multiply-add: x * y + z with a single rounding.  */
static inline v_f64_t
v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
{
  return __builtin_fma (x, y, z);
}
378 static inline v_f64_t
v_min_f64(v_f64_t x,v_f64_t y)379 v_min_f64(v_f64_t x, v_f64_t y) {
380 return x < y ? x : y;
381 }
/* Round to nearest integer, ties away from zero.  */
static inline v_f64_t
v_round_f64 (v_f64_t x)
{
  return __builtin_round (x);
}
/* Lane-wise select: p ? x : y.  */
static inline v_f64_t
v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
{
  return p ? x : y;
}
static inline v_f64_t
v_sqrt_f64 (v_f64_t x)
{
  return __builtin_sqrt (x);
}
/* Round to nearest and convert to signed integer.  */
static inline v_s64_t
v_round_s64 (v_f64_t x)
{
  return __builtin_lround (x); /* relies on -fno-math-errno. */
}
/* Truncate toward zero, then convert to unsigned integer.  */
static inline v_u64_t
v_trunc_u64 (v_f64_t x)
{
  return __builtin_trunc (x);
}
/* convert to type1 from type2. */
static inline v_f64_t
v_to_f64_s64 (v_s64_t x)
{
  return x;
}
static inline v_f64_t
v_to_f64_u64 (v_u64_t x)
{
  return x;
}

static inline v_s64_t
v_to_s64_f64 (v_f64_t x)
{
  return x;
}
/* reinterpret as type1 from type2. */
static inline v_u64_t
v_as_u64_f64 (v_f64_t x)
{
  union { v_f64_t f; v_u64_t u; } r = {x};
  return r.u;
}
static inline v_f64_t
v_as_f64_u64 (v_u64_t x)
{
  union { v_u64_t u; v_f64_t f; } r = {x};
  return r.f;
}
static inline v_s64_t
v_as_s64_u64 (v_u64_t x)
{
  union { v_u64_t u; v_s64_t i; } r = {x};
  return r.i;
}
static inline v_u64_t
v_as_u64_s64 (v_s64_t x)
{
  union { v_s64_t i; v_u64_t u; } r = {x};
  return r.u;
}
/* Gather table elements at the lane indices in idx.  */
static inline v_f64_t
v_lookup_f64 (const f64_t *tab, v_u64_t idx)
{
  return tab[idx];
}
static inline v_u64_t
v_lookup_u64 (const u64_t *tab, v_u64_t idx)
{
  return tab[idx];
}
/* Apply the scalar fallback f; the predicate p and fallback value y are
   unused in the scalar build (f is simply called on x).  */
static inline v_f64_t
v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
{
  return f (x);
}
static inline v_f64_t
v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y,
	     v_u64_t p)
{
  return f (x1, x2);
}
470
471 #elif __aarch64__
#define V_SUPPORTED 1
#include <arm_neon.h>
/* Neon build: 128-bit vectors, i.e. 4 x 32-bit or 2 x 64-bit lanes.  */
typedef float32x4_t v_f32_t;
typedef uint32x4_t v_u32_t;
typedef int32x4_t v_s32_t;
typedef float64x2_t v_f64_t;
typedef uint64x2_t v_u64_t;
typedef int64x2_t v_s64_t;
480
/* Number of 32-bit lanes in a vector.  */
static inline int
v_lanes32 (void)
{
  return 4;
}

/* Broadcast a scalar to all four lanes.  */
static inline v_f32_t
v_f32 (f32_t x)
{
  return (v_f32_t){x, x, x, x};
}
static inline v_u32_t
v_u32 (u32_t x)
{
  return (v_u32_t){x, x, x, x};
}
static inline v_s32_t
v_s32 (s32_t x)
{
  return (v_s32_t){x, x, x, x};
}

/* Extract lane i.  */
static inline f32_t
v_get_f32 (v_f32_t x, int i)
{
  return x[i];
}
static inline u32_t
v_get_u32 (v_u32_t x, int i)
{
  return x[i];
}
static inline s32_t
v_get_s32 (v_s32_t x, int i)
{
  return x[i];
}

/* Set lane i.  */
static inline void
v_set_f32 (v_f32_t *x, int i, f32_t v)
{
  (*x)[i] = v;
}
static inline void
v_set_u32 (v_u32_t *x, int i, u32_t v)
{
  (*x)[i] = v;
}
static inline void
v_set_s32 (v_s32_t *x, int i, s32_t v)
{
  (*x)[i] = v;
}

/* True if any element of a v_cond result is non-zero; the pairwise add is
   non-zero iff any lane is.  */
static inline int
v_any_u32 (v_u32_t x)
{
  /* assume elements in x are either 0 or -1u. */
  return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
}
/* Wrap the result of relational operators: Neon comparisons already yield
   all-ones/zero lanes, so this is the identity.  */
static inline v_u32_t
v_cond_u32 (v_u32_t x)
{
  return x;
}
/* Lane-wise absolute value.  */
static inline v_f32_t
v_abs_f32 (v_f32_t x)
{
  return vabsq_f32 (x);
}
/* Bitwise select: bits of x where m is set, bits of y elsewhere.  */
static inline v_u32_t
v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y)
{
  return vbslq_u32 (m, x, y);
}
/* |x| > |y|.  */
static inline v_u32_t
v_cagt_f32 (v_f32_t x, v_f32_t y)
{
  return vcagtq_f32 (x, y);
}
/* to wrap |x| >= |y|. */
static inline v_u32_t
v_cage_f32 (v_f32_t x, v_f32_t y)
{
  return vcageq_f32 (x, y);
}
/* |x| < |y|.  */
static inline v_u32_t
v_calt_f32 (v_f32_t x, v_f32_t y)
{
  return vcaltq_f32 (x, y);
}
static inline v_f32_t
v_div_f32 (v_f32_t x, v_f32_t y)
{
  return vdivq_f32 (x, y);
}
/* Fused multiply-add: x * y + z; note vfmaq takes the addend first.  */
static inline v_f32_t
v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
{
  return vfmaq_f32 (z, x, y);
}
/* Round to nearest integer, ties away from zero.  */
static inline v_f32_t
v_round_f32 (v_f32_t x)
{
  return vrndaq_f32 (x);
}
/* Round to nearest (ties away) and convert to signed integer.  */
static inline v_s32_t
v_round_s32 (v_f32_t x)
{
  return vcvtaq_s32_f32 (x);
}
/* Lane-wise select: p ? x : y.  */
static inline v_f32_t
v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
{
  return vbslq_f32 (p, x, y);
}
static inline v_u32_t
v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y)
{
  return vbslq_u32 (p, x, y);
}
static inline v_f32_t
v_sqrt_f32 (v_f32_t x)
{
  return vsqrtq_f32 (x);
}
/* convert to type1 from type2. */
static inline v_f32_t
v_to_f32_s32 (v_s32_t x)
{
  /* Use the dedicated conversion intrinsic (SCVTF) instead of extracting
     and re-inserting each lane; the lane-wise int->float semantics are
     identical.  */
  return vcvtq_f32_s32 (x);
}
static inline v_s32_t
v_to_s32_f32 (v_f32_t x)
{
  return vcvtq_s32_f32 (x);
}
static inline v_f32_t
v_to_f32_u32 (v_u32_t x)
{
  /* UCVTF, lane-wise unsigned->float.  */
  return vcvtq_f32_u32 (x);
}
/* reinterpret as type1 from type2. */
static inline v_u32_t
v_as_u32_f32 (v_f32_t x)
{
  union { v_f32_t f; v_u32_t u; } r = {x};
  return r.u;
}
static inline v_s32_t
v_as_s32_f32 (v_f32_t x)
{
  union
  {
    v_f32_t f;
    v_s32_t u;
  } r = {x};
  return r.u;
}
static inline v_f32_t
v_as_f32_u32 (v_u32_t x)
{
  union { v_u32_t u; v_f32_t f; } r = {x};
  return r.f;
}
static inline v_s32_t
v_as_s32_u32 (v_u32_t x)
{
  union { v_u32_t u; v_s32_t i; } r = {x};
  return r.i;
}
static inline v_u32_t
v_as_u32_s32 (v_s32_t x)
{
  union { v_s32_t i; v_u32_t u; } r = {x};
  return r.u;
}
/* Gather table elements at the four lane indices in idx.  */
static inline v_f32_t
v_lookup_f32 (const f32_t *tab, v_u32_t idx)
{
  return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
}
static inline v_u32_t
v_lookup_u32 (const u32_t *tab, v_u32_t idx)
{
  return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
}
/* Apply the scalar fallback f to lanes where p is set; keep y elsewhere.  */
static inline v_f32_t
v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
{
  return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
		   p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
}
static inline v_f32_t
v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
	     v_u32_t p)
{
  return (
    v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1],
	     p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
}
684
/* Number of 64-bit lanes in a vector.  */
static inline int
v_lanes64 (void)
{
  return 2;
}
/* Broadcast a scalar to both lanes.  */
static inline v_f64_t
v_f64 (f64_t x)
{
  return (v_f64_t){x, x};
}
static inline v_u64_t
v_u64 (u64_t x)
{
  return (v_u64_t){x, x};
}
static inline v_s64_t
v_s64 (s64_t x)
{
  return (v_s64_t){x, x};
}
/* Extract lane i.  */
static inline f64_t
v_get_f64 (v_f64_t x, int i)
{
  return x[i];
}
/* Set lane i.  */
static inline void
v_set_f64 (v_f64_t *x, int i, f64_t v)
{
  (*x)[i] = v;
}
/* true if any elements of a v_cond result is non-zero. */
static inline int
v_any_u64 (v_u64_t x)
{
  /* assume elements in x are either 0 or -1u. */
  return vpaddd_u64 (x) != 0;
}
/* true if all elements of a v_cond result is 1. */
static inline int
v_all_u64 (v_u64_t x)
{
  /* assume elements in x are either 0 or -1u. */
  /* Pairwise add as signed: two -1 lanes sum to exactly -2.  */
  return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2;
}
/* Wrap the result of relational operators: Neon comparisons already yield
   all-ones/zero lanes, so this is the identity.  */
static inline v_u64_t
v_cond_u64 (v_u64_t x)
{
  return x;
}
/* Lane-wise absolute value.  */
static inline v_f64_t
v_abs_f64 (v_f64_t x)
{
  return vabsq_f64 (x);
}
/* Bitwise select: bits of x where m is set, bits of y elsewhere.  */
static inline v_u64_t
v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y)
{
  return vbslq_u64 (m, x, y);
}
/* |x| > |y|.  */
static inline v_u64_t
v_cagt_f64 (v_f64_t x, v_f64_t y)
{
  return vcagtq_f64 (x, y);
}
static inline v_f64_t
v_div_f64 (v_f64_t x, v_f64_t y)
{
  return vdivq_f64 (x, y);
}
/* Fused multiply-add: x * y + z; note vfmaq takes the addend first.  */
static inline v_f64_t
v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
{
  return vfmaq_f64 (z, x, y);
}
/* Lane-wise minimum (FMIN), matching the scalar build's v_min_f64.  */
static inline v_f64_t
v_min_f64 (v_f64_t x, v_f64_t y)
{
  return vminq_f64 (x, y);
}
/* Round to nearest integer, ties away from zero.  */
static inline v_f64_t
v_round_f64 (v_f64_t x)
{
  return vrndaq_f64 (x);
}
/* Lane-wise select: p ? x : y.  */
static inline v_f64_t
v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
{
  return vbslq_f64 (p, x, y);
}
static inline v_f64_t
v_sqrt_f64 (v_f64_t x)
{
  return vsqrtq_f64 (x);
}
/* Round to nearest (ties away) and convert to signed integer.  */
static inline v_s64_t
v_round_s64 (v_f64_t x)
{
  return vcvtaq_s64_f64 (x);
}
/* Convert to unsigned integer, truncating toward zero.  */
static inline v_u64_t
v_trunc_u64 (v_f64_t x)
{
  return vcvtq_u64_f64 (x);
}
/* convert to type1 from type2. */
static inline v_f64_t
v_to_f64_s64 (v_s64_t x)
{
  /* Use the dedicated conversion intrinsic (SCVTF) instead of extracting
     and re-inserting each lane; the lane-wise int->double semantics are
     identical.  */
  return vcvtq_f64_s64 (x);
}
static inline v_f64_t
v_to_f64_u64 (v_u64_t x)
{
  /* UCVTF, lane-wise unsigned->double.  */
  return vcvtq_f64_u64 (x);
}
/* Convert to signed integer, truncating toward zero.  */
static inline v_s64_t
v_to_s64_f64 (v_f64_t x)
{
  return vcvtq_s64_f64 (x);
}
/* reinterpret as type1 from type2. */
static inline v_u64_t
v_as_u64_f64 (v_f64_t x)
{
  union { v_f64_t f; v_u64_t u; } r = {x};
  return r.u;
}
static inline v_f64_t
v_as_f64_u64 (v_u64_t x)
{
  union { v_u64_t u; v_f64_t f; } r = {x};
  return r.f;
}
static inline v_s64_t
v_as_s64_u64 (v_u64_t x)
{
  union { v_u64_t u; v_s64_t i; } r = {x};
  return r.i;
}
static inline v_u64_t
v_as_u64_s64 (v_s64_t x)
{
  union { v_s64_t i; v_u64_t u; } r = {x};
  return r.u;
}
/* Gather table elements at the two lane indices in idx.  */
static inline v_f64_t
v_lookup_f64 (const f64_t *tab, v_u64_t idx)
{
  return (v_f64_t){tab[idx[0]], tab[idx[1]]};
}
static inline v_u64_t
v_lookup_u64 (const u64_t *tab, v_u64_t idx)
{
  return (v_u64_t){tab[idx[0]], tab[idx[1]]};
}
/* Apply the scalar fallback f to lanes where p is set; keep y elsewhere.  */
static inline v_f64_t
v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
{
  return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
}
static inline v_f64_t
v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y,
	     v_u64_t p)
{
  return (v_f64_t){p[0] ? f (x1[0], x2[0]) : y[0],
		   p[1] ? f (x1[1], x2[1]) : y[1]};
}
852 #endif
853
854 #endif
855 #endif
856