• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #pragma once
2 #ifndef PSIMD_H
3 #define PSIMD_H
4 
/*
 * PSIMD_INTRINSIC is the declaration prefix put in front of every psimd function.
 * The checks are ordered: the first branch whose condition holds defines the macro.
 */
#if defined(__CUDA_ARCH__)
	/* CUDA compiler */
	#define PSIMD_INTRINSIC __forceinline__ __device__
#elif defined(__OPENCL_VERSION__)
	/* OpenCL compiler */
	#define PSIMD_INTRINSIC inline static
#elif defined(__INTEL_COMPILER) || defined(__GNUC__)
	/* Intel compiler (even on Windows) or GCC-compatible compiler (gcc/clang/icc) */
	#define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
#elif defined(_MSC_VER)
	/* MSVC-compatible compiler (cl/icl/clang-cl) */
	#define PSIMD_INTRINSIC __forceinline static
#elif defined(__cplusplus) || (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))
	/* Generic C++ compiler or generic C99 compiler */
	#define PSIMD_INTRINSIC inline static
#else
	/* Generic C compiler: falls back to plain static without inline */
	#define PSIMD_INTRINSIC static
#endif
30 
31 #if defined(__GNUC__)
32 	#if defined(__ARM_NEON__) || defined(__ARM_NEON)
33 		#include <arm_neon.h>
34 	#endif
35 
36 	#if defined(__SSE2__)
37 		#include <emmintrin.h>
38 	#endif
39 
40 	#if defined(__SSE3__)
41 		#include <pmmintrin.h>
42 	#endif
43 
44 	#if defined(__SSSE3__)
45 		#include <tmmintrin.h>
46 	#endif
47 
48 	#if defined(__SSE4_1__)
49 		#include <smmintrin.h>
50 	#endif
51 
52 	#if defined(__SSE4_2__)
53 		#include <nmmintrin.h>
54 	#endif
55 
56 	#if defined(__AVX__)
57 		#include <immintrin.h>
58 	#endif
59 #elif defined(_MSC_VER)
60 	#include <intrin.h>
61 #endif
62 
/*
 * Define exactly one PSIMD_*_SYNTAX macro, chosen from __cplusplus and
 * __STDC_VERSION__: C++, C11 (>= 201112L), C99 (>= 199901L), otherwise C89.
 */
#if defined(__cplusplus)
	#define PSIMD_CXX_SYNTAX
#elif !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 199901L)
	#define PSIMD_C89_SYNTAX
#elif __STDC_VERSION__ >= 201112L
	#define PSIMD_C11_SYNTAX
#else
	#define PSIMD_C99_SYNTAX
#endif
72 
73 #if defined(__cplusplus) && (__cplusplus >= 201103L)
74 	#include <cstddef>
75 	#include <cstdint>
76 #elif !defined(__OPENCL_VERSION__)
77 	#include <stddef.h>
78 	#include <stdint.h>
79 #endif
80 
81 #if defined(__GNUC__)
/*
 * Element-type flags: set to 1 for the element types that have a vector
 * typedef below, and to 0 for those that do not (f64, u64, s64).
 */
#define PSIMD_HAVE_F64 0
#define PSIMD_HAVE_F32 1
#define PSIMD_HAVE_U8 1
#define PSIMD_HAVE_S8 1
#define PSIMD_HAVE_U16 1
#define PSIMD_HAVE_S16 1
#define PSIMD_HAVE_U32 1
#define PSIMD_HAVE_S32 1
#define PSIMD_HAVE_U64 0
#define PSIMD_HAVE_S64 0

/*
 * 16-byte vector types declared through the vector_size attribute.
 * The declared alignment of each type equals the size of one element
 * (1, 2 or 4 bytes) rather than the size of the whole vector.
 */
typedef int8_t   psimd_s8  __attribute__((vector_size(16), aligned(1)));
typedef uint8_t  psimd_u8  __attribute__((vector_size(16), aligned(1)));
typedef int16_t  psimd_s16 __attribute__((vector_size(16), aligned(2)));
typedef uint16_t psimd_u16 __attribute__((vector_size(16), aligned(2)));
typedef int32_t  psimd_s32 __attribute__((vector_size(16), aligned(4)));
typedef uint32_t psimd_u32 __attribute__((vector_size(16), aligned(4)));
typedef float    psimd_f32 __attribute__((vector_size(16), aligned(4)));

/* Two-vector aggregates: each holds a `lo` and a `hi` vector of the same type. */
typedef struct { psimd_s8 lo;  psimd_s8 hi;  } psimd_s8x2;
typedef struct { psimd_u8 lo;  psimd_u8 hi;  } psimd_u8x2;
typedef struct { psimd_s16 lo; psimd_s16 hi; } psimd_s16x2;
typedef struct { psimd_u16 lo; psimd_u16 hi; } psimd_u16x2;
typedef struct { psimd_s32 lo; psimd_s32 hi; } psimd_s32x2;
typedef struct { psimd_u32 lo; psimd_u32 hi; } psimd_u32x2;
typedef struct { psimd_f32 lo; psimd_f32 hi; } psimd_f32x2;
135 
136 	/* Bit casts */
psimd_cast_s32x2_u32x2(psimd_s32x2 v)137 	PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) {
138 		return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
139 	}
140 
psimd_cast_s32x2_f32x2(psimd_s32x2 v)141 	PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) {
142 		return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
143 	}
144 
psimd_cast_u32x2_s32x2(psimd_u32x2 v)145 	PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) {
146 		return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
147 	}
148 
psimd_cast_u32x2_f32x2(psimd_u32x2 v)149 	PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) {
150 		return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
151 	}
152 
psimd_cast_f32x2_s32x2(psimd_f32x2 v)153 	PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) {
154 		return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
155 	}
156 
psimd_cast_f32x2_u32x2(psimd_f32x2 v)157 	PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) {
158 		return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
159 	}
160 
161 	/* Swap */
psimd_swap_s8(psimd_s8 a[1],psimd_s8 b[1])162 	PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) {
163 		const psimd_s8 new_a = *b;
164 		const psimd_s8 new_b = *a;
165 		*a = new_a;
166 		*b = new_b;
167 	}
168 
psimd_swap_u8(psimd_u8 a[1],psimd_u8 b[1])169 	PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) {
170 		const psimd_u8 new_a = *b;
171 		const psimd_u8 new_b = *a;
172 		*a = new_a;
173 		*b = new_b;
174 	}
175 
psimd_swap_s16(psimd_s16 a[1],psimd_s16 b[1])176 	PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) {
177 		const psimd_s16 new_a = *b;
178 		const psimd_s16 new_b = *a;
179 		*a = new_a;
180 		*b = new_b;
181 	}
182 
psimd_swap_u16(psimd_u16 a[1],psimd_u16 b[1])183 	PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) {
184 		const psimd_u16 new_a = *b;
185 		const psimd_u16 new_b = *a;
186 		*a = new_a;
187 		*b = new_b;
188 	}
189 
psimd_swap_s32(psimd_s32 a[1],psimd_s32 b[1])190 	PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) {
191 		const psimd_s32 new_a = *b;
192 		const psimd_s32 new_b = *a;
193 		*a = new_a;
194 		*b = new_b;
195 	}
196 
psimd_swap_u32(psimd_u32 a[1],psimd_u32 b[1])197 	PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) {
198 		const psimd_u32 new_a = *b;
199 		const psimd_u32 new_b = *a;
200 		*a = new_a;
201 		*b = new_b;
202 	}
203 
psimd_swap_f32(psimd_f32 a[1],psimd_f32 b[1])204 	PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) {
205 		const psimd_f32 new_a = *b;
206 		const psimd_f32 new_b = *a;
207 		*a = new_a;
208 		*b = new_b;
209 	}
210 
211 	/* Zero-initialization */
psimd_zero_s8(void)212 	PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) {
213 		return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
214 	}
215 
psimd_zero_u8(void)216 	PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) {
217 		return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
218 	}
219 
psimd_zero_s16(void)220 	PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) {
221 		return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 };
222 	}
223 
psimd_zero_u16(void)224 	PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) {
225 		return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 };
226 	}
227 
psimd_zero_s32(void)228 	PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) {
229 		return (psimd_s32) { 0, 0, 0, 0 };
230 	}
231 
psimd_zero_u32(void)232 	PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) {
233 		return (psimd_u32) { 0, 0, 0, 0 };
234 	}
235 
psimd_zero_f32(void)236 	PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) {
237 		return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f };
238 	}
239 
240 	/* Initialization to the same constant */
psimd_splat_s8(int8_t c)241 	PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) {
242 		return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
243 	}
244 
psimd_splat_u8(uint8_t c)245 	PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) {
246 		return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
247 	}
248 
psimd_splat_s16(int16_t c)249 	PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) {
250 		return (psimd_s16) { c, c, c, c, c, c, c, c };
251 	}
252 
psimd_splat_u16(uint16_t c)253 	PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) {
254 		return (psimd_u16) { c, c, c, c, c, c, c, c };
255 	}
256 
psimd_splat_s32(int32_t c)257 	PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) {
258 		return (psimd_s32) { c, c, c, c };
259 	}
260 
psimd_splat_u32(uint32_t c)261 	PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) {
262 		return (psimd_u32) { c, c, c, c };
263 	}
264 
psimd_splat_f32(float c)265 	PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) {
266 		return (psimd_f32) { c, c, c, c };
267 	}
268 
269 	/* Load vector */
psimd_load_s8(const void * address)270 	PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) {
271 		return *((const psimd_s8*) address);
272 	}
273 
psimd_load_u8(const void * address)274 	PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) {
275 		return *((const psimd_u8*) address);
276 	}
277 
psimd_load_s16(const void * address)278 	PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) {
279 		return *((const psimd_s16*) address);
280 	}
281 
psimd_load_u16(const void * address)282 	PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) {
283 		return *((const psimd_u16*) address);
284 	}
285 
psimd_load_s32(const void * address)286 	PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) {
287 		return *((const psimd_s32*) address);
288 	}
289 
psimd_load_u32(const void * address)290 	PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) {
291 		return *((const psimd_u32*) address);
292 	}
293 
psimd_load_f32(const void * address)294 	PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) {
295 		return *((const psimd_f32*) address);
296 	}
297 
psimd_load_splat_s8(const void * address)298 	PSIMD_INTRINSIC psimd_s8 psimd_load_splat_s8(const void* address) {
299 		return psimd_splat_s8(*((const int8_t*) address));
300 	}
301 
psimd_load_splat_u8(const void * address)302 	PSIMD_INTRINSIC psimd_u8 psimd_load_splat_u8(const void* address) {
303 		return psimd_splat_u8(*((const uint8_t*) address));
304 	}
305 
psimd_load_splat_s16(const void * address)306 	PSIMD_INTRINSIC psimd_s16 psimd_load_splat_s16(const void* address) {
307 		return psimd_splat_s16(*((const int16_t*) address));
308 	}
309 
psimd_load_splat_u16(const void * address)310 	PSIMD_INTRINSIC psimd_u16 psimd_load_splat_u16(const void* address) {
311 		return psimd_splat_u16(*((const uint16_t*) address));
312 	}
313 
psimd_load_splat_s32(const void * address)314 	PSIMD_INTRINSIC psimd_s32 psimd_load_splat_s32(const void* address) {
315 		return psimd_splat_s32(*((const int32_t*) address));
316 	}
317 
psimd_load_splat_u32(const void * address)318 	PSIMD_INTRINSIC psimd_u32 psimd_load_splat_u32(const void* address) {
319 		return psimd_splat_u32(*((const uint32_t*) address));
320 	}
321 
psimd_load_splat_f32(const void * address)322 	PSIMD_INTRINSIC psimd_f32 psimd_load_splat_f32(const void* address) {
323 		return psimd_splat_f32(*((const float*) address));
324 	}
325 
psimd_load1_s32(const void * address)326 	PSIMD_INTRINSIC psimd_s32 psimd_load1_s32(const void* address) {
327 		return (psimd_s32) { *((const int32_t*) address), 0, 0, 0 };
328 	}
329 
psimd_load1_u32(const void * address)330 	PSIMD_INTRINSIC psimd_u32 psimd_load1_u32(const void* address) {
331 		return (psimd_u32) { *((const uint32_t*) address), 0, 0, 0 };
332 	}
333 
psimd_load1_f32(const void * address)334 	PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) {
335 		return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f };
336 	}
337 
psimd_load2_s32(const void * address)338 	PSIMD_INTRINSIC psimd_s32 psimd_load2_s32(const void* address) {
339 		const int32_t* address_s32 = (const int32_t*) address;
340 		return (psimd_s32) { address_s32[0], address_s32[1], 0, 0 };
341 	}
342 
psimd_load2_u32(const void * address)343 	PSIMD_INTRINSIC psimd_u32 psimd_load2_u32(const void* address) {
344 		const uint32_t* address_u32 = (const uint32_t*) address;
345 		return (psimd_u32) { address_u32[0], address_u32[1], 0, 0 };
346 	}
347 
psimd_load2_f32(const void * address)348 	PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) {
349 		const float* address_f32 = (const float*) address;
350 		return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f };
351 	}
352 
psimd_load3_s32(const void * address)353 	PSIMD_INTRINSIC psimd_s32 psimd_load3_s32(const void* address) {
354 		const int32_t* address_s32 = (const int32_t*) address;
355 		return (psimd_s32) { address_s32[0], address_s32[1], address_s32[2], 0 };
356 	}
357 
psimd_load3_u32(const void * address)358 	PSIMD_INTRINSIC psimd_u32 psimd_load3_u32(const void* address) {
359 		const uint32_t* address_u32 = (const uint32_t*) address;
360 		return (psimd_u32) { address_u32[0], address_u32[1], address_u32[2], 0 };
361 	}
362 
psimd_load3_f32(const void * address)363 	PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) {
364 		const float* address_f32 = (const float*) address;
365 		return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f };
366 	}
367 
psimd_load4_s32(const void * address)368 	PSIMD_INTRINSIC psimd_s32 psimd_load4_s32(const void* address) {
369 		return psimd_load_s32(address);
370 	}
371 
psimd_load4_u32(const void * address)372 	PSIMD_INTRINSIC psimd_u32 psimd_load4_u32(const void* address) {
373 		return psimd_load_u32(address);
374 	}
375 
psimd_load4_f32(const void * address)376 	PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) {
377 		return psimd_load_f32(address);
378 	}
379 
psimd_load_stride2_f32(const void * address)380 	PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) {
381 		const psimd_f32 v0x1x = psimd_load_f32(address);
382 		const psimd_f32 vx2x3 = psimd_load_f32((const float*) address + 3);
383 		#if defined(__clang__)
384 			return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7);
385 		#else
386 			return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 });
387 		#endif
388 	}
389 
psimd_load1_stride2_f32(const void * address)390 	PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) {
391 		return psimd_load_f32(address);
392 	}
393 
psimd_load2_stride2_f32(const void * address)394 	PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) {
395 		const float* address_f32 = (const float*) address;
396 		return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f };
397 	}
398 
psimd_load3_stride2_f32(const void * address)399 	PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) {
400 		const psimd_f32 v0x1x = psimd_load_f32(address);
401 		const psimd_f32 v2zzz = psimd_load1_f32((const float*) address + 2);
402 		#if defined(__clang__)
403 			return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6);
404 		#else
405 			return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 });
406 		#endif
407 	}
408 
psimd_load4_stride2_f32(const void * address)409 	PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) {
410 		return psimd_load_stride2_f32(address);
411 	}
412 
psimd_load_stride_f32(const void * address,size_t stride)413 	PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) {
414 		const float* address0_f32 = (const float*) address;
415 		const float* address1_f32 = address0_f32 + stride;
416 		const float* address2_f32 = address1_f32 + stride;
417 		const float* address3_f32 = address2_f32 + stride;
418 		return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 };
419 	}
420 
psimd_load1_stride_f32(const void * address,size_t stride)421 	PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) {
422 		return psimd_load1_f32(address);
423 	}
424 
psimd_load2_stride_f32(const void * address,size_t stride)425 	PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) {
426 		const float* address_f32 = (const float*) address;
427 		return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f };
428 	}
429 
psimd_load3_stride_f32(const void * address,size_t stride)430 	PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) {
431 		const float* address0_f32 = (const float*) address;
432 		const float* address1_f32 = address0_f32 + stride;
433 		const float* address2_f32 = address1_f32 + stride;
434 		return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, 0.0f };
435 	}
436 
psimd_load4_stride_f32(const void * address,size_t stride)437 	PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) {
438 		return psimd_load_stride_f32(address, stride);
439 	}
440 
441 	/* Store vector */
psimd_store_s8(void * address,psimd_s8 value)442 	PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) {
443 		*((psimd_s8*) address) = value;
444 	}
445 
psimd_store_u8(void * address,psimd_u8 value)446 	PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) {
447 		*((psimd_u8*) address) = value;
448 	}
449 
psimd_store_s16(void * address,psimd_s16 value)450 	PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) {
451 		*((psimd_s16*) address) = value;
452 	}
453 
psimd_store_u16(void * address,psimd_u16 value)454 	PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) {
455 		*((psimd_u16*) address) = value;
456 	}
457 
psimd_store_s32(void * address,psimd_s32 value)458 	PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) {
459 		*((psimd_s32*) address) = value;
460 	}
461 
psimd_store_u32(void * address,psimd_u32 value)462 	PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) {
463 		*((psimd_u32*) address) = value;
464 	}
465 
psimd_store_f32(void * address,psimd_f32 value)466 	PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) {
467 		*((psimd_f32*) address) = value;
468 	}
469 
psimd_store1_s32(void * address,psimd_s32 value)470 	PSIMD_INTRINSIC void psimd_store1_s32(void* address, psimd_s32 value) {
471 		*((int32_t*) address) = value[0];
472 	}
473 
psimd_store1_u32(void * address,psimd_u32 value)474 	PSIMD_INTRINSIC void psimd_store1_u32(void* address, psimd_u32 value) {
475 		*((uint32_t*) address) = value[0];
476 	}
477 
psimd_store1_f32(void * address,psimd_f32 value)478 	PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) {
479 		*((float*) address) = value[0];
480 	}
481 
psimd_store2_s32(void * address,psimd_s32 value)482 	PSIMD_INTRINSIC void psimd_store2_s32(void* address, psimd_s32 value) {
483 		int32_t* address_s32 = (int32_t*) address;
484 		address_s32[0] = value[0];
485 		address_s32[1] = value[1];
486 	}
487 
psimd_store2_u32(void * address,psimd_u32 value)488 	PSIMD_INTRINSIC void psimd_store2_u32(void* address, psimd_u32 value) {
489 		uint32_t* address_u32 = (uint32_t*) address;
490 		address_u32[0] = value[0];
491 		address_u32[1] = value[1];
492 	}
493 
psimd_store2_f32(void * address,psimd_f32 value)494 	PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) {
495 		float* address_f32 = (float*) address;
496 		address_f32[0] = value[0];
497 		address_f32[1] = value[1];
498 	}
499 
psimd_store3_s32(void * address,psimd_s32 value)500 	PSIMD_INTRINSIC void psimd_store3_s32(void* address, psimd_s32 value) {
501 		int32_t* address_s32 = (int32_t*) address;
502 		address_s32[0] = value[0];
503 		address_s32[1] = value[1];
504 		address_s32[2] = value[2];
505 	}
506 
psimd_store3_u32(void * address,psimd_u32 value)507 	PSIMD_INTRINSIC void psimd_store3_u32(void* address, psimd_u32 value) {
508 		uint32_t* address_u32 = (uint32_t*) address;
509 		address_u32[0] = value[0];
510 		address_u32[1] = value[1];
511 		address_u32[2] = value[2];
512 	}
513 
psimd_store3_f32(void * address,psimd_f32 value)514 	PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) {
515 		float* address_f32 = (float*) address;
516 		address_f32[0] = value[0];
517 		address_f32[1] = value[1];
518 		address_f32[2] = value[2];
519 	}
520 
psimd_store4_s32(void * address,psimd_s32 value)521 	PSIMD_INTRINSIC void psimd_store4_s32(void* address, psimd_s32 value) {
522 		psimd_store_s32(address, value);
523 	}
524 
psimd_store4_u32(void * address,psimd_u32 value)525 	PSIMD_INTRINSIC void psimd_store4_u32(void* address, psimd_u32 value) {
526 		psimd_store_u32(address, value);
527 	}
528 
psimd_store4_f32(void * address,psimd_f32 value)529 	PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) {
530 		psimd_store_f32(address, value);
531 	}
532 
psimd_store_stride_f32(void * address,size_t stride,psimd_f32 value)533 	PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) {
534 		float* address0_f32 = (float*) address;
535 		float* address1_f32 = address0_f32 + stride;
536 		float* address2_f32 = address1_f32 + stride;
537 		float* address3_f32 = address2_f32 + stride;
538 		*address0_f32 = value[0];
539 		*address1_f32 = value[1];
540 		*address2_f32 = value[2];
541 		*address3_f32 = value[3];
542 	}
543 
psimd_store1_stride_f32(void * address,size_t stride,psimd_f32 value)544 	PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, psimd_f32 value) {
545 		psimd_store1_f32(address, value);
546 	}
547 
psimd_store2_stride_f32(void * address,size_t stride,psimd_f32 value)548 	PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) {
549 		float* address_f32 = (float*) address;
550 		address_f32[0]      = value[0];
551 		address_f32[stride] = value[1];
552 	}
553 
psimd_store3_stride_f32(void * address,size_t stride,psimd_f32 value)554 	PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) {
555 		float* address0_f32 = (float*) address;
556 		float* address1_f32 = address0_f32 + stride;
557 		float* address2_f32 = address1_f32 + stride;
558 		*address0_f32 = value[0];
559 		*address1_f32 = value[1];
560 		*address2_f32 = value[2];
561 	}
562 
563 	/* Vector addition */
psimd_add_s8(psimd_s8 a,psimd_s8 b)564 	PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) {
565 		return a + b;
566 	}
567 
psimd_add_u8(psimd_u8 a,psimd_u8 b)568 	PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) {
569 		return a + b;
570 	}
571 
psimd_add_s16(psimd_s16 a,psimd_s16 b)572 	PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) {
573 		return a + b;
574 	}
575 
psimd_add_u16(psimd_u16 a,psimd_u16 b)576 	PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) {
577 		return a + b;
578 	}
579 
psimd_add_s32(psimd_s32 a,psimd_s32 b)580 	PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) {
581 		return a + b;
582 	}
583 
psimd_add_u32(psimd_u32 a,psimd_u32 b)584 	PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) {
585 		return a + b;
586 	}
587 
psimd_add_f32(psimd_f32 a,psimd_f32 b)588 	PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) {
589 		#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
590 			return (psimd_f32) vaddq_f32((float32x4_t) a, (float32x4_t) b);
591 		#else
592 			return a + b;
593 		#endif
594 	}
595 
596 	/* Vector subtraction */
psimd_sub_s8(psimd_s8 a,psimd_s8 b)597 	PSIMD_INTRINSIC psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) {
598 		return a - b;
599 	}
600 
psimd_sub_u8(psimd_u8 a,psimd_u8 b)601 	PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) {
602 		return a - b;
603 	}
604 
psimd_sub_s16(psimd_s16 a,psimd_s16 b)605 	PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) {
606 		return a - b;
607 	}
608 
psimd_sub_u16(psimd_u16 a,psimd_u16 b)609 	PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) {
610 		return a - b;
611 	}
612 
psimd_sub_s32(psimd_s32 a,psimd_s32 b)613 	PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) {
614 		return a - b;
615 	}
616 
psimd_sub_u32(psimd_u32 a,psimd_u32 b)617 	PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) {
618 		return a - b;
619 	}
620 
psimd_sub_f32(psimd_f32 a,psimd_f32 b)621 	PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) {
622 		#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
623 			return (psimd_f32) vsubq_f32((float32x4_t) a, (float32x4_t) b);
624 		#else
625 			return a - b;
626 		#endif
627 	}
628 
629 	/* Vector multiplication */
psimd_mul_s8(psimd_s8 a,psimd_s8 b)630 	PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) {
631 		return a * b;
632 	}
633 
psimd_mul_u8(psimd_u8 a,psimd_u8 b)634 	PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) {
635 		return a * b;
636 	}
637 
psimd_mul_s16(psimd_s16 a,psimd_s16 b)638 	PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) {
639 		return a * b;
640 	}
641 
psimd_mul_u16(psimd_u16 a,psimd_u16 b)642 	PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) {
643 		return a * b;
644 	}
645 
psimd_mul_s32(psimd_s32 a,psimd_s32 b)646 	PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) {
647 		return a * b;
648 	}
649 
psimd_mul_u32(psimd_u32 a,psimd_u32 b)650 	PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) {
651 		return a * b;
652 	}
653 
psimd_mul_f32(psimd_f32 a,psimd_f32 b)654 	PSIMD_INTRINSIC psimd_f32 psimd_mul_f32(psimd_f32 a, psimd_f32 b) {
655 		#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
656 			return (psimd_f32) vmulq_f32((float32x4_t) a, (float32x4_t) b);
657 		#else
658 			return a * b;
659 		#endif
660 	}
661 
662 	/* Quasi-Fused Multiply-Add */
psimd_qfma_f32(psimd_f32 a,psimd_f32 b,psimd_f32 c)663 	PSIMD_INTRINSIC psimd_f32 psimd_qfma_f32(psimd_f32 a, psimd_f32 b, psimd_f32 c) {
664 		#if defined(__aarch64__) || defined(__ARM_NEON__) && defined(__ARM_FEATURE_FMA)
665 			return (psimd_f32) vfmaq_f32((float32x4_t) a, (float32x4_t) b, (float32x4_t) c);
666 		#elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA__)
667 			return (psimd_f32) _mm_fmadd_ps((__m128) c, (__m128) a, (__m128) b);
668 		#elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA4__)
669 			return (psimd_f32) _mm_macc_ps((__m128) c, (__m128) a, (__m128) b);
670 		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
671 			return (psimd_f32) __builtin_wasm_qfma_f32x4(a, b, c);
672 		#else
673 			return a + b * c;
674 		#endif
675 	}
676 
psimd_div_f32(psimd_f32 a,psimd_f32 b)677 	PSIMD_INTRINSIC psimd_f32 psimd_div_f32(psimd_f32 a, psimd_f32 b) {
678 		return a / b;
679 	}
680 
681 	/* Vector and */
psimd_andmask_f32(psimd_s32 mask,psimd_f32 v)682 	PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) {
683 		return (psimd_f32) (mask & (psimd_s32) v);
684 	}
685 
686 	/* Vector and-not */
psimd_andnotmask_f32(psimd_s32 mask,psimd_f32 v)687 	PSIMD_INTRINSIC psimd_f32 psimd_andnotmask_f32(psimd_s32 mask, psimd_f32 v) {
688 		return (psimd_f32) (~mask & (psimd_s32) v);
689 	}
690 
691 	/* Vector blend */
psimd_blend_s8(psimd_s8 mask,psimd_s8 a,psimd_s8 b)692 	PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) {
693 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
694 			return (psimd_s8) vbslq_s8((uint8x16_t) mask, (int8x16_t) a, (int8x16_t) b);
695 		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
696 			return (psimd_s8) __builtin_wasm_bitselect(a, b, mask);
697 		#else
698 			return (mask & a) | (~mask & b);
699 		#endif
700 	}
701 
psimd_blend_u8(psimd_s8 mask,psimd_u8 a,psimd_u8 b)702 	PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_s8 mask, psimd_u8 a, psimd_u8 b) {
703 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
704 			return (psimd_u8) vbslq_u8((uint8x16_t) mask, (uint8x16_t) a, (uint8x16_t) b);
705 		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
706 			return (psimd_u8) __builtin_wasm_bitselect(a, b, mask);
707 		#else
708 			return (psimd_u8) ((mask & (psimd_s8) a) | (~mask & (psimd_s8) b));
709 		#endif
710 	}
711 
psimd_blend_s16(psimd_s16 mask,psimd_s16 a,psimd_s16 b)712 	PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) {
713 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
714 			return (psimd_s16) vbslq_s16((uint16x8_t) mask, (int16x8_t) a, (int16x8_t) b);
715 		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
716 			return (psimd_s16) __builtin_wasm_bitselect(a, b, mask);
717 		#else
718 			return (mask & a) | (~mask & b);
719 		#endif
720 	}
721 
psimd_blend_u16(psimd_s16 mask,psimd_u16 a,psimd_u16 b)722 	PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_s16 mask, psimd_u16 a, psimd_u16 b) {
723 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
724 			return (psimd_u16) vbslq_u16((uint16x8_t) mask, (uint16x8_t) a, (uint16x8_t) b);
725 		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
726 			return (psimd_u16) __builtin_wasm_bitselect(a, b, mask);
727 		#else
728 			return (psimd_u16) ((mask & (psimd_s16) a) | (~mask & (psimd_s16) b));
729 		#endif
730 	}
731 
psimd_blend_s32(psimd_s32 mask,psimd_s32 a,psimd_s32 b)732 	PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) {
733 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
734 			return (psimd_s32) vbslq_s32((uint32x4_t) mask, (int32x4_t) a, (int32x4_t) b);
735 		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
736 			return (psimd_s32) __builtin_wasm_bitselect(a, b, mask);
737 		#else
738 			return (mask & a) | (~mask & b);
739 		#endif
740 	}
741 
psimd_blend_u32(psimd_s32 mask,psimd_u32 a,psimd_u32 b)742 	PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_s32 mask, psimd_u32 a, psimd_u32 b) {
743 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
744 			return (psimd_u32) vbslq_u32((uint32x4_t) mask, (uint32x4_t) a, (uint32x4_t) b);
745 		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
746 			return (psimd_u32) __builtin_wasm_bitselect(a, b, mask);
747 		#else
748 			return (psimd_u32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
749 		#endif
750 	}
751 
psimd_blend_f32(psimd_s32 mask,psimd_f32 a,psimd_f32 b)752 	PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) {
753 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
754 			return (psimd_f32) vbslq_f32((uint32x4_t) mask, (float32x4_t) a, (float32x4_t) b);
755 		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
756 			return (psimd_f32) __builtin_wasm_bitselect(a, b, mask);
757 		#else
758 			return (psimd_f32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
759 		#endif
760 	}
761 
762 	/* Vector blend on sign */
psimd_signblend_s8(psimd_s8 x,psimd_s8 a,psimd_s8 b)763 	PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) {
764 		return psimd_blend_s8(x >> psimd_splat_s8(7), a, b);
765 	}
766 
psimd_signblend_u8(psimd_s8 x,psimd_u8 a,psimd_u8 b)767 	PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) {
768 		return psimd_blend_u8((x >> psimd_splat_s8(7)), a, b);
769 	}
770 
psimd_signblend_s16(psimd_s16 x,psimd_s16 a,psimd_s16 b)771 	PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) {
772 		return psimd_blend_s16(x >> psimd_splat_s16(15), a, b);
773 	}
774 
psimd_signblend_u16(psimd_s16 x,psimd_u16 a,psimd_u16 b)775 	PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) {
776 		return psimd_blend_u16((x >> psimd_splat_s16(15)), a, b);
777 	}
778 
psimd_signblend_s32(psimd_s32 x,psimd_s32 a,psimd_s32 b)779 	PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) {
780 		return psimd_blend_s32(x >> psimd_splat_s32(31), a, b);
781 	}
782 
psimd_signblend_u32(psimd_s32 x,psimd_u32 a,psimd_u32 b)783 	PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) {
784 		return psimd_blend_u32((x >> psimd_splat_s32(31)), a, b);
785 	}
786 
psimd_signblend_f32(psimd_f32 x,psimd_f32 a,psimd_f32 b)787 	PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) {
788 		const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31);
789 		return psimd_blend_f32(mask, a, b);
790 	}
791 
792 	/* Vector absolute value */
psimd_abs_f32(psimd_f32 v)793 	PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) {
794 		const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
795 		return (psimd_f32) ((psimd_s32) v & ~mask);
796 	}
797 
798 	/* Vector negation */
psimd_neg_f32(psimd_f32 v)799 	PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) {
800 		const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
801 		return (psimd_f32) ((psimd_s32) v ^ mask);
802 	}
803 
804 	/* Vector maximum */
psimd_max_s8(psimd_s8 a,psimd_s8 b)805 	PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) {
806 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
807 			return (psimd_s8) vmaxq_s8((int8x16_t) a, (int8x16_t) b);
808 		#else
809 			return psimd_blend_s8(a > b, a, b);
810 		#endif
811 	}
812 
psimd_max_u8(psimd_u8 a,psimd_u8 b)813 	PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) {
814 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
815 			return (psimd_u8) vmaxq_u8((uint8x16_t) a, (uint8x16_t) b);
816 		#else
817 			return psimd_blend_u8(a > b, a, b);
818 		#endif
819 	}
820 
psimd_max_s16(psimd_s16 a,psimd_s16 b)821 	PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) {
822 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
823 			return (psimd_s16) vmaxq_s16((int16x8_t) a, (int16x8_t) b);
824 		#else
825 			return psimd_blend_s16(a > b, a, b);
826 		#endif
827 	}
828 
psimd_max_u16(psimd_u16 a,psimd_u16 b)829 	PSIMD_INTRINSIC psimd_u16 psimd_max_u16(psimd_u16 a, psimd_u16 b) {
830 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
831 			return (psimd_u16) vmaxq_u16((uint16x8_t) a, (uint16x8_t) b);
832 		#else
833 			return psimd_blend_u16(a > b, a, b);
834 		#endif
835 	}
836 
psimd_max_s32(psimd_s32 a,psimd_s32 b)837 	PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) {
838 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
839 			return (psimd_s32) vmaxq_s32((int32x4_t) a, (int32x4_t) b);
840 		#else
841 			return psimd_blend_s32(a > b, a, b);
842 		#endif
843 	}
844 
psimd_max_u32(psimd_u32 a,psimd_u32 b)845 	PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) {
846 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
847 			return (psimd_u32) vmaxq_u32((uint32x4_t) a, (uint32x4_t) b);
848 		#else
849 			return psimd_blend_u32(a > b, a, b);
850 		#endif
851 	}
852 
psimd_max_f32(psimd_f32 a,psimd_f32 b)853 	PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) {
854 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
855 			return (psimd_f32) vmaxq_f32((float32x4_t) a, (float32x4_t) b);
856 		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
857 			return __builtin_wasm_max_f32x4(a, b);
858 		#else
859 			return psimd_blend_f32(a > b, a, b);
860 		#endif
861 	}
862 
863 	/* Vector minimum */
psimd_min_s8(psimd_s8 a,psimd_s8 b)864 	PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) {
865 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
866 			return (psimd_s8) vminq_s8((int8x16_t) a, (int8x16_t) b);
867 		#else
868 			return psimd_blend_s8(a < b, a, b);
869 		#endif
870 	}
871 
psimd_min_u8(psimd_u8 a,psimd_u8 b)872 	PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) {
873 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
874 			return (psimd_u8) vminq_u8((uint8x16_t) a, (uint8x16_t) b);
875 		#else
876 			return psimd_blend_u8(a < b, a, b);
877 		#endif
878 	}
879 
psimd_min_s16(psimd_s16 a,psimd_s16 b)880 	PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) {
881 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
882 			return (psimd_s16) vminq_s16((int16x8_t) a, (int16x8_t) b);
883 		#else
884 			return psimd_blend_s16(a < b, a, b);
885 		#endif
886 	}
887 
psimd_min_u16(psimd_u16 a,psimd_u16 b)888 	PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) {
889 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
890 			return (psimd_u16) vminq_u16((uint16x8_t) a, (uint16x8_t) b);
891 		#else
892 			return psimd_blend_u16(a < b, a, b);
893 		#endif
894 	}
895 
psimd_min_s32(psimd_s32 a,psimd_s32 b)896 	PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) {
897 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
898 			return (psimd_s32) vminq_s32((int32x4_t) a, (int32x4_t) b);
899 		#else
900 			return psimd_blend_s32(a < b, a, b);
901 		#endif
902 	}
903 
psimd_min_u32(psimd_u32 a,psimd_u32 b)904 	PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) {
905 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
906 			return (psimd_u32) vminq_u32((uint32x4_t) a, (uint32x4_t) b);
907 		#else
908 			return psimd_blend_u32(a < b, a, b);
909 		#endif
910 	}
911 
psimd_min_f32(psimd_f32 a,psimd_f32 b)912 	PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) {
913 		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
914 			return (psimd_f32) vminq_f32((float32x4_t) a, (float32x4_t) b);
915 		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
916 			return __builtin_wasm_min_f32x4(a, b);
917 		#else
918 			return psimd_blend_f32(a < b, a, b);
919 		#endif
920 	}
921 
psimd_cvt_s32_f32(psimd_s32 v)922 	PSIMD_INTRINSIC psimd_f32 psimd_cvt_s32_f32(psimd_s32 v) {
923 		#if defined(__clang__)
924 			return __builtin_convertvector(v, psimd_f32);
925 		#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
926 			return (psimd_f32) vcvtq_f32_s32((int32x4_t) v);
927 		#elif defined(__SSE2__)
928 			return (psimd_f32) _mm_cvtepi32_ps((__m128i) v);
929 		#else
930 			return (psimd_f32) { (float) v[0], (float) v[1], (float) v[2], (float) v[3] };
931 		#endif
932 	}
933 
934 	/* Broadcast vector element */
935 	#if defined(__clang__)
psimd_splat0_f32(psimd_f32 v)936 		PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
937 			return __builtin_shufflevector(v, v, 0, 0, 0, 0);
938 		}
939 
psimd_splat1_f32(psimd_f32 v)940 		PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
941 			return __builtin_shufflevector(v, v, 1, 1, 1, 1);
942 		}
943 
psimd_splat2_f32(psimd_f32 v)944 		PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
945 			return __builtin_shufflevector(v, v, 2, 2, 2, 2);
946 		}
947 
psimd_splat3_f32(psimd_f32 v)948 		PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
949 			return __builtin_shufflevector(v, v, 3, 3, 3, 3);
950 		}
951 	#else
psimd_splat0_f32(psimd_f32 v)952 		PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
953 			return __builtin_shuffle(v, (psimd_s32) { 0, 0, 0, 0 });
954 		}
955 
psimd_splat1_f32(psimd_f32 v)956 		PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
957 			return __builtin_shuffle(v, (psimd_s32) { 1, 1, 1, 1 });
958 		}
959 
psimd_splat2_f32(psimd_f32 v)960 		PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
961 			return __builtin_shuffle(v, (psimd_s32) { 2, 2, 2, 2 });
962 		}
963 
psimd_splat3_f32(psimd_f32 v)964 		PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
965 			return __builtin_shuffle(v, (psimd_s32) { 3, 3, 3, 3 });
966 		}
967 	#endif
968 
969 	/* Reversal of vector elements */
970 	#if defined(__clang__)
psimd_reverse_s8(psimd_s8 v)971 		PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
972 			return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
973 		}
974 
psimd_reverse_u8(psimd_u8 v)975 		PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
976 			return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
977 		}
978 
psimd_reverse_s16(psimd_s16 v)979 		PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
980 			return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
981 		}
982 
psimd_reverse_u16(psimd_u16 v)983 		PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
984 			return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
985 		}
986 
psimd_reverse_s32(psimd_s32 v)987 		PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
988 			return __builtin_shufflevector(v, v, 3, 2, 1, 0);
989 		}
990 
psimd_reverse_u32(psimd_u32 v)991 		PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
992 			return __builtin_shufflevector(v, v, 3, 2, 1, 0);
993 		}
994 
psimd_reverse_f32(psimd_f32 v)995 		PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
996 			return __builtin_shufflevector(v, v, 3, 2, 1, 0);
997 		}
998 	#else
psimd_reverse_s8(psimd_s8 v)999 		PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
1000 			return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
1001 		}
1002 
psimd_reverse_u8(psimd_u8 v)1003 		PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
1004 			return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
1005 		}
1006 
psimd_reverse_s16(psimd_s16 v)1007 		PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
1008 			return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
1009 		}
1010 
psimd_reverse_u16(psimd_u16 v)1011 		PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
1012 			return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
1013 		}
1014 
psimd_reverse_s32(psimd_s32 v)1015 		PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
1016 			return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
1017 		}
1018 
psimd_reverse_u32(psimd_u32 v)1019 		PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
1020 			return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
1021 		}
1022 
psimd_reverse_f32(psimd_f32 v)1023 		PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
1024 			return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
1025 		}
1026 	#endif
1027 
1028 	/* Interleaving of vector elements */
1029 	#if defined(__clang__)
psimd_interleave_lo_s16(psimd_s16 a,psimd_s16 b)1030 		PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
1031 			return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1032 		}
1033 
psimd_interleave_hi_s16(psimd_s16 a,psimd_s16 b)1034 		PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
1035 			return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1036 		}
1037 
psimd_interleave_lo_u16(psimd_u16 a,psimd_u16 b)1038 		PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
1039 			return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1040 		}
1041 
psimd_interleave_hi_u16(psimd_u16 a,psimd_u16 b)1042 		PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
1043 			return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1044 		}
1045 
psimd_interleave_lo_s32(psimd_s32 a,psimd_s32 b)1046 		PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
1047 			return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
1048 		}
1049 
psimd_interleave_hi_s32(psimd_s32 a,psimd_s32 b)1050 		PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
1051 			return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
1052 		}
1053 
psimd_interleave_lo_u32(psimd_u32 a,psimd_u32 b)1054 		PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
1055 			return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
1056 		}
1057 
psimd_interleave_hi_u32(psimd_u32 a,psimd_u32 b)1058 		PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
1059 			return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
1060 		}
1061 
psimd_interleave_lo_f32(psimd_f32 a,psimd_f32 b)1062 		PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
1063 			return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
1064 		}
1065 
psimd_interleave_hi_f32(psimd_f32 a,psimd_f32 b)1066 		PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
1067 			return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
1068 		}
1069 	#else
psimd_interleave_lo_s16(psimd_s16 a,psimd_s16 b)1070 		PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
1071 			return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
1072 		}
1073 
psimd_interleave_hi_s16(psimd_s16 a,psimd_s16 b)1074 		PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
1075 			return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
1076 		}
1077 
psimd_interleave_lo_u16(psimd_u16 a,psimd_u16 b)1078 		PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
1079 			return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
1080 		}
1081 
psimd_interleave_hi_u16(psimd_u16 a,psimd_u16 b)1082 		PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
1083 			return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
1084 		}
1085 
psimd_interleave_lo_s32(psimd_s32 a,psimd_s32 b)1086 		PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
1087 			return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
1088 		}
1089 
psimd_interleave_hi_s32(psimd_s32 a,psimd_s32 b)1090 		PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
1091 			return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
1092 		}
1093 
psimd_interleave_lo_u32(psimd_u32 a,psimd_u32 b)1094 		PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
1095 			return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
1096 		}
1097 
psimd_interleave_hi_u32(psimd_u32 a,psimd_u32 b)1098 		PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
1099 			return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
1100 		}
1101 
psimd_interleave_lo_f32(psimd_f32 a,psimd_f32 b)1102 		PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
1103 			return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
1104 		}
1105 
psimd_interleave_hi_f32(psimd_f32 a,psimd_f32 b)1106 		PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
1107 			return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
1108 		}
1109 	#endif
1110 
1111 	/* Concatenation of low/high vector elements */
1112 	#if defined(__clang__)
psimd_concat_lo_s16(psimd_s16 a,psimd_s16 b)1113 		PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
1114 			return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
1115 		}
1116 
psimd_concat_hi_s16(psimd_s16 a,psimd_s16 b)1117 		PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
1118 			return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
1119 		}
1120 
psimd_concat_lo_u16(psimd_u16 a,psimd_u16 b)1121 		PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
1122 			return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
1123 		}
1124 
psimd_concat_hi_u16(psimd_u16 a,psimd_u16 b)1125 		PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
1126 			return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
1127 		}
1128 
psimd_concat_lo_s32(psimd_s32 a,psimd_s32 b)1129 		PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
1130 			return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
1131 		}
1132 
psimd_concat_hi_s32(psimd_s32 a,psimd_s32 b)1133 		PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
1134 			return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
1135 		}
1136 
psimd_concat_lo_u32(psimd_u32 a,psimd_u32 b)1137 		PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
1138 			return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
1139 		}
1140 
psimd_concat_hi_u32(psimd_u32 a,psimd_u32 b)1141 		PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
1142 			return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
1143 		}
1144 
psimd_concat_lo_f32(psimd_f32 a,psimd_f32 b)1145 		PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
1146 			return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
1147 		}
1148 
psimd_concat_hi_f32(psimd_f32 a,psimd_f32 b)1149 		PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
1150 			return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
1151 		}
1152 	#else
psimd_concat_lo_s16(psimd_s16 a,psimd_s16 b)1153 		PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
1154 			return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
1155 		}
1156 
psimd_concat_hi_s16(psimd_s16 a,psimd_s16 b)1157 		PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
1158 			return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
1159 		}
1160 
psimd_concat_lo_u16(psimd_u16 a,psimd_u16 b)1161 		PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
1162 			return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
1163 		}
1164 
psimd_concat_hi_u16(psimd_u16 a,psimd_u16 b)1165 		PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
1166 			return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
1167 		}
1168 
psimd_concat_lo_s32(psimd_s32 a,psimd_s32 b)1169 		PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
1170 			return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
1171 		}
1172 
psimd_concat_hi_s32(psimd_s32 a,psimd_s32 b)1173 		PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
1174 			return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
1175 		}
1176 
psimd_concat_lo_u32(psimd_u32 a,psimd_u32 b)1177 		PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
1178 			return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
1179 		}
1180 
psimd_concat_hi_u32(psimd_u32 a,psimd_u32 b)1181 		PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
1182 			return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
1183 		}
1184 
psimd_concat_lo_f32(psimd_f32 a,psimd_f32 b)1185 		PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
1186 			return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
1187 		}
1188 
psimd_concat_hi_f32(psimd_f32 a,psimd_f32 b)1189 		PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
1190 			return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
1191 		}
1192 	#endif
1193 
1194 	/* Concatenation of even/odd vector elements */
1195 	#if defined(__clang__)
psimd_concat_even_s8(psimd_s8 a,psimd_s8 b)1196 		PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
1197 			return __builtin_shufflevector(a, b,
1198 				0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
1199 		}
1200 
psimd_concat_odd_s8(psimd_s8 a,psimd_s8 b)1201 		PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
1202 			return __builtin_shufflevector(a, b,
1203 				1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
1204 		}
1205 
psimd_concat_even_u8(psimd_u8 a,psimd_u8 b)1206 		PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
1207 			return __builtin_shufflevector(a, b,
1208 				0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
1209 		}
1210 
psimd_concat_odd_u8(psimd_u8 a,psimd_u8 b)1211 		PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
1212 			return __builtin_shufflevector(a, b,
1213 				1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
1214 		}
1215 
psimd_concat_even_s16(psimd_s16 a,psimd_s16 b)1216 		PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
1217 			return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
1218 		}
1219 
psimd_concat_odd_s16(psimd_s16 a,psimd_s16 b)1220 		PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
1221 			return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
1222 		}
1223 
psimd_concat_even_u16(psimd_u16 a,psimd_u16 b)1224 		PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
1225 			return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
1226 		}
1227 
psimd_concat_odd_u16(psimd_u16 a,psimd_u16 b)1228 		PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
1229 			return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
1230 		}
1231 
psimd_concat_even_s32(psimd_s32 a,psimd_s32 b)1232 		PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
1233 			return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
1234 		}
1235 
psimd_concat_odd_s32(psimd_s32 a,psimd_s32 b)1236 		PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
1237 			return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
1238 		}
1239 
psimd_concat_even_u32(psimd_u32 a,psimd_u32 b)1240 		PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
1241 			return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
1242 		}
1243 
psimd_concat_odd_u32(psimd_u32 a,psimd_u32 b)1244 		PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
1245 			return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
1246 		}
1247 
psimd_concat_even_f32(psimd_f32 a,psimd_f32 b)1248 		PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
1249 			return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
1250 		}
1251 
psimd_concat_odd_f32(psimd_f32 a,psimd_f32 b)1252 		PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
1253 			return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
1254 		}
1255 	#else
psimd_concat_even_s8(psimd_s8 a,psimd_s8 b)1256 		PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
1257 			return __builtin_shuffle(a, b,
1258 				(psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
1259 		}
1260 
psimd_concat_odd_s8(psimd_s8 a,psimd_s8 b)1261 		PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
1262 			return __builtin_shuffle(a, b,
1263 				(psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
1264 		}
1265 
psimd_concat_even_u8(psimd_u8 a,psimd_u8 b)1266 		PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
1267 			return __builtin_shuffle(a, b,
1268 				(psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
1269 		}
1270 
psimd_concat_odd_u8(psimd_u8 a,psimd_u8 b)1271 		PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
1272 			return __builtin_shuffle(a, b,
1273 				(psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
1274 		}
1275 
psimd_concat_even_s16(psimd_s16 a,psimd_s16 b)1276 		PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
1277 			return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
1278 		}
1279 
psimd_concat_odd_s16(psimd_s16 a,psimd_s16 b)1280 		PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
1281 			return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
1282 		}
1283 
psimd_concat_even_u16(psimd_u16 a,psimd_u16 b)1284 		PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
1285 			return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
1286 		}
1287 
psimd_concat_odd_u16(psimd_u16 a,psimd_u16 b)1288 		PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
1289 			return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
1290 		}
1291 
psimd_concat_even_s32(psimd_s32 a,psimd_s32 b)1292 		PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
1293 			return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
1294 		}
1295 
psimd_concat_odd_s32(psimd_s32 a,psimd_s32 b)1296 		PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
1297 			return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
1298 		}
1299 
psimd_concat_even_u32(psimd_u32 a,psimd_u32 b)1300 		PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
1301 			return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
1302 		}
1303 
psimd_concat_odd_u32(psimd_u32 a,psimd_u32 b)1304 		PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
1305 			return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
1306 		}
1307 
psimd_concat_even_f32(psimd_f32 a,psimd_f32 b)1308 		PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
1309 			return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
1310 		}
1311 
psimd_concat_odd_f32(psimd_f32 a,psimd_f32 b)1312 		PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
1313 			return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
1314 		}
1315 	#endif
1316 
1317 	/* Vector reduce */
1318 	#if defined(__clang__)
psimd_allreduce_sum_f32(psimd_f32 v)1319 		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
1320 			const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1);
1321 			return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2);
1322 		}
1323 
psimd_allreduce_max_f32(psimd_f32 v)1324 		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
1325 			const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
1326 			return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
1327 		}
1328 
psimd_allreduce_min_f32(psimd_f32 v)1329 		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
1330 			const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
1331 			return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
1332 		}
1333 
psimd_reduce_sum_f32(psimd_f32 v)1334 		PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
1335 			const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1);
1336 			const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1);
1337 			return result[0];
1338 		}
1339 
psimd_reduce_max_f32(psimd_f32 v)1340 		PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
1341 			const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
1342 			const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
1343 			return result[0];
1344 		}
1345 
psimd_reduce_min_f32(psimd_f32 v)1346 		PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
1347 			const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
1348 			const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
1349 			return result[0];
1350 		}
1351 	#else
psimd_allreduce_sum_f32(psimd_f32 v)1352 		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
1353 			const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 });
1354 			return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 });
1355 		}
1356 
psimd_allreduce_max_f32(psimd_f32 v)1357 		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
1358 			const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
1359 			return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
1360 		}
1361 
psimd_allreduce_min_f32(psimd_f32 v)1362 		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
1363 			const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
1364 			return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
1365 		}
1366 
psimd_reduce_sum_f32(psimd_f32 v)1367 		PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
1368 			const psimd_f32 result = psimd_allreduce_sum_f32(v);
1369 			return result[0];
1370 		}
1371 
psimd_reduce_max_f32(psimd_f32 v)1372 		PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
1373 			const psimd_f32 result = psimd_allreduce_max_f32(v);
1374 			return result[0];
1375 		}
1376 
psimd_reduce_min_f32(psimd_f32 v)1377 		PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
1378 			const psimd_f32 result = psimd_allreduce_min_f32(v);
1379 			return result[0];
1380 		}
1381 	#endif
1382 #endif
1383 
1384 #endif /* PSIMD_H */
1385