/*
 * psimd: portable 128-bit SIMD intrinsics built on the GCC/Clang
 * vector_size extension.  Operations map to native intrinsics where the
 * target advertises them and fall back to portable vector expressions.
 */
#pragma once
#ifndef PSIMD_H
#define PSIMD_H

/* Select the strongest "always inline" spelling the compiler supports. */
#if defined(__CUDA_ARCH__)
    /* CUDA compiler */
    #define PSIMD_INTRINSIC __forceinline__ __device__
#elif defined(__OPENCL_VERSION__)
    /* OpenCL compiler */
    #define PSIMD_INTRINSIC inline static
#elif defined(__INTEL_COMPILER)
    /* Intel compiler, even on Windows */
    #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
#elif defined(__GNUC__)
    /* GCC-compatible compiler (gcc/clang/icc) */
    #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
#elif defined(_MSC_VER)
    /* MSVC-compatible compiler (cl/icl/clang-cl) */
    #define PSIMD_INTRINSIC __forceinline static
#elif defined(__cplusplus)
    /* Generic C++ compiler */
    #define PSIMD_INTRINSIC inline static
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
    /* Generic C99 compiler */
    #define PSIMD_INTRINSIC inline static
#else
    /* Generic C compiler */
    #define PSIMD_INTRINSIC static
#endif

/* Architecture intrinsic headers, gated on the ISA extensions the compiler enables. */
#if defined(__GNUC__) || defined(__clang__)
    #if defined(__ARM_NEON__) || defined(__ARM_NEON)
        #include <arm_neon.h>
    #endif

    #if defined(__SSE2__)
        #include <emmintrin.h>
    #endif

    #if defined(__SSE3__)
        #include <pmmintrin.h>
    #endif

    #if defined(__SSSE3__)
        #include <tmmintrin.h>
    #endif

    #if defined(__SSE4_1__)
        #include <smmintrin.h>
    #endif

    #if defined(__SSE4_2__)
        #include <nmmintrin.h>
    #endif

    #if defined(__AVX__)
        #include <immintrin.h>
    #endif
#elif defined(_MSC_VER)
    #include <intrin.h>
#endif

/* Classify the language dialect compiling this header. */
#if defined(__cplusplus)
    #define PSIMD_CXX_SYNTAX
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
    #define PSIMD_C11_SYNTAX
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
    #define PSIMD_C99_SYNTAX
#else
    #define PSIMD_C89_SYNTAX
#endif

#if defined(__cplusplus) && (__cplusplus >= 201103L)
    #include <cstddef>
    #include <cstdint>
#elif !defined(__OPENCL_VERSION__)
    #include <stddef.h>
    #include <stdint.h>
#endif

#if defined(__GNUC__) || defined(__clang__)
/* Capability flags: which lane types this implementation provides (no 64-bit lanes). */
#define PSIMD_HAVE_F64 0
#define PSIMD_HAVE_F32 1
#define PSIMD_HAVE_U8 1
#define PSIMD_HAVE_S8 1
#define PSIMD_HAVE_U16 1
#define PSIMD_HAVE_S16 1
#define PSIMD_HAVE_U32 1
#define PSIMD_HAVE_S32 1
#define PSIMD_HAVE_U64 0
#define PSIMD_HAVE_S64 0

/*
 * 128-bit vector types.  aligned(sizeof(element)) lowers the default 16-byte
 * alignment of vector_size types to element alignment, so psimd loads/stores
 * are valid on any element-aligned address.
 */
typedef int8_t psimd_s8 __attribute__((vector_size(16), aligned(1)));
typedef uint8_t psimd_u8 __attribute__((vector_size(16), aligned(1)));
typedef int16_t psimd_s16 __attribute__((vector_size(16), aligned(2)));
typedef uint16_t psimd_u16 __attribute__((vector_size(16), aligned(2)));
typedef int32_t psimd_s32 __attribute__((vector_size(16), aligned(4)));
typedef uint32_t psimd_u32 __attribute__((vector_size(16), aligned(4)));
typedef float psimd_f32 __attribute__((vector_size(16), aligned(4)));

/* 256-bit values represented as a pair of 128-bit halves; lo holds the lower lanes. */
typedef struct {
    psimd_s8 lo;
    psimd_s8 hi;
} psimd_s8x2;

typedef struct {
    psimd_u8 lo;
    psimd_u8 hi;
} psimd_u8x2;

typedef struct {
    psimd_s16 lo;
    psimd_s16 hi;
} psimd_s16x2;

typedef struct {
    psimd_u16 lo;
    psimd_u16 hi;
} psimd_u16x2;

typedef struct {
    psimd_s32 lo;
    psimd_s32 hi;
} psimd_s32x2;

typedef struct {
    psimd_u32 lo;
    psimd_u32 hi;
} psimd_u32x2;

typedef struct {
    psimd_f32 lo;
    psimd_f32 hi;
} psimd_f32x2;

/* Bit casts: reinterpret both halves lane-for-lane (GCC/Clang vector casts
 * reinterpret bits; no value conversion is performed). */
PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) {
    return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
}

PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) {
    return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
}

PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) {
    return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
}

PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) {
    return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
}

PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) {
    return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
}

PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) {
    return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
}

/* Swap: exchange the contents of *a and *b.  The array-of-1 parameters decay
 * to pointers; both values are read before either is written, so a == b is safe. */
PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) {
    const psimd_s8 new_a = *b;
    const psimd_s8 new_b = *a;
    *a = new_a;
    *b = new_b;
}

PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) {
    const psimd_u8 new_a = *b;
    const psimd_u8 new_b = *a;
    *a = new_a;
    *b = new_b;
}

PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) {
    const psimd_s16 new_a = *b;
    const psimd_s16 new_b = *a;
    *a = new_a;
    *b = new_b;
}

PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) {
    const psimd_u16 new_a = *b;
    const psimd_u16 new_b = *a;
    *a = new_a;
    *b = new_b;
}

PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) {
    const psimd_s32 new_a = *b;
    const psimd_s32 new_b = *a;
    *a = new_a;
    *b = new_b;
}

PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) {
    const psimd_u32 new_a = *b;
    const psimd_u32 new_b = *a;
    *a = new_a;
    *b = new_b;
}

PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) {
    const psimd_f32 new_a = *b;
    const psimd_f32 new_b = *a;
    *a = new_a;
    *b = new_b;
}

/* Zero-initialization: return a vector with every lane zero. */
PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) {
    return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
}

PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) {
    return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
}

PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) {
    return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 };
}

PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) {
    return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 };
}

PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) {
    return (psimd_s32) { 0, 0, 0, 0 };
}

PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) {
    return (psimd_u32) { 0, 0, 0, 0 };
}

PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) {
    return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Initialization to the same constant: broadcast scalar c to every lane. */
PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) {
    return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
}

PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) {
    return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
}

PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) {
    return (psimd_s16) { c, c, c, c, c, c, c, c };
}

PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) {
    return (psimd_u16) { c, c, c, c, c, c, c, c };
}

PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) {
    return (psimd_s32) { c, c, c, c };
}

PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) {
    return (psimd_u32) { c, c, c, c };
}

PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) {
    return (psimd_f32) { c, c, c, c };
}

/* Load vector: read 16 consecutive bytes.  Element alignment suffices because
 * the vector typedefs lower their alignment (see above). */
PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) {
    return *((const psimd_s8*) address);
}

PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) {
    return *((const psimd_u8*) address);
}

PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) {
    return *((const psimd_s16*) address);
}

PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) {
    return *((const psimd_u16*) address);
}

PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) {
    return *((const psimd_s32*) address);
}

PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) {
    return *((const psimd_u32*) address);
}

PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) {
    return *((const psimd_f32*) address);
}

/* Load one scalar from memory and broadcast it to every lane. */
PSIMD_INTRINSIC psimd_s8 psimd_load_splat_s8(const void* address) {
    return psimd_splat_s8(*((const int8_t*) address));
}

PSIMD_INTRINSIC psimd_u8 psimd_load_splat_u8(const void* address) {
    return psimd_splat_u8(*((const uint8_t*) address));
}

PSIMD_INTRINSIC psimd_s16 psimd_load_splat_s16(const void* address) {
    return psimd_splat_s16(*((const int16_t*) address));
}
psimd_load_splat_u16(const void * address)310 PSIMD_INTRINSIC psimd_u16 psimd_load_splat_u16(const void* address) { 311 return psimd_splat_u16(*((const uint16_t*) address)); 312 } 313 psimd_load_splat_s32(const void * address)314 PSIMD_INTRINSIC psimd_s32 psimd_load_splat_s32(const void* address) { 315 return psimd_splat_s32(*((const int32_t*) address)); 316 } 317 psimd_load_splat_u32(const void * address)318 PSIMD_INTRINSIC psimd_u32 psimd_load_splat_u32(const void* address) { 319 return psimd_splat_u32(*((const uint32_t*) address)); 320 } 321 psimd_load_splat_f32(const void * address)322 PSIMD_INTRINSIC psimd_f32 psimd_load_splat_f32(const void* address) { 323 return psimd_splat_f32(*((const float*) address)); 324 } 325 psimd_load1_s32(const void * address)326 PSIMD_INTRINSIC psimd_s32 psimd_load1_s32(const void* address) { 327 return (psimd_s32) { *((const int32_t*) address), 0, 0, 0 }; 328 } 329 psimd_load1_u32(const void * address)330 PSIMD_INTRINSIC psimd_u32 psimd_load1_u32(const void* address) { 331 return (psimd_u32) { *((const uint32_t*) address), 0, 0, 0 }; 332 } 333 psimd_load1_f32(const void * address)334 PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) { 335 return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f }; 336 } 337 psimd_load2_s32(const void * address)338 PSIMD_INTRINSIC psimd_s32 psimd_load2_s32(const void* address) { 339 const int32_t* address_s32 = (const int32_t*) address; 340 return (psimd_s32) { address_s32[0], address_s32[1], 0, 0 }; 341 } 342 psimd_load2_u32(const void * address)343 PSIMD_INTRINSIC psimd_u32 psimd_load2_u32(const void* address) { 344 const uint32_t* address_u32 = (const uint32_t*) address; 345 return (psimd_u32) { address_u32[0], address_u32[1], 0, 0 }; 346 } 347 psimd_load2_f32(const void * address)348 PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) { 349 const float* address_f32 = (const float*) address; 350 return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f }; 
351 } 352 psimd_load3_s32(const void * address)353 PSIMD_INTRINSIC psimd_s32 psimd_load3_s32(const void* address) { 354 const int32_t* address_s32 = (const int32_t*) address; 355 return (psimd_s32) { address_s32[0], address_s32[1], address_s32[2], 0 }; 356 } 357 psimd_load3_u32(const void * address)358 PSIMD_INTRINSIC psimd_u32 psimd_load3_u32(const void* address) { 359 const uint32_t* address_u32 = (const uint32_t*) address; 360 return (psimd_u32) { address_u32[0], address_u32[1], address_u32[2], 0 }; 361 } 362 psimd_load3_f32(const void * address)363 PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) { 364 const float* address_f32 = (const float*) address; 365 return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f }; 366 } 367 psimd_load4_s32(const void * address)368 PSIMD_INTRINSIC psimd_s32 psimd_load4_s32(const void* address) { 369 return psimd_load_s32(address); 370 } 371 psimd_load4_u32(const void * address)372 PSIMD_INTRINSIC psimd_u32 psimd_load4_u32(const void* address) { 373 return psimd_load_u32(address); 374 } 375 psimd_load4_f32(const void * address)376 PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) { 377 return psimd_load_f32(address); 378 } 379 psimd_load_stride2_f32(const void * address)380 PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) { 381 const psimd_f32 v0x1x = psimd_load_f32(address); 382 const psimd_f32 vx2x3 = psimd_load_f32((const float*) address + 3); 383 #if defined(__clang__) 384 return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7); 385 #else 386 return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 }); 387 #endif 388 } 389 psimd_load1_stride2_f32(const void * address)390 PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) { 391 return psimd_load_f32(address); 392 } 393 psimd_load2_stride2_f32(const void * address)394 PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) { 395 const float* address_f32 = (const float*) 
address; 396 return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f }; 397 } 398 psimd_load3_stride2_f32(const void * address)399 PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) { 400 const psimd_f32 v0x1x = psimd_load_f32(address); 401 const psimd_f32 v2zzz = psimd_load1_f32((const float*) address + 2); 402 #if defined(__clang__) 403 return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6); 404 #else 405 return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 }); 406 #endif 407 } 408 psimd_load4_stride2_f32(const void * address)409 PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) { 410 return psimd_load_stride2_f32(address); 411 } 412 psimd_load_stride_f32(const void * address,size_t stride)413 PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) { 414 const float* address0_f32 = (const float*) address; 415 const float* address1_f32 = address0_f32 + stride; 416 const float* address2_f32 = address1_f32 + stride; 417 const float* address3_f32 = address2_f32 + stride; 418 return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 }; 419 } 420 psimd_load1_stride_f32(const void * address,size_t stride)421 PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) { 422 return psimd_load1_f32(address); 423 } 424 psimd_load2_stride_f32(const void * address,size_t stride)425 PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) { 426 const float* address_f32 = (const float*) address; 427 return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f }; 428 } 429 psimd_load3_stride_f32(const void * address,size_t stride)430 PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) { 431 const float* address0_f32 = (const float*) address; 432 const float* address1_f32 = address0_f32 + stride; 433 const float* address2_f32 = address1_f32 + stride; 434 return (psimd_f32) { 
*address0_f32, *address1_f32, *address2_f32, 0.0f }; 435 } 436 psimd_load4_stride_f32(const void * address,size_t stride)437 PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) { 438 return psimd_load_stride_f32(address, stride); 439 } 440 441 /* Store vector */ psimd_store_s8(void * address,psimd_s8 value)442 PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) { 443 *((psimd_s8*) address) = value; 444 } 445 psimd_store_u8(void * address,psimd_u8 value)446 PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) { 447 *((psimd_u8*) address) = value; 448 } 449 psimd_store_s16(void * address,psimd_s16 value)450 PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) { 451 *((psimd_s16*) address) = value; 452 } 453 psimd_store_u16(void * address,psimd_u16 value)454 PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) { 455 *((psimd_u16*) address) = value; 456 } 457 psimd_store_s32(void * address,psimd_s32 value)458 PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) { 459 *((psimd_s32*) address) = value; 460 } 461 psimd_store_u32(void * address,psimd_u32 value)462 PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) { 463 *((psimd_u32*) address) = value; 464 } 465 psimd_store_f32(void * address,psimd_f32 value)466 PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) { 467 *((psimd_f32*) address) = value; 468 } 469 psimd_store1_s32(void * address,psimd_s32 value)470 PSIMD_INTRINSIC void psimd_store1_s32(void* address, psimd_s32 value) { 471 *((int32_t*) address) = value[0]; 472 } 473 psimd_store1_u32(void * address,psimd_u32 value)474 PSIMD_INTRINSIC void psimd_store1_u32(void* address, psimd_u32 value) { 475 *((uint32_t*) address) = value[0]; 476 } 477 psimd_store1_f32(void * address,psimd_f32 value)478 PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) { 479 *((float*) address) = value[0]; 480 } 481 
/* Partial stores: write only the lowest 2 or 3 lanes. */
PSIMD_INTRINSIC void psimd_store2_s32(void* address, psimd_s32 value) {
    int32_t* address_s32 = (int32_t*) address;
    address_s32[0] = value[0];
    address_s32[1] = value[1];
}

PSIMD_INTRINSIC void psimd_store2_u32(void* address, psimd_u32 value) {
    uint32_t* address_u32 = (uint32_t*) address;
    address_u32[0] = value[0];
    address_u32[1] = value[1];
}

PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) {
    float* address_f32 = (float*) address;
    address_f32[0] = value[0];
    address_f32[1] = value[1];
}

PSIMD_INTRINSIC void psimd_store3_s32(void* address, psimd_s32 value) {
    int32_t* address_s32 = (int32_t*) address;
    address_s32[0] = value[0];
    address_s32[1] = value[1];
    address_s32[2] = value[2];
}

PSIMD_INTRINSIC void psimd_store3_u32(void* address, psimd_u32 value) {
    uint32_t* address_u32 = (uint32_t*) address;
    address_u32[0] = value[0];
    address_u32[1] = value[1];
    address_u32[2] = value[2];
}

PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) {
    float* address_f32 = (float*) address;
    address_f32[0] = value[0];
    address_f32[1] = value[1];
    address_f32[2] = value[2];
}

/* Storing 4 elements is a full vector store. */
PSIMD_INTRINSIC void psimd_store4_s32(void* address, psimd_s32 value) {
    psimd_store_s32(address, value);
}

PSIMD_INTRINSIC void psimd_store4_u32(void* address, psimd_u32 value) {
    psimd_store_u32(address, value);
}

PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) {
    psimd_store_f32(address, value);
}

/* Strided scatter: write lanes stride float-elements apart. */
PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) {
    float* address0_f32 = (float*) address;
    float* address1_f32 = address0_f32 + stride;
    float* address2_f32 = address1_f32 + stride;
    float* address3_f32 = address2_f32 + stride;
    *address0_f32 = value[0];
    *address1_f32 = value[1];
    *address2_f32 = value[2];
    *address3_f32 = value[3];
}

PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, psimd_f32 value) {
    /* A single element has no stride; the parameter is intentionally unused. */
    psimd_store1_f32(address, value);
}

PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) {
    float* address_f32 = (float*) address;
    address_f32[0] = value[0];
    address_f32[stride] = value[1];
}

PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) {
    float* address0_f32 = (float*) address;
    float* address1_f32 = address0_f32 + stride;
    float* address2_f32 = address1_f32 + stride;
    *address0_f32 = value[0];
    *address1_f32 = value[1];
    *address2_f32 = value[2];
}

/* Vector addition (lane-wise). */
PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) {
    return a + b;
}

PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) {
    return a + b;
}

PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) {
    return a + b;
}

PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) {
    return a + b;
}

PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) {
    return a + b;
}

PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) {
    return a + b;
}

PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) {
#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
    /* NOTE(review): routes through the NEON intrinsic on ARMv7-A when fast-math
     * is disabled — presumably to pin down FP codegen there; confirm intent. */
    return (psimd_f32) vaddq_f32((float32x4_t) a, (float32x4_t) b);
#else
    return a + b;
#endif
}

/* Vector subtraction (lane-wise). */
PSIMD_INTRINSIC psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) {
    return a - b;
}

PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) {
    return a - b;
}

PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) {
    return a - b;
}

PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) {
    return a - b;
}

PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) {
    return a - b;
}

PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) {
    return a - b;
}

PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) {
#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
    /* See psimd_add_f32 for the rationale behind the ARMv7-A NEON special case. */
    return (psimd_f32) vsubq_f32((float32x4_t) a, (float32x4_t) b);
#else
    return a - b;
#endif
}

/* Vector multiplication (lane-wise). */
PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) {
    return a * b;
}

PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) {
    return a * b;
}

PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) {
    return a * b;
}

PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) {
    return a * b;
}

PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) {
    return a * b;
}

PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) {
    return a * b;
}

PSIMD_INTRINSIC psimd_f32 psimd_mul_f32(psimd_f32 a, psimd_f32 b) {
#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
    /* See psimd_add_f32 for the rationale behind the ARMv7-A NEON special case. */
    return (psimd_f32) vmulq_f32((float32x4_t) a, (float32x4_t) b);
#else
    return a * b;
#endif
}

/* Quasi-Fused Multiply-Add: computes a + b * c, using a fused instruction when
 * the target has one.  Fused and unfused paths may round differently, hence
 * "quasi": no single rounding behavior is guaranteed across targets. */
PSIMD_INTRINSIC psimd_f32 psimd_qfma_f32(psimd_f32 a, psimd_f32 b, psimd_f32 c) {
#if defined(__aarch64__) || defined(__ARM_NEON__) && defined(__ARM_FEATURE_FMA)
    return (psimd_f32) vfmaq_f32((float32x4_t) a, (float32x4_t) b, (float32x4_t) c);
#elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA__)
    return (psimd_f32) _mm_fmadd_ps((__m128) b, (__m128) c, (__m128) a);
#elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA4__)
    return (psimd_f32) _mm_macc_ps((__m128) b, (__m128) c, (__m128) a);
#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__) && PSIMD_ENABLE_WASM_QFMA
    return (psimd_f32) __builtin_wasm_qfma_f32x4(a, b, c);
#else
    return a + b * c;
#endif
}

/* Vector division (lane-wise). */
PSIMD_INTRINSIC psimd_f32 psimd_div_f32(psimd_f32 a, psimd_f32 b) {
    return a / b;
}

/* Vector and */
/* Mask AND: keep the bits of v where mask bits are set; other bits become 0. */
PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) {
    return (psimd_f32) (mask & (psimd_s32) v);
}

/* Mask AND-NOT: keep the bits of v where mask bits are clear. */
PSIMD_INTRINSIC psimd_f32 psimd_andnotmask_f32(psimd_s32 mask, psimd_f32 v) {
    return (psimd_f32) (~mask & (psimd_s32) v);
}

/* Vector blend: bitwise select — take bits of a where mask bits are set,
 * bits of b where they are clear.  Mask lanes are expected to be all-ones or
 * all-zeros (e.g. comparison results). */
PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_s8) vbslq_s8((uint8x16_t) mask, (int8x16_t) a, (int8x16_t) b);
#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
    return (psimd_s8) __builtin_wasm_bitselect(a, b, mask);
#else
    return (mask & a) | (~mask & b);
#endif
}

PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_s8 mask, psimd_u8 a, psimd_u8 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_u8) vbslq_u8((uint8x16_t) mask, (uint8x16_t) a, (uint8x16_t) b);
#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
    return (psimd_u8) __builtin_wasm_bitselect(a, b, mask);
#else
    return (psimd_u8) ((mask & (psimd_s8) a) | (~mask & (psimd_s8) b));
#endif
}

PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_s16) vbslq_s16((uint16x8_t) mask, (int16x8_t) a, (int16x8_t) b);
#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
    return (psimd_s16) __builtin_wasm_bitselect(a, b, mask);
#else
    return (mask & a) | (~mask & b);
#endif
}

PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_s16 mask, psimd_u16 a, psimd_u16 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_u16) vbslq_u16((uint16x8_t) mask, (uint16x8_t) a, (uint16x8_t) b);
#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
    return (psimd_u16) __builtin_wasm_bitselect(a, b, mask);
#else
    return (psimd_u16) ((mask & (psimd_s16) a) | (~mask & (psimd_s16) b));
#endif
}

PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_s32) vbslq_s32((uint32x4_t) mask, (int32x4_t) a, (int32x4_t) b);
#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
    return (psimd_s32) __builtin_wasm_bitselect(a, b, mask);
#else
    return (mask & a) | (~mask & b);
#endif
}

PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_s32 mask, psimd_u32 a, psimd_u32 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_u32) vbslq_u32((uint32x4_t) mask, (uint32x4_t) a, (uint32x4_t) b);
#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
    return (psimd_u32) __builtin_wasm_bitselect(a, b, mask);
#else
    return (psimd_u32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
#endif
}

PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_f32) vbslq_f32((uint32x4_t) mask, (float32x4_t) a, (float32x4_t) b);
#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
    return (psimd_f32) __builtin_wasm_bitselect(a, b, mask);
#else
    return (psimd_f32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
#endif
}

/* Vector blend on sign: an arithmetic right shift by (lane width - 1)
 * replicates each lane's sign bit into a full mask, so the result takes a
 * where x is negative and b otherwise. */
PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) {
    return psimd_blend_s8(x >> psimd_splat_s8(7), a, b);
}

PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) {
    return psimd_blend_u8((x >> psimd_splat_s8(7)), a, b);
}

PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) {
    return psimd_blend_s16(x >> psimd_splat_s16(15), a, b);
}

PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) {
    return psimd_blend_u16((x >> psimd_splat_s16(15)), a, b);
}

PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) {
    return psimd_blend_s32(x >> psimd_splat_s32(31), a, b);
}

PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) {
    return psimd_blend_u32((x >> psimd_splat_s32(31)), a, b);
}

PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) {
    const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31);
    return psimd_blend_f32(mask, a, b);
}

/* Vector absolute value: clear the IEEE sign bit.
 * (psimd_s32) -0.0f has only the sign bit set in each lane. */
PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) {
    const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
    return (psimd_f32) ((psimd_s32) v & ~mask);
}

/* Vector negation: flip the IEEE sign bit. */
PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) {
    const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
    return (psimd_f32) ((psimd_s32) v ^ mask);
}

/* Vector maximum (lane-wise): NEON/WAsm intrinsics where available,
 * otherwise a compare-and-blend. */
PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_s8) vmaxq_s8((int8x16_t) a, (int8x16_t) b);
#else
    return psimd_blend_s8(a > b, a, b);
#endif
}

PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_u8) vmaxq_u8((uint8x16_t) a, (uint8x16_t) b);
#else
    return psimd_blend_u8(a > b, a, b);
#endif
}

PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_s16) vmaxq_s16((int16x8_t) a, (int16x8_t) b);
#else
    return psimd_blend_s16(a > b, a, b);
#endif
}

PSIMD_INTRINSIC psimd_u16 psimd_max_u16(psimd_u16 a, psimd_u16 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_u16) vmaxq_u16((uint16x8_t) a, (uint16x8_t) b);
#else
    return psimd_blend_u16(a > b, a, b);
#endif
}

PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_s32) vmaxq_s32((int32x4_t) a, (int32x4_t) b);
#else
    return psimd_blend_s32(a > b, a, b);
#endif
}

PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_u32) vmaxq_u32((uint32x4_t) a, (uint32x4_t) b);
#else
    return psimd_blend_u32(a > b, a, b);
#endif
}

PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_f32) vmaxq_f32((float32x4_t) a, (float32x4_t) b);
#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
    return __builtin_wasm_max_f32x4(a, b);
#else
    return psimd_blend_f32(a > b, a, b);
#endif
}

/* Vector minimum (lane-wise): same dispatch strategy as maximum. */
PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_s8) vminq_s8((int8x16_t) a, (int8x16_t) b);
#else
    return psimd_blend_s8(a < b, a, b);
#endif
}

PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_u8) vminq_u8((uint8x16_t) a, (uint8x16_t) b);
#else
    return psimd_blend_u8(a < b, a, b);
#endif
}

PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_s16) vminq_s16((int16x8_t) a, (int16x8_t) b);
#else
    return psimd_blend_s16(a < b, a, b);
#endif
}

PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_u16) vminq_u16((uint16x8_t) a, (uint16x8_t) b);
#else
    return psimd_blend_u16(a < b, a, b);
#endif
}

PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_s32) vminq_s32((int32x4_t) a, (int32x4_t) b);
#else
    return psimd_blend_s32(a < b, a, b);
#endif
}

PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_u32) vminq_u32((uint32x4_t) a, (uint32x4_t) b);
#else
    return psimd_blend_u32(a < b, a, b);
#endif
}

PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_f32) vminq_f32((float32x4_t) a, (float32x4_t) b);
#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
    return __builtin_wasm_min_f32x4(a, b);
#else
    return psimd_blend_f32(a < b, a, b);
#endif
}

/* Convert signed 32-bit integer lanes to float lanes
 * (value conversion, unlike the bit casts above). */
PSIMD_INTRINSIC psimd_f32 psimd_cvt_s32_f32(psimd_s32 v) {
#if defined(__clang__)
    return __builtin_convertvector(v, psimd_f32);
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
    return (psimd_f32) vcvtq_f32_s32((int32x4_t) v);
#elif defined(__SSE2__)
    return (psimd_f32) _mm_cvtepi32_ps((__m128i) v);
#else
    return (psimd_f32) { (float) v[0], (float) v[1], (float) v[2], (float) v[3] };
#endif
}

/* Broadcast vector element: replicate lane N of v into every lane.
 * Clang uses __builtin_shufflevector; GCC uses __builtin_shuffle. */
#if defined(__clang__)
PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
    return __builtin_shufflevector(v, v, 0, 0, 0, 0);
}

PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
    return __builtin_shufflevector(v, v, 1, 1, 1, 1);
}

PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
    return __builtin_shufflevector(v, v, 2, 2, 2, 2);
}

PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
    return __builtin_shufflevector(v, v, 3, 3, 3, 3);
}
#else
PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
    return __builtin_shuffle(v, (psimd_s32) { 0, 0, 0, 0 });
}

PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
    return __builtin_shuffle(v, (psimd_s32) { 1, 1, 1, 1 });
}
psimd_splat2_f32(psimd_f32 v)960 PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) { 961 return __builtin_shuffle(v, (psimd_s32) { 2, 2, 2, 2 }); 962 } 963 psimd_splat3_f32(psimd_f32 v)964 PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) { 965 return __builtin_shuffle(v, (psimd_s32) { 3, 3, 3, 3 }); 966 } 967 #endif 968 969 /* Reversal of vector elements */ 970 #if defined(__clang__) psimd_reverse_s8(psimd_s8 v)971 PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) { 972 return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 973 } 974 psimd_reverse_u8(psimd_u8 v)975 PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) { 976 return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 977 } 978 psimd_reverse_s16(psimd_s16 v)979 PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) { 980 return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0); 981 } 982 psimd_reverse_u16(psimd_u16 v)983 PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) { 984 return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0); 985 } 986 psimd_reverse_s32(psimd_s32 v)987 PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) { 988 return __builtin_shufflevector(v, v, 3, 2, 1, 0); 989 } 990 psimd_reverse_u32(psimd_u32 v)991 PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) { 992 return __builtin_shufflevector(v, v, 3, 2, 1, 0); 993 } 994 psimd_reverse_f32(psimd_f32 v)995 PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) { 996 return __builtin_shufflevector(v, v, 3, 2, 1, 0); 997 } 998 #else psimd_reverse_s8(psimd_s8 v)999 PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) { 1000 return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }); 1001 } 1002 psimd_reverse_u8(psimd_u8 v)1003 PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) { 1004 return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }); 
1005 } 1006 psimd_reverse_s16(psimd_s16 v)1007 PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) { 1008 return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 }); 1009 } 1010 psimd_reverse_u16(psimd_u16 v)1011 PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) { 1012 return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 }); 1013 } 1014 psimd_reverse_s32(psimd_s32 v)1015 PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) { 1016 return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 }); 1017 } 1018 psimd_reverse_u32(psimd_u32 v)1019 PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) { 1020 return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 }); 1021 } 1022 psimd_reverse_f32(psimd_f32 v)1023 PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) { 1024 return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 }); 1025 } 1026 #endif 1027 1028 /* Interleaving of vector elements */ 1029 #if defined(__clang__) psimd_interleave_lo_s16(psimd_s16 a,psimd_s16 b)1030 PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) { 1031 return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1032 } 1033 psimd_interleave_hi_s16(psimd_s16 a,psimd_s16 b)1034 PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) { 1035 return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1036 } 1037 psimd_interleave_lo_u16(psimd_u16 a,psimd_u16 b)1038 PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) { 1039 return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1040 } 1041 psimd_interleave_hi_u16(psimd_u16 a,psimd_u16 b)1042 PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) { 1043 return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1044 } 1045 psimd_interleave_lo_s32(psimd_s32 a,psimd_s32 b)1046 PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) { 1047 return 
__builtin_shufflevector(a, b, 0, 4+0, 1, 4+1); 1048 } 1049 psimd_interleave_hi_s32(psimd_s32 a,psimd_s32 b)1050 PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) { 1051 return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3); 1052 } 1053 psimd_interleave_lo_u32(psimd_u32 a,psimd_u32 b)1054 PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) { 1055 return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1); 1056 } 1057 psimd_interleave_hi_u32(psimd_u32 a,psimd_u32 b)1058 PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) { 1059 return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3); 1060 } 1061 psimd_interleave_lo_f32(psimd_f32 a,psimd_f32 b)1062 PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) { 1063 return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1); 1064 } 1065 psimd_interleave_hi_f32(psimd_f32 a,psimd_f32 b)1066 PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) { 1067 return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3); 1068 } 1069 #else psimd_interleave_lo_s16(psimd_s16 a,psimd_s16 b)1070 PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) { 1071 return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 }); 1072 } 1073 psimd_interleave_hi_s16(psimd_s16 a,psimd_s16 b)1074 PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) { 1075 return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 }); 1076 } 1077 psimd_interleave_lo_u16(psimd_u16 a,psimd_u16 b)1078 PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) { 1079 return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 }); 1080 } 1081 psimd_interleave_hi_u16(psimd_u16 a,psimd_u16 b)1082 PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) { 1083 return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 }); 1084 } 1085 
psimd_interleave_lo_s32(psimd_s32 a,psimd_s32 b)1086 PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) { 1087 return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 }); 1088 } 1089 psimd_interleave_hi_s32(psimd_s32 a,psimd_s32 b)1090 PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) { 1091 return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 }); 1092 } 1093 psimd_interleave_lo_u32(psimd_u32 a,psimd_u32 b)1094 PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) { 1095 return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 }); 1096 } 1097 psimd_interleave_hi_u32(psimd_u32 a,psimd_u32 b)1098 PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) { 1099 return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 }); 1100 } 1101 psimd_interleave_lo_f32(psimd_f32 a,psimd_f32 b)1102 PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) { 1103 return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 }); 1104 } 1105 psimd_interleave_hi_f32(psimd_f32 a,psimd_f32 b)1106 PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) { 1107 return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 }); 1108 } 1109 #endif 1110 1111 /* Concatenation of low/high vector elements */ 1112 #if defined(__clang__) psimd_concat_lo_s16(psimd_s16 a,psimd_s16 b)1113 PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) { 1114 return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3); 1115 } 1116 psimd_concat_hi_s16(psimd_s16 a,psimd_s16 b)1117 PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) { 1118 return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7); 1119 } 1120 psimd_concat_lo_u16(psimd_u16 a,psimd_u16 b)1121 PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) { 1122 return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3); 1123 } 1124 
psimd_concat_hi_u16(psimd_u16 a,psimd_u16 b)1125 PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) { 1126 return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7); 1127 } 1128 psimd_concat_lo_s32(psimd_s32 a,psimd_s32 b)1129 PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) { 1130 return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1); 1131 } 1132 psimd_concat_hi_s32(psimd_s32 a,psimd_s32 b)1133 PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) { 1134 return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3); 1135 } 1136 psimd_concat_lo_u32(psimd_u32 a,psimd_u32 b)1137 PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) { 1138 return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1); 1139 } 1140 psimd_concat_hi_u32(psimd_u32 a,psimd_u32 b)1141 PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) { 1142 return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3); 1143 } 1144 psimd_concat_lo_f32(psimd_f32 a,psimd_f32 b)1145 PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) { 1146 return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1); 1147 } 1148 psimd_concat_hi_f32(psimd_f32 a,psimd_f32 b)1149 PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) { 1150 return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3); 1151 } 1152 #else psimd_concat_lo_s16(psimd_s16 a,psimd_s16 b)1153 PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) { 1154 return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 }); 1155 } 1156 psimd_concat_hi_s16(psimd_s16 a,psimd_s16 b)1157 PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) { 1158 return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 }); 1159 } 1160 psimd_concat_lo_u16(psimd_u16 a,psimd_u16 b)1161 PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) { 1162 return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 
2, 3, 8+0, 8+1, 8+2, 8+3 }); 1163 } 1164 psimd_concat_hi_u16(psimd_u16 a,psimd_u16 b)1165 PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) { 1166 return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 }); 1167 } 1168 psimd_concat_lo_s32(psimd_s32 a,psimd_s32 b)1169 PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) { 1170 return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 }); 1171 } 1172 psimd_concat_hi_s32(psimd_s32 a,psimd_s32 b)1173 PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) { 1174 return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 }); 1175 } 1176 psimd_concat_lo_u32(psimd_u32 a,psimd_u32 b)1177 PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) { 1178 return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 }); 1179 } 1180 psimd_concat_hi_u32(psimd_u32 a,psimd_u32 b)1181 PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) { 1182 return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 }); 1183 } 1184 psimd_concat_lo_f32(psimd_f32 a,psimd_f32 b)1185 PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) { 1186 return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 }); 1187 } 1188 psimd_concat_hi_f32(psimd_f32 a,psimd_f32 b)1189 PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) { 1190 return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 }); 1191 } 1192 #endif 1193 1194 /* Concatenation of even/odd vector elements */ 1195 #if defined(__clang__) psimd_concat_even_s8(psimd_s8 a,psimd_s8 b)1196 PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) { 1197 return __builtin_shufflevector(a, b, 1198 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14); 1199 } 1200 psimd_concat_odd_s8(psimd_s8 a,psimd_s8 b)1201 PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) { 1202 return __builtin_shufflevector(a, b, 
1203 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15); 1204 } 1205 psimd_concat_even_u8(psimd_u8 a,psimd_u8 b)1206 PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) { 1207 return __builtin_shufflevector(a, b, 1208 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14); 1209 } 1210 psimd_concat_odd_u8(psimd_u8 a,psimd_u8 b)1211 PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) { 1212 return __builtin_shufflevector(a, b, 1213 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15); 1214 } 1215 psimd_concat_even_s16(psimd_s16 a,psimd_s16 b)1216 PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) { 1217 return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6); 1218 } 1219 psimd_concat_odd_s16(psimd_s16 a,psimd_s16 b)1220 PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) { 1221 return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7); 1222 } 1223 psimd_concat_even_u16(psimd_u16 a,psimd_u16 b)1224 PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) { 1225 return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6); 1226 } 1227 psimd_concat_odd_u16(psimd_u16 a,psimd_u16 b)1228 PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) { 1229 return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7); 1230 } 1231 psimd_concat_even_s32(psimd_s32 a,psimd_s32 b)1232 PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) { 1233 return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2); 1234 } 1235 psimd_concat_odd_s32(psimd_s32 a,psimd_s32 b)1236 PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) { 1237 return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3); 1238 } 1239 psimd_concat_even_u32(psimd_u32 a,psimd_u32 b)1240 PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) { 1241 return 
__builtin_shufflevector(a, b, 0, 2, 4+0, 4+2); 1242 } 1243 psimd_concat_odd_u32(psimd_u32 a,psimd_u32 b)1244 PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) { 1245 return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3); 1246 } 1247 psimd_concat_even_f32(psimd_f32 a,psimd_f32 b)1248 PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) { 1249 return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2); 1250 } 1251 psimd_concat_odd_f32(psimd_f32 a,psimd_f32 b)1252 PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) { 1253 return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3); 1254 } 1255 #else psimd_concat_even_s8(psimd_s8 a,psimd_s8 b)1256 PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) { 1257 return __builtin_shuffle(a, b, 1258 (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 }); 1259 } 1260 psimd_concat_odd_s8(psimd_s8 a,psimd_s8 b)1261 PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) { 1262 return __builtin_shuffle(a, b, 1263 (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 }); 1264 } 1265 psimd_concat_even_u8(psimd_u8 a,psimd_u8 b)1266 PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) { 1267 return __builtin_shuffle(a, b, 1268 (psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 }); 1269 } 1270 psimd_concat_odd_u8(psimd_u8 a,psimd_u8 b)1271 PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) { 1272 return __builtin_shuffle(a, b, 1273 (psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 }); 1274 } 1275 psimd_concat_even_s16(psimd_s16 a,psimd_s16 b)1276 PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) { 1277 return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 }); 1278 } 1279 psimd_concat_odd_s16(psimd_s16 a,psimd_s16 b)1280 
PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) { 1281 return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 }); 1282 } 1283 psimd_concat_even_u16(psimd_u16 a,psimd_u16 b)1284 PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) { 1285 return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 }); 1286 } 1287 psimd_concat_odd_u16(psimd_u16 a,psimd_u16 b)1288 PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) { 1289 return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 }); 1290 } 1291 psimd_concat_even_s32(psimd_s32 a,psimd_s32 b)1292 PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) { 1293 return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 }); 1294 } 1295 psimd_concat_odd_s32(psimd_s32 a,psimd_s32 b)1296 PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) { 1297 return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 }); 1298 } 1299 psimd_concat_even_u32(psimd_u32 a,psimd_u32 b)1300 PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) { 1301 return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 }); 1302 } 1303 psimd_concat_odd_u32(psimd_u32 a,psimd_u32 b)1304 PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) { 1305 return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 }); 1306 } 1307 psimd_concat_even_f32(psimd_f32 a,psimd_f32 b)1308 PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) { 1309 return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 }); 1310 } 1311 psimd_concat_odd_f32(psimd_f32 a,psimd_f32 b)1312 PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) { 1313 return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 }); 1314 } 1315 #endif 1316 1317 /* Vector reduce */ 1318 #if defined(__clang__) psimd_allreduce_sum_f32(psimd_f32 v)1319 PSIMD_INTRINSIC psimd_f32 
psimd_allreduce_sum_f32(psimd_f32 v) { 1320 const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1); 1321 return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2); 1322 } 1323 psimd_allreduce_max_f32(psimd_f32 v)1324 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) { 1325 const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1)); 1326 return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2)); 1327 } 1328 psimd_allreduce_min_f32(psimd_f32 v)1329 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) { 1330 const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1)); 1331 return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2)); 1332 } 1333 psimd_reduce_sum_f32(psimd_f32 v)1334 PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) { 1335 const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1); 1336 const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1); 1337 return result[0]; 1338 } 1339 psimd_reduce_max_f32(psimd_f32 v)1340 PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) { 1341 const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1)); 1342 const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1)); 1343 return result[0]; 1344 } 1345 psimd_reduce_min_f32(psimd_f32 v)1346 PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) { 1347 const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1)); 1348 const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1)); 1349 return result[0]; 1350 } 1351 #else psimd_allreduce_sum_f32(psimd_f32 v)1352 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) { 1353 const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }); 1354 return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }); 1355 } 1356 
psimd_allreduce_max_f32(psimd_f32 v)1357 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) { 1358 const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 })); 1359 return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 })); 1360 } 1361 psimd_allreduce_min_f32(psimd_f32 v)1362 PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) { 1363 const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 })); 1364 return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 })); 1365 } 1366 psimd_reduce_sum_f32(psimd_f32 v)1367 PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) { 1368 const psimd_f32 result = psimd_allreduce_sum_f32(v); 1369 return result[0]; 1370 } 1371 psimd_reduce_max_f32(psimd_f32 v)1372 PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) { 1373 const psimd_f32 result = psimd_allreduce_max_f32(v); 1374 return result[0]; 1375 } 1376 psimd_reduce_min_f32(psimd_f32 v)1377 PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) { 1378 const psimd_f32 result = psimd_allreduce_min_f32(v); 1379 return result[0]; 1380 } 1381 #endif 1382 #endif 1383 1384 #endif /* PSIMD_H */ 1385