/*
 * Copyright 2019 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SKVX_DEFINED
#define SKVX_DEFINED

// skvx::Vec<N,T> are SIMD vectors of N T's, a v1.5 successor to SkNx<N,T>.
//
// This time we're leaning a bit less on platform-specific intrinsics and a bit
// more on Clang/GCC vector extensions, but still keeping the option open to
// drop in platform-specific intrinsics, actually more easily than before.
//
// We've also fixed a few of the caveats that used to make SkNx awkward to work
// with across translation units. skvx::Vec<N,T> always has N*sizeof(T) size
// and alignment and is safe to use across translation units freely.
// (Ideally we'd only align to T, but that tanks ARMv7 NEON codegen.)

// Please try to keep this file independent of Skia headers.
#include <algorithm>         // std::min, std::max
#include <cmath>             // ceilf, floorf, truncf, roundf, sqrtf, etc.
#include <cstdint>           // intXX_t
#include <cstring>           // memcpy()
#include <initializer_list>  // std::initializer_list
#include <utility>           // std::index_sequence

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__)
    #include <immintrin.h>
#elif defined(__ARM_NEON)
    #include <arm_neon.h>
#elif defined(__wasm_simd128__)
    #include <wasm_simd128.h>
#endif

// To avoid ODR violations, all methods must be force-inlined...
#if defined(_MSC_VER)
    #define SKVX_ALWAYS_INLINE __forceinline
#else
    #define SKVX_ALWAYS_INLINE __attribute__((always_inline))
#endif

// ... and all standalone functions must be static. Please use these helpers:
#define SI    static inline
#define SIT   template <       typename T> SI
#define SIN   template <int N             > SI
#define SINT  template <int N, typename T> SI
#define SINTU template <int N, typename T, typename U, \
                        typename=std::enable_if_t<std::is_convertible<U,T>::value>> SI

namespace skvx {

// All Vec have the same simple memory layout, the same as `T vec[N]`.
template <int N, typename T>
struct alignas(N*sizeof(T)) Vec {
    static_assert((N & (N-1)) == 0,        "N must be a power of 2.");
    static_assert(sizeof(T) >= alignof(T), "What kind of crazy T is this?");

    Vec<N/2,T> lo, hi;

    // Methods belong here in the class declaration of Vec only if:
    //   - they must be here, like constructors or operator[];
    //   - they'll definitely never want a specialized implementation.
    // Other operations on Vec should be defined outside the type.

    SKVX_ALWAYS_INLINE Vec() = default;

    template <typename U, typename=std::enable_if_t<std::is_convertible<U,T>::value>>
    SKVX_ALWAYS_INLINE
    Vec(U x) : lo(x), hi(x) {}

    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) {
        T vals[N] = {0};
        memcpy(vals, xs.begin(), std::min(xs.size(), (size_t)N)*sizeof(T));

        lo = Vec<N/2,T>::Load(vals + 0);
        hi = Vec<N/2,T>::Load(vals + N/2);
    }

    SKVX_ALWAYS_INLINE T  operator[](int i) const { return i < N/2 ? lo[i] : hi[i-N/2]; }
    SKVX_ALWAYS_INLINE T& operator[](int i)       { return i < N/2 ? lo[i] : hi[i-N/2]; }

    SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {
        Vec v;
        memcpy(&v, ptr, sizeof(Vec));
        return v;
    }
    SKVX_ALWAYS_INLINE void store(void* ptr) const {
        memcpy(ptr, this, sizeof(Vec));
    }
};

template <typename T>
struct Vec<1,T> {
    T val;

    SKVX_ALWAYS_INLINE Vec() = default;

    template <typename U, typename=std::enable_if_t<std::is_convertible<U,T>::value>>
    SKVX_ALWAYS_INLINE
    Vec(U x) : val(x) {}

    SKVX_ALWAYS_INLINE Vec(std::initializer_list<T> xs) : val(xs.size() ? *xs.begin() : 0) {}

    SKVX_ALWAYS_INLINE T  operator[](int) const { return val; }
    SKVX_ALWAYS_INLINE T& operator[](int)       { return val; }

    SKVX_ALWAYS_INLINE static Vec Load(const void* ptr) {
        Vec v;
        memcpy(&v, ptr, sizeof(Vec));
        return v;
    }
    SKVX_ALWAYS_INLINE void store(void* ptr) const {
        memcpy(ptr, this, sizeof(Vec));
    }
};
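
// For orientation, a small usage sketch (an illustrative comment only, not part of the API;
// it just exercises the constructors, operator[], and Load()/store() above, plus the
// operators defined below):
//
//     skvx::Vec<4,float> v = {1,2,3,4};     // initializer_list fills lanes in order
//     skvx::Vec<4,float> two = 2.0f;        // a convertible scalar splats to every lane
//     v = v * two + 1.0f;                   // lane-wise arithmetic; bare scalars splat too
//     float first = v[0];                   // read a single lane
//     float buf[4];
//     v.store(buf);                         // store()/Load() are plain N*sizeof(T) memcpys
//     v = skvx::Vec<4,float>::Load(buf);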
// Ideally we'd only use bit_pun(), but until this file is always built as C++17 with constexpr if,
// we'll sometimes find need to use unchecked_bit_pun(). Please do check the call sites yourself!
template <typename D, typename S>
SI D unchecked_bit_pun(const S& s) {
    D d;
    memcpy(&d, &s, sizeof(D));
    return d;
}

template <typename D, typename S>
SI D bit_pun(const S& s) {
    static_assert(sizeof(D) == sizeof(S), "");
    return unchecked_bit_pun<D>(s);
}

// Translate from a value type T to its corresponding Mask, the result of a comparison.
template <typename T> struct Mask { using type = T; };
template <> struct Mask<float > { using type = int32_t; };
template <> struct Mask<double> { using type = int64_t; };
template <typename T> using M = typename Mask<T>::type;
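
// So, to illustrate (a sketch only, matching the comparison operators defined below):
// comparing two Vec<4,float> produces a Vec<4,int32_t> whose lanes are all-1 bits where
// the comparison held and all-0 bits where it did not.
//
//     skvx::Vec<4,float>   a = {1,2,3,4},
//                          b = {4,3,2,1};
//     skvx::Vec<4,int32_t> m = (a < b);     // ~> {~0, ~0, 0, 0}, i.e. {-1,-1,0,0}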
// Join two Vec<N,T> into one Vec<2N,T>.
SINT Vec<2*N,T> join(const Vec<N,T>& lo, const Vec<N,T>& hi) {
    Vec<2*N,T> v;
    v.lo = lo;
    v.hi = hi;
    return v;
}

// We have three strategies for implementing Vec operations:
//    1) lean on Clang/GCC vector extensions when available;
//    2) use map() to apply a scalar function lane-wise;
//    3) recurse on lo/hi to scalar portable implementations.
// We can slot in platform-specific implementations as overloads for particular Vec<N,T>,
// or often integrate them directly into the recursion of style 3), allowing fine control.

#if !defined(SKNX_NO_SIMD) && (defined(__clang__) || defined(__GNUC__))

// VExt<N,T> types have the same size as Vec<N,T> and support most operations directly.
#if defined(__clang__)
template <int N, typename T>
using VExt = T __attribute__((ext_vector_type(N)));

#elif defined(__GNUC__)
template <int N, typename T>
struct VExtHelper {
    typedef T __attribute__((vector_size(N*sizeof(T)))) type;
};

template <int N, typename T>
using VExt = typename VExtHelper<N,T>::type;

// For some reason some (new!) versions of GCC cannot seem to deduce N in the generic
// to_vec<N,T>() below for N=4 and T=float. This workaround seems to help...
SI Vec<4,float> to_vec(VExt<4,float> v) { return bit_pun<Vec<4,float>>(v); }
#endif

SINT VExt<N,T> to_vext(const Vec<N,T>& v) { return bit_pun<VExt<N,T>>(v); }
SINT Vec <N,T> to_vec(const VExt<N,T>& v) { return bit_pun<Vec <N,T>>(v); }

SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) {
    return to_vec<N,T>(to_vext(x) + to_vext(y));
}
SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) {
    return to_vec<N,T>(to_vext(x) - to_vext(y));
}
SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) {
    return to_vec<N,T>(to_vext(x) * to_vext(y));
}
SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) {
    return to_vec<N,T>(to_vext(x) / to_vext(y));
}

SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) {
    return to_vec<N,T>(to_vext(x) ^ to_vext(y));
}
SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) {
    return to_vec<N,T>(to_vext(x) & to_vext(y));
}
SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) {
    return to_vec<N,T>(to_vext(x) | to_vext(y));
}

SINT Vec<N,T> operator!(const Vec<N,T>& x) { return to_vec<N,T>(!to_vext(x)); }
SINT Vec<N,T> operator-(const Vec<N,T>& x) { return to_vec<N,T>(-to_vext(x)); }
SINT Vec<N,T> operator~(const Vec<N,T>& x) { return to_vec<N,T>(~to_vext(x)); }

SINT Vec<N,T> operator<<(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) << k); }
SINT Vec<N,T> operator>>(const Vec<N,T>& x, int k) { return to_vec<N,T>(to_vext(x) >> k); }

SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) {
    return bit_pun<Vec<N,M<T>>>(to_vext(x) == to_vext(y));
}
SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) {
    return bit_pun<Vec<N,M<T>>>(to_vext(x) != to_vext(y));
}
SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) {
    return bit_pun<Vec<N,M<T>>>(to_vext(x) <= to_vext(y));
}
SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) {
    return bit_pun<Vec<N,M<T>>>(to_vext(x) >= to_vext(y));
}
SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) {
    return bit_pun<Vec<N,M<T>>>(to_vext(x) <  to_vext(y));
}
SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) {
    return bit_pun<Vec<N,M<T>>>(to_vext(x) >  to_vext(y));
}

#else

// Either SKNX_NO_SIMD is defined, or Clang/GCC vector extensions are not available.
// We'll implement things portably with N==1 scalar implementations and recursion onto them.

// N == 1 scalar implementations.
SIT Vec<1,T> operator+(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val + y.val; }
SIT Vec<1,T> operator-(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val - y.val; }
SIT Vec<1,T> operator*(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val * y.val; }
SIT Vec<1,T> operator/(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val / y.val; }

SIT Vec<1,T> operator^(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val ^ y.val; }
SIT Vec<1,T> operator&(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val & y.val; }
SIT Vec<1,T> operator|(const Vec<1,T>& x, const Vec<1,T>& y) { return x.val | y.val; }

SIT Vec<1,T> operator!(const Vec<1,T>& x) { return !x.val; }
SIT Vec<1,T> operator-(const Vec<1,T>& x) { return -x.val; }
SIT Vec<1,T> operator~(const Vec<1,T>& x) { return ~x.val; }

SIT Vec<1,T> operator<<(const Vec<1,T>& x, int k) { return x.val << k; }
SIT Vec<1,T> operator>>(const Vec<1,T>& x, int k) { return x.val >> k; }

SIT Vec<1,M<T>> operator==(const Vec<1,T>& x, const Vec<1,T>& y) {
    return x.val == y.val ? ~0 : 0;
}
SIT Vec<1,M<T>> operator!=(const Vec<1,T>& x, const Vec<1,T>& y) {
    return x.val != y.val ? ~0 : 0;
}
SIT Vec<1,M<T>> operator<=(const Vec<1,T>& x, const Vec<1,T>& y) {
    return x.val <= y.val ? ~0 : 0;
}
SIT Vec<1,M<T>> operator>=(const Vec<1,T>& x, const Vec<1,T>& y) {
    return x.val >= y.val ? ~0 : 0;
}
SIT Vec<1,M<T>> operator< (const Vec<1,T>& x, const Vec<1,T>& y) {
    return x.val <  y.val ? ~0 : 0;
}
SIT Vec<1,M<T>> operator> (const Vec<1,T>& x, const Vec<1,T>& y) {
    return x.val >  y.val ? ~0 : 0;
}

// Recurse on lo/hi down to N==1 scalar implementations.
SINT Vec<N,T> operator+(const Vec<N,T>& x, const Vec<N,T>& y) {
    return join(x.lo + y.lo, x.hi + y.hi);
}
SINT Vec<N,T> operator-(const Vec<N,T>& x, const Vec<N,T>& y) {
    return join(x.lo - y.lo, x.hi - y.hi);
}
SINT Vec<N,T> operator*(const Vec<N,T>& x, const Vec<N,T>& y) {
    return join(x.lo * y.lo, x.hi * y.hi);
}
SINT Vec<N,T> operator/(const Vec<N,T>& x, const Vec<N,T>& y) {
    return join(x.lo / y.lo, x.hi / y.hi);
}

SINT Vec<N,T> operator^(const Vec<N,T>& x, const Vec<N,T>& y) {
    return join(x.lo ^ y.lo, x.hi ^ y.hi);
}
SINT Vec<N,T> operator&(const Vec<N,T>& x, const Vec<N,T>& y) {
    return join(x.lo & y.lo, x.hi & y.hi);
}
SINT Vec<N,T> operator|(const Vec<N,T>& x, const Vec<N,T>& y) {
    return join(x.lo | y.lo, x.hi | y.hi);
}

SINT Vec<N,T> operator!(const Vec<N,T>& x) { return join(!x.lo, !x.hi); }
SINT Vec<N,T> operator-(const Vec<N,T>& x) { return join(-x.lo, -x.hi); }
SINT Vec<N,T> operator~(const Vec<N,T>& x) { return join(~x.lo, ~x.hi); }

SINT Vec<N,T> operator<<(const Vec<N,T>& x, int k) { return join(x.lo << k, x.hi << k); }
SINT Vec<N,T> operator>>(const Vec<N,T>& x, int k) { return join(x.lo >> k, x.hi >> k); }

SINT Vec<N,M<T>> operator==(const Vec<N,T>& x, const Vec<N,T>& y) {
    return join(x.lo == y.lo, x.hi == y.hi);
}
SINT Vec<N,M<T>> operator!=(const Vec<N,T>& x, const Vec<N,T>& y) {
    return join(x.lo != y.lo, x.hi != y.hi);
}
SINT Vec<N,M<T>> operator<=(const Vec<N,T>& x, const Vec<N,T>& y) {
    return join(x.lo <= y.lo, x.hi <= y.hi);
}
SINT Vec<N,M<T>> operator>=(const Vec<N,T>& x, const Vec<N,T>& y) {
    return join(x.lo >= y.lo, x.hi >= y.hi);
}
SINT Vec<N,M<T>> operator< (const Vec<N,T>& x, const Vec<N,T>& y) {
    return join(x.lo <  y.lo, x.hi <  y.hi);
}
SINT Vec<N,M<T>> operator> (const Vec<N,T>& x, const Vec<N,T>& y) {
    return join(x.lo >  y.lo, x.hi >  y.hi);
}
#endif
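
// (A concrete sketch of strategy 3 above: with SKNX_NO_SIMD defined, or without vector
// extensions, an expression like x + y on a Vec<4,float> expands to
//     join(x.lo + y.lo, x.hi + y.hi)        // two Vec<2,float> adds...
// which recurses again on each half until it bottoms out in the Vec<1,T> overloads.)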
// Scalar/vector operations splat the scalar to a vector.
SINTU Vec<N,T>    operator+ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) +  y; }
SINTU Vec<N,T>    operator- (U x, const Vec<N,T>& y) { return Vec<N,T>(x) -  y; }
SINTU Vec<N,T>    operator* (U x, const Vec<N,T>& y) { return Vec<N,T>(x) *  y; }
SINTU Vec<N,T>    operator/ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) /  y; }
SINTU Vec<N,T>    operator^ (U x, const Vec<N,T>& y) { return Vec<N,T>(x) ^  y; }
SINTU Vec<N,T>    operator& (U x, const Vec<N,T>& y) { return Vec<N,T>(x) &  y; }
SINTU Vec<N,T>    operator| (U x, const Vec<N,T>& y) { return Vec<N,T>(x) |  y; }
SINTU Vec<N,M<T>> operator==(U x, const Vec<N,T>& y) { return Vec<N,T>(x) == y; }
SINTU Vec<N,M<T>> operator!=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) != y; }
SINTU Vec<N,M<T>> operator<=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) <= y; }
SINTU Vec<N,M<T>> operator>=(U x, const Vec<N,T>& y) { return Vec<N,T>(x) >= y; }
SINTU Vec<N,M<T>> operator< (U x, const Vec<N,T>& y) { return Vec<N,T>(x) <  y; }
SINTU Vec<N,M<T>> operator> (U x, const Vec<N,T>& y) { return Vec<N,T>(x) >  y; }

SINTU Vec<N,T>    operator+ (const Vec<N,T>& x, U y) { return x +  Vec<N,T>(y); }
SINTU Vec<N,T>    operator- (const Vec<N,T>& x, U y) { return x -  Vec<N,T>(y); }
SINTU Vec<N,T>    operator* (const Vec<N,T>& x, U y) { return x *  Vec<N,T>(y); }
SINTU Vec<N,T>    operator/ (const Vec<N,T>& x, U y) { return x /  Vec<N,T>(y); }
SINTU Vec<N,T>    operator^ (const Vec<N,T>& x, U y) { return x ^  Vec<N,T>(y); }
SINTU Vec<N,T>    operator& (const Vec<N,T>& x, U y) { return x &  Vec<N,T>(y); }
SINTU Vec<N,T>    operator| (const Vec<N,T>& x, U y) { return x |  Vec<N,T>(y); }
SINTU Vec<N,M<T>> operator==(const Vec<N,T>& x, U y) { return x == Vec<N,T>(y); }
SINTU Vec<N,M<T>> operator!=(const Vec<N,T>& x, U y) { return x != Vec<N,T>(y); }
SINTU Vec<N,M<T>> operator<=(const Vec<N,T>& x, U y) { return x <= Vec<N,T>(y); }
SINTU Vec<N,M<T>> operator>=(const Vec<N,T>& x, U y) { return x >= Vec<N,T>(y); }
SINTU Vec<N,M<T>> operator< (const Vec<N,T>& x, U y) { return x <  Vec<N,T>(y); }
SINTU Vec<N,M<T>> operator> (const Vec<N,T>& x, U y) { return x >  Vec<N,T>(y); }
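
// So scalars mix freely with vectors on either side; a small illustrative sketch:
//
//     skvx::Vec<4,int> v = {1,2,3,4};
//     v = 10 * v + 5;                       // 10 and 5 splat to Vec<4,int> first: {15,25,35,45}
//     auto m = (v > 20);                    // Vec<4,int> mask: {0,~0,~0,~0}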
SINT Vec<N,T>& operator+=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x + y); }
SINT Vec<N,T>& operator-=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x - y); }
SINT Vec<N,T>& operator*=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x * y); }
SINT Vec<N,T>& operator/=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x / y); }
SINT Vec<N,T>& operator^=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x ^ y); }
SINT Vec<N,T>& operator&=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x & y); }
SINT Vec<N,T>& operator|=(Vec<N,T>& x, const Vec<N,T>& y) { return (x = x | y); }

SINTU Vec<N,T>& operator+=(Vec<N,T>& x, U y) { return (x = x + Vec<N,T>(y)); }
SINTU Vec<N,T>& operator-=(Vec<N,T>& x, U y) { return (x = x - Vec<N,T>(y)); }
SINTU Vec<N,T>& operator*=(Vec<N,T>& x, U y) { return (x = x * Vec<N,T>(y)); }
SINTU Vec<N,T>& operator/=(Vec<N,T>& x, U y) { return (x = x / Vec<N,T>(y)); }
SINTU Vec<N,T>& operator^=(Vec<N,T>& x, U y) { return (x = x ^ Vec<N,T>(y)); }
SINTU Vec<N,T>& operator&=(Vec<N,T>& x, U y) { return (x = x & Vec<N,T>(y)); }
SINTU Vec<N,T>& operator|=(Vec<N,T>& x, U y) { return (x = x | Vec<N,T>(y)); }

SINT Vec<N,T>& operator<<=(Vec<N,T>& x, int bits) { return (x = x << bits); }
SINT Vec<N,T>& operator>>=(Vec<N,T>& x, int bits) { return (x = x >> bits); }

// Some operations we want are not expressible with Clang/GCC vector extensions.

// Clang can reason about naive_if_then_else() and optimize through it better
// than if_then_else(), so it's sometimes useful to call it directly when we
// think an entire expression should optimize away, e.g. min()/max().
SINT Vec<N,T> naive_if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
    return bit_pun<Vec<N,T>>(( cond & bit_pun<Vec<N, M<T>>>(t)) |
                             (~cond & bit_pun<Vec<N, M<T>>>(e)) );
}

SIT Vec<1,T> if_then_else(const Vec<1,M<T>>& cond, const Vec<1,T>& t, const Vec<1,T>& e) {
    // In practice this scalar implementation is unlikely to be used. See next if_then_else().
    return bit_pun<Vec<1,T>>(( cond & bit_pun<Vec<1, M<T>>>(t)) |
                             (~cond & bit_pun<Vec<1, M<T>>>(e)) );
}
SINT Vec<N,T> if_then_else(const Vec<N,M<T>>& cond, const Vec<N,T>& t, const Vec<N,T>& e) {
    // Specializations inline here so they can generalize what types they apply to.
    // (This header is used in C++14 contexts, so we have to kind of fake constexpr if.)
#if defined(__AVX2__)
    if /*constexpr*/ (N*sizeof(T) == 32) {
        return unchecked_bit_pun<Vec<N,T>>(_mm256_blendv_epi8(unchecked_bit_pun<__m256i>(e),
                                                              unchecked_bit_pun<__m256i>(t),
                                                              unchecked_bit_pun<__m256i>(cond)));
    }
#endif
#if defined(__SSE4_1__)
    if /*constexpr*/ (N*sizeof(T) == 16) {
        return unchecked_bit_pun<Vec<N,T>>(_mm_blendv_epi8(unchecked_bit_pun<__m128i>(e),
                                                           unchecked_bit_pun<__m128i>(t),
                                                           unchecked_bit_pun<__m128i>(cond)));
    }
#endif
#if defined(__ARM_NEON)
    if /*constexpr*/ (N*sizeof(T) == 16) {
        return unchecked_bit_pun<Vec<N,T>>(vbslq_u8(unchecked_bit_pun<uint8x16_t>(cond),
                                                    unchecked_bit_pun<uint8x16_t>(t),
                                                    unchecked_bit_pun<uint8x16_t>(e)));
    }
#endif
    // Recurse for large vectors to try to hit the specializations above.
    if /*constexpr*/ (N*sizeof(T) > 16) {
        return join(if_then_else(cond.lo, t.lo, e.lo),
                    if_then_else(cond.hi, t.hi, e.hi));
    }
    // This default can lead to better code than recursing onto scalars.
    return naive_if_then_else(cond, t, e);
}

SIT  bool any(const Vec<1,T>& x) { return x.val != 0; }
SINT bool any(const Vec<N,T>& x) {
#if defined(__wasm_simd128__)
    if constexpr (N == 4 && sizeof(T) == 4) {
        return wasm_i32x4_any_true(unchecked_bit_pun<VExt<4,int>>(x));
    }
#endif
    return any(x.lo)
        || any(x.hi);
}

SIT  bool all(const Vec<1,T>& x) { return x.val != 0; }
SINT bool all(const Vec<N,T>& x) {
#if defined(__AVX2__)
    if /*constexpr*/ (N*sizeof(T) == 32) {
        return _mm256_testc_si256(unchecked_bit_pun<__m256i>(x),
                                  _mm256_set1_epi32(-1));
    }
#endif
#if defined(__SSE4_1__)
    if /*constexpr*/ (N*sizeof(T) == 16) {
        return _mm_testc_si128(unchecked_bit_pun<__m128i>(x),
                               _mm_set1_epi32(-1));
    }
#endif
#if defined(__wasm_simd128__)
    if /*constexpr*/ (N == 4 && sizeof(T) == 4) {
        return wasm_i32x4_all_true(unchecked_bit_pun<VExt<4,int>>(x));
    }
#endif
    return all(x.lo)
        && all(x.hi);
}
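
// Putting those pieces together, comparison masks feed if_then_else() and any()/all();
// a hedged usage sketch (illustrative only):
//
//     skvx::Vec<4,float> x = {-1.0f, 2.0f, -3.0f, 4.0f};
//     auto clamped = if_then_else(x < 0.0f, skvx::Vec<4,float>(0.0f), x);  // ~> {0,2,0,4}
//     bool any_neg = any(x < 0.0f);                                        // true
//     bool all_neg = all(x < 0.0f);                                        // false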
// cast() Vec<N,S> to Vec<N,D>, as if applying a C-cast to each lane.
// TODO: implement with map()?
template <typename D, typename S>
SI Vec<1,D> cast(const Vec<1,S>& src) { return (D)src.val; }

template <typename D, int N, typename S>
SI Vec<N,D> cast(const Vec<N,S>& src) {
#if !defined(SKNX_NO_SIMD) && defined(__clang__)
    return to_vec(__builtin_convertvector(to_vext(src), VExt<N,D>));
#else
    return join(cast<D>(src.lo), cast<D>(src.hi));
#endif
}

// min/max match logic of std::min/std::max, which is important when NaN is involved.
SIT  T min(const Vec<1,T>& x) { return x.val; }
SIT  T max(const Vec<1,T>& x) { return x.val; }
SINT T min(const Vec<N,T>& x) { return std::min(min(x.lo), min(x.hi)); }
SINT T max(const Vec<N,T>& x) { return std::max(max(x.lo), max(x.hi)); }

SINT Vec<N,T> min(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(y < x, y, x); }
SINT Vec<N,T> max(const Vec<N,T>& x, const Vec<N,T>& y) { return naive_if_then_else(x < y, y, x); }

SINTU Vec<N,T> min(const Vec<N,T>& x, U y) { return min(x, Vec<N,T>(y)); }
SINTU Vec<N,T> max(const Vec<N,T>& x, U y) { return max(x, Vec<N,T>(y)); }
SINTU Vec<N,T> min(U x, const Vec<N,T>& y) { return min(Vec<N,T>(x), y); }
SINTU Vec<N,T> max(U x, const Vec<N,T>& y) { return max(Vec<N,T>(x), y); }

// pin matches the logic of SkTPin, which is important when NaN is involved. It always returns
// values in the range lo..hi, and if x is NaN, it returns lo.
SINT Vec<N,T> pin(const Vec<N,T>& x, const Vec<N,T>& lo, const Vec<N,T>& hi) {
    return max(lo, min(x, hi));
}
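
// A worked example of why that argument order matters with NaN (illustrative only):
//
//     skvx::Vec<4,float> x = {0.25f, NAN, 5.0f, -1.0f};
//     pin(x, skvx::Vec<4,float>(0.0f), skvx::Vec<4,float>(1.0f));   // ~> {0.25, 0, 1, 0}
//
// Any comparison against NaN is false, so min()/max() above return their first argument for
// NaN lanes, and max(lo, min(x,hi)) lands on lo, as the pin() comment promises.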
// Shuffle values from a vector pretty arbitrarily:
//    skvx::Vec<4,float> rgba = {R,G,B,A};
//    shuffle<2,1,0,3>        (rgba) ~> {B,G,R,A}
//    shuffle<2,1>            (rgba) ~> {B,G}
//    shuffle<2,1,2,1,2,1,2,1>(rgba) ~> {B,G,B,G,B,G,B,G}
//    shuffle<3,3,3,3>        (rgba) ~> {A,A,A,A}
// The only real restriction is that the output also be a legal N=power-of-two skvx::Vec.
template <int... Ix, int N, typename T>
SI Vec<sizeof...(Ix),T> shuffle(const Vec<N,T>& x) {
#if !defined(SKNX_NO_SIMD) && defined(__clang__)
    // TODO: can we just always use { x[Ix]... }?
    return to_vec<sizeof...(Ix),T>(__builtin_shufflevector(to_vext(x), to_vext(x), Ix...));
#else
    return { x[Ix]... };
#endif
}

// Call map(fn, x) for a vector with fn() applied to each lane of x, { fn(x[0]), fn(x[1]), ... },
// or map(fn, x,y) for a vector of fn(x[i], y[i]), etc.

template <typename Fn, typename... Args, size_t... I>
SI auto map(std::index_sequence<I...>,
            Fn&& fn, const Args&... args) -> skvx::Vec<sizeof...(I), decltype(fn(args[0]...))> {
    auto lane = [&](size_t i)
#if defined(__clang__)
    // CFI, specifically -fsanitize=cfi-icall, seems to give a false positive here,
    // with errors like "control flow integrity check for type 'float (float)
    // noexcept' failed during indirect function call... note: sqrtf.cfi_jt defined
    // here". But we can be quite sure fn is the right type: it's all inferred!
    // So, stifle CFI in this function.
    __attribute__((no_sanitize("cfi")))
#endif
    { return fn(args[i]...); };

    return { lane(I)... };
}

template <typename Fn, int N, typename T, typename... Rest>
auto map(Fn&& fn, const Vec<N,T>& first, const Rest&... rest) {
    // Derive an {0...N-1} index_sequence from the size of the first arg: N lanes in, N lanes out.
    return map(std::make_index_sequence<N>{}, fn, first,rest...);
}

SIN Vec<N,float>  ceil(const Vec<N,float>& x) { return map( ceilf, x); }
SIN Vec<N,float> floor(const Vec<N,float>& x) { return map(floorf, x); }
SIN Vec<N,float> trunc(const Vec<N,float>& x) { return map(truncf, x); }
SIN Vec<N,float> round(const Vec<N,float>& x) { return map(roundf, x); }
SIN Vec<N,float>  sqrt(const Vec<N,float>& x) { return map( sqrtf, x); }
SIN Vec<N,float>   abs(const Vec<N,float>& x) { return map( fabsf, x); }
SIN Vec<N,float>   fma(const Vec<N,float>& x,
                       const Vec<N,float>& y,
                       const Vec<N,float>& z) {
    // I don't understand why Clang's codegen is terrible if we write map(fmaf, x,y,z) directly.
    auto fn = [](float x, float y, float z) { return fmaf(x,y,z); };
    return map(fn, x,y,z);
}

SI Vec<1,int> lrint(const Vec<1,float>& x) {
    return (int)lrintf(x.val);
}
SIN Vec<N,int> lrint(const Vec<N,float>& x) {
#if defined(__AVX__)
    if /*constexpr*/ (N == 8) {
        return unchecked_bit_pun<Vec<N,int>>(_mm256_cvtps_epi32(unchecked_bit_pun<__m256>(x)));
    }
#endif
#if defined(__SSE__)
    if /*constexpr*/ (N == 4) {
        return unchecked_bit_pun<Vec<N,int>>(_mm_cvtps_epi32(unchecked_bit_pun<__m128>(x)));
    }
#endif
    return join(lrint(x.lo),
                lrint(x.hi));
}

SIN Vec<N,float> fract(const Vec<N,float>& x) { return x - floor(x); }

// The default logic for to_half/from_half is borrowed from skcms,
// and assumes inputs are finite, treating/flushing denorm half floats as/to zero.
// Key constants to watch for:
//   - a float is 32-bit, 1-8-23 sign-exponent-mantissa, with 127 exponent bias;
//   - a half  is 16-bit, 1-5-10 sign-exponent-mantissa, with  15 exponent bias.
SIN Vec<N,uint16_t> to_half_finite_ftz(const Vec<N,float>& x) {
    Vec<N,uint32_t> sem = bit_pun<Vec<N,uint32_t>>(x),
                    s   = sem & 0x8000'0000,
                    em  = sem ^ s,
              is_denorm = em < 0x3880'0000;
    return cast<uint16_t>(if_then_else(is_denorm, Vec<N,uint32_t>(0)
                                                , (s>>16) + (em>>13) - ((127-15)<<10)));
}
SIN Vec<N,float> from_half_finite_ftz(const Vec<N,uint16_t>& x) {
    Vec<N,uint32_t> wide = cast<uint32_t>(x),
                      s  = wide & 0x8000,
                      em = wide ^ s;
    auto is_denorm = bit_pun<Vec<N,int32_t>>(em < 0x0400);
    return if_then_else(is_denorm, Vec<N,float>(0)
                                 , bit_pun<Vec<N,float>>( (s<<16) + (em<<13) + ((127-15)<<23) ));
}
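
// A worked lane, to sanity-check the to_half bit arithmetic above (finite, non-denorm input):
//     1.0f has bits 0x3f80'0000, so s = 0 and em = 0x3f80'0000;
//     em>>13 = 0x0001'fc00; subtracting the rebias (127-15)<<10 = 0x0001'c000 leaves 0x3c00,
//     which is exactly half-precision 1.0 (sign 0, exponent 15, mantissa 0).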
// Like if_then_else(), these N=1 base cases won't actually be used unless explicitly called.
SI Vec<1,uint16_t> to_half(const Vec<1,float>&    x) { return   to_half_finite_ftz(x); }
SI Vec<1,float>  from_half(const Vec<1,uint16_t>& x) { return from_half_finite_ftz(x); }

SIN Vec<N,uint16_t> to_half(const Vec<N,float>& x) {
#if defined(__F16C__)
    if /*constexpr*/ (N == 8) {
        return unchecked_bit_pun<Vec<N,uint16_t>>(_mm256_cvtps_ph(unchecked_bit_pun<__m256>(x),
                                                                  _MM_FROUND_CUR_DIRECTION));
    }
#endif
#if defined(__aarch64__)
    if /*constexpr*/ (N == 4) {
        return unchecked_bit_pun<Vec<N,uint16_t>>(vcvt_f16_f32(unchecked_bit_pun<float32x4_t>(x)));
    }
#endif
    if /*constexpr*/ (N > 4) {
        return join(to_half(x.lo),
                    to_half(x.hi));
    }
    return to_half_finite_ftz(x);
}

SIN Vec<N,float> from_half(const Vec<N,uint16_t>& x) {
#if defined(__F16C__)
    if /*constexpr*/ (N == 8) {
        return unchecked_bit_pun<Vec<N,float>>(_mm256_cvtph_ps(unchecked_bit_pun<__m128i>(x)));
    }
#endif
#if defined(__aarch64__)
    if /*constexpr*/ (N == 4) {
        return unchecked_bit_pun<Vec<N,float>>(vcvt_f32_f16(unchecked_bit_pun<float16x4_t>(x)));
    }
#endif
    if /*constexpr*/ (N > 4) {
        return join(from_half(x.lo),
                    from_half(x.hi));
    }
    return from_half_finite_ftz(x);
}

// div255(x) = (x + 127) / 255 is a bit-exact rounding divide-by-255, packing down to 8-bit.
SIN Vec<N,uint8_t> div255(const Vec<N,uint16_t>& x) {
    return cast<uint8_t>( (x+127)/255 );
}

// approx_scale(x,y) approximates div255(cast<uint16_t>(x)*cast<uint16_t>(y)) within a bit,
// and is always perfect when x or y is 0 or 255.
SIN Vec<N,uint8_t> approx_scale(const Vec<N,uint8_t>& x, const Vec<N,uint8_t>& y) {
    // All of (x*y+x)/256, (x*y+y)/256, and (x*y+255)/256 meet the criteria above.
    // We happen to have historically picked (x*y+x)/256.
    auto X = cast<uint16_t>(x),
         Y = cast<uint16_t>(y);
    return cast<uint8_t>( (X*Y+X)/256 );
}

#if !defined(SKNX_NO_SIMD) && defined(__ARM_NEON)
// With NEON we can do eight u8*u8 -> u16 in one instruction, vmull_u8 (read, mul-long).
SI Vec<8,uint16_t> mull(const Vec<8,uint8_t>& x,
                        const Vec<8,uint8_t>& y) {
    return to_vec<8,uint16_t>(vmull_u8(to_vext(x),
                                       to_vext(y)));
}

SIN std::enable_if_t<(N < 8), Vec<N,uint16_t>> mull(const Vec<N,uint8_t>& x,
                                                    const Vec<N,uint8_t>& y) {
    // N < 8 --> double up data until N == 8, returning the part we need.
    return mull(join(x,x),
                join(y,y)).lo;
}

SIN std::enable_if_t<(N > 8), Vec<N,uint16_t>> mull(const Vec<N,uint8_t>& x,
                                                    const Vec<N,uint8_t>& y) {
    // N > 8 --> usual join(lo,hi) strategy to recurse down to N == 8.
    return join(mull(x.lo, y.lo),
                mull(x.hi, y.hi));
}
#else
// Nothing special when we don't have NEON... just cast up to 16-bit and multiply.
SIN Vec<N,uint16_t> mull(const Vec<N,uint8_t>& x,
                         const Vec<N,uint8_t>& y) {
    return cast<uint16_t>(x)
         * cast<uint16_t>(y);
}
#endif

}  // namespace skvx

#undef SINTU
#undef SINT
#undef SIN
#undef SIT
#undef SI
#undef SKVX_ALWAYS_INLINE

#endif//SKVX_DEFINED