1/**************************************************************************** 2 * Copyright (C) 2017 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 ****************************************************************************/ 23#if !defined(__SIMD_LIB_AVX_HPP__) 24#error Do not include this file directly, use "simdlib.hpp" instead. 
#endif

//============================================================================
// SIMD128 AVX (1) implementation
//============================================================================

// The SIMD_*WRAPPER* macros below forward a generically named operation
// (e.g. add_ps) to the matching _mm_* 128-bit intrinsic, varying only in the
// operand type (Float / Integer / Double), arity, and whether an immediate
// template parameter is appended as the intrinsic's last argument.
// Variants ending in an underscore take the intrinsic name explicitly
// instead of deriving it by pasting _mm_##op.

// Float op(Float) -> _mm_<op>(a)
#define SIMD_WRAPPER_1(op) \
    static SIMDINLINE Float SIMDCALL op(Float a) { return _mm_##op(a); }

// Float op(Float, Float) -> _mm_<op>(a, b)
#define SIMD_WRAPPER_2(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm_##op(a, b); }

// Double op(Double, Double) -> _mm_<op>(a, b)
#define SIMD_DWRAPPER_2(op) \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm_##op(a, b); }

// Float op<ImmT>(Float, Float) -> _mm_<op>(a, b, ImmT)
#define SIMD_WRAPPER_2I(op)                               \
    template <int ImmT>                                   \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {                                                     \
        return _mm_##op(a, b, ImmT);                      \
    }

// Double op<ImmT>(Double, Double) -> _mm_<op>(a, b, ImmT)
#define SIMD_DWRAPPER_2I(op)                                 \
    template <int ImmT>                                      \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {                                                        \
        return _mm_##op(a, b, ImmT);                         \
    }

// Float op(Float, Float, Float) -> _mm_<op>(a, b, c)
#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); }

// Integer op(Integer) -> _mm_<op>(a)
#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm_##op(a); }

// Integer op<ImmT>(Integer) -> <intrin>(a, ImmT), with explicit intrinsic name
#define SIMD_IWRAPPER_1I_(op, intrin)                \
    template <int ImmT>                              \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {                                                \
        return intrin(a, ImmT);                      \
    }
// Integer op<ImmT>(Integer) -> _mm_<op>(a, ImmT)
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)

// Integer op(Integer, Integer) -> <intrin>(a, b), with explicit intrinsic name
#define SIMD_IWRAPPER_2_(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return intrin(a, b); }

// Integer op(Integer, Integer) -> _mm_<op>(a, b)
#define SIMD_IWRAPPER_2(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm_##op(a, b); }

// Integer op(Integer, Integer) implemented via a *float* intrinsic:
// operands are bit-cast to Float, <intrin> applied, result cast back.
#define SIMD_IFWRAPPER_2(op, intrin)                            \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {                                                           \
        return castps_si(intrin(castsi_ps(a), castsi_ps(b)));   \
    }

// Integer op<ImmT>(Integer, Integer) -> _mm_<op>(a, b, ImmT)
#define SIMD_IWRAPPER_2I(op)                                  \
    template <int ImmT>                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {                                                         \
        return _mm_##op(a, b, ImmT);                          \
    }

87//----------------------------------------------------------------------- 88// Single precision floating point arithmetic operations 89//----------------------------------------------------------------------- 90SIMD_WRAPPER_2(add_ps); // return a + b 91SIMD_WRAPPER_2(div_ps); // return a / b 92SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b 93SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b 94SIMD_WRAPPER_2(mul_ps); // return a * b 95SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a 96SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a) 97SIMD_WRAPPER_2(sub_ps); // return a - b 98 99static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c 100{ 101 return add_ps(mul_ps(a, b), c); 102} 103static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c 104{ 105 return sub_ps(mul_ps(a, b), c); 106} 107 108template <RoundMode RMT> 109static SIMDINLINE Float SIMDCALL round_ps(Float a) 110{ 111 return _mm_round_ps(a, static_cast<int>(RMT)); 112} 113 114static SIMDINLINE Float SIMDCALL ceil_ps(Float a) 115{ 116 return round_ps<RoundMode::CEIL_NOEXC>(a); 117} 118static SIMDINLINE Float SIMDCALL floor_ps(Float a) 119{ 120 return round_ps<RoundMode::FLOOR_NOEXC>(a); 121} 122 123//----------------------------------------------------------------------- 124// Integer (various width) arithmetic operations 125//----------------------------------------------------------------------- 126SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) 127SIMD_IWRAPPER_2(add_epi32); // return a + b (int32) 128SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) 129SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 130SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) 131SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) 132SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) 133SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? 
a : b (uint32) 134SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32) 135 136// return (a * b) & 0xFFFFFFFF 137// 138// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, 139// and store the low 32 bits of the intermediate integers in dst. 140SIMD_IWRAPPER_2(mullo_epi32); 141SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32) 142SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64) 143SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) 144 145//----------------------------------------------------------------------- 146// Logical operations 147//----------------------------------------------------------------------- 148SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) 149SIMD_IWRAPPER_2_(and_si, _mm_and_si128); // return a & b (int) 150SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) 151SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b (int) 152SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) 153SIMD_IWRAPPER_2_(or_si, _mm_or_si128); // return a | b (int) 154SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) 155SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128); // return a ^ b (int) 156 157//----------------------------------------------------------------------- 158// Shift operations 159//----------------------------------------------------------------------- 160SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT 161SIMD_IWRAPPER_1I(slli_epi64); // return a << ImmT 162 163static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32) 164{ 165 int32_t a, count; 166 a = _mm_extract_epi32(vA, 0); 167 count = _mm_extract_epi32(vB, 0); 168 a <<= count; 169 vA = _mm_insert_epi32(vA, a, 0); 170 171 a = _mm_extract_epi32(vA, 1); 172 count = _mm_extract_epi32(vB, 1); 173 a <<= count; 174 vA = _mm_insert_epi32(vA, a, 1); 175 176 a = _mm_extract_epi32(vA, 2); 177 count = _mm_extract_epi32(vB, 2); 178 a <<= count; 179 vA = 
_mm_insert_epi32(vA, a, 2); 180 181 a = _mm_extract_epi32(vA, 3); 182 count = _mm_extract_epi32(vB, 3); 183 a <<= count; 184 vA = _mm_insert_epi32(vA, a, 3); 185 186 return vA; 187} 188 189SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32) 190SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32) 191SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint) 192 193static SIMDINLINE Integer SIMDCALL srl_epi64(Integer a, Integer n) 194{ 195 return _mm_srl_epi64(a, n); 196} 197 198template <int ImmT> // same as srli_si, but with Float cast to int 199static SIMDINLINE Float SIMDCALL srlisi_ps(Float a) 200{ 201 return castsi_ps(srli_si<ImmT>(castps_si(a))); 202} 203 204static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32) 205{ 206 int32_t a, count; 207 a = _mm_extract_epi32(vA, 0); 208 count = _mm_extract_epi32(vB, 0); 209 a >>= count; 210 vA = _mm_insert_epi32(vA, a, 0); 211 212 a = _mm_extract_epi32(vA, 1); 213 count = _mm_extract_epi32(vB, 1); 214 a >>= count; 215 vA = _mm_insert_epi32(vA, a, 1); 216 217 a = _mm_extract_epi32(vA, 2); 218 count = _mm_extract_epi32(vB, 2); 219 a >>= count; 220 vA = _mm_insert_epi32(vA, a, 2); 221 222 a = _mm_extract_epi32(vA, 3); 223 count = _mm_extract_epi32(vB, 3); 224 a >>= count; 225 vA = _mm_insert_epi32(vA, a, 3); 226 227 return vA; 228} 229 230//----------------------------------------------------------------------- 231// Conversion operations 232//----------------------------------------------------------------------- 233static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a) 234{ 235 return _mm_castpd_ps(a); 236} 237 238static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a) 239{ 240 return _mm_castps_si128(a); 241} 242 243static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a) 244{ 245 return _mm_castsi128_pd(a); 246} 247 248static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return 
*(Double*)(&a) 249{ 250 return _mm_castps_pd(a); 251} 252 253static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a) 254{ 255 return _mm_castsi128_ps(a); 256} 257 258static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float) 259{ 260 return _mm_cvtepi32_ps(a); 261} 262 263static SIMDINLINE int32_t SIMDCALL cvtsi128_si32(Integer a) // return a.v[0] 264{ 265 return _mm_cvtsi128_si32(a); 266} 267 268static SIMDINLINE Integer SIMDCALL cvtsi32_si128(int32_t n) // return a[0] = n, a[1]...a[3] = 0 269{ 270 return _mm_cvtsi32_si128(n); 271} 272 273SIMD_IWRAPPER_1(cvtepu8_epi16); // return (int16)a (uint8 --> int16) 274SIMD_IWRAPPER_1(cvtepu8_epi32); // return (int32)a (uint8 --> int32) 275SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a (uint16 --> int32) 276SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a (uint16 --> int64) 277SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a (uint32 --> int64) 278 279static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32) 280{ 281 return _mm_cvtps_epi32(a); 282} 283 284static SIMDINLINE Integer SIMDCALL 285 cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32) 286{ 287 return _mm_cvttps_epi32(a); 288} 289 290//----------------------------------------------------------------------- 291// Comparison operations 292//----------------------------------------------------------------------- 293template <CompareType CmpTypeT> 294static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b 295{ 296 return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT)); 297} 298static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) 299{ 300 return cmp_ps<CompareType::LT_OQ>(a, b); 301} 302static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) 303{ 304 return cmp_ps<CompareType::GT_OQ>(a, b); 305} 306static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) 307{ 308 return cmp_ps<CompareType::NEQ_OQ>(a, b); 
309} 310static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) 311{ 312 return cmp_ps<CompareType::EQ_OQ>(a, b); 313} 314static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) 315{ 316 return cmp_ps<CompareType::GE_OQ>(a, b); 317} 318static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) 319{ 320 return cmp_ps<CompareType::LE_OQ>(a, b); 321} 322 323SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) 324SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) 325SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) 326SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) 327SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) 328SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) 329SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) 330SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) 331SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32) 332 333static SIMDINLINE bool SIMDCALL testz_ps(Float a, 334 Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float) 335{ 336 return 0 != _mm_testz_ps(a, b); 337} 338 339static SIMDINLINE bool SIMDCALL testz_si(Integer a, 340 Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int) 341{ 342 return 0 != _mm_testz_si128(a, b); 343} 344 345//----------------------------------------------------------------------- 346// Blend / shuffle / permute operations 347//----------------------------------------------------------------------- 348SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) 349SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) 350 351static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, 352 Integer b, 353 Float mask) // return mask ? b : a (int) 354{ 355 return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask)); 356} 357 358static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, 359 Integer b, 360 Integer mask) // return mask ? 
b : a (int) 361{ 362 return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask))); 363} 364 365static SIMDINLINE Float SIMDCALL 366 broadcast_ss(float const* p) // return *p (all elements in vector get same value) 367{ 368 return _mm_broadcast_ss(p); 369} 370 371SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm_packs_epi16 and _mm512_packs_epi16 372SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm_packs_epi32 and _mm512_packs_epi32 373SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16 374SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32 375 376static SIMDINLINE Integer SIMDCALL 377 permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float) 378{ 379 return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz)); 380} 381 382static SIMDINLINE Float SIMDCALL 383 permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float) 384{ 385 return _mm_permutevar_ps(a, swiz); 386} 387 388SIMD_IWRAPPER_1I(shuffle_epi32); 389 390template <int ImmT> 391static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete; 392 393SIMD_IWRAPPER_2(shuffle_epi8); 394SIMD_DWRAPPER_2I(shuffle_pd); 395SIMD_WRAPPER_2I(shuffle_ps); 396SIMD_IWRAPPER_2(unpackhi_epi16); 397 398// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps); 399static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b) 400{ 401 return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b))); 402} 403 404SIMD_IWRAPPER_2(unpackhi_epi64); 405SIMD_IWRAPPER_2(unpackhi_epi8); 406SIMD_DWRAPPER_2(unpackhi_pd); 407SIMD_WRAPPER_2(unpackhi_ps); 408SIMD_IWRAPPER_2(unpacklo_epi16); 409SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps); 410SIMD_IWRAPPER_2(unpacklo_epi64); 411SIMD_IWRAPPER_2(unpacklo_epi8); 412SIMD_DWRAPPER_2(unpacklo_pd); 413SIMD_WRAPPER_2(unpacklo_ps); 414 
415//----------------------------------------------------------------------- 416// Load / store operations 417//----------------------------------------------------------------------- 418template <ScaleFactor ScaleT = ScaleFactor::SF_1> 419static SIMDINLINE Float SIMDCALL 420 i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) 421{ 422 uint32_t* pOffsets = (uint32_t*)&idx; 423 Float vResult; 424 float* pResult = (float*)&vResult; 425 for (uint32_t i = 0; i < SIMD_WIDTH; ++i) 426 { 427 uint32_t offset = pOffsets[i]; 428 offset = offset * static_cast<uint32_t>(ScaleT); 429 pResult[i] = *(float const*)(((uint8_t const*)p + offset)); 430 } 431 432 return vResult; 433} 434 435static SIMDINLINE Float SIMDCALL 436 load1_ps(float const* p) // return *p (broadcast 1 value to all elements) 437{ 438 return broadcast_ss(p); 439} 440 441static SIMDINLINE Float SIMDCALL 442 load_ps(float const* p) // return *p (loads SIMD width elements from memory) 443{ 444 return _mm_load_ps(p); 445} 446 447static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p 448{ 449 return _mm_load_si128(&p->v); 450} 451 452static SIMDINLINE Float SIMDCALL 453 loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem) 454{ 455 return _mm_loadu_ps(p); 456} 457 458static SIMDINLINE Integer SIMDCALL 459 loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem) 460{ 461 return _mm_lddqu_si128(&p->v); 462} 463 464// for each element: (mask & (1 << 31)) ? 
(i32gather_ps<ScaleT>(p, idx), mask = 0) : old 465template <ScaleFactor ScaleT = ScaleFactor::SF_1> 466static SIMDINLINE Float SIMDCALL 467 mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) 468{ 469 uint32_t* pOffsets = (uint32_t*)&idx; 470 Float vResult = old; 471 float* pResult = (float*)&vResult; 472 unsigned long index; 473 uint32_t umask = movemask_ps(mask); 474 while (_BitScanForward(&index, umask)) 475 { 476 umask &= ~(1 << index); 477 uint32_t offset = pOffsets[index]; 478 offset = offset * static_cast<uint32_t>(ScaleT); 479 pResult[index] = *(float const*)(((uint8_t const*)p + offset)); 480 } 481 482 return vResult; 483} 484 485static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src) 486{ 487 _mm_maskstore_ps(p, mask, src); 488} 489 490static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a) 491{ 492 return static_cast<uint32_t>(_mm_movemask_epi8(a)); 493} 494 495static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a) 496{ 497 return static_cast<uint32_t>(_mm_movemask_pd(a)); 498} 499static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a) 500{ 501 return static_cast<uint32_t>(_mm_movemask_ps(a)); 502} 503 504static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value) 505{ 506 return _mm_set1_epi32(i); 507} 508 509static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value) 510{ 511 return _mm_set1_epi8(i); 512} 513 514static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) 515{ 516 return _mm_set1_ps(f); 517} 518 519static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) 520{ 521 return _mm_setzero_ps(); 522} 523 524static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) 525{ 526 return _mm_setzero_si128(); 527} 528 529static SIMDINLINE void SIMDCALL 530 store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory) 531{ 532 _mm_store_ps(p, a); 533} 534 
535static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a 536{ 537 _mm_store_si128(&p->v, a); 538} 539 540static SIMDINLINE void SIMDCALL 541 storeu_si(Integer* p, Integer a) // *p = a (same as store_si but allows for unaligned mem) 542{ 543 _mm_storeu_si128(&p->v, a); 544} 545 546static SIMDINLINE void SIMDCALL 547 stream_ps(float* p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache) 548{ 549 _mm_stream_ps(p, a); 550} 551 552static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0) 553{ 554 return _mm_set_ps(in3, in2, in1, in0); 555} 556 557static SIMDINLINE Integer SIMDCALL set_epi32(int in3, int in2, int in1, int in0) 558{ 559 return _mm_set_epi32(in3, in2, in1, in0); 560} 561 562template <int ImmT> 563static SIMDINLINE float SIMDCALL extract_ps(Float a) 564{ 565 int tmp = _mm_extract_ps(a, ImmT); 566 return *reinterpret_cast<float*>(&tmp); 567} 568 569static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) 570{ 571 Integer vec = set1_epi32(mask); 572 const Integer bit = set_epi32(0x08, 0x04, 0x02, 0x01); 573 vec = and_si(vec, bit); 574 vec = cmplt_epi32(setzero_si(), vec); 575 return castsi_ps(vec); 576} 577 578#undef SIMD_WRAPPER_1 579#undef SIMD_WRAPPER_2 580#undef SIMD_DWRAPPER_2 581#undef SIMD_DWRAPPER_2I 582#undef SIMD_WRAPPER_2I 583#undef SIMD_WRAPPER_3 584#undef SIMD_IWRAPPER_1 585#undef SIMD_IWRAPPER_2 586#undef SIMD_IFWRAPPER_2 587#undef SIMD_IWRAPPER_2I 588#undef SIMD_IWRAPPER_1 589#undef SIMD_IWRAPPER_1I 590#undef SIMD_IWRAPPER_1I_ 591#undef SIMD_IWRAPPER_2 592#undef SIMD_IWRAPPER_2_ 593#undef SIMD_IWRAPPER_2I 594