1/**************************************************************************** 2* Copyright (C) 2017 Intel Corporation. All Rights Reserved. 3* 4* Permission is hereby granted, free of charge, to any person obtaining a 5* copy of this software and associated documentation files (the "Software"), 6* to deal in the Software without restriction, including without limitation 7* the rights to use, copy, modify, merge, publish, distribute, sublicense, 8* and/or sell copies of the Software, and to permit persons to whom the 9* Software is furnished to do so, subject to the following conditions: 10* 11* The above copyright notice and this permission notice (including the next 12* paragraph) shall be included in all copies or substantial portions of the 13* Software. 14* 15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21* IN THE SOFTWARE. 22****************************************************************************/ 23#if !defined(__SIMD_LIB_AVX_HPP__) 24#error Do not include this file directly, use "simdlib.hpp" instead. 25#endif 26 27//============================================================================ 28// SIMD128 AVX (1) implementation 29//============================================================================ 30 31#define SIMD_WRAPPER_1(op) \ 32 static SIMDINLINE Float SIMDCALL op(Float a) \ 33 {\ 34 return _mm_##op(a);\ 35 } 36 37#define SIMD_WRAPPER_2(op) \ 38 static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ 39 {\ 40 return _mm_##op(a, b);\ 41 } 42 43#define SIMD_DWRAPPER_2(op) \ 44 static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ 45 {\ 46 return _mm_##op(a, b);\ 47 } 48 49#define SIMD_WRAPPER_2I(op) \ 50 template<int ImmT>\ 51 static SIMDINLINE Float SIMDCALL op(Float a, Float b) \ 52 {\ 53 return _mm_##op(a, b, ImmT);\ 54 } 55 56#define SIMD_DWRAPPER_2I(op) \ 57 template<int ImmT>\ 58 static SIMDINLINE Double SIMDCALL op(Double a, Double b) \ 59 {\ 60 return _mm_##op(a, b, ImmT);\ 61 } 62 63#define SIMD_WRAPPER_3(op) \ 64 static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \ 65 {\ 66 return _mm_##op(a, b, c);\ 67 } 68 69#define SIMD_IWRAPPER_1(op) \ 70 static SIMDINLINE Integer SIMDCALL op(Integer a) \ 71 {\ 72 return _mm_##op(a);\ 73 } 74 75#define SIMD_IWRAPPER_1I_(op, intrin) \ 76 template<int ImmT> \ 77 static SIMDINLINE Integer SIMDCALL op(Integer a) \ 78 {\ 79 return intrin(a, ImmT);\ 80 } 81#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op) 82 83#define SIMD_IWRAPPER_2_(op, intrin) \ 84 static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ 85 {\ 86 return intrin(a, b);\ 87 } 88 89#define SIMD_IWRAPPER_2(op) \ 90 static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ 91 {\ 92 return _mm_##op(a, b);\ 93 } 94 95#define SIMD_IFWRAPPER_2(op, intrin) \ 96 static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ 97 {\ 98 return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\ 99 } 100 101#define SIMD_IWRAPPER_2I(op) \ 102 template<int ImmT>\ 103 static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \ 104 {\ 105 return _mm_##op(a, b, ImmT);\ 106 } 107 108//----------------------------------------------------------------------- 109// Single precision floating point arithmetic operations 110//----------------------------------------------------------------------- 111SIMD_WRAPPER_2(add_ps); // return a + b 112SIMD_WRAPPER_2(div_ps); // return a / b 113SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b 114SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b 115SIMD_WRAPPER_2(mul_ps); // return a * b 116SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a 117SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a) 118SIMD_WRAPPER_2(sub_ps); // return a - b 119 120static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c 121{ 122 return add_ps(mul_ps(a, b), c); 123} 124static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c 125{ 126 return sub_ps(mul_ps(a, b), c); 127} 128 129template <RoundMode RMT> 130static SIMDINLINE Float SIMDCALL round_ps(Float a) 131{ 132 return _mm_round_ps(a, static_cast<int>(RMT)); 133} 134 135static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); } 136static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); } 137 138//----------------------------------------------------------------------- 139// Integer (various width) arithmetic operations 140//----------------------------------------------------------------------- 141SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) 142SIMD_IWRAPPER_2(add_epi32); // return a + b (int32) 143SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) 144SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 145SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) 146SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) 147SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) 148SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32) 149SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32) 150 151// return (a * b) & 0xFFFFFFFF 152// 153// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, 154// and store the low 32 bits of the intermediate integers in dst. 155SIMD_IWRAPPER_2(mullo_epi32); 156SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32) 157SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64) 158SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) 159 160//----------------------------------------------------------------------- 161// Logical operations 162//----------------------------------------------------------------------- 163SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) 164SIMD_IWRAPPER_2_(and_si, _mm_and_si128); // return a & b (int) 165SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) 166SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b (int) 167SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) 168SIMD_IWRAPPER_2_(or_si, _mm_or_si128); // return a | b (int) 169SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) 170SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128); // return a ^ b (int) 171 172 173//----------------------------------------------------------------------- 174// Shift operations 175//----------------------------------------------------------------------- 176SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT 177 178static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32) 179{ 180 int32_t a, count; 181 a = _mm_extract_epi32(vA, 0); 182 count = _mm_extract_epi32(vB, 0); 183 a <<= count; 184 vA = _mm_insert_epi32(vA, a, 0); 185 186 a = _mm_extract_epi32(vA, 1); 187 count = _mm_extract_epi32(vB, 1); 188 a <<= count; 189 vA = _mm_insert_epi32(vA, a, 1); 190 191 a = _mm_extract_epi32(vA, 2); 192 count = _mm_extract_epi32(vB, 2); 193 a <<= count; 194 vA = _mm_insert_epi32(vA, a, 2); 195 196 a = _mm_extract_epi32(vA, 3); 197 count = _mm_extract_epi32(vB, 3); 198 a <<= count; 199 vA = _mm_insert_epi32(vA, a, 3); 200 201 return vA; 202} 203 204SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32) 205SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32) 206SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint) 207 208template<int ImmT> // same as srli_si, but with Float cast to int 209static SIMDINLINE Float SIMDCALL srlisi_ps(Float a) 210{ 211 return castsi_ps(srli_si<ImmT>(castps_si(a))); 212} 213 214static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32) 215{ 216 int32_t a, count; 217 a = _mm_extract_epi32(vA, 0); 218 count = _mm_extract_epi32(vB, 0); 219 a >>= count; 220 vA = _mm_insert_epi32(vA, a, 0); 221 222 a = _mm_extract_epi32(vA, 1); 223 count = _mm_extract_epi32(vB, 1); 224 a >>= count; 225 vA = _mm_insert_epi32(vA, a, 1); 226 227 a = _mm_extract_epi32(vA, 2); 228 count = _mm_extract_epi32(vB, 2); 229 a >>= count; 230 vA = _mm_insert_epi32(vA, a, 2); 231 232 a = _mm_extract_epi32(vA, 3); 233 count = _mm_extract_epi32(vB, 3); 234 a >>= count; 235 vA = _mm_insert_epi32(vA, a, 3); 236 237 return vA; 238} 239 240 241 242//----------------------------------------------------------------------- 243// Conversion operations 244//----------------------------------------------------------------------- 245static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a) 246{ 247 return _mm_castpd_ps(a); 248} 249 250static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a) 251{ 252 return _mm_castps_si128(a); 253} 254 255static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a) 256{ 257 return _mm_castsi128_pd(a); 258} 259 260static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a) 261{ 262 return _mm_castps_pd(a); 263} 264 265static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a) 266{ 267 return _mm_castsi128_ps(a); 268} 269 270static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float) 271{ 272 return _mm_cvtepi32_ps(a); 273} 274 275SIMD_IWRAPPER_1(cvtepu8_epi16); // return (int16)a (uint8 --> int16) 276SIMD_IWRAPPER_1(cvtepu8_epi32); // return (int32)a (uint8 --> int32) 277SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a (uint16 --> int32) 278SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a (uint16 --> int64) 279SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a (uint32 --> int64) 280 281static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32) 282{ 283 return _mm_cvtps_epi32(a); 284} 285 286static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32) 287{ 288 return _mm_cvttps_epi32(a); 289} 290 291//----------------------------------------------------------------------- 292// Comparison operations 293//----------------------------------------------------------------------- 294template<CompareType CmpTypeT> 295static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b 296{ 297 return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT)); 298} 299static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); } 300static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); } 301static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); } 302static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); } 303static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); } 304static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); } 305 306SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) 307SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) 308SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) 309SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) 310SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) 311SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) 312SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) 313SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) 314SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32) 315 316static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float) 317{ 318 return 0 != _mm_testz_ps(a, b); 319} 320 321static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int) 322{ 323 return 0 != _mm_testz_si128(a, b); 324} 325 326//----------------------------------------------------------------------- 327// Blend / shuffle / permute operations 328//----------------------------------------------------------------------- 329SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) 330SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) 331 332static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int) 333{ 334 return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask)); 335} 336 337static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int) 338{ 339 return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask))); 340} 341 342static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value) 343{ 344 return _mm_broadcast_ss(p); 345} 346 347SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm_packs_epi16 and _mm512_packs_epi16 348SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm_packs_epi32 and _mm512_packs_epi32 349SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16 350SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32 351 352static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float) 353{ 354 return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz)); 355} 356 357static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float) 358{ 359 return _mm_permutevar_ps(a, swiz); 360} 361 362SIMD_IWRAPPER_1I(shuffle_epi32); 363 364template<int ImmT> 365static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete; 366 367SIMD_IWRAPPER_2(shuffle_epi8); 368SIMD_DWRAPPER_2I(shuffle_pd); 369SIMD_WRAPPER_2I(shuffle_ps); 370SIMD_IWRAPPER_2(unpackhi_epi16); 371 372//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps); 373static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b) 374{ 375 return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b))); 376} 377 378SIMD_IWRAPPER_2(unpackhi_epi64); 379SIMD_IWRAPPER_2(unpackhi_epi8); 380SIMD_DWRAPPER_2(unpackhi_pd); 381SIMD_WRAPPER_2(unpackhi_ps); 382SIMD_IWRAPPER_2(unpacklo_epi16); 383SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps); 384SIMD_IWRAPPER_2(unpacklo_epi64); 385SIMD_IWRAPPER_2(unpacklo_epi8); 386SIMD_DWRAPPER_2(unpacklo_pd); 387SIMD_WRAPPER_2(unpacklo_ps); 388 389//----------------------------------------------------------------------- 390// Load / store operations 391//----------------------------------------------------------------------- 392template<ScaleFactor ScaleT> 393static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) 394{ 395 uint32_t *pOffsets = (uint32_t*)&idx; 396 Float vResult; 397 float* pResult = (float*)&vResult; 398 for (uint32_t i = 0; i < SIMD_WIDTH; ++i) 399 { 400 uint32_t offset = pOffsets[i]; 401 offset = offset * static_cast<uint32_t>(ScaleT); 402 pResult[i] = *(float const*)(((uint8_t const*)p + offset)); 403 } 404 405 return vResult; 406} 407 408static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements) 409{ 410 return broadcast_ss(p); 411} 412 413static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory) 414{ 415 return _mm_load_ps(p); 416} 417 418static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p 419{ 420 return _mm_load_si128(&p->v); 421} 422 423static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem) 424{ 425 return _mm_loadu_ps(p); 426} 427 428static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem) 429{ 430 return _mm_lddqu_si128(&p->v); 431} 432 433// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old 434template<ScaleFactor ScaleT> 435static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) 436{ 437 uint32_t *pOffsets = (uint32_t*)&idx; 438 Float vResult = old; 439 float* pResult = (float*)&vResult; 440 DWORD index; 441 uint32_t umask = movemask_ps(mask); 442 while (_BitScanForward(&index, umask)) 443 { 444 umask &= ~(1 << index); 445 uint32_t offset = pOffsets[index]; 446 offset = offset * static_cast<uint32_t>(ScaleT); 447 pResult[index] = *(float const *)(((uint8_t const *)p + offset)); 448 } 449 450 return vResult; 451} 452 453static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src) 454{ 455 _mm_maskstore_ps(p, mask, src); 456} 457 458static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a) 459{ 460 return static_cast<uint32_t>(_mm_movemask_epi8(a)); 461} 462 463static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a) 464{ 465 return static_cast<uint32_t>(_mm_movemask_pd(a)); 466} 467static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a) 468{ 469 return static_cast<uint32_t>(_mm_movemask_ps(a)); 470} 471 472static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value) 473{ 474 return _mm_set1_epi32(i); 475} 476 477static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value) 478{ 479 return _mm_set1_epi8(i); 480} 481 482static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) 483{ 484 return _mm_set1_ps(f); 485} 486 487static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) 488{ 489 return _mm_setzero_ps(); 490} 491 492static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) 493{ 494 return _mm_setzero_si128(); 495} 496 497static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory) 498{ 499 _mm_store_ps(p, a); 500} 501 502static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a 503{ 504 _mm_store_si128(&p->v, a); 505} 506 507static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a (same as store_si but allows for unaligned mem) 508{ 509 _mm_storeu_si128(&p->v, a); 510} 511 512static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache) 513{ 514 _mm_stream_ps(p, a); 515} 516 517static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0) 518{ 519 return _mm_set_ps(in3, in2, in1, in0); 520} 521 522static SIMDINLINE Integer SIMDCALL set_epi32(int in3, int in2, int in1, int in0) 523{ 524 return _mm_set_epi32(in3, in2, in1, in0); 525} 526 527template <int ImmT> 528static SIMDINLINE float SIMDCALL extract_ps(Float a) 529{ 530 int tmp = _mm_extract_ps(a, ImmT); 531 return *reinterpret_cast<float*>(&tmp); 532} 533 534static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) 535{ 536 Integer vec = set1_epi32(mask); 537 const Integer bit = set_epi32( 538 0x08, 0x04, 0x02, 0x01); 539 vec = and_si(vec, bit); 540 vec = cmplt_epi32(setzero_si(), vec); 541 return castsi_ps(vec); 542} 543 544#undef SIMD_WRAPPER_1 545#undef SIMD_WRAPPER_2 546#undef SIMD_DWRAPPER_2 547#undef SIMD_DWRAPPER_2I 548#undef SIMD_WRAPPER_2I 549#undef SIMD_WRAPPER_3 550#undef SIMD_IWRAPPER_1 551#undef SIMD_IWRAPPER_2 552#undef SIMD_IFWRAPPER_2 553#undef SIMD_IWRAPPER_2I 554#undef SIMD_IWRAPPER_1 555#undef SIMD_IWRAPPER_1I 556#undef SIMD_IWRAPPER_1I_ 557#undef SIMD_IWRAPPER_2 558#undef SIMD_IWRAPPER_2_ 559#undef SIMD_IWRAPPER_2I 560 561