1/**************************************************************************** 2* Copyright (C) 2017 Intel Corporation. All Rights Reserved. 3* 4* Permission is hereby granted, free of charge, to any person obtaining a 5* copy of this software and associated documentation files (the "Software"), 6* to deal in the Software without restriction, including without limitation 7* the rights to use, copy, modify, merge, publish, distribute, sublicense, 8* and/or sell copies of the Software, and to permit persons to whom the 9* Software is furnished to do so, subject to the following conditions: 10* 11* The above copyright notice and this permission notice (including the next 12* paragraph) shall be included in all copies or substantial portions of the 13* Software. 14* 15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21* IN THE SOFTWARE. 22****************************************************************************/ 23#if !defined(__SIMD_LIB_AVX_HPP__) 24#error Do not include this file directly, use "simdlib.hpp" instead. 
25#endif 26 27using SIMD128T = SIMD128Impl::AVXImpl; 28 29//============================================================================ 30// SIMD256 AVX (1) implementation 31//============================================================================ 32 33#define SIMD_WRAPPER_1(op) \ 34 static SIMDINLINE Float SIMDCALL op(Float const &a) \ 35 {\ 36 return _mm256_##op(a);\ 37 } 38 39#define SIMD_WRAPPER_2(op) \ 40 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \ 41 {\ 42 return _mm256_##op(a, b);\ 43 } 44 45#define SIMD_DWRAPPER_2(op) \ 46 static SIMDINLINE Double SIMDCALL op(Double const &a, Double const &b) \ 47 {\ 48 return _mm256_##op(a, b);\ 49 } 50 51#define SIMD_WRAPPER_2I(op) \ 52 template<int ImmT>\ 53 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \ 54 {\ 55 return _mm256_##op(a, b, ImmT);\ 56 } 57 58#define SIMD_DWRAPPER_2I(op) \ 59 template<int ImmT>\ 60 static SIMDINLINE Double SIMDCALL op(Double const &a, Double const &b) \ 61 {\ 62 return _mm256_##op(a, b, ImmT);\ 63 } 64 65#define SIMD_WRAPPER_3(op) \ 66 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c) \ 67 {\ 68 return _mm256_##op(a, b, c);\ 69 } 70 71#define SIMD_IWRAPPER_1(op) \ 72 static SIMDINLINE Integer SIMDCALL op(Integer const &a) \ 73 {\ 74 return _mm256_##op(a);\ 75 } 76 77#define SIMD_IWRAPPER_2(op) \ 78 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ 79 {\ 80 return _mm256_##op(a, b);\ 81 } 82 83#define SIMD_IFWRAPPER_2(op, intrin) \ 84 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ 85 {\ 86 return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\ 87 } 88 89#define SIMD_IFWRAPPER_2I(op, intrin) \ 90 template<int ImmT> \ 91 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ 92 {\ 93 return castps_si( intrin(castsi_ps(a), castsi_ps(b), ImmT) );\ 94 } 95 96#define SIMD_IWRAPPER_2I_(op, intrin) \ 97 template<int ImmT>\ 98 static 
SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ 99 {\ 100 return _mm256_##intrin(a, b, ImmT);\ 101 } 102#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) 103 104#define SIMD_IWRAPPER_3(op) \ 105 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c) \ 106 {\ 107 return _mm256_##op(a, b, c);\ 108 } 109 110// emulated integer simd 111#define SIMD_EMU_IWRAPPER_1(op) \ 112 static SIMDINLINE \ 113 Integer SIMDCALL op(Integer const &a)\ 114 {\ 115 return Integer\ 116 {\ 117 SIMD128T::op(a.v4[0]),\ 118 SIMD128T::op(a.v4[1]),\ 119 };\ 120 } 121#define SIMD_EMU_IWRAPPER_1L(op, shift) \ 122 static SIMDINLINE \ 123 Integer SIMDCALL op(Integer const &a)\ 124 {\ 125 return Integer \ 126 {\ 127 SIMD128T::op(a.v4[0]), \ 128 SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \ 129 };\ 130 }\ 131 static SIMDINLINE \ 132 Integer SIMDCALL op(SIMD128Impl::Integer const &a)\ 133 {\ 134 return Integer \ 135 {\ 136 SIMD128T::op(a), \ 137 SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \ 138 };\ 139 } 140 141#define SIMD_EMU_IWRAPPER_1I(op) \ 142 template <int ImmT> static SIMDINLINE \ 143 Integer SIMDCALL op(Integer const &a)\ 144 {\ 145 return Integer\ 146 {\ 147 SIMD128T::template op<ImmT>(a.v4[0]),\ 148 SIMD128T::template op<ImmT>(a.v4[1]),\ 149 };\ 150 } 151 152#define SIMD_EMU_IWRAPPER_2(op) \ 153 static SIMDINLINE \ 154 Integer SIMDCALL op(Integer const &a, Integer const &b)\ 155 {\ 156 return Integer\ 157 {\ 158 SIMD128T::op(a.v4[0], b.v4[0]),\ 159 SIMD128T::op(a.v4[1], b.v4[1]),\ 160 };\ 161 } 162 163#define SIMD_EMU_IWRAPPER_2I(op) \ 164 template <int ImmT> static SIMDINLINE \ 165 Integer SIMDCALL op(Integer const &a, Integer const &b)\ 166 {\ 167 return Integer\ 168 {\ 169 SIMD128T::template op<ImmT>(a.v4[0], b.v[0]),\ 170 SIMD128T::template op<ImmT>(a.v4[1], b.v[1]),\ 171 };\ 172 } 173 174//----------------------------------------------------------------------- 175// Single precision floating point 
// arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps);     // return a + b
SIMD_WRAPPER_2(div_ps);     // return a / b

// return (a * b) + c
// NOTE: emulated with a separate multiply and add (AVX1 has no FMA), so the
// product is rounded before the add; results can differ in the last ulp
// from a true fused multiply-add.
static SIMDINLINE Float SIMDCALL fmadd_ps(Float const &a, Float const &b, Float const &c)
{
    return add_ps(mul_ps(a, b), c);
}

// return (a * b) - c  (emulated; same double-rounding caveat as fmadd_ps)
static SIMDINLINE Float SIMDCALL fmsub_ps(Float const &a, Float const &b, Float const &c)
{
    return sub_ps(mul_ps(a, b), c);
}

SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);     // return a * b
SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a        (hardware approximation)
SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)  (hardware approximation)
SIMD_WRAPPER_2(sub_ps);     // return a - b

// Round every lane with the rounding mode selected at compile time by RMT.
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float const &a)
{
    return _mm256_round_ps(a, static_cast<int>(RMT));
}

// Convenience wrappers: round up / down without raising FP exceptions.
static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_EMU_IWRAPPER_2(add_epi8);  // return a + b (int8)
SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8, saturating)
SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
a : b (uint32) 217SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32) 218 219// return (a * b) & 0xFFFFFFFF 220// 221// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, 222// and store the low 32 bits of the intermediate integers in dst. 223SIMD_EMU_IWRAPPER_2(mullo_epi32); 224SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32) 225SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64) 226SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) 227 228//----------------------------------------------------------------------- 229// Logical operations 230//----------------------------------------------------------------------- 231SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) 232SIMD_EMU_IWRAPPER_2(and_si); // return a & b (int) 233SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) 234SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b (int) 235SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) 236SIMD_EMU_IWRAPPER_2(or_si); // return a | b (int) 237SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) 238SIMD_EMU_IWRAPPER_2(xor_si); // return a ^ b (int) 239 240 241//----------------------------------------------------------------------- 242// Shift operations 243//----------------------------------------------------------------------- 244SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT 245 246static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const &vA, Integer const &vCount) // return a << b (uint32) 247{ 248 int32_t aHi, aLow, countHi, countLow; 249 __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); 250 __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); 251 __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); 252 __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0)); 253 254 aHi = 
_mm_extract_epi32(vAHi, 0); 255 countHi = _mm_extract_epi32(vCountHi, 0); 256 aHi <<= countHi; 257 vAHi = _mm_insert_epi32(vAHi, aHi, 0); 258 259 aLow = _mm_extract_epi32(vALow, 0); 260 countLow = _mm_extract_epi32(vCountLow, 0); 261 aLow <<= countLow; 262 vALow = _mm_insert_epi32(vALow, aLow, 0); 263 264 aHi = _mm_extract_epi32(vAHi, 1); 265 countHi = _mm_extract_epi32(vCountHi, 1); 266 aHi <<= countHi; 267 vAHi = _mm_insert_epi32(vAHi, aHi, 1); 268 269 aLow = _mm_extract_epi32(vALow, 1); 270 countLow = _mm_extract_epi32(vCountLow, 1); 271 aLow <<= countLow; 272 vALow = _mm_insert_epi32(vALow, aLow, 1); 273 274 aHi = _mm_extract_epi32(vAHi, 2); 275 countHi = _mm_extract_epi32(vCountHi, 2); 276 aHi <<= countHi; 277 vAHi = _mm_insert_epi32(vAHi, aHi, 2); 278 279 aLow = _mm_extract_epi32(vALow, 2); 280 countLow = _mm_extract_epi32(vCountLow, 2); 281 aLow <<= countLow; 282 vALow = _mm_insert_epi32(vALow, aLow, 2); 283 284 aHi = _mm_extract_epi32(vAHi, 3); 285 countHi = _mm_extract_epi32(vCountHi, 3); 286 aHi <<= countHi; 287 vAHi = _mm_insert_epi32(vAHi, aHi, 3); 288 289 aLow = _mm_extract_epi32(vALow, 3); 290 countLow = _mm_extract_epi32(vCountLow, 3); 291 aLow <<= countLow; 292 vALow = _mm_insert_epi32(vALow, aLow, 3); 293 294 __m256i ret = _mm256_set1_epi32(0); 295 ret = _mm256_insertf128_si256(ret, vAHi, 1); 296 ret = _mm256_insertf128_si256(ret, vALow, 0); 297 return ret; 298} 299 300SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32) 301SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32) 302SIMD_EMU_IWRAPPER_1I(srli_si); // return a >> (ImmT*8) (uint) 303 304template<int ImmT> // same as srli_si, but with Float cast to int 305static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a) 306{ 307 return castsi_ps(srli_si<ImmT>(castps_si(a))); 308} 309 310static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const &vA, Integer const &vCount) // return a >> b (uint32) 311{ 312 int32_t aHi, aLow, countHi, countLow; 313 __m128i vAHi = 
_mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); 314 __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); 315 __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); 316 __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0)); 317 318 aHi = _mm_extract_epi32(vAHi, 0); 319 countHi = _mm_extract_epi32(vCountHi, 0); 320 aHi >>= countHi; 321 vAHi = _mm_insert_epi32(vAHi, aHi, 0); 322 323 aLow = _mm_extract_epi32(vALow, 0); 324 countLow = _mm_extract_epi32(vCountLow, 0); 325 aLow >>= countLow; 326 vALow = _mm_insert_epi32(vALow, aLow, 0); 327 328 aHi = _mm_extract_epi32(vAHi, 1); 329 countHi = _mm_extract_epi32(vCountHi, 1); 330 aHi >>= countHi; 331 vAHi = _mm_insert_epi32(vAHi, aHi, 1); 332 333 aLow = _mm_extract_epi32(vALow, 1); 334 countLow = _mm_extract_epi32(vCountLow, 1); 335 aLow >>= countLow; 336 vALow = _mm_insert_epi32(vALow, aLow, 1); 337 338 aHi = _mm_extract_epi32(vAHi, 2); 339 countHi = _mm_extract_epi32(vCountHi, 2); 340 aHi >>= countHi; 341 vAHi = _mm_insert_epi32(vAHi, aHi, 2); 342 343 aLow = _mm_extract_epi32(vALow, 2); 344 countLow = _mm_extract_epi32(vCountLow, 2); 345 aLow >>= countLow; 346 vALow = _mm_insert_epi32(vALow, aLow, 2); 347 348 aHi = _mm_extract_epi32(vAHi, 3); 349 countHi = _mm_extract_epi32(vCountHi, 3); 350 aHi >>= countHi; 351 vAHi = _mm_insert_epi32(vAHi, aHi, 3); 352 353 aLow = _mm_extract_epi32(vALow, 3); 354 countLow = _mm_extract_epi32(vCountLow, 3); 355 aLow >>= countLow; 356 vALow = _mm_insert_epi32(vALow, aLow, 3); 357 358 __m256i ret = _mm256_set1_epi32(0); 359 ret = _mm256_insertf128_si256(ret, vAHi, 1); 360 ret = _mm256_insertf128_si256(ret, vALow, 0); 361 return ret; 362} 363 364 365 366//----------------------------------------------------------------------- 367// Conversion operations 368//----------------------------------------------------------------------- 369static SIMDINLINE Float SIMDCALL 
castpd_ps(Double const &a) // return *(Float*)(&a) 370{ 371 return _mm256_castpd_ps(a); 372} 373 374static SIMDINLINE Integer SIMDCALL castps_si(Float const &a) // return *(Integer*)(&a) 375{ 376 return _mm256_castps_si256(a); 377} 378 379static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a) // return *(Double*)(&a) 380{ 381 return _mm256_castsi256_pd(a); 382} 383 384static SIMDINLINE Double SIMDCALL castps_pd(Float const &a) // return *(Double*)(&a) 385{ 386 return _mm256_castps_pd(a); 387} 388 389static SIMDINLINE Integer SIMDCALL castpd_si(Double const &a) // return *(Integer*)(&a) 390{ 391 return _mm256_castpd_si256(a); 392} 393 394static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a) // return *(Float*)(&a) 395{ 396 return _mm256_castsi256_ps(a); 397} 398 399static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a) // return (float)a (int32 --> float) 400{ 401 return _mm256_cvtepi32_ps(a); 402} 403 404SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8); // return (int16)a (uint8 --> int16) 405SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4); // return (int32)a (uint8 --> int32) 406SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a (uint16 --> int32) 407SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a (uint16 --> int64) 408SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a (uint32 --> int64) 409 410static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a) // return (int32)a (float --> int32) 411{ 412 return _mm256_cvtps_epi32(a); 413} 414 415static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a) // return (int32)a (rnd_to_zero(float) --> int32) 416{ 417 return _mm256_cvttps_epi32(a); 418} 419 420//----------------------------------------------------------------------- 421// Comparison operations 422//----------------------------------------------------------------------- 423template<CompareType CmpTypeT> 424static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b 425{ 426 return 
_mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT)); 427} 428static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); } 429static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); } 430static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); } 431static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); } 432static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); } 433static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); } 434 435SIMD_EMU_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) 436SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) 437SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) 438SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) 439SIMD_EMU_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) 440SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) 441SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) 442SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) 443SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32) 444 445static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b) // return all_lanes_zero(a & b) ? 1 : 0 (float) 446{ 447 return 0 != _mm256_testz_ps(a, b); 448} 449 450static SIMDINLINE bool SIMDCALL testz_si(Integer const &a, Integer const &b) // return all_lanes_zero(a & b) ? 1 : 0 (int) 451{ 452 return 0 != _mm256_testz_si256(a, b); 453} 454 455//----------------------------------------------------------------------- 456// Blend / shuffle / permute operations 457//----------------------------------------------------------------------- 458SIMD_WRAPPER_2I(blend_ps); // return ImmT ? 
b : a (float) 459SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a (int32) 460SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) 461 462static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int) 463{ 464 return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask)); 465} 466 467static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int) 468{ 469 return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask))); 470} 471 472static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value) 473{ 474 return _mm256_broadcast_ss(p); 475} 476 477SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 478SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 479SIMD_EMU_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 480SIMD_EMU_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 481 482static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32) 483{ 484 Integer result; 485 486 // Ugly slow implementation 487 uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a); 488 uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz); 489 uint32_t *pResult = reinterpret_cast<uint32_t *>(&result); 490 491 for (uint32_t i = 0; i < SIMD_WIDTH; ++i) 492 { 493 pResult[i] = pA[0xF & pSwiz[i]]; 494 } 495 496 return result; 497} 498 499static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (float) 500{ 501 Float result; 502 503 // Ugly slow implementation 504 float const *pA = reinterpret_cast<float const*>(&a); 505 uint32_t 
const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz); 506 float *pResult = reinterpret_cast<float *>(&result); 507 508 for (uint32_t i = 0; i < SIMD_WIDTH; ++i) 509 { 510 pResult[i] = pA[0xF & pSwiz[i]]; 511 } 512 513 return result; 514} 515 516SIMD_WRAPPER_2I(permute2f128_ps); 517SIMD_DWRAPPER_2I(permute2f128_pd); 518SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256); 519 520 521SIMD_EMU_IWRAPPER_1I(shuffle_epi32); 522 523template<int ImmT> 524static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const &a, Integer const &b) 525{ 526 return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b))); 527} 528SIMD_EMU_IWRAPPER_2(shuffle_epi8); 529SIMD_DWRAPPER_2I(shuffle_pd); 530SIMD_WRAPPER_2I(shuffle_ps); 531SIMD_EMU_IWRAPPER_2(unpackhi_epi16); 532SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps); 533SIMD_EMU_IWRAPPER_2(unpackhi_epi64); 534SIMD_EMU_IWRAPPER_2(unpackhi_epi8); 535SIMD_DWRAPPER_2(unpackhi_pd); 536SIMD_WRAPPER_2(unpackhi_ps); 537SIMD_EMU_IWRAPPER_2(unpacklo_epi16); 538SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps); 539SIMD_EMU_IWRAPPER_2(unpacklo_epi64); 540SIMD_EMU_IWRAPPER_2(unpacklo_epi8); 541SIMD_DWRAPPER_2(unpacklo_pd); 542SIMD_WRAPPER_2(unpacklo_ps); 543 544//----------------------------------------------------------------------- 545// Load / store operations 546//----------------------------------------------------------------------- 547template<ScaleFactor ScaleT> 548static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) 549{ 550 uint32_t *pOffsets = (uint32_t*)&idx; 551 Float vResult; 552 float* pResult = (float*)&vResult; 553 for (uint32_t i = 0; i < SIMD_WIDTH; ++i) 554 { 555 uint32_t offset = pOffsets[i]; 556 offset = offset * static_cast<uint32_t>(ScaleT); 557 pResult[i] = *(float const*)(((uint8_t const*)p + offset)); 558 } 559 560 return vResult; 561} 562 563static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value 
to all elements) 564{ 565 return broadcast_ss(p); 566} 567 568static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory) 569{ 570 return _mm256_load_ps(p); 571} 572 573static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p 574{ 575 return _mm256_load_si256(&p->v); 576} 577 578static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem) 579{ 580 return _mm256_loadu_ps(p); 581} 582 583static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem) 584{ 585 return _mm256_lddqu_si256(&p->v); 586} 587 588// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old 589template<ScaleFactor ScaleT> 590static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask) 591{ 592 uint32_t *pOffsets = (uint32_t*)&idx; 593 Float vResult = old; 594 float* pResult = (float*)&vResult; 595 DWORD index; 596 uint32_t umask = movemask_ps(mask); 597 while (_BitScanForward(&index, umask)) 598 { 599 umask &= ~(1 << index); 600 uint32_t offset = pOffsets[index]; 601 offset = offset * static_cast<uint32_t>(ScaleT); 602 pResult[index] = *(float const *)(((uint8_t const *)p + offset)); 603 } 604 605 return vResult; 606} 607 608static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src) 609{ 610 _mm256_maskstore_ps(p, mask, src); 611} 612 613static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const &a) 614{ 615 return SIMD128T::movemask_epi8(a.v4[0]) | 616 (SIMD128T::movemask_epi8(a.v4[1]) << 16); 617} 618 619static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a) 620{ 621 return static_cast<uint32_t>(_mm256_movemask_pd(a)); 622} 623static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a) 624{ 625 return static_cast<uint32_t>(_mm256_movemask_ps(a)); 626} 627 628static 
SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value) 629{ 630 return _mm256_set1_epi32(i); 631} 632 633static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value) 634{ 635 return _mm256_set1_epi8(i); 636} 637 638static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) 639{ 640 return _mm256_set1_ps(f); 641} 642 643static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) 644{ 645 return _mm256_setzero_ps(); 646} 647 648static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) 649{ 650 return _mm256_setzero_si256(); 651} 652 653static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a) // *p = a (stores all elements contiguously in memory) 654{ 655 _mm256_store_ps(p, a); 656} 657 658static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a) // *p = a 659{ 660 _mm256_store_si256(&p->v, a); 661} 662 663static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a) // *p = a (same as store_ps, but doesn't keep memory in cache) 664{ 665 _mm256_stream_ps(p, a); 666} 667 668//======================================================================= 669// Legacy interface (available only in SIMD256 width) 670//======================================================================= 671 672static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const *p) 673{ 674 return _mm256_broadcast_ps(&p->v); 675} 676 677template<int ImmT> 678static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const &a) 679{ 680 return _mm256_extractf128_pd(a, ImmT); 681} 682 683template<int ImmT> 684static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float const &a) 685{ 686 return _mm256_extractf128_ps(a, ImmT); 687} 688 689template<int ImmT> 690static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const &a) 691{ 692 return _mm256_extractf128_si256(a, ImmT); 693} 694 695template<int ImmT> 696static 
SIMDINLINE Double SIMDCALL insertf128_pd(Double const &a, SIMD128Impl::Double const &b) 697{ 698 return _mm256_insertf128_pd(a, b, ImmT); 699} 700 701template<int ImmT> 702static SIMDINLINE Float SIMDCALL insertf128_ps(Float const &a, SIMD128Impl::Float const &b) 703{ 704 return _mm256_insertf128_ps(a, b, ImmT); 705} 706 707template<int ImmT> 708static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const &a, SIMD128Impl::Integer const &b) 709{ 710 return _mm256_insertf128_si256(a, b, ImmT); 711} 712 713#ifndef _mm256_set_m128i 714#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \ 715 _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1) 716#endif 717 718#ifndef _mm256_loadu2_m128i 719#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \ 720 /* SIMD128Impl::Integer const* */ loaddr) \ 721 _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr)) 722#endif 723 724static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, SIMD128Impl::Integer const* plo) 725{ 726 return _mm256_loadu2_m128i(&phi->v, &plo->v); 727} 728 729static SIMDINLINE Integer SIMDCALL set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) 730{ 731 return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0); 732} 733 734static SIMDINLINE Float SIMDCALL set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) 735{ 736 return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0); 737} 738 739static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer const &src) 740{ 741 _mm256_storeu2_m128i(&phi->v, &plo->v, src); 742} 743 744static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) 745{ 746 Integer vec = set1_epi32(mask); 747 const Integer bit = set_epi32( 748 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); 749 vec = and_si(vec, bit); 750 vec = cmplt_epi32(setzero_si(), vec); 751 return castsi_ps(vec); 752} 753 754#undef 
SIMD_WRAPPER_1 755#undef SIMD_WRAPPER_2 756#undef SIMD_DWRAPPER_2 757#undef SIMD_DWRAPPER_2I 758#undef SIMD_WRAPPER_2I 759#undef SIMD_WRAPPER_3 760#undef SIMD_IWRAPPER_1 761#undef SIMD_IWRAPPER_2 762#undef SIMD_IFWRAPPER_2 763#undef SIMD_IFWRAPPER_2I 764#undef SIMD_IWRAPPER_2I 765#undef SIMD_IWRAPPER_2I_ 766#undef SIMD_IWRAPPER_2_ 767#undef SIMD_IWRAPPER_3 768#undef SIMD_EMU_IWRAPPER_1 769#undef SIMD_EMU_IWRAPPER_1I 770#undef SIMD_EMU_IWRAPPER_2 771#undef SIMD_EMU_IWRAPPER_2I 772