/****************************************************************************
 * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 ****************************************************************************/
// This file is a template fragment: it is textually included inside a SIMD
// implementation struct by simdlib.hpp and must never be compiled on its own.
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
25#endif 26 27using SIMD128T = SIMD128Impl::AVXImpl; 28 29//============================================================================ 30// SIMD256 AVX (1) implementation 31//============================================================================ 32 33#define SIMD_WRAPPER_1(op) \ 34 static SIMDINLINE Float SIMDCALL op(Float const& a) { return _mm256_##op(a); } 35 36#define SIMD_WRAPPER_2(op) \ 37 static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \ 38 { \ 39 return _mm256_##op(a, b); \ 40 } 41 42#define SIMD_DWRAPPER_2(op) \ 43 static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \ 44 { \ 45 return _mm256_##op(a, b); \ 46 } 47 48#define SIMD_WRAPPER_2I(op) \ 49 template <int ImmT> \ 50 static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \ 51 { \ 52 return _mm256_##op(a, b, ImmT); \ 53 } 54 55#define SIMD_DWRAPPER_2I(op) \ 56 template <int ImmT> \ 57 static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \ 58 { \ 59 return _mm256_##op(a, b, ImmT); \ 60 } 61 62#define SIMD_WRAPPER_3(op) \ 63 static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \ 64 { \ 65 return _mm256_##op(a, b, c); \ 66 } 67 68#define SIMD_IWRAPPER_1(op) \ 69 static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); } 70 71#define SIMD_IWRAPPER_2(op) \ 72 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ 73 { \ 74 return _mm256_##op(a, b); \ 75 } 76 77#define SIMD_IFWRAPPER_2(op, intrin) \ 78 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ 79 { \ 80 return castps_si(intrin(castsi_ps(a), castsi_ps(b))); \ 81 } 82 83#define SIMD_IFWRAPPER_2I(op, intrin) \ 84 template <int ImmT> \ 85 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ 86 { \ 87 return castps_si(intrin(castsi_ps(a), castsi_ps(b), ImmT)); \ 88 } 89 90#define SIMD_IWRAPPER_2I_(op, intrin) \ 91 template <int ImmT> \ 92 static SIMDINLINE 
Integer SIMDCALL op(Integer const& a, Integer const& b) \ 93 { \ 94 return _mm256_##intrin(a, b, ImmT); \ 95 } 96#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) 97 98#define SIMD_IWRAPPER_3(op) \ 99 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \ 100 { \ 101 return _mm256_##op(a, b, c); \ 102 } 103 104// emulated integer simd 105#define SIMD_EMU_IWRAPPER_1(op) \ 106 static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ 107 { \ 108 return Integer{ \ 109 SIMD128T::op(a.v4[0]), \ 110 SIMD128T::op(a.v4[1]), \ 111 }; \ 112 } 113#define SIMD_EMU_IWRAPPER_1L(op, shift) \ 114 static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ 115 { \ 116 return Integer{ \ 117 SIMD128T::op(a.v4[0]), \ 118 SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \ 119 }; \ 120 } \ 121 static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer const& a) \ 122 { \ 123 return Integer{ \ 124 SIMD128T::op(a), \ 125 SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \ 126 }; \ 127 } 128 129#define SIMD_EMU_IWRAPPER_1I(op) \ 130 template <int ImmT> \ 131 static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ 132 { \ 133 return Integer{ \ 134 SIMD128T::template op<ImmT>(a.v4[0]), \ 135 SIMD128T::template op<ImmT>(a.v4[1]), \ 136 }; \ 137 } 138 139#define SIMD_EMU_IWRAPPER_2(op) \ 140 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ 141 { \ 142 return Integer{ \ 143 SIMD128T::op(a.v4[0], b.v4[0]), \ 144 SIMD128T::op(a.v4[1], b.v4[1]), \ 145 }; \ 146 } 147 148#define SIMD_EMU_IWRAPPER_2I(op) \ 149 template <int ImmT> \ 150 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ 151 { \ 152 return Integer{ \ 153 SIMD128T::template op<ImmT>(a.v4[0], b.v[0]), \ 154 SIMD128T::template op<ImmT>(a.v4[1], b.v[1]), \ 155 }; \ 156 } 157 158//----------------------------------------------------------------------- 159// Single precision floating point arithmetic operations 
160//----------------------------------------------------------------------- 161SIMD_WRAPPER_2(add_ps); // return a + b 162SIMD_WRAPPER_2(div_ps); // return a / b 163 164static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a, 165 Float const& b, 166 Float const& c) // return (a * b) + c 167{ 168 return add_ps(mul_ps(a, b), c); 169} 170 171static SIMDINLINE Float SIMDCALL fmsub_ps(Float const& a, 172 Float const& b, 173 Float const& c) // return (a * b) - c 174{ 175 return sub_ps(mul_ps(a, b), c); 176} 177 178SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b 179SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b 180SIMD_WRAPPER_2(mul_ps); // return a * b 181SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a 182SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a) 183SIMD_WRAPPER_2(sub_ps); // return a - b 184 185template <RoundMode RMT> 186static SIMDINLINE Float SIMDCALL round_ps(Float const& a) 187{ 188 return _mm256_round_ps(a, static_cast<int>(RMT)); 189} 190 191static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a) 192{ 193 return round_ps<RoundMode::CEIL_NOEXC>(a); 194} 195static SIMDINLINE Float SIMDCALL floor_ps(Float const& a) 196{ 197 return round_ps<RoundMode::FLOOR_NOEXC>(a); 198} 199 200//----------------------------------------------------------------------- 201// Integer (various width) arithmetic operations 202//----------------------------------------------------------------------- 203SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) 204SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32) 205SIMD_EMU_IWRAPPER_2(add_epi8); // return a + b (int8) 206SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 207SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) 208SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) 209SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) 210SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? 
a : b (uint32) 211SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32) 212 213// return (a * b) & 0xFFFFFFFF 214// 215// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, 216// and store the low 32 bits of the intermediate integers in dst. 217SIMD_EMU_IWRAPPER_2(mullo_epi32); 218SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32) 219SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64) 220SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) 221 222//----------------------------------------------------------------------- 223// Logical operations 224//----------------------------------------------------------------------- 225SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) 226SIMD_IFWRAPPER_2(and_si, _mm256_and_ps); // return a & b (int) 227SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) 228SIMD_IFWRAPPER_2(andnot_si, _mm256_andnot_ps); // return (~a) & b (int) 229SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) 230SIMD_IFWRAPPER_2(or_si, _mm256_or_ps); // return a | b (int) 231SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) 232SIMD_IFWRAPPER_2(xor_si, _mm256_xor_ps); // return a ^ b (int) 233 234//----------------------------------------------------------------------- 235// Shift operations 236//----------------------------------------------------------------------- 237SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT 238 239static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const& vA, 240 Integer const& vCount) // return a << b (uint32) 241{ 242 int32_t aHi, aLow, countHi, countLow; 243 __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); 244 __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); 245 __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); 246 __m128i vCountLow = 
_mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0)); 247 248 aHi = _mm_extract_epi32(vAHi, 0); 249 countHi = _mm_extract_epi32(vCountHi, 0); 250 aHi <<= countHi; 251 vAHi = _mm_insert_epi32(vAHi, aHi, 0); 252 253 aLow = _mm_extract_epi32(vALow, 0); 254 countLow = _mm_extract_epi32(vCountLow, 0); 255 aLow <<= countLow; 256 vALow = _mm_insert_epi32(vALow, aLow, 0); 257 258 aHi = _mm_extract_epi32(vAHi, 1); 259 countHi = _mm_extract_epi32(vCountHi, 1); 260 aHi <<= countHi; 261 vAHi = _mm_insert_epi32(vAHi, aHi, 1); 262 263 aLow = _mm_extract_epi32(vALow, 1); 264 countLow = _mm_extract_epi32(vCountLow, 1); 265 aLow <<= countLow; 266 vALow = _mm_insert_epi32(vALow, aLow, 1); 267 268 aHi = _mm_extract_epi32(vAHi, 2); 269 countHi = _mm_extract_epi32(vCountHi, 2); 270 aHi <<= countHi; 271 vAHi = _mm_insert_epi32(vAHi, aHi, 2); 272 273 aLow = _mm_extract_epi32(vALow, 2); 274 countLow = _mm_extract_epi32(vCountLow, 2); 275 aLow <<= countLow; 276 vALow = _mm_insert_epi32(vALow, aLow, 2); 277 278 aHi = _mm_extract_epi32(vAHi, 3); 279 countHi = _mm_extract_epi32(vCountHi, 3); 280 aHi <<= countHi; 281 vAHi = _mm_insert_epi32(vAHi, aHi, 3); 282 283 aLow = _mm_extract_epi32(vALow, 3); 284 countLow = _mm_extract_epi32(vCountLow, 3); 285 aLow <<= countLow; 286 vALow = _mm_insert_epi32(vALow, aLow, 3); 287 288 __m256i ret = _mm256_set1_epi32(0); 289 ret = _mm256_insertf128_si256(ret, vAHi, 1); 290 ret = _mm256_insertf128_si256(ret, vALow, 0); 291 return ret; 292} 293 294SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32) 295SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32) 296SIMD_EMU_IWRAPPER_1I(srli_si); // return a >> (ImmT*8) (uint) 297 298template <int ImmT> // same as srli_si, but with Float cast to int 299static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a) 300{ 301 return castsi_ps(srli_si<ImmT>(castps_si(a))); 302} 303 304static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const& vA, 305 Integer const& vCount) // return a >> 
b (uint32) 306{ 307 int32_t aHi, aLow, countHi, countLow; 308 __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); 309 __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); 310 __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); 311 __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0)); 312 313 aHi = _mm_extract_epi32(vAHi, 0); 314 countHi = _mm_extract_epi32(vCountHi, 0); 315 aHi >>= countHi; 316 vAHi = _mm_insert_epi32(vAHi, aHi, 0); 317 318 aLow = _mm_extract_epi32(vALow, 0); 319 countLow = _mm_extract_epi32(vCountLow, 0); 320 aLow >>= countLow; 321 vALow = _mm_insert_epi32(vALow, aLow, 0); 322 323 aHi = _mm_extract_epi32(vAHi, 1); 324 countHi = _mm_extract_epi32(vCountHi, 1); 325 aHi >>= countHi; 326 vAHi = _mm_insert_epi32(vAHi, aHi, 1); 327 328 aLow = _mm_extract_epi32(vALow, 1); 329 countLow = _mm_extract_epi32(vCountLow, 1); 330 aLow >>= countLow; 331 vALow = _mm_insert_epi32(vALow, aLow, 1); 332 333 aHi = _mm_extract_epi32(vAHi, 2); 334 countHi = _mm_extract_epi32(vCountHi, 2); 335 aHi >>= countHi; 336 vAHi = _mm_insert_epi32(vAHi, aHi, 2); 337 338 aLow = _mm_extract_epi32(vALow, 2); 339 countLow = _mm_extract_epi32(vCountLow, 2); 340 aLow >>= countLow; 341 vALow = _mm_insert_epi32(vALow, aLow, 2); 342 343 aHi = _mm_extract_epi32(vAHi, 3); 344 countHi = _mm_extract_epi32(vCountHi, 3); 345 aHi >>= countHi; 346 vAHi = _mm_insert_epi32(vAHi, aHi, 3); 347 348 aLow = _mm_extract_epi32(vALow, 3); 349 countLow = _mm_extract_epi32(vCountLow, 3); 350 aLow >>= countLow; 351 vALow = _mm_insert_epi32(vALow, aLow, 3); 352 353 __m256i ret = _mm256_set1_epi32(0); 354 ret = _mm256_insertf128_si256(ret, vAHi, 1); 355 ret = _mm256_insertf128_si256(ret, vALow, 0); 356 return ret; 357} 358 359//----------------------------------------------------------------------- 360// Conversion operations 
361//----------------------------------------------------------------------- 362static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a) 363{ 364 return _mm256_castpd_ps(a); 365} 366 367static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a) 368{ 369 return _mm256_castps_si256(a); 370} 371 372static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a) 373{ 374 return _mm256_castsi256_pd(a); 375} 376 377static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a) 378{ 379 return _mm256_castps_pd(a); 380} 381 382static SIMDINLINE Integer SIMDCALL castpd_si(Double const& a) // return *(Integer*)(&a) 383{ 384 return _mm256_castpd_si256(a); 385} 386 387static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a) 388{ 389 return _mm256_castsi256_ps(a); 390} 391 392static SIMDINLINE Float SIMDCALL 393 cvtepi32_ps(Integer const& a) // return (float)a (int32 --> float) 394{ 395 return _mm256_cvtepi32_ps(a); 396} 397 398SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8); // return (int16)a (uint8 --> int16) 399SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4); // return (int32)a (uint8 --> int32) 400SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a (uint16 --> int32) 401SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a (uint16 --> int64) 402SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a (uint32 --> int64) 403 404static SIMDINLINE Integer SIMDCALL 405 cvtps_epi32(Float const& a) // return (int32)a (float --> int32) 406{ 407 return _mm256_cvtps_epi32(a); 408} 409 410static SIMDINLINE Integer SIMDCALL 411 cvttps_epi32(Float const& a) // return (int32)a (rnd_to_zero(float) --> int32) 412{ 413 return _mm256_cvttps_epi32(a); 414} 415 416//----------------------------------------------------------------------- 417// Comparison operations 418//----------------------------------------------------------------------- 419template <CompareType 
CmpTypeT> 420static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b 421{ 422 return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT)); 423} 424static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b) 425{ 426 return cmp_ps<CompareType::LT_OQ>(a, b); 427} 428static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b) 429{ 430 return cmp_ps<CompareType::GT_OQ>(a, b); 431} 432static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b) 433{ 434 return cmp_ps<CompareType::NEQ_OQ>(a, b); 435} 436static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b) 437{ 438 return cmp_ps<CompareType::EQ_OQ>(a, b); 439} 440static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b) 441{ 442 return cmp_ps<CompareType::GE_OQ>(a, b); 443} 444static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b) 445{ 446 return cmp_ps<CompareType::LE_OQ>(a, b); 447} 448 449SIMD_EMU_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) 450SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) 451SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) 452SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) 453SIMD_EMU_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) 454SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) 455SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) 456SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) 457SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32) 458 459static SIMDINLINE bool SIMDCALL 460 testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float) 461{ 462 return 0 != _mm256_testz_ps(a, b); 463} 464 465static SIMDINLINE bool SIMDCALL 466 testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 
1 : 0 (int) 467{ 468 return 0 != _mm256_testz_si256(a, b); 469} 470 471//----------------------------------------------------------------------- 472// Blend / shuffle / permute operations 473//----------------------------------------------------------------------- 474SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) 475SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a (int32) 476SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) 477 478static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a, 479 Integer const& b, 480 Float const& mask) // return mask ? b : a (int) 481{ 482 return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask)); 483} 484 485static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a, 486 Integer const& b, 487 Integer const& mask) // return mask ? b : a (int) 488{ 489 return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask))); 490} 491 492static SIMDINLINE Float SIMDCALL 493 broadcast_ss(float const* p) // return *p (all elements in vector get same value) 494{ 495 return _mm256_broadcast_ss(p); 496} 497 498SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 499SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 500SIMD_EMU_IWRAPPER_2( 501 packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 502SIMD_EMU_IWRAPPER_2( 503 packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 504 505template <int ImmT> 506static SIMDINLINE Float SIMDCALL permute_ps(Float const& a) 507{ 508 return _mm256_permute_ps(a, ImmT); 509} 510 511static SIMDINLINE Integer SIMDCALL permute_epi32( 512 Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32) 513{ 514 Integer result; 515 516 // Ugly slow implementation 517 uint32_t const* pA = reinterpret_cast<uint32_t const*>(&a); 518 uint32_t const* pSwiz = 
reinterpret_cast<uint32_t const*>(&swiz); 519 uint32_t* pResult = reinterpret_cast<uint32_t*>(&result); 520 521 for (uint32_t i = 0; i < SIMD_WIDTH; ++i) 522 { 523 pResult[i] = pA[0xF & pSwiz[i]]; 524 } 525 526 return result; 527} 528 529static SIMDINLINE Float SIMDCALL 530 permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float) 531{ 532 Float result; 533 534 // Ugly slow implementation 535 float const* pA = reinterpret_cast<float const*>(&a); 536 uint32_t const* pSwiz = reinterpret_cast<uint32_t const*>(&swiz); 537 float* pResult = reinterpret_cast<float*>(&result); 538 539 for (uint32_t i = 0; i < SIMD_WIDTH; ++i) 540 { 541 pResult[i] = pA[0xF & pSwiz[i]]; 542 } 543 544 return result; 545} 546 547SIMD_WRAPPER_2I(permute2f128_ps); 548SIMD_DWRAPPER_2I(permute2f128_pd); 549SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256); 550 551SIMD_EMU_IWRAPPER_1I(shuffle_epi32); 552 553template <int ImmT> 554static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b) 555{ 556 return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b))); 557} 558SIMD_EMU_IWRAPPER_2(shuffle_epi8); 559SIMD_DWRAPPER_2I(shuffle_pd); 560SIMD_WRAPPER_2I(shuffle_ps); 561SIMD_EMU_IWRAPPER_2(unpackhi_epi16); 562SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps); 563SIMD_EMU_IWRAPPER_2(unpackhi_epi64); 564SIMD_EMU_IWRAPPER_2(unpackhi_epi8); 565SIMD_DWRAPPER_2(unpackhi_pd); 566SIMD_WRAPPER_2(unpackhi_ps); 567SIMD_EMU_IWRAPPER_2(unpacklo_epi16); 568SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps); 569SIMD_EMU_IWRAPPER_2(unpacklo_epi64); 570SIMD_EMU_IWRAPPER_2(unpacklo_epi8); 571SIMD_DWRAPPER_2(unpacklo_pd); 572SIMD_WRAPPER_2(unpacklo_ps); 573 574//----------------------------------------------------------------------- 575// Load / store operations 576//----------------------------------------------------------------------- 577template <ScaleFactor ScaleT = ScaleFactor::SF_1> 578static SIMDINLINE Float SIMDCALL 579 
i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) 580{ 581 uint32_t* pOffsets = (uint32_t*)&idx; 582 Float vResult; 583 float* pResult = (float*)&vResult; 584 for (uint32_t i = 0; i < SIMD_WIDTH; ++i) 585 { 586 uint32_t offset = pOffsets[i]; 587 offset = offset * static_cast<uint32_t>(ScaleT); 588 pResult[i] = *(float const*)(((uint8_t const*)p + offset)); 589 } 590 591 return vResult; 592} 593 594template <ScaleFactor ScaleT = ScaleFactor::SF_1> 595static SIMDINLINE Float SIMDCALL 596sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) 597{ 598 return i32gather_ps<ScaleT>(p, idx); 599} 600 601static SIMDINLINE Float SIMDCALL 602 load1_ps(float const* p) // return *p (broadcast 1 value to all elements) 603{ 604 return broadcast_ss(p); 605} 606 607static SIMDINLINE Float SIMDCALL 608 load_ps(float const* p) // return *p (loads SIMD width elements from memory) 609{ 610 return _mm256_load_ps(p); 611} 612 613static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p 614{ 615 return _mm256_load_si256(&p->v); 616} 617 618static SIMDINLINE Float SIMDCALL 619 loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem) 620{ 621 return _mm256_loadu_ps(p); 622} 623 624static SIMDINLINE Integer SIMDCALL 625 loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem) 626{ 627 return _mm256_lddqu_si256(&p->v); 628} 629 630// for each element: (mask & (1 << 31)) ? 
(i32gather_ps<ScaleT>(p, idx), mask = 0) : old 631template <ScaleFactor ScaleT = ScaleFactor::SF_1> 632static SIMDINLINE Float SIMDCALL 633 mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) 634{ 635 uint32_t* pOffsets = (uint32_t*)&idx; 636 Float vResult = old; 637 float* pResult = (float*)&vResult; 638 unsigned long index = 0; 639 uint32_t umask = movemask_ps(mask); 640 while (_BitScanForward(&index, umask)) 641 { 642 umask &= ~(1 << index); 643 uint32_t offset = pOffsets[index]; 644 offset = offset * static_cast<uint32_t>(ScaleT); 645 pResult[index] = *(float const*)(((uint8_t const*)p + offset)); 646 } 647 648 return vResult; 649} 650 651template <ScaleFactor ScaleT = ScaleFactor::SF_1> 652static SIMDINLINE Float SIMDCALL 653sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) 654{ 655 return mask_i32gather_ps<ScaleT>(old, p, idx, mask); 656} 657 658static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src) 659{ 660 _mm256_maskstore_ps(p, mask, src); 661} 662 663static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a) 664{ 665 return SIMD128T::movemask_epi8(a.v4[0]) | (SIMD128T::movemask_epi8(a.v4[1]) << 16); 666} 667 668static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a) 669{ 670 return static_cast<uint32_t>(_mm256_movemask_pd(a)); 671} 672static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a) 673{ 674 return static_cast<uint32_t>(_mm256_movemask_ps(a)); 675} 676 677static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value) 678{ 679 return _mm256_set1_epi32(i); 680} 681 682static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value) 683{ 684 return _mm256_set1_epi8(i); 685} 686 687static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) 688{ 689 return _mm256_set1_ps(f); 690} 691 692static SIMDINLINE Float 
SIMDCALL setzero_ps() // return 0 (float) 693{ 694 return _mm256_setzero_ps(); 695} 696 697static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) 698{ 699 return _mm256_setzero_si256(); 700} 701 702static SIMDINLINE void SIMDCALL 703 store_ps(float* p, Float const& a) // *p = a (stores all elements contiguously in memory) 704{ 705 _mm256_store_ps(p, a); 706} 707 708static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a 709{ 710 _mm256_store_si256(&p->v, a); 711} 712 713static SIMDINLINE void SIMDCALL 714 stream_ps(float* p, Float const& a) // *p = a (same as store_ps, but doesn't keep memory in cache) 715{ 716 _mm256_stream_ps(p, a); 717} 718 719//======================================================================= 720// Legacy interface (available only in SIMD256 width) 721//======================================================================= 722 723static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const* p) 724{ 725 return _mm256_broadcast_ps(&p->v); 726} 727 728template <int ImmT> 729static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const& a) 730{ 731 return _mm256_extractf128_pd(a, ImmT); 732} 733 734template <int ImmT> 735static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float const& a) 736{ 737 return _mm256_extractf128_ps(a, ImmT); 738} 739 740template <int ImmT> 741static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const& a) 742{ 743 return _mm256_extractf128_si256(a, ImmT); 744} 745 746template <int ImmT> 747static SIMDINLINE Double SIMDCALL insertf128_pd(Double const& a, SIMD128Impl::Double const& b) 748{ 749 return _mm256_insertf128_pd(a, b, ImmT); 750} 751 752template <int ImmT> 753static SIMDINLINE Float SIMDCALL insertf128_ps(Float const& a, SIMD128Impl::Float const& b) 754{ 755 return _mm256_insertf128_ps(a, b, ImmT); 756} 757 758template <int ImmT> 759static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const& a, SIMD128Impl::Integer 
const& b) 760{ 761 return _mm256_insertf128_si256(a, b, ImmT); 762} 763 764#ifndef _mm256_set_m128i 765#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \ 766 _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1) 767#endif 768 769#ifndef _mm256_loadu2_m128i 770#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \ 771 /* SIMD128Impl::Integer const* */ loaddr) \ 772 _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr)) 773#endif 774 775static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, 776 SIMD128Impl::Integer const* plo) 777{ 778 return _mm256_loadu2_m128i(&phi->v, &plo->v); 779} 780 781static SIMDINLINE Integer SIMDCALL 782 set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) 783{ 784 return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0); 785} 786 787static SIMDINLINE Float SIMDCALL 788 set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) 789{ 790 return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0); 791} 792 793static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer* phi, 794 SIMD128Impl::Integer* plo, 795 Integer const& src) 796{ 797 _mm256_storeu2_m128i(&phi->v, &plo->v, src); 798} 799 800static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) 801{ 802 Integer vec = set1_epi32(mask); 803 const Integer bit = set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); 804 vec = and_si(vec, bit); 805 vec = cmplt_epi32(setzero_si(), vec); 806 return castsi_ps(vec); 807} 808 809#undef SIMD_WRAPPER_1 810#undef SIMD_WRAPPER_2 811#undef SIMD_DWRAPPER_2 812#undef SIMD_DWRAPPER_2I 813#undef SIMD_WRAPPER_2I 814#undef SIMD_WRAPPER_3 815#undef SIMD_IWRAPPER_1 816#undef SIMD_IWRAPPER_2 817#undef SIMD_IFWRAPPER_2 818#undef SIMD_IFWRAPPER_2I 819#undef SIMD_IWRAPPER_2I 820#undef SIMD_IWRAPPER_2I_ 821#undef SIMD_IWRAPPER_2_ 822#undef SIMD_IWRAPPER_3 823#undef SIMD_EMU_IWRAPPER_1 824#undef SIMD_EMU_IWRAPPER_1I 
825#undef SIMD_EMU_IWRAPPER_2 826#undef SIMD_EMU_IWRAPPER_2I 827