1/**************************************************************************** 2* Copyright (C) 2017 Intel Corporation. All Rights Reserved. 3* 4* Permission is hereby granted, free of charge, to any person obtaining a 5* copy of this software and associated documentation files (the "Software"), 6* to deal in the Software without restriction, including without limitation 7* the rights to use, copy, modify, merge, publish, distribute, sublicense, 8* and/or sell copies of the Software, and to permit persons to whom the 9* Software is furnished to do so, subject to the following conditions: 10* 11* The above copyright notice and this permission notice (including the next 12* paragraph) shall be included in all copies or substantial portions of the 13* Software. 14* 15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21* IN THE SOFTWARE. 22****************************************************************************/ 23#if !defined(__SIMD_LIB_AVX_HPP__) 24#error Do not include this file directly, use "simdlib.hpp" instead. 25#endif 26 27//============================================================================ 28// SIMD16 AVX (1) implementation 29//============================================================================ 30 31static const int TARGET_SIMD_WIDTH = 8; 32using SIMD128T = SIMD128Impl::AVXImpl; 33 34#define SIMD_WRAPPER_1(op) \ 35 static SIMDINLINE Float SIMDCALL op(Float const &a) \ 36 {\ 37 return Float\ 38 {\ 39 SIMD256T::op(a.v8[0]),\ 40 SIMD256T::op(a.v8[1]),\ 41 };\ 42 } 43 44#define SIMD_WRAPPER_2(op) \ 45 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \ 46 {\ 47 return Float\ 48 {\ 49 SIMD256T::op(a.v8[0], b.v8[0]),\ 50 SIMD256T::op(a.v8[1], b.v8[1]),\ 51 };\ 52 } 53 54#define SIMD_WRAPPER_2I(op) \ 55 template<int ImmT>\ 56 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \ 57 {\ 58 return Float\ 59 {\ 60 SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\ 61 SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\ 62 };\ 63 } 64 65#define SIMD_WRAPPER_2I_1(op) \ 66 template<int ImmT>\ 67 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \ 68 {\ 69 return Float\ 70 {\ 71 SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\ 72 SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\ 73 };\ 74 } 75 76#define SIMD_WRAPPER_3(op) \ 77 static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c) \ 78 {\ 79 return Float\ 80 {\ 81 SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\ 82 SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\ 83 };\ 84 } 85 86#define SIMD_IWRAPPER_1(op) \ 87 static SIMDINLINE Integer SIMDCALL op(Integer const &a) \ 88 {\ 89 return Integer\ 90 {\ 91 SIMD256T::op(a.v8[0]),\ 92 SIMD256T::op(a.v8[1]),\ 93 };\ 94 } 95 96#define SIMD_IWRAPPER_2(op) \ 97 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ 98 {\ 99 return Integer\ 100 {\ 101 SIMD256T::op(a.v8[0], b.v8[0]),\ 102 SIMD256T::op(a.v8[1], b.v8[1]),\ 103 };\ 104 } 105 106#define SIMD_IWRAPPER_2I(op) \ 107 template<int ImmT>\ 108 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ 109 {\ 110 return Integer\ 111 {\ 112 SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\ 113 SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\ 114 };\ 115 } 116 117#define SIMD_IWRAPPER_2I_1(op) \ 118 template<int ImmT>\ 119 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ 120 {\ 121 return Integer\ 122 {\ 123 SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\ 124 SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\ 125 };\ 126 } 127 128#define SIMD_IWRAPPER_2I_2(op) \ 129 template<int ImmT>\ 130 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \ 131 {\ 132 return Integer\ 133 {\ 134 SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\ 135 SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\ 136 };\ 137 } 138 139#define SIMD_IWRAPPER_3(op) \ 140 static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c) \ 141 {\ 142 return Integer\ 143 {\ 144 SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\ 145 SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\ 146 };\ 147 } 148 149//----------------------------------------------------------------------- 150// Single precision floating point arithmetic operations 151//----------------------------------------------------------------------- 152SIMD_WRAPPER_2(add_ps); // return a + b 153SIMD_WRAPPER_2(div_ps); // return a / b 154SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c 155SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c 156SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b 157SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b 158SIMD_WRAPPER_2(mul_ps); // return a * b 159SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a 160SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a) 161SIMD_WRAPPER_2(sub_ps); // return a - b 162 163template <RoundMode RMT> 164static SIMDINLINE Float SIMDCALL round_ps(Float const &a) 165{ 166 return Float 167 { 168 SIMD256T::template round_ps<RMT>(a.v8[0]), 169 SIMD256T::template round_ps<RMT>(a.v8[1]), 170 }; 171} 172 173static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); } 174static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); } 175 176//----------------------------------------------------------------------- 177// Integer (various width) arithmetic operations 178//----------------------------------------------------------------------- 179SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) 180SIMD_IWRAPPER_2(add_epi32); // return a + b (int32) 181SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) 182SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 183SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) 184SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) 185SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) 186SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32) 187SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32) 188 189// return (a * b) & 0xFFFFFFFF 190// 191// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, 192// and store the low 32 bits of the intermediate integers in dst. 193SIMD_IWRAPPER_2(mullo_epi32); 194SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32) 195SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64) 196SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) 197 198//----------------------------------------------------------------------- 199// Logical operations 200//----------------------------------------------------------------------- 201SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) 202SIMD_IWRAPPER_2(and_si); // return a & b (int) 203SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) 204SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int) 205SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) 206SIMD_IWRAPPER_2(or_si); // return a | b (int) 207SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) 208SIMD_IWRAPPER_2(xor_si); // return a ^ b (int) 209 210 211//----------------------------------------------------------------------- 212// Shift operations 213//----------------------------------------------------------------------- 214template<int ImmT> 215static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const &a) // return a << ImmT 216{ 217 return Integer 218 { 219 SIMD256T::template slli_epi32<ImmT>(a.v8[0]), 220 SIMD256T::template slli_epi32<ImmT>(a.v8[1]), 221 }; 222} 223 224SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32) 225 226template<int ImmT> 227static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const &a) // return a >> ImmT (int32) 228{ 229 return Integer 230 { 231 SIMD256T::template srai_epi32<ImmT>(a.v8[0]), 232 SIMD256T::template srai_epi32<ImmT>(a.v8[1]), 233 }; 234} 235 236template<int ImmT> 237static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const &a) // return a >> ImmT (uint32) 238{ 239 return Integer 240 { 241 SIMD256T::template srli_epi32<ImmT>(a.v8[0]), 242 SIMD256T::template srli_epi32<ImmT>(a.v8[1]), 243 }; 244} 245 246template<int ImmT> // for each 128-bit lane: 247static SIMDINLINE Integer SIMDCALL srli_si(Integer const &a) // return a >> (ImmT*8) (uint) 248{ 249 return Integer 250 { 251 SIMD256T::template srli_si<ImmT>(a.v8[0]), 252 SIMD256T::template srli_si<ImmT>(a.v8[1]), 253 }; 254} 255template<int ImmT> 256static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a) // same as srli_si, but with Float cast to int 257{ 258 return Float 259 { 260 SIMD256T::template srlisi_ps<ImmT>(a.v8[0]), 261 SIMD256T::template srlisi_ps<ImmT>(a.v8[1]), 262 }; 263} 264 265SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32) 266 267//----------------------------------------------------------------------- 268// Conversion operations 269//----------------------------------------------------------------------- 270static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a) // return *(Float*)(&a) 271{ 272 return Float 273 { 274 SIMD256T::castpd_ps(a.v8[0]), 275 SIMD256T::castpd_ps(a.v8[1]), 276 }; 277} 278 279static SIMDINLINE Integer SIMDCALL castps_si(Float const &a) // return *(Integer*)(&a) 280{ 281 return Integer 282 { 283 SIMD256T::castps_si(a.v8[0]), 284 SIMD256T::castps_si(a.v8[1]), 285 }; 286} 287 288static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a) // return *(Double*)(&a) 289{ 290 return Double 291 { 292 SIMD256T::castsi_pd(a.v8[0]), 293 SIMD256T::castsi_pd(a.v8[1]), 294 }; 295} 296 297static SIMDINLINE Double SIMDCALL castps_pd(Float const &a) // return *(Double*)(&a) 298{ 299 return Double 300 { 301 SIMD256T::castps_pd(a.v8[0]), 302 SIMD256T::castps_pd(a.v8[1]), 303 }; 304} 305 306static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a) // return *(Float*)(&a) 307{ 308 return Float 309 { 310 SIMD256T::castsi_ps(a.v8[0]), 311 SIMD256T::castsi_ps(a.v8[1]), 312 }; 313} 314 315static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a) // return (float)a (int32 --> float) 316{ 317 return Float 318 { 319 SIMD256T::cvtepi32_ps(a.v8[0]), 320 SIMD256T::cvtepi32_ps(a.v8[1]), 321 }; 322} 323 324static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer const &a) // return (int16)a (uint8 --> int16) 325{ 326 return Integer 327 { 328 SIMD256T::cvtepu8_epi16(a.v4[0]), 329 SIMD256T::cvtepu8_epi16(a.v4[1]), 330 }; 331} 332 333static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer const &a) // return (int32)a (uint8 --> int32) 334{ 335 return Integer 336 { 337 SIMD256T::cvtepu8_epi32(a.v4[0]), 338 SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])), 339 }; 340} 341 342static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer const &a) // return (int32)a (uint16 --> int32) 343{ 344 return Integer 345 { 346 SIMD256T::cvtepu16_epi32(a.v4[0]), 347 SIMD256T::cvtepu16_epi32(a.v4[1]), 348 }; 349} 350 351static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer const &a) // return (int64)a (uint16 --> int64) 352{ 353 return Integer 354 { 355 SIMD256T::cvtepu16_epi64(a.v4[0]), 356 SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])), 357 }; 358} 359 360static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer const &a) // return (int64)a (uint32 --> int64) 361{ 362 return Integer 363 { 364 SIMD256T::cvtepu32_epi64(a.v4[0]), 365 SIMD256T::cvtepu32_epi64(a.v4[1]), 366 }; 367} 368 369static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a) // return (int32)a (float --> int32) 370{ 371 return Integer 372 { 373 SIMD256T::cvtps_epi32(a.v8[0]), 374 SIMD256T::cvtps_epi32(a.v8[1]), 375 }; 376} 377 378static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a) // return (int32)a (rnd_to_zero(float) --> int32) 379{ 380 return Integer 381 { 382 SIMD256T::cvtps_epi32(a.v8[0]), 383 SIMD256T::cvtps_epi32(a.v8[1]), 384 }; 385} 386 387//----------------------------------------------------------------------- 388// Comparison operations 389//----------------------------------------------------------------------- 390template<CompareType CmpTypeT> 391static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b 392{ 393 return Float 394 { 395 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]), 396 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]), 397 }; 398} 399static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); } 400static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); } 401static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); } 402static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); } 403static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); } 404static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); } 405 406template<CompareType CmpTypeT> 407static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const &a, Float const &b) 408{ 409 return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b))); 410} 411 412 413SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) 414SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) 415SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) 416SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) 417SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) 418SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) 419SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) 420SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) 421SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32) 422 423static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b) // return all_lanes_zero(a & b) ? 1 : 0 (float) 424{ 425 return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & 426 SIMD256T::testz_ps(a.v8[1], b.v8[1])); 427} 428 429static SIMDINLINE int SIMDCALL testz_si(Integer const &a, Integer const &b) // return all_lanes_zero(a & b) ? 1 : 0 (int) 430{ 431 return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & 432 SIMD256T::testz_si(a.v8[1], b.v8[1])); 433} 434 435//----------------------------------------------------------------------- 436// Blend / shuffle / permute operations 437//----------------------------------------------------------------------- 438SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) 439SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32) 440SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) 441static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int) 442{ 443 return Integer 444 { 445 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]), 446 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]), 447 }; 448} 449 450static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int) 451{ 452 return Integer 453 { 454 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]), 455 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]), 456 }; 457} 458 459static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value) 460{ 461 float f = *p; 462 return Float 463 { 464 SIMD256T::set1_ps(f), 465 SIMD256T::set1_ps(f), 466 }; 467} 468 469template<int imm> 470static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const &a) 471{ 472 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); 473 return a.v8[imm]; 474} 475 476template<int imm> 477static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const &a) 478{ 479 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); 480 return a.v8[imm]; 481} 482 483template<int imm> 484static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const &a) 485{ 486 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); 487 return a.v8[imm]; 488} 489 490template<int imm> 491static SIMDINLINE Float SIMDCALL insert_ps(Float const &a, SIMD256Impl::Float const &b) 492{ 493 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); 494 Float r = a; 495 r.v8[imm] = b; 496 return r; 497} 498 499template<int imm> 500static SIMDINLINE Double SIMDCALL insert_pd(Double const &a, SIMD256Impl::Double const &b) 501{ 502 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); 503 Double r = a; 504 r.v8[imm] = b; 505 return r; 506} 507 508template<int imm> 509static SIMDINLINE Integer SIMDCALL insert_si(Integer const &a, SIMD256Impl::Integer const &b) 510{ 511 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); 512 Integer r = a; 513 r.v8[imm] = b; 514 return r; 515} 516 517SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 518SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 519SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 520SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 521 522static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32) 523{ 524 return castps_si(permute_ps(castsi_ps(a), swiz)); 525} 526 527static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (float) 528{ 529 const auto mask = SIMD256T::set1_epi32(7); 530 531 auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask)); 532 auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask)); 533 534 auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask)); 535 auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask)); 536 537 return Float 538 { 539 SIMD256T::blendv_ps(lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))), 540 SIMD256T::blendv_ps(hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))), 541 }; 542} 543 544// All of the 512-bit permute2f128_XX intrinsics do the following: 545// 546// SELECT4(src, control) { 547// CASE(control[1:0]) 548// 0: tmp[127:0] : = src[127:0] 549// 1 : tmp[127:0] : = src[255:128] 550// 2 : tmp[127:0] : = src[383:256] 551// 3 : tmp[127:0] : = src[511:384] 552// ESAC 553// RETURN tmp[127:0] 554// } 555// 556// dst[127:0] : = SELECT4(a[511:0], imm8[1:0]) 557// dst[255:128] : = SELECT4(a[511:0], imm8[3:2]) 558// dst[383:256] : = SELECT4(b[511:0], imm8[5:4]) 559// dst[511:384] : = SELECT4(b[511:0], imm8[7:6]) 560// dst[MAX:512] : = 0 561// 562// Since the 256-bit AVX instructions use a 4-bit control field (instead 563// of 2-bit for AVX512), we need to expand the control bits sent to the 564// AVX instructions for emulation. 565// 566template <int shuf> 567static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const &a, Float const &b) 568{ 569 return Float 570 { 571 SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]), 572 SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]), 573 }; 574} 575 576template <int shuf> 577static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const &a, Double const &b) 578{ 579 return Double 580 { 581 SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]), 582 SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]), 583 }; 584} 585 586template <int shuf> 587static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const &a, Integer const &b) 588{ 589 return Integer 590 { 591 SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]), 592 SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]), 593 }; 594} 595 596SIMD_IWRAPPER_2I_1(shuffle_epi32); 597SIMD_IWRAPPER_2I_2(shuffle_epi64); 598SIMD_IWRAPPER_2(shuffle_epi8); 599SIMD_WRAPPER_2I_1(shuffle_pd); 600SIMD_WRAPPER_2I_1(shuffle_ps); 601SIMD_IWRAPPER_2(unpackhi_epi16); 602SIMD_IWRAPPER_2(unpackhi_epi32); 603SIMD_IWRAPPER_2(unpackhi_epi64); 604SIMD_IWRAPPER_2(unpackhi_epi8); 605SIMD_WRAPPER_2(unpackhi_pd); 606SIMD_WRAPPER_2(unpackhi_ps); 607SIMD_IWRAPPER_2(unpacklo_epi16); 608SIMD_IWRAPPER_2(unpacklo_epi32); 609SIMD_IWRAPPER_2(unpacklo_epi64); 610SIMD_IWRAPPER_2(unpacklo_epi8); 611SIMD_WRAPPER_2(unpacklo_pd); 612SIMD_WRAPPER_2(unpacklo_ps); 613 614//----------------------------------------------------------------------- 615// Load / store operations 616//----------------------------------------------------------------------- 617template<ScaleFactor ScaleT> 618static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) 619{ 620 return Float 621 { 622 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]), 623 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]), 624 }; 625} 626 627static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements) 628{ 629 return broadcast_ss(p); 630} 631 632static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory) 633{ 634 return Float 635 { 636 SIMD256T::load_ps(p), 637 SIMD256T::load_ps(p + TARGET_SIMD_WIDTH) 638 }; 639} 640 641static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p 642{ 643 return Integer 644 { 645 SIMD256T::load_si(&p->v8[0]), 646 SIMD256T::load_si(&p->v8[1]), 647 }; 648} 649 650static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem) 651{ 652 return Float 653 { 654 SIMD256T::loadu_ps(p), 655 SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH) 656 }; 657} 658 659static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem) 660{ 661 return Integer 662 { 663 SIMD256T::loadu_si(&p->v8[0]), 664 SIMD256T::loadu_si(&p->v8[1]), 665 }; 666} 667 668// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old 669template<ScaleFactor ScaleT> 670static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask) 671{ 672 return Float 673 { 674 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]), 675 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]), 676 }; 677} 678 679static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src) 680{ 681 SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]); 682 SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]); 683} 684 685static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const &a) 686{ 687 uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0])); 688 mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4); 689 690 return mask; 691} 692 693static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a) 694{ 695 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0])); 696 mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2); 697 698 return mask; 699} 700static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a) 701{ 702 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0])); 703 mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH; 704 705 return mask; 706} 707 708static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value) 709{ 710 return Integer 711 { 712 SIMD256T::set1_epi32(i), 713 SIMD256T::set1_epi32(i) 714 }; 715} 716 717static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value) 718{ 719 return Integer 720 { 721 SIMD256T::set1_epi8(i), 722 SIMD256T::set1_epi8(i) 723 }; 724} 725 726static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) 727{ 728 return Float 729 { 730 SIMD256T::set1_ps(f), 731 SIMD256T::set1_ps(f) 732 }; 733} 734 735static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) 736{ 737 return Float 738 { 739 SIMD256T::setzero_ps(), 740 SIMD256T::setzero_ps() 741 }; 742} 743 744static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) 745{ 746 return Integer 747 { 748 SIMD256T::setzero_si(), 749 SIMD256T::setzero_si() 750 }; 751} 752 753static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a) // *p = a (stores all elements contiguously in memory) 754{ 755 SIMD256T::store_ps(p, a.v8[0]); 756 SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]); 757} 758 759static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a) // *p = a 760{ 761 SIMD256T::store_si(&p->v8[0], a.v8[0]); 762 SIMD256T::store_si(&p->v8[1], a.v8[1]); 763} 764 765static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a) // *p = a (same as store_ps, but doesn't keep memory in cache) 766{ 767 SIMD256T::stream_ps(p, a.v8[0]); 768 SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]); 769} 770 771static SIMDINLINE Integer SIMDCALL set_epi32( 772 int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8, 773 int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) 774{ 775 return Integer 776 { 777 SIMD256T::set_epi32( 778 i7, i6, i5, i4, i3, i2, i1, i0), 779 SIMD256T::set_epi32( 780 i15, i14, i13, i12, i11, i10, i9, i8) 781 }; 782} 783 784static SIMDINLINE Integer SIMDCALL set_epi32( 785 int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) 786{ 787 return set_epi32( 788 0, 0, 0, 0, 0, 0, 0, 0, 789 i7, i6, i5, i4, i3, i2, i1, i0); 790} 791 792static SIMDINLINE Float SIMDCALL set_ps( 793 float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8, 794 float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) 795{ 796 return Float 797 { 798 SIMD256T::set_ps( 799 i7, i6, i5, i4, i3, i2, i1, i0), 800 SIMD256T::set_ps( 801 i15, i14, i13, i12, i11, i10, i9, i8) 802 }; 803} 804 805static SIMDINLINE Float SIMDCALL set_ps( 806 float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) 807{ 808 return set_ps( 809 0, 0, 0, 0, 0, 0, 0, 0, 810 i7, i6, i5, i4, i3, i2, i1, i0); 811} 812 813static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) 814{ 815 return Float 816 { 817 SIMD256T::vmask_ps(mask), 818 SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH) 819 }; 820} 821 822#undef SIMD_WRAPPER_1 823#undef SIMD_WRAPPER_2 824#undef SIMD_WRAPPER_2I 825#undef SIMD_WRAPPER_2I_1 826#undef SIMD_WRAPPER_3 827#undef SIMD_IWRAPPER_1 828#undef SIMD_IWRAPPER_2 829#undef SIMD_IWRAPPER_2I 830#undef SIMD_IWRAPPER_2I_1 831#undef SIMD_IWRAPPER_3 832