1/**************************************************************************** 2 * Copyright (C) 2017 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 ****************************************************************************/ 23#if !defined(__SIMD_LIB_AVX_HPP__) 24#error Do not include this file directly, use "simdlib.hpp" instead. 25#endif 26 27//============================================================================ 28// SIMD16 AVX (1) implementation 29//============================================================================ 30 31static const int TARGET_SIMD_WIDTH = 8; 32using SIMD128T = SIMD128Impl::AVXImpl; 33 34#define SIMD_WRAPPER_1(op) \ 35 static SIMDINLINE Float SIMDCALL op(Float const& a) \ 36 { \ 37 return Float{ \ 38 SIMD256T::op(a.v8[0]), \ 39 SIMD256T::op(a.v8[1]), \ 40 }; \ 41 } 42 43#define SIMD_WRAPPER_2(op) \ 44 static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \ 45 { \ 46 return Float{ \ 47 SIMD256T::op(a.v8[0], b.v8[0]), \ 48 SIMD256T::op(a.v8[1], b.v8[1]), \ 49 }; \ 50 } 51 52#define SIMD_WRAPPER_2I(op) \ 53 template <int ImmT> \ 54 static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \ 55 { \ 56 return Float{ \ 57 SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \ 58 SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \ 59 }; \ 60 } 61 62#define SIMD_WRAPPER_2I_1(op) \ 63 template <int ImmT> \ 64 static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \ 65 { \ 66 return Float{ \ 67 SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \ 68 SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \ 69 }; \ 70 } 71 72#define SIMD_WRAPPER_3(op) \ 73 static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \ 74 { \ 75 return Float{ \ 76 SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \ 77 SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \ 78 }; \ 79 } 80 81#define SIMD_IWRAPPER_1(op) \ 82 static SIMDINLINE Integer SIMDCALL op(Integer const& a) \ 83 { \ 84 return Integer{ \ 85 SIMD256T::op(a.v8[0]), \ 86 SIMD256T::op(a.v8[1]), \ 87 }; \ 88 } 89 90#define SIMD_IWRAPPER_2(op) \ 91 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ 92 { \ 93 return Integer{ \ 94 SIMD256T::op(a.v8[0], b.v8[0]), \ 95 SIMD256T::op(a.v8[1], b.v8[1]), \ 96 }; \ 97 } 98 99#define SIMD_IWRAPPER_2I(op) \ 100 template <int ImmT> \ 101 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ 102 { \ 103 return Integer{ \ 104 SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \ 105 SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \ 106 }; \ 107 } 108 109#define SIMD_IWRAPPER_2I_1(op) \ 110 template <int ImmT> \ 111 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ 112 { \ 113 return Integer{ \ 114 SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \ 115 SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \ 116 }; \ 117 } 118 119#define SIMD_IWRAPPER_2I_2(op) \ 120 template <int ImmT> \ 121 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \ 122 { \ 123 return Integer{ \ 124 SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]), \ 125 SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]), \ 126 }; \ 127 } 128 129#define SIMD_IWRAPPER_3(op) \ 130 static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \ 131 { \ 132 return Integer{ \ 133 SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \ 134 SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \ 135 }; \ 136 } 137 138//----------------------------------------------------------------------- 139// Single precision floating point arithmetic operations 140//----------------------------------------------------------------------- 141SIMD_WRAPPER_2(add_ps); // return a + b 142SIMD_WRAPPER_2(div_ps); // return a / b 143SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c 144SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c 145SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b 146SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b 147SIMD_WRAPPER_2(mul_ps); // return a * b 148SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a 149SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a) 150SIMD_WRAPPER_2(sub_ps); // return a - b 151 152template <RoundMode RMT> 153static SIMDINLINE Float SIMDCALL round_ps(Float const& a) 154{ 155 return Float{ 156 SIMD256T::template round_ps<RMT>(a.v8[0]), 157 SIMD256T::template round_ps<RMT>(a.v8[1]), 158 }; 159} 160 161static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a) 162{ 163 return round_ps<RoundMode::CEIL_NOEXC>(a); 164} 165static SIMDINLINE Float SIMDCALL floor_ps(Float const& a) 166{ 167 return round_ps<RoundMode::FLOOR_NOEXC>(a); 168} 169 170//----------------------------------------------------------------------- 171// Integer (various width) arithmetic operations 172//----------------------------------------------------------------------- 173SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32) 174SIMD_IWRAPPER_2(add_epi32); // return a + b (int32) 175SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) 176SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 177SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) 178SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) 179SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) 180SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32) 181SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32) 182 183// return (a * b) & 0xFFFFFFFF 184// 185// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, 186// and store the low 32 bits of the intermediate integers in dst. 187SIMD_IWRAPPER_2(mullo_epi32); 188SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32) 189SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64) 190SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) 191 192//----------------------------------------------------------------------- 193// Logical operations 194//----------------------------------------------------------------------- 195SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) 196SIMD_IWRAPPER_2(and_si); // return a & b (int) 197SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) 198SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int) 199SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) 200SIMD_IWRAPPER_2(or_si); // return a | b (int) 201SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) 202SIMD_IWRAPPER_2(xor_si); // return a ^ b (int) 203 204//----------------------------------------------------------------------- 205// Shift operations 206//----------------------------------------------------------------------- 207template <int ImmT> 208static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a) // return a << ImmT 209{ 210 return Integer{ 211 SIMD256T::template slli_epi32<ImmT>(a.v8[0]), 212 SIMD256T::template slli_epi32<ImmT>(a.v8[1]), 213 }; 214} 215 216SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32) 217 218template <int ImmT> 219static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a) // return a >> ImmT (int32) 220{ 221 return Integer{ 222 SIMD256T::template srai_epi32<ImmT>(a.v8[0]), 223 SIMD256T::template srai_epi32<ImmT>(a.v8[1]), 224 }; 225} 226 227template <int ImmT> 228static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a) // return a >> ImmT (uint32) 229{ 230 return Integer{ 231 SIMD256T::template srli_epi32<ImmT>(a.v8[0]), 232 SIMD256T::template srli_epi32<ImmT>(a.v8[1]), 233 }; 234} 235 236template <int ImmT> // for each 128-bit lane: 237static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a) // return a >> (ImmT*8) (uint) 238{ 239 return Integer{ 240 SIMD256T::template srli_si<ImmT>(a.v8[0]), 241 SIMD256T::template srli_si<ImmT>(a.v8[1]), 242 }; 243} 244template <int ImmT> 245static SIMDINLINE Float SIMDCALL 246 srlisi_ps(Float const& a) // same as srli_si, but with Float cast to int 247{ 248 return Float{ 249 SIMD256T::template srlisi_ps<ImmT>(a.v8[0]), 250 SIMD256T::template srlisi_ps<ImmT>(a.v8[1]), 251 }; 252} 253 254SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32) 255 256//----------------------------------------------------------------------- 257// Conversion operations 258//----------------------------------------------------------------------- 259static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a) 260{ 261 return Float{ 262 SIMD256T::castpd_ps(a.v8[0]), 263 SIMD256T::castpd_ps(a.v8[1]), 264 }; 265} 266 267static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a) 268{ 269 return Integer{ 270 SIMD256T::castps_si(a.v8[0]), 271 SIMD256T::castps_si(a.v8[1]), 272 }; 273} 274 275static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a) 276{ 277 return Double{ 278 SIMD256T::castsi_pd(a.v8[0]), 279 SIMD256T::castsi_pd(a.v8[1]), 280 }; 281} 282 283static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a) 284{ 285 return Double{ 286 SIMD256T::castps_pd(a.v8[0]), 287 SIMD256T::castps_pd(a.v8[1]), 288 }; 289} 290 291static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a) 292{ 293 return Float{ 294 SIMD256T::castsi_ps(a.v8[0]), 295 SIMD256T::castsi_ps(a.v8[1]), 296 }; 297} 298 299static SIMDINLINE Float SIMDCALL 300 cvtepi32_ps(Integer const& a) // return (float)a (int32 --> float) 301{ 302 return Float{ 303 SIMD256T::cvtepi32_ps(a.v8[0]), 304 SIMD256T::cvtepi32_ps(a.v8[1]), 305 }; 306} 307 308static SIMDINLINE Integer SIMDCALL 309 cvtepu8_epi16(SIMD256Impl::Integer const& a) // return (int16)a (uint8 --> int16) 310{ 311 return Integer{ 312 SIMD256T::cvtepu8_epi16(a.v4[0]), 313 SIMD256T::cvtepu8_epi16(a.v4[1]), 314 }; 315} 316 317static SIMDINLINE Integer SIMDCALL 318 cvtepu8_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint8 --> int32) 319{ 320 return Integer{ 321 SIMD256T::cvtepu8_epi32(a.v4[0]), 322 SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])), 323 }; 324} 325 326static SIMDINLINE Integer SIMDCALL 327 cvtepu16_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint16 --> int32) 328{ 329 return Integer{ 330 SIMD256T::cvtepu16_epi32(a.v4[0]), 331 SIMD256T::cvtepu16_epi32(a.v4[1]), 332 }; 333} 334 335static SIMDINLINE Integer SIMDCALL 336 cvtepu16_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint16 --> int64) 337{ 338 return Integer{ 339 SIMD256T::cvtepu16_epi64(a.v4[0]), 340 SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])), 341 }; 342} 343 344static SIMDINLINE Integer SIMDCALL 345 cvtepu32_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint32 --> int64) 346{ 347 return Integer{ 348 SIMD256T::cvtepu32_epi64(a.v4[0]), 349 SIMD256T::cvtepu32_epi64(a.v4[1]), 350 }; 351} 352 353static SIMDINLINE Integer SIMDCALL 354 cvtps_epi32(Float const& a) // return (int32)a (float --> int32) 355{ 356 return Integer{ 357 SIMD256T::cvtps_epi32(a.v8[0]), 358 SIMD256T::cvtps_epi32(a.v8[1]), 359 }; 360} 361 362static SIMDINLINE Integer SIMDCALL 363 cvttps_epi32(Float const& a) // return (int32)a (rnd_to_zero(float) --> int32) 364{ 365 return Integer{ 366 SIMD256T::cvtps_epi32(a.v8[0]), 367 SIMD256T::cvtps_epi32(a.v8[1]), 368 }; 369} 370 371//----------------------------------------------------------------------- 372// Comparison operations 373//----------------------------------------------------------------------- 374template <CompareType CmpTypeT> 375static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b 376{ 377 return Float{ 378 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]), 379 SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]), 380 }; 381} 382static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b) 383{ 384 return cmp_ps<CompareType::LT_OQ>(a, b); 385} 386static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b) 387{ 388 return cmp_ps<CompareType::GT_OQ>(a, b); 389} 390static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b) 391{ 392 return cmp_ps<CompareType::NEQ_OQ>(a, b); 393} 394static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b) 395{ 396 return cmp_ps<CompareType::EQ_OQ>(a, b); 397} 398static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b) 399{ 400 return cmp_ps<CompareType::GE_OQ>(a, b); 401} 402static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b) 403{ 404 return cmp_ps<CompareType::LE_OQ>(a, b); 405} 406 407template <CompareType CmpTypeT> 408static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b) 409{ 410 return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b))); 411} 412 413SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8) 414SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16) 415SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32) 416SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64) 417SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8) 418SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16) 419SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32) 420SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64) 421SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32) 422 423static SIMDINLINE bool SIMDCALL 424 testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float) 425{ 426 return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & SIMD256T::testz_ps(a.v8[1], b.v8[1])); 427} 428 429static SIMDINLINE bool SIMDCALL 430 testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int) 431{ 432 return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & SIMD256T::testz_si(a.v8[1], b.v8[1])); 433} 434 435//----------------------------------------------------------------------- 436// Blend / shuffle / permute operations 437//----------------------------------------------------------------------- 438SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float) 439SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32) 440SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float) 441static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a, 442 Integer const& b, 443 Float const& mask) // return mask ? b : a (int) 444{ 445 return Integer{ 446 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]), 447 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]), 448 }; 449} 450 451static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a, 452 Integer const& b, 453 Integer const& mask) // return mask ? b : a (int) 454{ 455 return Integer{ 456 SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]), 457 SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]), 458 }; 459} 460 461static SIMDINLINE Float SIMDCALL 462 broadcast_ss(float const* p) // return *p (all elements in vector get same value) 463{ 464 float f = *p; 465 return Float{ 466 SIMD256T::set1_ps(f), 467 SIMD256T::set1_ps(f), 468 }; 469} 470 471template <int imm> 472static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a) 473{ 474 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); 475 return a.v8[imm]; 476} 477 478template <int imm> 479static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a) 480{ 481 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); 482 return a.v8[imm]; 483} 484 485template <int imm> 486static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a) 487{ 488 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); 489 return a.v8[imm]; 490} 491 492template <int imm> 493static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b) 494{ 495 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); 496 Float r = a; 497 r.v8[imm] = b; 498 return r; 499} 500 501template <int imm> 502static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b) 503{ 504 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); 505 Double r = a; 506 r.v8[imm] = b; 507 return r; 508} 509 510template <int imm> 511static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b) 512{ 513 SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm); 514 Integer r = a; 515 r.v8[imm] = b; 516 return r; 517} 518 519SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16 520SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32 521SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16 522SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32 523 524template <int ImmT> 525static SIMDINLINE Float SIMDCALL permute_ps(Float const& a) 526{ 527 return Float{ 528 SIMD256T::template permute_ps<ImmT>(a.v8[0]), 529 SIMD256T::template permute_ps<ImmT>(a.v8[1]), 530 }; 531} 532 533static SIMDINLINE Integer SIMDCALL permute_epi32( 534 Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32) 535{ 536 return castps_si(permute_ps(castsi_ps(a), swiz)); 537} 538 539static SIMDINLINE Float SIMDCALL 540 permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float) 541{ 542 const auto mask = SIMD256T::set1_epi32(7); 543 544 auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask)); 545 auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask)); 546 547 auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask)); 548 auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask)); 549 550 return Float{ 551 SIMD256T::blendv_ps( 552 lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))), 553 SIMD256T::blendv_ps( 554 hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))), 555 }; 556} 557 558// All of the 512-bit permute2f128_XX intrinsics do the following: 559// 560// SELECT4(src, control) { 561// CASE(control[1:0]) 562// 0 : tmp[127:0] : = src[127:0] 563// 1 : tmp[127:0] : = src[255:128] 564// 2 : tmp[127:0] : = src[383:256] 565// 3 : tmp[127:0] : = src[511:384] 566// ESAC 567// RETURN tmp[127:0] 568// } 569// 570// dst[127:0] : = SELECT4(a[511:0], imm8[1:0]) 571// dst[255:128] : = SELECT4(a[511:0], imm8[3:2]) 572// dst[383:256] : = SELECT4(b[511:0], imm8[5:4]) 573// dst[511:384] : = SELECT4(b[511:0], imm8[7:6]) 574// dst[MAX:512] : = 0 575// 576// Since the 256-bit AVX instructions use a 4-bit control field (instead 577// of 2-bit for AVX512), we need to expand the control bits sent to the 578// AVX instructions for emulation. 579// 580template <int shuf> 581static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b) 582{ 583 return Float{ 584 SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], 585 a.v8[1]), 586 SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], 587 b.v8[1]), 588 }; 589} 590 591template <int shuf> 592static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b) 593{ 594 return Double{ 595 SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], 596 a.v8[1]), 597 SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], 598 b.v8[1]), 599 }; 600} 601 602template <int shuf> 603static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b) 604{ 605 return Integer{ 606 SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], 607 a.v8[1]), 608 SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], 609 b.v8[1]), 610 }; 611} 612 613SIMD_IWRAPPER_2I_1(shuffle_epi32); 614SIMD_IWRAPPER_2I_2(shuffle_epi64); 615SIMD_IWRAPPER_2(shuffle_epi8); 616SIMD_WRAPPER_2I_1(shuffle_pd); 617SIMD_WRAPPER_2I_1(shuffle_ps); 618SIMD_IWRAPPER_2(unpackhi_epi16); 619SIMD_IWRAPPER_2(unpackhi_epi32); 620SIMD_IWRAPPER_2(unpackhi_epi64); 621SIMD_IWRAPPER_2(unpackhi_epi8); 622SIMD_WRAPPER_2(unpackhi_pd); 623SIMD_WRAPPER_2(unpackhi_ps); 624SIMD_IWRAPPER_2(unpacklo_epi16); 625SIMD_IWRAPPER_2(unpacklo_epi32); 626SIMD_IWRAPPER_2(unpacklo_epi64); 627SIMD_IWRAPPER_2(unpacklo_epi8); 628SIMD_WRAPPER_2(unpacklo_pd); 629SIMD_WRAPPER_2(unpacklo_ps); 630 631//----------------------------------------------------------------------- 632// Load / store operations 633//----------------------------------------------------------------------- 634template <ScaleFactor ScaleT = ScaleFactor::SF_1> 635static SIMDINLINE Float SIMDCALL 636 i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) 637{ 638 return Float{ 639 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]), 640 SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]), 641 }; 642} 643 644template <ScaleFactor ScaleT = ScaleFactor::SF_1> 645static SIMDINLINE Float SIMDCALL 646 sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) 647{ 648 return Float{ 649 SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[0]), 650 SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[1]), 651 }; 652} 653 654static SIMDINLINE Float SIMDCALL 655 load1_ps(float const* p) // return *p (broadcast 1 value to all elements) 656{ 657 return broadcast_ss(p); 658} 659 660static SIMDINLINE Float SIMDCALL 661 load_ps(float const* p) // return *p (loads SIMD width elements from memory) 662{ 663 return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)}; 664} 665 666static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p 667{ 668 return Integer{ 669 SIMD256T::load_si(&p->v8[0]), 670 SIMD256T::load_si(&p->v8[1]), 671 }; 672} 673 674static SIMDINLINE Float SIMDCALL 675 loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem) 676{ 677 return Float{SIMD256T::loadu_ps(p), SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)}; 678} 679 680static SIMDINLINE Integer SIMDCALL 681 loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem) 682{ 683 return Integer{ 684 SIMD256T::loadu_si(&p->v8[0]), 685 SIMD256T::loadu_si(&p->v8[1]), 686 }; 687} 688 689// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old 690template <ScaleFactor ScaleT = ScaleFactor::SF_1> 691static SIMDINLINE Float SIMDCALL 692 mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) 693{ 694 return Float{ 695 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]), 696 SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]), 697 }; 698} 699 700template <ScaleFactor ScaleT = ScaleFactor::SF_1> 701static SIMDINLINE Float SIMDCALL 702 sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask) 703{ 704 return Float{ 705 SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]), 706 SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]), 707 }; 708} 709 710static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src) 711{ 712 SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]); 713 SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]); 714} 715 716static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a) 717{ 718 uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0])); 719 mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4); 720 721 return mask; 722} 723 724static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a) 725{ 726 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0])); 727 mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2); 728 729 return mask; 730} 731static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a) 732{ 733 uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0])); 734 mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH; 735 736 return mask; 737} 738 739static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value) 740{ 741 return Integer{SIMD256T::set1_epi32(i), SIMD256T::set1_epi32(i)}; 742} 743 744static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value) 745{ 746 return Integer{SIMD256T::set1_epi8(i), SIMD256T::set1_epi8(i)}; 747} 748 749static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value) 750{ 751 return Float{SIMD256T::set1_ps(f), SIMD256T::set1_ps(f)}; 752} 753 754static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float) 755{ 756 return Float{SIMD256T::setzero_ps(), SIMD256T::setzero_ps()}; 757} 758 759static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer) 760{ 761 return Integer{SIMD256T::setzero_si(), SIMD256T::setzero_si()}; 762} 763 764static SIMDINLINE void SIMDCALL 765 store_ps(float* p, Float const& a) // *p = a (stores all elements contiguously in memory) 766{ 767 SIMD256T::store_ps(p, a.v8[0]); 768 SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]); 769} 770 771static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a 772{ 773 SIMD256T::store_si(&p->v8[0], a.v8[0]); 774 SIMD256T::store_si(&p->v8[1], a.v8[1]); 775} 776 777static SIMDINLINE void SIMDCALL 778 stream_ps(float* p, Float const& a) // *p = a (same as store_ps, but doesn't keep memory in cache) 779{ 780 SIMD256T::stream_ps(p, a.v8[0]); 781 SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]); 782} 783 784static SIMDINLINE Integer SIMDCALL set_epi32(int i15, 785 int i14, 786 int i13, 787 int i12, 788 int i11, 789 int i10, 790 int i9, 791 int i8, 792 int i7, 793 int i6, 794 int i5, 795 int i4, 796 int i3, 797 int i2, 798 int i1, 799 int i0) 800{ 801 return Integer{SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0), 802 SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8)}; 803} 804 805static SIMDINLINE Integer SIMDCALL 806 set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) 807{ 808 return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0); 809} 810 811static SIMDINLINE Float SIMDCALL set_ps(float i15, 812 float i14, 813 float i13, 814 float i12, 815 float i11, 816 float i10, 817 float i9, 818 float i8, 819 float i7, 820 float i6, 821 float i5, 822 float i4, 823 float i3, 824 float i2, 825 float i1, 826 float i0) 827{ 828 return Float{SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0), 829 SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8)}; 830} 831 832static SIMDINLINE Float SIMDCALL 833 set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) 834{ 835 return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0); 836} 837 838static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask) 839{ 840 return Float{SIMD256T::vmask_ps(mask), SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)}; 841} 842 843#undef SIMD_WRAPPER_1 844#undef SIMD_WRAPPER_2 845#undef SIMD_WRAPPER_2I 846#undef SIMD_WRAPPER_2I_1 847#undef SIMD_WRAPPER_3 848#undef SIMD_IWRAPPER_1 849#undef SIMD_IWRAPPER_2 850#undef SIMD_IWRAPPER_2I 851#undef SIMD_IWRAPPER_2I_1 852#undef SIMD_IWRAPPER_3 853