/****************************************************************************
* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
// gcc as of 7.1 was missing these intrinsics
#ifndef _mm512_cmpneq_ps_mask
#define _mm512_cmpneq_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_NEQ_UQ)
#endif

#ifndef _mm512_cmplt_ps_mask
#define _mm512_cmplt_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_LT_OS)
#endif

#ifndef _mm512_cmplt_pd_mask
#define _mm512_cmplt_pd_mask(a, b) _mm512_cmp_pd_mask((a), (b), _CMP_LT_OS)
#endif

#endif

//============================================================================
// SIMD16 AVX512 (F) implementation (compatible with Knights and Core
// processors)
//
//============================================================================

static const int TARGET_SIMD_WIDTH = 16;
using SIMD256T = SIMD256Impl::AVX2Impl;

#define SIMD_WRAPPER_1_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    {\
        return intrin(a);\
    }

#define SIMD_WRAPPER_1(op) \
    SIMD_WRAPPER_1_(op, _mm512_##op)

#define SIMD_WRAPPER_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm512_##intrin(a, b);\
    }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)

#define SIMD_WRAPPERI_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm512_castsi512_ps(_mm512_##intrin(\
            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
    }

#define SIMD_DWRAPPER_2(op) \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {\
        return _mm512_##op(a, b);\
    }

#define SIMD_WRAPPER_2I_(op, intrin) \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)

#define SIMD_DWRAPPER_2I_(op, intrin) \
    template<int ImmT>\
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
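// Illustrative expansion (not part of the interface): these wrapper macros
// map a generic SIMD op name onto the matching _mm512_* intrinsic. For
// example, SIMD_WRAPPER_2(add_ps) used below expands to:
//
//     static SIMDINLINE Float SIMDCALL add_ps(Float a, Float b)
//     {
//         return _mm512_add_ps(a, b);
//     }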
#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    {\
        return _mm512_##op(a, b, c);\
    }

#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm512_##op(a);\
    }
#define SIMD_IWRAPPER_1_8(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
    {\
        return _mm512_##op(a);\
    }

#define SIMD_IWRAPPER_1_4(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
    {\
        return _mm512_##op(a);\
    }

#define SIMD_IWRAPPER_1I_(op, intrin) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return intrin(a, ImmT);\
    }
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)

#define SIMD_IWRAPPER_2_(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm512_##intrin(a, b);\
    }
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)

#define SIMD_IWRAPPER_2_CMP(op, cmp) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return cmp(a, b);\
    }

#define SIMD_IFWRAPPER_2(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)));\
    }

#define SIMD_IWRAPPER_2I_(op, intrin) \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)

private:
    static SIMDINLINE Integer vmask(__mmask16 m)
    {
        return _mm512_maskz_set1_epi32(m, -1);
    }

    static SIMDINLINE Integer vmask(__mmask8 m)
    {
        return _mm512_maskz_set1_epi64(m, -1LL);
    }

public:
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps);                         // return a + b
SIMD_WRAPPER_2(div_ps);                         // return a / b
SIMD_WRAPPER_3(fmadd_ps);                       // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps);                       // return (a * b) - c
SIMD_WRAPPER_2(max_ps);                         // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);                         // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);                         // return a * b
SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps);       // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps);   // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);                         // return a - b

template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
    return _mm512_roundscale_ps(a, static_cast<int>(RMT));
}

static SIMDINLINE Float SIMDCALL ceil_ps(Float a)  { return round_ps<RoundMode::CEIL_NOEXC>(a);  }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
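// For illustration only: round_ps rounds each of the 16 lanes according to
// the RoundMode immediate, so (assuming the usual RoundMode semantics)
// floor_ps(set1_ps(1.5f)) yields 1.0f in every lane and
// ceil_ps(set1_ps(1.5f)) yields 2.0f in every lane; the *_NOEXC modes
// suppress floating point exceptions while rounding.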
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32);     // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32);     // return a + b (int32)
//SIMD_IWRAPPER_2(add_epi8);    // return a + b (int8)
//SIMD_IWRAPPER_2(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32);     // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32);     // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32);     // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32);     // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32);     // return a * b (int32)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate
// 64-bit integers, and store the low 32 bits of the intermediate integers
// in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32);     // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64);     // return a - b (int64)
//SIMD_IWRAPPER_2(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)

//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_si512);        // return a & b       (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_si512);  // return (~a) & b    (int)
SIMD_IWRAPPER_2_(or_si, or_si512);          // return a | b       (int)
SIMD_IWRAPPER_2_(xor_si, xor_si512);        // return a ^ b       (int)

// SIMD_WRAPPER_2(and_ps);      // return a & b       (float treated as int)
// SIMD_WRAPPER_2(andnot_ps);   // return (~a) & b    (float treated as int)
// SIMD_WRAPPER_2(or_ps);       // return a | b       (float treated as int)
// SIMD_WRAPPER_2(xor_ps);      // return a ^ b       (float treated as int)

//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32);   // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32);    // return a << b (uint32)
SIMD_IWRAPPER_1I(srai_epi32);   // return a >> ImmT   (int32)
SIMD_IWRAPPER_1I(srli_epi32);   // return a >> ImmT   (uint32)

#if 0
SIMD_IWRAPPER_1I_(srli_si, srli_si512);     // return a >> (ImmT*8) (uint)

template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
    return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
#endif

SIMD_IWRAPPER_2(srlv_epi32);    // return a >> b (uint32)

//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a)    // return *(Float*)(&a)
{
    return _mm512_castpd_ps(a);
}

static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
{
    return _mm512_castps_si512(a);
}

static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)  // return *(Double*)(&a)
{
    return _mm512_castsi512_pd(a);
}

static SIMDINLINE Double SIMDCALL castps_pd(Float a)    // return *(Double*)(&a)
{
    return _mm512_castps_pd(a);
}

static SIMDINLINE Integer SIMDCALL castpd_si(Double a)  // return *(Integer*)(&a)
{
    return _mm512_castpd_si512(a);
}

static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
{
    return _mm512_castsi512_ps(a);
}

static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
{
    return _mm512_cvtepi32_ps(a);
}

//SIMD_IWRAPPER_1_8(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16)
SIMD_IWRAPPER_1_4(cvtepu8_epi32);       // return (int32)a    (uint8 --> int32)
SIMD_IWRAPPER_1_8(cvtepu16_epi32);      // return (int32)a    (uint16 --> int32)
SIMD_IWRAPPER_1_4(cvtepu16_epi64);      // return (int64)a    (uint16 --> int64)
SIMD_IWRAPPER_1_8(cvtepu32_epi64);      // return (int64)a    (uint32 --> int64)

static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)     // return (int32)a    (float --> int32)
{
    return _mm512_cvtps_epi32(a);
}

static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)    // return (int32)a    (rnd_to_zero(float) --> int32)
{
    return _mm512_cvttps_epi32(a);
}
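// Note the distinction between the cast*_* and cvt* groups above: casts only
// reinterpret the underlying 512 bits, while cvt* operations convert values.
// For illustration: castps_si(set1_ps(1.0f)) yields 0x3F800000 (the IEEE-754
// bit pattern of 1.0f) in each lane, whereas cvtps_epi32(set1_ps(1.0f))
// yields the integer 1 in each lane.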
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
    return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT));
}

template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
    // Legacy vector mask generator
    __mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
    return castsi_ps(vmask(result));
}

static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)  { return cmp_ps<CompareType::LT_OQ>(a, b);  }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)  { return cmp_ps<CompareType::GT_OQ>(a, b);  }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)  { return cmp_ps<CompareType::EQ_OQ>(a, b);  }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)  { return cmp_ps<CompareType::GE_OQ>(a, b);  }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)  { return cmp_ps<CompareType::LE_OQ>(a, b);  }

template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
{
    // Legacy vector mask generator
    __mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT));
    return vmask(result);
}

template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
{
    // Legacy vector mask generator
    __mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<const int>(CmpTypeT));
    return vmask(result);
}

//SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8<CompareTypeInt::EQ>);   // return a == b (int8)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>);  // return a == b (int16)
SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>);    // return a == b (int32)
SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>);    // return a == b (int64)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8<CompareTypeInt::GT>);   // return a > b (int8)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>);  // return a > b (int16)
SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>);    // return a > b (int32)
SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>);    // return a > b (int64)
SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>);    // return a < b (int32)

static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
    return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))));
}

static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
    return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b)));
}
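// For illustration only: unlike the native AVX512 __mmask16 comparisons,
// cmp_ps and cmp_epi32 above return "legacy" vector masks for AVX/AVX2
// compatibility, with each 32-bit lane set to all ones (-1) where the
// comparison holds and all zeros where it does not. E.g. for non-NaN a,
// movemask_ps(cmpeq_ps(a, a)) == 0xFFFF.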
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
template <int ImmT>
static SIMDINLINE Float blend_ps(Float a, Float b)  // return ImmT ? b : a (float)
{
    return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
}

template <int ImmT>
static SIMDINLINE Integer blend_epi32(Integer a, Integer b) // return ImmT ? b : a (int32)
{
    return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
}

static SIMDINLINE Float blendv_ps(Float a, Float b, Float mask) // return mask ? b : a (float)
{
    return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
}

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}

static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
    return _mm512_set1_ps(*p);
}

template<int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
    return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
}

template<int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
    return _mm512_extractf64x4_pd(a, imm);
}

template<int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
    return _mm512_extracti64x4_epi64(a, imm);
}

template<int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
    return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
}

template<int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
    return _mm512_insertf64x4(a, b, imm);
}

template<int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
    return _mm512_inserti64x4(a, b, imm);
}

// SIMD_IWRAPPER_2(packs_epi16);    // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
// SIMD_IWRAPPER_2(packs_epi32);    // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
// SIMD_IWRAPPER_2(packus_epi16);   // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
// SIMD_IWRAPPER_2(packus_epi32);   // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32

static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)   // return a[swiz[i]] for each 32-bit lane i (int32)
{
    return _mm512_permutexvar_epi32(swiz, a);
}

static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)          // return a[swiz[i]] for each 32-bit lane i (float)
{
    return _mm512_permutexvar_ps(swiz, a);
}

SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);

SIMD_IWRAPPER_1I(shuffle_epi32);

//SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
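// For illustration only: permute_ps gathers lanes of a by index, so with
// swiz = setzero_si() every output lane receives lane 0 of a (a full
// broadcast), and with swiz = set_epi32(0, 1, ..., 15) the lane order of
// the vector is reversed.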
template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}

SIMD_IWRAPPER_2(unpackhi_epi16);

//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
    return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}

SIMD_IWRAPPER_2(unpackhi_epi64);
//SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
//SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
//SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);

//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT));
}

static SIMDINLINE Float SIMDCALL load1_ps(float const *p)   // return *p (broadcast 1 value to all elements)
{
    return broadcast_ss(p);
}

static SIMDINLINE Float SIMDCALL load_ps(float const *p)    // return *p (loads SIMD width elements from memory)
{
    return _mm512_load_ps(p);
}

static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)    // return *p
{
    return _mm512_load_si512(&p->v);
}

static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)   // return *p (same as load_ps but allows for unaligned mem)
{
    return _mm512_loadu_ps(p);
}

static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p)   // return *p (same as load_si but allows for unaligned mem)
{
    return _mm512_loadu_si512(p);
}
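// For illustration only: with a 4-byte scale factor, lane i of i32gather_ps
// receives *(float const*)((char const*)p + idx[i] * 4), i.e. p[idx[i]]
// when idx holds element indices into a float array.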
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps());

    return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
}

static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
    Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());
    _mm512_mask_store_ps(p, m, src);
}

//static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
//{
//    __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
//    return static_cast<uint64_t>(m);
//}

static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
    __mmask8 m = _mm512_test_epi64_mask(castpd_si(a), set1_epi64(0x8000000000000000LL));
    return static_cast<uint32_t>(m);
}

static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
    __mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x80000000));
    return static_cast<uint32_t>(m);
}

static SIMDINLINE Integer SIMDCALL set1_epi64(long long i)  // return i (all elements are same value)
{
    return _mm512_set1_epi64(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi32(int i)        // return i (all elements are same value)
{
    return _mm512_set1_epi32(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi8(char i)        // return i (all elements are same value)
{
    return _mm512_set1_epi8(i);
}

static SIMDINLINE Float SIMDCALL set1_ps(float f)           // return f (all elements are same value)
{
    return _mm512_set1_ps(f);
}

static SIMDINLINE Double SIMDCALL setzero_pd()              // return 0 (double)
{
    return _mm512_setzero_pd();
}

static SIMDINLINE Float SIMDCALL setzero_ps()               // return 0 (float)
{
    return _mm512_setzero_ps();
}

static SIMDINLINE Integer SIMDCALL setzero_si()             // return 0 (integer)
{
    return _mm512_setzero_si512();
}

static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)     // *p = a (stores all elements contiguously in memory)
{
    _mm512_store_ps(p, a);
}

static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
    _mm512_store_si512(&p->v, a);
}

static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
{
    _mm512_storeu_si512(&p->v, a);
}

static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)    // *p = a (same as store_ps, but doesn't keep memory in cache)
{
    _mm512_stream_ps(p, a);
}
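// For illustration only: load_ps/store_ps/load_si/store_si assume 64-byte
// aligned pointers (the loadu/storeu variants relax this), and stream_ps
// issues a non-temporal store so the written cache lines are not retained;
// a typical use is writing large output buffers that won't be re-read soon.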
static SIMDINLINE Integer SIMDCALL set_epi32(
    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return _mm512_set_epi32(
        i15, i14, i13, i12, i11, i10, i9, i8,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Integer SIMDCALL set_epi32(
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return set_epi32(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return _mm512_set_ps(
        i15, i14, i13, i12, i11, i10, i9, i8,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return set_ps(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1)));
}

#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I_
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1_4
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_CMP
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2I
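// Example usage sketch (illustrative only; assumes the simdlib.hpp wrapper
// exposes this implementation as a SIMD16-wide class, here called SIMD512):
//
//     SIMD512::Float a = SIMD512::set1_ps(2.0f);
//     SIMD512::Float b = SIMD512::load_ps(src);        // src 64-byte aligned
//     SIMD512::Float r = SIMD512::fmadd_ps(a, b, SIMD512::set1_ps(1.0f));
//     SIMD512::store_ps(dst, r);                       // dst[i] = 2*src[i] + 1
//
// Note that the set_epi32 / set_ps argument lists run from the highest lane
// (i15) down to the lowest (i0), matching the _mm512_set_* intrinsics.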