// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_AVX512_H
#define EIGEN_PACKET_MATH_AVX512_H

namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif

typedef __m512 Packet16f;
typedef __m512i Packet16i;
typedef __m512d Packet8d;

template <>
struct is_arithmetic<__m512> {
  enum { value = true };
};
template <>
struct is_arithmetic<__m512i> {
  enum { value = true };
};
template <>
struct is_arithmetic<__m512d> {
  enum { value = true };
};

template<> struct packet_traits<float>  : default_packet_traits
{
  typedef Packet16f type;
  typedef Packet8f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 16,
    HasHalfPacket = 1,
#if EIGEN_GNUC_AT_LEAST(5, 3)
#ifdef EIGEN_VECTORIZE_AVX512DQ
    HasLog = 1,
#endif
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
#endif
    HasDiv = 1
  };
};
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet8d type;
  typedef Packet4d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,
    HasHalfPacket = 1,
#if EIGEN_GNUC_AT_LEAST(5, 3)
    HasSqrt = 1,
    HasRsqrt = EIGEN_FAST_MATH,
#endif
    HasDiv = 1
  };
};

/* TODO Implement AVX512 for integers
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet16i type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=8
  };
};
*/

template <>
struct unpacket_traits<Packet16f> {
  typedef float type;
  typedef Packet8f half;
  enum { size = 16, alignment=Aligned64 };
};
template <>
struct unpacket_traits<Packet8d> {
  typedef double type;
  typedef Packet4d half;
  enum { size = 8, alignment=Aligned64 };
};
template <>
struct unpacket_traits<Packet16i> {
  typedef int type;
  typedef Packet8i half;
  enum { size = 16, alignment=Aligned64 };
};

template <>
EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
  return _mm512_set1_ps(from);
}
template <>
EIGEN_STRONG_INLINE Packet8d pset1<Packet8d>(const double& from) {
  return _mm512_set1_pd(from);
}
template <>
EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
  return _mm512_set1_epi32(from);
}

template <>
EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
  return _mm512_broadcastss_ps(_mm_load_ps1(from));
}
template <>
EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
  return _mm512_broadcastsd_pd(_mm_load_pd1(from));
}
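// plset: returns the linearly spaced packet {a, a+1, ..., a+15}
// (resp. {a, a+1, ..., a+7} for doubles).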
template <>
EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
  return _mm512_add_ps(
      _mm512_set1_ps(a),
      _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f,
                    7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
}
template <>
EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
  return _mm512_add_pd(_mm512_set1_pd(a),
                       _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
}

template <>
EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  return _mm512_add_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_add_pd(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  return _mm512_sub_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_sub_pd(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
  return _mm512_sub_ps(_mm512_set1_ps(0.0), a);
}
template <>
EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
  return _mm512_sub_pd(_mm512_set1_pd(0.0), a);
}

template <>
EIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) {
  return a;
}
template <>
EIGEN_STRONG_INLINE Packet8d pconj(const Packet8d& a) {
  return a;
}
template <>
EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) {
  return a;
}

template <>
EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  return _mm512_mul_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_mul_pd(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  return _mm512_div_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_div_pd(a, b);
}

#ifdef __FMA__
template <>
EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
                                    const Packet16f& c) {
  return _mm512_fmadd_ps(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
                                   const Packet8d& c) {
  return _mm512_fmadd_pd(a, b, c);
}
#endif

template <>
EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  return _mm512_min_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_min_pd(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
  return _mm512_max_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
  return _mm512_max_pd(a, b);
}
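// Bitwise logical operations. Plain AVX-512F has no 512-bit bitwise
// intrinsics operating directly on float/double vectors, so unless
// EIGEN_VECTORIZE_AVX512DQ is available the operations below fall back to
// processing the packet as four 128-bit (resp. two 256-bit) lanes.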
template <>
EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_and_ps(a, b);
#else
  Packet16f res = _mm512_undefined_ps();
  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
  res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);

  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
  res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);

  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
  res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);

  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
  res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);

  return res;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_and_pd(a, b);
#else
  Packet8d res = _mm512_undefined_pd();
  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
  res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0);

  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
  res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);

  return res;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a,
                                             const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_or_ps(a, b);
#else
  Packet16f res = _mm512_undefined_ps();
  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
  res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);

  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
  res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);

  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
  res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);

  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
  res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);

  return res;
#endif
}

template <>
EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
                                           const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_or_pd(a, b);
#else
  Packet8d res = _mm512_undefined_pd();
  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
  res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);

  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
  res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);

  return res;
#endif
}

template <>
EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a,
                                              const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_xor_ps(a, b);
#else
  Packet16f res = _mm512_undefined_ps();
  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
  res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);

  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
  res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);

  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
  res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);

  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
  res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);

  return res;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a,
                                            const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_xor_pd(a, b);
#else
  Packet8d res = _mm512_undefined_pd();
  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);

  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
  res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);

  return res;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a,
                                                 const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_andnot_ps(a, b);
#else
  Packet16f res = _mm512_undefined_ps();
  Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0);

  Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1);

  Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2);

  Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
  Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
  res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3);

  return res;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,
                                               const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  return _mm512_andnot_pd(a, b);
#else
  Packet8d res = _mm512_undefined_pd();
  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);

  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
  res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);

  return res;
#endif
}

template <>
EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
}
template <>
EIGEN_STRONG_INLINE Packet8d pload<Packet8d>(const double* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_pd(from);
}
template <>
EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
      reinterpret_cast<const __m512i*>(from));
}

template <>
EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ps(from);
}
template <>
EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_pd(from);
}
template <>
EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
      reinterpret_cast<const __m512i*>(from));
}
// Loads 8 floats from memory and returns the packet
// {a0, a0, a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
template <>
EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
  Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "inplace" permutation of the lower 128bits using a blend
  lane0 = _mm256_blend_ps(
      lane0, _mm256_castps128_ps256(_mm_permute_ps(
                 _mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))),
      15);
  // then we can perform a consistent permutation on the global register to get
  // everything in shape:
  lane0 = _mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2));

  Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4));
  // mimic an "inplace" permutation of the lower 128bits using a blend
  lane1 = _mm256_blend_ps(
      lane1, _mm256_castps128_ps256(_mm_permute_ps(
                 _mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))),
      15);
  // then we can perform a consistent permutation on the global register to get
  // everything in shape:
  lane1 = _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2));

#ifdef EIGEN_VECTORIZE_AVX512DQ
  Packet16f res = _mm512_undefined_ps();
  res = _mm512_insertf32x8(res, lane0, 0);
  res = _mm512_insertf32x8(res, lane1, 1);
  return res;
#else
  Packet16f res = _mm512_undefined_ps();
  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0);
  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1);
  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2);
  res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3);
  return res;
#endif
}
// Loads 4 doubles from memory and returns the packet
// {a0, a0, a1, a1, a2, a2, a3, a3}
template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
  Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from);
  lane0 = _mm256_permute_pd(lane0, 3 << 2);

  Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2));
  lane1 = _mm256_permute_pd(lane1, 3 << 2);

  Packet8d res = _mm512_undefined_pd();
  res = _mm512_insertf64x4(res, lane0, 0);
  return _mm512_insertf64x4(res, lane1, 1);
}

// Loads 4 floats from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
template <>
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
  Packet16f tmp = _mm512_undefined_ps();
  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
  tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
  return tmp;
}
// Loads 2 doubles from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1}
template <>
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
  Packet8d tmp = _mm512_undefined_pd();
  Packet2d tmp0 = _mm_load_pd1(from);
  Packet2d tmp1 = _mm_load_pd1(from + 1);
  Packet4d lane0 = _mm256_broadcastsd_pd(tmp0);
  Packet4d lane1 = _mm256_broadcastsd_pd(tmp1);
  tmp = _mm512_insertf64x4(tmp, lane0, 0);
  return _mm512_insertf64x4(tmp, lane1, 1);
}

template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_pd(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to),
                                                from);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ps(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_pd(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
      reinterpret_cast<__m512i*>(to), from);
}
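// Strided gathers/scatters: the element indices are computed as
// stride * {0, 1, ..., N-1}; the last argument of the gather/scatter
// intrinsics is the element scale in bytes (4 for float, 8 for double).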
template <>
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
                                                             Index stride) {
  Packet16i stride_vector = _mm512_set1_epi32(stride);
  Packet16i stride_multiplier =
      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);

  return _mm512_i32gather_ps(indices, from, 4);
}
template <>
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
                                                            Index stride) {
  Packet8i stride_vector = _mm256_set1_epi32(stride);
  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);

  return _mm512_i32gather_pd(indices, from, 8);
}

template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
                                                         const Packet16f& from,
                                                         Index stride) {
  Packet16i stride_vector = _mm512_set1_epi32(stride);
  Packet16i stride_multiplier =
      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
  _mm512_i32scatter_ps(to, indices, from, 4);
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
                                                         const Packet8d& from,
                                                         Index stride) {
  Packet8i stride_vector = _mm256_set1_epi32(stride);
  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
  _mm512_i32scatter_pd(to, indices, from, 8);
}

template <>
EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
  Packet16f pa = pset1<Packet16f>(a);
  pstore(to, pa);
}
template <>
EIGEN_STRONG_INLINE void pstore1<Packet8d>(double* to, const double& a) {
  Packet8d pa = pset1<Packet8d>(a);
  pstore(to, pa);
}
template <>
EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
  Packet16i pa = pset1<Packet16i>(a);
  pstore(to, pa);
}

template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }

template <>
EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
  return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0));
}
template <>
EIGEN_STRONG_INLINE double pfirst<Packet8d>(const Packet8d& a) {
  return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(a, 0), 0));
}
template <>
EIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) {
  return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
}

template<> EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a)
{
  return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
}

template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
{
  return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
}

template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
{
  // _mm512_abs_ps intrinsic not found, so hack around it
  return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff));
}
template <>
EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
  // _mm512_abs_pd intrinsic not found, so hack around it
  return (__m512d)_mm512_and_si512((__m512i)a,
                                   _mm512_set1_epi64(0x7fffffffffffffff));
}

#ifdef EIGEN_VECTORIZE_AVX512DQ
// AVX512F does not define _mm512_extractf32x8_ps to extract __m256 from __m512
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)        \
  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
  __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1);
#else
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                \
  __m256 OUTPUT##_0 = _mm256_insertf128_ps(                     \
      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
      _mm512_extractf32x4_ps(INPUT, 1), 1);                     \
  __m256 OUTPUT##_1 = _mm256_insertf128_ps(                     \
      _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
      _mm512_extractf32x4_ps(INPUT, 3), 1);
#endif

#ifdef EIGEN_VECTORIZE_AVX512DQ
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0);        \
  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
#else
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB)                    \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
#endif
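// preduxp: transposing horizontal sum. Coefficient i of the returned packet
// is the sum of all coefficients of vecs[i].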
template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f* vecs)
{
  EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14);
  EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15);

  __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0);
  __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0);
  __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0);
  __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0);

  __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
  __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
  __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
  __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);

  __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
  __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
  __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
  __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);

  __m256 sum1 = _mm256_add_ps(perm1, hsum5);
  __m256 sum2 = _mm256_add_ps(perm2, hsum6);
  __m256 sum3 = _mm256_add_ps(perm3, hsum7);
  __m256 sum4 = _mm256_add_ps(perm4, hsum8);

  __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
  __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);

  __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);

  hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1);
  hsum2 = _mm256_hadd_ps(vecs2_1, vecs3_1);
  hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1);
  hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1);

  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
  hsum8 = _mm256_hadd_ps(hsum4, hsum4);

  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);

  sum1 = _mm256_add_ps(perm1, hsum5);
  sum2 = _mm256_add_ps(perm2, hsum6);
  sum3 = _mm256_add_ps(perm3, hsum7);
  sum4 = _mm256_add_ps(perm4, hsum8);

  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);

  final = padd(final, _mm256_blend_ps(blend1, blend2, 0xf0));

  hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0);
  hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0);
  hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0);
  hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0);

  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
  hsum8 = _mm256_hadd_ps(hsum4, hsum4);

  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);

  sum1 = _mm256_add_ps(perm1, hsum5);
  sum2 = _mm256_add_ps(perm2, hsum6);
  sum3 = _mm256_add_ps(perm3, hsum7);
  sum4 = _mm256_add_ps(perm4, hsum8);

  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);

  __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0);

  hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1);
  hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1);
  hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1);
  hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1);

  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
  hsum8 = _mm256_hadd_ps(hsum4, hsum4);

  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);

  sum1 = _mm256_add_ps(perm1, hsum5);
  sum2 = _mm256_add_ps(perm2, hsum6);
  sum3 = _mm256_add_ps(perm3, hsum7);
  sum4 = _mm256_add_ps(perm4, hsum8);

  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);

  final_1 = padd(final_1, _mm256_blend_ps(blend1, blend2, 0xf0));

  __m512 final_output;

  EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1);
  return final_output;
}
template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
{
  Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0);
  Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1);

  Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0);
  Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1);

  Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0);
  Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1);

  Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0);
  Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1);

  Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0);
  Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1);

  Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0);
  Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1);

  Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0);
  Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1);

  Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0);
  Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1);

  Packet4d tmp0, tmp1;

  tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC);

  tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC));

  tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC);

  tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));

  __m512d final_output = _mm512_undefined_pd();
  final_output = _mm512_insertf64x4(final_output, final_0, 0);

  return _mm512_insertf64x4(final_output, final_1, 1);
}

template <>
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
//#ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
  Packet8f sum = padd(lane0, lane1);
  Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(sum, sum, 1));
  tmp0 = _mm256_hadd_ps(tmp0, tmp0);
  return pfirst(_mm256_hadd_ps(tmp0, tmp0));
#else
  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
  Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3));
  sum = _mm_hadd_ps(sum, sum);
  sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
  return pfirst(sum);
#endif
}
template <>
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
  Packet4d sum = padd(lane0, lane1);
  Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
  return pfirst(_mm256_hadd_pd(tmp0, tmp0));
}
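// predux_downto4: reduce a packet to one of half its size by adding its two
// halves, i.e. coefficient i of the result is a[i] + a[i + N/2].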
template <>
EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
  return padd(lane0, lane1);
#else
  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
  Packet4f sum0 = padd(lane0, lane2);
  Packet4f sum1 = padd(lane1, lane3);
  return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
  Packet4d res = padd(lane0, lane1);
  return res;
}

template <>
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
//#ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
  Packet8f res = pmul(lane0, lane1);
  res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
  res = pmul(res, _mm256_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(pmul(res, _mm256_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#else
  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
  Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
  res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#endif
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
  Packet4d res = pmul(lane0, lane1);
  res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
  return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
}

template <>
EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
  Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
  res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
  Packet4d res = _mm256_min_pd(lane0, lane1);
  res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
  return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
}

template <>
EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
  Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
  res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
  return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
  Packet4d res = _mm256_max_pd(lane0, lane1);
  res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
  return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
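// palign_impl<Offset>::run(first, second) shifts the pair (first, second)
// left by Offset scalars, i.e. on return
//   first = { first[Offset], ..., first[N-1], second[0], ..., second[Offset-1] }.
// Each operand is permuted across lanes and the upper Offset elements of the
// result are taken from the permuted 'second' via a masked blend.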
template <int Offset>
struct palign_impl<Offset, Packet16f> {
  static EIGEN_STRONG_INLINE void run(Packet16f& first,
                                      const Packet16f& second) {
    if (Offset != 0) {
      __m512i first_idx = _mm512_set_epi32(
          Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
          Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
          Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);

      __m512i second_idx =
          _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
                           Offset - 5, Offset - 6, Offset - 7, Offset - 8,
                           Offset - 9, Offset - 10, Offset - 11, Offset - 12,
                           Offset - 13, Offset - 14, Offset - 15, Offset - 16);

      unsigned short mask = 0xFFFF;
      mask <<= (16 - Offset);

      first = _mm512_permutexvar_ps(first_idx, first);
      Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
      first = _mm512_mask_blend_ps(mask, first, tmp);
    }
  }
};
template <int Offset>
struct palign_impl<Offset, Packet8d> {
  static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
    if (Offset != 0) {
      __m512i first_idx = _mm512_set_epi32(
          0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
          Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);

      __m512i second_idx = _mm512_set_epi32(
          0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
          Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);

      unsigned char mask = 0xFF;
      mask <<= (8 - Offset);

      first = _mm512_permutexvar_pd(first_idx, first);
      Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
      first = _mm512_mask_blend_pd(mask, first, tmp);
    }
  }
};


#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
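// 16x16 transpose: rows are interleaved pairwise with unpacklo/unpackhi,
// shuffles then gather 4-wide column groups within each 128-bit lane, and the
// resulting registers are split into 256-bit halves, recombined with 128-bit
// lane permutes, and finally packed back into sixteen Packet16f outputs.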
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
  __m512 T8 = _mm512_unpacklo_ps(kernel.packet[8], kernel.packet[9]);
  __m512 T9 = _mm512_unpackhi_ps(kernel.packet[8], kernel.packet[9]);
  __m512 T10 = _mm512_unpacklo_ps(kernel.packet[10], kernel.packet[11]);
  __m512 T11 = _mm512_unpackhi_ps(kernel.packet[10], kernel.packet[11]);
  __m512 T12 = _mm512_unpacklo_ps(kernel.packet[12], kernel.packet[13]);
  __m512 T13 = _mm512_unpackhi_ps(kernel.packet[12], kernel.packet[13]);
  __m512 T14 = _mm512_unpacklo_ps(kernel.packet[14], kernel.packet[15]);
  __m512 T15 = _mm512_unpackhi_ps(kernel.packet[14], kernel.packet[15]);
  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));

  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);
  EIGEN_EXTRACT_8f_FROM_16f(S4, S4);
  EIGEN_EXTRACT_8f_FROM_16f(S5, S5);
  EIGEN_EXTRACT_8f_FROM_16f(S6, S6);
  EIGEN_EXTRACT_8f_FROM_16f(S7, S7);
  EIGEN_EXTRACT_8f_FROM_16f(S8, S8);
  EIGEN_EXTRACT_8f_FROM_16f(S9, S9);
  EIGEN_EXTRACT_8f_FROM_16f(S10, S10);
  EIGEN_EXTRACT_8f_FROM_16f(S11, S11);
  EIGEN_EXTRACT_8f_FROM_16f(S12, S12);
  EIGEN_EXTRACT_8f_FROM_16f(S13, S13);
  EIGEN_EXTRACT_8f_FROM_16f(S14, S14);
  EIGEN_EXTRACT_8f_FROM_16f(S15, S15);

  PacketBlock<Packet8f, 32> tmp;

  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20);
  tmp.packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20);
  tmp.packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20);
  tmp.packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20);
  tmp.packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31);
  tmp.packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31);
  tmp.packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31);
  tmp.packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31);

  tmp.packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20);
  tmp.packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20);
  tmp.packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20);
  tmp.packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20);
  tmp.packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31);
  tmp.packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31);
  tmp.packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31);
  tmp.packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31);

  // Second set of _m256 outputs
  tmp.packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20);
  tmp.packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20);
  tmp.packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20);
  tmp.packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20);
  tmp.packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31);
  tmp.packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31);
  tmp.packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31);
  tmp.packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31);
  tmp.packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20);
  tmp.packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20);
  tmp.packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20);
  tmp.packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20);
  tmp.packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31);
  tmp.packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31);
  tmp.packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31);
  tmp.packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31);

  // Pack them into the output
  PACK_OUTPUT(kernel.packet, tmp.packet, 0, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 1, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 2, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 3, 16);

  PACK_OUTPUT(kernel.packet, tmp.packet, 4, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 5, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 6, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 7, 16);

  PACK_OUTPUT(kernel.packet, tmp.packet, 8, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 9, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 10, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 11, 16);

  PACK_OUTPUT(kernel.packet, tmp.packet, 12, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 13, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
}
#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE)         \
  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \
                           INPUT[2 * INDEX + STRIDE]);

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);

  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));

  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);

  PacketBlock<Packet8f, 8> tmp;

  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20);
  tmp.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20);
  tmp.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31);
  tmp.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31);

  tmp.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20);
  tmp.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20);
  tmp.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31);
  tmp.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31);

  PACK_OUTPUT_2(kernel.packet, tmp.packet, 0, 1);
  PACK_OUTPUT_2(kernel.packet, tmp.packet, 1, 1);
  PACK_OUTPUT_2(kernel.packet, tmp.packet, 2, 1);
  PACK_OUTPUT_2(kernel.packet, tmp.packet, 3, 1);
}

#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE)                \
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1);

#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE)                         \
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
  OUTPUT[INDEX] =                                                           \
      _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
  __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
  __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff);
  __m512d T2 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
  __m512d T3 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff);

  PacketBlock<Packet4d, 8> tmp;

  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
                                         _mm512_extractf64x4_pd(T2, 0), 0x20);
  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
                                         _mm512_extractf64x4_pd(T3, 0), 0x20);
  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
                                         _mm512_extractf64x4_pd(T2, 0), 0x31);
  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
                                         _mm512_extractf64x4_pd(T3, 0), 0x31);

  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
                                         _mm512_extractf64x4_pd(T2, 1), 0x20);
  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
                                         _mm512_extractf64x4_pd(T3, 1), 0x20);
  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
                                         _mm512_extractf64x4_pd(T2, 1), 0x31);
  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
                                         _mm512_extractf64x4_pd(T3, 1), 0x31);

  PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1);
  PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1);
  PACK_OUTPUT_D(kernel.packet, tmp.packet, 2, 1);
  PACK_OUTPUT_D(kernel.packet, tmp.packet, 3, 1);
}

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
  __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
  __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
  __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]);
  __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]);
  __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]);
  __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]);
  __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
  __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);

  PacketBlock<Packet4d, 16> tmp;

  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
                                         _mm512_extractf64x4_pd(T2, 0), 0x20);
  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
                                         _mm512_extractf64x4_pd(T3, 0), 0x20);
  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
                                         _mm512_extractf64x4_pd(T2, 0), 0x31);
  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
                                         _mm512_extractf64x4_pd(T3, 0), 0x31);

  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
                                         _mm512_extractf64x4_pd(T2, 1), 0x20);
  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
                                         _mm512_extractf64x4_pd(T3, 1), 0x20);
  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
                                         _mm512_extractf64x4_pd(T2, 1), 0x31);
  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
                                         _mm512_extractf64x4_pd(T3, 1), 0x31);

  tmp.packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
                                         _mm512_extractf64x4_pd(T6, 0), 0x20);
  tmp.packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
                                         _mm512_extractf64x4_pd(T7, 0), 0x20);
  tmp.packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0),
                                          _mm512_extractf64x4_pd(T6, 0), 0x31);
  tmp.packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0),
                                          _mm512_extractf64x4_pd(T7, 0), 0x31);

  tmp.packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
                                          _mm512_extractf64x4_pd(T6, 1), 0x20);
  tmp.packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
                                          _mm512_extractf64x4_pd(T7, 1), 0x20);
  tmp.packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1),
                                          _mm512_extractf64x4_pd(T6, 1), 0x31);
  tmp.packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1),
                                          _mm512_extractf64x4_pd(T7, 1), 0x31);

  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 0, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 1, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 2, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 3, 8);

  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 4, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 5, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 6, 8);
  PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8);
}
template <>
EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
                                     const Packet16f& /*thenPacket*/,
                                     const Packet16f& /*elsePacket*/) {
  assert(false && "To be implemented");
  return Packet16f();
}
template <>
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
                                    const Packet8d& /*thenPacket*/,
                                    const Packet8d& /*elsePacket*/) {
  assert(false && "To be implemented");
  return Packet8d();
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_PACKET_MATH_AVX512_H