#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
#define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_

#include <algorithm>
#include <cstdint>

#include "PacketMathAVX2.h"

namespace Eigen {
namespace internal {

typedef eigen_packet_wrapper<__m512i, 30> Packet64q8i;
typedef eigen_packet_wrapper<__m512i, 31> Packet32q16i;
typedef eigen_packet_wrapper<__m512i, 32> Packet64q8u;
typedef eigen_packet_wrapper<__m512i, 33> Packet16q32i;

template <>
struct packet_traits<QInt8> : default_packet_traits {
  typedef Packet64q8i type;
  typedef Packet32q8i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 64,
  };
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
template <>
struct packet_traits<QUInt8> : default_packet_traits {
  typedef Packet64q8u type;
  typedef Packet32q8u half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 64,
  };
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
template <>
struct packet_traits<QInt16> : default_packet_traits {
  typedef Packet32q16i type;
  typedef Packet16q16i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 32,
  };
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
template <>
struct packet_traits<QInt32> : default_packet_traits {
  typedef Packet16q32i type;
  typedef Packet8q32i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 16,
  };
  enum {
    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};

template <>
struct unpacket_traits<Packet64q8i> {
  typedef QInt8 type;
  typedef Packet32q8i half;
  enum {
    size = 64,
    alignment = Aligned64,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet32q16i> {
  typedef QInt16 type;
  typedef Packet16q16i half;
  enum {
    size = 32,
    alignment = Aligned64,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet64q8u> {
  typedef QUInt8 type;
  typedef Packet32q8u half;
  enum {
    size = 64,
    alignment = Aligned64,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet16q32i> {
  typedef QInt32 type;
  typedef Packet8q32i half;
  enum {
    size = 16,
    alignment = Aligned64,
    masked_load_available = false,
    masked_store_available = false
  };
};
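// All four packet types above share the same __m512i payload; the second
// eigen_packet_wrapper template argument is only a unique tag that keeps them
// distinct types for overload resolution. The 8- and 16-bit traits advertise
// min/max but no arithmetic because plain AVX-512F has no 512-bit byte/word
// add/sub/mul; those need AVX512BW. QInt32 maps onto native dword
// instructions and therefore gets the full arithmetic set. A minimal
// compile-time sanity sketch of the sizes (comments only, nothing below is
// required by the implementation):
//   static_assert(packet_traits<QInt32>::size == 16, "16 x 32 bits = 512");
//   static_assert(packet_traits<QInt8>::size == 64, "64 x 8 bits = 512");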
// Unaligned load
template <>
EIGEN_STRONG_INLINE Packet64q8i ploadu<Packet64q8i>(const QInt8* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
      reinterpret_cast<const __m512i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet32q16i ploadu<Packet32q16i>(const QInt16* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
      reinterpret_cast<const __m512i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet64q8u ploadu<Packet64q8u>(const QUInt8* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
      reinterpret_cast<const __m512i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16q32i ploadu<Packet16q32i>(const QInt32* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
      reinterpret_cast<const __m512i*>(from));
}

// Aligned load
template <>
EIGEN_STRONG_INLINE Packet64q8i pload<Packet64q8i>(const QInt8* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
      reinterpret_cast<const __m512i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet32q16i pload<Packet32q16i>(const QInt16* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
      reinterpret_cast<const __m512i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet64q8u pload<Packet64q8u>(const QUInt8* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
      reinterpret_cast<const __m512i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16q32i pload<Packet16q32i>(const QInt32* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
      reinterpret_cast<const __m512i*>(from));
}

// Unaligned store
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet64q8i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
      reinterpret_cast<__m512i*>(to), from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet32q16i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
      reinterpret_cast<__m512i*>(to), from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet64q8u& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
      reinterpret_cast<__m512i*>(to), from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet16q32i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
      reinterpret_cast<__m512i*>(to), from.m_val);
}

// Aligned store
template <>
EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet16q32i& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
                                               from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet64q8u& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
                                               from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet64q8i& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
                                               from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet32q16i& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
                                               from.m_val);
}
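// Usage sketch (hypothetical buffer names; pload/pstore require the 64-byte
// alignment advertised as Aligned64 above, the *u variants do not):
//   EIGEN_ALIGN64 QInt32 in[16], out[16];
//   Packet16q32i p = pload<Packet16q32i>(in);
//   pstore(out, p);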
// Extract first element.
template <>
EIGEN_STRONG_INLINE QInt32 pfirst<Packet16q32i>(const Packet16q32i& a) {
  return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a.m_val, 0));
}
template <>
EIGEN_STRONG_INLINE QUInt8 pfirst<Packet64q8u>(const Packet64q8u& a) {
  return static_cast<uint8_t>(
      _mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0));
}
template <>
EIGEN_STRONG_INLINE QInt8 pfirst<Packet64q8i>(const Packet64q8i& a) {
  return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0);
}
template <>
EIGEN_STRONG_INLINE QInt16 pfirst<Packet32q16i>(const Packet32q16i& a) {
  return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.m_val, 0), 0);
}

// Initialize to constant value.
template <>
EIGEN_STRONG_INLINE Packet64q8i pset1<Packet64q8i>(const QInt8& from) {
  return _mm512_set1_epi8(from.value);
}
template <>
EIGEN_STRONG_INLINE Packet32q16i pset1<Packet32q16i>(const QInt16& from) {
  return _mm512_set1_epi16(from.value);
}
template <>
EIGEN_STRONG_INLINE Packet64q8u pset1<Packet64q8u>(const QUInt8& from) {
  return _mm512_set1_epi8(static_cast<uint8_t>(from.value));
}
template <>
EIGEN_STRONG_INLINE Packet16q32i pset1<Packet16q32i>(const QInt32& from) {
  return _mm512_set1_epi32(from.value);
}

// Basic arithmetic packet ops for QInt32.
template <>
EIGEN_STRONG_INLINE Packet16q32i padd<Packet16q32i>(const Packet16q32i& a,
                                                    const Packet16q32i& b) {
  return _mm512_add_epi32(a.m_val, b.m_val);
}
template <>
EIGEN_STRONG_INLINE Packet16q32i psub<Packet16q32i>(const Packet16q32i& a,
                                                    const Packet16q32i& b) {
  return _mm512_sub_epi32(a.m_val, b.m_val);
}
// Note: mullo truncates the result to 32 bits.
template <>
EIGEN_STRONG_INLINE Packet16q32i pmul<Packet16q32i>(const Packet16q32i& a,
                                                    const Packet16q32i& b) {
  return _mm512_mullo_epi32(a.m_val, b.m_val);
}
template <>
EIGEN_STRONG_INLINE Packet16q32i pnegate<Packet16q32i>(const Packet16q32i& a) {
  return _mm512_sub_epi32(_mm512_setzero_si512(), a.m_val);
}
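// Worked example of the pmul truncation noted above: each lane keeps only the
// low 32 bits of the full product, so with
//   Packet16q32i p = pmul(pset1<Packet16q32i>(QInt32(1 << 16)),
//                         pset1<Packet16q32i>(QInt32(1 << 16)));
// every lane holds 0, because 2^16 * 2^16 = 2^32 wraps to 0 modulo 2^32.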
// Min and max.
template <>
EIGEN_STRONG_INLINE Packet16q32i pmin<Packet16q32i>(const Packet16q32i& a,
                                                    const Packet16q32i& b) {
  return _mm512_min_epi32(a.m_val, b.m_val);
}
template <>
EIGEN_STRONG_INLINE Packet16q32i pmax<Packet16q32i>(const Packet16q32i& a,
                                                    const Packet16q32i& b) {
  return _mm512_max_epi32(a.m_val, b.m_val);
}

template <>
EIGEN_STRONG_INLINE Packet64q8u pmin<Packet64q8u>(const Packet64q8u& a,
                                                  const Packet64q8u& b) {
#ifdef EIGEN_VECTORIZE_AVX512BW
  return _mm512_min_epu8(a.m_val, b.m_val);
#else
  __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
  __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
  __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
  __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
  __m256i r0 = _mm256_min_epu8(ap0, bp0);
  __m256i r1 = _mm256_min_epu8(ap1, bp1);
  return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet64q8u pmax<Packet64q8u>(const Packet64q8u& a,
                                                  const Packet64q8u& b) {
#ifdef EIGEN_VECTORIZE_AVX512BW
  return _mm512_max_epu8(a.m_val, b.m_val);
#else
  __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
  __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
  __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
  __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
  __m256i r0 = _mm256_max_epu8(ap0, bp0);
  __m256i r1 = _mm256_max_epu8(ap1, bp1);
  return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
#endif
}

template <>
EIGEN_STRONG_INLINE Packet64q8i pmin<Packet64q8i>(const Packet64q8i& a,
                                                  const Packet64q8i& b) {
#ifdef EIGEN_VECTORIZE_AVX512BW
  return _mm512_min_epi8(a.m_val, b.m_val);
#else
  __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
  __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
  __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
  __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
  __m256i r0 = _mm256_min_epi8(ap0, bp0);
  __m256i r1 = _mm256_min_epi8(ap1, bp1);
  return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet32q16i pmin<Packet32q16i>(const Packet32q16i& a,
                                                    const Packet32q16i& b) {
#ifdef EIGEN_VECTORIZE_AVX512BW
  return _mm512_min_epi16(a.m_val, b.m_val);
#else
  __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
  __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
  __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
  __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
  __m256i r0 = _mm256_min_epi16(ap0, bp0);
  __m256i r1 = _mm256_min_epi16(ap1, bp1);
  return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet64q8i pmax<Packet64q8i>(const Packet64q8i& a,
                                                  const Packet64q8i& b) {
#ifdef EIGEN_VECTORIZE_AVX512BW
  return _mm512_max_epi8(a.m_val, b.m_val);
#else
  __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
  __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
  __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
  __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
  __m256i r0 = _mm256_max_epi8(ap0, bp0);
  __m256i r1 = _mm256_max_epi8(ap1, bp1);
  return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet32q16i pmax<Packet32q16i>(const Packet32q16i& a,
                                                    const Packet32q16i& b) {
#ifdef EIGEN_VECTORIZE_AVX512BW
  return _mm512_max_epi16(a.m_val, b.m_val);
#else
  __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
  __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
  __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
  __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
  __m256i r0 = _mm256_max_epi16(ap0, bp0);
  __m256i r1 = _mm256_max_epi16(ap1, bp1);
  return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
#endif
}
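// Without AVX512BW there are no 512-bit byte/word min/max instructions, so
// the fallbacks above split each operand into two 256-bit halves, apply the
// AVX2 operation per half, and reassemble the result with a cast plus insert.
// Note that the 32x8 extract/insert forms used here are, to our knowledge,
// AVX512DQ instructions, so the fallback path assumes DQ support.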
// Reductions.
template <>
EIGEN_STRONG_INLINE QInt32 predux_min<Packet16q32i>(const Packet16q32i& a) {
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
  Packet4i res =
      _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3));
  res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
  return pfirst(res);
}
template <>
EIGEN_STRONG_INLINE QInt32 predux_max<Packet16q32i>(const Packet16q32i& a) {
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
  Packet4i res =
      _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3));
  res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
  return pfirst(res);
}
template <>
EIGEN_STRONG_INLINE QInt16 predux_min<Packet32q16i>(const Packet32q16i& a) {
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
  Packet4i res =
      _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3));
  res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
  std::uint32_t w = pfirst(res);
  return std::min(
      {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
}
template <>
EIGEN_STRONG_INLINE QInt16 predux_max<Packet32q16i>(const Packet32q16i& a) {
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
  Packet4i res =
      _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3));
  res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
  std::uint32_t w = pfirst(res);
  return std::max(
      {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
}
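// For the 16- and 8-bit reductions above and below, the shuffle network only
// narrows the candidates down to a single 32-bit word: pfirst(res) returns
// that word, and the final comparison between the two shorts (or four bytes)
// packed inside it is done in scalar code via std::min/std::max.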
template <>
EIGEN_STRONG_INLINE QUInt8 predux_min<Packet64q8u>(const Packet64q8u& a) {
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
  Packet4i res =
      _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3));
  res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
  std::uint32_t w = pfirst(res);
  return std::min(
      {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
       static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
}
template <>
EIGEN_STRONG_INLINE QUInt8 predux_max<Packet64q8u>(const Packet64q8u& a) {
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
  Packet4i res =
      _mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3));
  res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
  std::uint32_t w = pfirst(res);
  return std::max(
      {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
       static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
}
template <>
EIGEN_STRONG_INLINE QInt8 predux_min<Packet64q8i>(const Packet64q8i& a) {
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
  Packet4i res =
      _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3));
  res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
  std::uint32_t w = pfirst(res);
  return std::min(
      {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
       static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
}
template <>
EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
  Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
  Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
  Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
  Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
  Packet4i res =
      _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3));
  res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
  res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
  std::uint32_t w = pfirst(res);
  return std::max(
      {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
       static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
}

}  // end namespace internal
}  // end namespace Eigen

#endif  // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_