• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
2 #define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
3 
4 #include "PacketMathAVX2.h"
5 
6 namespace Eigen {
7 namespace internal {
8 
// 512-bit (AVX512) packet types for the quantized fixed-point scalars.
// The second template argument of eigen_packet_wrapper is only a unique
// tag so that packets sharing the same __m512i storage are distinct types.
typedef eigen_packet_wrapper<__m512i, 30> Packet64q8i;   // 64 x QInt8
typedef eigen_packet_wrapper<__m512i, 31> Packet32q16i;  // 32 x QInt16
typedef eigen_packet_wrapper<__m512i, 32> Packet64q8u;   // 64 x QUInt8
typedef eigen_packet_wrapper<__m512i, 33> Packet16q32i;  // 16 x QInt32
13 
// Packet traits for QInt8 under AVX512: 64 lanes per 512-bit packet.
// Only min/max are advertised as vectorized; 8-bit arithmetic is left to
// scalar fallback code.
template <>
struct packet_traits<QInt8> : default_packet_traits {
  typedef Packet64q8i type;  // full 512-bit packet
  typedef Packet32q8i half;  // 256-bit half packet
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 64,
  };
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
// Packet traits for QUInt8 under AVX512: 64 lanes per 512-bit packet.
// Like QInt8, only min/max are vectorized.
template <>
struct packet_traits<QUInt8> : default_packet_traits {
  typedef Packet64q8u type;  // full 512-bit packet
  typedef Packet32q8u half;  // 256-bit half packet
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 64,
  };
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
// Packet traits for QInt16 under AVX512: 32 lanes per 512-bit packet.
// Only min/max are vectorized for 16-bit lanes.
template <>
struct packet_traits<QInt16> : default_packet_traits {
  typedef Packet32q16i type;  // full 512-bit packet
  typedef Packet16q16i half;  // 256-bit half packet
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 32,
  };
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
// Packet traits for QInt32 under AVX512: 16 lanes per 512-bit packet.
// Unlike the 8/16-bit types, basic arithmetic (add/sub/mul/negate) is
// vectorized in addition to min/max.
template <>
struct packet_traits<QInt32> : default_packet_traits {
  typedef Packet16q32i type;  // full 512-bit packet
  typedef Packet8q32i half;   // 256-bit half packet
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 16,
  };
  enum {
    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
102 
// Reverse mapping for Packet64q8i: scalar type, half packet, lane count
// and the alignment required for aligned 512-bit loads/stores.
template <>
struct unpacket_traits<Packet64q8i> {
  typedef QInt8 type;
  typedef Packet32q8i half;
  enum {
    size = 64,
    alignment = Aligned64,  // 64-byte alignment for 512-bit accesses
    masked_load_available = false,
    masked_store_available = false
  };
};
// Reverse mapping for Packet32q16i (32 x QInt16).
template <>
struct unpacket_traits<Packet32q16i> {
  typedef QInt16 type;
  typedef Packet16q16i half;
  enum {
    size = 32,
    alignment = Aligned64,  // 64-byte alignment for 512-bit accesses
    masked_load_available = false,
    masked_store_available = false
  };
};
// Reverse mapping for Packet64q8u (64 x QUInt8).
template <>
struct unpacket_traits<Packet64q8u> {
  typedef QUInt8 type;
  typedef Packet32q8u half;
  enum {
    size = 64,
    alignment = Aligned64,  // 64-byte alignment for 512-bit accesses
    masked_load_available = false,
    masked_store_available = false
  };
};
// Reverse mapping for Packet16q32i (16 x QInt32).
template <>
struct unpacket_traits<Packet16q32i> {
  typedef QInt32 type;
  typedef Packet8q32i half;
  enum {
    size = 16,
    alignment = Aligned64,  // 64-byte alignment for 512-bit accesses
    masked_load_available = false,
    masked_store_available = false
  };
};
147 
148 // Unaligned load
149 template <>
150 EIGEN_STRONG_INLINE Packet64q8i ploadu<Packet64q8i>(const QInt8* from) {
151   EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
152       reinterpret_cast<const __m512i*>(from));
153 }
154 template <>
155 EIGEN_STRONG_INLINE Packet32q16i ploadu<Packet32q16i>(const QInt16* from) {
156   EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
157       reinterpret_cast<const __m512i*>(from));
158 }
159 template <>
160 EIGEN_STRONG_INLINE Packet64q8u ploadu<Packet64q8u>(const QUInt8* from) {
161   EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
162       reinterpret_cast<const __m512i*>(from));
163 }
164 template <>
165 EIGEN_STRONG_INLINE Packet16q32i ploadu<Packet16q32i>(const QInt32* from) {
166   EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
167       reinterpret_cast<const __m512i*>(from));
168 }
169 
170 // Aligned load
171 template <>
172 EIGEN_STRONG_INLINE Packet64q8i pload<Packet64q8i>(const QInt8* from) {
173   EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
174       reinterpret_cast<const __m512i*>(from));
175 }
176 template <>
177 EIGEN_STRONG_INLINE Packet32q16i pload<Packet32q16i>(const QInt16* from) {
178   EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
179       reinterpret_cast<const __m512i*>(from));
180 }
181 template <>
182 EIGEN_STRONG_INLINE Packet64q8u pload<Packet64q8u>(const QUInt8* from) {
183   EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
184       reinterpret_cast<const __m512i*>(from));
185 }
186 template <>
187 EIGEN_STRONG_INLINE Packet16q32i pload<Packet16q32i>(const QInt32* from) {
188   EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
189       reinterpret_cast<const __m512i*>(from));
190 }
191 
192 // Unaligned store
193 template <>
194 EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet64q8i& from) {
195   EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
196       reinterpret_cast<__m512i*>(to), from.m_val);
197 }
198 template <>
199 EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet32q16i& from) {
200   EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
201       reinterpret_cast<__m512i*>(to), from.m_val);
202 }
203 template <>
204 EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet64q8u& from) {
205   EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
206       reinterpret_cast<__m512i*>(to), from.m_val);
207 }
208 template <>
209 EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet16q32i& from) {
210   EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
211       reinterpret_cast<__m512i*>(to), from.m_val);
212 }
213 
214 // Aligned store
215 template <>
216 EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet16q32i& from) {
217   EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
218                                                from.m_val);
219 }
220 template <>
221 EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet64q8u& from) {
222   EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
223                                                from.m_val);
224 }
225 template <>
226 EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet64q8i& from) {
227   EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
228                                                from.m_val);
229 }
230 template <>
231 EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet32q16i& from) {
232   EIGEN_DEBUG_ALIGNED_STORE _mm512_store_si512(reinterpret_cast<__m512i*>(to),
233                                                from.m_val);
234 }
235 
236 // Extract first element.
237 template <>
238 EIGEN_STRONG_INLINE QInt32 pfirst<Packet16q32i>(const Packet16q32i& a) {
239   return _mm_cvtsi128_si32(_mm512_extracti32x4_epi32(a, 0));
240 }
241 template <>
242 EIGEN_STRONG_INLINE QUInt8 pfirst<Packet64q8u>(const Packet64q8u& a) {
243   return static_cast<uint8_t>(
244       _mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0));
245 }
246 template <>
247 EIGEN_STRONG_INLINE QInt8 pfirst<Packet64q8i>(const Packet64q8i& a) {
248   return _mm_extract_epi8(_mm512_extracti32x4_epi32(a.m_val, 0), 0);
249 }
250 template <>
251 EIGEN_STRONG_INLINE QInt16 pfirst<Packet32q16i>(const Packet32q16i& a) {
252   return _mm_extract_epi16(_mm512_extracti32x4_epi32(a.m_val, 0), 0);
253 }
254 
255 // Initialize to constant value.
256 template <>
257 EIGEN_STRONG_INLINE Packet64q8i pset1<Packet64q8i>(const QInt8& from) {
258   return _mm512_set1_epi8(from.value);
259 }
260 template <>
261 EIGEN_STRONG_INLINE Packet32q16i pset1<Packet32q16i>(const QInt16& from) {
262   return _mm512_set1_epi16(from.value);
263 }
264 template <>
265 EIGEN_STRONG_INLINE Packet64q8u pset1<Packet64q8u>(const QUInt8& from) {
266   return _mm512_set1_epi8(static_cast<uint8_t>(from.value));
267 }
268 template <>
269 EIGEN_STRONG_INLINE Packet16q32i pset1<Packet16q32i>(const QInt32& from) {
270   return _mm512_set1_epi32(from.value);
271 }
272 
273 // Basic arithmetic packet ops for QInt32.
274 template <>
275 EIGEN_STRONG_INLINE Packet16q32i padd<Packet16q32i>(const Packet16q32i& a,
276                                                     const Packet16q32i& b) {
277   return _mm512_add_epi32(a.m_val, b.m_val);
278 }
279 template <>
280 EIGEN_STRONG_INLINE Packet16q32i psub<Packet16q32i>(const Packet16q32i& a,
281                                                     const Packet16q32i& b) {
282   return _mm512_sub_epi32(a.m_val, b.m_val);
283 }
284 // Note: mullo truncates the result to 32 bits.
285 template <>
286 EIGEN_STRONG_INLINE Packet16q32i pmul<Packet16q32i>(const Packet16q32i& a,
287                                                     const Packet16q32i& b) {
288   return _mm512_mullo_epi32(a.m_val, b.m_val);
289 }
290 template <>
291 EIGEN_STRONG_INLINE Packet16q32i pnegate<Packet16q32i>(const Packet16q32i& a) {
292   return _mm512_sub_epi32(_mm512_setzero_si512(), a.m_val);
293 }
294 
295 // Min and max.
296 template <>
297 EIGEN_STRONG_INLINE Packet16q32i pmin<Packet16q32i>(const Packet16q32i& a,
298                                                     const Packet16q32i& b) {
299   return _mm512_min_epi32(a.m_val, b.m_val);
300 }
301 template <>
302 EIGEN_STRONG_INLINE Packet16q32i pmax<Packet16q32i>(const Packet16q32i& a,
303                                                     const Packet16q32i& b) {
304   return _mm512_max_epi32(a.m_val, b.m_val);
305 }
306 
307 template <>
308 EIGEN_STRONG_INLINE Packet64q8u pmin<Packet64q8u>(const Packet64q8u& a,
309                                                   const Packet64q8u& b) {
310 #ifdef EIGEN_VECTORIZE_AVX512BW
311   return _mm512_min_epu8(a.m_val, b.m_val);
312 #else
313   __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
314   __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
315   __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
316   __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
317   __m256i r0 = _mm256_min_epu8(ap0, bp0);
318   __m256i r1 = _mm256_min_epu8(ap1, bp1);
319   return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
320 #endif
321 }
322 template <>
323 EIGEN_STRONG_INLINE Packet64q8u pmax<Packet64q8u>(const Packet64q8u& a,
324                                                   const Packet64q8u& b) {
325 #ifdef EIGEN_VECTORIZE_AVX512BW
326   return _mm512_max_epu8(a.m_val, b.m_val);
327 #else
328   __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
329   __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
330   __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
331   __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
332   __m256i r0 = _mm256_max_epu8(ap0, bp0);
333   __m256i r1 = _mm256_max_epu8(ap1, bp1);
334   return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
335 #endif
336 }
337 
338 template <>
339 EIGEN_STRONG_INLINE Packet64q8i pmin<Packet64q8i>(const Packet64q8i& a,
340                                                   const Packet64q8i& b) {
341 #ifdef EIGEN_VECTORIZE_AVX512BW
342   return _mm512_min_epi8(a.m_val, b.m_val);
343 #else
344   __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
345   __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
346   __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
347   __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
348   __m256i r0 = _mm256_min_epi8(ap0, bp0);
349   __m256i r1 = _mm256_min_epi8(ap1, bp1);
350   return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
351 #endif
352 }
353 template <>
354 EIGEN_STRONG_INLINE Packet32q16i pmin<Packet32q16i>(const Packet32q16i& a,
355                                                     const Packet32q16i& b) {
356 #ifdef EIGEN_VECTORIZE_AVX512BW
357   return _mm512_min_epi16(a.m_val, b.m_val);
358 #else
359   __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
360   __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
361   __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
362   __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
363   __m256i r0 = _mm256_min_epi16(ap0, bp0);
364   __m256i r1 = _mm256_min_epi16(ap1, bp1);
365   return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
366 #endif
367 }
368 template <>
369 EIGEN_STRONG_INLINE Packet64q8i pmax<Packet64q8i>(const Packet64q8i& a,
370                                                   const Packet64q8i& b) {
371 #ifdef EIGEN_VECTORIZE_AVX512BW
372   return _mm512_max_epi8(a.m_val, b.m_val);
373 #else
374   __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
375   __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
376   __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
377   __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
378   __m256i r0 = _mm256_max_epi8(ap0, bp0);
379   __m256i r1 = _mm256_max_epi8(ap1, bp1);
380   return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
381 #endif
382 }
383 template <>
384 EIGEN_STRONG_INLINE Packet32q16i pmax<Packet32q16i>(const Packet32q16i& a,
385                                                     const Packet32q16i& b) {
386 #ifdef EIGEN_VECTORIZE_AVX512BW
387   return _mm512_max_epi16(a.m_val, b.m_val);
388 #else
389   __m256i ap0 = _mm512_extracti32x8_epi32(a.m_val, 0);
390   __m256i ap1 = _mm512_extracti32x8_epi32(a.m_val, 1);
391   __m256i bp0 = _mm512_extracti32x8_epi32(b.m_val, 0);
392   __m256i bp1 = _mm512_extracti32x8_epi32(b.m_val, 1);
393   __m256i r0 = _mm256_max_epi16(ap0, bp0);
394   __m256i r1 = _mm256_max_epi16(ap1, bp1);
395   return _mm512_inserti32x8(_mm512_castsi256_si512(r0), r1, 1);
396 #endif
397 }
398 
399 // Reductions.
400 template <>
401 EIGEN_STRONG_INLINE QInt32 predux_min<Packet16q32i>(const Packet16q32i& a) {
402   Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
403   Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
404   Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
405   Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
406   Packet4i res =
407       _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3));
408   res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
409   res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
410   return pfirst(res);
411 }
412 template <>
413 EIGEN_STRONG_INLINE QInt32 predux_max<Packet16q32i>(const Packet16q32i& a) {
414   Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
415   Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
416   Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
417   Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
418   Packet4i res =
419       _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3));
420   res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
421   res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
422   return pfirst(res);
423 }
424 template <>
425 EIGEN_STRONG_INLINE QInt16 predux_min<Packet32q16i>(const Packet32q16i& a) {
426   Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
427   Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
428   Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
429   Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
430   Packet4i res =
431       _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3));
432   res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
433   res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
434   std::uint32_t w = pfirst(res);
435   return std::min(
436       {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
437 }
438 template <>
439 EIGEN_STRONG_INLINE QInt16 predux_max<Packet32q16i>(const Packet32q16i& a) {
440   Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
441   Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
442   Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
443   Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
444   Packet4i res =
445       _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3));
446   res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
447   res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
448   std::uint32_t w = pfirst(res);
449   return std::max(
450       {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
451 }
452 template <>
453 EIGEN_STRONG_INLINE QUInt8 predux_min<Packet64q8u>(const Packet64q8u& a) {
454   Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
455   Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
456   Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
457   Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
458   Packet4i res =
459       _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3));
460   res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
461   res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
462   std::uint32_t w = pfirst(res);
463   return std::min(
464       {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
465        static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
466 }
467 template <>
468 EIGEN_STRONG_INLINE QUInt8 predux_max<Packet64q8u>(const Packet64q8u& a) {
469   Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
470   Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
471   Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
472   Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
473   Packet4i res =
474       _mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3));
475   res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
476   res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
477   std::uint32_t w = pfirst(res);
478   return std::max(
479       {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
480        static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
481 }
482 template <>
483 EIGEN_STRONG_INLINE QInt8 predux_min<Packet64q8i>(const Packet64q8i& a) {
484   Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
485   Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
486   Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
487   Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
488   Packet4i res =
489       _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3));
490   res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
491   res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
492   std::uint32_t w = pfirst(res);
493   return std::min(
494       {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
495        static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
496 }
497 template <>
498 EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
499   Packet4i lane0 = _mm512_extracti32x4_epi32(a.m_val, 0);
500   Packet4i lane1 = _mm512_extracti32x4_epi32(a.m_val, 1);
501   Packet4i lane2 = _mm512_extracti32x4_epi32(a.m_val, 2);
502   Packet4i lane3 = _mm512_extracti32x4_epi32(a.m_val, 3);
503   Packet4i res =
504       _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3));
505   res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
506   res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1)));
507   std::uint32_t w = pfirst(res);
508   return std::min(
509       {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
510        static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
511 }
512 
513 }  // end namespace internal
514 }  // end namespace Eigen
515 
516 #endif  // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
517