• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
2 #define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
3 #ifdef _MSC_VER
4 
5 #include <emmintrin.h>
6 #include <immintrin.h>
7 #include <smmintrin.h>
8 
9 #endif
10 
// Returns 16-bit element 0 of X as an int. Element 0 lives in the low 128-bit
// half, so that half is pulled out first and then indexed.
inline int _mm256_extract_epi16_N0(const __m256i X) {
  const __m128i low_half = _mm256_extractf128_si256(X, 0);
  return _mm_extract_epi16(low_half, 0);
}
14 
// Returns 16-bit element 1 of X as an int. Element 1 lives in the low 128-bit
// half, so that half is pulled out first and then indexed.
inline int _mm256_extract_epi16_N1(const __m256i X) {
  const __m128i low_half = _mm256_extractf128_si256(X, 0);
  return _mm_extract_epi16(low_half, 1);
}
18 
// Returns 8-bit element 0 of X as an int. Element 0 lives in the low 128-bit
// half, so that half is pulled out first and then indexed.
inline int _mm256_extract_epi8_N0(const __m256i X) {
  const __m128i low_half = _mm256_extractf128_si256(X, 0);
  return _mm_extract_epi8(low_half, 0);
}
22 
// Returns 8-bit element 1 of X as an int. Element 1 lives in the low 128-bit
// half, so that half is pulled out first and then indexed.
inline int _mm256_extract_epi8_N1(const __m256i X) {
  const __m128i low_half = _mm256_extractf128_si256(X, 0);
  return _mm_extract_epi8(low_half, 1);
}
26 
27 namespace Eigen {
28 namespace internal {
29 
30 typedef eigen_packet_wrapper<__m256i, 20> Packet32q8i;
31 typedef eigen_packet_wrapper<__m256i, 21> Packet16q16i;
32 typedef eigen_packet_wrapper<__m256i, 22> Packet32q8u;
33 typedef eigen_packet_wrapper<__m128i, 23> Packet16q8i;
34 typedef eigen_packet_wrapper<__m128i, 25> Packet16q8u;
35 typedef eigen_packet_wrapper<__m128i, 26> Packet8q16i;
36 typedef eigen_packet_wrapper<__m256i, 27> Packet8q32i;
37 typedef eigen_packet_wrapper<__m128i, 28> Packet4q32i;
38 
39 #ifndef EIGEN_VECTORIZE_AVX512
40 template <>
41 struct packet_traits<QInt8> : default_packet_traits {
42   typedef Packet32q8i type;
43   typedef Packet16q8i half;
44   enum {
45     Vectorizable = 1,
46     AlignedOnScalar = 1,
47     size = 32,
48   };
49   enum {
50     HasAdd = 0,
51     HasSub = 0,
52     HasMul = 0,
53     HasNegate = 0,
54     HasAbs = 0,
55     HasAbs2 = 0,
56     HasMin = 1,
57     HasMax = 1,
58     HasConj = 0,
59     HasSetLinear = 0
60   };
61 };
62 template <>
63 struct packet_traits<QUInt8> : default_packet_traits {
64   typedef Packet32q8u type;
65   typedef Packet16q8u half;
66   enum {
67     Vectorizable = 1,
68     AlignedOnScalar = 1,
69     size = 32,
70   };
71   enum {
72     HasAdd = 0,
73     HasSub = 0,
74     HasMul = 0,
75     HasNegate = 0,
76     HasAbs = 0,
77     HasAbs2 = 0,
78     HasMin = 1,
79     HasMax = 1,
80     HasConj = 0,
81     HasSetLinear = 0
82   };
83 };
84 template <>
85 struct packet_traits<QInt16> : default_packet_traits {
86   typedef Packet16q16i type;
87   typedef Packet8q16i half;
88   enum {
89     Vectorizable = 1,
90     AlignedOnScalar = 1,
91     size = 16,
92   };
93   enum {
94     HasAdd = 0,
95     HasSub = 0,
96     HasMul = 0,
97     HasNegate = 0,
98     HasAbs = 0,
99     HasAbs2 = 0,
100     HasMin = 1,
101     HasMax = 1,
102     HasConj = 0,
103     HasSetLinear = 0
104   };
105 };
106 template <>
107 struct packet_traits<QInt32> : default_packet_traits {
108   typedef Packet8q32i type;
109   typedef Packet4q32i half;
110   enum {
111     Vectorizable = 1,
112     AlignedOnScalar = 1,
113     size = 8,
114   };
115   enum {
116     HasAdd = 1,
117     HasSub = 1,
118     HasMul = 1,
119     HasNegate = 1,
120     HasAbs = 0,
121     HasAbs2 = 0,
122     HasMin = 1,
123     HasMax = 1,
124     HasConj = 0,
125     HasSetLinear = 0
126   };
127 };
128 #endif
129 
130 template <>
131 struct unpacket_traits<Packet32q8i> {
132   typedef QInt8 type;
133   typedef Packet16q8i half;
134   enum {
135     size = 32,
136     alignment = Aligned32,
137     vectorizable = true,
138     masked_load_available = false,
139     masked_store_available = false
140   };
141 };
142 template <>
143 struct unpacket_traits<Packet16q8i> {
144   typedef QInt8 type;
145   typedef Packet16q8i half;
146   enum {
147     size = 16,
148     alignment = Aligned32,
149     vectorizable = true,
150     masked_load_available = false,
151     masked_store_available = false
152   };
153 };
154 template <>
155 struct unpacket_traits<Packet16q16i> {
156   typedef QInt16 type;
157   typedef Packet8q16i half;
158   enum {
159     size = 16,
160     alignment = Aligned32,
161     vectorizable = true,
162     masked_load_available = false,
163     masked_store_available = false
164   };
165 };
166 template <>
167 struct unpacket_traits<Packet8q16i> {
168   typedef QInt16 type;
169   typedef Packet8q16i half;
170   enum {
171     size = 8,
172     alignment = Aligned32,
173     vectorizable = true,
174     masked_load_available = false,
175     masked_store_available = false
176   };
177 };
178 template <>
179 struct unpacket_traits<Packet32q8u> {
180   typedef QUInt8 type;
181   typedef Packet16q8u half;
182   enum {
183     size = 32,
184     alignment = Aligned32,
185     vectorizable = true,
186     masked_load_available = false,
187     masked_store_available = false
188   };
189 };
190 template <>
191 struct unpacket_traits<Packet8q32i> {
192   typedef QInt32 type;
193   typedef Packet4q32i half;
194   enum {
195     size = 8,
196     alignment = Aligned32,
197     vectorizable = true,
198     masked_load_available = false,
199     masked_store_available = false
200   };
201 };
202 
203 // Unaligned load
204 template <>
205 EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) {
206   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
207       reinterpret_cast<const __m256i*>(from));
208 }
209 template <>
210 EIGEN_STRONG_INLINE Packet16q8i ploadu<Packet16q8i>(const QInt8* from) {
211   EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
212       reinterpret_cast<const __m128i*>(from));
213 }
214 template <>
215 EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) {
216   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
217       reinterpret_cast<const __m256i*>(from));
218 }
219 template <>
220 EIGEN_STRONG_INLINE Packet16q16i ploadu<Packet16q16i>(const QInt16* from) {
221   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
222       reinterpret_cast<const __m256i*>(from));
223 }
224 template <>
225 EIGEN_STRONG_INLINE Packet8q16i ploadu<Packet8q16i>(const QInt16* from) {
226   EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
227       reinterpret_cast<const __m128i*>(from));
228 }
229 template <>
230 EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const QInt32* from) {
231   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
232       reinterpret_cast<const __m256i*>(from));
233 }
234 
235 // Aligned load
236 template <>
237 EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) {
238   EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
239       reinterpret_cast<const __m256i*>(from));
240 }
241 template <>
242 EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) {
243   EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
244       reinterpret_cast<const __m128i*>(from));
245 }
246 template <>
247 EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) {
248   EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
249       reinterpret_cast<const __m256i*>(from));
250 }
251 template <>
252 EIGEN_STRONG_INLINE Packet16q16i pload<Packet16q16i>(const QInt16* from) {
253   EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
254       reinterpret_cast<const __m256i*>(from));
255 }
256 template <>
257 EIGEN_STRONG_INLINE Packet8q16i pload<Packet8q16i>(const QInt16* from) {
258   EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
259       reinterpret_cast<const __m128i*>(from));
260 }
261 template <>
262 EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) {
263   EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
264       reinterpret_cast<const __m256i*>(from));
265 }
266 
267 // Unaligned store
268 template <>
269 EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
270   EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
271       reinterpret_cast<__m256i*>(to), from.m_val);
272 }
273 template <>
274 EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
275   EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
276                                                from.m_val);
277 }
278 template <>
279 EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) {
280   EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
281       reinterpret_cast<__m256i*>(to), from.m_val);
282 }
283 template <>
284 EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet16q16i& from) {
285   EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
286       reinterpret_cast<__m256i*>(to), from.m_val);
287 }
288 template <>
289 EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet8q16i& from) {
290   EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
291                                                from.m_val);
292 }
293 template <>
294 EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) {
295   EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
296       reinterpret_cast<__m256i*>(to), from.m_val);
297 }
298 
299 // Aligned store
300 template <>
301 EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet8q32i& from) {
302   EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
303                                                from.m_val);
304 }
305 template <>
306 EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet16q16i& from) {
307   EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
308                                                from.m_val);
309 }
310 template <>
311 EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet8q16i& from) {
312   EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
313                                             from.m_val);
314 }
315 template <>
316 EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) {
317   EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
318                                                from.m_val);
319 }
320 template <>
321 EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
322   EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
323                                                from.m_val);
324 }
325 template <>
326 EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) {
327   EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
328                                             from.m_val);
329 }
330 
331 // Extract first element.
332 template <>
333 EIGEN_STRONG_INLINE QInt32 pfirst<Packet8q32i>(const Packet8q32i& a) {
334   return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
335 }
336 template <>
337 EIGEN_STRONG_INLINE QInt16 pfirst<Packet16q16i>(const Packet16q16i& a) {
338   return _mm256_extract_epi16_N0(a.m_val);
339 }
340 template <>
341 EIGEN_STRONG_INLINE QUInt8 pfirst<Packet32q8u>(const Packet32q8u& a) {
342   return static_cast<uint8_t>(_mm256_extract_epi8_N0(a.m_val));
343 }
344 template <>
345 EIGEN_STRONG_INLINE QInt8 pfirst<Packet32q8i>(const Packet32q8i& a) {
346   return _mm256_extract_epi8_N0(a.m_val);
347 }
348 
349 // Initialize to constant value.
350 template <>
351 EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) {
352   return _mm256_set1_epi8(from.value);
353 }
354 template <>
355 EIGEN_STRONG_INLINE Packet32q8u pset1<Packet32q8u>(const QUInt8& from) {
356   return _mm256_set1_epi8(static_cast<uint8_t>(from.value));
357 }
358 template <>
359 EIGEN_STRONG_INLINE Packet8q32i pset1<Packet8q32i>(const QInt32& from) {
360   return _mm256_set1_epi32(from.value);
361 }
362 
363 // Basic arithmetic packet ops for QInt32.
364 template <>
365 EIGEN_STRONG_INLINE Packet8q32i padd<Packet8q32i>(const Packet8q32i& a,
366                                                   const Packet8q32i& b) {
367   return _mm256_add_epi32(a.m_val, b.m_val);
368 }
369 template <>
370 EIGEN_STRONG_INLINE Packet16q16i pset1<Packet16q16i>(const QInt16& from) {
371   return _mm256_set1_epi16(from.value);
372 }
373 template <>
374 EIGEN_STRONG_INLINE Packet8q32i psub<Packet8q32i>(const Packet8q32i& a,
375                                                   const Packet8q32i& b) {
376   return _mm256_sub_epi32(a.m_val, b.m_val);
377 }
378 // Note: mullo truncates the result to 32 bits.
379 template <>
380 EIGEN_STRONG_INLINE Packet8q32i pmul<Packet8q32i>(const Packet8q32i& a,
381                                                   const Packet8q32i& b) {
382   return _mm256_mullo_epi32(a.m_val, b.m_val);
383 }
384 template <>
385 EIGEN_STRONG_INLINE Packet8q32i pnegate<Packet8q32i>(const Packet8q32i& a) {
386   return _mm256_sub_epi32(_mm256_setzero_si256(), a.m_val);
387 }
388 
389 // Min and max.
390 template <>
391 EIGEN_STRONG_INLINE Packet8q32i pmin<Packet8q32i>(const Packet8q32i& a,
392                                                   const Packet8q32i& b) {
393   return _mm256_min_epi32(a.m_val, b.m_val);
394 }
395 template <>
396 EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a,
397                                                   const Packet8q32i& b) {
398   return _mm256_max_epi32(a.m_val, b.m_val);
399 }
400 
401 template <>
402 EIGEN_STRONG_INLINE Packet16q16i pmin<Packet16q16i>(const Packet16q16i& a,
403                                                     const Packet16q16i& b) {
404   return _mm256_min_epi16(a.m_val, b.m_val);
405 }
406 template <>
407 EIGEN_STRONG_INLINE Packet16q16i pmax<Packet16q16i>(const Packet16q16i& a,
408                                                     const Packet16q16i& b) {
409   return _mm256_max_epi16(a.m_val, b.m_val);
410 }
411 
412 template <>
413 EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a,
414                                                   const Packet32q8u& b) {
415   return _mm256_min_epu8(a.m_val, b.m_val);
416 }
417 template <>
418 EIGEN_STRONG_INLINE Packet32q8u pmax<Packet32q8u>(const Packet32q8u& a,
419                                                   const Packet32q8u& b) {
420   return _mm256_max_epu8(a.m_val, b.m_val);
421 }
422 
423 template <>
424 EIGEN_STRONG_INLINE Packet32q8i pmin<Packet32q8i>(const Packet32q8i& a,
425                                                   const Packet32q8i& b) {
426   return _mm256_min_epi8(a.m_val, b.m_val);
427 }
428 template <>
429 EIGEN_STRONG_INLINE Packet32q8i pmax<Packet32q8i>(const Packet32q8i& a,
430                                                   const Packet32q8i& b) {
431   return _mm256_max_epi8(a.m_val, b.m_val);
432 }
433 
434 // Reductions.
435 template <>
436 EIGEN_STRONG_INLINE QInt32 predux_min<Packet8q32i>(const Packet8q32i& a) {
437   __m256i tmp = _mm256_min_epi32(a, _mm256_permute2f128_si256(a, a, 1));
438   tmp =
439       _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
440   return pfirst<Packet8q32i>(
441       _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
442 }
443 template <>
444 EIGEN_STRONG_INLINE QInt32 predux_max<Packet8q32i>(const Packet8q32i& a) {
445   __m256i tmp = _mm256_max_epi32(a, _mm256_permute2f128_si256(a, a, 1));
446   tmp =
447       _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
448   return pfirst<Packet8q32i>(
449       _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
450 }
451 
452 template <>
453 EIGEN_STRONG_INLINE QInt16 predux_min<Packet16q16i>(const Packet16q16i& a) {
454   __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1));
455   tmp =
456       _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
457   tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
458   return std::min(_mm256_extract_epi16_N0(tmp), _mm256_extract_epi16_N1(tmp));
459 }
460 template <>
461 EIGEN_STRONG_INLINE QInt16 predux_max<Packet16q16i>(const Packet16q16i& a) {
462   __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1));
463   tmp =
464       _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
465   tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
466   return std::max(_mm256_extract_epi16_N0(tmp), _mm256_extract_epi16_N1(tmp));
467 }
468 
469 template <>
470 EIGEN_STRONG_INLINE QUInt8 predux_min<Packet32q8u>(const Packet32q8u& a) {
471   __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1));
472   tmp =
473       _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
474   tmp = _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
475   tmp = _mm256_min_epu8(tmp,
476                         _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
477   return std::min(static_cast<uint8_t>(_mm256_extract_epi8_N0(tmp)),
478                   static_cast<uint8_t>(_mm256_extract_epi8_N1(tmp)));
479 }
480 template <>
481 EIGEN_STRONG_INLINE QUInt8 predux_max<Packet32q8u>(const Packet32q8u& a) {
482   __m256i tmp = _mm256_max_epu8(a, _mm256_permute2f128_si256(a, a, 1));
483   tmp =
484       _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
485   tmp = _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
486   tmp = _mm256_max_epu8(tmp,
487                         _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
488   return std::max(static_cast<uint8_t>(_mm256_extract_epi8_N0(tmp)),
489                   static_cast<uint8_t>(_mm256_extract_epi8_N1(tmp)));
490 }
491 
492 template <>
493 EIGEN_STRONG_INLINE QInt8 predux_min<Packet32q8i>(const Packet32q8i& a) {
494   __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1));
495   tmp =
496       _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
497   tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
498   tmp = _mm256_min_epi8(tmp,
499                         _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
500   return std::min(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
501 }
502 template <>
503 EIGEN_STRONG_INLINE QInt8 predux_max<Packet32q8i>(const Packet32q8i& a) {
504   __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1));
505   tmp =
506       _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
507   tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
508   tmp = _mm256_max_epi8(tmp,
509                         _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
510   return std::max(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
511 }
512 
513 // Vectorized scaling of Packet32q8i by float.
514 template <>
515 struct scalar_product_op<QInt32, double> : binary_op_base<QInt32, double> {
516   typedef typename ScalarBinaryOpTraits<QInt32, double>::ReturnType result_type;
517 #ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
518   scalar_product_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN }
519 #endif
520   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type
521   operator()(const QInt32& a, const double& b) const {
522     return a * b;
523   }
524 
525   EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a,
526                                                  const double& b) const {
527     __m256d scale = _mm256_set1_pd(b);
528     __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a));
529     __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo));
530     __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1));
531     __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi));
532     return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi,
533                                    1);
534   }
535 };
536 
537 template <>
538 struct functor_traits<scalar_product_op<QInt32, double>> {
539   enum { Cost = 4 * NumTraits<float>::MulCost, PacketAccess = true };
540 };
541 
542 }  // end namespace internal
543 }  // end namespace Eigen
544 
545 #endif  // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
546