#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
#define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
#ifdef _MSC_VER

#include <emmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>

#endif

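// Helpers that extract element N of a 256-bit integer vector with N fixed at
// compile time: N >> 3 (or N >> 4 for 8-bit elements) selects the 128-bit lane
// and N % 8 (or N % 16) the element within that lane, so the extract
// intrinsics always receive the immediate operands they require. Only N = 0
// and N = 1 are needed below.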
inline int _mm256_extract_epi16_N0(const __m256i X) {
  return _mm_extract_epi16(_mm256_extractf128_si256(X, 0 >> 3), 0 % 8);
}

inline int _mm256_extract_epi16_N1(const __m256i X) {
  return _mm_extract_epi16(_mm256_extractf128_si256(X, 1 >> 3), 1 % 8);
}

inline int _mm256_extract_epi8_N0(const __m256i X) {
  return _mm_extract_epi8(_mm256_extractf128_si256((X), 0 >> 4), 0 % 16);
}

inline int _mm256_extract_epi8_N1(const __m256i X) {
  return _mm_extract_epi8(_mm256_extractf128_si256((X), 1 >> 4), 1 % 16);
}

namespace Eigen {
namespace internal {

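// The second template argument of eigen_packet_wrapper is an arbitrary unique
// tag: it keeps packet types that share the same underlying __m256i/__m128i
// representation distinct for overload resolution and traits lookup.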
typedef eigen_packet_wrapper<__m256i, 20> Packet32q8i;
typedef eigen_packet_wrapper<__m256i, 21> Packet16q16i;
typedef eigen_packet_wrapper<__m256i, 22> Packet32q8u;
typedef eigen_packet_wrapper<__m128i, 23> Packet16q8i;
typedef eigen_packet_wrapper<__m128i, 25> Packet16q8u;
typedef eigen_packet_wrapper<__m128i, 26> Packet8q16i;
typedef eigen_packet_wrapper<__m256i, 27> Packet8q32i;
typedef eigen_packet_wrapper<__m128i, 28> Packet4q32i;

#ifndef EIGEN_VECTORIZE_AVX512
template <>
struct packet_traits<QInt8> : default_packet_traits {
  typedef Packet32q8i type;
  typedef Packet16q8i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 32,
  };
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
template <>
struct packet_traits<QUInt8> : default_packet_traits {
  typedef Packet32q8u type;
  typedef Packet16q8u half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 32,
  };
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
template <>
struct packet_traits<QInt16> : default_packet_traits {
  typedef Packet16q16i type;
  typedef Packet8q16i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 16,
  };
  enum {
    HasAdd = 0,
    HasSub = 0,
    HasMul = 0,
    HasNegate = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
template <>
struct packet_traits<QInt32> : default_packet_traits {
  typedef Packet8q32i type;
  typedef Packet4q32i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,
  };
  enum {
    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0
  };
};
#endif

template <>
struct unpacket_traits<Packet32q8i> {
  typedef QInt8 type;
  typedef Packet16q8i half;
  enum {
    size = 32,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet16q8i> {
  typedef QInt8 type;
  typedef Packet16q8i half;
  enum {
    size = 16,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet16q16i> {
  typedef QInt16 type;
  typedef Packet8q16i half;
  enum {
    size = 16,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet8q16i> {
  typedef QInt16 type;
  typedef Packet8q16i half;
  enum {
    size = 8,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet32q8u> {
  typedef QUInt8 type;
  typedef Packet16q8u half;
  enum {
    size = 32,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet8q32i> {
  typedef QInt32 type;
  typedef Packet4q32i half;
  enum {
    size = 8,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

// Unaligned load
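// (EIGEN_DEBUG_UNALIGNED_LOAD / EIGEN_DEBUG_ALIGNED_LOAD and the matching
// store macros are Eigen instrumentation hooks; by default they expand to
// nothing.)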
template <>
EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
      reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16q8i ploadu<Packet16q8i>(const QInt8* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
      reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
      reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16q16i ploadu<Packet16q16i>(const QInt16* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
      reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8q16i ploadu<Packet8q16i>(const QInt16* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
      reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const QInt32* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
      reinterpret_cast<const __m256i*>(from));
}

// Aligned load
template <>
EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
      reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
      reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
      reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16q16i pload<Packet16q16i>(const QInt16* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
      reinterpret_cast<const __m256i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8q16i pload<Packet8q16i>(const QInt16* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
      reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
      reinterpret_cast<const __m256i*>(from));
}

// Unaligned store
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
      reinterpret_cast<__m256i*>(to), from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
                                               from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
      reinterpret_cast<__m256i*>(to), from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet16q16i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
      reinterpret_cast<__m256i*>(to), from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet8q16i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
                                               from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
      reinterpret_cast<__m256i*>(to), from.m_val);
}

// Aligned store
template <>
EIGEN_STRONG_INLINE void pstore<QInt32>(QInt32* to, const Packet8q32i& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
                                               from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet16q16i& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
                                               from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet8q16i& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
                                            from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
                                               from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
                                               from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
                                            from.m_val);
}

// Extract first element.
template <>
EIGEN_STRONG_INLINE QInt32 pfirst<Packet8q32i>(const Packet8q32i& a) {
  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}
template <>
EIGEN_STRONG_INLINE QInt16 pfirst<Packet16q16i>(const Packet16q16i& a) {
  return _mm256_extract_epi16_N0(a.m_val);
}
template <>
EIGEN_STRONG_INLINE QUInt8 pfirst<Packet32q8u>(const Packet32q8u& a) {
  return static_cast<uint8_t>(_mm256_extract_epi8_N0(a.m_val));
}
template <>
EIGEN_STRONG_INLINE QInt8 pfirst<Packet32q8i>(const Packet32q8i& a) {
  return _mm256_extract_epi8_N0(a.m_val);
}

// Initialize to constant value.
template <>
EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) {
  return _mm256_set1_epi8(from.value);
}
template <>
EIGEN_STRONG_INLINE Packet32q8u pset1<Packet32q8u>(const QUInt8& from) {
  return _mm256_set1_epi8(static_cast<uint8_t>(from.value));
}
template <>
EIGEN_STRONG_INLINE Packet8q32i pset1<Packet8q32i>(const QInt32& from) {
  return _mm256_set1_epi32(from.value);
}

// Basic arithmetic packet ops for QInt32.
template <>
EIGEN_STRONG_INLINE Packet8q32i padd<Packet8q32i>(const Packet8q32i& a,
                                                  const Packet8q32i& b) {
  return _mm256_add_epi32(a.m_val, b.m_val);
}
template <>
EIGEN_STRONG_INLINE Packet16q16i pset1<Packet16q16i>(const QInt16& from) {
  return _mm256_set1_epi16(from.value);
}
template <>
EIGEN_STRONG_INLINE Packet8q32i psub<Packet8q32i>(const Packet8q32i& a,
                                                  const Packet8q32i& b) {
  return _mm256_sub_epi32(a.m_val, b.m_val);
}
// Note: mullo truncates the result to 32 bits.
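// For example, two lanes holding 0x00010000 multiply to 2^32, which wraps to
// 0; callers that need the full 64-bit product must widen before multiplying.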
template <>
EIGEN_STRONG_INLINE Packet8q32i pmul<Packet8q32i>(const Packet8q32i& a,
                                                  const Packet8q32i& b) {
  return _mm256_mullo_epi32(a.m_val, b.m_val);
}
template <>
EIGEN_STRONG_INLINE Packet8q32i pnegate<Packet8q32i>(const Packet8q32i& a) {
  return _mm256_sub_epi32(_mm256_setzero_si256(), a.m_val);
}

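// A minimal usage sketch (hypothetical buffer names, not part of this header):
// given 32-byte-aligned QInt32 arrays a, b, c of length 8 and a QInt32 bias,
//   Packet8q32i va = pload<Packet8q32i>(a);
//   Packet8q32i vb = pload<Packet8q32i>(b);
//   Packet8q32i vc = padd(pmul(va, vb), pset1<Packet8q32i>(bias));
//   pstore(c, vc);
// computes c[i] = a[i] * b[i] + bias for all eight lanes at once.
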
// Min and max.
template <>
EIGEN_STRONG_INLINE Packet8q32i pmin<Packet8q32i>(const Packet8q32i& a,
                                                  const Packet8q32i& b) {
  return _mm256_min_epi32(a.m_val, b.m_val);
}
template <>
EIGEN_STRONG_INLINE Packet8q32i pmax<Packet8q32i>(const Packet8q32i& a,
                                                  const Packet8q32i& b) {
  return _mm256_max_epi32(a.m_val, b.m_val);
}

template <>
EIGEN_STRONG_INLINE Packet16q16i pmin<Packet16q16i>(const Packet16q16i& a,
                                                    const Packet16q16i& b) {
  return _mm256_min_epi16(a.m_val, b.m_val);
}
template <>
EIGEN_STRONG_INLINE Packet16q16i pmax<Packet16q16i>(const Packet16q16i& a,
                                                    const Packet16q16i& b) {
  return _mm256_max_epi16(a.m_val, b.m_val);
}

template <>
EIGEN_STRONG_INLINE Packet32q8u pmin<Packet32q8u>(const Packet32q8u& a,
                                                  const Packet32q8u& b) {
  return _mm256_min_epu8(a.m_val, b.m_val);
}
template <>
EIGEN_STRONG_INLINE Packet32q8u pmax<Packet32q8u>(const Packet32q8u& a,
                                                  const Packet32q8u& b) {
  return _mm256_max_epu8(a.m_val, b.m_val);
}

template <>
EIGEN_STRONG_INLINE Packet32q8i pmin<Packet32q8i>(const Packet32q8i& a,
                                                  const Packet32q8i& b) {
  return _mm256_min_epi8(a.m_val, b.m_val);
}
template <>
EIGEN_STRONG_INLINE Packet32q8i pmax<Packet32q8i>(const Packet32q8i& a,
                                                  const Packet32q8i& b) {
  return _mm256_max_epi8(a.m_val, b.m_val);
}

// Reductions.
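// Each reduction repeatedly folds the 256-bit register in half: first the two
// 128-bit lanes are combined via _mm256_permute2f128_si256, then smaller
// groups via 32-bit shuffles (plus a 16-bit shufflelo step for the 8-bit
// packets). For 16- and 8-bit elements the last two candidates are extracted
// and combined with std::min/std::max on the scalar side.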
template <>
EIGEN_STRONG_INLINE QInt32 predux_min<Packet8q32i>(const Packet8q32i& a) {
  __m256i tmp = _mm256_min_epi32(a, _mm256_permute2f128_si256(a, a, 1));
  tmp =
      _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  return pfirst<Packet8q32i>(
      _mm256_min_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
}
template <>
EIGEN_STRONG_INLINE QInt32 predux_max<Packet8q32i>(const Packet8q32i& a) {
  __m256i tmp = _mm256_max_epi32(a, _mm256_permute2f128_si256(a, a, 1));
  tmp =
      _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  return pfirst<Packet8q32i>(
      _mm256_max_epi32(tmp, _mm256_shuffle_epi32(tmp, 1)));
}

template <>
EIGEN_STRONG_INLINE QInt16 predux_min<Packet16q16i>(const Packet16q16i& a) {
  __m256i tmp = _mm256_min_epi16(a, _mm256_permute2f128_si256(a, a, 1));
  tmp =
      _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  tmp = _mm256_min_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
  return std::min(_mm256_extract_epi16_N0(tmp), _mm256_extract_epi16_N1(tmp));
}
template <>
EIGEN_STRONG_INLINE QInt16 predux_max<Packet16q16i>(const Packet16q16i& a) {
  __m256i tmp = _mm256_max_epi16(a, _mm256_permute2f128_si256(a, a, 1));
  tmp =
      _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  tmp = _mm256_max_epi16(tmp, _mm256_shuffle_epi32(tmp, 1));
  return std::max(_mm256_extract_epi16_N0(tmp), _mm256_extract_epi16_N1(tmp));
}

template <>
EIGEN_STRONG_INLINE QUInt8 predux_min<Packet32q8u>(const Packet32q8u& a) {
  __m256i tmp = _mm256_min_epu8(a, _mm256_permute2f128_si256(a, a, 1));
  tmp =
      _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  tmp = _mm256_min_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
  tmp = _mm256_min_epu8(tmp,
                        _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  return std::min(static_cast<uint8_t>(_mm256_extract_epi8_N0(tmp)),
                  static_cast<uint8_t>(_mm256_extract_epi8_N1(tmp)));
}
template <>
EIGEN_STRONG_INLINE QUInt8 predux_max<Packet32q8u>(const Packet32q8u& a) {
  __m256i tmp = _mm256_max_epu8(a, _mm256_permute2f128_si256(a, a, 1));
  tmp =
      _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  tmp = _mm256_max_epu8(tmp, _mm256_shuffle_epi32(tmp, 1));
  tmp = _mm256_max_epu8(tmp,
                        _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  return std::max(static_cast<uint8_t>(_mm256_extract_epi8_N0(tmp)),
                  static_cast<uint8_t>(_mm256_extract_epi8_N1(tmp)));
}

template <>
EIGEN_STRONG_INLINE QInt8 predux_min<Packet32q8i>(const Packet32q8i& a) {
  __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1));
  tmp =
      _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
  tmp = _mm256_min_epi8(tmp,
                        _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  return std::min(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
}
template <>
EIGEN_STRONG_INLINE QInt8 predux_max<Packet32q8i>(const Packet32q8i& a) {
  __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1));
  tmp =
      _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
  tmp = _mm256_max_epi8(tmp,
                        _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
  return std::max(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
}

// Vectorized scaling of Packet8q32i by a double factor.
template <>
struct scalar_product_op<QInt32, double> : binary_op_base<QInt32, double> {
  typedef typename ScalarBinaryOpTraits<QInt32, double>::ReturnType result_type;
#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
  scalar_product_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN }
#endif
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type
  operator()(const QInt32& a, const double& b) const {
    return a * b;
  }

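  // packetOp splits the eight 32-bit lanes into two halves of four, converts
  // each half to double, multiplies by the scale factor, converts back to
  // int32 with _mm256_cvtpd_epi32 (rounding, not truncation), and reassembles
  // the 256-bit result.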
  EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a,
                                                 const double& b) const {
    __m256d scale = _mm256_set1_pd(b);
    __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a));
    __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo));
    __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1));
    __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi,
                                   1);
  }
};

template <>
struct functor_traits<scalar_product_op<QInt32, double>> {
  enum { Cost = 4 * NumTraits<float>::MulCost, PacketAccess = true };
};

}  // end namespace internal
}  // end namespace Eigen

#endif  // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_