• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #ifndef CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
2 #define CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
3 
4 namespace Eigen {
5 namespace internal {
6 
// Packet of single-precision floats backed by one 256-bit AVX register.
using Packet8f = __m256;
8 
9 template <>
10 struct type_casting_traits<QInt32, float> {
11   enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
12 };
13 
14 template <>
15 EIGEN_STRONG_INLINE Packet8f pcast<Packet8q32i>(const Packet8q32i& a) {
16   return _mm256_cvtepi32_ps(a.m_val);
17 }
18 
19 template <>
20 struct type_casting_traits<float, QInt32> {
21   enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
22 };
23 
24 template <>
25 EIGEN_STRONG_INLINE Packet8q32i pcast<Packet8f>(const Packet8f& a) {
26   return _mm256_cvtps_epi32(a);
27 }
28 
29 template <>
30 struct type_casting_traits<QInt32, QInt8> {
31   enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
32 };
33 
34 template <>
35 EIGEN_STRONG_INLINE Packet32q8i
36 pcast<Packet8q32i, Packet32q8i>(const Packet8q32i& a, const Packet8q32i& b,
37                                 const Packet8q32i& c, const Packet8q32i& d) {
38   __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.m_val, b.m_val),
39                                          _mm256_packs_epi32(c.m_val, d.m_val));
40   // Since packs does not cross 128 bit lane boundaries,
41   // we have to permute to properly order the final result.
42   const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
43   return _mm256_permutevar8x32_epi32(converted, permute_mask);
44 }
45 
46 template <>
47 struct type_casting_traits<float, QInt8> {
48   enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
49 };
50 
51 template <>
52 EIGEN_STRONG_INLINE Packet32q8i
53 pcast<Packet8f, Packet32q8i>(const Packet8f& a, const Packet8f& b,
54                              const Packet8f& c, const Packet8f& d) {
55   const __m256i a_conv = _mm256_cvtps_epi32(a);
56   const __m256i b_conv = _mm256_cvtps_epi32(b);
57   const __m256i c_conv = _mm256_cvtps_epi32(c);
58   const __m256i d_conv = _mm256_cvtps_epi32(d);
59   __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a_conv, b_conv),
60                                          _mm256_packs_epi32(c_conv, d_conv));
61   const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
62   return _mm256_permutevar8x32_epi32(converted, permute_mask);
63 }
64 
65 template <>
66 struct type_casting_traits<QInt32, QUInt8> {
67   enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
68 };
69 
70 template <>
71 EIGEN_STRONG_INLINE Packet32q8u
72 pcast<Packet8q32i, Packet32q8u>(const Packet8q32i& a, const Packet8q32i& b,
73                                 const Packet8q32i& c, const Packet8q32i& d) {
74   // _mm256_packus_epi32 trims negative numbers to 0 but we can't allow numbers
75   // that are too large because _mm256_packus_epi16 expects signed input
76   // (example of problem input: 0x11111111, which saturates to 0xffff = -1,
77   // which saturates to 0).
78   const __m256i a_clip = _mm256_min_epi32(a, _mm256_set1_epi32(255));
79   const __m256i b_clip = _mm256_min_epi32(b, _mm256_set1_epi32(255));
80   const __m256i c_clip = _mm256_min_epi32(c, _mm256_set1_epi32(255));
81   const __m256i d_clip = _mm256_min_epi32(d, _mm256_set1_epi32(255));
82   const __m256i converted = _mm256_packus_epi16(
83       _mm256_packus_epi32(a_clip, b_clip), _mm256_packus_epi32(c_clip, d_clip));
84   // Since packus does not cross 128 bit lane boundaries,
85   // we have to permute to properly order the final result.
86   const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
87   return _mm256_permutevar8x32_epi32(converted, permute_mask);
88 }
89 
90 }  // end namespace internal
91 }  // end namespace Eigen
92 
93 #endif  // CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
94