• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_
2 #define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_
3 #ifdef _MSC_VER
4 
5 #include <emmintrin.h>
6 #include <immintrin.h>
7 #include <smmintrin.h>
8 
9 #endif
10 
11 namespace Eigen {
12 namespace internal {
13 
14 typedef eigen_packet_wrapper<__m256i, 10> Packet32q8i;
15 typedef eigen_packet_wrapper<__m128i, 11> Packet16q8i;
16 
17 template <>
18 struct packet_traits<QInt8> : default_packet_traits {
19   typedef Packet32q8i type;
20   typedef Packet16q8i half;
21   enum {
22     Vectorizable = 1,
23     AlignedOnScalar = 1,
24     size = 32,
25   };
26   enum {
27     HasAdd = 0,
28     HasSub = 0,
29     HasMul = 0,
30     HasNegate = 0,
31     HasAbs = 0,
32     HasAbs2 = 0,
33     HasMin = 0,
34     HasMax = 0,
35     HasConj = 0,
36     HasSetLinear = 0
37   };
38 };
39 
40 template <>
41 struct unpacket_traits<Packet32q8i> {
42   typedef QInt8 type;
43   typedef Packet16q8i half;
44   enum {
45     size = 32,
46     alignment = Aligned32,
47     vectorizable = true,
48     masked_load_available = false,
49     masked_store_available = false
50   };
51 };
52 
53 template <>
54 struct unpacket_traits<Packet16q8i> {
55   typedef QInt8 type;
56   typedef Packet16q8i half;
57   enum {
58     size = 16,
59     alignment = Aligned32,
60     vectorizable = true,
61     masked_load_available = false,
62     masked_store_available = false
63   };
64 };
65 template <>
66 EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) {
67   return _mm256_set1_epi8(from.value);
68 }
69 template <>
70 EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) {
71   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
72       reinterpret_cast<const __m256i*>(from));
73 }
74 template <>
75 EIGEN_STRONG_INLINE Packet16q8i ploadu<Packet16q8i>(const QInt8* from) {
76   EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
77       reinterpret_cast<const __m128i*>(from));
78 }
79 
80 template <>
81 EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) {
82   EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
83       reinterpret_cast<const __m256i*>(from));
84 }
85 template <>
86 EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) {
87   EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
88       reinterpret_cast<const __m128i*>(from));
89 }
90 
91 template <>
92 EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
93   EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
94       reinterpret_cast<__m256i*>(to), from.m_val);
95 }
96 template <>
97 EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
98   EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to),
99                                                from.m_val);
100 }
101 
102 template <>
103 EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
104   EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
105                                                from.m_val);
106 }
107 template <>
108 EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) {
109   EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
110                                             from.m_val);
111 }
112 
// Eight single-precision floats in one 256-bit register; this is the source
// packet type of the float -> QInt8 cast below.
// NOTE(review): presumably mirrors the Packet8f alias from Eigen's core AVX
// packet math so this header can name it -- confirm it stays the same type.
using Packet8f = __m256;
114 
115 template <>
116 struct type_casting_traits<float, QInt8> {
117   enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
118 };
119 
120 template <>
121 EIGEN_STRONG_INLINE Packet32q8i
122 pcast<Packet8f, Packet32q8i>(const Packet8f& a, const Packet8f& b,
123                              const Packet8f& c, const Packet8f& d) {
124   const __m256i a_conv = _mm256_cvtps_epi32(a);
125   const __m256i b_conv = _mm256_cvtps_epi32(b);
126   const __m256i c_conv = _mm256_cvtps_epi32(c);
127   const __m256i d_conv = _mm256_cvtps_epi32(d);
128   __m128i low = _mm256_castsi256_si128(a_conv);
129   __m128i high = _mm256_extractf128_si256(a_conv, 1);
130   __m128i tmp = _mm_packs_epi32(low, high);
131   __m128i low2 = _mm256_castsi256_si128(b_conv);
132   __m128i high2 = _mm256_extractf128_si256(b_conv, 1);
133   __m128i tmp2 = _mm_packs_epi32(low2, high2);
134   __m128i converted_low = _mm_packs_epi16(tmp, tmp2);
135   low = _mm256_castsi256_si128(c_conv);
136   high = _mm256_extractf128_si256(c_conv, 1);
137   tmp = _mm_packs_epi32(low, high);
138   low2 = _mm256_castsi256_si128(d_conv);
139   high2 = _mm256_extractf128_si256(d_conv, 1);
140   tmp2 = _mm_packs_epi32(low2, high2);
141   __m128i converted_high = _mm_packs_epi16(tmp, tmp2);
142   return _mm256_insertf128_si256(_mm256_castsi128_si256(converted_low),
143                                  converted_high, 1);
144 }
145 
146 }  // end namespace internal
147 }  // end namespace Eigen
148 
149 #endif  // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_
150