• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #ifndef CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
2 #define CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
3 
4 namespace Eigen {
5 namespace internal {
6 
7 typedef __m512 Packet16f;
8 typedef __m512i Packet16i;
9 
10 template <>
11 struct type_casting_traits<QInt32, float> {
12   enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
13 };
14 
15 template <>
16 EIGEN_STRONG_INLINE Packet16f pcast<Packet16q32i>(const Packet16q32i& a) {
17   return _mm512_cvtepi32_ps(a.m_val);
18 }
19 
20 template <>
21 struct type_casting_traits<float, QInt32> {
22   enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
23 };
24 
25 template <>
26 EIGEN_STRONG_INLINE Packet16q32i pcast<Packet16f>(const Packet16f& a) {
27   return _mm512_cvtps_epi32(a);
28 }
29 
30 template <>
31 struct type_casting_traits<float, QInt16> {
32   enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
33 };
34 
35 template <>
36 EIGEN_STRONG_INLINE Packet32q16i pcast<Packet16f>(const Packet16f& a,
37                                                   const Packet16f& b) {
38   Packet16i a_int = _mm512_cvtps_epi32(a);
39   Packet16i b_int = _mm512_cvtps_epi32(b);
40 #ifdef EIGEN_VECTORIZE_AVX512BW
41   return _mm512_packs_epi32(a_int, b_int);
42 #else
43   Packet8i ab_int16_low = _mm256_permute4x64_epi64(
44       _mm256_packs_epi32(_mm512_castsi512_si256(a_int),
45                          _mm512_castsi512_si256(b_int)),
46       _MM_SHUFFLE(0, 2, 1, 3));
47   Packet8i ab_int16_high = _mm256_permute4x64_epi64(
48       _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1),
49                          _mm512_extracti32x8_epi32(b_int, 1)),
50       _MM_SHUFFLE(0, 2, 1, 3));
51   return _mm512_inserti32x8(_mm512_castsi256_si512(ab_int16_low), ab_int16_high,
52                             1);
53 #endif
54 }
55 
56 template <>
57 struct type_casting_traits<float, QInt8> {
58   enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
59 };
60 
61 template <>
62 EIGEN_STRONG_INLINE Packet64q8i pcast<Packet16f>(const Packet16f& a,
63                                                  const Packet16f& b,
64                                                  const Packet16f& c,
65                                                  const Packet16f& d) {
66   Packet16i a_int = _mm512_cvtps_epi32(a);
67   Packet16i b_int = _mm512_cvtps_epi32(b);
68   Packet16i c_int = _mm512_cvtps_epi32(c);
69   Packet16i d_int = _mm512_cvtps_epi32(d);
70 #ifdef EIGEN_VECTORIZE_AVX512BW
71   return _mm512_packs_epi16(_mm512_packs_epi32(a_int, b_int),
72                             _mm512_packs_epi32(c_int, d_int));
73 #else
74   Packet8i ab_int16_low = _mm256_permute4x64_epi64(
75       _mm256_packs_epi32(_mm512_castsi512_si256(a_int),
76                          _mm512_castsi512_si256(b_int)),
77       _MM_SHUFFLE(0, 2, 1, 3));
78   Packet8i cd_int16_low = _mm256_permute4x64_epi64(
79       _mm256_packs_epi32(_mm512_castsi512_si256(c_int),
80                          _mm512_castsi512_si256(d_int)),
81       _MM_SHUFFLE(0, 2, 1, 3));
82   Packet8i ab_int16_high = _mm256_permute4x64_epi64(
83       _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1),
84                          _mm512_extracti32x8_epi32(b_int, 1)),
85       _MM_SHUFFLE(0, 2, 1, 3));
86   Packet8i cd_int16_high = _mm256_permute4x64_epi64(
87       _mm256_packs_epi32(_mm512_extracti32x8_epi32(c_int, 1),
88                          _mm512_extracti32x8_epi32(d_int, 1)),
89       _MM_SHUFFLE(0, 2, 1, 3));
90   Packet8i abcd_int8_low = _mm256_permute4x64_epi64(
91       _mm256_packs_epi16(ab_int16_low, cd_int16_low), _MM_SHUFFLE(0, 2, 1, 3));
92   Packet8i abcd_int8_high =
93       _mm256_permute4x64_epi64(_mm256_packs_epi16(ab_int16_high, cd_int16_high),
94                                _MM_SHUFFLE(0, 2, 1, 3));
95   return _mm512_inserti32x8(_mm512_castsi256_si512(abcd_int8_low),
96                             abcd_int8_high, 1);
97 #endif
98 }
99 
100 template <>
101 struct type_casting_traits<QInt32, QInt8> {
102   enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
103 };
104 
105 template <>
106 struct type_casting_traits<QInt32, QInt16> {
107   enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
108 };
109 
110 template <>
111 EIGEN_STRONG_INLINE Packet64q8i
112 pcast<Packet16q32i, Packet64q8i>(const Packet16q32i& a, const Packet16q32i& b,
113                                  const Packet16q32i& c, const Packet16q32i& d) {
114   __m128i a_part = _mm512_cvtsepi32_epi8(a);
115   __m128i b_part = _mm512_cvtsepi32_epi8(b);
116   __m128i c_part = _mm512_cvtsepi32_epi8(c);
117   __m128i d_part = _mm512_cvtsepi32_epi8(d);
118   __m256i ab =
119       _mm256_inserti128_si256(_mm256_castsi128_si256(a_part), b_part, 1);
120   __m256i cd =
121       _mm256_inserti128_si256(_mm256_castsi128_si256(c_part), d_part, 1);
122   __m512i converted = _mm512_inserti64x4(_mm512_castsi256_si512(ab), cd, 1);
123   return converted;
124 }
125 
126 template <>
127 EIGEN_STRONG_INLINE Packet32q16i pcast<Packet16q32i, Packet32q16i>(
128     const Packet16q32i& a, const Packet16q32i& b) {
129   __m256i a_part = _mm512_cvtsepi32_epi16(a);
130   __m256i b_part = _mm512_cvtsepi32_epi16(b);
131   __m512i converted =
132       _mm512_inserti64x4(_mm512_castsi256_si512(a_part), b_part, 1);
133   return converted;
134 }
135 
136 template <>
137 struct type_casting_traits<QInt32, QUInt8> {
138   enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
139 };
140 
141 template <>
142 EIGEN_STRONG_INLINE Packet64q8u
143 pcast<Packet16q32i, Packet64q8u>(const Packet16q32i& a, const Packet16q32i& b,
144                                  const Packet16q32i& c, const Packet16q32i& d) {
145   // Brute-force saturation since there isn't a pack operation for unsigned
146   // numbers that keeps the elements in order.
147   __m128i a_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
148       _mm512_min_epi32(a, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
149   __m128i b_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
150       _mm512_min_epi32(b, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
151   __m128i c_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
152       _mm512_min_epi32(c, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
153   __m128i d_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
154       _mm512_min_epi32(d, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
155   __m256i ab =
156       _mm256_inserti128_si256(_mm256_castsi128_si256(a_part), b_part, 1);
157   __m256i cd =
158       _mm256_inserti128_si256(_mm256_castsi128_si256(c_part), d_part, 1);
159   __m512i converted = _mm512_inserti64x4(_mm512_castsi256_si512(ab), cd, 1);
160   return converted;
161 }
162 
#if 0
// Disabled: the type Packet32q16u does not exist for AVX-512 yet.

// Cast traits for QInt32 -> QUInt16: vectorized, two source packets are
// combined into one target packet.
template <>
struct type_casting_traits<QInt32, QUInt16> {
  enum {
    VectorizedCast = 1,
    SrcCoeffRatio = 2,
    TgtCoeffRatio = 1
  };
};

// Narrows two packets of sixteen 32-bit integers into one packet of
// thirty-two unsigned 16-bit integers, `a` in the lower half and `b` in the
// upper half.
template <>
EIGEN_STRONG_INLINE Packet32q16u
pcast<Packet16q32i, Packet32q16u>(const Packet16q32i& a,
                                  const Packet16q32i& b) {
  // There is no unsigned pack operation that keeps the elements in order, so
  // saturate by hand: clamp every value to [0, 65535] and then truncate to
  // 16 bits.
  const __m512i upper_bound = _mm512_set1_epi32(65535);
  const __m512i lower_bound = _mm512_setzero_si512();

  const __m256i lower_half = _mm512_cvtepi32_epi16(
      _mm512_max_epi32(_mm512_min_epi32(a, upper_bound), lower_bound));
  const __m256i upper_half = _mm512_cvtepi32_epi16(
      _mm512_max_epi32(_mm512_min_epi32(b, upper_bound), lower_bound));
  return _mm512_inserti64x4(_mm512_castsi256_si512(lower_half), upper_half, 1);
}
#endif
187 
188 }  // end namespace internal
189 }  // end namespace Eigen
190 
191 #endif  // CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
192