1 #ifndef CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_ 2 #define CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_ 3 4 namespace Eigen { 5 namespace internal { 6 7 typedef __m512 Packet16f; 8 typedef __m512i Packet16i; 9 10 template <> 11 struct type_casting_traits<QInt32, float> { 12 enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; 13 }; 14 15 template <> 16 EIGEN_STRONG_INLINE Packet16f pcast<Packet16q32i>(const Packet16q32i& a) { 17 return _mm512_cvtepi32_ps(a.m_val); 18 } 19 20 template <> 21 struct type_casting_traits<float, QInt32> { 22 enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; 23 }; 24 25 template <> 26 EIGEN_STRONG_INLINE Packet16q32i pcast<Packet16f>(const Packet16f& a) { 27 return _mm512_cvtps_epi32(a); 28 } 29 30 template <> 31 struct type_casting_traits<float, QInt16> { 32 enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; 33 }; 34 35 template <> 36 EIGEN_STRONG_INLINE Packet32q16i pcast<Packet16f>(const Packet16f& a, 37 const Packet16f& b) { 38 Packet16i a_int = _mm512_cvtps_epi32(a); 39 Packet16i b_int = _mm512_cvtps_epi32(b); 40 #ifdef EIGEN_VECTORIZE_AVX512BW 41 return _mm512_packs_epi32(a_int, b_int); 42 #else 43 Packet8i ab_int16_low = _mm256_permute4x64_epi64( 44 _mm256_packs_epi32(_mm512_castsi512_si256(a_int), 45 _mm512_castsi512_si256(b_int)), 46 _MM_SHUFFLE(0, 2, 1, 3)); 47 Packet8i ab_int16_high = _mm256_permute4x64_epi64( 48 _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1), 49 _mm512_extracti32x8_epi32(b_int, 1)), 50 _MM_SHUFFLE(0, 2, 1, 3)); 51 return _mm512_inserti32x8(_mm512_castsi256_si512(ab_int16_low), ab_int16_high, 52 1); 53 #endif 54 } 55 56 template <> 57 struct type_casting_traits<float, QInt8> { 58 enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; 59 }; 60 61 template <> 62 EIGEN_STRONG_INLINE Packet64q8i pcast<Packet16f>(const Packet16f& a, 63 const Packet16f& b, 64 const Packet16f& c, 65 const Packet16f& d) { 66 Packet16i a_int = _mm512_cvtps_epi32(a); 67 Packet16i b_int = _mm512_cvtps_epi32(b); 68 Packet16i c_int = _mm512_cvtps_epi32(c); 69 Packet16i d_int = _mm512_cvtps_epi32(d); 70 #ifdef EIGEN_VECTORIZE_AVX512BW 71 return _mm512_packs_epi16(_mm512_packs_epi32(a_int, b_int), 72 _mm512_packs_epi32(c_int, d_int)); 73 #else 74 Packet8i ab_int16_low = _mm256_permute4x64_epi64( 75 _mm256_packs_epi32(_mm512_castsi512_si256(a_int), 76 _mm512_castsi512_si256(b_int)), 77 _MM_SHUFFLE(0, 2, 1, 3)); 78 Packet8i cd_int16_low = _mm256_permute4x64_epi64( 79 _mm256_packs_epi32(_mm512_castsi512_si256(c_int), 80 _mm512_castsi512_si256(d_int)), 81 _MM_SHUFFLE(0, 2, 1, 3)); 82 Packet8i ab_int16_high = _mm256_permute4x64_epi64( 83 _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1), 84 _mm512_extracti32x8_epi32(b_int, 1)), 85 _MM_SHUFFLE(0, 2, 1, 3)); 86 Packet8i cd_int16_high = _mm256_permute4x64_epi64( 87 _mm256_packs_epi32(_mm512_extracti32x8_epi32(c_int, 1), 88 _mm512_extracti32x8_epi32(d_int, 1)), 89 _MM_SHUFFLE(0, 2, 1, 3)); 90 Packet8i abcd_int8_low = _mm256_permute4x64_epi64( 91 _mm256_packs_epi16(ab_int16_low, cd_int16_low), _MM_SHUFFLE(0, 2, 1, 3)); 92 Packet8i abcd_int8_high = 93 _mm256_permute4x64_epi64(_mm256_packs_epi16(ab_int16_high, cd_int16_high), 94 _MM_SHUFFLE(0, 2, 1, 3)); 95 return _mm512_inserti32x8(_mm512_castsi256_si512(abcd_int8_low), 96 abcd_int8_high, 1); 97 #endif 98 } 99 100 template <> 101 struct type_casting_traits<QInt32, QInt8> { 102 enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; 103 }; 104 105 template <> 106 struct type_casting_traits<QInt32, QInt16> { 107 enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; 108 }; 109 110 template <> 111 EIGEN_STRONG_INLINE Packet64q8i 112 pcast<Packet16q32i, Packet64q8i>(const Packet16q32i& a, const Packet16q32i& b, 113 const Packet16q32i& c, const Packet16q32i& d) { 114 __m128i a_part = _mm512_cvtsepi32_epi8(a); 115 __m128i b_part = _mm512_cvtsepi32_epi8(b); 116 __m128i c_part = _mm512_cvtsepi32_epi8(c); 117 __m128i d_part = _mm512_cvtsepi32_epi8(d); 118 __m256i ab = 119 _mm256_inserti128_si256(_mm256_castsi128_si256(a_part), b_part, 1); 120 __m256i cd = 121 _mm256_inserti128_si256(_mm256_castsi128_si256(c_part), d_part, 1); 122 __m512i converted = _mm512_inserti64x4(_mm512_castsi256_si512(ab), cd, 1); 123 return converted; 124 } 125 126 template <> 127 EIGEN_STRONG_INLINE Packet32q16i pcast<Packet16q32i, Packet32q16i>( 128 const Packet16q32i& a, const Packet16q32i& b) { 129 __m256i a_part = _mm512_cvtsepi32_epi16(a); 130 __m256i b_part = _mm512_cvtsepi32_epi16(b); 131 __m512i converted = 132 _mm512_inserti64x4(_mm512_castsi256_si512(a_part), b_part, 1); 133 return converted; 134 } 135 136 template <> 137 struct type_casting_traits<QInt32, QUInt8> { 138 enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; 139 }; 140 141 template <> 142 EIGEN_STRONG_INLINE Packet64q8u 143 pcast<Packet16q32i, Packet64q8u>(const Packet16q32i& a, const Packet16q32i& b, 144 const Packet16q32i& c, const Packet16q32i& d) { 145 // Brute-force saturation since there isn't a pack operation for unsigned 146 // numbers that keeps the elements in order. 147 __m128i a_part = _mm512_cvtepi32_epi8(_mm512_max_epi32( 148 _mm512_min_epi32(a, _mm512_set1_epi32(255)), _mm512_setzero_si512())); 149 __m128i b_part = _mm512_cvtepi32_epi8(_mm512_max_epi32( 150 _mm512_min_epi32(b, _mm512_set1_epi32(255)), _mm512_setzero_si512())); 151 __m128i c_part = _mm512_cvtepi32_epi8(_mm512_max_epi32( 152 _mm512_min_epi32(c, _mm512_set1_epi32(255)), _mm512_setzero_si512())); 153 __m128i d_part = _mm512_cvtepi32_epi8(_mm512_max_epi32( 154 _mm512_min_epi32(d, _mm512_set1_epi32(255)), _mm512_setzero_si512())); 155 __m256i ab = 156 _mm256_inserti128_si256(_mm256_castsi128_si256(a_part), b_part, 1); 157 __m256i cd = 158 _mm256_inserti128_si256(_mm256_castsi128_si256(c_part), d_part, 1); 159 __m512i converted = _mm512_inserti64x4(_mm512_castsi256_si512(ab), cd, 1); 160 return converted; 161 } 162 163 #if 0 164 // The type Packet32q16u does not exist for AVX-512 yet 165 template <> 166 struct type_casting_traits<QInt32, QUInt16> { 167 enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; 168 }; 169 170 template <> 171 EIGEN_STRONG_INLINE Packet32q16u 172 pcast<Packet16q32i, Packet32q16u>(const Packet16q32i& a, 173 const Packet16q32i& b) { 174 // Brute-force saturation since there isn't a pack operation for unsigned 175 // numbers that keeps the elements in order. 176 __m256i a_part = 177 _mm512_cvtepi32_epi16(_mm512_max_epi32( 178 _mm512_min_epi32(a, _mm512_set1_epi32(65535)), _mm512_setzero_si512())); 179 __m256i b_part = _mm512_cvtepi32_epi16( 180 _mm512_max_epi32(_mm512_min_epi32(b, _mm512_set1_epi32(65535)), 181 _mm512_setzero_si512())); 182 __m512i converted = 183 _mm512_inserti64x4(_mm512_castsi256_si512(a_part), b_part, 1); 184 return converted; 185 } 186 #endif 187 188 } // end namespace internal 189 } // end namespace Eigen 190 191 #endif // CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_ 192