1 #ifndef CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ 2 #define CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ 3 4 namespace Eigen { 5 namespace internal { 6 7 typedef __m256 Packet8f; 8 9 template <> 10 struct type_casting_traits<QInt32, float> { 11 enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; 12 }; 13 14 template <> 15 EIGEN_STRONG_INLINE Packet8f pcast<Packet8q32i>(const Packet8q32i& a) { 16 return _mm256_cvtepi32_ps(a.m_val); 17 } 18 19 template <> 20 struct type_casting_traits<float, QInt32> { 21 enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; 22 }; 23 24 template <> 25 EIGEN_STRONG_INLINE Packet8q32i pcast<Packet8f>(const Packet8f& a) { 26 return _mm256_cvtps_epi32(a); 27 } 28 29 template <> 30 struct type_casting_traits<QInt32, QInt8> { 31 enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; 32 }; 33 34 template <> 35 EIGEN_STRONG_INLINE Packet32q8i 36 pcast<Packet8q32i, Packet32q8i>(const Packet8q32i& a, const Packet8q32i& b, 37 const Packet8q32i& c, const Packet8q32i& d) { 38 __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a.m_val, b.m_val), 39 _mm256_packs_epi32(c.m_val, d.m_val)); 40 // Since packs does not cross 128 bit lane boundaries, 41 // we have to permute to properly order the final result. 42 const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); 43 return _mm256_permutevar8x32_epi32(converted, permute_mask); 44 } 45 46 template <> 47 struct type_casting_traits<float, QInt8> { 48 enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; 49 }; 50 51 template <> 52 EIGEN_STRONG_INLINE Packet32q8i 53 pcast<Packet8f, Packet32q8i>(const Packet8f& a, const Packet8f& b, 54 const Packet8f& c, const Packet8f& d) { 55 const __m256i a_conv = _mm256_cvtps_epi32(a); 56 const __m256i b_conv = _mm256_cvtps_epi32(b); 57 const __m256i c_conv = _mm256_cvtps_epi32(c); 58 const __m256i d_conv = _mm256_cvtps_epi32(d); 59 __m256i converted = _mm256_packs_epi16(_mm256_packs_epi32(a_conv, b_conv), 60 _mm256_packs_epi32(c_conv, d_conv)); 61 const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); 62 return _mm256_permutevar8x32_epi32(converted, permute_mask); 63 } 64 65 template <> 66 struct type_casting_traits<QInt32, QUInt8> { 67 enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; 68 }; 69 70 template <> 71 EIGEN_STRONG_INLINE Packet32q8u 72 pcast<Packet8q32i, Packet32q8u>(const Packet8q32i& a, const Packet8q32i& b, 73 const Packet8q32i& c, const Packet8q32i& d) { 74 // _mm256_packus_epi32 trims negative numbers to 0 but we can't allow numbers 75 // that are too large because _mm256_packus_epi16 expects signed input 76 // (example of problem input: 0x11111111, which saturates to 0xffff = -1, 77 // which saturates to 0). 78 const __m256i a_clip = _mm256_min_epi32(a, _mm256_set1_epi32(255)); 79 const __m256i b_clip = _mm256_min_epi32(b, _mm256_set1_epi32(255)); 80 const __m256i c_clip = _mm256_min_epi32(c, _mm256_set1_epi32(255)); 81 const __m256i d_clip = _mm256_min_epi32(d, _mm256_set1_epi32(255)); 82 const __m256i converted = _mm256_packus_epi16( 83 _mm256_packus_epi32(a_clip, b_clip), _mm256_packus_epi32(c_clip, d_clip)); 84 // Since packus does not cross 128 bit lane boundaries, 85 // we have to permute to properly order the final result. 86 const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); 87 return _mm256_permutevar8x32_epi32(converted, permute_mask); 88 } 89 90 } // end namespace internal 91 } // end namespace Eigen 92 93 #endif // CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_ 94