1 #ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_ 2 #define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_ 3 #ifdef _MSC_VER 4 5 #include <emmintrin.h> 6 #include <immintrin.h> 7 #include <smmintrin.h> 8 9 #endif 10 11 namespace Eigen { 12 namespace internal { 13 14 typedef eigen_packet_wrapper<__m256i, 10> Packet32q8i; 15 typedef eigen_packet_wrapper<__m128i, 11> Packet16q8i; 16 17 template <> 18 struct packet_traits<QInt8> : default_packet_traits { 19 typedef Packet32q8i type; 20 typedef Packet16q8i half; 21 enum { 22 Vectorizable = 1, 23 AlignedOnScalar = 1, 24 size = 32, 25 }; 26 enum { 27 HasAdd = 0, 28 HasSub = 0, 29 HasMul = 0, 30 HasNegate = 0, 31 HasAbs = 0, 32 HasAbs2 = 0, 33 HasMin = 0, 34 HasMax = 0, 35 HasConj = 0, 36 HasSetLinear = 0 37 }; 38 }; 39 40 template <> 41 struct unpacket_traits<Packet32q8i> { 42 typedef QInt8 type; 43 typedef Packet16q8i half; 44 enum { 45 size = 32, 46 alignment = Aligned32, 47 vectorizable = true, 48 masked_load_available = false, 49 masked_store_available = false 50 }; 51 }; 52 53 template <> 54 struct unpacket_traits<Packet16q8i> { 55 typedef QInt8 type; 56 typedef Packet16q8i half; 57 enum { 58 size = 16, 59 alignment = Aligned32, 60 vectorizable = true, 61 masked_load_available = false, 62 masked_store_available = false 63 }; 64 }; 65 template <> 66 EIGEN_STRONG_INLINE Packet32q8i pset1<Packet32q8i>(const QInt8& from) { 67 return _mm256_set1_epi8(from.value); 68 } 69 template <> 70 EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) { 71 EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256( 72 reinterpret_cast<const __m256i*>(from)); 73 } 74 template <> 75 EIGEN_STRONG_INLINE Packet16q8i ploadu<Packet16q8i>(const QInt8* from) { 76 EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128( 77 reinterpret_cast<const __m128i*>(from)); 78 } 79 80 template <> 81 EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) { 82 EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256( 83 reinterpret_cast<const __m256i*>(from)); 84 } 85 template <> 86 EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) { 87 EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128( 88 reinterpret_cast<const __m128i*>(from)); 89 } 90 91 template <> 92 EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) { 93 EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256( 94 reinterpret_cast<__m256i*>(to), from.m_val); 95 } 96 template <> 97 EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) { 98 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), 99 from.m_val); 100 } 101 102 template <> 103 EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) { 104 EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), 105 from.m_val); 106 } 107 template <> 108 EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) { 109 EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), 110 from.m_val); 111 } 112 113 typedef __m256 Packet8f; 114 115 template <> 116 struct type_casting_traits<float, QInt8> { 117 enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 }; 118 }; 119 120 template <> 121 EIGEN_STRONG_INLINE Packet32q8i 122 pcast<Packet8f, Packet32q8i>(const Packet8f& a, const Packet8f& b, 123 const Packet8f& c, const Packet8f& d) { 124 const __m256i a_conv = _mm256_cvtps_epi32(a); 125 const __m256i b_conv = _mm256_cvtps_epi32(b); 126 const __m256i c_conv = _mm256_cvtps_epi32(c); 127 const __m256i d_conv = _mm256_cvtps_epi32(d); 128 __m128i low = _mm256_castsi256_si128(a_conv); 129 __m128i high = _mm256_extractf128_si256(a_conv, 1); 130 __m128i tmp = _mm_packs_epi32(low, high); 131 __m128i low2 = _mm256_castsi256_si128(b_conv); 132 __m128i high2 = _mm256_extractf128_si256(b_conv, 1); 133 __m128i tmp2 = _mm_packs_epi32(low2, high2); 134 __m128i converted_low = _mm_packs_epi16(tmp, tmp2); 135 low = _mm256_castsi256_si128(c_conv); 136 high = _mm256_extractf128_si256(c_conv, 1); 137 tmp = _mm_packs_epi32(low, high); 138 low2 = _mm256_castsi256_si128(d_conv); 139 high2 = _mm256_extractf128_si256(d_conv, 1); 140 tmp2 = _mm_packs_epi32(low2, high2); 141 __m128i converted_high = _mm_packs_epi16(tmp, tmp2); 142 return _mm256_insertf128_si256(_mm256_castsi128_si256(converted_low), 143 converted_high, 1); 144 } 145 146 } // end namespace internal 147 } // end namespace Eigen 148 149 #endif // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX_H_ 150