/// @ref core
/// @file glm/gtc/quaternion_simd.inl

#if GLM_ARCH & GLM_ARCH_SSE2_BIT

namespace glm{
namespace detail
{
/*
	template <precision P>
	struct compute_quat_mul<float, P, true>
	{
		static tquat<float, P> call(tquat<float, P> const& q1, tquat<float, P> const& q2)
		{
			// SSE2 STATS: 11 shuffle, 8 mul, 8 add
			// SSE4 STATS: 3 shuffle, 4 mul, 4 dpps

			__m128 const mul0 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));
			__m128 const mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
			__m128 const mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
			__m128 const mul3 = _mm_mul_ps(q1.Data, q2.Data);

#	if GLM_ARCH & GLM_ARCH_SSE41_BIT
			__m128 const add4 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);
			__m128 const add5 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
			__m128 const add6 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
			__m128 const add7 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
#	else
			// Apply the sign mask first, then sum the four lanes horizontally
			__m128 const mul4 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
			__m128 const add0 = _mm_add_ps(mul4, _mm_movehl_ps(mul4, mul4));
			__m128 const add4 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));

			__m128 const mul5 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
			__m128 const add1 = _mm_add_ps(mul5, _mm_movehl_ps(mul5, mul5));
			__m128 const add5 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));

			__m128 const mul6 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
			__m128 const add2 = _mm_add_ps(mul6, _mm_movehl_ps(mul6, mul6));
			__m128 const add6 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));

			__m128 const mul7 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
			__m128 const add3 = _mm_add_ps(mul7, _mm_movehl_ps(mul7, mul7));
			__m128 const add7 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
#	endif

			// A branch-free SIMD packing of the four results would look like the
			// shuffles below, but in every test I've tried it has been slower than
			// the scalar stores that follow. Kept here for reference - maybe
			// somebody else can do better...
			//
			//__m128 xxyy = _mm_shuffle_ps(add4, add5, _MM_SHUFFLE(0, 0, 0, 0));
			//__m128 zzww = _mm_shuffle_ps(add6, add7, _MM_SHUFFLE(0, 0, 0, 0));
			//
			//return _mm_shuffle_ps(xxyy, zzww, _MM_SHUFFLE(2, 0, 2, 0));

			tquat<float, P> Result(uninitialize);
			_mm_store_ss(&Result.x, add4);
			_mm_store_ss(&Result.y, add5);
			_mm_store_ss(&Result.z, add6);
			_mm_store_ss(&Result.w, add7);
			return Result;
		}
	};
*/
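// For reference, the scalar Hamilton product that the disabled kernel above
// mirrors; the _mm_set_ps sign masks encode exactly these signs (recall that
// _mm_set_ps lists lanes in (w, z, y, x) order):
//
//   Result.x = q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y;
//   Result.y = q1.w * q2.y + q1.y * q2.w + q1.z * q2.x - q1.x * q2.z;
//   Result.z = q1.w * q2.z + q1.z * q2.w + q1.x * q2.y - q1.y * q2.x;
//   Result.w = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z;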
	template <precision P>
	struct compute_dot<tquat, float, P, true>
	{
		static GLM_FUNC_QUALIFIER float call(tquat<float, P> const& x, tquat<float, P> const& y)
		{
			return _mm_cvtss_f32(glm_vec1_dot(x.data, y.data));
		}
	};

	template <precision P>
	struct compute_quat_add<float, P, true>
	{
		static tquat<float, P> call(tquat<float, P> const& q, tquat<float, P> const& p)
		{
			tquat<float, P> Result(uninitialize);
			Result.data = _mm_add_ps(q.data, p.data);
			return Result;
		}
	};

#	if GLM_ARCH & GLM_ARCH_AVX_BIT
	template <precision P>
	struct compute_quat_add<double, P, true>
	{
		static tquat<double, P> call(tquat<double, P> const& a, tquat<double, P> const& b)
		{
			tquat<double, P> Result(uninitialize);
			Result.data = _mm256_add_pd(a.data, b.data);
			return Result;
		}
	};
#	endif

	template <precision P>
	struct compute_quat_sub<float, P, true>
	{
		static tquat<float, P> call(tquat<float, P> const& q, tquat<float, P> const& p)
		{
			tquat<float, P> Result(uninitialize);
			Result.data = _mm_sub_ps(q.data, p.data);
			return Result;
		}
	};

#	if GLM_ARCH & GLM_ARCH_AVX_BIT
	template <precision P>
	struct compute_quat_sub<double, P, true>
	{
		static tquat<double, P> call(tquat<double, P> const& a, tquat<double, P> const& b)
		{
			tquat<double, P> Result(uninitialize);
			Result.data = _mm256_sub_pd(a.data, b.data);
			return Result;
		}
	};
#	endif

	template <precision P>
	struct compute_quat_mul_scalar<float, P, true>
	{
		static tquat<float, P> call(tquat<float, P> const& q, float s)
		{
			tquat<float, P> Result(uninitialize);
			Result.data = _mm_mul_ps(q.data, _mm_set_ps1(s));
			return Result;
		}
	};

#	if GLM_ARCH & GLM_ARCH_AVX_BIT
	template <precision P>
	struct compute_quat_mul_scalar<double, P, true>
	{
		static tquat<double, P> call(tquat<double, P> const& q, double s)
		{
			tquat<double, P> Result(uninitialize);
			Result.data = _mm256_mul_pd(q.data, _mm256_set1_pd(s));
			return Result;
		}
	};
#	endif

	template <precision P>
	struct compute_quat_div_scalar<float, P, true>
	{
		static tquat<float, P> call(tquat<float, P> const& q, float s)
		{
			tquat<float, P> Result(uninitialize);
			Result.data = _mm_div_ps(q.data, _mm_set_ps1(s));
			return Result;
		}
	};

#	if GLM_ARCH & GLM_ARCH_AVX_BIT
	template <precision P>
	struct compute_quat_div_scalar<double, P, true>
	{
		static tquat<double, P> call(tquat<double, P> const& q, double s)
		{
			tquat<double, P> Result(uninitialize);
			Result.data = _mm256_div_pd(q.data, _mm256_set1_pd(s));
			return Result;
		}
	};
#	endif
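// The specialization below rotates a vector by a unit quaternion using the
// standard expansion of q * v * conjugate(q):
//
//   v' = v + 2.0f * w * cross(u, v) + 2.0f * cross(u, cross(u, v))
//
// where u = (q.x, q.y, q.z) is the vector part and w = q.w. Both cross
// products are built from the (y, z, x) and (z, x, y) shuffles of their
// operands.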
	template <precision P>
	struct compute_quat_mul_vec4<float, P, true>
	{
		static tvec4<float, P> call(tquat<float, P> const& q, tvec4<float, P> const& v)
		{
			__m128 const q_wwww = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 3, 3, 3));
			__m128 const q_swp0 = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 0, 2, 1));
			__m128 const q_swp1 = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 1, 0, 2));
			__m128 const v_swp0 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(3, 0, 2, 1));
			__m128 const v_swp1 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(3, 1, 0, 2));

			// uv = cross(u, v), uuv = cross(u, uv)
			__m128 uv = _mm_sub_ps(_mm_mul_ps(q_swp0, v_swp1), _mm_mul_ps(q_swp1, v_swp0));
			__m128 uv_swp0 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 0, 2, 1));
			__m128 uv_swp1 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 1, 0, 2));
			__m128 uuv = _mm_sub_ps(_mm_mul_ps(q_swp0, uv_swp1), _mm_mul_ps(q_swp1, uv_swp0));

			__m128 const two = _mm_set1_ps(2.0f);
			uv = _mm_mul_ps(uv, _mm_mul_ps(q_wwww, two));
			uuv = _mm_mul_ps(uuv, two);

			tvec4<float, P> Result(uninitialize);
			Result.data = _mm_add_ps(v.data, _mm_add_ps(uv, uuv));
			return Result;
		}
	};
}//namespace detail
}//namespace glm

#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
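// A minimal sanity check (not part of this file) for comparing the SIMD path
// against the scalar path; it assumes a GLM build in which these aligned SIMD
// specializations are actually selected:
//
//   glm::quat const q = glm::angleAxis(glm::radians(90.0f), glm::vec3(0.0f, 0.0f, 1.0f));
//   glm::vec4 const v(1.0f, 0.0f, 0.0f, 0.0f);
//   glm::vec4 const r = q * v; // expected: approximately (0, 1, 0, 0)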