/// @ref core
/// @file glm/gtc/quaternion_simd.inl

#if GLM_ARCH & GLM_ARCH_SSE2_BIT

namespace glm{
namespace detail
{
/*
	template <precision P>
	struct compute_quat_mul<float, P, true>
	{
		static tquat<float, P> call(tquat<float, P> const& q1, tquat<float, P> const& q2)
		{
			// SSE2 STATS: 11 shuffle, 8 mul, 8 add
			// SSE4 STATS: 3 shuffle, 4 mul, 4 dpps

			__m128 const mul0 = _mm_mul_ps(q1.data, _mm_shuffle_ps(q2.data, q2.data, _MM_SHUFFLE(0, 1, 2, 3)));
			__m128 const mul1 = _mm_mul_ps(q1.data, _mm_shuffle_ps(q2.data, q2.data, _MM_SHUFFLE(1, 0, 3, 2)));
			__m128 const mul2 = _mm_mul_ps(q1.data, _mm_shuffle_ps(q2.data, q2.data, _MM_SHUFFLE(2, 3, 0, 1)));
			__m128 const mul3 = _mm_mul_ps(q1.data, q2.data);

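			// Each mulN holds the four products feeding one component of the
			// Hamilton product: mul0 -> x, mul1 -> y, mul2 -> z, mul3 -> w.
			// The branches below apply the sign pattern and reduce each vector
			// of products to a scalar.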
#			if GLM_ARCH & GLM_ARCH_SSE41_BIT
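				// _mm_dp_ps multiplies mulN by its sign row and sums all four
				// lanes (mask 0xff), broadcasting the signed sum to every lane.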
				__m128 const add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f,  1.0f,  1.0f), 0xff);
				__m128 const add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f,  1.0f,  1.0f, -1.0f), 0xff);
				__m128 const add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f), 0xff);
				__m128 const add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
#			else
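				// Horizontal add: movehl folds the signed upper half onto the
				// lower half, then add_ss collapses lanes 0 and 1 into lane 0.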
				__m128 const mul4 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f,  1.0f,  1.0f));
				__m128 const add0 = _mm_add_ps(mul4, _mm_movehl_ps(mul4, mul4));
				__m128 const add4 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));

				__m128 const mul5 = _mm_mul_ps(mul1, _mm_set_ps(1.0f,  1.0f,  1.0f, -1.0f));
				__m128 const add1 = _mm_add_ps(mul5, _mm_movehl_ps(mul5, mul5));
				__m128 const add5 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));

				__m128 const mul6 = _mm_mul_ps(mul2, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f));
				__m128 const add2 = _mm_add_ps(mul6, _mm_movehl_ps(mul6, mul6));
				__m128 const add6 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));

				__m128 const mul7 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
				__m128 const add3 = _mm_add_ps(mul7, _mm_movehl_ps(mul7, mul7));
				__m128 const add7 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
#			endif

			// The shuffle-based packing below is the cleaner way of doing this, but in every
			// test I've tried it has been slower than the store-based code that follows.
			// I'll keep this here for reference - maybe somebody else can do something better...
			//
			//__m128 xxyy = _mm_shuffle_ps(add4, add5, _MM_SHUFFLE(0, 0, 0, 0));
			//__m128 zzww = _mm_shuffle_ps(add6, add7, _MM_SHUFFLE(0, 0, 0, 0));
			//
			//return _mm_shuffle_ps(xxyy, zzww, _MM_SHUFFLE(2, 0, 2, 0));

			tquat<float, P> Result(uninitialize);
			_mm_store_ss(&Result.x, add4);
			_mm_store_ss(&Result.y, add5);
			_mm_store_ss(&Result.z, add6);
			_mm_store_ss(&Result.w, add7);
			return Result;
		}
	};
*/

	template <precision P>
	struct compute_dot<tquat, float, P, true>
	{
		static GLM_FUNC_QUALIFIER float call(tquat<float, P> const& x, tquat<float, P> const& y)
		{
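			// glm_vec1_dot leaves the 4-wide dot product in the lowest lane;
			// _mm_cvtss_f32 extracts that lane as a scalar float.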
			return _mm_cvtss_f32(glm_vec1_dot(x.data, y.data));
		}
	};

	template <precision P>
	struct compute_quat_add<float, P, true>
	{
		static tquat<float, P> call(tquat<float, P> const& q, tquat<float, P> const& p)
		{
			tquat<float, P> Result(uninitialize);
			Result.data = _mm_add_ps(q.data, p.data);
			return Result;
		}
	};

#	if GLM_ARCH & GLM_ARCH_AVX_BIT
	template <precision P>
	struct compute_quat_add<double, P, true>
	{
		static tquat<double, P> call(tquat<double, P> const& a, tquat<double, P> const& b)
		{
			tquat<double, P> Result(uninitialize);
			Result.data = _mm256_add_pd(a.data, b.data);
			return Result;
		}
	};
#	endif

	template <precision P>
	struct compute_quat_sub<float, P, true>
	{
		static tquat<float, P> call(tquat<float, P> const& q, tquat<float, P> const& p)
		{
			tquat<float, P> Result(uninitialize);
			Result.data = _mm_sub_ps(q.data, p.data);
			return Result;
		}
	};

#	if GLM_ARCH & GLM_ARCH_AVX_BIT
	template <precision P>
	struct compute_quat_sub<double, P, true>
	{
		static tquat<double, P> call(tquat<double, P> const& a, tquat<double, P> const& b)
		{
			tquat<double, P> Result(uninitialize);
			Result.data = _mm256_sub_pd(a.data, b.data);
			return Result;
		}
	};
#	endif

	template <precision P>
	struct compute_quat_mul_scalar<float, P, true>
	{
		static tquat<float, P> call(tquat<float, P> const& q, float s)
		{
			tquat<float, P> Result(uninitialize);
			Result.data = _mm_mul_ps(q.data, _mm_set_ps1(s));
			return Result;
		}
	};

#	if GLM_ARCH & GLM_ARCH_AVX_BIT
	template <precision P>
	struct compute_quat_mul_scalar<double, P, true>
	{
		static tquat<double, P> call(tquat<double, P> const& q, double s)
		{
			tquat<double, P> Result(uninitialize);
			Result.data = _mm256_mul_pd(q.data, _mm256_set1_pd(s));
			return Result;
		}
	};
#	endif

	template <precision P>
	struct compute_quat_div_scalar<float, P, true>
	{
		static tquat<float, P> call(tquat<float, P> const& q, float s)
		{
			tquat<float, P> Result(uninitialize);
			Result.data = _mm_div_ps(q.data, _mm_set_ps1(s));
			return Result;
		}
	};

#	if GLM_ARCH & GLM_ARCH_AVX_BIT
	template <precision P>
	struct compute_quat_div_scalar<double, P, true>
	{
		static tquat<double, P> call(tquat<double, P> const& q, double s)
		{
			tquat<double, P> Result(uninitialize);
			Result.data = _mm256_div_pd(q.data, _mm256_set1_pd(s));
			return Result;
		}
	};
#	endif

	template <precision P>
	struct compute_quat_mul_vec4<float, P, true>
	{
		static tvec4<float, P> call(tquat<float, P> const& q, tvec4<float, P> const& v)
		{
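			// Rotates v by q via v' = v + 2w*(q.xyz x v) + 2*(q.xyz x (q.xyz x v)),
			// the expanded form of q * v * conjugate(q) for a unit quaternion.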
			__m128 const q_wwww = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 3, 3, 3));
			__m128 const q_swp0 = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 0, 2, 1));
			__m128 const q_swp1 = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 1, 0, 2));
			__m128 const v_swp0 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(3, 0, 2, 1));
			__m128 const v_swp1 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(3, 1, 0, 2));

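			// Cross products use the shuffle identity a x b = a.yzx*b.zxy - a.zxy*b.yzx;
			// lane 3 cancels to zero, so the w component of v passes through untouched.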
			__m128 uv      = _mm_sub_ps(_mm_mul_ps(q_swp0, v_swp1), _mm_mul_ps(q_swp1, v_swp0));
			__m128 uv_swp0 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 0, 2, 1));
			__m128 uv_swp1 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 1, 0, 2));
			__m128 uuv     = _mm_sub_ps(_mm_mul_ps(q_swp0, uv_swp1), _mm_mul_ps(q_swp1, uv_swp0));

			__m128 const two = _mm_set1_ps(2.0f);
			uv  = _mm_mul_ps(uv, _mm_mul_ps(q_wwww, two));
			uuv = _mm_mul_ps(uuv, two);

			tvec4<float, P> Result(uninitialize);
			Result.data = _mm_add_ps(v.data, _mm_add_ps(uv, uuv));
			return Result;
		}
	};
}//namespace detail
}//namespace glm

#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT