• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1///////////////////////////////////////////////////////////////////////////////////
2/// OpenGL Mathematics (glm.g-truc.net)
3///
4/// Copyright (c) 2005 - 2014 G-Truc Creation (www.g-truc.net)
5/// Permission is hereby granted, free of charge, to any person obtaining a copy
6/// of this software and associated documentation files (the "Software"), to deal
7/// in the Software without restriction, including without limitation the rights
8/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9/// copies of the Software, and to permit persons to whom the Software is
10/// furnished to do so, subject to the following conditions:
11///
12/// The above copyright notice and this permission notice shall be included in
13/// all copies or substantial portions of the Software.
14///
15/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21/// THE SOFTWARE.
22///
23/// @ref core
24/// @file glm/core/intrinsic_common.inl
25/// @date 2009-06-05 / 2011-06-15
26/// @author Christophe Riccio
27///////////////////////////////////////////////////////////////////////////////////
28
29namespace glm{
30namespace detail{
31
// Degree/radian conversion factors broadcast across all four SSE lanes:
// multiply by _m128_rad_ps to convert degrees -> radians, by _m128_deg_ps
// for radians -> degrees.
static const __m128 GLM_VAR_USED _m128_rad_ps = _mm_set_ps1(3.141592653589793238462643383279f / 180.f);
static const __m128 GLM_VAR_USED _m128_deg_ps = _mm_set_ps1(180.f / 3.141592653589793238462643383279f);
34
35template <typename matType>
36GLM_FUNC_QUALIFIER matType sse_comp_mul_ps
37(
38	__m128 const in1[4],
39	__m128 const in2[4],
40	__m128 out[4]
41)
42{
43	out[0] = _mm_mul_ps(in1[0], in2[0]);
44	out[1] = _mm_mul_ps(in1[1], in2[1]);
45	out[2] = _mm_mul_ps(in1[2], in2[2]);
46	out[3] = _mm_mul_ps(in1[3], in2[3]);
47}
48
49GLM_FUNC_QUALIFIER void sse_add_ps(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
50{
51	{
52		out[0] = _mm_add_ps(in1[0], in2[0]);
53		out[1] = _mm_add_ps(in1[1], in2[1]);
54		out[2] = _mm_add_ps(in1[2], in2[2]);
55		out[3] = _mm_add_ps(in1[3], in2[3]);
56	}
57}
58
59GLM_FUNC_QUALIFIER void sse_sub_ps(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
60{
61	{
62		out[0] = _mm_sub_ps(in1[0], in2[0]);
63		out[1] = _mm_sub_ps(in1[1], in2[1]);
64		out[2] = _mm_sub_ps(in1[2], in2[2]);
65		out[3] = _mm_sub_ps(in1[3], in2[3]);
66	}
67}
68
69GLM_FUNC_QUALIFIER __m128 sse_mul_ps(__m128 const m[4], __m128 v)
70{
71	__m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
72	__m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
73	__m128 v2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
74	__m128 v3 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
75
76	__m128 m0 = _mm_mul_ps(m[0], v0);
77	__m128 m1 = _mm_mul_ps(m[1], v1);
78	__m128 m2 = _mm_mul_ps(m[2], v2);
79	__m128 m3 = _mm_mul_ps(m[3], v3);
80
81	__m128 a0 = _mm_add_ps(m0, m1);
82	__m128 a1 = _mm_add_ps(m2, m3);
83	__m128 a2 = _mm_add_ps(a0, a1);
84
85	return a2;
86}
87
88GLM_FUNC_QUALIFIER __m128 sse_mul_ps(__m128 v, __m128 const m[4])
89{
90	__m128 i0 = m[0];
91	__m128 i1 = m[1];
92	__m128 i2 = m[2];
93	__m128 i3 = m[3];
94
95	__m128 m0 = _mm_mul_ps(v, i0);
96	__m128 m1 = _mm_mul_ps(v, i1);
97	__m128 m2 = _mm_mul_ps(v, i2);
98	__m128 m3 = _mm_mul_ps(v, i3);
99
100	__m128 u0 = _mm_unpacklo_ps(m0, m1);
101	__m128 u1 = _mm_unpackhi_ps(m0, m1);
102	__m128 a0 = _mm_add_ps(u0, u1);
103
104	__m128 u2 = _mm_unpacklo_ps(m2, m3);
105	__m128 u3 = _mm_unpackhi_ps(m2, m3);
106	__m128 a1 = _mm_add_ps(u2, u3);
107
108	__m128 f0 = _mm_movelh_ps(a0, a1);
109	__m128 f1 = _mm_movehl_ps(a1, a0);
110	__m128 f2 = _mm_add_ps(f0, f1);
111
112	return f2;
113}
114
115GLM_FUNC_QUALIFIER void sse_mul_ps(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
116{
117	{
118		__m128 e0 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(0, 0, 0, 0));
119		__m128 e1 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(1, 1, 1, 1));
120		__m128 e2 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(2, 2, 2, 2));
121		__m128 e3 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(3, 3, 3, 3));
122
123		__m128 m0 = _mm_mul_ps(in1[0], e0);
124		__m128 m1 = _mm_mul_ps(in1[1], e1);
125		__m128 m2 = _mm_mul_ps(in1[2], e2);
126		__m128 m3 = _mm_mul_ps(in1[3], e3);
127
128		__m128 a0 = _mm_add_ps(m0, m1);
129		__m128 a1 = _mm_add_ps(m2, m3);
130		__m128 a2 = _mm_add_ps(a0, a1);
131
132		out[0] = a2;
133	}
134
135	{
136		__m128 e0 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(0, 0, 0, 0));
137		__m128 e1 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(1, 1, 1, 1));
138		__m128 e2 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(2, 2, 2, 2));
139		__m128 e3 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(3, 3, 3, 3));
140
141		__m128 m0 = _mm_mul_ps(in1[0], e0);
142		__m128 m1 = _mm_mul_ps(in1[1], e1);
143		__m128 m2 = _mm_mul_ps(in1[2], e2);
144		__m128 m3 = _mm_mul_ps(in1[3], e3);
145
146		__m128 a0 = _mm_add_ps(m0, m1);
147		__m128 a1 = _mm_add_ps(m2, m3);
148		__m128 a2 = _mm_add_ps(a0, a1);
149
150		out[1] = a2;
151	}
152
153	{
154		__m128 e0 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(0, 0, 0, 0));
155		__m128 e1 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(1, 1, 1, 1));
156		__m128 e2 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(2, 2, 2, 2));
157		__m128 e3 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(3, 3, 3, 3));
158
159		__m128 m0 = _mm_mul_ps(in1[0], e0);
160		__m128 m1 = _mm_mul_ps(in1[1], e1);
161		__m128 m2 = _mm_mul_ps(in1[2], e2);
162		__m128 m3 = _mm_mul_ps(in1[3], e3);
163
164		__m128 a0 = _mm_add_ps(m0, m1);
165		__m128 a1 = _mm_add_ps(m2, m3);
166		__m128 a2 = _mm_add_ps(a0, a1);
167
168		out[2] = a2;
169	}
170
171	{
172		//(__m128&)_mm_shuffle_epi32(__m128i&)in2[0], _MM_SHUFFLE(3, 3, 3, 3))
173		__m128 e0 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(0, 0, 0, 0));
174		__m128 e1 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(1, 1, 1, 1));
175		__m128 e2 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(2, 2, 2, 2));
176		__m128 e3 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(3, 3, 3, 3));
177
178		__m128 m0 = _mm_mul_ps(in1[0], e0);
179		__m128 m1 = _mm_mul_ps(in1[1], e1);
180		__m128 m2 = _mm_mul_ps(in1[2], e2);
181		__m128 m3 = _mm_mul_ps(in1[3], e3);
182
183		__m128 a0 = _mm_add_ps(m0, m1);
184		__m128 a1 = _mm_add_ps(m2, m3);
185		__m128 a2 = _mm_add_ps(a0, a1);
186
187		out[3] = a2;
188	}
189}
190
191GLM_FUNC_QUALIFIER void sse_transpose_ps(__m128 const in[4], __m128 out[4])
192{
193    __m128 tmp0 = _mm_shuffle_ps(in[0], in[1], 0x44);
194    __m128 tmp2 = _mm_shuffle_ps(in[0], in[1], 0xEE);
195    __m128 tmp1 = _mm_shuffle_ps(in[2], in[3], 0x44);
196    __m128 tmp3 = _mm_shuffle_ps(in[2], in[3], 0xEE);
197
198    out[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88);
199    out[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
200    out[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88);
201    out[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
202}
203
// Determinant of a 4x4 matrix via full cofactor expansion. Builds the six
// 2x2 sub-determinants (Fac0..Fac5), forms the first column of the cofactor
// matrix, and dots it with the first input column. "Slow" relative to
// sse_det_ps below, which computes the same expansion with fewer shuffles.
// NOTE(review): relies on sse_dot_ps, defined elsewhere in this file —
// presumably returning the 4-lane dot product; confirm its lane layout there.
GLM_FUNC_QUALIFIER __m128 sse_slow_det_ps(__m128 const in[4])
{
	__m128 Fac0;
	{
		//	valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		//	valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		//	valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
		//	valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac0 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac1;
	{
		//	valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		//	valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		//	valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
		//	valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac1 = _mm_sub_ps(Mul00, Mul01);
	}


	__m128 Fac2;
	{
		//	valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		//	valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		//	valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
		//	valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac2 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac3;
	{
		//	valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		//	valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		//	valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
		//	valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac3 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac4;
	{
		//	valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		//	valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		//	valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
		//	valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac4 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac5;
	{
		//	valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		//	valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		//	valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
		//	valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac5 = _mm_sub_ps(Mul00, Mul01);
	}

	// Alternating cofactor signs for even/odd columns.
	__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
	__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);

	// m[1][0]
	// m[0][0]
	// m[0][0]
	// m[0][0]
	__m128 Temp0 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][1]
	// m[0][1]
	// m[0][1]
	// m[0][1]
	__m128 Temp1 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(1, 1, 1, 1));
	__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][2]
	// m[0][2]
	// m[0][2]
	// m[0][2]
	__m128 Temp2 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(2, 2, 2, 2));
	__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][3]
	// m[0][3]
	// m[0][3]
	// m[0][3]
	__m128 Temp3 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(3, 3, 3, 3));
	__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));

	// col0
	// + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
	// - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
	// + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
	// - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
	__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
	__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
	__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
	__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
	__m128 Add00 = _mm_add_ps(Sub00, Mul02);
	__m128 Inv0 = _mm_mul_ps(SignB, Add00);

	// col1
	// - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
	// + (Vec0[0] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
	// - (Vec0[0] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
	// + (Vec0[0] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
	__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
	__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
	__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
	__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
	__m128 Add01 = _mm_add_ps(Sub01, Mul05);
	__m128 Inv1 = _mm_mul_ps(SignA, Add01);

	// col2
	// + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
	// - (Vec0[0] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
	// + (Vec0[0] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
	// - (Vec0[0] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
	__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
	__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
	__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
	__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
	__m128 Add02 = _mm_add_ps(Sub02, Mul08);
	__m128 Inv2 = _mm_mul_ps(SignB, Add02);

	// col3
	// - (Vec1[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
	// + (Vec1[0] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
	// - (Vec1[0] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
	// + (Vec1[0] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
	__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
	__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
	__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
	__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
	__m128 Add03 = _mm_add_ps(Sub03, Mul11);
	__m128 Inv3 = _mm_mul_ps(SignA, Add03);

	// Gather lane 0 of each cofactor column into one register: this is the
	// first row of the (transposed) cofactor matrix.
	__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));

	//	valType Determinant = m[0][0] * Inverse[0][0]
	//						+ m[0][1] * Inverse[1][0]
	//						+ m[0][2] * Inverse[2][0]
	//						+ m[0][3] * Inverse[3][0];
	__m128 Det0 = sse_dot_ps(in[0], Row2);
	return Det0;
}
417
// Determinant of a 4x4 matrix via Laplace expansion along the first column,
// using SSE2 integer shuffles (_mm_shuffle_epi32 on bit-cast registers) to
// replicate/permute lanes. Same algebra as sse_det_ps below, which uses only
// _mm_shuffle_ps. NOTE(review): depends on sse_dot_ps, defined elsewhere in
// this file.
GLM_FUNC_QUALIFIER __m128 sse_detd_ps
(
	__m128 const m[4]
)
{
	// _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(

	// The six 2x2 sub-determinants of the last two columns:
	//T SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
	//T SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
	//T SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
	//T SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
	//T SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
	//T SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];

	// First 2 columns
	__m128 Swp2A = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(0, 1, 1, 2)));
	__m128 Swp3A = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(3, 2, 3, 3)));
	__m128 MulA = _mm_mul_ps(Swp2A, Swp3A);

	// Second 2 columns
	__m128 Swp2B = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(3, 2, 3, 3)));
	__m128 Swp3B = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(0, 1, 1, 2)));
	__m128 MulB = _mm_mul_ps(Swp2B, Swp3B);

	// Columns subtraction: SubE holds four of the six sub-determinants.
	__m128 SubE = _mm_sub_ps(MulA, MulB);

	// Last 2 rows: SubF supplies the remaining sub-determinants.
	__m128 Swp2C = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(0, 0, 1, 2)));
	__m128 Swp3C = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(1, 2, 0, 0)));
	__m128 MulC = _mm_mul_ps(Swp2C, Swp3C);
	__m128 SubF = _mm_sub_ps(_mm_movehl_ps(MulC, MulC), MulC);

	// Cofactors of the first column:
	//detail::tvec4<T, P> DetCof(
	//	+ (m[1][1] * SubFactor00 - m[1][2] * SubFactor01 + m[1][3] * SubFactor02),
	//	- (m[1][0] * SubFactor00 - m[1][2] * SubFactor03 + m[1][3] * SubFactor04),
	//	+ (m[1][0] * SubFactor01 - m[1][1] * SubFactor03 + m[1][3] * SubFactor05),
	//	- (m[1][0] * SubFactor02 - m[1][1] * SubFactor04 + m[1][2] * SubFactor05));

	__m128 SubFacA = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubE), _MM_SHUFFLE(2, 1, 0, 0)));
	__m128 SwpFacA = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(0, 0, 0, 1)));
	__m128 MulFacA = _mm_mul_ps(SwpFacA, SubFacA);

	__m128 SubTmpB = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(0, 0, 3, 1));
	__m128 SubFacB = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubTmpB), _MM_SHUFFLE(3, 1, 1, 0)));//SubF[0], SubE[3], SubE[3], SubE[1];
	__m128 SwpFacB = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(1, 1, 2, 2)));
	__m128 MulFacB = _mm_mul_ps(SwpFacB, SubFacB);

	__m128 SubRes = _mm_sub_ps(MulFacA, MulFacB);

	__m128 SubTmpC = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(1, 0, 2, 2));
	__m128 SubFacC = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubTmpC), _MM_SHUFFLE(3, 3, 2, 0)));
	__m128 SwpFacC = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(2, 3, 3, 3)));
	__m128 MulFacC = _mm_mul_ps(SwpFacC, SubFacC);

	__m128 AddRes = _mm_add_ps(SubRes, MulFacC);
	// Apply the alternating +/- cofactor signs.
	__m128 DetCof = _mm_mul_ps(AddRes, _mm_setr_ps( 1.0f,-1.0f, 1.0f,-1.0f));

	//return m[0][0] * DetCof[0]
	//	 + m[0][1] * DetCof[1]
	//	 + m[0][2] * DetCof[2]
	//	 + m[0][3] * DetCof[3];

	return sse_dot_ps(m[0], DetCof);
}
483
// Determinant of a 4x4 matrix via Laplace expansion along the first column.
// Identical algebra to sse_detd_ps above, but every permutation uses
// _mm_shuffle_ps (SSE1) rather than bit-cast integer shuffles.
// NOTE(review): depends on sse_dot_ps, defined elsewhere in this file.
GLM_FUNC_QUALIFIER __m128 sse_det_ps
(
	__m128 const m[4]
)
{
	// _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(add)

	// The six 2x2 sub-determinants of the last two columns:
	//T SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
	//T SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
	//T SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
	//T SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
	//T SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
	//T SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];

	// First 2 columns
	__m128 Swp2A = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(0, 1, 1, 2));
	__m128 Swp3A = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(3, 2, 3, 3));
	__m128 MulA = _mm_mul_ps(Swp2A, Swp3A);

	// Second 2 columns
	__m128 Swp2B = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(3, 2, 3, 3));
	__m128 Swp3B = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(0, 1, 1, 2));
	__m128 MulB = _mm_mul_ps(Swp2B, Swp3B);

	// Columns subtraction: SubE holds four of the six sub-determinants.
	__m128 SubE = _mm_sub_ps(MulA, MulB);

	// Last 2 rows: SubF supplies the remaining sub-determinants.
	__m128 Swp2C = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(0, 0, 1, 2));
	__m128 Swp3C = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(1, 2, 0, 0));
	__m128 MulC = _mm_mul_ps(Swp2C, Swp3C);
	__m128 SubF = _mm_sub_ps(_mm_movehl_ps(MulC, MulC), MulC);

	// Cofactors of the first column:
	//detail::tvec4<T, P> DetCof(
	//	+ (m[1][1] * SubFactor00 - m[1][2] * SubFactor01 + m[1][3] * SubFactor02),
	//	- (m[1][0] * SubFactor00 - m[1][2] * SubFactor03 + m[1][3] * SubFactor04),
	//	+ (m[1][0] * SubFactor01 - m[1][1] * SubFactor03 + m[1][3] * SubFactor05),
	//	- (m[1][0] * SubFactor02 - m[1][1] * SubFactor04 + m[1][2] * SubFactor05));

	__m128 SubFacA = _mm_shuffle_ps(SubE, SubE, _MM_SHUFFLE(2, 1, 0, 0));
	__m128 SwpFacA = _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(0, 0, 0, 1));
	__m128 MulFacA = _mm_mul_ps(SwpFacA, SubFacA);

	__m128 SubTmpB = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(0, 0, 3, 1));
	__m128 SubFacB = _mm_shuffle_ps(SubTmpB, SubTmpB, _MM_SHUFFLE(3, 1, 1, 0));//SubF[0], SubE[3], SubE[3], SubE[1];
	__m128 SwpFacB = _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(1, 1, 2, 2));
	__m128 MulFacB = _mm_mul_ps(SwpFacB, SubFacB);

	__m128 SubRes = _mm_sub_ps(MulFacA, MulFacB);

	__m128 SubTmpC = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(1, 0, 2, 2));
	__m128 SubFacC = _mm_shuffle_ps(SubTmpC, SubTmpC, _MM_SHUFFLE(3, 3, 2, 0));
	__m128 SwpFacC = _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(2, 3, 3, 3));
	__m128 MulFacC = _mm_mul_ps(SwpFacC, SubFacC);

	__m128 AddRes = _mm_add_ps(SubRes, MulFacC);
	// Apply the alternating +/- cofactor signs.
	__m128 DetCof = _mm_mul_ps(AddRes, _mm_setr_ps( 1.0f,-1.0f, 1.0f,-1.0f));

	//return m[0][0] * DetCof[0]
	//	 + m[0][1] * DetCof[1]
	//	 + m[0][2] * DetCof[2]
	//	 + m[0][3] * DetCof[3];

	return sse_dot_ps(m[0], DetCof);
}
549
// Inverse of a 4x4 matrix by the cofactor (adjugate) method: builds the six
// 2x2 sub-determinant vectors Fac0..Fac5, assembles the four cofactor
// columns Inv0..Inv3, computes the determinant as dot(in[0], first row of
// the adjugate), and scales the adjugate by 1/determinant. No singularity
// check: a zero determinant produces a division by zero (inf/NaN output).
// NOTE(review): depends on sse_dot_ps and the constant `one`, both defined
// elsewhere in this file — `one` is presumably an __m128 of all 1.0f.
GLM_FUNC_QUALIFIER void sse_inverse_ps(__m128 const in[4], __m128 out[4])
{
	__m128 Fac0;
	{
		//	valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		//	valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		//	valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
		//	valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac0 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac1;
	{
		//	valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		//	valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		//	valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
		//	valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac1 = _mm_sub_ps(Mul00, Mul01);
	}


	__m128 Fac2;
	{
		//	valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		//	valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		//	valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
		//	valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac2 = _mm_sub_ps(Mul00, Mul01);
    }

	__m128 Fac3;
	{
		//	valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		//	valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		//	valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
		//	valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac3 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac4;
	{
		//	valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		//	valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		//	valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
		//	valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac4 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac5;
	{
		//	valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		//	valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		//	valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
		//	valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac5 = _mm_sub_ps(Mul00, Mul01);
	}

	// Alternating cofactor signs for even/odd columns.
	__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
	__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);

	// m[1][0]
	// m[0][0]
	// m[0][0]
	// m[0][0]
	__m128 Temp0 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][1]
	// m[0][1]
	// m[0][1]
	// m[0][1]
	__m128 Temp1 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(1, 1, 1, 1));
	__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][2]
	// m[0][2]
	// m[0][2]
	// m[0][2]
	__m128 Temp2 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(2, 2, 2, 2));
	__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][3]
	// m[0][3]
	// m[0][3]
	// m[0][3]
	__m128 Temp3 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(3, 3, 3, 3));
	__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));

	// col0
	// + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
	// - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
	// + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
	// - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
	__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
	__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
	__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
	__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
	__m128 Add00 = _mm_add_ps(Sub00, Mul02);
	__m128 Inv0 = _mm_mul_ps(SignB, Add00);

	// col1
	// - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
	// + (Vec0[0] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
	// - (Vec0[0] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
	// + (Vec0[0] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
	__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
	__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
	__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
	__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
	__m128 Add01 = _mm_add_ps(Sub01, Mul05);
	__m128 Inv1 = _mm_mul_ps(SignA, Add01);

	// col2
	// + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
	// - (Vec0[0] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
	// + (Vec0[0] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
	// - (Vec0[0] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
	__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
	__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
	__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
	__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
	__m128 Add02 = _mm_add_ps(Sub02, Mul08);
	__m128 Inv2 = _mm_mul_ps(SignB, Add02);

	// col3
	// - (Vec1[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
	// + (Vec1[0] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
	// - (Vec1[0] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
	// + (Vec1[0] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
	__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
	__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
	__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
	__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
	__m128 Add03 = _mm_add_ps(Sub03, Mul11);
	__m128 Inv3 = _mm_mul_ps(SignA, Add03);

	// Gather lane 0 of each cofactor column: the first row of the adjugate.
	__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));

	//	valType Determinant = m[0][0] * Inverse[0][0]
	//						+ m[0][1] * Inverse[1][0]
	//						+ m[0][2] * Inverse[2][0]
	//						+ m[0][3] * Inverse[3][0];
	__m128 Det0 = sse_dot_ps(in[0], Row2);
	// Exact reciprocal via division (the _mm_rcp_ps approximation was
	// rejected below for precision).
	__m128 Rcp0 = _mm_div_ps(one, Det0);
	//__m128 Rcp0 = _mm_rcp_ps(Det0);

	//	Inverse /= Determinant;
	out[0] = _mm_mul_ps(Inv0, Rcp0);
	out[1] = _mm_mul_ps(Inv1, Rcp0);
	out[2] = _mm_mul_ps(Inv2, Rcp0);
	out[3] = _mm_mul_ps(Inv3, Rcp0);
}
770
// Inverts the 4x4 matrix given as four column vectors in[0..3], writing the
// four columns of the inverse to out[0..3], via cofactor (adjugate) expansion:
// Fac0..Fac5 pack shared 2x2 sub-determinants, Vec0..Vec3 broadcast matrix
// entries, and each InvN is one sign-adjusted cofactor column scaled by the
// reciprocal of the determinant. "fast" variant: that reciprocal comes from
// _mm_rcp_ps — an approximate (~12-bit precision) reciprocal — instead of an
// exact division, trading accuracy for speed versus sse_inverse_ps.
GLM_FUNC_QUALIFIER void sse_inverse_fast_ps(__m128 const in[4], __m128 out[4])
{
	// Each FacN vector holds four 2x2 sub-determinants (the SubFactor terms
	// below); they are shared by several cofactors, so compute them once.
	__m128 Fac0;
	{
		//	valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		//	valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		//	valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
		//	valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac0 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac1;
	{
		//	valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		//	valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		//	valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
		//	valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac1 = _mm_sub_ps(Mul00, Mul01);
	}


	__m128 Fac2;
	{
		//	valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		//	valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		//	valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
		//	valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac2 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac3;
	{
		//	valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		//	valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		//	valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
		//	valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac3 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac4;
	{
		//	valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		//	valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		//	valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
		//	valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac4 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac5;
	{
		//	valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		//	valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		//	valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
		//	valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac5 = _mm_sub_ps(Mul00, Mul01);
	}

	// Checkerboard cofactor signs. _mm_set_ps takes lanes in (w, z, y, x)
	// order, so SignA = (-,+,-,+) and SignB = (+,-,+,-) over lanes 0..3.
	__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
	__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);

	// Vec0 = (m[1][0], m[0][0], m[0][0], m[0][0])
	// m[1][0]
	// m[0][0]
	// m[0][0]
	// m[0][0]
	__m128 Temp0 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));

	// Vec1 = (m[1][1], m[0][1], m[0][1], m[0][1])
	// m[1][1]
	// m[0][1]
	// m[0][1]
	// m[0][1]
	__m128 Temp1 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(1, 1, 1, 1));
	__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));

	// Vec2 = (m[1][2], m[0][2], m[0][2], m[0][2])
	// m[1][2]
	// m[0][2]
	// m[0][2]
	// m[0][2]
	__m128 Temp2 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(2, 2, 2, 2));
	__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));

	// Vec3 = (m[1][3], m[0][3], m[0][3], m[0][3])
	// m[1][3]
	// m[0][3]
	// m[0][3]
	// m[0][3]
	__m128 Temp3 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(3, 3, 3, 3));
	__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));

	// col0
	// + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
	// - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
	// + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
	// - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
	__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
	__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
	__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
	__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
	__m128 Add00 = _mm_add_ps(Sub00, Mul02);
	__m128 Inv0 = _mm_mul_ps(SignB, Add00);

	// col1
	// - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
	// + (Vec0[1] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
	// - (Vec0[2] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
	// + (Vec0[3] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
	__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
	__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
	__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
	__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
	__m128 Add01 = _mm_add_ps(Sub01, Mul05);
	__m128 Inv1 = _mm_mul_ps(SignA, Add01);

	// col2
	// + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
	// - (Vec0[1] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
	// + (Vec0[2] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
	// - (Vec0[3] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
	__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
	__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
	__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
	__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
	__m128 Add02 = _mm_add_ps(Sub02, Mul08);
	__m128 Inv2 = _mm_mul_ps(SignB, Add02);

	// col3
	// - (Vec0[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
	// + (Vec0[1] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
	// - (Vec0[2] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
	// + (Vec0[3] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
	__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
	__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
	__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
	__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
	__m128 Add03 = _mm_add_ps(Sub03, Mul11);
	__m128 Inv3 = _mm_mul_ps(SignA, Add03);

	// Row2 gathers lane 0 of each InvN, i.e. the first row of the (unscaled)
	// inverse: (Inv0[0], Inv1[0], Inv2[0], Inv3[0]).
	__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));

	//	valType Determinant = m[0][0] * Inverse[0][0]
	//						+ m[0][1] * Inverse[1][0]
	//						+ m[0][2] * Inverse[2][0]
	//						+ m[0][3] * Inverse[3][0];
	__m128 Det0 = sse_dot_ps(in[0], Row2);
	// Approximate reciprocal (relative error up to ~1.5 * 2^-12) — this is
	// what makes this the "fast" inverse. The exact-division alternative
	// below would require a 'one' constant, as used in sse_inverse_ps.
	__m128 Rcp0 = _mm_rcp_ps(Det0);
	//__m128 Rcp0 = _mm_div_ps(one, Det0);
	//	Inverse /= Determinant;
	out[0] = _mm_mul_ps(Inv0, Rcp0);
	out[1] = _mm_mul_ps(Inv1, Rcp0);
	out[2] = _mm_mul_ps(Inv2, Rcp0);
	out[3] = _mm_mul_ps(Inv3, Rcp0);
}
990/*
991GLM_FUNC_QUALIFIER void sse_rotate_ps(__m128 const in[4], float Angle, float const v[3], __m128 out[4])
992{
993	float a = glm::radians(Angle);
994    float c = cos(a);
995    float s = sin(a);
996
997	glm::vec4 AxisA(v[0], v[1], v[2], float(0));
998	__m128 AxisB = _mm_set_ps(AxisA.w, AxisA.z, AxisA.y, AxisA.x);
999    __m128 AxisC = detail::sse_nrm_ps(AxisB);
1000
1001	__m128 Cos0 = _mm_set_ss(c);
1002	__m128 CosA = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(0, 0, 0, 0));
1003	__m128 Sin0 = _mm_set_ss(s);
1004	__m128 SinA = _mm_shuffle_ps(Sin0, Sin0, _MM_SHUFFLE(0, 0, 0, 0));
1005
1006	// detail::tvec3<T, P> temp = (valType(1) - c) * axis;
1007	__m128 Temp0 = _mm_sub_ps(one, CosA);
1008	__m128 Temp1 = _mm_mul_ps(Temp0, AxisC);
1009
1010	//Rotate[0][0] = c + temp[0] * axis[0];
1011	//Rotate[0][1] = 0 + temp[0] * axis[1] + s * axis[2];
1012	//Rotate[0][2] = 0 + temp[0] * axis[2] - s * axis[1];
1013	__m128 Axis0 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(0, 0, 0, 0));
1014	__m128 TmpA0 = _mm_mul_ps(Axis0, AxisC);
1015	__m128 CosA0 = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(1, 1, 1, 0));
1016	__m128 TmpA1 = _mm_add_ps(CosA0, TmpA0);
1017	__m128 SinA0 = SinA;//_mm_set_ps(0.0f, s, -s, 0.0f);
1018	__m128 TmpA2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(3, 1, 2, 3));
1019	__m128 TmpA3 = _mm_mul_ps(SinA0, TmpA2);
1020	__m128 TmpA4 = _mm_add_ps(TmpA1, TmpA3);
1021
1022	//Rotate[1][0] = 0 + temp[1] * axis[0] - s * axis[2];
1023	//Rotate[1][1] = c + temp[1] * axis[1];
1024	//Rotate[1][2] = 0 + temp[1] * axis[2] + s * axis[0];
1025	__m128 Axis1 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(1, 1, 1, 1));
1026	__m128 TmpB0 = _mm_mul_ps(Axis1, AxisC);
1027	__m128 CosA1 = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(1, 1, 0, 1));
1028	__m128 TmpB1 = _mm_add_ps(CosA1, TmpB0);
1029	__m128 SinB0 = SinA;//_mm_set_ps(-s, 0.0f, s, 0.0f);
1030	__m128 TmpB2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(3, 0, 3, 2));
1031	__m128 TmpB3 = _mm_mul_ps(SinA0, TmpB2);
1032	__m128 TmpB4 = _mm_add_ps(TmpB1, TmpB3);
1033
1034    //Rotate[2][0] = 0 + temp[2] * axis[0] + s * axis[1];
1035    //Rotate[2][1] = 0 + temp[2] * axis[1] - s * axis[0];
1036    //Rotate[2][2] = c + temp[2] * axis[2];
1037	__m128 Axis2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(2, 2, 2, 2));
1038	__m128 TmpC0 = _mm_mul_ps(Axis2, AxisC);
1039	__m128 CosA2 = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(1, 0, 1, 1));
1040	__m128 TmpC1 = _mm_add_ps(CosA2, TmpC0);
1041	__m128 SinC0 = SinA;//_mm_set_ps(s, -s, 0.0f, 0.0f);
1042	__m128 TmpC2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(3, 3, 0, 1));
1043	__m128 TmpC3 = _mm_mul_ps(SinA0, TmpC2);
1044	__m128 TmpC4 = _mm_add_ps(TmpC1, TmpC3);
1045
1046	__m128 Result[4];
1047	Result[0] = TmpA4;
1048	Result[1] = TmpB4;
1049	Result[2] = TmpC4;
1050	Result[3] = _mm_set_ps(1, 0, 0, 0);
1051
1052	//detail::tmat4x4<valType> Result(detail::tmat4x4<valType>::_null);
1053	//Result[0] = m[0] * Rotate[0][0] + m[1] * Rotate[0][1] + m[2] * Rotate[0][2];
1054	//Result[1] = m[0] * Rotate[1][0] + m[1] * Rotate[1][1] + m[2] * Rotate[1][2];
1055	//Result[2] = m[0] * Rotate[2][0] + m[1] * Rotate[2][1] + m[2] * Rotate[2][2];
1056	//Result[3] = m[3];
1057	//return Result;
1058	sse_mul_ps(in, Result, out);
1059}
1060*/
// Outer product of a column vector 'c' and a row vector 'r': out[j] is the
// j-th column of the resulting 4x4 matrix, i.e. c scaled by lane j of r.
GLM_FUNC_QUALIFIER void sse_outer_ps(__m128 const & c, __m128 const & r, __m128 out[4])
{
	// Broadcast each lane of r across a full register, then scale c by it.
	__m128 const RowX = _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 const RowY = _mm_shuffle_ps(r, r, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 const RowZ = _mm_shuffle_ps(r, r, _MM_SHUFFLE(2, 2, 2, 2));
	__m128 const RowW = _mm_shuffle_ps(r, r, _MM_SHUFFLE(3, 3, 3, 3));

	out[0] = _mm_mul_ps(c, RowX);
	out[1] = _mm_mul_ps(c, RowY);
	out[2] = _mm_mul_ps(c, RowZ);
	out[3] = _mm_mul_ps(c, RowW);
}
1068
1069}//namespace detail
1070}//namespace glm
1071