// This file is part of the ustl library, an STL implementation.
//
// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.
//
// ulaalgo.h
//

#ifndef ULAALGO_H_2E403D182E83FB596AFB800E68B255A1
#define ULAALGO_H_2E403D182E83FB596AFB800E68B255A1

#include "umatrix.h"
#include "simd.h"

namespace ustl {

/// \brief Creates an identity matrix in \p m
/// \ingroup NumericAlgorithms
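///
/// A minimal usage sketch (illustrative, not from the original docs):
/// \code
///     matrix<4,4,float> m;
///     load_identity (m);	// m is now the 4x4 identity matrix
/// \endcode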
template <size_t NX, size_t NY, typename T>
void load_identity (matrix<NX,NY,T>& m)
{
    fill_n (m.begin(), NX * NY, 0);
    for (typename matrix<NX,NY,T>::iterator i = m.begin(); i < m.end(); i += NX + 1)
        *i = 1;
}

/// \brief Multiplies two matrices
/// \ingroup NumericAlgorithms
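///
/// matrix<NX,NY,T> stores NY rows of NX columns (see the loop bounds below),
/// so multiplying the NY-row \p m1 by the NX-row \p m2 yields a square
/// NY-by-NY result. A minimal usage sketch (illustrative):
/// \code
///     matrix<4,4,float> a, b;
///     load_identity (a);
///     load_identity (b);
///     matrix<4,4,float> c = a * b;	// identity * identity == identity
/// \endcode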
template <size_t NX, size_t NY, typename T>
matrix<NY,NY,T> operator* (const matrix<NX,NY,T>& m1, const matrix<NY,NX,T>& m2)
{
    matrix<NY,NY,T> mr;
    for (uoff_t ry = 0; ry < NY; ++ ry) {
        for (uoff_t rx = 0; rx < NY; ++ rx) {
            T dpv (0);
            for (uoff_t x = 0; x < NX; ++ x)
                dpv += m1[ry][x] * m2[x][rx];
            mr[ry][rx] = dpv;
        }
    }
    return (mr);
}

/// \brief Transforms vector \p t with matrix \p m
/// \ingroup NumericAlgorithms
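///
/// \p t is treated as a row vector: tr[x] is the dot product of \p t with
/// column \p x of \p m. A minimal usage sketch (illustrative):
/// \code
///     matrix<4,4,float> m;
///     load_identity (m);
///     tuple<4,float> v;
///     v[0] = 1; v[1] = 2; v[2] = 3; v[3] = 1;
///     tuple<4,float> vt = v * m;	// identity transform: vt == v
/// \endcode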
template <size_t NX, size_t NY, typename T>
tuple<NX,T> operator* (const tuple<NY,T>& t, const matrix<NX,NY,T>& m)
{
    tuple<NX,T> tr;
    for (uoff_t x = 0; x < NX; ++ x) {
        T dpv (0);
        for (uoff_t y = 0; y < NY; ++ y)
            dpv += t[y] * m[y][x];
        tr[x] = dpv;
    }
    return (tr);
}

/// \brief Transposes (exchanges rows and columns) matrix \p m.
/// \ingroup NumericAlgorithms
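///
/// Only the square specialization is provided, since an in-place swap
/// preserves the dimensions. A minimal usage sketch (illustrative):
/// \code
///     matrix<3,3,int> m;
///     load_identity (m);
///     m[0][1] = 5;
///     transpose (m);	// now m[1][0] == 5 and m[0][1] == 0
/// \endcode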
template <size_t N, typename T>
void transpose (matrix<N,N,T>& m)
{
    for (uoff_t x = 0; x < N; ++ x)
        for (uoff_t y = x; y < N; ++ y)
            swap (m[x][y], m[y][x]);
}

#if WANT_UNROLLED_COPY

#if CPU_HAS_SSE

#if __linux__ // Non-linux gcc versions (BSD, Solaris) can't handle "x" constraint and provide no alternative.
template <>
inline void load_identity (matrix<4,4,float>& m)
{
    asm (
        "movaps %4, %%xmm1 \n\t"        // 1 0 0 0
        "movups %4, %0 \n\t"            // 1 0 0 0
        "shufps $0xB1,%%xmm1,%%xmm1 \n\t" // 0 1 0 0
        "movups %%xmm1, %1 \n\t"        // 0 1 0 0
        "shufps $0x4F,%4,%%xmm1 \n\t"   // 0 0 1 0
        "shufps $0x1B,%4,%4 \n\t"       // 0 0 0 1
        "movups %%xmm1, %2 \n\t"        // 0 0 1 0
        "movups %4, %3"                 // 0 0 0 1
        : "=m"(m[0][0]), "=m"(m[1][0]), "=m"(m[2][0]), "=m"(m[3][0])
        : "x"(1.0f)
        : "xmm1"
    );
}
#endif
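// The SSE specializations below share a simple register protocol:
// _sse_load_matrix loads the four rows of a 4x4 float matrix into
// xmm4-xmm7, the caller places the vector to transform in xmm0, and
// _sse_transform_to_vector leaves the product in xmm0 and stores it
// to the given result pointer.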
inline void _sse_load_matrix (const float* m)
{
    asm (
        "movups %0, %%xmm4 \n\t"        // xmm4 = m[0 1 2 3]
        "movups %1, %%xmm5 \n\t"        // xmm5 = m[4 5 6 7]
        "movups %2, %%xmm6 \n\t"        // xmm6 = m[8 9 10 11]
        "movups %3, %%xmm7"             // xmm7 = m[12 13 14 15]
        : : "m"(m[0]), "m"(m[4]), "m"(m[8]), "m"(m[12])
        : "xmm4", "xmm5", "xmm6", "xmm7"
    );
}

inline void _sse_transform_to_vector (float* result)
{
    asm (
        "movaps %%xmm0, %%xmm1 \n\t"    // xmm1 = t[0 1 2 3]
        "movaps %%xmm0, %%xmm2 \n\t"    // xmm2 = t[0 1 2 3]
        "movaps %%xmm0, %%xmm3 \n\t"    // xmm3 = t[0 1 2 3]
        "shufps $0x00, %%xmm0, %%xmm0 \n\t" // xmm0 = t[0 0 0 0]
        "shufps $0x55, %%xmm1, %%xmm1 \n\t" // xmm1 = t[1 1 1 1]
        "shufps $0xAA, %%xmm2, %%xmm2 \n\t" // xmm2 = t[2 2 2 2]
        "shufps $0xFF, %%xmm3, %%xmm3 \n\t" // xmm3 = t[3 3 3 3]
        "mulps %%xmm4, %%xmm0 \n\t"     // xmm0 = t[0 0 0 0] * m[0 1 2 3]
        "mulps %%xmm5, %%xmm1 \n\t"     // xmm1 = t[1 1 1 1] * m[4 5 6 7]
        "addps %%xmm1, %%xmm0 \n\t"     // xmm0 = xmm0 + xmm1
        "mulps %%xmm6, %%xmm2 \n\t"     // xmm2 = t[2 2 2 2] * m[8 9 10 11]
        "mulps %%xmm7, %%xmm3 \n\t"     // xmm3 = t[3 3 3 3] * m[12 13 14 15]
        "addps %%xmm3, %%xmm2 \n\t"     // xmm2 = xmm2 + xmm3
        "addps %%xmm2, %%xmm0 \n\t"     // xmm0 = result
        "movups %%xmm0, %0"
        : "=m"(result[0]) :
        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
    );
}

template <>
tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
{
    tuple<4,float> result;
    _sse_load_matrix (m.begin());
    asm ("movups %0, %%xmm0" : : "m"(t[0]) : "xmm0");
    _sse_transform_to_vector (result.begin());
    return (result);
}

template <>
matrix<4,4,float> operator* (const matrix<4,4,float>& m1, const matrix<4,4,float>& m2)
{
    matrix<4,4,float> result;
    _sse_load_matrix (m2.begin());
    for (uoff_t r = 0; r < 4; ++ r) {
        asm ("movups %0, %%xmm0" : : "m"(m1[r][0]) : "xmm0");
        _sse_transform_to_vector (result[r]);
    }
    return (result);
}
#elif CPU_HAS_3DNOW

/// Specialization for 4-component vector transform, the slow part of 3D graphics.
template <>
tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
{
    tuple<4,float> result;
    // This is taken from "AMD Athlon Code Optimization Guide" from AMD. 18 cycles!
    // If you are writing a 3D engine, you may want to copy it instead of calling it
    // because of the femms instruction at the end, which takes 2 cycles.
    asm (
        "movq %2, %%mm0 \n\t"           // y | x
        "movq %3, %%mm1 \n\t"           // w | z
        "movq %%mm0, %%mm2 \n\t"        // y | x
        "movq %4, %%mm3 \n\t"           // m[0][1] | m[0][0]
        "punpckldq %%mm0, %%mm0 \n\t"   // x | x
        "movq %6, %%mm4 \n\t"           // m[1][1] | m[1][0]
        "pfmul %%mm0, %%mm3 \n\t"       // x*m[0][1] | x*m[0][0]
        "punpckhdq %%mm2, %%mm2 \n\t"   // y | y
        "pfmul %%mm2, %%mm4 \n\t"       // y*m[1][1] | y*m[1][0]
        "movq %5, %%mm5 \n\t"           // m[0][3] | m[0][2]
        "movq %7, %%mm7 \n\t"           // m[1][3] | m[1][2]
        "movq %%mm1, %%mm6 \n\t"        // w | z
        "pfmul %%mm0, %%mm5 \n\t"       // x*m[0][3] | x*m[0][2]
        "movq %8, %%mm0 \n\t"           // m[2][1] | m[2][0]
        "punpckldq %%mm1, %%mm1 \n\t"   // z | z
        "pfmul %%mm2, %%mm7 \n\t"       // y*m[1][3] | y*m[1][2]
        "movq %9, %%mm2 \n\t"           // m[2][3] | m[2][2]
        "pfmul %%mm1, %%mm0 \n\t"       // z*m[2][1] | z*m[2][0]
        "pfadd %%mm4, %%mm3 \n\t"       // x*m[0][1]+y*m[1][1] | x*m[0][0]+y*m[1][0]
        "movq %10, %%mm4 \n\t"          // m[3][1] | m[3][0]
        "pfmul %%mm1, %%mm2 \n\t"       // z*m[2][3] | z*m[2][2]
        "pfadd %%mm7, %%mm5 \n\t"       // x*m[0][3]+y*m[1][3] | x*m[0][2]+y*m[1][2]
        "movq %11, %%mm1 \n\t"          // m[3][3] | m[3][2]
        "punpckhdq %%mm6, %%mm6 \n\t"   // w | w
        "pfadd %%mm0, %%mm3 \n\t"       // x*m[0][1]+y*m[1][1]+z*m[2][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]
        "pfmul %%mm6, %%mm4 \n\t"       // w*m[3][1] | w*m[3][0]
        "pfmul %%mm6, %%mm1 \n\t"       // w*m[3][3] | w*m[3][2]
        "pfadd %%mm2, %%mm5 \n\t"       // x*m[0][3]+y*m[1][3]+z*m[2][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]
        "pfadd %%mm4, %%mm3 \n\t"       // x*m[0][1]+y*m[1][1]+z*m[2][1]+w*m[3][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]+w*m[3][0]
        "movq %%mm3, %0 \n\t"           // store result->y | result->x
        "pfadd %%mm1, %%mm5 \n\t"       // x*m[0][3]+y*m[1][3]+z*m[2][3]+w*m[3][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]+w*m[3][2]
        "movq %%mm5, %1"                // store result->w | result->z
        : "=m"(result[0]), "=m"(result[2])
        : "m"(t[0]), "m"(t[2]),
          "m"(m[0][0]), "m"(m[0][2]),
          "m"(m[1][0]), "m"(m[1][2]),
          "m"(m[2][0]), "m"(m[2][2]),
          "m"(m[3][0]), "m"(m[3][2])
        : "mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7"
    );
    simd::reset_mmx();
    return (result);
}
#else // If no processor extensions, just unroll the multiplication

/// Specialization for 4-component vector transform, the slow part of 3D graphics.
template <>
tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
{
    tuple<4,float> tr;
    for (uoff_t i = 0; i < 4; ++ i)
        tr[i] = t[0] * m[0][i] + t[1] * m[1][i] + t[2] * m[2][i] + t[3] * m[3][i];
    return (tr);
}

#endif // CPU_HAS_3DNOW
#endif // WANT_UNROLLED_COPY

} // namespace ustl

#endif