• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23 #pragma once
24 
25 #include "simdlib_types.hpp"
26 
27 // For documentation, please see the following include...
28 // #include "simdlib_interface.hpp"
29 
30 namespace SIMDImpl
31 {
    // 128-bit SIMD backends.  Each *Impl struct below gets its members by
    // textually including an .inl file inside the struct body, and each newer
    // ISA level derives from the previous one.
    namespace SIMD128Impl
    {
#if SIMD_ARCH >= SIMD_ARCH_AVX
        // Base level: contents come from simdlib_128_avx.inl.
        struct AVXImpl
        {
            // The macro is defined only for the duration of the include
            // (presumably the .inl tests it to reject stray inclusion - TODO confirm).
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_128_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
        }; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX


#if SIMD_ARCH >= SIMD_ARCH_AVX2
        // AVX2 level: inherits everything from AVXImpl and adds the
        // declarations found in simdlib_128_avx2.inl.
        struct AVX2Impl : AVXImpl
        {
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_128_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
        }; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2

#if SIMD_ARCH >= SIMD_ARCH_AVX512
        // AVX512 level: inherits from AVX2Impl.  The AVX512-specific includes
        // are pulled in only when SIMD_OPT_128_AVX512 is defined; without it
        // this struct has an empty body and simply forwards to AVX2Impl.
        struct AVX512Impl : AVX2Impl
        {
#if defined(SIMD_OPT_128_AVX512)
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_128_avx512.inl"
            // Exactly one of the two CPU-family specific files is included.
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_128_avx512_knights.inl"
#else // optimize for core
#include "simdlib_128_avx512_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
#endif // SIMD_OPT_128_AVX512
        }; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512

        // Trait bundle for the 128-bit width: extends SIMDImpl::Traits with the
        // backend selected by SIMD_ARCH and the 128-bit vector types.
        struct Traits : SIMDImpl::Traits
        {
            // IsaImpl is chosen by an exact match on SIMD_ARCH; any value other
            // than AVX, AVX2 or AVX512 stops the build.
#if SIMD_ARCH == SIMD_ARCH_AVX
            using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
            using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
            using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif

            // Vector types of this width; they are declared outside this file
            // (presumably in simdlib_types.hpp - TODO confirm).
            using Float     = SIMD128Impl::Float;
            using Double    = SIMD128Impl::Double;
            using Integer   = SIMD128Impl::Integer;
            using Vec4      = SIMD128Impl::Vec4;
            using Mask      = SIMD128Impl::Mask;
        };
    } // ns SIMD128Impl
88 
    // 256-bit SIMD backends.  Same layering as the 128-bit namespace: every
    // *Impl struct is filled by including an .inl file inside its body and
    // derives from the previous ISA level.
    namespace SIMD256Impl
    {
#if SIMD_ARCH >= SIMD_ARCH_AVX
        // Base level: contents come from simdlib_256_avx.inl.
        struct AVXImpl
        {
            // The macro is defined only for the duration of the include
            // (presumably the .inl tests it to reject stray inclusion - TODO confirm).
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_256_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
        }; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX


#if SIMD_ARCH >= SIMD_ARCH_AVX2
        // AVX2 level: inherits everything from AVXImpl and adds the
        // declarations found in simdlib_256_avx2.inl.
        struct AVX2Impl : AVXImpl
        {
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_256_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
        }; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2

#if SIMD_ARCH >= SIMD_ARCH_AVX512
        // AVX512 level: inherits from AVX2Impl.  The AVX512-specific includes
        // are pulled in only when SIMD_OPT_256_AVX512 is defined; without it
        // this struct has an empty body and simply forwards to AVX2Impl.
        struct AVX512Impl : AVX2Impl
        {
#if defined(SIMD_OPT_256_AVX512)
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_256_avx512.inl"
            // Exactly one of the two CPU-family specific files is included.
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_256_avx512_knights.inl"
#else // optimize for core
#include "simdlib_256_avx512_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
#endif // SIMD_OPT_256_AVX512
        }; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512

        // Trait bundle for the 256-bit width: extends SIMDImpl::Traits with the
        // backend selected by SIMD_ARCH and the 256-bit vector types.
        struct Traits : SIMDImpl::Traits
        {
            // IsaImpl is chosen by an exact match on SIMD_ARCH; any value other
            // than AVX, AVX2 or AVX512 stops the build.
#if SIMD_ARCH == SIMD_ARCH_AVX
            using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
            using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
            using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif

            // Vector types of this width; they are declared outside this file
            // (presumably in simdlib_types.hpp - TODO confirm).
            using Float     = SIMD256Impl::Float;
            using Double    = SIMD256Impl::Double;
            using Integer   = SIMD256Impl::Integer;
            using Vec4      = SIMD256Impl::Vec4;
            using Mask      = SIMD256Impl::Mask;
        };
    } // ns SIMD256Impl
145 
    // 512-bit SIMD backends.  Below AVX512 the implementation is the
    // AVXImplBase template parameterised on a 256-bit backend; at AVX512 a
    // dedicated struct layers the avx512 .inl files on top of that template.
    namespace SIMD512Impl
    {
#if SIMD_ARCH >= SIMD_ARCH_AVX
        // Generic 512-bit implementation whose members come from the "emu"
        // .inl files.  SIMD256T is the 256-bit backend made available to that
        // included code (presumably each 512-bit op is emulated with 256-bit
        // ops from SIMD256T - TODO confirm against simdlib_512_emu.inl).
        template<typename SIMD256T>
        struct AVXImplBase
        {
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_512_emu.inl"
#include "simdlib_512_emu_masks.inl"
#undef __SIMD_LIB_AVX_HPP__
        }; // struct AVXImplBase
        // AVX level: the generic implementation over the 256-bit AVX backend.
        using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX


#if SIMD_ARCH >= SIMD_ARCH_AVX2
        // AVX2 level: the same generic implementation over the 256-bit AVX2 backend.
        using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2


#if SIMD_ARCH >= SIMD_ARCH_AVX512
        // AVX512 level: derives from the generic implementation (over the
        // 256-bit AVX512 backend) and adds the declarations from the avx512
        // .inl files.  Unlike the 128/256-bit namespaces, these includes are
        // not gated by a SIMD_OPT_* macro.
        struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
        {
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_512_avx512.inl"
#include "simdlib_512_avx512_masks.inl"
            // Exactly one CPU-family specific pair of files is included.
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_512_avx512_knights.inl"
#include "simdlib_512_avx512_masks_knights.inl"
#else // optimize for core
#include "simdlib_512_avx512_core.inl"
#include "simdlib_512_avx512_masks_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
        }; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512

        // Trait bundle for the 512-bit width: extends SIMDImpl::Traits with the
        // backend selected by SIMD_ARCH and the 512-bit vector types.
        struct Traits : SIMDImpl::Traits
        {
            // IsaImpl is chosen by an exact match on SIMD_ARCH; any value other
            // than AVX, AVX2 or AVX512 stops the build.
#if SIMD_ARCH == SIMD_ARCH_AVX
            using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
            using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
            using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif

            // Vector types of this width; they are declared outside this file
            // (presumably in simdlib_types.hpp - TODO confirm).
            using Float     = SIMD512Impl::Float;
            using Double    = SIMD512Impl::Double;
            using Integer   = SIMD512Impl::Integer;
            using Vec4      = SIMD512Impl::Vec4;
            using Mask      = SIMD512Impl::Mask;
        };
    } // ns SIMD512Impl
202 } // ns SIMDImpl
203 
204 template <typename Traits>
205 struct SIMDBase : Traits::IsaImpl
206 {
207     using CompareType   = typename Traits::CompareType;
208     using ScaleFactor   = typename Traits::ScaleFactor;
209     using RoundMode     = typename Traits::RoundMode;
210     using SIMD          = typename Traits::IsaImpl;
211     using Float         = typename Traits::Float;
212     using Double        = typename Traits::Double;
213     using Integer       = typename Traits::Integer;
214     using Vec4          = typename Traits::Vec4;
215     using Mask          = typename Traits::Mask;
216 
217     static const size_t VECTOR_BYTES = sizeof(Float);
218 
219     // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
220     static SIMDINLINE
vec4_load1_psSIMDBase221     void vec4_load1_ps(Vec4& r, const float *p)
222     {
223         r[0] = SIMD::set1_ps(p[0]);
224         r[1] = SIMD::set1_ps(p[1]);
225         r[2] = SIMD::set1_ps(p[2]);
226         r[3] = SIMD::set1_ps(p[3]);
227     }
228 
229     static SIMDINLINE
vec4_set1_vpsSIMDBase230     void vec4_set1_vps(Vec4& r, Float const &s)
231     {
232         r[0] = s;
233         r[1] = s;
234         r[2] = s;
235         r[3] = s;
236     }
237 
238     static SIMDINLINE
vec4_dp3_psSIMDBase239     Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
240     {
241         Float tmp, r;
242         r   = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
243 
244         tmp = SIMD::mul_ps(v0[1], v1[1]);     // (v0.y*v1.y)
245         r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y)
246 
247         tmp = SIMD::mul_ps(v0[2], v1[2]);     // (v0.z*v1.z)
248         r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
249 
250         return r;
251     }
252 
253     static SIMDINLINE
vec4_dp4_psSIMDBase254     Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
255     {
256         Float tmp, r;
257         r   = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
258 
259         tmp = SIMD::mul_ps(v0[1], v1[1]);     // (v0.y*v1.y)
260         r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y)
261 
262         tmp = SIMD::mul_ps(v0[2], v1[2]);     // (v0.z*v1.z)
263         r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
264 
265         tmp = SIMD::mul_ps(v0[3], v1[3]);     // (v0.w*v1.w)
266         r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
267 
268         return r;
269     }
270 
271     static SIMDINLINE
vec4_rcp_length_psSIMDBase272     Float vec4_rcp_length_ps(const Vec4& v)
273     {
274         Float length = vec4_dp4_ps(v, v);
275         return SIMD::rsqrt_ps(length);
276     }
277 
278     static SIMDINLINE
vec4_normalize_psSIMDBase279     void vec4_normalize_ps(Vec4& r, const Vec4& v)
280     {
281         Float rcpLength = vec4_rcp_length_ps(v);
282 
283         r[0] = SIMD::mul_ps(v[0], rcpLength);
284         r[1] = SIMD::mul_ps(v[1], rcpLength);
285         r[2] = SIMD::mul_ps(v[2], rcpLength);
286         r[3] = SIMD::mul_ps(v[3], rcpLength);
287     }
288 
289     static SIMDINLINE
vec4_mul_psSIMDBase290     void vec4_mul_ps(Vec4& r, const Vec4& v, Float const &s)
291     {
292         r[0] = SIMD::mul_ps(v[0], s);
293         r[1] = SIMD::mul_ps(v[1], s);
294         r[2] = SIMD::mul_ps(v[2], s);
295         r[3] = SIMD::mul_ps(v[3], s);
296     }
297 
298     static SIMDINLINE
vec4_mul_psSIMDBase299     void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
300     {
301         r[0] = SIMD::mul_ps(v0[0], v1[0]);
302         r[1] = SIMD::mul_ps(v0[1], v1[1]);
303         r[2] = SIMD::mul_ps(v0[2], v1[2]);
304         r[3] = SIMD::mul_ps(v0[3], v1[3]);
305     }
306 
307     static SIMDINLINE
vec4_add_psSIMDBase308     void vec4_add_ps(Vec4& r, const Vec4& v0, Float const &s)
309     {
310         r[0] = SIMD::add_ps(v0[0], s);
311         r[1] = SIMD::add_ps(v0[1], s);
312         r[2] = SIMD::add_ps(v0[2], s);
313         r[3] = SIMD::add_ps(v0[3], s);
314     }
315 
316     static SIMDINLINE
vec4_add_psSIMDBase317     void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
318     {
319         r[0] = SIMD::add_ps(v0[0], v1[0]);
320         r[1] = SIMD::add_ps(v0[1], v1[1]);
321         r[2] = SIMD::add_ps(v0[2], v1[2]);
322         r[3] = SIMD::add_ps(v0[3], v1[3]);
323     }
324 
325     static SIMDINLINE
vec4_min_psSIMDBase326     void vec4_min_ps(Vec4& r, const Vec4& v0, Float const &s)
327     {
328         r[0] = SIMD::min_ps(v0[0], s);
329         r[1] = SIMD::min_ps(v0[1], s);
330         r[2] = SIMD::min_ps(v0[2], s);
331         r[3] = SIMD::min_ps(v0[3], s);
332     }
333 
334     static SIMDINLINE
vec4_max_psSIMDBase335     void vec4_max_ps(Vec4& r, const Vec4& v0, Float const &s)
336     {
337         r[0] = SIMD::max_ps(v0[0], s);
338         r[1] = SIMD::max_ps(v0[1], s);
339         r[2] = SIMD::max_ps(v0[2], s);
340         r[3] = SIMD::max_ps(v0[3], s);
341     }
342 
343     // Matrix4x4 * Vector4
344     //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
345     //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
346     //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
347     //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
348     static SIMDINLINE
mat4x4_vec4_multiplySIMDBase349     void SIMDCALL mat4x4_vec4_multiply(
350         Vec4& result,
351         const float *pMatrix,
352         const Vec4& v)
353     {
354         Float m;
355         Float r0;
356         Float r1;
357 
358         m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
359         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
360         m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
361         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
362         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
363         m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
364         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
365         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
366         m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
367         r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
368         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
369         result[0] = r0;
370 
371         m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
372         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
373         m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
374         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
375         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
376         m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
377         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
378         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
379         m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
380         r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
381         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
382         result[1] = r0;
383 
384         m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
385         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
386         m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
387         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
388         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
389         m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
390         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
391         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
392         m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
393         r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
394         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
395         result[2] = r0;
396 
397         m   = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
398         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
399         m   = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
400         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
401         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
402         m   = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
403         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
404         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
405         m   = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
406         r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
407         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
408         result[3] = r0;
409     }
410 
411     // Matrix4x4 * Vector3 - Direction Vector where w = 0.
412     //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
413     //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
414     //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
415     //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
416     static SIMDINLINE
mat3x3_vec3_w0_multiplySIMDBase417     void SIMDCALL mat3x3_vec3_w0_multiply(
418         Vec4& result,
419         const float *pMatrix,
420         const Vec4& v)
421     {
422         Float m;
423         Float r0;
424         Float r1;
425 
426         m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
427         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
428         m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
429         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
430         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
431         m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
432         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
433         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
434         result[0] = r0;
435 
436         m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
437         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
438         m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
439         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
440         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
441         m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
442         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
443         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
444         result[1] = r0;
445 
446         m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
447         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
448         m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
449         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
450         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
451         m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
452         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
453         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
454         result[2] = r0;
455 
456         result[3] = SIMD::setzero_ps();
457     }
458 
459     // Matrix4x4 * Vector3 - Position vector where w = 1.
460     //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
461     //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
462     //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
463     //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
464     static SIMDINLINE
mat4x4_vec3_w1_multiplySIMDBase465     void SIMDCALL mat4x4_vec3_w1_multiply(
466         Vec4& result,
467         const float *pMatrix,
468         const Vec4& v)
469     {
470         Float m;
471         Float r0;
472         Float r1;
473 
474         m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
475         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
476         m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
477         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
478         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
479         m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
480         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
481         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
482         m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
483         r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
484         result[0] = r0;
485 
486         m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
487         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
488         m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
489         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
490         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
491         m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
492         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
493         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
494         m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
495         r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
496         result[1] = r0;
497 
498         m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
499         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
500         m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
501         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
502         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
503         m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
504         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
505         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
506         m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
507         r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
508         result[2] = r0;
509 
510         m   = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
511         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
512         m   = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
513         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
514         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
515         m   = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
516         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
517         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
518         m   = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
519         result[3] = SIMD::add_ps(r0, m);        // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
520     }
521 
522     static SIMDINLINE
mat4x3_vec3_w1_multiplySIMDBase523     void SIMDCALL mat4x3_vec3_w1_multiply(
524         Vec4& result,
525         const float *pMatrix,
526         const Vec4& v)
527     {
528         Float m;
529         Float r0;
530         Float r1;
531 
532         m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
533         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
534         m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
535         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
536         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
537         m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
538         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
539         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
540         m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
541         r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
542         result[0] = r0;
543 
544         m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
545         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
546         m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
547         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
548         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
549         m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
550         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
551         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
552         m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
553         r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
554         result[1] = r0;
555 
556         m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
557         r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
558         m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
559         r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
560         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
561         m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
562         r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
563         r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
564         m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
565         r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
566         result[2] = r0;
567         result[3] = SIMD::set1_ps(1.0f);
568     }
569 }; // struct SIMDBase
570 
// Ready-to-use front ends, one per vector width.  Each instantiates SIMDBase
// with the Traits of the matching width namespace, whose IsaImpl is selected
// at compile time from SIMD_ARCH.
using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
574