/****************************************************************************
 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 ****************************************************************************/
#pragma once

#include "simdlib_types.hpp"

// For documentation, please see the following include...
// #include "simdlib_interface.hpp"

namespace SIMDImpl
{
    namespace SIMD128Impl
    {
#if SIMD_ARCH >= SIMD_ARCH_AVX
        struct AVXImpl
        {
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_128_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
        }; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX

#if SIMD_ARCH >= SIMD_ARCH_AVX2
        struct AVX2Impl : AVXImpl
        {
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_128_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
        }; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2

#if SIMD_ARCH >= SIMD_ARCH_AVX512
        struct AVX512Impl : AVX2Impl
        {
#if defined(SIMD_OPT_128_AVX512)
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_128_avx512.inl"
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_128_avx512_knights.inl"
#else // optimize for core
#include "simdlib_128_avx512_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
#endif // SIMD_OPT_128_AVX512
        }; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512

        struct Traits : SIMDImpl::Traits
        {
#if SIMD_ARCH == SIMD_ARCH_AVX
            using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
            using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
            using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif

            using Float = SIMD128Impl::Float;
            using Double = SIMD128Impl::Double;
            using Integer = SIMD128Impl::Integer;
            using Vec4 = SIMD128Impl::Vec4;
            using Mask = SIMD128Impl::Mask;
        };
    } // ns SIMD128Impl

    namespace SIMD256Impl
    {
#if SIMD_ARCH >= SIMD_ARCH_AVX
        struct AVXImpl
        {
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_256_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
        }; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX

#if SIMD_ARCH >= SIMD_ARCH_AVX2
        struct AVX2Impl : AVXImpl
        {
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_256_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
        }; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2

#if SIMD_ARCH >= SIMD_ARCH_AVX512
        struct AVX512Impl : AVX2Impl
        {
#if defined(SIMD_OPT_256_AVX512)
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_256_avx512.inl"
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_256_avx512_knights.inl"
#else // optimize for core
#include "simdlib_256_avx512_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
#endif // SIMD_OPT_256_AVX512
        }; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512

        struct Traits : SIMDImpl::Traits
        {
#if SIMD_ARCH == SIMD_ARCH_AVX
            using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
            using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
            using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif

            using Float = SIMD256Impl::Float;
            using Double = SIMD256Impl::Double;
            using Integer = SIMD256Impl::Integer;
            using Vec4 = SIMD256Impl::Vec4;
            using Mask = SIMD256Impl::Mask;
        };
    } // ns SIMD256Impl

    namespace SIMD512Impl
    {
#if SIMD_ARCH >= SIMD_ARCH_AVX
        template<typename SIMD256T>
        struct AVXImplBase
        {
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_512_emu.inl"
#include "simdlib_512_emu_masks.inl"
#undef __SIMD_LIB_AVX_HPP__
        }; // struct AVXImplBase
        using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX

#if SIMD_ARCH >= SIMD_ARCH_AVX2
        using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2

#if SIMD_ARCH >= SIMD_ARCH_AVX512
        struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
        {
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_512_avx512.inl"
#include "simdlib_512_avx512_masks.inl"
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_512_avx512_knights.inl"
#include "simdlib_512_avx512_masks_knights.inl"
#else // optimize for core
#include "simdlib_512_avx512_core.inl"
#include "simdlib_512_avx512_masks_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
        }; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512

        struct Traits : SIMDImpl::Traits
        {
#if SIMD_ARCH == SIMD_ARCH_AVX
            using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
            using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
            using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif

            using Float = SIMD512Impl::Float;
            using Double = SIMD512Impl::Double;
            using Integer = SIMD512Impl::Integer;
            using Vec4 = SIMD512Impl::Vec4;
            using Mask = SIMD512Impl::Mask;
        };
    } // ns SIMD512Impl
} // ns SIMDImpl

template <typename Traits>
struct SIMDBase : Traits::IsaImpl
{
    using CompareType = typename Traits::CompareType;
    using ScaleFactor = typename Traits::ScaleFactor;
    using RoundMode = typename Traits::RoundMode;
    using SIMD = typename Traits::IsaImpl;
    using Float = typename Traits::Float;
    using Double = typename Traits::Double;
    using Integer = typename Traits::Integer;
    using Vec4 = typename Traits::Vec4;
    using Mask = typename Traits::Mask;

    static const size_t VECTOR_BYTES = sizeof(Float);

    // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
    static SIMDINLINE
    void vec4_load1_ps(Vec4& r, const float *p)
    {
        r[0] = SIMD::set1_ps(p[0]);
        r[1] = SIMD::set1_ps(p[1]);
        r[2] = SIMD::set1_ps(p[2]);
        r[3] = SIMD::set1_ps(p[3]);
    }

    static SIMDINLINE
    void vec4_set1_vps(Vec4& r, Float const &s)
    {
        r[0] = s;
        r[1] = s;
        r[2] = s;
        r[3] = s;
    }

    static SIMDINLINE
    Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
    {
        Float tmp, r;
        r = SIMD::mul_ps(v0[0], v1[0]);    // (v0.x*v1.x)

        tmp = SIMD::mul_ps(v0[1], v1[1]);  // (v0.y*v1.y)
        r = SIMD::add_ps(r, tmp);          // (v0.x*v1.x) + (v0.y*v1.y)

        tmp = SIMD::mul_ps(v0[2], v1[2]);  // (v0.z*v1.z)
        r = SIMD::add_ps(r, tmp);          // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)

        return r;
    }

    static SIMDINLINE
    Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
    {
        Float tmp, r;
        r = SIMD::mul_ps(v0[0], v1[0]);    // (v0.x*v1.x)

        tmp = SIMD::mul_ps(v0[1], v1[1]);  // (v0.y*v1.y)
        r = SIMD::add_ps(r, tmp);          // (v0.x*v1.x) + (v0.y*v1.y)

        tmp = SIMD::mul_ps(v0[2], v1[2]);  // (v0.z*v1.z)
        r = SIMD::add_ps(r, tmp);          // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)

        tmp = SIMD::mul_ps(v0[3], v1[3]);  // (v0.w*v1.w)
        r = SIMD::add_ps(r, tmp);          // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)

        return r;
    }

    static SIMDINLINE
    Float vec4_rcp_length_ps(const Vec4& v)
    {
        Float length = vec4_dp4_ps(v, v);
        return SIMD::rsqrt_ps(length);
    }

    static SIMDINLINE
    void vec4_normalize_ps(Vec4& r, const Vec4& v)
    {
        Float rcpLength = vec4_rcp_length_ps(v);

        r[0] = SIMD::mul_ps(v[0], rcpLength);
        r[1] = SIMD::mul_ps(v[1], rcpLength);
        r[2] = SIMD::mul_ps(v[2], rcpLength);
        r[3] = SIMD::mul_ps(v[3], rcpLength);
    }

    static SIMDINLINE
    void vec4_mul_ps(Vec4& r, const Vec4& v, Float const &s)
    {
        r[0] = SIMD::mul_ps(v[0], s);
        r[1] = SIMD::mul_ps(v[1], s);
        r[2] = SIMD::mul_ps(v[2], s);
        r[3] = SIMD::mul_ps(v[3], s);
    }

    static SIMDINLINE
    void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
    {
        r[0] = SIMD::mul_ps(v0[0], v1[0]);
        r[1] = SIMD::mul_ps(v0[1], v1[1]);
        r[2] = SIMD::mul_ps(v0[2], v1[2]);
        r[3] = SIMD::mul_ps(v0[3], v1[3]);
    }

    static SIMDINLINE
    void vec4_add_ps(Vec4& r, const Vec4& v0, Float const &s)
    {
        r[0] = SIMD::add_ps(v0[0], s);
        r[1] = SIMD::add_ps(v0[1], s);
        r[2] = SIMD::add_ps(v0[2], s);
        r[3] = SIMD::add_ps(v0[3], s);
    }

    static SIMDINLINE
    void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
    {
        r[0] = SIMD::add_ps(v0[0], v1[0]);
        r[1] = SIMD::add_ps(v0[1], v1[1]);
        r[2] = SIMD::add_ps(v0[2], v1[2]);
        r[3] = SIMD::add_ps(v0[3], v1[3]);
    }

    static SIMDINLINE
    void vec4_min_ps(Vec4& r, const Vec4& v0, Float const &s)
    {
        r[0] = SIMD::min_ps(v0[0], s);
        r[1] = SIMD::min_ps(v0[1], s);
        r[2] = SIMD::min_ps(v0[2], s);
        r[3] = SIMD::min_ps(v0[3], s);
    }

    static SIMDINLINE
    void vec4_max_ps(Vec4& r, const Vec4& v0, Float const &s)
    {
        r[0] = SIMD::max_ps(v0[0], s);
        r[1] = SIMD::max_ps(v0[1], s);
        r[2] = SIMD::max_ps(v0[2], s);
        r[3] = SIMD::max_ps(v0[3], s);
    }
    // Matrix4x4 * Vector4
    // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
    // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
    // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
    // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
    static SIMDINLINE
    void SIMDCALL mat4x4_vec4_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        Float m;
        Float r0;
        Float r1;

        m = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
        r1 = SIMD::mul_ps(m, v[3]);             // (m3 * v.w)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
        result[0] = r0;

        m = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
        r1 = SIMD::mul_ps(m, v[3]);             // (m3 * v.w)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
        result[1] = r0;

        m = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
        r1 = SIMD::mul_ps(m, v[3]);             // (m3 * v.w)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
        result[2] = r0;

        m = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
        r1 = SIMD::mul_ps(m, v[3]);             // (m3 * v.w)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
        result[3] = r0;
    }
    // Matrix4x4 * Vector3 - Direction vector where w = 0. Only the upper 3x3
    // part of the matrix contributes; the output w component is set to 0.
    // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
    // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
    // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
    // outVec.w = 0
    static SIMDINLINE
    void SIMDCALL mat3x3_vec3_w0_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        Float m;
        Float r0;
        Float r1;

        m = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        result[0] = r0;

        m = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        result[1] = r0;

        m = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        result[2] = r0;

        result[3] = SIMD::setzero_ps();
    }
    // Matrix4x4 * Vector3 - Position vector where w = 1.
    // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
    // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
    // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
    // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
    static SIMDINLINE
    void SIMDCALL mat4x4_vec3_w1_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        Float m;
        Float r0;
        Float r1;

        m = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
        r0 = SIMD::add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
        result[0] = r0;

        m = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
        r0 = SIMD::add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
        result[1] = r0;

        m = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
        r0 = SIMD::add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
        result[2] = r0;

        m = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
        result[3] = SIMD::add_ps(r0, m);        // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    }

    // Matrix4x3 * Vector3 - Position vector where w = 1; the output w component is set to 1.
    static SIMDINLINE
    void SIMDCALL mat4x3_vec3_w1_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        Float m;
        Float r0;
        Float r1;

        m = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
        r0 = SIMD::add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
        result[0] = r0;

        m = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
        r0 = SIMD::add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
        result[1] = r0;

        m = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);             // (m0 * v.x)
        m = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);             // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
        m = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);             // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
        r0 = SIMD::add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
        result[2] = r0;
        result[3] = SIMD::set1_ps(1.0f);
    }
}; // struct SIMDBase

using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
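
// Usage sketch (illustrative only, not part of the original interface): shows how the
// Vec4 helpers above compose. It assumes the translation unit builds with SIMD_ARCH set
// to at least SIMD_ARCH_AVX, so that SIMD256 resolves to a valid IsaImpl; the function
// and parameter names below are placeholders, not library API.
static SIMDINLINE void Example_TransformPoint(
    SIMD256::Vec4& result,          // transformed point, one component per SIMD register
    const float*   pRowMajorMatrix, // 16 floats, row-major 4x4 matrix
    const float*   pXYZW)           // one point: x, y, z, w
{
    SIMD256::Vec4 p;
    SIMD256::vec4_load1_ps(p, pXYZW);                          // p = xxxx yyyy zzzz wwww
    SIMD256::mat4x4_vec4_multiply(result, pRowMajorMatrix, p); // result = M * p, per lane
}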