1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include "ShaderCore.hpp" 16 17 #include "Renderer/Renderer.hpp" 18 #include "Common/Debug.hpp" 19 20 #include <limits.h> 21 22 namespace sw 23 { 24 extern TranscendentalPrecision logPrecision; 25 extern TranscendentalPrecision expPrecision; 26 extern TranscendentalPrecision rcpPrecision; 27 extern TranscendentalPrecision rsqPrecision; 28 Vector4s()29 Vector4s::Vector4s() 30 { 31 } 32 Vector4s(unsigned short x,unsigned short y,unsigned short z,unsigned short w)33 Vector4s::Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w) 34 { 35 this->x = Short4(x); 36 this->y = Short4(y); 37 this->z = Short4(z); 38 this->w = Short4(w); 39 } 40 Vector4s(const Vector4s & rhs)41 Vector4s::Vector4s(const Vector4s &rhs) 42 { 43 x = rhs.x; 44 y = rhs.y; 45 z = rhs.z; 46 w = rhs.w; 47 } 48 operator =(const Vector4s & rhs)49 Vector4s &Vector4s::operator=(const Vector4s &rhs) 50 { 51 x = rhs.x; 52 y = rhs.y; 53 z = rhs.z; 54 w = rhs.w; 55 56 return *this; 57 } 58 operator [](int i)59 Short4 &Vector4s::operator[](int i) 60 { 61 switch(i) 62 { 63 case 0: return x; 64 case 1: return y; 65 case 2: return z; 66 case 3: return w; 67 } 68 69 return x; 70 } 71 Vector4f()72 Vector4f::Vector4f() 73 { 74 } 75 Vector4f(float x,float y,float z,float w)76 Vector4f::Vector4f(float x, float y, float z, float w) 77 { 78 this->x = Float4(x); 79 this->y = Float4(y); 80 this->z = Float4(z); 81 this->w = Float4(w); 82 } 83 Vector4f(const Vector4f & rhs)84 Vector4f::Vector4f(const Vector4f &rhs) 85 { 86 x = rhs.x; 87 y = rhs.y; 88 z = rhs.z; 89 w = rhs.w; 90 } 91 operator =(const Vector4f & rhs)92 Vector4f &Vector4f::operator=(const Vector4f &rhs) 93 { 94 x = rhs.x; 95 y = rhs.y; 96 z = rhs.z; 97 w = rhs.w; 98 99 return *this; 100 } 101 operator [](int i)102 Float4 &Vector4f::operator[](int i) 103 { 104 switch(i) 105 { 106 case 0: return x; 107 case 1: return y; 108 case 2: return z; 109 case 3: return w; 110 } 111 112 return x; 113 } 114 exponential2(RValue<Float4> x,bool pp)115 Float4 exponential2(RValue<Float4> x, bool pp) 116 { 117 // This implementation is based on 2^(i + f) = 2^i * 2^f, 118 // where i is the integer part of x and f is the fraction. 119 120 // For 2^i we can put the integer part directly in the exponent of 121 // the IEEE-754 floating-point number. Clamp to prevent overflow 122 // past the representation of infinity. 123 Float4 x0 = x; 124 x0 = Min(x0, As<Float4>(Int4(0x43010000))); // 129.00000e+0f 125 x0 = Max(x0, As<Float4>(Int4(0xC2FDFFFF))); // -126.99999e+0f 126 127 Int4 i = RoundInt(x0 - Float4(0.5f)); 128 Float4 ii = As<Float4>((i + Int4(127)) << 23); // Add single-precision bias, and shift into exponent. 129 130 // For the fractional part use a polynomial 131 // which approximates 2^f in the 0 to 1 range. 132 Float4 f = x0 - Float4(i); 133 Float4 ff = As<Float4>(Int4(0x3AF61905)); // 1.8775767e-3f 134 ff = ff * f + As<Float4>(Int4(0x3C134806)); // 8.9893397e-3f 135 ff = ff * f + As<Float4>(Int4(0x3D64AA23)); // 5.5826318e-2f 136 ff = ff * f + As<Float4>(Int4(0x3E75EAD4)); // 2.4015361e-1f 137 ff = ff * f + As<Float4>(Int4(0x3F31727B)); // 6.9315308e-1f 138 ff = ff * f + Float4(1.0f); 139 140 return ii * ff; 141 } 142 logarithm2(RValue<Float4> x,bool absolute,bool pp)143 Float4 logarithm2(RValue<Float4> x, bool absolute, bool pp) 144 { 145 Float4 x0; 146 Float4 x1; 147 Float4 x2; 148 Float4 x3; 149 150 x0 = x; 151 152 x1 = As<Float4>(As<Int4>(x0) & Int4(0x7F800000)); 153 x1 = As<Float4>(As<UInt4>(x1) >> 8); 154 x1 = As<Float4>(As<Int4>(x1) | As<Int4>(Float4(1.0f))); 155 x1 = (x1 - Float4(1.4960938f)) * Float4(256.0f); // FIXME: (x1 - 1.4960938f) * 256.0f; 156 x0 = As<Float4>((As<Int4>(x0) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f))); 157 158 x2 = (Float4(9.5428179e-2f) * x0 + Float4(4.7779095e-1f)) * x0 + Float4(1.9782813e-1f); 159 x3 = ((Float4(1.6618466e-2f) * x0 + Float4(2.0350508e-1f)) * x0 + Float4(2.7382900e-1f)) * x0 + Float4(4.0496687e-2f); 160 x2 /= x3; 161 162 x1 += (x0 - Float4(1.0f)) * x2; 163 164 Int4 pos_inf_x = CmpEQ(As<Int4>(x), Int4(0x7F800000)); 165 return As<Float4>((pos_inf_x & As<Int4>(x)) | (~pos_inf_x & As<Int4>(x1))); 166 } 167 exponential(RValue<Float4> x,bool pp)168 Float4 exponential(RValue<Float4> x, bool pp) 169 { 170 // FIXME: Propagate the constant 171 return exponential2(Float4(1.44269504f) * x, pp); // 1/ln(2) 172 } 173 logarithm(RValue<Float4> x,bool absolute,bool pp)174 Float4 logarithm(RValue<Float4> x, bool absolute, bool pp) 175 { 176 // FIXME: Propagate the constant 177 return Float4(6.93147181e-1f) * logarithm2(x, absolute, pp); // ln(2) 178 } 179 power(RValue<Float4> x,RValue<Float4> y,bool pp)180 Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp) 181 { 182 Float4 log = logarithm2(x, true, pp); 183 log *= y; 184 return exponential2(log, pp); 185 } 186 reciprocal(RValue<Float4> x,bool pp,bool finite,bool exactAtPow2)187 Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2) 188 { 189 Float4 rcp; 190 191 if(!pp && rcpPrecision >= WHQL) 192 { 193 rcp = Float4(1.0f) / x; 194 } 195 else 196 { 197 rcp = Rcp_pp(x, exactAtPow2); 198 199 if(!pp) 200 { 201 rcp = (rcp + rcp) - (x * rcp * rcp); 202 } 203 } 204 205 if(finite) 206 { 207 int big = 0x7F7FFFFF; 208 rcp = Min(rcp, Float4((float&)big)); 209 } 210 211 return rcp; 212 } 213 reciprocalSquareRoot(RValue<Float4> x,bool absolute,bool pp)214 Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp) 215 { 216 Float4 abs = x; 217 218 if(absolute) 219 { 220 abs = Abs(abs); 221 } 222 223 Float4 rsq; 224 225 if(!pp) 226 { 227 rsq = Float4(1.0f) / Sqrt(abs); 228 } 229 else 230 { 231 rsq = RcpSqrt_pp(abs); 232 233 if(!pp) 234 { 235 rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f); 236 } 237 238 rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4(0x7F800000)) & As<Int4>(rsq)); 239 } 240 241 return rsq; 242 } 243 modulo(RValue<Float4> x,RValue<Float4> y)244 Float4 modulo(RValue<Float4> x, RValue<Float4> y) 245 { 246 return x - y * Floor(x / y); 247 } 248 sine_pi(RValue<Float4> x,bool pp)249 Float4 sine_pi(RValue<Float4> x, bool pp) 250 { 251 const Float4 A = Float4(-4.05284734e-1f); // -4/pi^2 252 const Float4 B = Float4(1.27323954e+0f); // 4/pi 253 const Float4 C = Float4(7.75160950e-1f); 254 const Float4 D = Float4(2.24839049e-1f); 255 256 // Parabola approximating sine 257 Float4 sin = x * (Abs(x) * A + B); 258 259 // Improve precision from 0.06 to 0.001 260 if(true) 261 { 262 sin = sin * (Abs(sin) * D + C); 263 } 264 265 return sin; 266 } 267 cosine_pi(RValue<Float4> x,bool pp)268 Float4 cosine_pi(RValue<Float4> x, bool pp) 269 { 270 // cos(x) = sin(x + pi/2) 271 Float4 y = x + Float4(1.57079632e+0f); 272 273 // Wrap around 274 y -= As<Float4>(CmpNLT(y, Float4(3.14159265e+0f)) & As<Int4>(Float4(6.28318530e+0f))); 275 276 return sine_pi(y, pp); 277 } 278 279 // Assumes x is a finite floating point value clamp(const Float4 & x,const Float4 & min,const Float4 & max)280 static RValue<Float4> clamp(const Float4 &x, const Float4 &min, const Float4 &max) 281 { 282 return Min(Max(x, min), max); 283 } 284 sine(RValue<Float4> x,bool pp)285 Float4 sine(RValue<Float4> x, bool pp) 286 { 287 // Reduce to [-0.5, 0.5] range 288 Float4 y = x * Float4(1.59154943e-1f); // 1/2pi 289 y = y - Round(y); 290 291 if(!pp) 292 { 293 // From the paper: "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs" 294 // This implementation passes OpenGL ES 3.0 precision requirements, at the cost of more operations: 295 // !pp : 17 mul, 7 add, 1 sub, 1 reciprocal 296 // pp : 4 mul, 2 add, 2 abs 297 298 Float4 y2 = y * y; 299 Float4 c1 = y2 * (y2 * (y2 * Float4(-0.0204391631f) + Float4(0.2536086171f)) + Float4(-1.2336977925f)) + Float4(1.0f); 300 Float4 s1 = y * (y2 * (y2 * (y2 * Float4(-0.0046075748f) + Float4(0.0796819754f)) + Float4(-0.645963615f)) + Float4(1.5707963235f)); 301 Float4 c2 = (c1 * c1) - (s1 * s1); 302 Float4 s2 = Float4(2.0f) * s1 * c1; 303 return Float4(2.0f) * s2 * c2 * reciprocal(s2 * s2 + c2 * c2, pp, true); 304 } 305 306 const Float4 A = Float4(-16.0f); 307 const Float4 B = Float4(8.0f); 308 const Float4 C = Float4(7.75160950e-1f); 309 const Float4 D = Float4(2.24839049e-1f); 310 311 // Parabola approximating sine 312 Float4 sin = y * (Abs(y) * A + B); 313 314 // Improve precision from 0.06 to 0.001 315 if(true) 316 { 317 sin = sin * (Abs(sin) * D + C); 318 } 319 320 // TODO(b/151461290): Fix precision loss instead of clamping. 321 sin = clamp(sin, Float4(-1.0f), Float4(1.0f)); 322 323 return sin; 324 } 325 cosine(RValue<Float4> x,bool pp)326 Float4 cosine(RValue<Float4> x, bool pp) 327 { 328 // cos(x) = sin(x + pi/2) 329 Float4 y = x + Float4(1.57079632e+0f); 330 auto cos = sine(y, pp); 331 332 // TODO(b/151461290): Fix precision loss instead of clamping. 333 cos = clamp(cos, Float4(-1.0f), Float4(1.0f)); 334 335 return cos; 336 } 337 tangent(RValue<Float4> x,bool pp)338 Float4 tangent(RValue<Float4> x, bool pp) 339 { 340 return sine(x, pp) / cosine(x, pp); 341 } 342 arccos(RValue<Float4> x,bool pp)343 Float4 arccos(RValue<Float4> x, bool pp) 344 { 345 // pi/2 - arcsin(x) 346 return Float4(1.57079632e+0f) - arcsin(x); 347 } 348 arcsin(RValue<Float4> x,bool pp)349 Float4 arcsin(RValue<Float4> x, bool pp) 350 { 351 if(false) // Simpler implementation fails even lowp precision tests 352 { 353 // x*(pi/2-sqrt(1-x*x)*pi/5) 354 return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f)); 355 } 356 else 357 { 358 // From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun 359 const Float4 half_pi(1.57079632f); 360 const Float4 a0(1.5707288f); 361 const Float4 a1(-0.2121144f); 362 const Float4 a2(0.0742610f); 363 const Float4 a3(-0.0187293f); 364 Float4 absx = Abs(x); 365 return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^ 366 (As<Int4>(x) & Int4(0x80000000))); 367 } 368 } 369 370 // Approximation of atan in [0..1] arctan_01(Float4 x,bool pp)371 Float4 arctan_01(Float4 x, bool pp) 372 { 373 if(pp) 374 { 375 return x * (Float4(-0.27f) * x + Float4(1.05539816f)); 376 } 377 else 378 { 379 // From 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun 380 const Float4 a2(-0.3333314528f); 381 const Float4 a4(0.1999355085f); 382 const Float4 a6(-0.1420889944f); 383 const Float4 a8(0.1065626393f); 384 const Float4 a10(-0.0752896400f); 385 const Float4 a12(0.0429096138f); 386 const Float4 a14(-0.0161657367f); 387 const Float4 a16(0.0028662257f); 388 Float4 x2 = x * x; 389 return (x + x * (x2 * (a2 + x2 * (a4 + x2 * (a6 + x2 * (a8 + x2 * (a10 + x2 * (a12 + x2 * (a14 + x2 * a16))))))))); 390 } 391 } 392 arctan(RValue<Float4> x,bool pp)393 Float4 arctan(RValue<Float4> x, bool pp) 394 { 395 Float4 absx = Abs(x); 396 Int4 O = CmpNLT(absx, Float4(1.0f)); 397 Float4 y = As<Float4>((O & As<Int4>(Float4(1.0f) / absx)) | (~O & As<Int4>(absx))); // FIXME: Vector select 398 399 const Float4 half_pi(1.57079632f); 400 Float4 theta = arctan_01(y, pp); 401 return As<Float4>(((O & As<Int4>(half_pi - theta)) | (~O & As<Int4>(theta))) ^ // FIXME: Vector select 402 (As<Int4>(x) & Int4(0x80000000))); 403 } 404 arctan(RValue<Float4> y,RValue<Float4> x,bool pp)405 Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp) 406 { 407 const Float4 pi(3.14159265f); // pi 408 const Float4 minus_pi(-3.14159265f); // -pi 409 const Float4 half_pi(1.57079632f); // pi/2 410 const Float4 quarter_pi(7.85398163e-1f); // pi/4 411 412 // Rotate to upper semicircle when in lower semicircle 413 Int4 S = CmpLT(y, Float4(0.0f)); 414 Float4 theta = As<Float4>(S & As<Int4>(minus_pi)); 415 Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x)); 416 Float4 y0 = Abs(y); 417 418 // Rotate to right quadrant when in left quadrant 419 Int4 Q = CmpLT(x0, Float4(0.0f)); 420 theta += As<Float4>(Q & As<Int4>(half_pi)); 421 Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0))); // FIXME: Vector select 422 Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0))); // FIXME: Vector select 423 424 // Mirror to first octant when in second octant 425 Int4 O = CmpNLT(y1, x1); 426 Float4 x2 = As<Float4>((O & As<Int4>(y1)) | (~O & As<Int4>(x1))); // FIXME: Vector select 427 Float4 y2 = As<Float4>((O & As<Int4>(x1)) | (~O & As<Int4>(y1))); // FIXME: Vector select 428 429 // Approximation of atan in [0..1] 430 Int4 zero_x = CmpEQ(x2, Float4(0.0f)); 431 Int4 inf_y = IsInf(y2); // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4 432 Float4 atan2_theta = arctan_01(y2 / x2, pp); 433 theta += As<Float4>((~zero_x & ~inf_y & ((O & As<Int4>(half_pi - atan2_theta)) | (~O & (As<Int4>(atan2_theta))))) | // FIXME: Vector select 434 (inf_y & As<Int4>(quarter_pi))); 435 436 // Recover loss of precision for tiny theta angles 437 Int4 precision_loss = S & Q & O & ~inf_y; // This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta 438 return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) | (~precision_loss & As<Int4>(theta))); // FIXME: Vector select 439 } 440 sineh(RValue<Float4> x,bool pp)441 Float4 sineh(RValue<Float4> x, bool pp) 442 { 443 return (exponential(x, pp) - exponential(-x, pp)) * Float4(0.5f); 444 } 445 cosineh(RValue<Float4> x,bool pp)446 Float4 cosineh(RValue<Float4> x, bool pp) 447 { 448 return (exponential(x, pp) + exponential(-x, pp)) * Float4(0.5f); 449 } 450 tangenth(RValue<Float4> x,bool pp)451 Float4 tangenth(RValue<Float4> x, bool pp) 452 { 453 Float4 e_x = exponential(x, pp); 454 Float4 e_minus_x = exponential(-x, pp); 455 return (e_x - e_minus_x) / (e_x + e_minus_x); 456 } 457 arccosh(RValue<Float4> x,bool pp)458 Float4 arccosh(RValue<Float4> x, bool pp) 459 { 460 return logarithm(x + Sqrt(x + Float4(1.0f)) * Sqrt(x - Float4(1.0f)), pp); 461 } 462 arcsinh(RValue<Float4> x,bool pp)463 Float4 arcsinh(RValue<Float4> x, bool pp) 464 { 465 return logarithm(x + Sqrt(x * x + Float4(1.0f)), pp); 466 } 467 arctanh(RValue<Float4> x,bool pp)468 Float4 arctanh(RValue<Float4> x, bool pp) 469 { 470 return logarithm((Float4(1.0f) + x) / (Float4(1.0f) - x), pp) * Float4(0.5f); 471 } 472 dot2(const Vector4f & v0,const Vector4f & v1)473 Float4 dot2(const Vector4f &v0, const Vector4f &v1) 474 { 475 return v0.x * v1.x + v0.y * v1.y; 476 } 477 dot3(const Vector4f & v0,const Vector4f & v1)478 Float4 dot3(const Vector4f &v0, const Vector4f &v1) 479 { 480 return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z; 481 } 482 dot4(const Vector4f & v0,const Vector4f & v1)483 Float4 dot4(const Vector4f &v0, const Vector4f &v1) 484 { 485 return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w; 486 } 487 transpose4x4(Short4 & row0,Short4 & row1,Short4 & row2,Short4 & row3)488 void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3) 489 { 490 Int2 tmp0 = UnpackHigh(row0, row1); 491 Int2 tmp1 = UnpackHigh(row2, row3); 492 Int2 tmp2 = UnpackLow(row0, row1); 493 Int2 tmp3 = UnpackLow(row2, row3); 494 495 row0 = UnpackLow(tmp2, tmp3); 496 row1 = UnpackHigh(tmp2, tmp3); 497 row2 = UnpackLow(tmp0, tmp1); 498 row3 = UnpackHigh(tmp0, tmp1); 499 } 500 transpose4x3(Short4 & row0,Short4 & row1,Short4 & row2,Short4 & row3)501 void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3) 502 { 503 Int2 tmp0 = UnpackHigh(row0, row1); 504 Int2 tmp1 = UnpackHigh(row2, row3); 505 Int2 tmp2 = UnpackLow(row0, row1); 506 Int2 tmp3 = UnpackLow(row2, row3); 507 508 row0 = UnpackLow(tmp2, tmp3); 509 row1 = UnpackHigh(tmp2, tmp3); 510 row2 = UnpackLow(tmp0, tmp1); 511 } 512 transpose4x4(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)513 void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) 514 { 515 Float4 tmp0 = UnpackLow(row0, row1); 516 Float4 tmp1 = UnpackLow(row2, row3); 517 Float4 tmp2 = UnpackHigh(row0, row1); 518 Float4 tmp3 = UnpackHigh(row2, row3); 519 520 row0 = Float4(tmp0.xy, tmp1.xy); 521 row1 = Float4(tmp0.zw, tmp1.zw); 522 row2 = Float4(tmp2.xy, tmp3.xy); 523 row3 = Float4(tmp2.zw, tmp3.zw); 524 } 525 transpose4x3(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)526 void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) 527 { 528 Float4 tmp0 = UnpackLow(row0, row1); 529 Float4 tmp1 = UnpackLow(row2, row3); 530 Float4 tmp2 = UnpackHigh(row0, row1); 531 Float4 tmp3 = UnpackHigh(row2, row3); 532 533 row0 = Float4(tmp0.xy, tmp1.xy); 534 row1 = Float4(tmp0.zw, tmp1.zw); 535 row2 = Float4(tmp2.xy, tmp3.xy); 536 } 537 transpose4x2(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)538 void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) 539 { 540 Float4 tmp0 = UnpackLow(row0, row1); 541 Float4 tmp1 = UnpackLow(row2, row3); 542 543 row0 = Float4(tmp0.xy, tmp1.xy); 544 row1 = Float4(tmp0.zw, tmp1.zw); 545 } 546 transpose4x1(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)547 void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) 548 { 549 Float4 tmp0 = UnpackLow(row0, row1); 550 Float4 tmp1 = UnpackLow(row2, row3); 551 552 row0 = Float4(tmp0.xy, tmp1.xy); 553 } 554 transpose2x4(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3)555 void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3) 556 { 557 Float4 tmp01 = UnpackLow(row0, row1); 558 Float4 tmp23 = UnpackHigh(row0, row1); 559 560 row0 = tmp01; 561 row1 = Float4(tmp01.zw, row1.zw); 562 row2 = tmp23; 563 row3 = Float4(tmp23.zw, row3.zw); 564 } 565 transpose4xN(Float4 & row0,Float4 & row1,Float4 & row2,Float4 & row3,int N)566 void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N) 567 { 568 switch(N) 569 { 570 case 1: transpose4x1(row0, row1, row2, row3); break; 571 case 2: transpose4x2(row0, row1, row2, row3); break; 572 case 3: transpose4x3(row0, row1, row2, row3); break; 573 case 4: transpose4x4(row0, row1, row2, row3); break; 574 } 575 } 576 operator [](RValue<Int4> index)577 const Vector4f RegisterFile::operator[](RValue<Int4> index) 578 { 579 ASSERT(indirectAddressable); 580 581 Int index0 = Extract(index, 0); 582 Int index1 = Extract(index, 1); 583 Int index2 = Extract(index, 2); 584 Int index3 = Extract(index, 3); 585 586 Vector4f r; 587 588 r.x.x = Extract(x[0][index0], 0); 589 r.x.y = Extract(x[0][index1], 1); 590 r.x.z = Extract(x[0][index2], 2); 591 r.x.w = Extract(x[0][index3], 3); 592 593 r.y.x = Extract(y[0][index0], 0); 594 r.y.y = Extract(y[0][index1], 1); 595 r.y.z = Extract(y[0][index2], 2); 596 r.y.w = Extract(y[0][index3], 3); 597 598 r.z.x = Extract(z[0][index0], 0); 599 r.z.y = Extract(z[0][index1], 1); 600 r.z.z = Extract(z[0][index2], 2); 601 r.z.w = Extract(z[0][index3], 3); 602 603 r.w.x = Extract(w[0][index0], 0); 604 r.w.y = Extract(w[0][index1], 1); 605 r.w.z = Extract(w[0][index2], 2); 606 r.w.w = Extract(w[0][index3], 3); 607 608 return r; 609 } 610 scatter_x(Int4 index,RValue<Float4> r)611 void RegisterFile::scatter_x(Int4 index, RValue<Float4> r) 612 { 613 ASSERT(indirectAddressable); 614 615 Int index0 = Extract(index, 0); 616 Int index1 = Extract(index, 1); 617 Int index2 = Extract(index, 2); 618 Int index3 = Extract(index, 3); 619 620 x[0][index0] = Insert(x[0][index0], Extract(r, 0), 0); 621 x[0][index1] = Insert(x[0][index1], Extract(r, 1), 1); 622 x[0][index2] = Insert(x[0][index2], Extract(r, 2), 2); 623 x[0][index3] = Insert(x[0][index3], Extract(r, 3), 3); 624 } 625 scatter_y(Int4 index,RValue<Float4> r)626 void RegisterFile::scatter_y(Int4 index, RValue<Float4> r) 627 { 628 ASSERT(indirectAddressable); 629 630 Int index0 = Extract(index, 0); 631 Int index1 = Extract(index, 1); 632 Int index2 = Extract(index, 2); 633 Int index3 = Extract(index, 3); 634 635 y[0][index0] = Insert(y[0][index0], Extract(r, 0), 0); 636 y[0][index1] = Insert(y[0][index1], Extract(r, 1), 1); 637 y[0][index2] = Insert(y[0][index2], Extract(r, 2), 2); 638 y[0][index3] = Insert(y[0][index3], Extract(r, 3), 3); 639 } 640 scatter_z(Int4 index,RValue<Float4> r)641 void RegisterFile::scatter_z(Int4 index, RValue<Float4> r) 642 { 643 ASSERT(indirectAddressable); 644 645 Int index0 = Extract(index, 0); 646 Int index1 = Extract(index, 1); 647 Int index2 = Extract(index, 2); 648 Int index3 = Extract(index, 3); 649 650 z[0][index0] = Insert(z[0][index0], Extract(r, 0), 0); 651 z[0][index1] = Insert(z[0][index1], Extract(r, 1), 1); 652 z[0][index2] = Insert(z[0][index2], Extract(r, 2), 2); 653 z[0][index3] = Insert(z[0][index3], Extract(r, 3), 3); 654 } 655 scatter_w(Int4 index,RValue<Float4> r)656 void RegisterFile::scatter_w(Int4 index, RValue<Float4> r) 657 { 658 ASSERT(indirectAddressable); 659 660 Int index0 = Extract(index, 0); 661 Int index1 = Extract(index, 1); 662 Int index2 = Extract(index, 2); 663 Int index3 = Extract(index, 3); 664 665 w[0][index0] = Insert(w[0][index0], Extract(r, 0), 0); 666 w[0][index1] = Insert(w[0][index1], Extract(r, 1), 1); 667 w[0][index2] = Insert(w[0][index2], Extract(r, 2), 2); 668 w[0][index3] = Insert(w[0][index3], Extract(r, 3), 3); 669 } 670 mov(Vector4f & dst,const Vector4f & src,bool integerDestination)671 void ShaderCore::mov(Vector4f &dst, const Vector4f &src, bool integerDestination) 672 { 673 if(integerDestination) 674 { 675 dst.x = As<Float4>(RoundInt(src.x)); 676 dst.y = As<Float4>(RoundInt(src.y)); 677 dst.z = As<Float4>(RoundInt(src.z)); 678 dst.w = As<Float4>(RoundInt(src.w)); 679 } 680 else 681 { 682 dst = src; 683 } 684 } 685 neg(Vector4f & dst,const Vector4f & src)686 void ShaderCore::neg(Vector4f &dst, const Vector4f &src) 687 { 688 dst.x = -src.x; 689 dst.y = -src.y; 690 dst.z = -src.z; 691 dst.w = -src.w; 692 } 693 ineg(Vector4f & dst,const Vector4f & src)694 void ShaderCore::ineg(Vector4f &dst, const Vector4f &src) 695 { 696 dst.x = As<Float4>(-As<Int4>(src.x)); 697 dst.y = As<Float4>(-As<Int4>(src.y)); 698 dst.z = As<Float4>(-As<Int4>(src.z)); 699 dst.w = As<Float4>(-As<Int4>(src.w)); 700 } 701 f2b(Vector4f & dst,const Vector4f & src)702 void ShaderCore::f2b(Vector4f &dst, const Vector4f &src) 703 { 704 dst.x = As<Float4>(CmpNEQ(src.x, Float4(0.0f))); 705 dst.y = As<Float4>(CmpNEQ(src.y, Float4(0.0f))); 706 dst.z = As<Float4>(CmpNEQ(src.z, Float4(0.0f))); 707 dst.w = As<Float4>(CmpNEQ(src.w, Float4(0.0f))); 708 } 709 b2f(Vector4f & dst,const Vector4f & src)710 void ShaderCore::b2f(Vector4f &dst, const Vector4f &src) 711 { 712 dst.x = As<Float4>(As<Int4>(src.x) & As<Int4>(Float4(1.0f))); 713 dst.y = As<Float4>(As<Int4>(src.y) & As<Int4>(Float4(1.0f))); 714 dst.z = As<Float4>(As<Int4>(src.z) & As<Int4>(Float4(1.0f))); 715 dst.w = As<Float4>(As<Int4>(src.w) & As<Int4>(Float4(1.0f))); 716 } 717 f2i(Vector4f & dst,const Vector4f & src)718 void ShaderCore::f2i(Vector4f &dst, const Vector4f &src) 719 { 720 dst.x = As<Float4>(Int4(src.x)); 721 dst.y = As<Float4>(Int4(src.y)); 722 dst.z = As<Float4>(Int4(src.z)); 723 dst.w = As<Float4>(Int4(src.w)); 724 } 725 i2f(Vector4f & dst,const Vector4f & src)726 void ShaderCore::i2f(Vector4f &dst, const Vector4f &src) 727 { 728 dst.x = Float4(As<Int4>(src.x)); 729 dst.y = Float4(As<Int4>(src.y)); 730 dst.z = Float4(As<Int4>(src.z)); 731 dst.w = Float4(As<Int4>(src.w)); 732 } 733 f2u(Vector4f & dst,const Vector4f & src)734 void ShaderCore::f2u(Vector4f &dst, const Vector4f &src) 735 { 736 dst.x = As<Float4>(UInt4(src.x)); 737 dst.y = As<Float4>(UInt4(src.y)); 738 dst.z = As<Float4>(UInt4(src.z)); 739 dst.w = As<Float4>(UInt4(src.w)); 740 } 741 u2f(Vector4f & dst,const Vector4f & src)742 void ShaderCore::u2f(Vector4f &dst, const Vector4f &src) 743 { 744 dst.x = Float4(As<UInt4>(src.x)); 745 dst.y = Float4(As<UInt4>(src.y)); 746 dst.z = Float4(As<UInt4>(src.z)); 747 dst.w = Float4(As<UInt4>(src.w)); 748 } 749 i2b(Vector4f & dst,const Vector4f & src)750 void ShaderCore::i2b(Vector4f &dst, const Vector4f &src) 751 { 752 dst.x = As<Float4>(CmpNEQ(As<Int4>(src.x), Int4(0))); 753 dst.y = As<Float4>(CmpNEQ(As<Int4>(src.y), Int4(0))); 754 dst.z = As<Float4>(CmpNEQ(As<Int4>(src.z), Int4(0))); 755 dst.w = As<Float4>(CmpNEQ(As<Int4>(src.w), Int4(0))); 756 } 757 b2i(Vector4f & dst,const Vector4f & src)758 void ShaderCore::b2i(Vector4f &dst, const Vector4f &src) 759 { 760 dst.x = As<Float4>(As<Int4>(src.x) & Int4(1)); 761 dst.y = As<Float4>(As<Int4>(src.y) & Int4(1)); 762 dst.z = As<Float4>(As<Int4>(src.z) & Int4(1)); 763 dst.w = As<Float4>(As<Int4>(src.w) & Int4(1)); 764 } 765 add(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)766 void ShaderCore::add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 767 { 768 dst.x = src0.x + src1.x; 769 dst.y = src0.y + src1.y; 770 dst.z = src0.z + src1.z; 771 dst.w = src0.w + src1.w; 772 } 773 iadd(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)774 void ShaderCore::iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 775 { 776 dst.x = As<Float4>(As<Int4>(src0.x) + As<Int4>(src1.x)); 777 dst.y = As<Float4>(As<Int4>(src0.y) + As<Int4>(src1.y)); 778 dst.z = As<Float4>(As<Int4>(src0.z) + As<Int4>(src1.z)); 779 dst.w = As<Float4>(As<Int4>(src0.w) + As<Int4>(src1.w)); 780 } 781 sub(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)782 void ShaderCore::sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 783 { 784 dst.x = src0.x - src1.x; 785 dst.y = src0.y - src1.y; 786 dst.z = src0.z - src1.z; 787 dst.w = src0.w - src1.w; 788 } 789 isub(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)790 void ShaderCore::isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 791 { 792 dst.x = As<Float4>(As<Int4>(src0.x) - As<Int4>(src1.x)); 793 dst.y = As<Float4>(As<Int4>(src0.y) - As<Int4>(src1.y)); 794 dst.z = As<Float4>(As<Int4>(src0.z) - As<Int4>(src1.z)); 795 dst.w = As<Float4>(As<Int4>(src0.w) - As<Int4>(src1.w)); 796 } 797 mad(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)798 void ShaderCore::mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 799 { 800 dst.x = src0.x * src1.x + src2.x; 801 dst.y = src0.y * src1.y + src2.y; 802 dst.z = src0.z * src1.z + src2.z; 803 dst.w = src0.w * src1.w + src2.w; 804 } 805 imad(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)806 void ShaderCore::imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 807 { 808 dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x) + As<Int4>(src2.x)); 809 dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y) + As<Int4>(src2.y)); 810 dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z) + As<Int4>(src2.z)); 811 dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w) + As<Int4>(src2.w)); 812 } 813 mul(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)814 void ShaderCore::mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 815 { 816 dst.x = src0.x * src1.x; 817 dst.y = src0.y * src1.y; 818 dst.z = src0.z * src1.z; 819 dst.w = src0.w * src1.w; 820 } 821 imul(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)822 void ShaderCore::imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 823 { 824 dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x)); 825 dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y)); 826 dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z)); 827 dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w)); 828 } 829 rcpx(Vector4f & dst,const Vector4f & src,bool pp)830 void ShaderCore::rcpx(Vector4f &dst, const Vector4f &src, bool pp) 831 { 832 Float4 rcp = reciprocal(src.x, pp, true, true); 833 834 dst.x = rcp; 835 dst.y = rcp; 836 dst.z = rcp; 837 dst.w = rcp; 838 } 839 div(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)840 void ShaderCore::div(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 841 { 842 dst.x = src0.x / src1.x; 843 dst.y = src0.y / src1.y; 844 dst.z = src0.z / src1.z; 845 dst.w = src0.w / src1.w; 846 } 847 idiv(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)848 void ShaderCore::idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 849 { 850 Float4 intMax(As<Float4>(Int4(INT_MAX))); 851 cmp0i(dst.x, src1.x, intMax, src1.x); 852 dst.x = As<Float4>(As<Int4>(src0.x) / As<Int4>(dst.x)); 853 cmp0i(dst.y, src1.y, intMax, src1.y); 854 dst.y = As<Float4>(As<Int4>(src0.y) / As<Int4>(dst.y)); 855 cmp0i(dst.z, src1.z, intMax, src1.z); 856 dst.z = As<Float4>(As<Int4>(src0.z) / As<Int4>(dst.z)); 857 cmp0i(dst.w, src1.w, intMax, src1.w); 858 dst.w = As<Float4>(As<Int4>(src0.w) / As<Int4>(dst.w)); 859 } 860 udiv(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)861 void ShaderCore::udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 862 { 863 Float4 uintMax(As<Float4>(UInt4(UINT_MAX))); 864 cmp0i(dst.x, src1.x, uintMax, src1.x); 865 dst.x = As<Float4>(As<UInt4>(src0.x) / As<UInt4>(dst.x)); 866 cmp0i(dst.y, src1.y, uintMax, src1.y); 867 dst.y = As<Float4>(As<UInt4>(src0.y) / As<UInt4>(dst.y)); 868 cmp0i(dst.z, src1.z, uintMax, src1.z); 869 dst.z = As<Float4>(As<UInt4>(src0.z) / As<UInt4>(dst.z)); 870 cmp0i(dst.w, src1.w, uintMax, src1.w); 871 dst.w = As<Float4>(As<UInt4>(src0.w) / As<UInt4>(dst.w)); 872 } 873 mod(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)874 void ShaderCore::mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 875 { 876 dst.x = modulo(src0.x, src1.x); 877 dst.y = modulo(src0.y, src1.y); 878 dst.z = modulo(src0.z, src1.z); 879 dst.w = modulo(src0.w, src1.w); 880 } 881 imod(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)882 void ShaderCore::imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 883 { 884 Float4 intMax(As<Float4>(Int4(INT_MAX))); 885 cmp0i(dst.x, src1.x, intMax, src1.x); 886 dst.x = As<Float4>(As<Int4>(src0.x) % As<Int4>(dst.x)); 887 cmp0i(dst.y, src1.y, intMax, src1.y); 888 dst.y = As<Float4>(As<Int4>(src0.y) % As<Int4>(dst.y)); 889 cmp0i(dst.z, src1.z, intMax, src1.z); 890 dst.z = As<Float4>(As<Int4>(src0.z) % As<Int4>(dst.z)); 891 cmp0i(dst.w, src1.w, intMax, src1.w); 892 dst.w = As<Float4>(As<Int4>(src0.w) % As<Int4>(dst.w)); 893 } 894 umod(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)895 void ShaderCore::umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 896 { 897 Float4 uintMax(As<Float4>(UInt4(UINT_MAX))); 898 cmp0i(dst.x, src1.x, uintMax, src1.x); 899 dst.x = As<Float4>(As<UInt4>(src0.x) % As<UInt4>(dst.x)); 900 cmp0i(dst.y, src1.y, uintMax, src1.y); 901 dst.y = As<Float4>(As<UInt4>(src0.y) % As<UInt4>(dst.y)); 902 cmp0i(dst.z, src1.z, uintMax, src1.z); 903 dst.z = As<Float4>(As<UInt4>(src0.z) % As<UInt4>(dst.z)); 904 cmp0i(dst.w, src1.w, uintMax, src1.w); 905 dst.w = As<Float4>(As<UInt4>(src0.w) % As<UInt4>(dst.w)); 906 } 907 shl(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)908 void ShaderCore::shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 909 { 910 dst.x = As<Float4>(As<Int4>(src0.x) << As<Int4>(src1.x)); 911 dst.y = As<Float4>(As<Int4>(src0.y) << As<Int4>(src1.y)); 912 dst.z = As<Float4>(As<Int4>(src0.z) << As<Int4>(src1.z)); 913 dst.w = As<Float4>(As<Int4>(src0.w) << As<Int4>(src1.w)); 914 } 915 ishr(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)916 void ShaderCore::ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 917 { 918 dst.x = As<Float4>(As<Int4>(src0.x) >> As<Int4>(src1.x)); 919 dst.y = As<Float4>(As<Int4>(src0.y) >> As<Int4>(src1.y)); 920 dst.z = As<Float4>(As<Int4>(src0.z) >> As<Int4>(src1.z)); 921 dst.w = As<Float4>(As<Int4>(src0.w) >> As<Int4>(src1.w)); 922 } 923 ushr(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)924 void ShaderCore::ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 925 { 926 dst.x = As<Float4>(As<UInt4>(src0.x) >> As<UInt4>(src1.x)); 927 dst.y = As<Float4>(As<UInt4>(src0.y) >> As<UInt4>(src1.y)); 928 dst.z = As<Float4>(As<UInt4>(src0.z) >> As<UInt4>(src1.z)); 929 dst.w = As<Float4>(As<UInt4>(src0.w) >> As<UInt4>(src1.w)); 930 } 931 rsqx(Vector4f & dst,const Vector4f & src,bool pp)932 void ShaderCore::rsqx(Vector4f &dst, const Vector4f &src, bool pp) 933 { 934 Float4 rsq = reciprocalSquareRoot(src.x, true, pp); 935 936 dst.x = rsq; 937 dst.y = rsq; 938 dst.z = rsq; 939 dst.w = rsq; 940 } 941 sqrt(Vector4f & dst,const Vector4f & src,bool pp)942 void ShaderCore::sqrt(Vector4f &dst, const Vector4f &src, bool pp) 943 { 944 dst.x = Sqrt(src.x); 945 dst.y = Sqrt(src.y); 946 dst.z = Sqrt(src.z); 947 dst.w = Sqrt(src.w); 948 } 949 rsq(Vector4f & dst,const Vector4f & src,bool pp)950 void ShaderCore::rsq(Vector4f &dst, const Vector4f &src, bool pp) 951 { 952 dst.x = reciprocalSquareRoot(src.x, false, pp); 953 dst.y = reciprocalSquareRoot(src.y, false, pp); 954 dst.z = reciprocalSquareRoot(src.z, false, pp); 955 dst.w = reciprocalSquareRoot(src.w, false, pp); 956 } 957 len2(Float4 & dst,const Vector4f & src,bool pp)958 void ShaderCore::len2(Float4 &dst, const Vector4f &src, bool pp) 959 { 960 dst = Sqrt(dot2(src, src)); 961 } 962 len3(Float4 & dst,const Vector4f & src,bool pp)963 void ShaderCore::len3(Float4 &dst, const Vector4f &src, bool pp) 964 { 965 dst = Sqrt(dot3(src, src)); 966 } 967 len4(Float4 & dst,const Vector4f & src,bool pp)968 void ShaderCore::len4(Float4 &dst, const Vector4f &src, bool pp) 969 { 970 dst = Sqrt(dot4(src, src)); 971 } 972 dist1(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)973 void ShaderCore::dist1(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 974 { 975 dst = Abs(src0.x - src1.x); 976 } 977 dist2(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)978 void ShaderCore::dist2(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 979 { 980 Float4 dx = src0.x - src1.x; 981 Float4 dy = src0.y - src1.y; 982 Float4 dot2 = dx * dx + dy * dy; 983 dst = Sqrt(dot2); 984 } 985 dist3(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)986 void ShaderCore::dist3(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 987 { 988 Float4 dx = src0.x - src1.x; 989 Float4 dy = src0.y - src1.y; 990 Float4 dz = src0.z - src1.z; 991 Float4 dot3 = dx * dx + dy * dy + dz * dz; 992 dst = Sqrt(dot3); 993 } 994 dist4(Float4 & dst,const Vector4f & src0,const Vector4f & src1,bool pp)995 void ShaderCore::dist4(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 996 { 997 Float4 dx = src0.x - src1.x; 998 Float4 dy = src0.y - src1.y; 999 Float4 dz = src0.z - src1.z; 1000 Float4 dw = src0.w - src1.w; 1001 Float4 dot4 = dx * dx + dy * dy + dz * dz + dw * dw; 1002 dst = Sqrt(dot4); 1003 } 1004 dp1(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1005 void ShaderCore::dp1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1006 { 1007 Float4 t = src0.x * src1.x; 1008 1009 dst.x = t; 1010 dst.y = t; 1011 dst.z = t; 1012 dst.w = t; 1013 } 1014 dp2(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1015 void ShaderCore::dp2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1016 { 1017 Float4 t = dot2(src0, src1); 1018 1019 dst.x = t; 1020 dst.y = t; 1021 dst.z = t; 1022 dst.w = t; 1023 } 1024 dp2add(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1025 void ShaderCore::dp2add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 1026 { 1027 Float4 t = dot2(src0, src1) + src2.x; 1028 1029 dst.x = t; 1030 dst.y = t; 1031 dst.z = t; 1032 dst.w = t; 1033 } 1034 dp3(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1035 void ShaderCore::dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1036 { 1037 Float4 dot = dot3(src0, src1); 1038 1039 dst.x = dot; 1040 dst.y = dot; 1041 dst.z = dot; 1042 dst.w = dot; 1043 } 1044 dp4(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1045 void ShaderCore::dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1046 { 1047 Float4 dot = dot4(src0, src1); 1048 1049 dst.x = dot; 1050 dst.y = dot; 1051 dst.z = dot; 1052 dst.w = dot; 1053 } 1054 min(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1055 void ShaderCore::min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1056 { 1057 dst.x = Min(src0.x, src1.x); 1058 dst.y = Min(src0.y, src1.y); 1059 dst.z = Min(src0.z, src1.z); 1060 dst.w = Min(src0.w, src1.w); 1061 } 1062 imin(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1063 void ShaderCore::imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1064 { 1065 dst.x = As<Float4>(Min(As<Int4>(src0.x), As<Int4>(src1.x))); 1066 dst.y = As<Float4>(Min(As<Int4>(src0.y), As<Int4>(src1.y))); 1067 dst.z = As<Float4>(Min(As<Int4>(src0.z), As<Int4>(src1.z))); 1068 dst.w = As<Float4>(Min(As<Int4>(src0.w), As<Int4>(src1.w))); 1069 } 1070 umin(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1071 void ShaderCore::umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1072 { 1073 dst.x = As<Float4>(Min(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1074 dst.y = As<Float4>(Min(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1075 dst.z = As<Float4>(Min(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1076 dst.w = As<Float4>(Min(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1077 } 1078 max(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1079 void ShaderCore::max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1080 { 1081 dst.x = Max(src0.x, src1.x); 1082 dst.y = Max(src0.y, src1.y); 1083 dst.z = Max(src0.z, src1.z); 1084 dst.w = Max(src0.w, src1.w); 1085 } 1086 imax(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1087 void ShaderCore::imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1088 { 1089 dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x))); 1090 dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y))); 1091 dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z))); 1092 dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w))); 1093 } 1094 umax(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1095 void ShaderCore::umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1096 { 1097 dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x))); 1098 dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y))); 1099 dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z))); 1100 dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w))); 1101 } 1102 slt(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1103 void ShaderCore::slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1104 { 1105 dst.x = As<Float4>(As<Int4>(CmpLT(src0.x, src1.x)) & As<Int4>(Float4(1.0f))); 1106 dst.y = As<Float4>(As<Int4>(CmpLT(src0.y, src1.y)) & As<Int4>(Float4(1.0f))); 1107 dst.z = As<Float4>(As<Int4>(CmpLT(src0.z, src1.z)) & As<Int4>(Float4(1.0f))); 1108 dst.w = As<Float4>(As<Int4>(CmpLT(src0.w, src1.w)) & As<Int4>(Float4(1.0f))); 1109 } 1110 step(Vector4f & dst,const Vector4f & edge,const Vector4f & x)1111 void ShaderCore::step(Vector4f &dst, const Vector4f &edge, const Vector4f &x) 1112 { 1113 dst.x = As<Float4>(CmpNLT(x.x, edge.x) & As<Int4>(Float4(1.0f))); 1114 dst.y = As<Float4>(CmpNLT(x.y, edge.y) & As<Int4>(Float4(1.0f))); 1115 dst.z = As<Float4>(CmpNLT(x.z, edge.z) & As<Int4>(Float4(1.0f))); 1116 dst.w = As<Float4>(CmpNLT(x.w, edge.w) & As<Int4>(Float4(1.0f))); 1117 } 1118 exp2x(Vector4f & dst,const Vector4f & src,bool pp)1119 void ShaderCore::exp2x(Vector4f &dst, const Vector4f &src, bool pp) 1120 { 1121 Float4 exp = exponential2(src.x, pp); 1122 1123 dst.x = exp; 1124 dst.y = exp; 1125 dst.z = exp; 1126 dst.w = exp; 1127 } 1128 exp2(Vector4f & dst,const Vector4f & src,bool pp)1129 void ShaderCore::exp2(Vector4f &dst, const Vector4f &src, bool pp) 1130 { 1131 dst.x = exponential2(src.x, pp); 1132 dst.y = exponential2(src.y, pp); 1133 dst.z = exponential2(src.z, pp); 1134 dst.w = exponential2(src.w, pp); 1135 } 1136 exp(Vector4f & dst,const Vector4f & src,bool pp)1137 void ShaderCore::exp(Vector4f &dst, const Vector4f &src, bool pp) 1138 { 1139 dst.x = exponential(src.x, pp); 1140 dst.y = exponential(src.y, pp); 1141 dst.z = exponential(src.z, pp); 1142 dst.w = exponential(src.w, pp); 1143 } 1144 log2x(Vector4f & dst,const Vector4f & src,bool pp)1145 void ShaderCore::log2x(Vector4f &dst, const Vector4f &src, bool pp) 1146 { 1147 Float4 log = logarithm2(src.x, true, pp); 1148 1149 dst.x = log; 1150 dst.y = log; 1151 dst.z = log; 1152 dst.w = log; 1153 } 1154 log2(Vector4f & dst,const Vector4f & src,bool pp)1155 void ShaderCore::log2(Vector4f &dst, const Vector4f &src, bool pp) 1156 { 1157 dst.x = logarithm2(src.x, false, pp); 1158 dst.y = logarithm2(src.y, false, pp); 1159 dst.z = logarithm2(src.z, false, pp); 1160 dst.w = logarithm2(src.w, false, pp); 1161 } 1162 log(Vector4f & dst,const Vector4f & src,bool pp)1163 void ShaderCore::log(Vector4f &dst, const Vector4f &src, bool pp) 1164 { 1165 dst.x = logarithm(src.x, false, pp); 1166 dst.y = logarithm(src.y, false, pp); 1167 dst.z = logarithm(src.z, false, pp); 1168 dst.w = logarithm(src.w, false, pp); 1169 } 1170 lit(Vector4f & dst,const Vector4f & src)1171 void ShaderCore::lit(Vector4f &dst, const Vector4f &src) 1172 { 1173 dst.x = Float4(1.0f); 1174 dst.y = Max(src.x, Float4(0.0f)); 1175 1176 Float4 pow; 1177 1178 pow = src.w; 1179 pow = Min(pow, Float4(127.9961f)); 1180 pow = Max(pow, Float4(-127.9961f)); 1181 1182 dst.z = power(src.y, pow); 1183 dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.x, Float4(0.0f))); 1184 dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.y, Float4(0.0f))); 1185 1186 dst.w = Float4(1.0f); 1187 } 1188 att(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1189 void ShaderCore::att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1190 { 1191 // Computes attenuation factors (1, d, d^2, 1/d) assuming src0 = d^2, src1 = 1/d 1192 dst.x = 1; 1193 dst.y = src0.y * src1.y; 1194 dst.z = src0.z; 1195 dst.w = src1.w; 1196 } 1197 lrp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1198 void ShaderCore::lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 1199 { 1200 dst.x = src0.x * (src1.x - src2.x) + src2.x; 1201 dst.y = src0.y * (src1.y - src2.y) + src2.y; 1202 dst.z = src0.z * (src1.z - src2.z) + src2.z; 1203 dst.w = src0.w * (src1.w - src2.w) + src2.w; 1204 } 1205 isinf(Vector4f & dst,const Vector4f & src)1206 void ShaderCore::isinf(Vector4f &dst, const Vector4f &src) 1207 { 1208 dst.x = As<Float4>(IsInf(src.x)); 1209 dst.y = As<Float4>(IsInf(src.y)); 1210 dst.z = As<Float4>(IsInf(src.z)); 1211 dst.w = As<Float4>(IsInf(src.w)); 1212 } 1213 isnan(Vector4f & dst,const Vector4f & src)1214 void ShaderCore::isnan(Vector4f &dst, const Vector4f &src) 1215 { 1216 dst.x = As<Float4>(IsNan(src.x)); 1217 dst.y = As<Float4>(IsNan(src.y)); 1218 dst.z = As<Float4>(IsNan(src.z)); 1219 dst.w = As<Float4>(IsNan(src.w)); 1220 } 1221 smooth(Vector4f & dst,const Vector4f & edge0,const Vector4f & edge1,const Vector4f & x)1222 void ShaderCore::smooth(Vector4f &dst, const Vector4f &edge0, const Vector4f &edge1, const Vector4f &x) 1223 { 1224 Float4 tx = Min(Max((x.x - edge0.x) / (edge1.x - edge0.x), Float4(0.0f)), Float4(1.0f)); dst.x = tx * tx * (Float4(3.0f) - Float4(2.0f) * tx); 1225 Float4 ty = Min(Max((x.y - edge0.y) / (edge1.y - edge0.y), Float4(0.0f)), Float4(1.0f)); dst.y = ty * ty * (Float4(3.0f) - Float4(2.0f) * ty); 1226 Float4 tz = Min(Max((x.z - edge0.z) / (edge1.z - edge0.z), Float4(0.0f)), Float4(1.0f)); dst.z = tz * tz * (Float4(3.0f) - Float4(2.0f) * tz); 1227 Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4(0.0f)), Float4(1.0f)); dst.w = tw * tw * (Float4(3.0f) - Float4(2.0f) * tw); 1228 } 1229 floatToHalfBits(Float4 & dst,const Float4 & floatBits,bool storeInUpperBits)1230 void ShaderCore::floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits) 1231 { 1232 static const uint32_t mask_sign = 0x80000000u; 1233 static const uint32_t mask_round = ~0xfffu; 1234 static const uint32_t c_f32infty = 255 << 23; 1235 static const uint32_t c_magic = 15 << 23; 1236 static const uint32_t c_nanbit = 0x200; 1237 static const uint32_t c_infty_as_fp16 = 0x7c00; 1238 static const uint32_t c_clamp = (31 << 23) - 0x1000; 1239 1240 UInt4 justsign = UInt4(mask_sign) & As<UInt4>(floatBits); 1241 UInt4 absf = As<UInt4>(floatBits) ^ justsign; 1242 UInt4 b_isnormal = CmpNLE(UInt4(c_f32infty), absf); 1243 1244 // Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf 1245 // instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation) 1246 UInt4 joined = ((((As<UInt4>(Min(As<Float4>(absf & UInt4(mask_round)) * As<Float4>(UInt4(c_magic)), 1247 As<Float4>(UInt4(c_clamp))))) - UInt4(mask_round)) >> 13) & b_isnormal) | 1248 ((b_isnormal ^ UInt4(0xFFFFFFFF)) & ((CmpNLE(absf, UInt4(c_f32infty)) & UInt4(c_nanbit)) | 1249 UInt4(c_infty_as_fp16))); 1250 1251 dst = As<Float4>(storeInUpperBits ? As<UInt4>(dst) | ((joined << 16) | justsign) : joined | (justsign >> 16)); 1252 } 1253 halfToFloatBits(Float4 & dst,const Float4 & halfBits)1254 void ShaderCore::halfToFloatBits(Float4& dst, const Float4& halfBits) 1255 { 1256 static const uint32_t mask_nosign = 0x7FFF; 1257 static const uint32_t magic = (254 - 15) << 23; 1258 static const uint32_t was_infnan = 0x7BFF; 1259 static const uint32_t exp_infnan = 255 << 23; 1260 1261 UInt4 expmant = As<UInt4>(halfBits) & UInt4(mask_nosign); 1262 dst = As<Float4>(As<UInt4>(As<Float4>(expmant << 13) * As<Float4>(UInt4(magic))) | 1263 ((As<UInt4>(halfBits) ^ UInt4(expmant)) << 16) | 1264 (CmpNLE(As<UInt4>(expmant), UInt4(was_infnan)) & UInt4(exp_infnan))); 1265 } 1266 packHalf2x16(Vector4f & d,const Vector4f & s0)1267 void ShaderCore::packHalf2x16(Vector4f &d, const Vector4f &s0) 1268 { 1269 // half2 | half1 1270 floatToHalfBits(d.x, s0.x, false); 1271 floatToHalfBits(d.x, s0.y, true); 1272 } 1273 unpackHalf2x16(Vector4f & dst,const Vector4f & s0)1274 void ShaderCore::unpackHalf2x16(Vector4f &dst, const Vector4f &s0) 1275 { 1276 // half2 | half1 1277 halfToFloatBits(dst.x, As<Float4>(As<UInt4>(s0.x) & UInt4(0x0000FFFF))); 1278 halfToFloatBits(dst.y, As<Float4>((As<UInt4>(s0.x) & UInt4(0xFFFF0000)) >> 16)); 1279 } 1280 packSnorm2x16(Vector4f & d,const Vector4f & s0)1281 void ShaderCore::packSnorm2x16(Vector4f &d, const Vector4f &s0) 1282 { 1283 // round(clamp(c, -1.0, 1.0) * 32767.0) 1284 d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) | 1285 ((Int4(Round(Min(Max(s0.y, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) << 16)); 1286 } 1287 packUnorm2x16(Vector4f & d,const Vector4f & s0)1288 void ShaderCore::packUnorm2x16(Vector4f &d, const Vector4f &s0) 1289 { 1290 // round(clamp(c, 0.0, 1.0) * 65535.0) 1291 d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) | 1292 ((Int4(Round(Min(Max(s0.y, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) << 16)); 1293 } 1294 unpackSnorm2x16(Vector4f & dst,const Vector4f & s0)1295 void ShaderCore::unpackSnorm2x16(Vector4f &dst, const Vector4f &s0) 1296 { 1297 // clamp(f / 32727.0, -1.0, 1.0) 1298 dst.x = Min(Max(Float4(As<Int4>((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16)) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f)); 1299 dst.y = Min(Max(Float4(As<Int4>(As<UInt4>(s0.x) & UInt4(0xFFFF0000))) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f)); 1300 } 1301 unpackUnorm2x16(Vector4f & dst,const Vector4f & s0)1302 void ShaderCore::unpackUnorm2x16(Vector4f &dst, const Vector4f &s0) 1303 { 1304 // f / 65535.0 1305 dst.x = Float4((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16) * Float4(1.0f / float(0xFFFF0000)); 1306 dst.y = Float4(As<UInt4>(s0.x) & UInt4(0xFFFF0000)) * Float4(1.0f / float(0xFFFF0000)); 1307 } 1308 det2(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1309 void ShaderCore::det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1310 { 1311 dst.x = src0.x * src1.y - src0.y * src1.x; 1312 dst.y = dst.z = dst.w = dst.x; 1313 } 1314 det3(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1315 void ShaderCore::det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 1316 { 1317 crs(dst, src1, src2); 1318 dp3(dst, dst, src0); 1319 } 1320 det4(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2,const Vector4f & src3)1321 void ShaderCore::det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3) 1322 { 1323 dst.x = src2.z * src3.w - src2.w * src3.z; 1324 dst.y = src1.w * src3.z - src1.z * src3.w; 1325 dst.z = src1.z * src2.w - src1.w * src2.z; 1326 dst.x = src0.x * (src1.y * dst.x + src2.y * dst.y + src3.y * dst.z) - 1327 src0.y * (src1.x * dst.x + src2.x * dst.y + src3.x * dst.z) + 1328 src0.z * (src1.x * (src2.y * src3.w - src2.w * src3.y) + 1329 src2.x * (src1.w * src3.y - src1.y * src3.w) + 1330 src3.x * (src1.y * src2.w - src1.w * src2.y)) + 1331 src0.w * (src1.x * (src2.z * src3.y - src2.y * src3.z) + 1332 src2.x * (src1.y * src3.z - src1.z * src3.y) + 1333 src3.x * (src1.z * src2.y - src1.y * src2.z)); 1334 dst.y = dst.z = dst.w = dst.x; 1335 } 1336 frc(Vector4f & dst,const Vector4f & src)1337 void ShaderCore::frc(Vector4f &dst, const Vector4f &src) 1338 { 1339 dst.x = Frac(src.x); 1340 dst.y = Frac(src.y); 1341 dst.z = Frac(src.z); 1342 dst.w = Frac(src.w); 1343 } 1344 trunc(Vector4f & dst,const Vector4f & src)1345 void ShaderCore::trunc(Vector4f &dst, const Vector4f &src) 1346 { 1347 dst.x = Trunc(src.x); 1348 dst.y = Trunc(src.y); 1349 dst.z = Trunc(src.z); 1350 dst.w = Trunc(src.w); 1351 } 1352 floor(Vector4f & dst,const Vector4f & src)1353 void ShaderCore::floor(Vector4f &dst, const Vector4f &src) 1354 { 1355 dst.x = Floor(src.x); 1356 dst.y = Floor(src.y); 1357 dst.z = Floor(src.z); 1358 dst.w = Floor(src.w); 1359 } 1360 round(Vector4f & dst,const Vector4f & src)1361 void ShaderCore::round(Vector4f &dst, const Vector4f &src) 1362 { 1363 dst.x = Round(src.x); 1364 dst.y = Round(src.y); 1365 dst.z = Round(src.z); 1366 dst.w = Round(src.w); 1367 } 1368 roundEven(Vector4f & dst,const Vector4f & src)1369 void ShaderCore::roundEven(Vector4f &dst, const Vector4f &src) 1370 { 1371 // dst = round(src) + ((round(src) < src) * 2 - 1) * (fract(src) == 0.5) * isOdd(round(src)); 1372 // ex.: 1.5: 2 + (0 * 2 - 1) * 1 * 0 = 2 1373 // 2.5: 3 + (0 * 2 - 1) * 1 * 1 = 2 1374 // -1.5: -2 + (1 * 2 - 1) * 1 * 0 = -2 1375 // -2.5: -3 + (1 * 2 - 1) * 1 * 1 = -2 1376 // Even if the round implementation rounds the other way: 1377 // 1.5: 1 + (1 * 2 - 1) * 1 * 1 = 2 1378 // 2.5: 2 + (1 * 2 - 1) * 1 * 0 = 2 1379 // -1.5: -1 + (0 * 2 - 1) * 1 * 1 = -2 1380 // -2.5: -2 + (0 * 2 - 1) * 1 * 0 = -2 1381 round(dst, src); 1382 dst.x += ((Float4(CmpLT(dst.x, src.x) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.x), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.x) & Int4(1)); 1383 dst.y += ((Float4(CmpLT(dst.y, src.y) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.y), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.y) & Int4(1)); 1384 dst.z += ((Float4(CmpLT(dst.z, src.z) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.z), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.z) & Int4(1)); 1385 dst.w += ((Float4(CmpLT(dst.w, src.w) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.w), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.w) & Int4(1)); 1386 } 1387 ceil(Vector4f & dst,const Vector4f & src)1388 void ShaderCore::ceil(Vector4f &dst, const Vector4f &src) 1389 { 1390 dst.x = Ceil(src.x); 1391 dst.y = Ceil(src.y); 1392 dst.z = Ceil(src.z); 1393 dst.w = Ceil(src.w); 1394 } 1395 powx(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,bool pp)1396 void ShaderCore::powx(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 1397 { 1398 Float4 pow = power(src0.x, src1.x, pp); 1399 1400 dst.x = pow; 1401 dst.y = pow; 1402 dst.z = pow; 1403 dst.w = pow; 1404 } 1405 pow(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,bool pp)1406 void ShaderCore::pow(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 1407 { 1408 dst.x = power(src0.x, src1.x, pp); 1409 dst.y = power(src0.y, src1.y, pp); 1410 dst.z = power(src0.z, src1.z, pp); 1411 dst.w = power(src0.w, src1.w, pp); 1412 } 1413 crs(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1414 void ShaderCore::crs(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1415 { 1416 dst.x = src0.y * src1.z - src0.z * src1.y; 1417 dst.y = src0.z * src1.x - src0.x * src1.z; 1418 dst.z = src0.x * src1.y - src0.y * src1.x; 1419 } 1420 forward1(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1421 void ShaderCore::forward1(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref) 1422 { 1423 Int4 flip = CmpNLT(Nref.x * I.x, Float4(0.0f)) & Int4(0x80000000); 1424 1425 dst.x = As<Float4>(flip ^ As<Int4>(N.x)); 1426 } 1427 forward2(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1428 void ShaderCore::forward2(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref) 1429 { 1430 Int4 flip = CmpNLT(dot2(Nref, I), Float4(0.0f)) & Int4(0x80000000); 1431 1432 dst.x = As<Float4>(flip ^ As<Int4>(N.x)); 1433 dst.y = As<Float4>(flip ^ As<Int4>(N.y)); 1434 } 1435 forward3(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1436 void ShaderCore::forward3(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref) 1437 { 1438 Int4 flip = CmpNLT(dot3(Nref, I), Float4(0.0f)) & Int4(0x80000000); 1439 1440 dst.x = As<Float4>(flip ^ As<Int4>(N.x)); 1441 dst.y = As<Float4>(flip ^ As<Int4>(N.y)); 1442 dst.z = As<Float4>(flip ^ As<Int4>(N.z)); 1443 } 1444 forward4(Vector4f & dst,const Vector4f & N,const Vector4f & I,const Vector4f & Nref)1445 void ShaderCore::forward4(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref) 1446 { 1447 Int4 flip = CmpNLT(dot4(Nref, I), Float4(0.0f)) & Int4(0x80000000); 1448 1449 dst.x = As<Float4>(flip ^ As<Int4>(N.x)); 1450 dst.y = As<Float4>(flip ^ As<Int4>(N.y)); 1451 dst.z = As<Float4>(flip ^ As<Int4>(N.z)); 1452 dst.w = As<Float4>(flip ^ As<Int4>(N.w)); 1453 } 1454 reflect1(Vector4f & dst,const Vector4f & I,const Vector4f & N)1455 void ShaderCore::reflect1(Vector4f &dst, const Vector4f &I, const Vector4f &N) 1456 { 1457 Float4 d = N.x * I.x; 1458 1459 dst.x = I.x - Float4(2.0f) * d * N.x; 1460 } 1461 reflect2(Vector4f & dst,const Vector4f & I,const Vector4f & N)1462 void ShaderCore::reflect2(Vector4f &dst, const Vector4f &I, const Vector4f &N) 1463 { 1464 Float4 d = dot2(N, I); 1465 1466 dst.x = I.x - Float4(2.0f) * d * N.x; 1467 dst.y = I.y - Float4(2.0f) * d * N.y; 1468 } 1469 reflect3(Vector4f & dst,const Vector4f & I,const Vector4f & N)1470 void ShaderCore::reflect3(Vector4f &dst, const Vector4f &I, const Vector4f &N) 1471 { 1472 Float4 d = dot3(N, I); 1473 1474 dst.x = I.x - Float4(2.0f) * d * N.x; 1475 dst.y = I.y - Float4(2.0f) * d * N.y; 1476 dst.z = I.z - Float4(2.0f) * d * N.z; 1477 } 1478 reflect4(Vector4f & dst,const Vector4f & I,const Vector4f & N)1479 void ShaderCore::reflect4(Vector4f &dst, const Vector4f &I, const Vector4f &N) 1480 { 1481 Float4 d = dot4(N, I); 1482 1483 dst.x = I.x - Float4(2.0f) * d * N.x; 1484 dst.y = I.y - Float4(2.0f) * d * N.y; 1485 dst.z = I.z - Float4(2.0f) * d * N.z; 1486 dst.w = I.w - Float4(2.0f) * d * N.w; 1487 } 1488 refract1(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1489 void ShaderCore::refract1(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta) 1490 { 1491 Float4 d = N.x * I.x; 1492 Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d); 1493 Int4 pos = CmpNLT(k, Float4(0.0f)); 1494 Float4 t = (eta * d + Sqrt(k)); 1495 1496 dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x)); 1497 } 1498 refract2(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1499 void ShaderCore::refract2(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta) 1500 { 1501 Float4 d = dot2(N, I); 1502 Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d); 1503 Int4 pos = CmpNLT(k, Float4(0.0f)); 1504 Float4 t = (eta * d + Sqrt(k)); 1505 1506 dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x)); 1507 dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y)); 1508 } 1509 refract3(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1510 void ShaderCore::refract3(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta) 1511 { 1512 Float4 d = dot3(N, I); 1513 Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d); 1514 Int4 pos = CmpNLT(k, Float4(0.0f)); 1515 Float4 t = (eta * d + Sqrt(k)); 1516 1517 dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x)); 1518 dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y)); 1519 dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z)); 1520 } 1521 refract4(Vector4f & dst,const Vector4f & I,const Vector4f & N,const Float4 & eta)1522 void ShaderCore::refract4(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta) 1523 { 1524 Float4 d = dot4(N, I); 1525 Float4 k = Float4(1.0f) - eta * eta * (Float4(1.0f) - d * d); 1526 Int4 pos = CmpNLT(k, Float4(0.0f)); 1527 Float4 t = (eta * d + Sqrt(k)); 1528 1529 dst.x = As<Float4>(pos & As<Int4>(eta * I.x - t * N.x)); 1530 dst.y = As<Float4>(pos & As<Int4>(eta * I.y - t * N.y)); 1531 dst.z = As<Float4>(pos & As<Int4>(eta * I.z - t * N.z)); 1532 dst.w = As<Float4>(pos & As<Int4>(eta * I.w - t * N.w)); 1533 } 1534 sgn(Vector4f & dst,const Vector4f & src)1535 void ShaderCore::sgn(Vector4f &dst, const Vector4f &src) 1536 { 1537 sgn(dst.x, src.x); 1538 sgn(dst.y, src.y); 1539 sgn(dst.z, src.z); 1540 sgn(dst.w, src.w); 1541 } 1542 isgn(Vector4f & dst,const Vector4f & src)1543 void ShaderCore::isgn(Vector4f &dst, const Vector4f &src) 1544 { 1545 isgn(dst.x, src.x); 1546 isgn(dst.y, src.y); 1547 isgn(dst.z, src.z); 1548 isgn(dst.w, src.w); 1549 } 1550 abs(Vector4f & dst,const Vector4f & src)1551 void ShaderCore::abs(Vector4f &dst, const Vector4f &src) 1552 { 1553 dst.x = Abs(src.x); 1554 dst.y = Abs(src.y); 1555 dst.z = Abs(src.z); 1556 dst.w = Abs(src.w); 1557 } 1558 iabs(Vector4f & dst,const Vector4f & src)1559 void ShaderCore::iabs(Vector4f &dst, const Vector4f &src) 1560 { 1561 dst.x = As<Float4>(Abs(As<Int4>(src.x))); 1562 dst.y = As<Float4>(Abs(As<Int4>(src.y))); 1563 dst.z = As<Float4>(Abs(As<Int4>(src.z))); 1564 dst.w = As<Float4>(Abs(As<Int4>(src.w))); 1565 } 1566 nrm2(Vector4f & dst,const Vector4f & src,bool pp)1567 void ShaderCore::nrm2(Vector4f &dst, const Vector4f &src, bool pp) 1568 { 1569 Float4 dot = dot2(src, src); 1570 Float4 rsq = reciprocalSquareRoot(dot, false, pp); 1571 1572 dst.x = src.x * rsq; 1573 dst.y = src.y * rsq; 1574 dst.z = src.z * rsq; 1575 dst.w = src.w * rsq; 1576 } 1577 nrm3(Vector4f & dst,const Vector4f & src,bool pp)1578 void ShaderCore::nrm3(Vector4f &dst, const Vector4f &src, bool pp) 1579 { 1580 Float4 dot = dot3(src, src); 1581 Float4 rsq = reciprocalSquareRoot(dot, false, pp); 1582 1583 dst.x = src.x * rsq; 1584 dst.y = src.y * rsq; 1585 dst.z = src.z * rsq; 1586 dst.w = src.w * rsq; 1587 } 1588 nrm4(Vector4f & dst,const Vector4f & src,bool pp)1589 void ShaderCore::nrm4(Vector4f &dst, const Vector4f &src, bool pp) 1590 { 1591 Float4 dot = dot4(src, src); 1592 Float4 rsq = reciprocalSquareRoot(dot, false, pp); 1593 1594 dst.x = src.x * rsq; 1595 dst.y = src.y * rsq; 1596 dst.z = src.z * rsq; 1597 dst.w = src.w * rsq; 1598 } 1599 sincos(Vector4f & dst,const Vector4f & src,bool pp)1600 void ShaderCore::sincos(Vector4f &dst, const Vector4f &src, bool pp) 1601 { 1602 dst.x = cosine_pi(src.x, pp); 1603 dst.y = sine_pi(src.x, pp); 1604 } 1605 cos(Vector4f & dst,const Vector4f & src,bool pp)1606 void ShaderCore::cos(Vector4f &dst, const Vector4f &src, bool pp) 1607 { 1608 dst.x = cosine(src.x, pp); 1609 dst.y = cosine(src.y, pp); 1610 dst.z = cosine(src.z, pp); 1611 dst.w = cosine(src.w, pp); 1612 } 1613 sin(Vector4f & dst,const Vector4f & src,bool pp)1614 void ShaderCore::sin(Vector4f &dst, const Vector4f &src, bool pp) 1615 { 1616 dst.x = sine(src.x, pp); 1617 dst.y = sine(src.y, pp); 1618 dst.z = sine(src.z, pp); 1619 dst.w = sine(src.w, pp); 1620 } 1621 tan(Vector4f & dst,const Vector4f & src,bool pp)1622 void ShaderCore::tan(Vector4f &dst, const Vector4f &src, bool pp) 1623 { 1624 dst.x = tangent(src.x, pp); 1625 dst.y = tangent(src.y, pp); 1626 dst.z = tangent(src.z, pp); 1627 dst.w = tangent(src.w, pp); 1628 } 1629 acos(Vector4f & dst,const Vector4f & src,bool pp)1630 void ShaderCore::acos(Vector4f &dst, const Vector4f &src, bool pp) 1631 { 1632 dst.x = arccos(src.x, pp); 1633 dst.y = arccos(src.y, pp); 1634 dst.z = arccos(src.z, pp); 1635 dst.w = arccos(src.w, pp); 1636 } 1637 asin(Vector4f & dst,const Vector4f & src,bool pp)1638 void ShaderCore::asin(Vector4f &dst, const Vector4f &src, bool pp) 1639 { 1640 dst.x = arcsin(src.x, pp); 1641 dst.y = arcsin(src.y, pp); 1642 dst.z = arcsin(src.z, pp); 1643 dst.w = arcsin(src.w, pp); 1644 } 1645 atan(Vector4f & dst,const Vector4f & src,bool pp)1646 void ShaderCore::atan(Vector4f &dst, const Vector4f &src, bool pp) 1647 { 1648 dst.x = arctan(src.x, pp); 1649 dst.y = arctan(src.y, pp); 1650 dst.z = arctan(src.z, pp); 1651 dst.w = arctan(src.w, pp); 1652 } 1653 atan2(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,bool pp)1654 void ShaderCore::atan2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp) 1655 { 1656 dst.x = arctan(src0.x, src1.x, pp); 1657 dst.y = arctan(src0.y, src1.y, pp); 1658 dst.z = arctan(src0.z, src1.z, pp); 1659 dst.w = arctan(src0.w, src1.w, pp); 1660 } 1661 cosh(Vector4f & dst,const Vector4f & src,bool pp)1662 void ShaderCore::cosh(Vector4f &dst, const Vector4f &src, bool pp) 1663 { 1664 dst.x = cosineh(src.x, pp); 1665 dst.y = cosineh(src.y, pp); 1666 dst.z = cosineh(src.z, pp); 1667 dst.w = cosineh(src.w, pp); 1668 } 1669 sinh(Vector4f & dst,const Vector4f & src,bool pp)1670 void ShaderCore::sinh(Vector4f &dst, const Vector4f &src, bool pp) 1671 { 1672 dst.x = sineh(src.x, pp); 1673 dst.y = sineh(src.y, pp); 1674 dst.z = sineh(src.z, pp); 1675 dst.w = sineh(src.w, pp); 1676 } 1677 tanh(Vector4f & dst,const Vector4f & src,bool pp)1678 void ShaderCore::tanh(Vector4f &dst, const Vector4f &src, bool pp) 1679 { 1680 dst.x = tangenth(src.x, pp); 1681 dst.y = tangenth(src.y, pp); 1682 dst.z = tangenth(src.z, pp); 1683 dst.w = tangenth(src.w, pp); 1684 } 1685 acosh(Vector4f & dst,const Vector4f & src,bool pp)1686 void ShaderCore::acosh(Vector4f &dst, const Vector4f &src, bool pp) 1687 { 1688 dst.x = arccosh(src.x, pp); 1689 dst.y = arccosh(src.y, pp); 1690 dst.z = arccosh(src.z, pp); 1691 dst.w = arccosh(src.w, pp); 1692 } 1693 asinh(Vector4f & dst,const Vector4f & src,bool pp)1694 void ShaderCore::asinh(Vector4f &dst, const Vector4f &src, bool pp) 1695 { 1696 dst.x = arcsinh(src.x, pp); 1697 dst.y = arcsinh(src.y, pp); 1698 dst.z = arcsinh(src.z, pp); 1699 dst.w = arcsinh(src.w, pp); 1700 } 1701 atanh(Vector4f & dst,const Vector4f & src,bool pp)1702 void ShaderCore::atanh(Vector4f &dst, const Vector4f &src, bool pp) 1703 { 1704 dst.x = arctanh(src.x, pp); 1705 dst.y = arctanh(src.y, pp); 1706 dst.z = arctanh(src.z, pp); 1707 dst.w = arctanh(src.w, pp); 1708 } 1709 expp(Vector4f & dst,const Vector4f & src,unsigned short shaderModel)1710 void ShaderCore::expp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel) 1711 { 1712 if(shaderModel < 0x0200) 1713 { 1714 Float4 frc = Frac(src.x); 1715 Float4 floor = src.x - frc; 1716 1717 dst.x = exponential2(floor, true); 1718 dst.y = frc; 1719 dst.z = exponential2(src.x, true); 1720 dst.w = Float4(1.0f); 1721 } 1722 else // Version >= 2.0 1723 { 1724 exp2x(dst, src, true); // FIXME: 10-bit precision suffices 1725 } 1726 } 1727 logp(Vector4f & dst,const Vector4f & src,unsigned short shaderModel)1728 void ShaderCore::logp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel) 1729 { 1730 if(shaderModel < 0x0200) 1731 { 1732 Float4 tmp0; 1733 Float4 tmp1; 1734 Float4 t; 1735 Int4 r; 1736 1737 tmp0 = Abs(src.x); 1738 tmp1 = tmp0; 1739 1740 // X component 1741 r = As<Int4>(As<UInt4>(tmp0) >> 23) - Int4(127); 1742 dst.x = Float4(r); 1743 1744 // Y component 1745 dst.y = As<Float4>((As<Int4>(tmp1) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f))); 1746 1747 // Z component 1748 dst.z = logarithm2(src.x, true, true); 1749 1750 // W component 1751 dst.w = 1.0f; 1752 } 1753 else 1754 { 1755 log2x(dst, src, true); 1756 } 1757 } 1758 cmp0(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1759 void ShaderCore::cmp0(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 1760 { 1761 cmp0(dst.x, src0.x, src1.x, src2.x); 1762 cmp0(dst.y, src0.y, src1.y, src2.y); 1763 cmp0(dst.z, src0.z, src1.z, src2.z); 1764 cmp0(dst.w, src0.w, src1.w, src2.w); 1765 } 1766 select(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,const Vector4f & src2)1767 void ShaderCore::select(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2) 1768 { 1769 select(dst.x, As<Int4>(src0.x), src1.x, src2.x); 1770 select(dst.y, As<Int4>(src0.y), src1.y, src2.y); 1771 select(dst.z, As<Int4>(src0.z), src1.z, src2.z); 1772 select(dst.w, As<Int4>(src0.w), src1.w, src2.w); 1773 } 1774 extract(Float4 & dst,const Vector4f & src0,const Float4 & src1)1775 void ShaderCore::extract(Float4 &dst, const Vector4f &src0, const Float4 &src1) 1776 { 1777 select(dst, CmpEQ(As<Int4>(src1), Int4(1)), src0.y, src0.x); 1778 select(dst, CmpEQ(As<Int4>(src1), Int4(2)), src0.z, dst); 1779 select(dst, CmpEQ(As<Int4>(src1), Int4(3)), src0.w, dst); 1780 } 1781 insert(Vector4f & dst,const Vector4f & src,const Float4 & element,const Float4 & index)1782 void ShaderCore::insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index) 1783 { 1784 select(dst.x, CmpEQ(As<Int4>(index), Int4(0)), element, src.x); 1785 select(dst.y, CmpEQ(As<Int4>(index), Int4(1)), element, src.y); 1786 select(dst.z, CmpEQ(As<Int4>(index), Int4(2)), element, src.z); 1787 select(dst.w, CmpEQ(As<Int4>(index), Int4(3)), element, src.w); 1788 } 1789 sgn(Float4 & dst,const Float4 & src)1790 void ShaderCore::sgn(Float4 &dst, const Float4 &src) 1791 { 1792 Int4 neg = As<Int4>(CmpLT(src, Float4(-0.0f))) & As<Int4>(Float4(-1.0f)); 1793 Int4 pos = As<Int4>(CmpNLE(src, Float4(+0.0f))) & As<Int4>(Float4(1.0f)); 1794 dst = As<Float4>(neg | pos); 1795 } 1796 isgn(Float4 & dst,const Float4 & src)1797 void ShaderCore::isgn(Float4 &dst, const Float4 &src) 1798 { 1799 Int4 neg = CmpLT(As<Int4>(src), Int4(0)) & Int4(-1); 1800 Int4 pos = CmpNLE(As<Int4>(src), Int4(0)) & Int4(1); 1801 dst = As<Float4>(neg | pos); 1802 } 1803 cmp0(Float4 & dst,const Float4 & src0,const Float4 & src1,const Float4 & src2)1804 void ShaderCore::cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2) 1805 { 1806 Int4 pos = CmpLE(Float4(0.0f), src0); 1807 select(dst, pos, src1, src2); 1808 } 1809 cmp0i(Float4 & dst,const Float4 & src0,const Float4 & src1,const Float4 & src2)1810 void ShaderCore::cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2) 1811 { 1812 Int4 pos = CmpEQ(Int4(0), As<Int4>(src0)); 1813 select(dst, pos, src1, src2); 1814 } 1815 select(Float4 & dst,RValue<Int4> src0,const Float4 & src1,const Float4 & src2)1816 void ShaderCore::select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2) 1817 { 1818 // FIXME: LLVM vector select 1819 dst = As<Float4>((src0 & As<Int4>(src1)) | (~src0 & As<Int4>(src2))); 1820 } 1821 cmp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,Control control)1822 void ShaderCore::cmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control) 1823 { 1824 switch(control) 1825 { 1826 case Shader::CONTROL_GT: 1827 dst.x = As<Float4>(CmpNLE(src0.x, src1.x)); 1828 dst.y = As<Float4>(CmpNLE(src0.y, src1.y)); 1829 dst.z = As<Float4>(CmpNLE(src0.z, src1.z)); 1830 dst.w = As<Float4>(CmpNLE(src0.w, src1.w)); 1831 break; 1832 case Shader::CONTROL_EQ: 1833 dst.x = As<Float4>(CmpEQ(src0.x, src1.x)); 1834 dst.y = As<Float4>(CmpEQ(src0.y, src1.y)); 1835 dst.z = As<Float4>(CmpEQ(src0.z, src1.z)); 1836 dst.w = As<Float4>(CmpEQ(src0.w, src1.w)); 1837 break; 1838 case Shader::CONTROL_GE: 1839 dst.x = As<Float4>(CmpNLT(src0.x, src1.x)); 1840 dst.y = As<Float4>(CmpNLT(src0.y, src1.y)); 1841 dst.z = As<Float4>(CmpNLT(src0.z, src1.z)); 1842 dst.w = As<Float4>(CmpNLT(src0.w, src1.w)); 1843 break; 1844 case Shader::CONTROL_LT: 1845 dst.x = As<Float4>(CmpLT(src0.x, src1.x)); 1846 dst.y = As<Float4>(CmpLT(src0.y, src1.y)); 1847 dst.z = As<Float4>(CmpLT(src0.z, src1.z)); 1848 dst.w = As<Float4>(CmpLT(src0.w, src1.w)); 1849 break; 1850 case Shader::CONTROL_NE: 1851 dst.x = As<Float4>(CmpNEQ(src0.x, src1.x)); 1852 dst.y = As<Float4>(CmpNEQ(src0.y, src1.y)); 1853 dst.z = As<Float4>(CmpNEQ(src0.z, src1.z)); 1854 dst.w = As<Float4>(CmpNEQ(src0.w, src1.w)); 1855 break; 1856 case Shader::CONTROL_LE: 1857 dst.x = As<Float4>(CmpLE(src0.x, src1.x)); 1858 dst.y = As<Float4>(CmpLE(src0.y, src1.y)); 1859 dst.z = As<Float4>(CmpLE(src0.z, src1.z)); 1860 dst.w = As<Float4>(CmpLE(src0.w, src1.w)); 1861 break; 1862 default: 1863 ASSERT(false); 1864 } 1865 } 1866 icmp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,Control control)1867 void ShaderCore::icmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control) 1868 { 1869 switch(control) 1870 { 1871 case Shader::CONTROL_GT: 1872 dst.x = As<Float4>(CmpNLE(As<Int4>(src0.x), As<Int4>(src1.x))); 1873 dst.y = As<Float4>(CmpNLE(As<Int4>(src0.y), As<Int4>(src1.y))); 1874 dst.z = As<Float4>(CmpNLE(As<Int4>(src0.z), As<Int4>(src1.z))); 1875 dst.w = As<Float4>(CmpNLE(As<Int4>(src0.w), As<Int4>(src1.w))); 1876 break; 1877 case Shader::CONTROL_EQ: 1878 dst.x = As<Float4>(CmpEQ(As<Int4>(src0.x), As<Int4>(src1.x))); 1879 dst.y = As<Float4>(CmpEQ(As<Int4>(src0.y), As<Int4>(src1.y))); 1880 dst.z = As<Float4>(CmpEQ(As<Int4>(src0.z), As<Int4>(src1.z))); 1881 dst.w = As<Float4>(CmpEQ(As<Int4>(src0.w), As<Int4>(src1.w))); 1882 break; 1883 case Shader::CONTROL_GE: 1884 dst.x = As<Float4>(CmpNLT(As<Int4>(src0.x), As<Int4>(src1.x))); 1885 dst.y = As<Float4>(CmpNLT(As<Int4>(src0.y), As<Int4>(src1.y))); 1886 dst.z = As<Float4>(CmpNLT(As<Int4>(src0.z), As<Int4>(src1.z))); 1887 dst.w = As<Float4>(CmpNLT(As<Int4>(src0.w), As<Int4>(src1.w))); 1888 break; 1889 case Shader::CONTROL_LT: 1890 dst.x = As<Float4>(CmpLT(As<Int4>(src0.x), As<Int4>(src1.x))); 1891 dst.y = As<Float4>(CmpLT(As<Int4>(src0.y), As<Int4>(src1.y))); 1892 dst.z = As<Float4>(CmpLT(As<Int4>(src0.z), As<Int4>(src1.z))); 1893 dst.w = As<Float4>(CmpLT(As<Int4>(src0.w), As<Int4>(src1.w))); 1894 break; 1895 case Shader::CONTROL_NE: 1896 dst.x = As<Float4>(CmpNEQ(As<Int4>(src0.x), As<Int4>(src1.x))); 1897 dst.y = As<Float4>(CmpNEQ(As<Int4>(src0.y), As<Int4>(src1.y))); 1898 dst.z = As<Float4>(CmpNEQ(As<Int4>(src0.z), As<Int4>(src1.z))); 1899 dst.w = As<Float4>(CmpNEQ(As<Int4>(src0.w), As<Int4>(src1.w))); 1900 break; 1901 case Shader::CONTROL_LE: 1902 dst.x = As<Float4>(CmpLE(As<Int4>(src0.x), As<Int4>(src1.x))); 1903 dst.y = As<Float4>(CmpLE(As<Int4>(src0.y), As<Int4>(src1.y))); 1904 dst.z = As<Float4>(CmpLE(As<Int4>(src0.z), As<Int4>(src1.z))); 1905 dst.w = As<Float4>(CmpLE(As<Int4>(src0.w), As<Int4>(src1.w))); 1906 break; 1907 default: 1908 ASSERT(false); 1909 } 1910 } 1911 ucmp(Vector4f & dst,const Vector4f & src0,const Vector4f & src1,Control control)1912 void ShaderCore::ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control) 1913 { 1914 switch(control) 1915 { 1916 case Shader::CONTROL_GT: 1917 dst.x = As<Float4>(CmpNLE(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1918 dst.y = As<Float4>(CmpNLE(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1919 dst.z = As<Float4>(CmpNLE(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1920 dst.w = As<Float4>(CmpNLE(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1921 break; 1922 case Shader::CONTROL_EQ: 1923 dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1924 dst.y = As<Float4>(CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1925 dst.z = As<Float4>(CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1926 dst.w = As<Float4>(CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1927 break; 1928 case Shader::CONTROL_GE: 1929 dst.x = As<Float4>(CmpNLT(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1930 dst.y = As<Float4>(CmpNLT(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1931 dst.z = As<Float4>(CmpNLT(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1932 dst.w = As<Float4>(CmpNLT(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1933 break; 1934 case Shader::CONTROL_LT: 1935 dst.x = As<Float4>(CmpLT(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1936 dst.y = As<Float4>(CmpLT(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1937 dst.z = As<Float4>(CmpLT(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1938 dst.w = As<Float4>(CmpLT(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1939 break; 1940 case Shader::CONTROL_NE: 1941 dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1942 dst.y = As<Float4>(CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1943 dst.z = As<Float4>(CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1944 dst.w = As<Float4>(CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1945 break; 1946 case Shader::CONTROL_LE: 1947 dst.x = As<Float4>(CmpLE(As<UInt4>(src0.x), As<UInt4>(src1.x))); 1948 dst.y = As<Float4>(CmpLE(As<UInt4>(src0.y), As<UInt4>(src1.y))); 1949 dst.z = As<Float4>(CmpLE(As<UInt4>(src0.z), As<UInt4>(src1.z))); 1950 dst.w = As<Float4>(CmpLE(As<UInt4>(src0.w), As<UInt4>(src1.w))); 1951 break; 1952 default: 1953 ASSERT(false); 1954 } 1955 } 1956 all(Float4 & dst,const Vector4f & src)1957 void ShaderCore::all(Float4 &dst, const Vector4f &src) 1958 { 1959 dst = As<Float4>(As<Int4>(src.x) & As<Int4>(src.y) & As<Int4>(src.z) & As<Int4>(src.w)); 1960 } 1961 any(Float4 & dst,const Vector4f & src)1962 void ShaderCore::any(Float4 &dst, const Vector4f &src) 1963 { 1964 dst = As<Float4>(As<Int4>(src.x) | As<Int4>(src.y) | As<Int4>(src.z) | As<Int4>(src.w)); 1965 } 1966 bitwise_not(Vector4f & dst,const Vector4f & src)1967 void ShaderCore::bitwise_not(Vector4f &dst, const Vector4f &src) 1968 { 1969 dst.x = As<Float4>(As<Int4>(src.x) ^ Int4(0xFFFFFFFF)); 1970 dst.y = As<Float4>(As<Int4>(src.y) ^ Int4(0xFFFFFFFF)); 1971 dst.z = As<Float4>(As<Int4>(src.z) ^ Int4(0xFFFFFFFF)); 1972 dst.w = As<Float4>(As<Int4>(src.w) ^ Int4(0xFFFFFFFF)); 1973 } 1974 bitwise_or(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1975 void ShaderCore::bitwise_or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1976 { 1977 dst.x = As<Float4>(As<Int4>(src0.x) | As<Int4>(src1.x)); 1978 dst.y = As<Float4>(As<Int4>(src0.y) | As<Int4>(src1.y)); 1979 dst.z = As<Float4>(As<Int4>(src0.z) | As<Int4>(src1.z)); 1980 dst.w = As<Float4>(As<Int4>(src0.w) | As<Int4>(src1.w)); 1981 } 1982 bitwise_xor(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1983 void ShaderCore::bitwise_xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1984 { 1985 dst.x = As<Float4>(As<Int4>(src0.x) ^ As<Int4>(src1.x)); 1986 dst.y = As<Float4>(As<Int4>(src0.y) ^ As<Int4>(src1.y)); 1987 dst.z = As<Float4>(As<Int4>(src0.z) ^ As<Int4>(src1.z)); 1988 dst.w = As<Float4>(As<Int4>(src0.w) ^ As<Int4>(src1.w)); 1989 } 1990 bitwise_and(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1991 void ShaderCore::bitwise_and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 1992 { 1993 dst.x = As<Float4>(As<Int4>(src0.x) & As<Int4>(src1.x)); 1994 dst.y = As<Float4>(As<Int4>(src0.y) & As<Int4>(src1.y)); 1995 dst.z = As<Float4>(As<Int4>(src0.z) & As<Int4>(src1.z)); 1996 dst.w = As<Float4>(As<Int4>(src0.w) & As<Int4>(src1.w)); 1997 } 1998 equal(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)1999 void ShaderCore::equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 2000 { 2001 dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) & 2002 CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) & 2003 CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) & 2004 CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w))); 2005 dst.y = dst.x; 2006 dst.z = dst.x; 2007 dst.w = dst.x; 2008 } 2009 notEqual(Vector4f & dst,const Vector4f & src0,const Vector4f & src1)2010 void ShaderCore::notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1) 2011 { 2012 dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) | 2013 CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) | 2014 CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) | 2015 CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w))); 2016 dst.y = dst.x; 2017 dst.z = dst.x; 2018 dst.w = dst.x; 2019 } 2020 } 2021