// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef sw_ShaderCore_hpp
#define sw_ShaderCore_hpp

#include "Reactor/Print.hpp"
#include "Reactor/Reactor.hpp"
#include "System/Debug.hpp"

#include <array>
#include <atomic>   // std::memory_order
#include <utility>  // std::pair

namespace sw {

using namespace rr;

class Vector4s
{
public:
	Vector4s();
	Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
	Vector4s(const Vector4s &rhs);

	Short4 &operator[](int i);
	Vector4s &operator=(const Vector4s &rhs);

	Short4 x;
	Short4 y;
	Short4 z;
	Short4 w;
};

class Vector4f
{
public:
	Vector4f();
	Vector4f(float x, float y, float z, float w);
	Vector4f(const Vector4f &rhs);

	Float4 &operator[](int i);
	Vector4f &operator=(const Vector4f &rhs);

	Float4 x;
	Float4 y;
	Float4 z;
	Float4 w;
};

class Vector4i
{
public:
	Vector4i();
	Vector4i(int x, int y, int z, int w);
	Vector4i(const Vector4i &rhs);

	Int4 &operator[](int i);
	Vector4i &operator=(const Vector4i &rhs);

	Int4 x;
	Int4 y;
	Int4 z;
	Int4 w;
};

enum class OutOfBoundsBehavior
{
	Nullify,             // Loads become zero, stores are elided.
	RobustBufferAccess,  // As defined by the Vulkan spec (in short: access anywhere within bounds, or zeroing).
	UndefinedValue,      // Only for load operations. Not secure. No program termination.
	UndefinedBehavior,   // Program may terminate.
};
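// Illustrative note: these values are passed to SIMD::Pointer::Load()/Store()
// below to select how out-of-bounds lanes are treated. For example, a load
// performed with OutOfBoundsBehavior::Nullify yields zero in every lane whose
// offset fails the bounds check, and a store with the same behavior simply
// skips those lanes.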
// SIMD contains types that represent multiple scalars packed into a single
// vector data type. Types in the SIMD namespace provide a semantic hint
// that the data should be treated as a per-execution-lane scalar instead of
// a typical euclidean-style vector type.
namespace SIMD {

// Width is the number of per-lane scalars packed into each SIMD vector.
static constexpr int Width = 4;

using Float = rr::Float4;
using Int = rr::Int4;
using UInt = rr::UInt4;

struct Pointer
{
	Pointer(rr::Pointer<Byte> base, rr::Int limit);
	Pointer(rr::Pointer<Byte> base, unsigned int limit);
	Pointer(rr::Pointer<Byte> base, rr::Int limit, SIMD::Int offset);
	Pointer(rr::Pointer<Byte> base, unsigned int limit, SIMD::Int offset);

	Pointer &operator+=(Int i);
	Pointer &operator*=(Int i);

	Pointer operator+(SIMD::Int i);
	Pointer operator*(SIMD::Int i);

	Pointer &operator+=(int i);
	Pointer &operator*=(int i);

	Pointer operator+(int i);
	Pointer operator*(int i);

	SIMD::Int offsets() const;

	SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;

	bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;

	rr::Int limit() const;

	// Returns true if all offsets are sequential
	// (N+0*step, N+1*step, N+2*step, N+3*step)
	rr::Bool hasSequentialOffsets(unsigned int step) const;

	// Returns true if all offsets are compile-time static and
	// sequential (N+0*step, N+1*step, N+2*step, N+3*step)
	bool hasStaticSequentialOffsets(unsigned int step) const;

	// Returns true if all offsets are equal (N, N, N, N)
	rr::Bool hasEqualOffsets() const;

	// Returns true if all offsets are compile-time static and are equal
	// (N, N, N, N)
	bool hasStaticEqualOffsets() const;

	template<typename T>
	inline T Load(OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));

	template<typename T>
	inline void Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);

	template<typename T>
	inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);

	// Base address for the pointer, common across all lanes.
	rr::Pointer<Byte> base;

	// Upper (non-inclusive) limit for offsets from base.
	rr::Int dynamicLimit;  // If hasDynamicLimit is false, dynamicLimit is zero.
	unsigned int staticLimit;

	// Per lane offsets from base.
	SIMD::Int dynamicOffsets;  // If hasDynamicOffsets is false, all dynamicOffsets are zero.
	std::array<int32_t, SIMD::Width> staticOffsets;

	bool hasDynamicLimit;    // True if dynamicLimit is non-zero.
	bool hasDynamicOffsets;  // True if any dynamicOffsets are non-zero.
};
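// Minimal usage sketch for SIMD::Pointer (illustrative only; `buffer`, `size`,
// `offsetInBytes` and `activeLaneMask` are assumed to be provided by the caller):
//
//   SIMD::Pointer ptr(buffer, size);   // base pointer with a byte limit
//   ptr += offsetInBytes;              // uniform or per-lane byte offset
//   SIMD::Float v = ptr.Load<SIMD::Float>(OutOfBoundsBehavior::Nullify, activeLaneMask);
//   ptr.Store(v, OutOfBoundsBehavior::Nullify, activeLaneMask);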
template<typename T>
struct Element
{};
template<>
struct Element<Float>
{
	using type = rr::Float;
};
template<>
struct Element<Int>
{
	using type = rr::Int;
};
template<>
struct Element<UInt>
{
	using type = rr::UInt;
};

}  // namespace SIMD

// Vulkan 'SPIR-V Extended Instructions for GLSL' (GLSL.std.450) compliant transcendental functions
RValue<Float4> Sin(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Cos(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Tan(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Asin(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Acos(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Atan(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Atan2(RValue<Float4> y, RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Exp2(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Log2(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Exp(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Log(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y, bool relaxedPrecision);
RValue<Float4> Sinh(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Cosh(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Tanh(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Asinh(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Acosh(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Atanh(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Sqrt(RValue<Float4> x, bool relaxedPrecision);

// Math functions with uses outside of shaders can be invoked using a verbose template argument instead
// of a Boolean argument to indicate precision. For example Sqrt<Mediump>(x) equals Sqrt(x, true).
enum Precision
{
	Highp,
	Relaxed,
	Mediump = Relaxed,  // GLSL defines mediump and lowp as corresponding with SPIR-V's RelaxedPrecision
};

// clang-format off
template<Precision precision> RValue<Float4> Sqrt(RValue<Float4> x);
template<> inline RValue<Float4> Sqrt<Highp>(RValue<Float4> x) { return Sqrt(x, false); }
template<> inline RValue<Float4> Sqrt<Relaxed>(RValue<Float4> x) { return Sqrt(x, true); }

template<Precision precision> RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
template<> inline RValue<Float4> Pow<Highp>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, false); }
template<> inline RValue<Float4> Pow<Relaxed>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, true); }
// clang-format on

RValue<Float4> reciprocal(RValue<Float4> x, bool pp = false, bool exactAtPow2 = false);
RValue<Float4> reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);

RValue<Float4> mulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z);  // TODO(chromium:1299047)

void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);

sw::SIMD::UInt halfToFloatBits(sw::SIMD::UInt halfBits);
sw::SIMD::UInt floatToHalfBits(sw::SIMD::UInt floatBits, bool storeInUpperBits);
Float4 r11g11b10Unpack(UInt r11g11b10bits);
UInt r11g11b10Pack(const Float4 &value);
Float4 linearToSRGB(const Float4 &c);
Float4 sRGBtoLinear(const Float4 &c);

RValue<Bool> AnyTrue(const RValue<SIMD::Int> &bools);
RValue<Bool> AnyFalse(const RValue<SIMD::Int> &bools);
RValue<Bool> AllTrue(const RValue<SIMD::Int> &bools);
RValue<Bool> AllFalse(const RValue<SIMD::Int> &bools);
RValue<Bool> Divergent(const RValue<SIMD::Int> &ints);
RValue<Bool> Divergent(const RValue<SIMD::Float> &floats);
RValue<Bool> Uniform(const RValue<SIMD::Int> &ints);
RValue<Bool> Uniform(const RValue<SIMD::Float> &floats);

template<typename T>
inline rr::RValue<T> AndAll(rr::RValue<T> const &mask);

template<typename T>
inline rr::RValue<T> OrAll(rr::RValue<T> const &mask);
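// Illustrative note: the helpers above treat SIMD::Int values as per-lane
// boolean masks, with an enabled lane holding all ones (0xFFFFFFFF), e.g.
//
//   SIMD::Int mask = CmpLT(a, b);   // ~0 in lanes where a < b, 0 elsewhere
//   If(AnyTrue(mask))
//   {
//       // at least one lane passed the comparison
//   }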
rr::RValue<sw::SIMD::Float> Sign(rr::RValue<sw::SIMD::Float> const &val);

// Returns the <whole, frac> of val.
// Both whole and frac will have the same sign as val.
std::pair<rr::RValue<sw::SIMD::Float>, rr::RValue<sw::SIMD::Float>> Modf(rr::RValue<sw::SIMD::Float> const &val);

// Returns the number of 1s in bits, per lane.
sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits);

// Returns 1 << bits.
// If the resulting bit overflows a 32 bit integer, 0 is returned.
rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits);

// Returns bitCount number of 1's starting from the LSB.
rr::RValue<sw::SIMD::UInt> Bitmask32(rr::RValue<sw::SIMD::UInt> const &bitCount);

// Computes `a * b + c`, which may be fused into one operation to produce a higher-precision result.
rr::RValue<sw::SIMD::Float> FMA(
    rr::RValue<sw::SIMD::Float> const &a,
    rr::RValue<sw::SIMD::Float> const &b,
    rr::RValue<sw::SIMD::Float> const &c);

// Returns the exponent of the floating point number f.
// Assumes IEEE 754
rr::RValue<sw::SIMD::Int> Exponent(rr::RValue<sw::SIMD::Float> f);

// Returns y if y < x; otherwise result is x.
// If one operand is a NaN, the other operand is the result.
// If both operands are NaN, the result is a NaN.
rr::RValue<sw::SIMD::Float> NMin(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);

// Returns y if y > x; otherwise result is x.
// If one operand is a NaN, the other operand is the result.
// If both operands are NaN, the result is a NaN.
rr::RValue<sw::SIMD::Float> NMax(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);

// Returns the determinant of a 2x2 matrix.
rr::RValue<sw::SIMD::Float> Determinant(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
    rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);

// Returns the determinant of a 3x3 matrix.
rr::RValue<sw::SIMD::Float> Determinant(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
    rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
    rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);

// Returns the determinant of a 4x4 matrix.
rr::RValue<sw::SIMD::Float> Determinant(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
    rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
    rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
    rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);

// Returns the inverse of a 2x2 matrix.
std::array<rr::RValue<sw::SIMD::Float>, 4> MatrixInverse(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
    rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);

// Returns the inverse of a 3x3 matrix.
std::array<rr::RValue<sw::SIMD::Float>, 9> MatrixInverse(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
    rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
    rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);

// Returns the inverse of a 4x4 matrix.
std::array<rr::RValue<sw::SIMD::Float>, 16> MatrixInverse(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
    rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
    rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
    rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);

////////////////////////////////////////////////////////////////////////////
// Inline functions
////////////////////////////////////////////////////////////////////////////

template<typename T>
inline T SIMD::Pointer::Load(OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
{
	using EL = typename Element<T>::type;

	if(isStaticallyInBounds(sizeof(float), robustness))
	{
		// All elements are statically known to be in-bounds.
		// We can avoid costly conditional on masks.

		if(hasStaticSequentialOffsets(sizeof(float)))
		{
			// Offsets are sequential. Perform regular load.
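			// staticOffsets[0] is the lane-0 byte offset; with sequential
			// float-sized offsets all lanes occupy one contiguous vector,
			// so a single vector load at the lane-0 address suffices.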
			return rr::Load(rr::Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
		}
		if(hasStaticEqualOffsets())
		{
			// Load one, replicate.
			return T(*rr::Pointer<EL>(base + staticOffsets[0], alignment));
		}
	}
	else
	{
		switch(robustness)
		{
		case OutOfBoundsBehavior::Nullify:
		case OutOfBoundsBehavior::RobustBufferAccess:
		case OutOfBoundsBehavior::UndefinedValue:
			mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds reads.
			break;
		case OutOfBoundsBehavior::UndefinedBehavior:
			// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
			break;
		}
	}

	auto offs = offsets();

	if(!atomic && order == std::memory_order_relaxed)
	{
		if(hasStaticEqualOffsets())
		{
			// Load one, replicate.
			// Be careful of the case where the post-bounds-check mask
			// is 0, in which case we must not load.
			T out = T(0);
			If(AnyTrue(mask))
			{
				EL el = *rr::Pointer<EL>(base + staticOffsets[0], alignment);
				out = T(el);
			}
			return out;
		}

		bool zeroMaskedLanes = true;
		switch(robustness)
		{
		case OutOfBoundsBehavior::Nullify:
		case OutOfBoundsBehavior::RobustBufferAccess:  // Must either return an in-bounds value, or zero.
			zeroMaskedLanes = true;
			break;
		case OutOfBoundsBehavior::UndefinedValue:
		case OutOfBoundsBehavior::UndefinedBehavior:
			zeroMaskedLanes = false;
			break;
		}

		// TODO(b/195446858): Optimize static sequential offsets case by using masked load.

		return rr::Gather(rr::Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
	}
	else
	{
		T out;
		auto anyLanesDisabled = AnyFalse(mask);
		If(hasEqualOffsets() && !anyLanesDisabled)
		{
			// Load one, replicate.
			auto offset = Extract(offs, 0);
			out = T(rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order));
		}
		Else If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
		{
			// Load all elements in a single SIMD instruction.
			auto offset = Extract(offs, 0);
			out = rr::Load(rr::Pointer<T>(&base[offset]), alignment, atomic, order);
		}
		Else
		{
			// Divergent offsets or masked lanes.
			out = T(0);
			for(int i = 0; i < SIMD::Width; i++)
			{
				If(Extract(mask, i) != 0)
				{
					auto offset = Extract(offs, i);
					auto el = rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
					out = Insert(out, el, i);
				}
			}
		}
		return out;
	}
}

template<typename T>
inline void SIMD::Pointer::Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
	using EL = typename Element<T>::type;
	constexpr size_t alignment = sizeof(float);
	auto offs = offsets();

	switch(robustness)
	{
	case OutOfBoundsBehavior::Nullify:
	case OutOfBoundsBehavior::RobustBufferAccess:       // TODO: Allows writing anywhere within bounds. Could be faster than masking.
	case OutOfBoundsBehavior::UndefinedValue:           // Should not be used for store operations. Treat as robust buffer access.
		mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds writes.
		break;
	case OutOfBoundsBehavior::UndefinedBehavior:
		// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
		break;
	}

	if(!atomic && order == std::memory_order_relaxed)
	{
		if(hasStaticEqualOffsets())
		{
			If(AnyTrue(mask))
			{
				// All equal. One of these writes will win -- elect the winning lane.
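				// The election below keeps only the lowest-numbered enabled
				// lane: a lane is cleared from `elect` if any lower-indexed
				// lane (gathered by the xxyz/xxxy/xxxx swizzles) is already
				// enabled. OR-ing the masked lanes together then extracts
				// that single lane's value as a scalar.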
				auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
				auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
				auto maskedVal = As<SIMD::Int>(val) & elect;
				auto scalarVal = Extract(maskedVal, 0) |
				                 Extract(maskedVal, 1) |
				                 Extract(maskedVal, 2) |
				                 Extract(maskedVal, 3);
				*rr::Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
			}
		}
		else if(hasStaticSequentialOffsets(sizeof(float)) &&
		        isStaticallyInBounds(sizeof(float), robustness))
		{
			// TODO(b/195446858): Optimize using masked store.
			// Pointer has no elements OOB, and the store is not atomic.
			// Perform a read-modify-write.
			auto p = rr::Pointer<SIMD::Int>(base + staticOffsets[0], alignment);
			auto prev = *p;
			*p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
		}
		else
		{
			rr::Scatter(rr::Pointer<EL>(base), val, offs, mask, alignment);
		}
	}
	else
	{
		auto anyLanesDisabled = AnyFalse(mask);
		If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
		{
			// Store all elements in a single SIMD instruction.
			auto offset = Extract(offs, 0);
			rr::Store(val, rr::Pointer<T>(&base[offset]), alignment, atomic, order);
		}
		Else
		{
			// Divergent offsets or masked lanes.
			for(int i = 0; i < SIMD::Width; i++)
			{
				If(Extract(mask, i) != 0)
				{
					auto offset = Extract(offs, i);
					rr::Store(Extract(val, i), rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
				}
			}
		}
	}
}

template<typename T>
inline void SIMD::Pointer::Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
	Store(T(val), robustness, mask, atomic, order);
}

template<typename T>
inline rr::RValue<T> AndAll(rr::RValue<T> const &mask)
{
	T v1 = mask;               // [x]    [y]    [z]    [w]
	T v2 = v1.xzxz & v1.ywyw;  // [xy]   [zw]   [xy]   [zw]
	return v2.xxxx & v2.yyyy;  // [xyzw] [xyzw] [xyzw] [xyzw]
}

template<typename T>
inline rr::RValue<T> OrAll(rr::RValue<T> const &mask)
{
	T v1 = mask;               // [x]    [y]    [z]    [w]
	T v2 = v1.xzxz | v1.ywyw;  // [xy]   [zw]   [xy]   [zw]
	return v2.xxxx | v2.yyyy;  // [xyzw] [xyzw] [xyzw] [xyzw]
}

}  // namespace sw

#ifdef ENABLE_RR_PRINT
namespace rr {
template<>
struct PrintValue::Ty<sw::Vector4f>
{
	static std::string fmt(const sw::Vector4f &v)
	{
		return "[x: " + PrintValue::fmt(v.x) +
		       ", y: " + PrintValue::fmt(v.y) +
		       ", z: " + PrintValue::fmt(v.z) +
		       ", w: " + PrintValue::fmt(v.w) + "]";
	}

	static std::vector<rr::Value *> val(const sw::Vector4f &v) { return PrintValue::vals(v.x, v.y, v.z, v.w); }
};
template<>
struct PrintValue::Ty<sw::Vector4s>
{
	static std::string fmt(const sw::Vector4s &v)
	{
		return "[x: " + PrintValue::fmt(v.x) +
		       ", y: " + PrintValue::fmt(v.y) +
		       ", z: " + PrintValue::fmt(v.z) +
		       ", w: " + PrintValue::fmt(v.w) + "]";
	}

	static std::vector<rr::Value *> val(const sw::Vector4s &v) { return PrintValue::vals(v.x, v.y, v.z, v.w); }
};
template<>
struct PrintValue::Ty<sw::SIMD::Pointer>
{
	static std::string fmt(const sw::SIMD::Pointer &v)
	{
		return "{" + PrintValue::fmt(v.base) + " +" + PrintValue::fmt(v.offsets()) + "}";
	}

	static std::vector<rr::Value *> val(const sw::SIMD::Pointer &v) { return PrintValue::vals(v.base, v.offsets()); }
};
}  // namespace rr
#endif  // ENABLE_RR_PRINT

#endif  // sw_ShaderCore_hpp