// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef sw_ShaderCore_hpp
#define sw_ShaderCore_hpp

#include "Reactor/Print.hpp"
#include "Reactor/Reactor.hpp"
#include "System/Debug.hpp"

#include <array>
#include <atomic>   // std::memory_order
#include <utility>  // std::pair

namespace sw {

using namespace rr;

class Vector4s
{
public:
    Vector4s();
    Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
    Vector4s(const Vector4s &rhs);

    Short4 &operator[](int i);
    Vector4s &operator=(const Vector4s &rhs);

    Short4 x;
    Short4 y;
    Short4 z;
    Short4 w;
};

class Vector4f
{
public:
    Vector4f();
    Vector4f(float x, float y, float z, float w);
    Vector4f(const Vector4f &rhs);

    Float4 &operator[](int i);
    Vector4f &operator=(const Vector4f &rhs);

    Float4 x;
    Float4 y;
    Float4 z;
    Float4 w;
};

class Vector4i
{
public:
    Vector4i();
    Vector4i(int x, int y, int z, int w);
    Vector4i(const Vector4i &rhs);

    Int4 &operator[](int i);
    Vector4i &operator=(const Vector4i &rhs);

    Int4 x;
    Int4 y;
    Int4 z;
    Int4 w;
};

enum class OutOfBoundsBehavior
{
    Nullify,             // Loads become zero, stores are elided.
    RobustBufferAccess,  // As defined by the Vulkan spec (in short: access anywhere within bounds, or zeroing).
    UndefinedValue,      // Only for load operations. Not secure. No program termination.
    UndefinedBehavior,   // Program may terminate.
};

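// Illustrative example (exposition only; 'robustBufferAccess' is a placeholder
// flag, not part of this interface): a caller emitting buffer loads might pick
// the behavior like this:
//
//   OutOfBoundsBehavior robustness = robustBufferAccess
//                                        ? OutOfBoundsBehavior::RobustBufferAccess
//                                        : OutOfBoundsBehavior::UndefinedValue;
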
// SIMD contains types that represent multiple scalars packed into a single
// vector data type. Types in the SIMD namespace provide a semantic hint
// that the data should be treated as a per-execution-lane scalar instead of
// a typical euclidean-style vector type.
namespace SIMD {

// Width is the number of per-lane scalars packed into each SIMD vector.
static constexpr int Width = 4;

using Float = rr::Float4;
using Int = rr::Int4;
using UInt = rr::UInt4;

struct Pointer
{
    Pointer(rr::Pointer<Byte> base, rr::Int limit);
    Pointer(rr::Pointer<Byte> base, unsigned int limit);
    Pointer(rr::Pointer<Byte> base, rr::Int limit, SIMD::Int offset);
    Pointer(rr::Pointer<Byte> base, unsigned int limit, SIMD::Int offset);

    Pointer &operator+=(Int i);
    Pointer &operator*=(Int i);

    Pointer operator+(SIMD::Int i);
    Pointer operator*(SIMD::Int i);

    Pointer &operator+=(int i);
    Pointer &operator*=(int i);

    Pointer operator+(int i);
    Pointer operator*(int i);

    SIMD::Int offsets() const;

    SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
    bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;

    rr::Int limit() const;

    // Returns true if all offsets are sequential
    // (N+0*step, N+1*step, N+2*step, N+3*step)
    rr::Bool hasSequentialOffsets(unsigned int step) const;

    // Returns true if all offsets are compile-time static and
    // sequential (N+0*step, N+1*step, N+2*step, N+3*step)
    bool hasStaticSequentialOffsets(unsigned int step) const;

    // Returns true if all offsets are equal (N, N, N, N)
    rr::Bool hasEqualOffsets() const;

    // Returns true if all offsets are compile-time static and are equal
    // (N, N, N, N)
    bool hasStaticEqualOffsets() const;

    template<typename T>
    inline T Load(OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));

    template<typename T>
    inline void Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);

    template<typename T>
    inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);

    // Base address for the pointer, common across all lanes.
    rr::Pointer<rr::Byte> base;

    // Upper (non-inclusive) limit for offsets from base.
    rr::Int dynamicLimit;  // If hasDynamicLimit is false, dynamicLimit is zero.
    unsigned int staticLimit;

    // Per lane offsets from base.
    SIMD::Int dynamicOffsets;  // If hasDynamicOffsets is false, all dynamicOffsets are zero.
    std::array<int32_t, SIMD::Width> staticOffsets;

    bool hasDynamicLimit;    // True if dynamicLimit is non-zero.
    bool hasDynamicOffsets;  // True if any dynamicOffsets are non-zero.
};

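// Illustrative sketch of typical SIMD::Pointer usage (the identifiers
// 'bufferBase', 'bufferSize', 'laneOffsets' and 'activeLaneMask' are
// placeholders for this example only):
//
//   SIMD::Pointer ptr(bufferBase, bufferSize, laneOffsets);
//   SIMD::Float v = ptr.Load<SIMD::Float>(OutOfBoundsBehavior::RobustBufferAccess, activeLaneMask);
//   ptr.Store(v, OutOfBoundsBehavior::RobustBufferAccess, activeLaneMask);
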
template<typename T>
struct Element
{};
template<>
struct Element<Float>
{
    using type = rr::Float;
};
template<>
struct Element<Int>
{
    using type = rr::Int;
};
template<>
struct Element<UInt>
{
    using type = rr::UInt;
};

}  // namespace SIMD

Float4 exponential2(RValue<Float4> x, bool pp = false);
Float4 logarithm2(RValue<Float4> x, bool pp = false);
Float4 exponential(RValue<Float4> x, bool pp = false);
Float4 logarithm(RValue<Float4> x, bool pp = false);
Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp = false);
Float4 reciprocal(RValue<Float4> x, bool pp = false, bool finite = false, bool exactAtPow2 = false);
Float4 reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);
Float4 modulo(RValue<Float4> x, RValue<Float4> y);
Float4 sine_pi(RValue<Float4> x, bool pp = false);    // limited to [-pi, pi] range
Float4 cosine_pi(RValue<Float4> x, bool pp = false);  // limited to [-pi, pi] range
Float4 sine(RValue<Float4> x, bool pp = false);
Float4 cosine(RValue<Float4> x, bool pp = false);
Float4 tangent(RValue<Float4> x, bool pp = false);
Float4 arccos(RValue<Float4> x, bool pp = false);
Float4 arcsin(RValue<Float4> x, bool pp = false);
Float4 arctan(RValue<Float4> x, bool pp = false);
Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp = false);
Float4 sineh(RValue<Float4> x, bool pp = false);
Float4 cosineh(RValue<Float4> x, bool pp = false);
Float4 tangenth(RValue<Float4> x, bool pp = false);
Float4 arccosh(RValue<Float4> x, bool pp = false);  // Limited to x >= 1
Float4 arcsinh(RValue<Float4> x, bool pp = false);
Float4 arctanh(RValue<Float4> x, bool pp = false);  // Limited to ]-1, 1[ range

Float4 dot2(const Vector4f &v0, const Vector4f &v1);
Float4 dot3(const Vector4f &v0, const Vector4f &v1);
Float4 dot4(const Vector4f &v0, const Vector4f &v1);

void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);

sw::SIMD::UInt halfToFloatBits(sw::SIMD::UInt halfBits);
sw::SIMD::UInt floatToHalfBits(sw::SIMD::UInt floatBits, bool storeInUpperBits);
Float4 r11g11b10Unpack(UInt r11g11b10bits);
UInt r11g11b10Pack(const Float4 &value);
Vector4s a2b10g10r10Unpack(const Int4 &value);
Vector4s a2r10g10b10Unpack(const Int4 &value);

rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints);

rr::RValue<rr::Bool> AnyFalse(rr::RValue<sw::SIMD::Int> const &ints);

template<typename T>
inline rr::RValue<T> AndAll(rr::RValue<T> const &mask);

template<typename T>
inline rr::RValue<T> OrAll(rr::RValue<T> const &mask);

rr::RValue<sw::SIMD::Float> Sign(rr::RValue<sw::SIMD::Float> const &val);

// Returns the <whole, frac> of val.
// Both whole and frac will have the same sign as val.
std::pair<rr::RValue<sw::SIMD::Float>, rr::RValue<sw::SIMD::Float>>
Modf(rr::RValue<sw::SIMD::Float> const &val);

// Returns the number of 1s in bits, per lane.
sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits);

// Returns 1 << bits.
// If the resulting bit overflows a 32 bit integer, 0 is returned.
rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits);

// Returns bitCount number of 1's starting from the LSB.
rr::RValue<sw::SIMD::UInt> Bitmask32(rr::RValue<sw::SIMD::UInt> const &bitCount);

// Performs a fused-multiply add, returning a * b + c.
rr::RValue<sw::SIMD::Float> FMA(
    rr::RValue<sw::SIMD::Float> const &a,
    rr::RValue<sw::SIMD::Float> const &b,
    rr::RValue<sw::SIMD::Float> const &c);

// Returns the exponent of the floating point number f.
// Assumes IEEE 754
rr::RValue<sw::SIMD::Int> Exponent(rr::RValue<sw::SIMD::Float> f);

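// Illustrative per-lane values for the helpers above (exposition only):
//   Modf(1.75f)   -> { 1.0f, 0.75f }  (whole and frac keep val's sign)
//   NthBit32(5)   -> 0x20             (1 << 5)
//   NthBit32(32)  -> 0                (overflows 32 bits)
//   Bitmask32(5)  -> 0x1F             (five 1's starting at the LSB)
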
// Returns y if y < x; otherwise result is x.
// If one operand is a NaN, the other operand is the result.
// If both operands are NaN, the result is a NaN.
rr::RValue<sw::SIMD::Float> NMin(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);

// Returns y if y > x; otherwise result is x.
// If one operand is a NaN, the other operand is the result.
// If both operands are NaN, the result is a NaN.
rr::RValue<sw::SIMD::Float> NMax(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);

// Returns the determinant of a 2x2 matrix.
rr::RValue<sw::SIMD::Float> Determinant(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
    rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);

// Returns the determinant of a 3x3 matrix.
rr::RValue<sw::SIMD::Float> Determinant(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
    rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
    rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);

// Returns the determinant of a 4x4 matrix.
rr::RValue<sw::SIMD::Float> Determinant(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
    rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
    rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
    rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);

// Returns the inverse of a 2x2 matrix.
std::array<rr::RValue<sw::SIMD::Float>, 4> MatrixInverse(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
    rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);

// Returns the inverse of a 3x3 matrix.
std::array<rr::RValue<sw::SIMD::Float>, 9> MatrixInverse(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
    rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
    rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);

// Returns the inverse of a 4x4 matrix.
std::array<rr::RValue<sw::SIMD::Float>, 16> MatrixInverse(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
    rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
    rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
    rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);

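// Illustrative per-lane behavior of the helpers above (exposition only):
//   NMin(3.0f, NaN)          -> 3.0f   (the non-NaN operand is the result)
//   Determinant(a, b, c, d)  -> a*d - b*c for the 2x2 case
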
////////////////////////////////////////////////////////////////////////////
// Inline functions
////////////////////////////////////////////////////////////////////////////

template<typename T>
inline T SIMD::Pointer::Load(OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
{
    using EL = typename Element<T>::type;

    if(isStaticallyInBounds(sizeof(float), robustness))
    {
        // All elements are statically known to be in-bounds.
        // We can avoid costly conditional on masks.

        if(hasStaticSequentialOffsets(sizeof(float)))
        {
            // Offsets are sequential. Perform regular load.
            return rr::Load(rr::Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
        }
        if(hasStaticEqualOffsets())
        {
            // Load one, replicate.
            return T(*rr::Pointer<EL>(base + staticOffsets[0], alignment));
        }
    }
    else
    {
        switch(robustness)
        {
            case OutOfBoundsBehavior::Nullify:
            case OutOfBoundsBehavior::RobustBufferAccess:
            case OutOfBoundsBehavior::UndefinedValue:
                mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds reads.
                break;
            case OutOfBoundsBehavior::UndefinedBehavior:
                // Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
                break;
        }
    }

    auto offs = offsets();

    if(!atomic && order == std::memory_order_relaxed)
    {
        if(hasStaticEqualOffsets())
        {
            // Load one, replicate.
            // Be careful of the case where the post-bounds-check mask
            // is 0, in which case we must not load.
            T out = T(0);
            If(AnyTrue(mask))
            {
                EL el = *rr::Pointer<EL>(base + staticOffsets[0], alignment);
                out = T(el);
            }
            return out;
        }

        bool zeroMaskedLanes = true;
        switch(robustness)
        {
            case OutOfBoundsBehavior::Nullify:
            case OutOfBoundsBehavior::RobustBufferAccess:  // Must either return an in-bounds value, or zero.
                zeroMaskedLanes = true;
                break;
            case OutOfBoundsBehavior::UndefinedValue:
            case OutOfBoundsBehavior::UndefinedBehavior:
                zeroMaskedLanes = false;
                break;
        }

        if(hasStaticSequentialOffsets(sizeof(float)))
        {
            return rr::MaskedLoad(rr::Pointer<T>(base + staticOffsets[0]), mask, alignment, zeroMaskedLanes);
        }

        return rr::Gather(rr::Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
    }
    else
    {
        T out;
        auto anyLanesDisabled = AnyFalse(mask);
        If(hasEqualOffsets() && !anyLanesDisabled)
        {
            // Load one, replicate.
            auto offset = Extract(offs, 0);
            out = T(rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order));
        }
        Else If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
        {
            // Load all elements in a single SIMD instruction.
            auto offset = Extract(offs, 0);
            out = rr::Load(rr::Pointer<T>(&base[offset]), alignment, atomic, order);
        }
        Else
        {
            // Divergent offsets or masked lanes.
            out = T(0);
            for(int i = 0; i < SIMD::Width; i++)
            {
                If(Extract(mask, i) != 0)
                {
                    auto offset = Extract(offs, i);
                    auto el = rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
                    out = Insert(out, el, i);
                }
            }
        }
        return out;
    }
}

template<typename T>
inline void SIMD::Pointer::Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
    using EL = typename Element<T>::type;
    constexpr size_t alignment = sizeof(float);
    auto offs = offsets();

    switch(robustness)
    {
        case OutOfBoundsBehavior::Nullify:
        case OutOfBoundsBehavior::RobustBufferAccess:       // TODO: Allows writing anywhere within bounds. Could be faster than masking.
        case OutOfBoundsBehavior::UndefinedValue:           // Should not be used for store operations. Treat as robust buffer access.
            mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds writes.
            break;
        case OutOfBoundsBehavior::UndefinedBehavior:
            // Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
            break;
    }

    if(!atomic && order == std::memory_order_relaxed)
    {
        if(hasStaticEqualOffsets())
        {
            If(AnyTrue(mask))
            {
                // All equal. One of these writes will win -- elect the winning lane.
                auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
                auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
                auto maskedVal = As<SIMD::Int>(val) & elect;
                auto scalarVal = Extract(maskedVal, 0) |
                                 Extract(maskedVal, 1) |
                                 Extract(maskedVal, 2) |
                                 Extract(maskedVal, 3);
                *rr::Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
            }
        }
        else if(hasStaticSequentialOffsets(sizeof(float)))
        {
            if(isStaticallyInBounds(sizeof(float), robustness))
            {
                // Pointer has no elements OOB, and the store is not atomic.
                // Perform a RMW.
                auto p = rr::Pointer<SIMD::Int>(base + staticOffsets[0], alignment);
                auto prev = *p;
                *p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
            }
            else
            {
                rr::MaskedStore(rr::Pointer<T>(base + staticOffsets[0]), val, mask, alignment);
            }
        }
        else
        {
            rr::Scatter(rr::Pointer<EL>(base), val, offs, mask, alignment);
        }
    }
    else
    {
        auto anyLanesDisabled = AnyFalse(mask);
        If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
        {
            // Store all elements in a single SIMD instruction.
            auto offset = Extract(offs, 0);
            rr::Store(val, rr::Pointer<T>(&base[offset]), alignment, atomic, order);
        }
        Else
        {
            // Divergent offsets or masked lanes.
            for(int i = 0; i < SIMD::Width; i++)
            {
                If(Extract(mask, i) != 0)
                {
                    auto offset = Extract(offs, i);
                    rr::Store(Extract(val, i), rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
                }
            }
        }
    }
}

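// Illustrative sketch (exposition only; 'ptr', 'value' and 'mask' are
// placeholders): an atomic store forwards 'atomic' and 'order' to the
// underlying rr::Store calls above:
//
//   ptr.Store(value, OutOfBoundsBehavior::Nullify, mask,
//             /* atomic = */ true, std::memory_order_release);
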
template<typename T>
inline void SIMD::Pointer::Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
    Store(T(val), robustness, mask, atomic, order);
}

template<typename T>
inline rr::RValue<T> AndAll(rr::RValue<T> const &mask)
{
    T v1 = mask;               // [x]    [y]    [z]    [w]
    T v2 = v1.xzxz & v1.ywyw;  // [xy]   [zw]   [xy]   [zw]
    return v2.xxxx & v2.yyyy;  // [xyzw] [xyzw] [xyzw] [xyzw]
}

template<typename T>
inline rr::RValue<T> OrAll(rr::RValue<T> const &mask)
{
    T v1 = mask;               // [x]    [y]    [z]    [w]
    T v2 = v1.xzxz | v1.ywyw;  // [xy]   [zw]   [xy]   [zw]
    return v2.xxxx | v2.yyyy;  // [xyzw] [xyzw] [xyzw] [xyzw]
}

}  // namespace sw

#ifdef ENABLE_RR_PRINT
namespace rr {
template<>
struct PrintValue::Ty<sw::Vector4f>
{
    static std::string fmt(const sw::Vector4f &v)
    {
        return "[x: " + PrintValue::fmt(v.x) +
               ", y: " + PrintValue::fmt(v.y) +
               ", z: " + PrintValue::fmt(v.z) +
               ", w: " + PrintValue::fmt(v.w) + "]";
    }

    static std::vector<rr::Value *> val(const sw::Vector4f &v)
    {
        return PrintValue::vals(v.x, v.y, v.z, v.w);
    }
};
template<>
struct PrintValue::Ty<sw::Vector4s>
{
    static std::string fmt(const sw::Vector4s &v)
    {
        return "[x: " + PrintValue::fmt(v.x) +
               ", y: " + PrintValue::fmt(v.y) +
               ", z: " + PrintValue::fmt(v.z) +
               ", w: " + PrintValue::fmt(v.w) + "]";
    }

    static std::vector<rr::Value *> val(const sw::Vector4s &v)
    {
        return PrintValue::vals(v.x, v.y, v.z, v.w);
    }
};
template<>
struct PrintValue::Ty<sw::SIMD::Pointer>
{
    static std::string fmt(const sw::SIMD::Pointer &v)
    {
        return "{" + PrintValue::fmt(v.base) + " +" + PrintValue::fmt(v.offsets()) + "}";
    }

    static std::vector<rr::Value *> val(const sw::SIMD::Pointer &v)
    {
        return PrintValue::vals(v.base, v.offsets());
    }
};

}  // namespace rr
#endif  // ENABLE_RR_PRINT

#endif  // sw_ShaderCore_hpp