// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef sw_ShaderCore_hpp
#define sw_ShaderCore_hpp

#include "Reactor/Print.hpp"
#include "Reactor/Reactor.hpp"
#include "System/Debug.hpp"

#include <array>
#include <atomic>   // std::memory_order
#include <utility>  // std::pair

namespace sw {

using namespace rr;

class Vector4s
{
public:
	Vector4s();
	Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
	Vector4s(const Vector4s &rhs);

	Short4 &operator[](int i);
	Vector4s &operator=(const Vector4s &rhs);

	Short4 x;
	Short4 y;
	Short4 z;
	Short4 w;
};

class Vector4f
{
public:
	Vector4f();
	Vector4f(float x, float y, float z, float w);
	Vector4f(const Vector4f &rhs);

	Float4 &operator[](int i);
	Vector4f &operator=(const Vector4f &rhs);

	Float4 x;
	Float4 y;
	Float4 z;
	Float4 w;
};

class Vector4i
{
public:
	Vector4i();
	Vector4i(int x, int y, int z, int w);
	Vector4i(const Vector4i &rhs);

	Int4 &operator[](int i);
	Vector4i &operator=(const Vector4i &rhs);

	Int4 x;
	Int4 y;
	Int4 z;
	Int4 w;
};

enum class OutOfBoundsBehavior
{
	Nullify,             // Loads become zero, stores are elided.
	RobustBufferAccess,  // As defined by the Vulkan spec (in short: access anywhere within bounds, or zeroing).
	UndefinedValue,      // Only for load operations. Not secure. No program termination.
	UndefinedBehavior,   // Program may terminate.
};

// SIMD contains types that represent multiple scalars packed into a single
// vector data type. Types in the SIMD namespace provide a semantic hint
// that the data should be treated as per-execution-lane scalars instead of
// a typical Euclidean-style vector type.
namespace SIMD {

// Width is the number of per-lane scalars packed into each SIMD vector.
static constexpr int Width = 4;

using Float = rr::Float4;
using Int = rr::Int4;
using UInt = rr::UInt4;
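
// Operations on these types act independently per lane: for example, adding
// two SIMD::Float values performs Width separate scalar additions, with no
// cross-lane interaction.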

struct Pointer
{
	Pointer(rr::Pointer<Byte> base, rr::Int limit);
	Pointer(rr::Pointer<Byte> base, unsigned int limit);
	Pointer(rr::Pointer<Byte> base, rr::Int limit, SIMD::Int offset);
	Pointer(rr::Pointer<Byte> base, unsigned int limit, SIMD::Int offset);

	Pointer &operator+=(Int i);
	Pointer &operator*=(Int i);

	Pointer operator+(SIMD::Int i);
	Pointer operator*(SIMD::Int i);

	Pointer &operator+=(int i);
	Pointer &operator*=(int i);

	Pointer operator+(int i);
	Pointer operator*(int i);

	SIMD::Int offsets() const;

	SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;

	bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;

	rr::Int limit() const;

	// Returns true if all offsets are sequential
	// (N+0*step, N+1*step, N+2*step, N+3*step)
	rr::Bool hasSequentialOffsets(unsigned int step) const;

	// Returns true if all offsets are compile-time static and
	// sequential (N+0*step, N+1*step, N+2*step, N+3*step)
	bool hasStaticSequentialOffsets(unsigned int step) const;

	// Returns true if all offsets are equal (N, N, N, N)
	rr::Bool hasEqualOffsets() const;

	// Returns true if all offsets are compile-time static and are equal
	// (N, N, N, N)
	bool hasStaticEqualOffsets() const;

	template<typename T>
	inline T Load(OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));

	template<typename T>
	inline void Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);

	template<typename T>
	inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);

	// Base address for the pointer, common across all lanes.
	rr::Pointer<rr::Byte> base;

	// Upper (non-inclusive) limit for offsets from base.
	rr::Int dynamicLimit;  // If hasDynamicLimit is false, dynamicLimit is zero.
	unsigned int staticLimit;

	// Per lane offsets from base.
	SIMD::Int dynamicOffsets;  // If hasDynamicOffsets is false, all dynamicOffsets are zero.
	std::array<int32_t, SIMD::Width> staticOffsets;

	bool hasDynamicLimit;    // True if dynamicLimit is non-zero.
	bool hasDynamicOffsets;  // True if any dynamicOffsets are non-zero.
};
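
// Illustrative use of SIMD::Pointer (a sketch; data, size, and laneMask are
// hypothetical caller-provided values):
//
//   SIMD::Pointer ptr(data, size, SIMD::Int(0, 4, 8, 12));  // four sequential float offsets
//   SIMD::Float v = ptr.Load<SIMD::Float>(OutOfBoundsBehavior::Nullify, laneMask);
//   ptr.Store(v, OutOfBoundsBehavior::Nullify, laneMask);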

template<typename T>
struct Element
{};
template<>
struct Element<Float>
{
	using type = rr::Float;
};
template<>
struct Element<Int>
{
	using type = rr::Int;
};
template<>
struct Element<UInt>
{
	using type = rr::UInt;
};
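
// Element<T> (above) maps a SIMD vector type to its per-lane scalar type;
// for example, Element<SIMD::Float>::type is rr::Float. Pointer::Load() and
// Pointer::Store() use it to access individual lanes.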

}  // namespace SIMD

// Vulkan 'SPIR-V Extended Instructions for GLSL' (GLSL.std.450) compliant transcendental functions
RValue<Float4> Sin(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Cos(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Tan(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Asin(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Acos(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Atan(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Atan2(RValue<Float4> y, RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Exp2(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Log2(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Exp(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Log(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y, bool relaxedPrecision);
RValue<Float4> Sinh(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Cosh(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Tanh(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Asinh(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Acosh(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Atanh(RValue<Float4> x, bool relaxedPrecision);
RValue<Float4> Sqrt(RValue<Float4> x, bool relaxedPrecision);

// Math functions with uses outside of shaders can be invoked using a verbose template argument instead
// of a Boolean argument to indicate precision. For example, Sqrt<Mediump>(x) equals Sqrt(x, true).
enum Precision
{
	Highp,
	Relaxed,
	Mediump = Relaxed,  // GLSL defines mediump and lowp as corresponding with SPIR-V's RelaxedPrecision
};

// clang-format off
template<Precision precision> RValue<Float4> Sqrt(RValue<Float4> x);
template<> inline RValue<Float4> Sqrt<Highp>(RValue<Float4> x) { return Sqrt(x, false); }
template<> inline RValue<Float4> Sqrt<Mediump>(RValue<Float4> x) { return Sqrt(x, true); }

template<Precision precision> RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
template<> inline RValue<Float4> Pow<Highp>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, false); }
template<> inline RValue<Float4> Pow<Mediump>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, true); }
// clang-format on

RValue<Float4> reciprocal(RValue<Float4> x, bool pp = false, bool exactAtPow2 = false);
RValue<Float4> reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);

RValue<Float4> mulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z);  // TODO(chromium:1299047)

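// In-place matrix transposition: rows are passed in and returned through the
// given row registers.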
void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);

sw::SIMD::UInt halfToFloatBits(sw::SIMD::UInt halfBits);
sw::SIMD::UInt floatToHalfBits(sw::SIMD::UInt floatBits, bool storeInUpperBits);
Float4 r11g11b10Unpack(UInt r11g11b10bits);
UInt r11g11b10Pack(const Float4 &value);
Float4 linearToSRGB(const Float4 &c);
Float4 sRGBtoLinear(const Float4 &c);

RValue<Bool> AnyTrue(const RValue<SIMD::Int> &bools);
RValue<Bool> AnyFalse(const RValue<SIMD::Int> &bools);
RValue<Bool> AllTrue(const RValue<SIMD::Int> &bools);
RValue<Bool> AllFalse(const RValue<SIMD::Int> &bools);

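// Divergent returns whether the lanes hold differing values; Uniform returns
// whether all lanes hold the same value.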
RValue<Bool> Divergent(const RValue<SIMD::Int> &ints);
RValue<Bool> Divergent(const RValue<SIMD::Float> &floats);
RValue<Bool> Uniform(const RValue<SIMD::Int> &ints);
RValue<Bool> Uniform(const RValue<SIMD::Float> &floats);

template<typename T>
inline rr::RValue<T> AndAll(rr::RValue<T> const &mask);

template<typename T>
inline rr::RValue<T> OrAll(rr::RValue<T> const &mask);

rr::RValue<sw::SIMD::Float> Sign(rr::RValue<sw::SIMD::Float> const &val);

// Returns the <whole, frac> of val.
// Both whole and frac will have the same sign as val.
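// For example, Modf(-1.25) yields <-1.0, -0.25> in every lane.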
std::pair<rr::RValue<sw::SIMD::Float>, rr::RValue<sw::SIMD::Float>>
Modf(rr::RValue<sw::SIMD::Float> const &val);

// Returns the number of 1s in bits, per lane.
sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits);

// Returns 1 << bits.
// If the resulting bit overflows a 32-bit integer, 0 is returned.
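// For example, NthBit32(5) yields 32 (1 << 5) and NthBit32(32) yields 0, per lane.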
rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits);

// Returns bitCount 1's starting from the LSB.
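// For example, Bitmask32(3) yields 0b111 (7), per lane.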
rr::RValue<sw::SIMD::UInt> Bitmask32(rr::RValue<sw::SIMD::UInt> const &bitCount);

// Computes `a * b + c`, which may be fused into one operation to produce a higher-precision result.
rr::RValue<sw::SIMD::Float> FMA(
    rr::RValue<sw::SIMD::Float> const &a,
    rr::RValue<sw::SIMD::Float> const &b,
    rr::RValue<sw::SIMD::Float> const &c);

// Returns the exponent of the floating point number f.
// Assumes IEEE 754
rr::RValue<sw::SIMD::Int> Exponent(rr::RValue<sw::SIMD::Float> f);

// Returns y if y < x; otherwise result is x.
// If one operand is a NaN, the other operand is the result.
// If both operands are NaN, the result is a NaN.
rr::RValue<sw::SIMD::Float> NMin(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);

// Returns y if y > x; otherwise result is x.
// If one operand is a NaN, the other operand is the result.
// If both operands are NaN, the result is a NaN.
rr::RValue<sw::SIMD::Float> NMax(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);

// Returns the determinant of a 2x2 matrix.
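// That is, for the matrix ((a, b), (c, d)) the result is a*d - b*c, per lane.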
rr::RValue<sw::SIMD::Float> Determinant(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
    rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);

// Returns the determinant of a 3x3 matrix.
rr::RValue<sw::SIMD::Float> Determinant(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
    rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
    rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);

// Returns the determinant of a 4x4 matrix.
rr::RValue<sw::SIMD::Float> Determinant(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
    rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
    rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
    rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);

// Returns the inverse of a 2x2 matrix.
std::array<rr::RValue<sw::SIMD::Float>, 4> MatrixInverse(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
    rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);

// Returns the inverse of a 3x3 matrix.
std::array<rr::RValue<sw::SIMD::Float>, 9> MatrixInverse(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
    rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
    rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);

// Returns the inverse of a 4x4 matrix.
std::array<rr::RValue<sw::SIMD::Float>, 16> MatrixInverse(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
    rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
    rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
    rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);

////////////////////////////////////////////////////////////////////////////
// Inline functions
////////////////////////////////////////////////////////////////////////////

template<typename T>
inline T SIMD::Pointer::Load(OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
{
	using EL = typename Element<T>::type;

	if(isStaticallyInBounds(sizeof(float), robustness))
	{
		// All elements are statically known to be in-bounds.
		// We can avoid costly conditionals on masks.

		if(hasStaticSequentialOffsets(sizeof(float)))
		{
			// Offsets are sequential. Perform regular load.
			return rr::Load(rr::Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
		}

		if(hasStaticEqualOffsets())
		{
			// Load one, replicate.
			return T(*rr::Pointer<EL>(base + staticOffsets[0], alignment));
		}
	}
	else
	{
		switch(robustness)
		{
		case OutOfBoundsBehavior::Nullify:
		case OutOfBoundsBehavior::RobustBufferAccess:
		case OutOfBoundsBehavior::UndefinedValue:
			mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds reads.
			break;
		case OutOfBoundsBehavior::UndefinedBehavior:
			// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
			break;
		}
	}

	auto offs = offsets();

	if(!atomic && order == std::memory_order_relaxed)
	{
		if(hasStaticEqualOffsets())
		{
			// Load one, replicate.
			// Be careful of the case where the post-bounds-check mask
			// is 0, in which case we must not load.
			T out = T(0);
			If(AnyTrue(mask))
			{
				EL el = *rr::Pointer<EL>(base + staticOffsets[0], alignment);
				out = T(el);
			}
			return out;
		}

		bool zeroMaskedLanes = true;
		switch(robustness)
		{
		case OutOfBoundsBehavior::Nullify:
		case OutOfBoundsBehavior::RobustBufferAccess:  // Must either return an in-bounds value, or zero.
			zeroMaskedLanes = true;
			break;
		case OutOfBoundsBehavior::UndefinedValue:
		case OutOfBoundsBehavior::UndefinedBehavior:
			zeroMaskedLanes = false;
			break;
		}

		// TODO(b/195446858): Optimize static sequential offsets case by using masked load.

		return rr::Gather(rr::Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
	}
	else
	{
		T out;
		auto anyLanesDisabled = AnyFalse(mask);
		If(hasEqualOffsets() && !anyLanesDisabled)
		{
			// Load one, replicate.
			auto offset = Extract(offs, 0);
			out = T(rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order));
		}
		Else If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
		{
			// Load all elements in a single SIMD instruction.
			auto offset = Extract(offs, 0);
			out = rr::Load(rr::Pointer<T>(&base[offset]), alignment, atomic, order);
		}
		Else
		{
			// Divergent offsets or masked lanes.
			out = T(0);
			for(int i = 0; i < SIMD::Width; i++)
			{
				If(Extract(mask, i) != 0)
				{
					auto offset = Extract(offs, i);
					auto el = rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
					out = Insert(out, el, i);
				}
			}
		}
		return out;
	}
}

template<typename T>
inline void SIMD::Pointer::Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
	using EL = typename Element<T>::type;
	constexpr size_t alignment = sizeof(float);
	auto offs = offsets();

	switch(robustness)
	{
	case OutOfBoundsBehavior::Nullify:
	case OutOfBoundsBehavior::RobustBufferAccess:       // TODO: Allows writing anywhere within bounds. Could be faster than masking.
	case OutOfBoundsBehavior::UndefinedValue:           // Should not be used for store operations. Treat as robust buffer access.
		mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds writes.
		break;
	case OutOfBoundsBehavior::UndefinedBehavior:
		// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
		break;
	}

	if(!atomic && order == std::memory_order_relaxed)
	{
		if(hasStaticEqualOffsets())
		{
			If(AnyTrue(mask))
			{
				// All equal. One of these writes will win -- elect the winning lane.
				auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
				auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
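				// elect now has at most one lane set: a lane of mask is kept
				// only if no lower-indexed lane is set, so the lowest enabled
				// lane wins the election.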
				auto maskedVal = As<SIMD::Int>(val) & elect;
				auto scalarVal = Extract(maskedVal, 0) |
				                 Extract(maskedVal, 1) |
				                 Extract(maskedVal, 2) |
				                 Extract(maskedVal, 3);
				*rr::Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
			}
		}
		else if(hasStaticSequentialOffsets(sizeof(float)) &&
		        isStaticallyInBounds(sizeof(float), robustness))
		{
			// TODO(b/195446858): Optimize using masked store.
			// Pointer has no elements OOB, and the store is not atomic.
			// Perform a read-modify-write.
			auto p = rr::Pointer<SIMD::Int>(base + staticOffsets[0], alignment);
			auto prev = *p;
			*p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
		}
		else
		{
			rr::Scatter(rr::Pointer<EL>(base), val, offs, mask, alignment);
		}
	}
	else
	{
		auto anyLanesDisabled = AnyFalse(mask);
		If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
		{
			// Store all elements in a single SIMD instruction.
			auto offset = Extract(offs, 0);
			rr::Store(val, rr::Pointer<T>(&base[offset]), alignment, atomic, order);
		}
		Else
		{
			// Divergent offsets or masked lanes.
			for(int i = 0; i < SIMD::Width; i++)
			{
				If(Extract(mask, i) != 0)
				{
					auto offset = Extract(offs, i);
					rr::Store(Extract(val, i), rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
				}
			}
		}
	}
}

template<typename T>
inline void SIMD::Pointer::Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
	Store(T(val), robustness, mask, atomic, order);
}

template<typename T>
inline rr::RValue<T> AndAll(rr::RValue<T> const &mask)
{
	T v1 = mask;               // [x]    [y]    [z]    [w]
	T v2 = v1.xzxz & v1.ywyw;  // [xy]   [zw]   [xy]   [zw]
	return v2.xxxx & v2.yyyy;  // [xyzw] [xyzw] [xyzw] [xyzw]
}

template<typename T>
inline rr::RValue<T> OrAll(rr::RValue<T> const &mask)
{
	T v1 = mask;               // [x]    [y]    [z]    [w]
	T v2 = v1.xzxz | v1.ywyw;  // [xy]   [zw]   [xy]   [zw]
	return v2.xxxx | v2.yyyy;  // [xyzw] [xyzw] [xyzw] [xyzw]
}

}  // namespace sw

#ifdef ENABLE_RR_PRINT
namespace rr {
template<>
struct PrintValue::Ty<sw::Vector4f>
{
	static std::string fmt(const sw::Vector4f &v)
	{
		return "[x: " + PrintValue::fmt(v.x) +
		       ", y: " + PrintValue::fmt(v.y) +
		       ", z: " + PrintValue::fmt(v.z) +
		       ", w: " + PrintValue::fmt(v.w) + "]";
	}

	static std::vector<rr::Value *> val(const sw::Vector4f &v)
	{
		return PrintValue::vals(v.x, v.y, v.z, v.w);
	}
};
template<>
struct PrintValue::Ty<sw::Vector4s>
{
	static std::string fmt(const sw::Vector4s &v)
	{
		return "[x: " + PrintValue::fmt(v.x) +
		       ", y: " + PrintValue::fmt(v.y) +
		       ", z: " + PrintValue::fmt(v.z) +
		       ", w: " + PrintValue::fmt(v.w) + "]";
	}

	static std::vector<rr::Value *> val(const sw::Vector4s &v)
	{
		return PrintValue::vals(v.x, v.y, v.z, v.w);
	}
};
template<>
struct PrintValue::Ty<sw::SIMD::Pointer>
{
	static std::string fmt(const sw::SIMD::Pointer &v)
	{
		return "{" + PrintValue::fmt(v.base) + " +" + PrintValue::fmt(v.offsets()) + "}";
	}

	static std::vector<rr::Value *> val(const sw::SIMD::Pointer &v)
	{
		return PrintValue::vals(v.base, v.offsets());
	}
};
}  // namespace rr
#endif  // ENABLE_RR_PRINT

#endif  // sw_ShaderCore_hpp