// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "VertexRoutine.hpp"

#include "Constants.hpp"
#include "SpirvShader.hpp"
#include "Device/Renderer.hpp"
#include "Device/Vertex.hpp"
#include "System/Debug.hpp"
#include "System/Half.hpp"
#include "Vulkan/VkDevice.hpp"

namespace sw {

VertexRoutine::VertexRoutine(
    const VertexProcessor::State &state,
    vk::PipelineLayout const *pipelineLayout,
    SpirvShader const *spirvShader)
    : routine(pipelineLayout)
    , state(state)
    , spirvShader(spirvShader)
{
	spirvShader->emitProlog(&routine);
}

VertexRoutine::~VertexRoutine()
{
}

void VertexRoutine::generate()
{
	Pointer<Byte> cache = task + OFFSET(VertexTask, vertexCache);
	Pointer<Byte> vertexCache = cache + OFFSET(VertexCache, vertex);
	Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache, tag));

	UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask, vertexCount));

	constants = device + OFFSET(vk::Device, constants);

	// Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer.
	// On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache
	// in reverse order to guarantee that the first one doesn't get evicted and can be written out.

	Do
	{
		UInt index = *batch;
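		// The cache is direct-mapped: TAG_MASK selects the slot, and the full index stored in tagCache acts as the tag.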
		UInt cacheIndex = index & VertexCache::TAG_MASK;

		If(tagCache[cacheIndex] != index)
		{
			readInput(batch);
			program(batch, vertexCount);
			computeClipFlags();
			computeCullMask();

			writeCache(vertexCache, tagCache, batch);
		}

		Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));

		// For points, vertexCount is 1 per primitive, so duplicate vertex for all 3 vertices of the primitive
		for(int i = 0; i < (state.isPoint ? 3 : 1); i++)
		{
			writeVertex(vertex, cacheEntry);
			vertex += sizeof(Vertex);
		}

		batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
		vertexCount--;
	}
	Until(vertexCount == 0);

	Return();
}

void VertexRoutine::readInput(Pointer<UInt> &batch)
{
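	// Attributes are gathered one vec4 location (four scalar components) at a time.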
	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
	{
		if(spirvShader->inputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->inputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->inputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->inputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
		{
			Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void *) * (i / 4));
			UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
			Int baseVertex = *Pointer<Int>(data + OFFSET(DrawData, baseVertex));
			UInt robustnessSize(0);
			if(state.robustBufferAccess)
			{
				robustnessSize = *Pointer<UInt>(data + OFFSET(DrawData, robustnessSize) + sizeof(uint32_t) * (i / 4));
			}

			auto value = readStream(input, stride, state.input[i / 4], batch, state.robustBufferAccess, robustnessSize, baseVertex);
			routine.inputs[i + 0] = value.x;
			routine.inputs[i + 1] = value.y;
			routine.inputs[i + 2] = value.z;
			routine.inputs[i + 3] = value.w;
		}
	}
}

void VertexRoutine::computeClipFlags()
{
	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
	if(it != spirvShader->outputBuiltins.end())
	{
		ASSERT(it->second.SizeInComponents == 4);
		auto &pos = routine.getVariable(it->second.Id);
		auto posX = pos[it->second.FirstComponent + 0];
		auto posY = pos[it->second.FirstComponent + 1];
		auto posZ = pos[it->second.FirstComponent + 2];
		auto posW = pos[it->second.FirstComponent + 3];

		Int4 maxX = CmpLT(posW, posX);
		Int4 maxY = CmpLT(posW, posY);
		Int4 minX = CmpNLE(-posW, posX);
		Int4 minY = CmpNLE(-posW, posY);

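		// SignMask packs the four lanes' comparison results into a 4-bit index, which the constants
		// tables translate into per-lane clip flag bits.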
		clipFlags = Pointer<Int>(constants + OFFSET(Constants, maxX))[SignMask(maxX)];
		clipFlags |= Pointer<Int>(constants + OFFSET(Constants, maxY))[SignMask(maxY)];
		clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minX))[SignMask(minX)];
		clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minY))[SignMask(minY)];
		if(state.depthClipEnable)
		{
			Int4 maxZ = CmpLT(posW, posZ);
			Int4 minZ = CmpNLE(Float4(0.0f), posZ);
			clipFlags |= Pointer<Int>(constants + OFFSET(Constants, maxZ))[SignMask(maxZ)];
			clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minZ))[SignMask(minZ)];
		}

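		// 0x7F7FFFFF is the bit pattern of FLT_MAX; coordinates of greater magnitude (infinities and NaNs)
		// get flagged as non-finite.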
		Float4 maxPos = As<Float4>(Int4(0x7F7FFFFF));
		Int4 finiteX = CmpLE(Abs(posX), maxPos);
		Int4 finiteY = CmpLE(Abs(posY), maxPos);
		Int4 finiteZ = CmpLE(Abs(posZ), maxPos);

		Int4 finiteXYZ = finiteX & finiteY & finiteZ;
		clipFlags |= Pointer<Int>(constants + OFFSET(Constants, fini))[SignMask(finiteXYZ)];
	}
}

void VertexRoutine::computeCullMask()
{
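	// Start with all four lanes visible (bits 0..3 set); each cull distance can only clear bits.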
	cullMask = Int(15);

	auto it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputCullDistances();
		for(uint32_t i = 0; i < count; i++)
		{
			auto const &distance = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			auto mask = SignMask(CmpGE(distance, SIMD::Float(0)));
			cullMask &= mask;
		}
	}
}

Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
                                   bool robustBufferAccess, UInt &robustnessSize, Int baseVertex)
{
	Vector4f v;
	// Because of the following rule in the Vulkan spec, we do not care if a very large negative
	// baseVertex would overflow all the way back into a valid region of the index buffer:
	// "Out-of-bounds buffer loads will return any of the following values :
	//  - Values from anywhere within the memory range(s) bound to the buffer (possibly including
	//    bytes of memory past the end of the buffer, up to the end of the bound range)."
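	// Per-lane byte offsets: (vertex index + baseVertex) * stride for each of the four vertices in the batch.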
	UInt4 offsets = (*Pointer<UInt4>(As<Pointer<UInt4>>(batch)) + As<UInt4>(Int4(baseVertex))) * UInt4(stride);

	Pointer<Byte> source0 = buffer + offsets.x;
	Pointer<Byte> source1 = buffer + offsets.y;
	Pointer<Byte> source2 = buffer + offsets.z;
	Pointer<Byte> source3 = buffer + offsets.w;

	vk::Format format(stream.format);

	UInt4 zero(0);
	if(robustBufferAccess)
	{
		// Prevent integer overflow on the addition below.
		offsets = Min(offsets, UInt4(robustnessSize));

		// "vertex input attributes are considered out of bounds if the offset of the attribute
		//  in the bound vertex buffer range plus the size of the attribute is greater than ..."
		UInt4 limits = offsets + UInt4(format.bytes());

		Pointer<Byte> zeroSource = As<Pointer<Byte>>(&zero);
		// TODO(b/141124876): Optimize for wide-vector gather operations.
		source0 = IfThenElse(limits.x > robustnessSize, zeroSource, source0);
		source1 = IfThenElse(limits.y > robustnessSize, zeroSource, source1);
		source2 = IfThenElse(limits.z > robustnessSize, zeroSource, source2);
		source3 = IfThenElse(limits.w > robustnessSize, zeroSource, source3);
	}

	int componentCount = format.componentCount();
	bool normalized = !format.isUnnormalizedInteger();
	bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || normalized;
	bool bgra = false;

	switch(stream.format)
	{
	case VK_FORMAT_R32_SFLOAT:
	case VK_FORMAT_R32G32_SFLOAT:
	case VK_FORMAT_R32G32B32_SFLOAT:
	case VK_FORMAT_R32G32B32A32_SFLOAT:
		{
			if(componentCount == 0)
			{
				// Null stream, all default components
			}
			else
			{
				if(componentCount == 1)
				{
					v.x.x = *Pointer<Float>(source0);
					v.x.y = *Pointer<Float>(source1);
					v.x.z = *Pointer<Float>(source2);
					v.x.w = *Pointer<Float>(source3);
				}
				else
				{
					v.x = *Pointer<Float4>(source0);
					v.y = *Pointer<Float4>(source1);
					v.z = *Pointer<Float4>(source2);
					v.w = *Pointer<Float4>(source3);

					transpose4xN(v.x, v.y, v.z, v.w, componentCount);
				}
			}
		}
		break;
	case VK_FORMAT_B8G8R8A8_UNORM:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_R8_UNORM:
	case VK_FORMAT_R8G8_UNORM:
	case VK_FORMAT_R8G8B8A8_UNORM:
	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
		v.x = Float4(*Pointer<Byte4>(source0));
		v.y = Float4(*Pointer<Byte4>(source1));
		v.z = Float4(*Pointer<Byte4>(source2));
		v.w = Float4(*Pointer<Byte4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);

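		// Scale to [0.0, 1.0]; unscaleByte presumably holds 1/255 replicated across all four lanes.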
		if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
		if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
		if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
		if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
		break;
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
		v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
		v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
		v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
		v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R8_SNORM:
	case VK_FORMAT_R8G8_SNORM:
	case VK_FORMAT_R8G8B8A8_SNORM:
	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
		v.x = Float4(*Pointer<SByte4>(source0));
		v.y = Float4(*Pointer<SByte4>(source1));
		v.z = Float4(*Pointer<SByte4>(source2));
		v.w = Float4(*Pointer<SByte4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);

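		// Scale to [-1.0, 1.0] and clamp, since the most negative byte (-128) would otherwise map just below -1.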
		if(componentCount >= 1) v.x = Max(v.x * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
		if(componentCount >= 2) v.y = Max(v.y * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
		if(componentCount >= 3) v.z = Max(v.z * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
		if(componentCount >= 4) v.w = Max(v.w * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
		break;
	case VK_FORMAT_R8_USCALED:
	case VK_FORMAT_R8G8_USCALED:
	case VK_FORMAT_R8G8B8A8_USCALED:
	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
		v.x = Float4(*Pointer<Byte4>(source0));
		v.y = Float4(*Pointer<Byte4>(source1));
		v.z = Float4(*Pointer<Byte4>(source2));
		v.w = Float4(*Pointer<Byte4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R8_SSCALED:
	case VK_FORMAT_R8G8_SSCALED:
	case VK_FORMAT_R8G8B8A8_SSCALED:
	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
		v.x = Float4(*Pointer<SByte4>(source0));
		v.y = Float4(*Pointer<SByte4>(source1));
		v.z = Float4(*Pointer<SByte4>(source2));
		v.w = Float4(*Pointer<SByte4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
		v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
		v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
		v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
		v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R16_UNORM:
	case VK_FORMAT_R16G16_UNORM:
	case VK_FORMAT_R16G16B16A16_UNORM:
		v.x = Float4(*Pointer<UShort4>(source0));
		v.y = Float4(*Pointer<UShort4>(source1));
		v.z = Float4(*Pointer<UShort4>(source2));
		v.w = Float4(*Pointer<UShort4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);

		if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
		if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
		if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
		if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
		break;
	case VK_FORMAT_R16_SNORM:
	case VK_FORMAT_R16G16_SNORM:
	case VK_FORMAT_R16G16B16A16_SNORM:
		v.x = Float4(*Pointer<Short4>(source0));
		v.y = Float4(*Pointer<Short4>(source1));
		v.z = Float4(*Pointer<Short4>(source2));
		v.w = Float4(*Pointer<Short4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);

		if(componentCount >= 1) v.x = Max(v.x * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
		if(componentCount >= 2) v.y = Max(v.y * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
		if(componentCount >= 3) v.z = Max(v.z * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
		if(componentCount >= 4) v.w = Max(v.w * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
		break;
	case VK_FORMAT_R16_USCALED:
	case VK_FORMAT_R16G16_USCALED:
	case VK_FORMAT_R16G16B16A16_USCALED:
		v.x = Float4(*Pointer<UShort4>(source0));
		v.y = Float4(*Pointer<UShort4>(source1));
		v.z = Float4(*Pointer<UShort4>(source2));
		v.w = Float4(*Pointer<UShort4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R16_SSCALED:
	case VK_FORMAT_R16G16_SSCALED:
	case VK_FORMAT_R16G16B16A16_SSCALED:
		v.x = Float4(*Pointer<Short4>(source0));
		v.y = Float4(*Pointer<Short4>(source1));
		v.z = Float4(*Pointer<Short4>(source2));
		v.w = Float4(*Pointer<Short4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16G16B16A16_SINT:
		v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
		v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
		v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
		v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R16_UINT:
	case VK_FORMAT_R16G16_UINT:
	case VK_FORMAT_R16G16B16A16_UINT:
		v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
		v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
		v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
		v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R32_SINT:
	case VK_FORMAT_R32G32_SINT:
	case VK_FORMAT_R32G32B32_SINT:
	case VK_FORMAT_R32G32B32A32_SINT:
		v.x = *Pointer<Float4>(source0);
		v.y = *Pointer<Float4>(source1);
		v.z = *Pointer<Float4>(source2);
		v.w = *Pointer<Float4>(source3);

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R32_UINT:
	case VK_FORMAT_R32G32_UINT:
	case VK_FORMAT_R32G32B32_UINT:
	case VK_FORMAT_R32G32B32A32_UINT:
		v.x = *Pointer<Float4>(source0);
		v.y = *Pointer<Float4>(source1);
		v.z = *Pointer<Float4>(source2);
		v.w = *Pointer<Float4>(source3);

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R16_SFLOAT:
	case VK_FORMAT_R16G16_SFLOAT:
	case VK_FORMAT_R16G16B16A16_SFLOAT:
		{
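			// Half-floats are converted through the half2float lookup table, indexed by the
			// raw 16-bit pattern (4 bytes per float entry).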
			if(componentCount >= 1)
			{
				UShort x0 = *Pointer<UShort>(source0 + 0);
				UShort x1 = *Pointer<UShort>(source1 + 0);
				UShort x2 = *Pointer<UShort>(source2 + 0);
				UShort x3 = *Pointer<UShort>(source3 + 0);

				v.x.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x0) * 4);
				v.x.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x1) * 4);
				v.x.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x2) * 4);
				v.x.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x3) * 4);
			}

			if(componentCount >= 2)
			{
				UShort y0 = *Pointer<UShort>(source0 + 2);
				UShort y1 = *Pointer<UShort>(source1 + 2);
				UShort y2 = *Pointer<UShort>(source2 + 2);
				UShort y3 = *Pointer<UShort>(source3 + 2);

				v.y.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y0) * 4);
				v.y.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y1) * 4);
				v.y.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y2) * 4);
				v.y.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y3) * 4);
			}

			if(componentCount >= 3)
			{
				UShort z0 = *Pointer<UShort>(source0 + 4);
				UShort z1 = *Pointer<UShort>(source1 + 4);
				UShort z2 = *Pointer<UShort>(source2 + 4);
				UShort z3 = *Pointer<UShort>(source3 + 4);

				v.z.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z0) * 4);
				v.z.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z1) * 4);
				v.z.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z2) * 4);
				v.z.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z3) * 4);
			}

			if(componentCount >= 4)
			{
				UShort w0 = *Pointer<UShort>(source0 + 6);
				UShort w1 = *Pointer<UShort>(source1 + 6);
				UShort w2 = *Pointer<UShort>(source2 + 6);
				UShort w3 = *Pointer<UShort>(source3 + 6);

				v.w.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w0) * 4);
				v.w.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w1) * 4);
				v.w.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w2) * 4);
				v.w.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w3) * 4);
			}
		}
		break;
	case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);
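			// Shifting left then arithmetic-shifting right sign-extends each 10-bit field (2 bits for alpha).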
			v.x = Float4((src << 22) >> 22);
			v.y = Float4((src << 12) >> 22);
			v.z = Float4((src << 02) >> 22);
			v.w = Float4(src >> 30);

			v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
			v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
			v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
			v.w = Max(v.w, Float4(-1.0f));
		}
		break;
	case VK_FORMAT_A2R10G10B10_SINT_PACK32:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_A2B10G10R10_SINT_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);
			v.x = As<Float4>((src << 22) >> 22);
			v.y = As<Float4>((src << 12) >> 22);
			v.z = As<Float4>((src << 02) >> 22);
			v.w = As<Float4>(src >> 30);
		}
		break;
	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);

			v.x = Float4(src & Int4(0x3FF));
			v.y = Float4((src >> 10) & Int4(0x3FF));
			v.z = Float4((src >> 20) & Int4(0x3FF));
			v.w = Float4((src >> 30) & Int4(0x3));

			v.x *= Float4(1.0f / 0x3FF);
			v.y *= Float4(1.0f / 0x3FF);
			v.z *= Float4(1.0f / 0x3FF);
			v.w *= Float4(1.0f / 0x3);
		}
		break;
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);

			v.x = As<Float4>(src & Int4(0x3FF));
			v.y = As<Float4>((src >> 10) & Int4(0x3FF));
			v.z = As<Float4>((src >> 20) & Int4(0x3FF));
			v.w = As<Float4>((src >> 30) & Int4(0x3));
		}
		break;
	default:
		UNSUPPORTED("stream.format %d", int(stream.format));
	}

	if(bgra)
	{
		// Swap red and blue
		Float4 t = v.x;
		v.x = v.z;
		v.z = t;
	}

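	// Missing components default to (0, 0, 0, 1); integer attributes get an integer 1 in w.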
	if(componentCount < 1) v.x = Float4(0.0f);
	if(componentCount < 2) v.y = Float4(0.0f);
	if(componentCount < 3) v.z = Float4(0.0f);
	if(componentCount < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(1));

	return v;
}

void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
{
	UInt index0 = batch[0];
	UInt index1 = batch[1];
	UInt index2 = batch[2];
	UInt index3 = batch[3];

	UInt cacheIndex0 = index0 & VertexCache::TAG_MASK;
	UInt cacheIndex1 = index1 & VertexCache::TAG_MASK;
	UInt cacheIndex2 = index2 & VertexCache::TAG_MASK;
	UInt cacheIndex3 = index3 & VertexCache::TAG_MASK;

	// We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check.
	// Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache.
	tagCache[cacheIndex3] = index3;
	tagCache[cacheIndex2] = index2;
	tagCache[cacheIndex1] = index1;
	tagCache[cacheIndex0] = index0;

	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
	if(it != spirvShader->outputBuiltins.end())
	{
		ASSERT(it->second.SizeInComponents == 4);
		auto &position = routine.getVariable(it->second.Id);

		Vector4f pos;
		pos.x = position[it->second.FirstComponent + 0];
		pos.y = position[it->second.FirstComponent + 1];
		pos.z = position[it->second.FirstComponent + 2];
		pos.w = position[it->second.FirstComponent + 3];

		// Projection and viewport transform.
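		// Lanes where w == 0 get w = 1.0 to avoid a division by zero in the reciprocal below.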
		Float4 w = As<Float4>(As<Int4>(pos.w) | (As<Int4>(CmpEQ(pos.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
		Float4 rhw = Float4(1.0f) / w;

		Vector4f proj;
		proj.x = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData, WxF))));
		proj.y = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData, HxF))));
		proj.z = pos.z * rhw;
		proj.w = rhw;

		transpose4x4(pos.x, pos.y, pos.z, pos.w);

		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, position), 16) = pos.w;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, position), 16) = pos.z;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, position), 16) = pos.y;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, position), 16) = pos.x;

		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 24) & 0x0000000FF;
		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 16) & 0x0000000FF;
		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 8) & 0x0000000FF;
		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 0) & 0x0000000FF;

		transpose4x4(proj.x, proj.y, proj.z, proj.w);

		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, projected), 16) = proj.w;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, projected), 16) = proj.z;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, projected), 16) = proj.y;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, projected), 16) = proj.x;
	}

	it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
	if(it != spirvShader->outputBuiltins.end())
	{
		ASSERT(it->second.SizeInComponents == 1);
		auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];

		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, pointSize)) = Extract(psize, 3);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, pointSize)) = Extract(psize, 2);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, pointSize)) = Extract(psize, 1);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, pointSize)) = Extract(psize, 0);
	}

	it = spirvShader->outputBuiltins.find(spv::BuiltInClipDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputClipDistances();
		for(unsigned int i = 0; i < count; i++)
		{
			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 3);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 2);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 1);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 0);
		}
	}

	it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputCullDistances();
		for(unsigned int i = 0; i < count; i++)
		{
			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 3);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 2);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 1);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 0);
		}
	}

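	// Turn each cull mask bit into an all-ones (visible) or all-zeros (culled) per-vertex word.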
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullMask)) = -((cullMask >> 3) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullMask)) = -((cullMask >> 2) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullMask)) = -((cullMask >> 1) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullMask)) = -((cullMask >> 0) & 1);

	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
	{
		if(spirvShader->outputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
		{
			Vector4f v;
			v.x = routine.outputs[i + 0];
			v.y = routine.outputs[i + 1];
			v.z = routine.outputs[i + 2];
			v.w = routine.outputs[i + 3];

			transpose4x4(v.x, v.y, v.z, v.w);

			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, v[i]), 16) = v.w;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, v[i]), 16) = v.z;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, v[i]), 16) = v.y;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, v[i]), 16) = v.x;
		}
	}
}

void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry)
{
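	// Copy the finished cache entry into the output vertex.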
	*Pointer<Int4>(vertex + OFFSET(Vertex, position)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, position));
	*Pointer<Int>(vertex + OFFSET(Vertex, pointSize)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, pointSize));

	*Pointer<Int>(vertex + OFFSET(Vertex, clipFlags)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, clipFlags));
	*Pointer<Int>(vertex + OFFSET(Vertex, cullMask)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, cullMask));
	*Pointer<Int4>(vertex + OFFSET(Vertex, projected)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, projected));

	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
	{
		if(spirvShader->outputs[i].Type != SpirvShader::ATTRIBTYPE_UNUSED)
		{
			*Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, v[i]), 4);
		}
	}
	for(unsigned int i = 0; i < spirvShader->getNumOutputClipDistances(); i++)
	{
		*Pointer<Float>(vertex + OFFSET(Vertex, clipDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, clipDistance[i]), 4);
	}
	for(unsigned int i = 0; i < spirvShader->getNumOutputCullDistances(); i++)
	{
		*Pointer<Float>(vertex + OFFSET(Vertex, cullDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, cullDistance[i]), 4);
	}
}

}  // namespace sw