• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "VertexRoutine.hpp"

#include "Constants.hpp"
#include "SpirvShader.hpp"
#include "Device/Renderer.hpp"
#include "Device/Vertex.hpp"
#include "System/Debug.hpp"
#include "System/Half.hpp"

24 namespace sw {
25 
VertexRoutine(const VertexProcessor::State & state,vk::PipelineLayout const * pipelineLayout,SpirvShader const * spirvShader)26 VertexRoutine::VertexRoutine(
27     const VertexProcessor::State &state,
28     vk::PipelineLayout const *pipelineLayout,
29     SpirvShader const *spirvShader)
30     : routine(pipelineLayout)
31     , state(state)
32     , spirvShader(spirvShader)
33 {
34 	spirvShader->emitProlog(&routine);
35 }
36 
~VertexRoutine()37 VertexRoutine::~VertexRoutine()
38 {
39 }
40 
generate()41 void VertexRoutine::generate()
42 {
43 	Pointer<Byte> cache = task + OFFSET(VertexTask, vertexCache);
44 	Pointer<Byte> vertexCache = cache + OFFSET(VertexCache, vertex);
45 	Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache, tag));
46 
47 	UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask, vertexCount));
48 
49 	constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, constants));
50 
51 	// Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer.
52 	// On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache
53 	// in reverse order to guarantee that the first one doesn't get evicted and can be written out.
54 
55 	Do
56 	{
57 		UInt index = *batch;
58 		UInt cacheIndex = index & VertexCache::TAG_MASK;
59 
60 		If(tagCache[cacheIndex] != index)
61 		{
62 			readInput(batch);
63 			program(batch, vertexCount);
64 			computeClipFlags();
65 			computeCullMask();
66 
67 			writeCache(vertexCache, tagCache, batch);
68 		}
69 
70 		Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
71 
72 		// For points, vertexCount is 1 per primitive, so duplicate vertex for all 3 vertices of the primitive
73 		for(int i = 0; i < (state.isPoint ? 3 : 1); i++)
74 		{
75 			writeVertex(vertex, cacheEntry);
76 			vertex += sizeof(Vertex);
77 		}
78 
79 		batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
80 		vertexCount--;
81 	}
82 	Until(vertexCount == 0);
83 
84 	Return();
85 }
86 
readInput(Pointer<UInt> & batch)87 void VertexRoutine::readInput(Pointer<UInt> &batch)
88 {
89 	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
90 	{
91 		if(spirvShader->inputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
92 		   spirvShader->inputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
93 		   spirvShader->inputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
94 		   spirvShader->inputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
95 		{
96 			Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void *) * (i / 4));
97 			UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
98 			Int baseVertex = *Pointer<Int>(data + OFFSET(DrawData, baseVertex));
99 			UInt robustnessSize(0);
100 			if(state.robustBufferAccess)
101 			{
102 				robustnessSize = *Pointer<UInt>(data + OFFSET(DrawData, robustnessSize) + sizeof(uint32_t) * (i / 4));
103 			}
104 
105 			auto value = readStream(input, stride, state.input[i / 4], batch, state.robustBufferAccess, robustnessSize, baseVertex);
106 			routine.inputs[i + 0] = value.x;
107 			routine.inputs[i + 1] = value.y;
108 			routine.inputs[i + 2] = value.z;
109 			routine.inputs[i + 3] = value.w;
110 		}
111 	}
112 }
113 
computeClipFlags()114 void VertexRoutine::computeClipFlags()
115 {
116 	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
117 	assert(it != spirvShader->outputBuiltins.end());
118 	assert(it->second.SizeInComponents == 4);
119 	auto &pos = routine.getVariable(it->second.Id);
120 	auto posX = pos[it->second.FirstComponent + 0];
121 	auto posY = pos[it->second.FirstComponent + 1];
122 	auto posZ = pos[it->second.FirstComponent + 2];
123 	auto posW = pos[it->second.FirstComponent + 3];
124 
125 	Int4 maxX = CmpLT(posW, posX);
126 	Int4 maxY = CmpLT(posW, posY);
127 	Int4 maxZ = CmpLT(posW, posZ);
128 	Int4 minX = CmpNLE(-posW, posX);
129 	Int4 minY = CmpNLE(-posW, posY);
130 	Int4 minZ = CmpNLE(Float4(0.0f), posZ);
131 
132 	clipFlags = Pointer<Int>(constants + OFFSET(Constants, maxX))[SignMask(maxX)];
133 	clipFlags |= Pointer<Int>(constants + OFFSET(Constants, maxY))[SignMask(maxY)];
134 	clipFlags |= Pointer<Int>(constants + OFFSET(Constants, maxZ))[SignMask(maxZ)];
135 	clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minX))[SignMask(minX)];
136 	clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minY))[SignMask(minY)];
137 	clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minZ))[SignMask(minZ)];
138 
139 	Int4 finiteX = CmpLE(Abs(posX), *Pointer<Float4>(constants + OFFSET(Constants, maxPos)));
140 	Int4 finiteY = CmpLE(Abs(posY), *Pointer<Float4>(constants + OFFSET(Constants, maxPos)));
141 	Int4 finiteZ = CmpLE(Abs(posZ), *Pointer<Float4>(constants + OFFSET(Constants, maxPos)));
142 
143 	Int4 finiteXYZ = finiteX & finiteY & finiteZ;
144 	clipFlags |= Pointer<Int>(constants + OFFSET(Constants, fini))[SignMask(finiteXYZ)];
145 }
146 
computeCullMask()147 void VertexRoutine::computeCullMask()
148 {
149 	cullMask = Int(15);
150 
151 	auto it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
152 	if(it != spirvShader->outputBuiltins.end())
153 	{
154 		auto count = spirvShader->getNumOutputCullDistances();
155 		for(uint32_t i = 0; i < count; i++)
156 		{
157 			auto const &distance = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
158 			auto mask = SignMask(CmpGE(distance, SIMD::Float(0)));
159 			cullMask &= mask;
160 		}
161 	}
162 }
163 
readStream(Pointer<Byte> & buffer,UInt & stride,const Stream & stream,Pointer<UInt> & batch,bool robustBufferAccess,UInt & robustnessSize,Int baseVertex)164 Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
165                                    bool robustBufferAccess, UInt &robustnessSize, Int baseVertex)
166 {
167 	Vector4f v;
168 	// Because of the following rule in the Vulkan spec, we do not care if a very large negative
169 	// baseVertex would overflow all the way back into a valid region of the index buffer:
170 	// "Out-of-bounds buffer loads will return any of the following values :
171 	//  - Values from anywhere within the memory range(s) bound to the buffer (possibly including
172 	//    bytes of memory past the end of the buffer, up to the end of the bound range)."
173 	UInt4 offsets = (*Pointer<UInt4>(As<Pointer<UInt4>>(batch)) + As<UInt4>(Int4(baseVertex))) * UInt4(stride);
174 
175 	Pointer<Byte> source0 = buffer + offsets.x;
176 	Pointer<Byte> source1 = buffer + offsets.y;
177 	Pointer<Byte> source2 = buffer + offsets.z;
178 	Pointer<Byte> source3 = buffer + offsets.w;
179 
180 	vk::Format format(stream.format);
181 
182 	UInt4 zero(0);
183 	if(robustBufferAccess)
184 	{
185 		// TODO(b/141124876): Optimize for wide-vector gather operations.
186 		UInt4 limits = offsets + UInt4(format.bytes());
187 		Pointer<Byte> zeroSource = As<Pointer<Byte>>(&zero);
188 		source0 = IfThenElse(limits.x <= robustnessSize, source0, zeroSource);
189 		source1 = IfThenElse(limits.y <= robustnessSize, source1, zeroSource);
190 		source2 = IfThenElse(limits.z <= robustnessSize, source2, zeroSource);
191 		source3 = IfThenElse(limits.w <= robustnessSize, source3, zeroSource);
192 	}
193 
194 	int componentCount = format.componentCount();
195 	bool normalized = !format.isUnnormalizedInteger();
196 	bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || normalized;
197 	bool bgra = false;
198 
199 	switch(stream.format)
200 	{
201 		case VK_FORMAT_R32_SFLOAT:
202 		case VK_FORMAT_R32G32_SFLOAT:
203 		case VK_FORMAT_R32G32B32_SFLOAT:
204 		case VK_FORMAT_R32G32B32A32_SFLOAT:
205 		{
206 			if(componentCount == 0)
207 			{
208 				// Null stream, all default components
209 			}
210 			else
211 			{
212 				if(componentCount == 1)
213 				{
214 					v.x.x = *Pointer<Float>(source0);
215 					v.x.y = *Pointer<Float>(source1);
216 					v.x.z = *Pointer<Float>(source2);
217 					v.x.w = *Pointer<Float>(source3);
218 				}
219 				else
220 				{
221 					v.x = *Pointer<Float4>(source0);
222 					v.y = *Pointer<Float4>(source1);
223 					v.z = *Pointer<Float4>(source2);
224 					v.w = *Pointer<Float4>(source3);
225 
226 					transpose4xN(v.x, v.y, v.z, v.w, componentCount);
227 				}
228 
229 				switch(stream.attribType)
230 				{
231 					case SpirvShader::ATTRIBTYPE_INT:
232 						if(componentCount >= 1) v.x = As<Float4>(Int4(v.x));
233 						if(componentCount >= 2) v.x = As<Float4>(Int4(v.y));
234 						if(componentCount >= 3) v.x = As<Float4>(Int4(v.z));
235 						if(componentCount >= 4) v.x = As<Float4>(Int4(v.w));
236 						break;
237 					case SpirvShader::ATTRIBTYPE_UINT:
238 						if(componentCount >= 1) v.x = As<Float4>(UInt4(v.x));
239 						if(componentCount >= 2) v.x = As<Float4>(UInt4(v.y));
240 						if(componentCount >= 3) v.x = As<Float4>(UInt4(v.z));
241 						if(componentCount >= 4) v.x = As<Float4>(UInt4(v.w));
242 						break;
243 					default:
244 						break;
245 				}
246 			}
247 		}
248 		break;
249 		case VK_FORMAT_B8G8R8A8_UNORM:
250 			bgra = true;
251 		case VK_FORMAT_R8_UNORM:
252 		case VK_FORMAT_R8G8_UNORM:
253 		case VK_FORMAT_R8G8B8A8_UNORM:
254 		case VK_FORMAT_R8_UINT:
255 		case VK_FORMAT_R8G8_UINT:
256 		case VK_FORMAT_R8G8B8A8_UINT:
257 		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
258 		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
259 			if(isNativeFloatAttrib)  // Stream: UByte, Shader attrib: Float
260 			{
261 				v.x = Float4(*Pointer<Byte4>(source0));
262 				v.y = Float4(*Pointer<Byte4>(source1));
263 				v.z = Float4(*Pointer<Byte4>(source2));
264 				v.w = Float4(*Pointer<Byte4>(source3));
265 
266 				transpose4xN(v.x, v.y, v.z, v.w, componentCount);
267 
268 				if(normalized)
269 				{
270 					if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
271 					if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
272 					if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
273 					if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
274 				}
275 			}
276 			else  // Stream: UByte, Shader attrib: Int / UInt
277 			{
278 				v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
279 				v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
280 				v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
281 				v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));
282 
283 				transpose4xN(v.x, v.y, v.z, v.w, componentCount);
284 			}
285 			break;
286 		case VK_FORMAT_R8_SNORM:
287 		case VK_FORMAT_R8_SINT:
288 		case VK_FORMAT_R8G8_SNORM:
289 		case VK_FORMAT_R8G8_SINT:
290 		case VK_FORMAT_R8G8B8A8_SNORM:
291 		case VK_FORMAT_R8G8B8A8_SINT:
292 		case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
293 		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
294 			if(isNativeFloatAttrib)  // Stream: SByte, Shader attrib: Float
295 			{
296 				v.x = Float4(*Pointer<SByte4>(source0));
297 				v.y = Float4(*Pointer<SByte4>(source1));
298 				v.z = Float4(*Pointer<SByte4>(source2));
299 				v.w = Float4(*Pointer<SByte4>(source3));
300 
301 				transpose4xN(v.x, v.y, v.z, v.w, componentCount);
302 
303 				if(normalized)
304 				{
305 					if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte));
306 					if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte));
307 					if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte));
308 					if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte));
309 				}
310 			}
311 			else  // Stream: SByte, Shader attrib: Int / UInt
312 			{
313 				v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
314 				v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
315 				v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
316 				v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));
317 
318 				transpose4xN(v.x, v.y, v.z, v.w, componentCount);
319 			}
320 			break;
321 		case VK_FORMAT_R16_SNORM:
322 		case VK_FORMAT_R16_SINT:
323 		case VK_FORMAT_R16G16_SNORM:
324 		case VK_FORMAT_R16G16_SINT:
325 		case VK_FORMAT_R16G16B16A16_SNORM:
326 		case VK_FORMAT_R16G16B16A16_SINT:
327 			if(isNativeFloatAttrib)  // Stream: Int, Shader attrib: Float
328 			{
329 				v.x = Float4(*Pointer<Short4>(source0));
330 				v.y = Float4(*Pointer<Short4>(source1));
331 				v.z = Float4(*Pointer<Short4>(source2));
332 				v.w = Float4(*Pointer<Short4>(source3));
333 
334 				transpose4xN(v.x, v.y, v.z, v.w, componentCount);
335 
336 				if(normalized)
337 				{
338 					if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort));
339 					if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort));
340 					if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort));
341 					if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort));
342 				}
343 			}
344 			else  // Stream: Short, Shader attrib: Int/UInt, no type conversion
345 			{
346 				v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
347 				v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
348 				v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
349 				v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
350 
351 				transpose4xN(v.x, v.y, v.z, v.w, componentCount);
352 			}
353 			break;
354 		case VK_FORMAT_R16_UNORM:
355 		case VK_FORMAT_R16_UINT:
356 		case VK_FORMAT_R16G16_UNORM:
357 		case VK_FORMAT_R16G16_UINT:
358 		case VK_FORMAT_R16G16B16A16_UNORM:
359 		case VK_FORMAT_R16G16B16A16_UINT:
360 			if(isNativeFloatAttrib)  // Stream: Int, Shader attrib: Float
361 			{
362 				v.x = Float4(*Pointer<UShort4>(source0));
363 				v.y = Float4(*Pointer<UShort4>(source1));
364 				v.z = Float4(*Pointer<UShort4>(source2));
365 				v.w = Float4(*Pointer<UShort4>(source3));
366 
367 				transpose4xN(v.x, v.y, v.z, v.w, componentCount);
368 
369 				if(normalized)
370 				{
371 					if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
372 					if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
373 					if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
374 					if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
375 				}
376 			}
377 			else  // Stream: UShort, Shader attrib: Int/UInt, no type conversion
378 			{
379 				v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
380 				v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
381 				v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
382 				v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));
383 
384 				transpose4xN(v.x, v.y, v.z, v.w, componentCount);
385 			}
386 			break;
387 		case VK_FORMAT_R32_SINT:
388 		case VK_FORMAT_R32G32_SINT:
389 		case VK_FORMAT_R32G32B32_SINT:
390 		case VK_FORMAT_R32G32B32A32_SINT:
391 			if(isNativeFloatAttrib)  // Stream: Int, Shader attrib: Float
392 			{
393 				v.x = Float4(*Pointer<Int4>(source0));
394 				v.y = Float4(*Pointer<Int4>(source1));
395 				v.z = Float4(*Pointer<Int4>(source2));
396 				v.w = Float4(*Pointer<Int4>(source3));
397 
398 				transpose4xN(v.x, v.y, v.z, v.w, componentCount);
399 
400 				if(normalized)
401 				{
402 					if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
403 					if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
404 					if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
405 					if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
406 				}
407 			}
408 			else  // Stream: Int, Shader attrib: Int/UInt, no type conversion
409 			{
410 				v.x = *Pointer<Float4>(source0);
411 				v.y = *Pointer<Float4>(source1);
412 				v.z = *Pointer<Float4>(source2);
413 				v.w = *Pointer<Float4>(source3);
414 
415 				transpose4xN(v.x, v.y, v.z, v.w, componentCount);
416 			}
417 			break;
418 		case VK_FORMAT_R32_UINT:
419 		case VK_FORMAT_R32G32_UINT:
420 		case VK_FORMAT_R32G32B32_UINT:
421 		case VK_FORMAT_R32G32B32A32_UINT:
422 			if(isNativeFloatAttrib)  // Stream: UInt, Shader attrib: Float
423 			{
424 				v.x = Float4(*Pointer<UInt4>(source0));
425 				v.y = Float4(*Pointer<UInt4>(source1));
426 				v.z = Float4(*Pointer<UInt4>(source2));
427 				v.w = Float4(*Pointer<UInt4>(source3));
428 
429 				transpose4xN(v.x, v.y, v.z, v.w, componentCount);
430 
431 				if(normalized)
432 				{
433 					if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
434 					if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
435 					if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
436 					if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
437 				}
438 			}
439 			else  // Stream: UInt, Shader attrib: Int/UInt, no type conversion
440 			{
441 				v.x = *Pointer<Float4>(source0);
442 				v.y = *Pointer<Float4>(source1);
443 				v.z = *Pointer<Float4>(source2);
444 				v.w = *Pointer<Float4>(source3);
445 
446 				transpose4xN(v.x, v.y, v.z, v.w, componentCount);
447 			}
448 			break;
449 		case VK_FORMAT_R16_SFLOAT:
450 		case VK_FORMAT_R16G16_SFLOAT:
451 		case VK_FORMAT_R16G16B16A16_SFLOAT:
452 		{
453 			if(componentCount >= 1)
454 			{
455 				UShort x0 = *Pointer<UShort>(source0 + 0);
456 				UShort x1 = *Pointer<UShort>(source1 + 0);
457 				UShort x2 = *Pointer<UShort>(source2 + 0);
458 				UShort x3 = *Pointer<UShort>(source3 + 0);
459 
460 				v.x.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x0) * 4);
461 				v.x.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x1) * 4);
462 				v.x.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x2) * 4);
463 				v.x.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x3) * 4);
464 			}
465 
466 			if(componentCount >= 2)
467 			{
468 				UShort y0 = *Pointer<UShort>(source0 + 2);
469 				UShort y1 = *Pointer<UShort>(source1 + 2);
470 				UShort y2 = *Pointer<UShort>(source2 + 2);
471 				UShort y3 = *Pointer<UShort>(source3 + 2);
472 
473 				v.y.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y0) * 4);
474 				v.y.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y1) * 4);
475 				v.y.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y2) * 4);
476 				v.y.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y3) * 4);
477 			}
478 
479 			if(componentCount >= 3)
480 			{
481 				UShort z0 = *Pointer<UShort>(source0 + 4);
482 				UShort z1 = *Pointer<UShort>(source1 + 4);
483 				UShort z2 = *Pointer<UShort>(source2 + 4);
484 				UShort z3 = *Pointer<UShort>(source3 + 4);
485 
486 				v.z.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z0) * 4);
487 				v.z.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z1) * 4);
488 				v.z.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z2) * 4);
489 				v.z.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z3) * 4);
490 			}
491 
492 			if(componentCount >= 4)
493 			{
494 				UShort w0 = *Pointer<UShort>(source0 + 6);
495 				UShort w1 = *Pointer<UShort>(source1 + 6);
496 				UShort w2 = *Pointer<UShort>(source2 + 6);
497 				UShort w3 = *Pointer<UShort>(source3 + 6);
498 
499 				v.w.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w0) * 4);
500 				v.w.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w1) * 4);
501 				v.w.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w2) * 4);
502 				v.w.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w3) * 4);
503 			}
504 		}
505 		break;
506 		case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
507 		case VK_FORMAT_A2R10G10B10_SINT_PACK32:
508 			bgra = true;
509 		case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
510 		case VK_FORMAT_A2B10G10R10_SINT_PACK32:
511 		{
512 			Int4 src;
513 			src = Insert(src, *Pointer<Int>(source0), 0);
514 			src = Insert(src, *Pointer<Int>(source1), 1);
515 			src = Insert(src, *Pointer<Int>(source2), 2);
516 			src = Insert(src, *Pointer<Int>(source3), 3);
517 			if(isNativeFloatAttrib)  // Stream: Int, Shader attrib: Float
518 			{
519 				v.x = Float4((src << 22) >> 22);
520 				v.y = Float4((src << 12) >> 22);
521 				v.z = Float4((src << 02) >> 22);
522 				v.w = Float4(src >> 30);
523 
524 				if(normalized)
525 				{
526 					v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
527 					v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
528 					v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
529 					v.w = Max(v.w, Float4(-1.0f));
530 				}
531 			}
532 			else  // Stream: UInt, Shader attrib: Int/UInt, no type conversion
533 			{
534 				v.x = As<Float4>((src << 22) >> 22);
535 				v.y = As<Float4>((src << 12) >> 22);
536 				v.z = As<Float4>((src << 02) >> 22);
537 				v.w = As<Float4>(src >> 30);
538 			}
539 		}
540 		break;
541 		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
542 		case VK_FORMAT_A2R10G10B10_UINT_PACK32:
543 			bgra = true;
544 		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
545 		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
546 		{
547 			Int4 src;
548 			src = Insert(src, *Pointer<Int>(source0), 0);
549 			src = Insert(src, *Pointer<Int>(source1), 1);
550 			src = Insert(src, *Pointer<Int>(source2), 2);
551 			src = Insert(src, *Pointer<Int>(source3), 3);
552 
553 			if(isNativeFloatAttrib)  // Stream: Int, Shader attrib: Float
554 			{
555 				v.x = Float4(src & Int4(0x3FF));
556 				v.y = Float4((src >> 10) & Int4(0x3FF));
557 				v.z = Float4((src >> 20) & Int4(0x3FF));
558 				v.w = Float4((src >> 30) & Int4(0x3));
559 
560 				if(normalized)
561 				{
562 					v.x *= Float4(1.0f / 0x3FF);
563 					v.y *= Float4(1.0f / 0x3FF);
564 					v.z *= Float4(1.0f / 0x3FF);
565 					v.w *= Float4(1.0f / 0x3);
566 				}
567 			}
568 			else  // Stream: UInt, Shader attrib: Int/UInt, no type conversion
569 			{
570 				v.x = As<Float4>(src & Int4(0x3FF));
571 				v.y = As<Float4>((src >> 10) & Int4(0x3FF));
572 				v.z = As<Float4>((src >> 20) & Int4(0x3FF));
573 				v.w = As<Float4>((src >> 30) & Int4(0x3));
574 			}
575 		}
576 		break;
577 		default:
578 			UNSUPPORTED("stream.format %d", int(stream.format));
579 	}
580 
581 	if(bgra)
582 	{
583 		// Swap red and blue
584 		Float4 t = v.x;
585 		v.x = v.z;
586 		v.z = t;
587 	}
588 
589 	if(componentCount < 1) v.x = Float4(0.0f);
590 	if(componentCount < 2) v.y = Float4(0.0f);
591 	if(componentCount < 3) v.z = Float4(0.0f);
592 	if(componentCount < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(1));
593 
594 	return v;
595 }
596 
writeCache(Pointer<Byte> & vertexCache,Pointer<UInt> & tagCache,Pointer<UInt> & batch)597 void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
598 {
599 	UInt index0 = batch[0];
600 	UInt index1 = batch[1];
601 	UInt index2 = batch[2];
602 	UInt index3 = batch[3];
603 
604 	UInt cacheIndex0 = index0 & VertexCache::TAG_MASK;
605 	UInt cacheIndex1 = index1 & VertexCache::TAG_MASK;
606 	UInt cacheIndex2 = index2 & VertexCache::TAG_MASK;
607 	UInt cacheIndex3 = index3 & VertexCache::TAG_MASK;
608 
609 	// We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check.
610 	// Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache.
611 	tagCache[cacheIndex3] = index3;
612 	tagCache[cacheIndex2] = index2;
613 	tagCache[cacheIndex1] = index1;
614 	tagCache[cacheIndex0] = index0;
615 
616 	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
617 	assert(it != spirvShader->outputBuiltins.end());
618 	assert(it->second.SizeInComponents == 4);
619 	auto &position = routine.getVariable(it->second.Id);
620 
621 	Vector4f pos;
622 	pos.x = position[it->second.FirstComponent + 0];
623 	pos.y = position[it->second.FirstComponent + 1];
624 	pos.z = position[it->second.FirstComponent + 2];
625 	pos.w = position[it->second.FirstComponent + 3];
626 
627 	// Projection and viewport transform.
628 	Float4 w = As<Float4>(As<Int4>(pos.w) | (As<Int4>(CmpEQ(pos.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
629 	Float4 rhw = Float4(1.0f) / w;
630 
631 	Vector4f proj;
632 	proj.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData, X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData, WxF))));
633 	proj.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData, Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData, HxF))));
634 	proj.z = pos.z * rhw;
635 	proj.w = rhw;
636 
637 	transpose4x4(pos.x, pos.y, pos.z, pos.w);
638 
639 	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, position), 16) = pos.w;
640 	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, position), 16) = pos.z;
641 	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, position), 16) = pos.y;
642 	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, position), 16) = pos.x;
643 
644 	it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
645 	if(it != spirvShader->outputBuiltins.end())
646 	{
647 		ASSERT(it->second.SizeInComponents == 1);
648 		auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];
649 
650 		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, pointSize)) = Extract(psize, 3);
651 		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, pointSize)) = Extract(psize, 2);
652 		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, pointSize)) = Extract(psize, 1);
653 		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, pointSize)) = Extract(psize, 0);
654 	}
655 
656 	it = spirvShader->outputBuiltins.find(spv::BuiltInClipDistance);
657 	if(it != spirvShader->outputBuiltins.end())
658 	{
659 		auto count = spirvShader->getNumOutputClipDistances();
660 		for(unsigned int i = 0; i < count; i++)
661 		{
662 			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
663 			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 3);
664 			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 2);
665 			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 1);
666 			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 0);
667 		}
668 	}
669 
670 	it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
671 	if(it != spirvShader->outputBuiltins.end())
672 	{
673 		auto count = spirvShader->getNumOutputCullDistances();
674 		for(unsigned int i = 0; i < count; i++)
675 		{
676 			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
677 			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 3);
678 			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 2);
679 			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 1);
680 			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 0);
681 		}
682 	}
683 
684 	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 24) & 0x0000000FF;
685 	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 16) & 0x0000000FF;
686 	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 8) & 0x0000000FF;
687 	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 0) & 0x0000000FF;
688 
689 	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullMask)) = -((cullMask >> 3) & 1);
690 	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullMask)) = -((cullMask >> 2) & 1);
691 	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullMask)) = -((cullMask >> 1) & 1);
692 	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullMask)) = -((cullMask >> 0) & 1);
693 
694 	transpose4x4(proj.x, proj.y, proj.z, proj.w);
695 
696 	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, projected), 16) = proj.w;
697 	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, projected), 16) = proj.z;
698 	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, projected), 16) = proj.y;
699 	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, projected), 16) = proj.x;
700 
701 	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
702 	{
703 		if(spirvShader->outputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
704 		   spirvShader->outputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
705 		   spirvShader->outputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
706 		   spirvShader->outputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
707 		{
708 			Vector4f v;
709 			v.x = routine.outputs[i + 0];
710 			v.y = routine.outputs[i + 1];
711 			v.z = routine.outputs[i + 2];
712 			v.w = routine.outputs[i + 3];
713 
714 			transpose4x4(v.x, v.y, v.z, v.w);
715 
716 			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, v[i]), 16) = v.w;
717 			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, v[i]), 16) = v.z;
718 			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, v[i]), 16) = v.y;
719 			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, v[i]), 16) = v.x;
720 		}
721 	}
722 }
723 
writeVertex(const Pointer<Byte> & vertex,Pointer<Byte> & cacheEntry)724 void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry)
725 {
726 	*Pointer<Int4>(vertex + OFFSET(Vertex, position)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, position));
727 	*Pointer<Int>(vertex + OFFSET(Vertex, pointSize)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, pointSize));
728 
729 	*Pointer<Int>(vertex + OFFSET(Vertex, clipFlags)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, clipFlags));
730 	*Pointer<Int>(vertex + OFFSET(Vertex, cullMask)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, cullMask));
731 	*Pointer<Int4>(vertex + OFFSET(Vertex, projected)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, projected));
732 
733 	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
734 	{
735 		if(spirvShader->outputs[i].Type != SpirvShader::ATTRIBTYPE_UNUSED)
736 		{
737 			*Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, v[i]), 4);
738 		}
739 	}
740 	for(unsigned int i = 0; i < spirvShader->getNumOutputClipDistances(); i++)
741 	{
742 		*Pointer<Float>(vertex + OFFSET(Vertex, clipDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, clipDistance[i]), 4);
743 	}
744 	for(unsigned int i = 0; i < spirvShader->getNumOutputCullDistances(); i++)
745 	{
746 		*Pointer<Float>(vertex + OFFSET(Vertex, cullDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, cullDistance[i]), 4);
747 	}
748 }
749 
750 }  // namespace sw
751