1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "VertexRoutine.hpp"
16
17 #include "Constants.hpp"
18 #include "SpirvShader.hpp"
19 #include "Device/Renderer.hpp"
20 #include "Device/Vertex.hpp"
21 #include "System/Debug.hpp"
22 #include "System/Half.hpp"
23
24 namespace sw {
25
// Constructs a vertex routine generator for the given pipeline state.
// 'state', 'pipelineLayout', and 'spirvShader' are owned by the caller and
// must outlive this object.
VertexRoutine::VertexRoutine(
    const VertexProcessor::State &state,
    vk::PipelineLayout const *pipelineLayout,
    SpirvShader const *spirvShader)
    : routine(pipelineLayout)
    , state(state)
    , spirvShader(spirvShader)
{
	// Let the shader emit any per-routine setup before the main body is generated.
	spirvShader->emitProlog(&routine);
}
36
// Out-of-line destructor (intentionally empty).
VertexRoutine::~VertexRoutine()
{
}
40
// Emits the top-level vertex-processing loop: for each index in the batch,
// either reuse a previously shaded vertex from the vertex cache or run the
// shader for a SIMD group of indices, then copy the result to the 'vertex'
// output buffer.
void VertexRoutine::generate()
{
	Pointer<Byte> cache = task + OFFSET(VertexTask, vertexCache);
	Pointer<Byte> vertexCache = cache + OFFSET(VertexCache, vertex);
	Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache, tag));

	UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask, vertexCount));

	constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, constants));

	// Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer.
	// On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache
	// in reverse order to guarantee that the first one doesn't get evicted and can be written out.

	Do
	{
		UInt index = *batch;
		UInt cacheIndex = index & VertexCache::TAG_MASK;

		If(tagCache[cacheIndex] != index)
		{
			// Cache miss: shade a SIMD group of vertices starting at this index.
			readInput(batch);
			program(batch, vertexCount);
			computeClipFlags();
			computeCullMask();

			writeCache(vertexCache, tagCache, batch);
		}

		Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));

		// For points, vertexCount is 1 per primitive, so duplicate vertex for all 3 vertices of the primitive
		for(int i = 0; i < (state.isPoint ? 3 : 1); i++)
		{
			writeVertex(vertex, cacheEntry);
			vertex += sizeof(Vertex);
		}

		// Advance to the next index in the batch.
		batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
		vertexCount--;
	}
	Until(vertexCount == 0);

	Return();
}
86
readInput(Pointer<UInt> & batch)87 void VertexRoutine::readInput(Pointer<UInt> &batch)
88 {
89 for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
90 {
91 if(spirvShader->inputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
92 spirvShader->inputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
93 spirvShader->inputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
94 spirvShader->inputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
95 {
96 Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void *) * (i / 4));
97 UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
98 Int baseVertex = *Pointer<Int>(data + OFFSET(DrawData, baseVertex));
99 UInt robustnessSize(0);
100 if(state.robustBufferAccess)
101 {
102 robustnessSize = *Pointer<UInt>(data + OFFSET(DrawData, robustnessSize) + sizeof(uint32_t) * (i / 4));
103 }
104
105 auto value = readStream(input, stride, state.input[i / 4], batch, state.robustBufferAccess, robustnessSize, baseVertex);
106 routine.inputs[i + 0] = value.x;
107 routine.inputs[i + 1] = value.y;
108 routine.inputs[i + 2] = value.z;
109 routine.inputs[i + 3] = value.w;
110 }
111 }
112 }
113
// Derives per-lane clip flags from the shader's gl_Position output and
// accumulates them into 'clipFlags'. Each SIMD lane contributes a byte,
// combined via the sign-mask lookup tables in Constants.
void VertexRoutine::computeClipFlags()
{
	// The shader is required to have written a 4-component position.
	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
	assert(it != spirvShader->outputBuiltins.end());
	assert(it->second.SizeInComponents == 4);
	auto &pos = routine.getVariable(it->second.Id);
	auto posX = pos[it->second.FirstComponent + 0];
	auto posY = pos[it->second.FirstComponent + 1];
	auto posZ = pos[it->second.FirstComponent + 2];
	auto posW = pos[it->second.FirstComponent + 3];

	// Frustum tests in clip space: lanes are flagged when x > w, y > w, z > w,
	// x < -w, y < -w, or z < 0 (the depth range's lower bound is checked at 0).
	Int4 maxX = CmpLT(posW, posX);
	Int4 maxY = CmpLT(posW, posY);
	Int4 maxZ = CmpLT(posW, posZ);
	Int4 minX = CmpNLE(-posW, posX);
	Int4 minY = CmpNLE(-posW, posY);
	Int4 minZ = CmpNLE(Float4(0.0f), posZ);

	// Translate each 4-bit sign mask into per-lane flag bytes via lookup tables.
	clipFlags = Pointer<Int>(constants + OFFSET(Constants, maxX))[SignMask(maxX)];
	clipFlags |= Pointer<Int>(constants + OFFSET(Constants, maxY))[SignMask(maxY)];
	clipFlags |= Pointer<Int>(constants + OFFSET(Constants, maxZ))[SignMask(maxZ)];
	clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minX))[SignMask(minX)];
	clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minY))[SignMask(minY)];
	clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minZ))[SignMask(minZ)];

	// Additionally flag lanes whose position components exceed the largest
	// representable finite magnitude (catches infinities/NaN-adjacent values).
	Int4 finiteX = CmpLE(Abs(posX), *Pointer<Float4>(constants + OFFSET(Constants, maxPos)));
	Int4 finiteY = CmpLE(Abs(posY), *Pointer<Float4>(constants + OFFSET(Constants, maxPos)));
	Int4 finiteZ = CmpLE(Abs(posZ), *Pointer<Float4>(constants + OFFSET(Constants, maxPos)));

	Int4 finiteXYZ = finiteX & finiteY & finiteZ;
	clipFlags |= Pointer<Int>(constants + OFFSET(Constants, fini))[SignMask(finiteXYZ)];
}
146
computeCullMask()147 void VertexRoutine::computeCullMask()
148 {
149 cullMask = Int(15);
150
151 auto it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
152 if(it != spirvShader->outputBuiltins.end())
153 {
154 auto count = spirvShader->getNumOutputCullDistances();
155 for(uint32_t i = 0; i < count; i++)
156 {
157 auto const &distance = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
158 auto mask = SignMask(CmpGE(distance, SIMD::Float(0)));
159 cullMask &= mask;
160 }
161 }
162 }
163
readStream(Pointer<Byte> & buffer,UInt & stride,const Stream & stream,Pointer<UInt> & batch,bool robustBufferAccess,UInt & robustnessSize,Int baseVertex)164 Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
165 bool robustBufferAccess, UInt &robustnessSize, Int baseVertex)
166 {
167 Vector4f v;
168 // Because of the following rule in the Vulkan spec, we do not care if a very large negative
169 // baseVertex would overflow all the way back into a valid region of the index buffer:
170 // "Out-of-bounds buffer loads will return any of the following values :
171 // - Values from anywhere within the memory range(s) bound to the buffer (possibly including
172 // bytes of memory past the end of the buffer, up to the end of the bound range)."
173 UInt4 offsets = (*Pointer<UInt4>(As<Pointer<UInt4>>(batch)) + As<UInt4>(Int4(baseVertex))) * UInt4(stride);
174
175 Pointer<Byte> source0 = buffer + offsets.x;
176 Pointer<Byte> source1 = buffer + offsets.y;
177 Pointer<Byte> source2 = buffer + offsets.z;
178 Pointer<Byte> source3 = buffer + offsets.w;
179
180 vk::Format format(stream.format);
181
182 UInt4 zero(0);
183 if(robustBufferAccess)
184 {
185 // TODO(b/141124876): Optimize for wide-vector gather operations.
186 UInt4 limits = offsets + UInt4(format.bytes());
187 Pointer<Byte> zeroSource = As<Pointer<Byte>>(&zero);
188 source0 = IfThenElse(limits.x <= robustnessSize, source0, zeroSource);
189 source1 = IfThenElse(limits.y <= robustnessSize, source1, zeroSource);
190 source2 = IfThenElse(limits.z <= robustnessSize, source2, zeroSource);
191 source3 = IfThenElse(limits.w <= robustnessSize, source3, zeroSource);
192 }
193
194 int componentCount = format.componentCount();
195 bool normalized = !format.isUnnormalizedInteger();
196 bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || normalized;
197 bool bgra = false;
198
199 switch(stream.format)
200 {
201 case VK_FORMAT_R32_SFLOAT:
202 case VK_FORMAT_R32G32_SFLOAT:
203 case VK_FORMAT_R32G32B32_SFLOAT:
204 case VK_FORMAT_R32G32B32A32_SFLOAT:
205 {
206 if(componentCount == 0)
207 {
208 // Null stream, all default components
209 }
210 else
211 {
212 if(componentCount == 1)
213 {
214 v.x.x = *Pointer<Float>(source0);
215 v.x.y = *Pointer<Float>(source1);
216 v.x.z = *Pointer<Float>(source2);
217 v.x.w = *Pointer<Float>(source3);
218 }
219 else
220 {
221 v.x = *Pointer<Float4>(source0);
222 v.y = *Pointer<Float4>(source1);
223 v.z = *Pointer<Float4>(source2);
224 v.w = *Pointer<Float4>(source3);
225
226 transpose4xN(v.x, v.y, v.z, v.w, componentCount);
227 }
228
229 switch(stream.attribType)
230 {
231 case SpirvShader::ATTRIBTYPE_INT:
232 if(componentCount >= 1) v.x = As<Float4>(Int4(v.x));
233 if(componentCount >= 2) v.x = As<Float4>(Int4(v.y));
234 if(componentCount >= 3) v.x = As<Float4>(Int4(v.z));
235 if(componentCount >= 4) v.x = As<Float4>(Int4(v.w));
236 break;
237 case SpirvShader::ATTRIBTYPE_UINT:
238 if(componentCount >= 1) v.x = As<Float4>(UInt4(v.x));
239 if(componentCount >= 2) v.x = As<Float4>(UInt4(v.y));
240 if(componentCount >= 3) v.x = As<Float4>(UInt4(v.z));
241 if(componentCount >= 4) v.x = As<Float4>(UInt4(v.w));
242 break;
243 default:
244 break;
245 }
246 }
247 }
248 break;
249 case VK_FORMAT_B8G8R8A8_UNORM:
250 bgra = true;
251 case VK_FORMAT_R8_UNORM:
252 case VK_FORMAT_R8G8_UNORM:
253 case VK_FORMAT_R8G8B8A8_UNORM:
254 case VK_FORMAT_R8_UINT:
255 case VK_FORMAT_R8G8_UINT:
256 case VK_FORMAT_R8G8B8A8_UINT:
257 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
258 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
259 if(isNativeFloatAttrib) // Stream: UByte, Shader attrib: Float
260 {
261 v.x = Float4(*Pointer<Byte4>(source0));
262 v.y = Float4(*Pointer<Byte4>(source1));
263 v.z = Float4(*Pointer<Byte4>(source2));
264 v.w = Float4(*Pointer<Byte4>(source3));
265
266 transpose4xN(v.x, v.y, v.z, v.w, componentCount);
267
268 if(normalized)
269 {
270 if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
271 if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
272 if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
273 if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
274 }
275 }
276 else // Stream: UByte, Shader attrib: Int / UInt
277 {
278 v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
279 v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
280 v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
281 v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));
282
283 transpose4xN(v.x, v.y, v.z, v.w, componentCount);
284 }
285 break;
286 case VK_FORMAT_R8_SNORM:
287 case VK_FORMAT_R8_SINT:
288 case VK_FORMAT_R8G8_SNORM:
289 case VK_FORMAT_R8G8_SINT:
290 case VK_FORMAT_R8G8B8A8_SNORM:
291 case VK_FORMAT_R8G8B8A8_SINT:
292 case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
293 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
294 if(isNativeFloatAttrib) // Stream: SByte, Shader attrib: Float
295 {
296 v.x = Float4(*Pointer<SByte4>(source0));
297 v.y = Float4(*Pointer<SByte4>(source1));
298 v.z = Float4(*Pointer<SByte4>(source2));
299 v.w = Float4(*Pointer<SByte4>(source3));
300
301 transpose4xN(v.x, v.y, v.z, v.w, componentCount);
302
303 if(normalized)
304 {
305 if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte));
306 if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte));
307 if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte));
308 if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte));
309 }
310 }
311 else // Stream: SByte, Shader attrib: Int / UInt
312 {
313 v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
314 v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
315 v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
316 v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));
317
318 transpose4xN(v.x, v.y, v.z, v.w, componentCount);
319 }
320 break;
321 case VK_FORMAT_R16_SNORM:
322 case VK_FORMAT_R16_SINT:
323 case VK_FORMAT_R16G16_SNORM:
324 case VK_FORMAT_R16G16_SINT:
325 case VK_FORMAT_R16G16B16A16_SNORM:
326 case VK_FORMAT_R16G16B16A16_SINT:
327 if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
328 {
329 v.x = Float4(*Pointer<Short4>(source0));
330 v.y = Float4(*Pointer<Short4>(source1));
331 v.z = Float4(*Pointer<Short4>(source2));
332 v.w = Float4(*Pointer<Short4>(source3));
333
334 transpose4xN(v.x, v.y, v.z, v.w, componentCount);
335
336 if(normalized)
337 {
338 if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort));
339 if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort));
340 if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort));
341 if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort));
342 }
343 }
344 else // Stream: Short, Shader attrib: Int/UInt, no type conversion
345 {
346 v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
347 v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
348 v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
349 v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
350
351 transpose4xN(v.x, v.y, v.z, v.w, componentCount);
352 }
353 break;
354 case VK_FORMAT_R16_UNORM:
355 case VK_FORMAT_R16_UINT:
356 case VK_FORMAT_R16G16_UNORM:
357 case VK_FORMAT_R16G16_UINT:
358 case VK_FORMAT_R16G16B16A16_UNORM:
359 case VK_FORMAT_R16G16B16A16_UINT:
360 if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
361 {
362 v.x = Float4(*Pointer<UShort4>(source0));
363 v.y = Float4(*Pointer<UShort4>(source1));
364 v.z = Float4(*Pointer<UShort4>(source2));
365 v.w = Float4(*Pointer<UShort4>(source3));
366
367 transpose4xN(v.x, v.y, v.z, v.w, componentCount);
368
369 if(normalized)
370 {
371 if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
372 if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
373 if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
374 if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
375 }
376 }
377 else // Stream: UShort, Shader attrib: Int/UInt, no type conversion
378 {
379 v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
380 v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
381 v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
382 v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));
383
384 transpose4xN(v.x, v.y, v.z, v.w, componentCount);
385 }
386 break;
387 case VK_FORMAT_R32_SINT:
388 case VK_FORMAT_R32G32_SINT:
389 case VK_FORMAT_R32G32B32_SINT:
390 case VK_FORMAT_R32G32B32A32_SINT:
391 if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
392 {
393 v.x = Float4(*Pointer<Int4>(source0));
394 v.y = Float4(*Pointer<Int4>(source1));
395 v.z = Float4(*Pointer<Int4>(source2));
396 v.w = Float4(*Pointer<Int4>(source3));
397
398 transpose4xN(v.x, v.y, v.z, v.w, componentCount);
399
400 if(normalized)
401 {
402 if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
403 if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
404 if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
405 if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
406 }
407 }
408 else // Stream: Int, Shader attrib: Int/UInt, no type conversion
409 {
410 v.x = *Pointer<Float4>(source0);
411 v.y = *Pointer<Float4>(source1);
412 v.z = *Pointer<Float4>(source2);
413 v.w = *Pointer<Float4>(source3);
414
415 transpose4xN(v.x, v.y, v.z, v.w, componentCount);
416 }
417 break;
418 case VK_FORMAT_R32_UINT:
419 case VK_FORMAT_R32G32_UINT:
420 case VK_FORMAT_R32G32B32_UINT:
421 case VK_FORMAT_R32G32B32A32_UINT:
422 if(isNativeFloatAttrib) // Stream: UInt, Shader attrib: Float
423 {
424 v.x = Float4(*Pointer<UInt4>(source0));
425 v.y = Float4(*Pointer<UInt4>(source1));
426 v.z = Float4(*Pointer<UInt4>(source2));
427 v.w = Float4(*Pointer<UInt4>(source3));
428
429 transpose4xN(v.x, v.y, v.z, v.w, componentCount);
430
431 if(normalized)
432 {
433 if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
434 if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
435 if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
436 if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
437 }
438 }
439 else // Stream: UInt, Shader attrib: Int/UInt, no type conversion
440 {
441 v.x = *Pointer<Float4>(source0);
442 v.y = *Pointer<Float4>(source1);
443 v.z = *Pointer<Float4>(source2);
444 v.w = *Pointer<Float4>(source3);
445
446 transpose4xN(v.x, v.y, v.z, v.w, componentCount);
447 }
448 break;
449 case VK_FORMAT_R16_SFLOAT:
450 case VK_FORMAT_R16G16_SFLOAT:
451 case VK_FORMAT_R16G16B16A16_SFLOAT:
452 {
453 if(componentCount >= 1)
454 {
455 UShort x0 = *Pointer<UShort>(source0 + 0);
456 UShort x1 = *Pointer<UShort>(source1 + 0);
457 UShort x2 = *Pointer<UShort>(source2 + 0);
458 UShort x3 = *Pointer<UShort>(source3 + 0);
459
460 v.x.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x0) * 4);
461 v.x.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x1) * 4);
462 v.x.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x2) * 4);
463 v.x.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x3) * 4);
464 }
465
466 if(componentCount >= 2)
467 {
468 UShort y0 = *Pointer<UShort>(source0 + 2);
469 UShort y1 = *Pointer<UShort>(source1 + 2);
470 UShort y2 = *Pointer<UShort>(source2 + 2);
471 UShort y3 = *Pointer<UShort>(source3 + 2);
472
473 v.y.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y0) * 4);
474 v.y.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y1) * 4);
475 v.y.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y2) * 4);
476 v.y.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y3) * 4);
477 }
478
479 if(componentCount >= 3)
480 {
481 UShort z0 = *Pointer<UShort>(source0 + 4);
482 UShort z1 = *Pointer<UShort>(source1 + 4);
483 UShort z2 = *Pointer<UShort>(source2 + 4);
484 UShort z3 = *Pointer<UShort>(source3 + 4);
485
486 v.z.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z0) * 4);
487 v.z.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z1) * 4);
488 v.z.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z2) * 4);
489 v.z.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z3) * 4);
490 }
491
492 if(componentCount >= 4)
493 {
494 UShort w0 = *Pointer<UShort>(source0 + 6);
495 UShort w1 = *Pointer<UShort>(source1 + 6);
496 UShort w2 = *Pointer<UShort>(source2 + 6);
497 UShort w3 = *Pointer<UShort>(source3 + 6);
498
499 v.w.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w0) * 4);
500 v.w.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w1) * 4);
501 v.w.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w2) * 4);
502 v.w.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w3) * 4);
503 }
504 }
505 break;
506 case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
507 case VK_FORMAT_A2R10G10B10_SINT_PACK32:
508 bgra = true;
509 case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
510 case VK_FORMAT_A2B10G10R10_SINT_PACK32:
511 {
512 Int4 src;
513 src = Insert(src, *Pointer<Int>(source0), 0);
514 src = Insert(src, *Pointer<Int>(source1), 1);
515 src = Insert(src, *Pointer<Int>(source2), 2);
516 src = Insert(src, *Pointer<Int>(source3), 3);
517 if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
518 {
519 v.x = Float4((src << 22) >> 22);
520 v.y = Float4((src << 12) >> 22);
521 v.z = Float4((src << 02) >> 22);
522 v.w = Float4(src >> 30);
523
524 if(normalized)
525 {
526 v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
527 v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
528 v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
529 v.w = Max(v.w, Float4(-1.0f));
530 }
531 }
532 else // Stream: UInt, Shader attrib: Int/UInt, no type conversion
533 {
534 v.x = As<Float4>((src << 22) >> 22);
535 v.y = As<Float4>((src << 12) >> 22);
536 v.z = As<Float4>((src << 02) >> 22);
537 v.w = As<Float4>(src >> 30);
538 }
539 }
540 break;
541 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
542 case VK_FORMAT_A2R10G10B10_UINT_PACK32:
543 bgra = true;
544 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
545 case VK_FORMAT_A2B10G10R10_UINT_PACK32:
546 {
547 Int4 src;
548 src = Insert(src, *Pointer<Int>(source0), 0);
549 src = Insert(src, *Pointer<Int>(source1), 1);
550 src = Insert(src, *Pointer<Int>(source2), 2);
551 src = Insert(src, *Pointer<Int>(source3), 3);
552
553 if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
554 {
555 v.x = Float4(src & Int4(0x3FF));
556 v.y = Float4((src >> 10) & Int4(0x3FF));
557 v.z = Float4((src >> 20) & Int4(0x3FF));
558 v.w = Float4((src >> 30) & Int4(0x3));
559
560 if(normalized)
561 {
562 v.x *= Float4(1.0f / 0x3FF);
563 v.y *= Float4(1.0f / 0x3FF);
564 v.z *= Float4(1.0f / 0x3FF);
565 v.w *= Float4(1.0f / 0x3);
566 }
567 }
568 else // Stream: UInt, Shader attrib: Int/UInt, no type conversion
569 {
570 v.x = As<Float4>(src & Int4(0x3FF));
571 v.y = As<Float4>((src >> 10) & Int4(0x3FF));
572 v.z = As<Float4>((src >> 20) & Int4(0x3FF));
573 v.w = As<Float4>((src >> 30) & Int4(0x3));
574 }
575 }
576 break;
577 default:
578 UNSUPPORTED("stream.format %d", int(stream.format));
579 }
580
581 if(bgra)
582 {
583 // Swap red and blue
584 Float4 t = v.x;
585 v.x = v.z;
586 v.z = t;
587 }
588
589 if(componentCount < 1) v.x = Float4(0.0f);
590 if(componentCount < 2) v.y = Float4(0.0f);
591 if(componentCount < 3) v.z = Float4(0.0f);
592 if(componentCount < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(1));
593
594 return v;
595 }
596
// Stores the four just-shaded vertices into the vertex cache and updates the
// tag array. Writes happen in reverse lane order (3..0) so that, even if two
// indices alias the same cache slot, the first lane's vertex — the one the
// caller is about to read back — is the one that survives.
void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
{
	UInt index0 = batch[0];
	UInt index1 = batch[1];
	UInt index2 = batch[2];
	UInt index3 = batch[3];

	UInt cacheIndex0 = index0 & VertexCache::TAG_MASK;
	UInt cacheIndex1 = index1 & VertexCache::TAG_MASK;
	UInt cacheIndex2 = index2 & VertexCache::TAG_MASK;
	UInt cacheIndex3 = index3 & VertexCache::TAG_MASK;

	// We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check.
	// Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache.
	tagCache[cacheIndex3] = index3;
	tagCache[cacheIndex2] = index2;
	tagCache[cacheIndex1] = index1;
	tagCache[cacheIndex0] = index0;

	// gl_Position is mandatory and always 4 components wide.
	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
	assert(it != spirvShader->outputBuiltins.end());
	assert(it->second.SizeInComponents == 4);
	auto &position = routine.getVariable(it->second.Id);

	Vector4f pos;
	pos.x = position[it->second.FirstComponent + 0];
	pos.y = position[it->second.FirstComponent + 1];
	pos.z = position[it->second.FirstComponent + 2];
	pos.w = position[it->second.FirstComponent + 3];

	// Projection and viewport transform.
	// Substitute w = 1 where the shader wrote w == 0, to avoid dividing by zero.
	Float4 w = As<Float4>(As<Int4>(pos.w) | (As<Int4>(CmpEQ(pos.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
	Float4 rhw = Float4(1.0f) / w;

	// Snap x/y to fixed-point raster coordinates; keep z and 1/w as floats.
	Vector4f proj;
	proj.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData, X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData, WxF))));
	proj.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData, Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData, HxF))));
	proj.z = pos.z * rhw;
	proj.w = rhw;

	// Transpose from SoA (per-component) to AoS (per-vertex) before storing.
	transpose4x4(pos.x, pos.y, pos.z, pos.w);

	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, position), 16) = pos.w;
	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, position), 16) = pos.z;
	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, position), 16) = pos.y;
	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, position), 16) = pos.x;

	// Optional built-in outputs: point size, clip distances, cull distances.
	it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
	if(it != spirvShader->outputBuiltins.end())
	{
		ASSERT(it->second.SizeInComponents == 1);
		auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];

		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, pointSize)) = Extract(psize, 3);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, pointSize)) = Extract(psize, 2);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, pointSize)) = Extract(psize, 1);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, pointSize)) = Extract(psize, 0);
	}

	it = spirvShader->outputBuiltins.find(spv::BuiltInClipDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputClipDistances();
		for(unsigned int i = 0; i < count; i++)
		{
			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 3);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 2);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 1);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 0);
		}
	}

	it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputCullDistances();
		for(unsigned int i = 0; i < count; i++)
		{
			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 3);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 2);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 1);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 0);
		}
	}

	// 'clipFlags' packs one flag byte per lane; unpack byte N for lane N.
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 24) & 0x0000000FF;
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 16) & 0x0000000FF;
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 8) & 0x0000000FF;
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 0) & 0x0000000FF;

	// Expand each lane's cull bit to an all-ones/all-zeros Int via negation.
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullMask)) = -((cullMask >> 3) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullMask)) = -((cullMask >> 2) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullMask)) = -((cullMask >> 1) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullMask)) = -((cullMask >> 0) & 1);

	transpose4x4(proj.x, proj.y, proj.z, proj.w);

	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, projected), 16) = proj.w;
	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, projected), 16) = proj.z;
	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, projected), 16) = proj.y;
	*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, projected), 16) = proj.x;

	// Store all user-defined shader outputs, one 4-component group at a time.
	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
	{
		if(spirvShader->outputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
		{
			Vector4f v;
			v.x = routine.outputs[i + 0];
			v.y = routine.outputs[i + 1];
			v.z = routine.outputs[i + 2];
			v.w = routine.outputs[i + 3];

			transpose4x4(v.x, v.y, v.z, v.w);

			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, v[i]), 16) = v.w;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, v[i]), 16) = v.z;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, v[i]), 16) = v.y;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, v[i]), 16) = v.x;
		}
	}
}
723
// Copies one processed vertex from its cache entry into the 'vertex' output
// buffer: position, point size, clip/cull state, the projected position, and
// every active shader output plus clip/cull distance.
void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry)
{
	*Pointer<Int4>(vertex + OFFSET(Vertex, position)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, position));
	*Pointer<Int>(vertex + OFFSET(Vertex, pointSize)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, pointSize));

	*Pointer<Int>(vertex + OFFSET(Vertex, clipFlags)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, clipFlags));
	*Pointer<Int>(vertex + OFFSET(Vertex, cullMask)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, cullMask));
	*Pointer<Int4>(vertex + OFFSET(Vertex, projected)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, projected));

	// Only copy interface components the shader actually writes.
	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
	{
		if(spirvShader->outputs[i].Type != SpirvShader::ATTRIBTYPE_UNUSED)
		{
			*Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, v[i]), 4);
		}
	}
	for(unsigned int i = 0; i < spirvShader->getNumOutputClipDistances(); i++)
	{
		*Pointer<Float>(vertex + OFFSET(Vertex, clipDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, clipDistance[i]), 4);
	}
	for(unsigned int i = 0; i < spirvShader->getNumOutputCullDistances(); i++)
	{
		*Pointer<Float>(vertex + OFFSET(Vertex, cullDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, cullDistance[i]), 4);
	}
}
749
750 } // namespace sw
751