// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "VertexRoutine.hpp"

#include "Constants.hpp"
#include "SpirvShader.hpp"
#include "Device/Renderer.hpp"
#include "Device/Vertex.hpp"
#include "System/Debug.hpp"
#include "System/Half.hpp"

namespace sw {

VertexRoutine::VertexRoutine(
    const VertexProcessor::State &state,
    vk::PipelineLayout const *pipelineLayout,
    SpirvShader const *spirvShader)
    : routine(pipelineLayout)
    , state(state)
    , spirvShader(spirvShader)
{
    spirvShader->emitProlog(&routine);
}

VertexRoutine::~VertexRoutine()
{
}

void VertexRoutine::generate()
{
    Pointer<Byte> cache = task + OFFSET(VertexTask, vertexCache);
    Pointer<Byte> vertexCache = cache + OFFSET(VertexCache, vertex);
    Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache, tag));

    UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask, vertexCount));

    constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, constants));

    // Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer.
    // On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache
    // in reverse order to guarantee that the first one doesn't get evicted and can be written out.

    Do
    {
        UInt index = *batch;
        UInt cacheIndex = index & VertexCache::TAG_MASK;

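        // The vertex cache is direct-mapped: the low bits of the index select the slot, and
        // the tag array records which vertex index currently occupies it.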
        If(tagCache[cacheIndex] != index)
        {
            readInput(batch);
            program(batch, vertexCount);
            computeClipFlags();
            computeCullMask();

            writeCache(vertexCache, tagCache, batch);
        }

        Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));

        // For points, vertexCount is 1 per primitive, so duplicate the vertex for all 3 vertices of the primitive.
        for(int i = 0; i < (state.isPoint ? 3 : 1); i++)
        {
            writeVertex(vertex, cacheEntry);
            vertex += sizeof(Vertex);
        }

        batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
        vertexCount--;
    }
    Until(vertexCount == 0);

    Return();
}

void VertexRoutine::readInput(Pointer<UInt> &batch)
{
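    // Attributes are fetched one vec4 location at a time; location i / 4 has its own buffer
    // pointer, stride, and (when robust buffer access is enabled) bound size in DrawData.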
    for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
    {
        if(spirvShader->inputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
           spirvShader->inputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
           spirvShader->inputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
           spirvShader->inputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
        {
            Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void *) * (i / 4));
            UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
            Int baseVertex = *Pointer<Int>(data + OFFSET(DrawData, baseVertex));
            UInt robustnessSize(0);
            if(state.robustBufferAccess)
            {
                robustnessSize = *Pointer<UInt>(data + OFFSET(DrawData, robustnessSize) + sizeof(uint32_t) * (i / 4));
            }

            auto value = readStream(input, stride, state.input[i / 4], batch, state.robustBufferAccess, robustnessSize, baseVertex);
            routine.inputs[i + 0] = value.x;
            routine.inputs[i + 1] = value.y;
            routine.inputs[i + 2] = value.z;
            routine.inputs[i + 3] = value.w;
        }
    }
}

void VertexRoutine::computeClipFlags()
{
    auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
    if(it != spirvShader->outputBuiltins.end())
    {
        ASSERT(it->second.SizeInComponents == 4);
        auto &pos = routine.getVariable(it->second.Id);
        auto posX = pos[it->second.FirstComponent + 0];
        auto posY = pos[it->second.FirstComponent + 1];
        auto posZ = pos[it->second.FirstComponent + 2];
        auto posW = pos[it->second.FirstComponent + 3];

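        // The Vulkan clip volume is -w <= x <= w, -w <= y <= w, 0 <= z <= w; each comparison
        // below yields a per-lane mask for one face of that volume.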
        Int4 maxX = CmpLT(posW, posX);
        Int4 maxY = CmpLT(posW, posY);
        Int4 maxZ = CmpLT(posW, posZ);
        Int4 minX = CmpNLE(-posW, posX);
        Int4 minY = CmpNLE(-posW, posY);
        Int4 minZ = CmpNLE(Float4(0.0f), posZ);

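        // SignMask packs the four lane results into a 4-bit value that indexes lookup tables
        // in Constants, producing the matching clip-flag bit for each lane (one byte per lane
        // within clipFlags).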
        clipFlags = Pointer<Int>(constants + OFFSET(Constants, maxX))[SignMask(maxX)];
        clipFlags |= Pointer<Int>(constants + OFFSET(Constants, maxY))[SignMask(maxY)];
        clipFlags |= Pointer<Int>(constants + OFFSET(Constants, maxZ))[SignMask(maxZ)];
        clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minX))[SignMask(minX)];
        clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minY))[SignMask(minY)];
        clipFlags |= Pointer<Int>(constants + OFFSET(Constants, minZ))[SignMask(minZ)];

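        // 0x7F7FFFFF is the bit pattern of FLT_MAX; the Constants::fini table folds the
        // finiteness of each lane's position (rejecting INF and NaN) into the clip flags.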
        Float4 maxPos = As<Float4>(Int4(0x7F7FFFFF));
        Int4 finiteX = CmpLE(Abs(posX), maxPos);
        Int4 finiteY = CmpLE(Abs(posY), maxPos);
        Int4 finiteZ = CmpLE(Abs(posZ), maxPos);

        Int4 finiteXYZ = finiteX & finiteY & finiteZ;
        clipFlags |= Pointer<Int>(constants + OFFSET(Constants, fini))[SignMask(finiteXYZ)];
    }
}

void VertexRoutine::computeCullMask()
{
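    // Start with all four SIMD lanes visible; a negative cull distance clears a lane's bit.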
    cullMask = Int(15);

    auto it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
    if(it != spirvShader->outputBuiltins.end())
    {
        auto count = spirvShader->getNumOutputCullDistances();
        for(uint32_t i = 0; i < count; i++)
        {
            const auto &distance = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
            auto mask = SignMask(CmpGE(distance, SIMD::Float(0)));
            cullMask &= mask;
        }
    }
}

Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
                                   bool robustBufferAccess, UInt &robustnessSize, Int baseVertex)
{
    Vector4f v;
    // Because of the following rule in the Vulkan spec, we do not care if a very large negative
    // baseVertex would overflow all the way back into a valid region of the index buffer:
    // "Out-of-bounds buffer loads will return any of the following values:
    //  - Values from anywhere within the memory range(s) bound to the buffer (possibly including
    //    bytes of memory past the end of the buffer, up to the end of the bound range)."
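    //
    // The resulting values are per-lane byte offsets into the vertex buffer:
    // (index + baseVertex) * stride for each of the four vertices in the batch.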
    UInt4 offsets = (*Pointer<UInt4>(As<Pointer<UInt4>>(batch)) + As<UInt4>(Int4(baseVertex))) * UInt4(stride);

    Pointer<Byte> source0 = buffer + offsets.x;
    Pointer<Byte> source1 = buffer + offsets.y;
    Pointer<Byte> source2 = buffer + offsets.z;
    Pointer<Byte> source3 = buffer + offsets.w;

    vk::Format format(stream.format);

    UInt4 zero(0);
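    // With robust buffer access, any lane whose read would extend past the bound range is
    // redirected to a local zero word, so out-of-bounds attribute reads return zero.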
    if(robustBufferAccess)
    {
        // TODO(b/141124876): Optimize for wide-vector gather operations.
        UInt4 limits = offsets + UInt4(format.bytes());
        Pointer<Byte> zeroSource = As<Pointer<Byte>>(&zero);
        source0 = IfThenElse(limits.x <= robustnessSize, source0, zeroSource);
        source1 = IfThenElse(limits.y <= robustnessSize, source1, zeroSource);
        source2 = IfThenElse(limits.z <= robustnessSize, source2, zeroSource);
        source3 = IfThenElse(limits.w <= robustnessSize, source3, zeroSource);
    }

    int componentCount = format.componentCount();
    bool normalized = !format.isUnnormalizedInteger();
    bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || normalized;
    bool bgra = false;

    switch(stream.format)
    {
    case VK_FORMAT_R32_SFLOAT:
    case VK_FORMAT_R32G32_SFLOAT:
    case VK_FORMAT_R32G32B32_SFLOAT:
    case VK_FORMAT_R32G32B32A32_SFLOAT:
    {
        if(componentCount == 0)
        {
            // Null stream, all default components.
        }
        else
        {
            if(componentCount == 1)
            {
                v.x.x = *Pointer<Float>(source0);
                v.x.y = *Pointer<Float>(source1);
                v.x.z = *Pointer<Float>(source2);
                v.x.w = *Pointer<Float>(source3);
            }
            else
            {
                v.x = *Pointer<Float4>(source0);
                v.y = *Pointer<Float4>(source1);
                v.z = *Pointer<Float4>(source2);
                v.w = *Pointer<Float4>(source3);

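                // transpose4xN turns the four per-vertex vectors (AoS) into one register per
                // attribute component (SoA), which is the layout the shader operates on.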
                transpose4xN(v.x, v.y, v.z, v.w, componentCount);
            }
        }
    }
    break;
    case VK_FORMAT_B8G8R8A8_UNORM:
        bgra = true;
        // [[fallthrough]]
    case VK_FORMAT_R8_UNORM:
    case VK_FORMAT_R8G8_UNORM:
    case VK_FORMAT_R8G8B8A8_UNORM:
    case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
        v.x = Float4(*Pointer<Byte4>(source0));
        v.y = Float4(*Pointer<Byte4>(source1));
        v.z = Float4(*Pointer<Byte4>(source2));
        v.w = Float4(*Pointer<Byte4>(source3));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);

        if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
        if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
        if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
        if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleByte));
        break;
    case VK_FORMAT_R8_UINT:
    case VK_FORMAT_R8G8_UINT:
    case VK_FORMAT_R8G8B8A8_UINT:
    case VK_FORMAT_A8B8G8R8_UINT_PACK32:
        v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
        v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
        v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
        v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R8_SNORM:
    case VK_FORMAT_R8G8_SNORM:
    case VK_FORMAT_R8G8B8A8_SNORM:
    case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
        v.x = Float4(*Pointer<SByte4>(source0));
        v.y = Float4(*Pointer<SByte4>(source1));
        v.z = Float4(*Pointer<SByte4>(source2));
        v.w = Float4(*Pointer<SByte4>(source3));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);

        if(componentCount >= 1) v.x = Max(v.x * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
        if(componentCount >= 2) v.y = Max(v.y * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
        if(componentCount >= 3) v.z = Max(v.z * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
        if(componentCount >= 4) v.w = Max(v.w * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
        break;
    case VK_FORMAT_R8_SINT:
    case VK_FORMAT_R8G8_SINT:
    case VK_FORMAT_R8G8B8A8_SINT:
    case VK_FORMAT_A8B8G8R8_SINT_PACK32:
        v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
        v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
        v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
        v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R16_SNORM:
    case VK_FORMAT_R16G16_SNORM:
    case VK_FORMAT_R16G16B16A16_SNORM:
        v.x = Float4(*Pointer<Short4>(source0));
        v.y = Float4(*Pointer<Short4>(source1));
        v.z = Float4(*Pointer<Short4>(source2));
        v.w = Float4(*Pointer<Short4>(source3));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);

        if(componentCount >= 1) v.x = Max(v.x * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
        if(componentCount >= 2) v.y = Max(v.y * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
        if(componentCount >= 3) v.z = Max(v.z * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
        if(componentCount >= 4) v.w = Max(v.w * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
        break;
    case VK_FORMAT_R16_SINT:
    case VK_FORMAT_R16G16_SINT:
    case VK_FORMAT_R16G16B16A16_SINT:
        v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
        v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
        v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
        v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R16_UNORM:
    case VK_FORMAT_R16G16_UNORM:
    case VK_FORMAT_R16G16B16A16_UNORM:
        v.x = Float4(*Pointer<UShort4>(source0));
        v.y = Float4(*Pointer<UShort4>(source1));
        v.z = Float4(*Pointer<UShort4>(source2));
        v.w = Float4(*Pointer<UShort4>(source3));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);

        if(componentCount >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
        if(componentCount >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
        if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
        if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
        break;
    case VK_FORMAT_R16_UINT:
    case VK_FORMAT_R16G16_UINT:
    case VK_FORMAT_R16G16B16A16_UINT:
        v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
        v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
        v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
        v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R32_SINT:
    case VK_FORMAT_R32G32_SINT:
    case VK_FORMAT_R32G32B32_SINT:
    case VK_FORMAT_R32G32B32A32_SINT:
        v.x = *Pointer<Float4>(source0);
        v.y = *Pointer<Float4>(source1);
        v.z = *Pointer<Float4>(source2);
        v.w = *Pointer<Float4>(source3);

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R32_UINT:
    case VK_FORMAT_R32G32_UINT:
    case VK_FORMAT_R32G32B32_UINT:
    case VK_FORMAT_R32G32B32A32_UINT:
        v.x = *Pointer<Float4>(source0);
        v.y = *Pointer<Float4>(source1);
        v.z = *Pointer<Float4>(source2);
        v.w = *Pointer<Float4>(source3);

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R16_SFLOAT:
    case VK_FORMAT_R16G16_SFLOAT:
    case VK_FORMAT_R16G16B16A16_SFLOAT:
    {
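        // Half-precision values are widened through the Constants::half2float lookup table,
        // indexed by the raw 16-bit pattern (4 bytes per float entry).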
        if(componentCount >= 1)
        {
            UShort x0 = *Pointer<UShort>(source0 + 0);
            UShort x1 = *Pointer<UShort>(source1 + 0);
            UShort x2 = *Pointer<UShort>(source2 + 0);
            UShort x3 = *Pointer<UShort>(source3 + 0);

            v.x.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x0) * 4);
            v.x.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x1) * 4);
            v.x.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x2) * 4);
            v.x.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x3) * 4);
        }

        if(componentCount >= 2)
        {
            UShort y0 = *Pointer<UShort>(source0 + 2);
            UShort y1 = *Pointer<UShort>(source1 + 2);
            UShort y2 = *Pointer<UShort>(source2 + 2);
            UShort y3 = *Pointer<UShort>(source3 + 2);

            v.y.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y0) * 4);
            v.y.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y1) * 4);
            v.y.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y2) * 4);
            v.y.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y3) * 4);
        }

        if(componentCount >= 3)
        {
            UShort z0 = *Pointer<UShort>(source0 + 4);
            UShort z1 = *Pointer<UShort>(source1 + 4);
            UShort z2 = *Pointer<UShort>(source2 + 4);
            UShort z3 = *Pointer<UShort>(source3 + 4);

            v.z.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z0) * 4);
            v.z.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z1) * 4);
            v.z.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z2) * 4);
            v.z.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z3) * 4);
        }

        if(componentCount >= 4)
        {
            UShort w0 = *Pointer<UShort>(source0 + 6);
            UShort w1 = *Pointer<UShort>(source1 + 6);
            UShort w2 = *Pointer<UShort>(source2 + 6);
            UShort w3 = *Pointer<UShort>(source3 + 6);

            v.w.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w0) * 4);
            v.w.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w1) * 4);
            v.w.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w2) * 4);
            v.w.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w3) * 4);
        }
    }
    break;
    case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
        bgra = true;
        // [[fallthrough]]
    case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
    {
        Int4 src;
        src = Insert(src, *Pointer<Int>(source0), 0);
        src = Insert(src, *Pointer<Int>(source1), 1);
        src = Insert(src, *Pointer<Int>(source2), 2);
        src = Insert(src, *Pointer<Int>(source3), 3);
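        // Shift each 10-bit field to the top of the word, then arithmetic-shift back down to
        // sign-extend it; the 2-bit alpha field only needs the final arithmetic shift.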
        v.x = Float4((src << 22) >> 22);
        v.y = Float4((src << 12) >> 22);
        v.z = Float4((src << 2) >> 22);
        v.w = Float4(src >> 30);

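        // SNORM: scale to [-1, 1] and clamp, since the minimum encodings (-512 / 511 for the
        // 10-bit fields, -2 for the 2-bit alpha) would otherwise fall below -1.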
        v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
        v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
        v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
        v.w = Max(v.w, Float4(-1.0f));
    }
    break;
    case VK_FORMAT_A2R10G10B10_SINT_PACK32:
        bgra = true;
        // [[fallthrough]]
    case VK_FORMAT_A2B10G10R10_SINT_PACK32:
    {
        Int4 src;
        src = Insert(src, *Pointer<Int>(source0), 0);
        src = Insert(src, *Pointer<Int>(source1), 1);
        src = Insert(src, *Pointer<Int>(source2), 2);
        src = Insert(src, *Pointer<Int>(source3), 3);

        v.x = As<Float4>((src << 22) >> 22);
        v.y = As<Float4>((src << 12) >> 22);
        v.z = As<Float4>((src << 2) >> 22);
        v.w = As<Float4>(src >> 30);
    }
    break;
    case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
        bgra = true;
        // [[fallthrough]]
    case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
    {
        Int4 src;
        src = Insert(src, *Pointer<Int>(source0), 0);
        src = Insert(src, *Pointer<Int>(source1), 1);
        src = Insert(src, *Pointer<Int>(source2), 2);
        src = Insert(src, *Pointer<Int>(source3), 3);

        v.x = Float4(src & Int4(0x3FF));
        v.y = Float4((src >> 10) & Int4(0x3FF));
        v.z = Float4((src >> 20) & Int4(0x3FF));
        v.w = Float4((src >> 30) & Int4(0x3));

        v.x *= Float4(1.0f / 0x3FF);
        v.y *= Float4(1.0f / 0x3FF);
        v.z *= Float4(1.0f / 0x3FF);
        v.w *= Float4(1.0f / 0x3);
    }
    break;
    case VK_FORMAT_A2R10G10B10_UINT_PACK32:
        bgra = true;
        // [[fallthrough]]
    case VK_FORMAT_A2B10G10R10_UINT_PACK32:
    {
        Int4 src;
        src = Insert(src, *Pointer<Int>(source0), 0);
        src = Insert(src, *Pointer<Int>(source1), 1);
        src = Insert(src, *Pointer<Int>(source2), 2);
        src = Insert(src, *Pointer<Int>(source3), 3);

        v.x = As<Float4>(src & Int4(0x3FF));
        v.y = As<Float4>((src >> 10) & Int4(0x3FF));
        v.z = As<Float4>((src >> 20) & Int4(0x3FF));
        v.w = As<Float4>((src >> 30) & Int4(0x3));
    }
    break;
    default:
        UNSUPPORTED("stream.format %d", int(stream.format));
    }

    if(bgra)
    {
        // Swap red and blue.
        Float4 t = v.x;
        v.x = v.z;
        v.z = t;
    }

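    // Missing components receive the standard (0, 0, 0, 1) defaults, with the 1 encoded as
    // 1.0f for float attributes and as integer 1 for pure integer attributes.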
    if(componentCount < 1) v.x = Float4(0.0f);
    if(componentCount < 2) v.y = Float4(0.0f);
    if(componentCount < 3) v.z = Float4(0.0f);
    if(componentCount < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(1));

    return v;
}

void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
{
    UInt index0 = batch[0];
    UInt index1 = batch[1];
    UInt index2 = batch[2];
    UInt index3 = batch[3];

    UInt cacheIndex0 = index0 & VertexCache::TAG_MASK;
    UInt cacheIndex1 = index1 & VertexCache::TAG_MASK;
    UInt cacheIndex2 = index2 & VertexCache::TAG_MASK;
    UInt cacheIndex3 = index3 & VertexCache::TAG_MASK;

    // We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check.
    // Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache.
    tagCache[cacheIndex3] = index3;
    tagCache[cacheIndex2] = index2;
    tagCache[cacheIndex1] = index1;
    tagCache[cacheIndex0] = index0;

    auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
    if(it != spirvShader->outputBuiltins.end())
    {
        ASSERT(it->second.SizeInComponents == 4);
        auto &position = routine.getVariable(it->second.Id);

        Vector4f pos;
        pos.x = position[it->second.FirstComponent + 0];
        pos.y = position[it->second.FirstComponent + 1];
        pos.z = position[it->second.FirstComponent + 2];
        pos.w = position[it->second.FirstComponent + 3];

        // Projection and viewport transform.
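        // Bitwise-select 1.0 into lanes where w == 0 so the reciprocal below cannot divide
        // by zero.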
        Float4 w = As<Float4>(As<Int4>(pos.w) | (As<Int4>(CmpEQ(pos.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
        Float4 rhw = Float4(1.0f) / w;

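        // X0xF/Y0xF and WxF/HxF appear to hold the viewport offset and scale pre-multiplied
        // into fixed-point subpixel units, so x and y round to the subpixel grid while z and
        // w remain floating-point.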
        Vector4f proj;
        proj.x = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData, WxF))));
        proj.y = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData, HxF))));
        proj.z = pos.z * rhw;
        proj.w = rhw;

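        // Transpose from SoA (one register per component) back to AoS (one register per
        // vertex), then store each vertex to its own cache entry, again in reverse order.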
        transpose4x4(pos.x, pos.y, pos.z, pos.w);

        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, position), 16) = pos.w;
        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, position), 16) = pos.z;
        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, position), 16) = pos.y;
        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, position), 16) = pos.x;

        *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 24) & 0xFF;
        *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 16) & 0xFF;
        *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 8) & 0xFF;
        *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 0) & 0xFF;

        transpose4x4(proj.x, proj.y, proj.z, proj.w);

        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, projected), 16) = proj.w;
        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, projected), 16) = proj.z;
        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, projected), 16) = proj.y;
        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, projected), 16) = proj.x;
    }

    it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
    if(it != spirvShader->outputBuiltins.end())
    {
        ASSERT(it->second.SizeInComponents == 1);
        auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];

        *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, pointSize)) = Extract(psize, 3);
        *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, pointSize)) = Extract(psize, 2);
        *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, pointSize)) = Extract(psize, 1);
        *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, pointSize)) = Extract(psize, 0);
    }

    it = spirvShader->outputBuiltins.find(spv::BuiltInClipDistance);
    if(it != spirvShader->outputBuiltins.end())
    {
        auto count = spirvShader->getNumOutputClipDistances();
        for(unsigned int i = 0; i < count; i++)
        {
            auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 3);
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 2);
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 1);
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 0);
        }
    }

    it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
    if(it != spirvShader->outputBuiltins.end())
    {
        auto count = spirvShader->getNumOutputCullDistances();
        for(unsigned int i = 0; i < count; i++)
        {
            auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 3);
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 2);
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 1);
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 0);
        }
    }

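    // Expand each lane's cull bit into a whole word: -((cullMask >> n) & 1) yields
    // 0xFFFFFFFF for a visible vertex and 0 for a culled one.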
    *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullMask)) = -((cullMask >> 3) & 1);
    *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullMask)) = -((cullMask >> 2) & 1);
    *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullMask)) = -((cullMask >> 1) & 1);
    *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullMask)) = -((cullMask >> 0) & 1);

    for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
    {
        if(spirvShader->outputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
           spirvShader->outputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
           spirvShader->outputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
           spirvShader->outputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
        {
            Vector4f v;
            v.x = routine.outputs[i + 0];
            v.y = routine.outputs[i + 1];
            v.z = routine.outputs[i + 2];
            v.w = routine.outputs[i + 3];

            transpose4x4(v.x, v.y, v.z, v.w);

            *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, v[i]), 16) = v.w;
            *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, v[i]), 16) = v.z;
            *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, v[i]), 16) = v.y;
            *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, v[i]), 16) = v.x;
        }
    }
}

void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry)
{
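    // Copy one cached vertex to the 'vertex' output buffer: position, point size, clip/cull
    // state, and only the interface components the shader actually writes.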
    *Pointer<Int4>(vertex + OFFSET(Vertex, position)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, position));
    *Pointer<Int>(vertex + OFFSET(Vertex, pointSize)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, pointSize));

    *Pointer<Int>(vertex + OFFSET(Vertex, clipFlags)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, clipFlags));
    *Pointer<Int>(vertex + OFFSET(Vertex, cullMask)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, cullMask));
    *Pointer<Int4>(vertex + OFFSET(Vertex, projected)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, projected));

    for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
    {
        if(spirvShader->outputs[i].Type != SpirvShader::ATTRIBTYPE_UNUSED)
        {
            *Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, v[i]), 4);
        }
    }
    for(unsigned int i = 0; i < spirvShader->getNumOutputClipDistances(); i++)
    {
        *Pointer<Float>(vertex + OFFSET(Vertex, clipDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, clipDistance[i]), 4);
    }
    for(unsigned int i = 0; i < spirvShader->getNumOutputCullDistances(); i++)
    {
        *Pointer<Float>(vertex + OFFSET(Vertex, cullDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, cullDistance[i]), 4);
    }
}

}  // namespace sw