1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa.h
24 *
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #pragma once
32
33 #include "frontend.h"
34
35 struct PA_STATE
36 {
37 #if USE_SIMD16_FRONTEND
38 enum
39 {
40 SIMD_WIDTH = KNOB_SIMD16_WIDTH,
41 SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
42 SIMD_WIDTH_LOG2 = 4
43 };
44
45 typedef simd16mask SIMDMASK;
46
47 typedef simd16scalar SIMDSCALAR;
48 typedef simd16vector SIMDVECTOR;
49 typedef simd16vertex SIMDVERTEX;
50
51 typedef simd16scalari SIMDSCALARI;
52
53 #else
54 enum
55 {
56 SIMD_WIDTH = KNOB_SIMD_WIDTH,
57 SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
58 SIMD_WIDTH_LOG2 = 3
59 };
60
61 typedef simdmask SIMDMASK;
62
63 typedef simdscalar SIMDSCALAR;
64 typedef simdvector SIMDVECTOR;
65 typedef simdvertex SIMDVERTEX;
66
67 typedef simdscalari SIMDSCALARI;
68
69 #endif
70 DRAW_CONTEXT* pDC{nullptr}; // draw context
71 uint8_t* pStreamBase{nullptr}; // vertex stream
72 uint32_t streamSizeInVerts{0}; // total size of the input stream in verts
73 uint32_t vertexStride{0}; // stride of a vertex in simdvector units
74
75 // The topology the binner will use. In some cases the FE changes the topology from the api
76 // state.
77 PRIMITIVE_TOPOLOGY binTopology{TOP_UNKNOWN};
78
79 #if ENABLE_AVX512_SIMD16
80 bool useAlternateOffset{false};
81 #endif
82
83 bool viewportArrayActive{false};
84 bool rtArrayActive{false};
85 uint32_t numVertsPerPrim{0};
86
PA_STATEPA_STATE87 PA_STATE() {}
PA_STATEPA_STATE88 PA_STATE(DRAW_CONTEXT* in_pDC,
89 uint8_t* in_pStreamBase,
90 uint32_t in_streamSizeInVerts,
91 uint32_t in_vertexStride,
92 uint32_t in_numVertsPerPrim) :
93 pDC(in_pDC),
94 pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts),
95 vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim)
96 {
97 }
98
99 virtual bool HasWork() = 0;
100 virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
101 #if ENABLE_AVX512_SIMD16
102 virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
103 #endif
104 virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
105 #if ENABLE_AVX512_SIMD16
106 virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0;
107 #endif
108 virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
109 virtual bool NextPrim() = 0;
110 virtual SIMDVERTEX& GetNextVsOutput() = 0;
111 virtual bool GetNextStreamOutput() = 0;
112 virtual SIMDMASK& GetNextVsIndices() = 0;
113 virtual uint32_t NumPrims() = 0;
114 virtual void Reset() = 0;
115 virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
116 };
117
118 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
119 // output. Here is the sequence
120 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
121 // 2. Execute PA function to assemble and bin triangles.
122 // a. The PA function is a set of functions that collectively make up the
123 // state machine for a given topology.
124 // 1. We use a state index to track which PA function to call.
// b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
126 // 1. We call this the current and previous simd vertex.
127 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
128 // order to assemble the second triangle, for a triangle list, we'll need the
129 // last vertex from the previous simd and the first 2 vertices from the current
130 // simd.
131 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
132 //
// This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws
// without cuts.
135 struct PA_STATE_OPT : public PA_STATE
136 {
137 uint32_t numPrims{0}; // Total number of primitives for draw.
138 uint32_t numPrimsComplete{0}; // Total number of complete primitives.
139
140 uint32_t numSimdPrims{0}; // Number of prims in current simd.
141
142 uint32_t cur{0}; // index to current VS output.
143 uint32_t prev{0}; // index to prev VS output. Not really needed in the state.
144 const uint32_t first{0}; // index to first VS output. Used for tri fan and line loop.
145
146 uint32_t counter{0}; // state counter
147 bool reset{false}; // reset state
148
149 uint32_t primIDIncr{0}; // how much to increment for each vector (typically vector / {1, 2})
150 SIMDSCALARI primID;
151
152 typedef bool (*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
153 #if ENABLE_AVX512_SIMD16
154 typedef bool (*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
155 #endif
156 typedef void (*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa,
157 uint32_t slot,
158 uint32_t primIndex,
159 simd4scalar verts[]);
160
161 PFN_PA_FUNC pfnPaFunc{nullptr}; // PA state machine function for assembling 4 triangles.
162 #if ENABLE_AVX512_SIMD16
163 PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{nullptr};
164 #endif
165 PFN_PA_SINGLE_FUNC pfnPaSingleFunc{
166 nullptr}; // PA state machine function for assembling single triangle.
167 PFN_PA_FUNC pfnPaFuncReset{nullptr}; // initial state to set on reset
168 #if ENABLE_AVX512_SIMD16
169 PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{nullptr};
170 #endif
171
172 // state used to advance the PA when Next is called
173 PFN_PA_FUNC pfnPaNextFunc{nullptr};
174 #if ENABLE_AVX512_SIMD16
175 PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{nullptr};
176 #endif
177 uint32_t nextNumSimdPrims{0};
178 uint32_t nextNumPrimsIncrement{0};
179 bool nextReset{false};
180 bool isStreaming{false};
181
182 SIMDMASK junkIndices{0}; // temporary index store for unused virtual function
183
PA_STATE_OPTPA_STATE_OPT184 PA_STATE_OPT() {}
185 PA_STATE_OPT(DRAW_CONTEXT* pDC,
186 uint32_t numPrims,
187 uint8_t* pStream,
188 uint32_t streamSizeInVerts,
189 uint32_t vertexStride,
190 bool in_isStreaming,
191 uint32_t numVertsPerPrim,
192 PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
193
HasWorkPA_STATE_OPT194 bool HasWork() { return (this->numPrimsComplete < this->numPrims) ? true : false; }
195
GetSimdVectorPA_STATE_OPT196 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
197 {
198 SWR_ASSERT(slot < vertexStride);
199 uint32_t offset = index * vertexStride + slot;
200 simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
201 return vertexSlot;
202 }
203
204 #if ENABLE_AVX512_SIMD16
GetSimdVector_simd16PA_STATE_OPT205 simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
206 {
207 SWR_ASSERT(slot < vertexStride);
208 uint32_t offset = index * vertexStride + slot;
209 simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
210 return vertexSlot;
211 }
212
213 #endif
214 // Assembles 4 triangles. Each simdvector is a single vertex from 4
215 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
AssemblePA_STATE_OPT216 bool Assemble(uint32_t slot, simdvector verts[]) { return this->pfnPaFunc(*this, slot, verts); }
217
218 #if ENABLE_AVX512_SIMD16
AssemblePA_STATE_OPT219 bool Assemble(uint32_t slot, simd16vector verts[])
220 {
221 return this->pfnPaFunc_simd16(*this, slot, verts);
222 }
223
224 #endif
225 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
AssembleSinglePA_STATE_OPT226 void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
227 {
228 return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
229 }
230
NextPrimPA_STATE_OPT231 bool NextPrim()
232 {
233 this->pfnPaFunc = this->pfnPaNextFunc;
234 #if ENABLE_AVX512_SIMD16
235 this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
236 #endif
237 this->numSimdPrims = this->nextNumSimdPrims;
238 this->numPrimsComplete += this->nextNumPrimsIncrement;
239 this->reset = this->nextReset;
240
241 if (this->isStreaming)
242 {
243 this->reset = false;
244 }
245
246 bool morePrims = false;
247
248 if (this->numSimdPrims > 0)
249 {
250 morePrims = true;
251 this->numSimdPrims--;
252 }
253 else
254 {
255 this->counter = (this->reset) ? 0 : (this->counter + 1);
256 this->reset = false;
257 }
258
259 if (!HasWork())
260 {
261 morePrims = false; // no more to do
262 }
263
264 return morePrims;
265 }
266
GetNextVsOutputPA_STATE_OPT267 SIMDVERTEX& GetNextVsOutput()
268 {
269 const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;
270
271 // increment cur and prev indices
272 if (counter < numSimdVerts)
273 {
274 // prev undefined for first state
275 prev = cur;
276 cur = counter;
277 }
278 else
279 {
280 // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in
281 // the buffer
282 uint32_t temp = prev;
283
284 prev = cur;
285 cur = temp;
286 }
287
288 SWR_ASSERT(cur < numSimdVerts);
289 SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];
290
291 return *(SIMDVERTEX*)pVertex;
292 }
293
GetNextVsIndicesPA_STATE_OPT294 SIMDMASK& GetNextVsIndices()
295 {
296 // unused in optimized PA, pass tmp buffer back
297 return junkIndices;
298 }
299
GetNextStreamOutputPA_STATE_OPT300 bool GetNextStreamOutput()
301 {
302 this->prev = this->cur;
303 this->cur = this->counter;
304
305 return HasWork();
306 }
307
NumPrimsPA_STATE_OPT308 uint32_t NumPrims()
309 {
310 return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims)
311 ? (SIMD_WIDTH -
312 (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims))
313 : SIMD_WIDTH;
314 }
315
316 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
317 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
318 uint32_t numSimdPrims = 0,
319 uint32_t numPrimsIncrement = 0,
320 bool reset = false)
321 {
322 this->pfnPaNextFunc = pfnPaNextFunc;
323 this->nextNumSimdPrims = numSimdPrims;
324 this->nextNumPrimsIncrement = numPrimsIncrement;
325 this->nextReset = reset;
326
327 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
328 }
329
330 #if ENABLE_AVX512_SIMD16
331 void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
332 PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
333 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
334 uint32_t numSimdPrims = 0,
335 uint32_t numPrimsIncrement = 0,
336 bool reset = false)
337 {
338 this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
339 this->pfnPaNextFunc = pfnPaNextFunc;
340 this->nextNumSimdPrims = numSimdPrims;
341 this->nextNumPrimsIncrement = numPrimsIncrement;
342 this->nextReset = reset;
343
344 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
345 }
346
347 #endif
ResetPA_STATE_OPT348 void Reset()
349 {
350 #if ENABLE_AVX512_SIMD16
351 useAlternateOffset = false;
352
353 #endif
354 this->pfnPaFunc = this->pfnPaFuncReset;
355 #if ENABLE_AVX512_SIMD16
356 this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
357 #endif
358 this->numPrimsComplete = 0;
359 this->numSimdPrims = 0;
360 this->cur = 0;
361 this->prev = 0;
362 this->counter = 0;
363 this->reset = false;
364 }
365
GetPrimIDPA_STATE_OPT366 SIMDSCALARI GetPrimID(uint32_t startID)
367 {
368 #if USE_SIMD16_FRONTEND
369 return _simd16_add_epi32(
370 this->primID,
371 _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
372 #else
373 return _simd_add_epi32(
374 this->primID,
375 _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
376 #endif
377 }
378 };
379
380 // helper C wrappers to avoid having to rewrite all the PA topology state functions
381 INLINE void SetNextPaState(PA_STATE_OPT& pa,
382 PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
383 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
384 uint32_t numSimdPrims = 0,
385 uint32_t numPrimsIncrement = 0,
386 bool reset = false)
387 {
388 return pa.SetNextState(
389 pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
390 }
391
392 #if ENABLE_AVX512_SIMD16
393 INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa,
394 PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
395 PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
396 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
397 uint32_t numSimdPrims = 0,
398 uint32_t numPrimsIncrement = 0,
399 bool reset = false)
400 {
401 return pa.SetNextState_simd16(pfnPaNextFunc_simd16,
402 pfnPaNextFunc,
403 pfnPaNextSingleFunc,
404 numSimdPrims,
405 numPrimsIncrement,
406 reset);
407 }
408
409 #endif
PaGetSimdVector(PA_STATE & pa,uint32_t index,uint32_t slot)410 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
411 {
412 return pa.GetSimdVector(index, slot);
413 }
414
415 #if ENABLE_AVX512_SIMD16
PaGetSimdVector_simd16(PA_STATE & pa,uint32_t index,uint32_t slot)416 INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
417 {
418 return pa.GetSimdVector_simd16(index, slot);
419 }
420
421 #endif
422 // Cut-aware primitive assembler.
423 struct PA_STATE_CUT : public PA_STATE
424 {
425 SIMDMASK* pCutIndices{nullptr}; // cut indices buffer, 1 bit per vertex
426 uint32_t numVerts{0}; // number of vertices available in buffer store
427 uint32_t numAttribs{0}; // number of attributes
428 int32_t numRemainingVerts{0}; // number of verts remaining to be assembled
429 uint32_t numVertsToAssemble{0}; // total number of verts to assemble for the draw
430 #if ENABLE_AVX512_SIMD16
431 OSALIGNSIMD16(uint32_t)
432 indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
433 #else
434 OSALIGNSIMD(uint32_t)
435 indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
436 #endif
437 SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
438 uint32_t numPrimsAssembled{0}; // number of primitives that are fully assembled
439 uint32_t headVertex{0}; // current unused vertex slot in vertex buffer store
440 uint32_t tailVertex{0}; // beginning vertex currently assembling
441 uint32_t curVertex{0}; // current unprocessed vertex
442 uint32_t startPrimId{0}; // starting prim id
443 SIMDSCALARI vPrimId; // vector of prim ID
444 bool needOffsets{false}; // need to compute gather offsets for current SIMD
445 uint32_t vertsPerPrim{0};
446 bool processCutVerts{
447 false}; // vertex indices with cuts should be processed as normal, otherwise they
448 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
449 // while the GS sends valid verts for every index
450
451 simdvector junkVector; // junk simdvector for unimplemented API
452 #if ENABLE_AVX512_SIMD16
453 simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
454 #endif
455
456 // Topology state tracking
457 uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
458 uint32_t curIndex{0};
459 bool reverseWinding{false}; // indicates reverse winding for strips
460 int32_t adjExtraVert{0}; // extra vert uses for tristrip w/ adj
461
462 typedef void (PA_STATE_CUT::*PFN_PA_FUNC)(uint32_t vert, bool finish);
463 PFN_PA_FUNC pfnPa{nullptr}; // per-topology function that processes a single vert
464
PA_STATE_CUTPA_STATE_CUT465 PA_STATE_CUT() {}
PA_STATE_CUTPA_STATE_CUT466 PA_STATE_CUT(DRAW_CONTEXT* pDC,
467 uint8_t* in_pStream,
468 uint32_t in_streamSizeInVerts,
469 uint32_t in_vertexStride,
470 SIMDMASK* in_pIndices,
471 uint32_t in_numVerts,
472 uint32_t in_numAttribs,
473 PRIMITIVE_TOPOLOGY topo,
474 bool in_processCutVerts,
475 uint32_t in_numVertsPerPrim) :
476 PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
477 {
478 numVerts = in_streamSizeInVerts;
479 numAttribs = in_numAttribs;
480 binTopology = topo;
481 needOffsets = false;
482 processCutVerts = in_processCutVerts;
483
484 numVertsToAssemble = numRemainingVerts = in_numVerts;
485 numPrimsAssembled = 0;
486 headVertex = tailVertex = curVertex = 0;
487
488 curIndex = 0;
489 pCutIndices = in_pIndices;
490 memset(indices, 0, sizeof(indices));
491 #if USE_SIMD16_FRONTEND
492 vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
493 #else
494 vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
495 #endif
496 reverseWinding = false;
497 adjExtraVert = -1;
498
499 bool gsEnabled = pDC->pState->state.gsState.gsEnable;
500 vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
501
502 switch (topo)
503 {
504 case TOP_TRIANGLE_LIST:
505 pfnPa = &PA_STATE_CUT::ProcessVertTriList;
506 break;
507 case TOP_TRI_LIST_ADJ:
508 pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj
509 : &PA_STATE_CUT::ProcessVertTriListAdjNoGs;
510 break;
511 case TOP_TRIANGLE_STRIP:
512 pfnPa = &PA_STATE_CUT::ProcessVertTriStrip;
513 break;
514 case TOP_TRI_STRIP_ADJ:
515 if (gsEnabled)
516 {
517 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
518 }
519 else
520 {
521 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
522 }
523 break;
524
525 case TOP_POINT_LIST:
526 pfnPa = &PA_STATE_CUT::ProcessVertPointList;
527 break;
528 case TOP_LINE_LIST:
529 pfnPa = &PA_STATE_CUT::ProcessVertLineList;
530 break;
531 case TOP_LINE_LIST_ADJ:
532 pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj
533 : &PA_STATE_CUT::ProcessVertLineListAdjNoGs;
534 break;
535 case TOP_LINE_STRIP:
536 pfnPa = &PA_STATE_CUT::ProcessVertLineStrip;
537 break;
538 case TOP_LISTSTRIP_ADJ:
539 pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj
540 : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs;
541 break;
542 case TOP_RECT_LIST:
543 pfnPa = &PA_STATE_CUT::ProcessVertRectList;
544 break;
545 default:
546 assert(0 && "Unimplemented topology");
547 }
548 }
549
GetNextVsOutputPA_STATE_CUT550 SIMDVERTEX& GetNextVsOutput()
551 {
552 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
553 this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
554 this->needOffsets = true;
555 SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];
556
557 return *(SIMDVERTEX*)pVertex;
558 }
559
GetNextVsIndicesPA_STATE_CUT560 SIMDMASK& GetNextVsIndices()
561 {
562 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
563 SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
564 return *pCurCutIndex;
565 }
566
GetSimdVectorPA_STATE_CUT567 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
568 {
569 // unused
570 SWR_ASSERT(0 && "Not implemented");
571 return junkVector;
572 }
573
574 #if ENABLE_AVX512_SIMD16
GetSimdVector_simd16PA_STATE_CUT575 simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
576 {
577 // unused
578 SWR_ASSERT(0 && "Not implemented");
579 return junkVector_simd16;
580 }
581
582 #endif
GetNextStreamOutputPA_STATE_CUT583 bool GetNextStreamOutput()
584 {
585 this->headVertex += SIMD_WIDTH;
586 this->needOffsets = true;
587 return HasWork();
588 }
589
GetPrimIDPA_STATE_CUT590 SIMDSCALARI GetPrimID(uint32_t startID)
591 {
592 #if USE_SIMD16_FRONTEND
593 return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
594 #else
595 return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
596 #endif
597 }
598
ResetPA_STATE_CUT599 void Reset()
600 {
601 #if ENABLE_AVX512_SIMD16
602 useAlternateOffset = false;
603
604 #endif
605 this->numRemainingVerts = this->numVertsToAssemble;
606 this->numPrimsAssembled = 0;
607 this->curIndex = 0;
608 this->curVertex = 0;
609 this->tailVertex = 0;
610 this->headVertex = 0;
611 this->reverseWinding = false;
612 this->adjExtraVert = -1;
613 #if USE_SIMD16_FRONTEND
614 this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
615 #else
616 this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
617 #endif
618 }
619
HasWorkPA_STATE_CUT620 bool HasWork() { return this->numRemainingVerts > 0 || this->adjExtraVert != -1; }
621
IsVertexStoreFullPA_STATE_CUT622 bool IsVertexStoreFull()
623 {
624 return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
625 }
626
RestartTopologyPA_STATE_CUT627 void RestartTopology()
628 {
629 this->curIndex = 0;
630 this->reverseWinding = false;
631 this->adjExtraVert = -1;
632 }
633
IsCutIndexPA_STATE_CUT634 bool IsCutIndex(uint32_t vertex)
635 {
636 uint32_t vertexIndex = vertex / SIMD_WIDTH;
637 uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
638 return CheckBit(this->pCutIndices[vertexIndex], vertexOffset);
639 }
640
641 // iterates across the unprocessed verts until we hit the end or we
642 // have assembled SIMD prims
ProcessVertsPA_STATE_CUT643 void ProcessVerts()
644 {
645 while (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0 &&
646 this->curVertex != this->headVertex)
647 {
648 // if cut index, restart topology
649 if (IsCutIndex(this->curVertex))
650 {
651 if (this->processCutVerts)
652 {
653 (this->*pfnPa)(this->curVertex, false);
654 }
655 // finish off tri strip w/ adj before restarting topo
656 if (this->adjExtraVert != -1)
657 {
658 (this->*pfnPa)(this->curVertex, true);
659 }
660 RestartTopology();
661 }
662 else
663 {
664 (this->*pfnPa)(this->curVertex, false);
665 }
666
667 this->curVertex++;
668 if (this->curVertex >= this->numVerts)
669 {
670 this->curVertex = 0;
671 }
672 this->numRemainingVerts--;
673 }
674
675 // special case last primitive for tri strip w/ adj
676 if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 &&
677 this->adjExtraVert != -1)
678 {
679 (this->*pfnPa)(this->curVertex, true);
680 }
681 }
682
AdvancePA_STATE_CUT683 void Advance()
684 {
685 // done with current batch
686 // advance tail to the current unsubmitted vertex
687 this->tailVertex = this->curVertex;
688 this->numPrimsAssembled = 0;
689 #if USE_SIMD16_FRONTEND
690 this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
691 #else
692 this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
693 #endif
694 }
695
NextPrimPA_STATE_CUT696 bool NextPrim()
697 {
698 // if we've assembled enough prims, we can advance to the next set of verts
699 if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
700 {
701 Advance();
702 }
703 return false;
704 }
705
ComputeOffsetsPA_STATE_CUT706 void ComputeOffsets()
707 {
708 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
709 {
710 uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
711 SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];
712
713 // step to simdvertex batch
714 const uint32_t simdShift = SIMD_WIDTH_LOG2;
715 #if USE_SIMD16_FRONTEND
716 SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
717 this->vOffsets[v] =
718 _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
719 #else
720 SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
721 this->vOffsets[v] =
722 _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
723 #endif
724
725 // step to index
726 const uint32_t simdMask = SIMD_WIDTH - 1;
727 #if USE_SIMD16_FRONTEND
728 SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
729 this->vOffsets[v] = _simd16_add_epi32(
730 this->vOffsets[v],
731 _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
732 #else
733 SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
734 this->vOffsets[v] =
735 _simd_add_epi32(this->vOffsets[v],
736 _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
737 #endif
738 }
739 }
740
AssemblePA_STATE_CUT741 bool Assemble(uint32_t slot, simdvector* verts)
742 {
743 // process any outstanding verts
744 ProcessVerts();
745
746 // return false if we don't have enough prims assembled
747 if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
748 {
749 return false;
750 }
751
752 // cache off gather offsets given the current SIMD set of indices the first time we get an
753 // assemble
754 if (this->needOffsets)
755 {
756 ComputeOffsets();
757 this->needOffsets = false;
758 }
759
760 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
761 {
762 SIMDSCALARI offsets = this->vOffsets[v];
763
764 // step to attribute
765 #if USE_SIMD16_FRONTEND
766 offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
767 #else
768 offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
769 #endif
770
771 float* pBase = (float*)this->pStreamBase;
772 for (uint32_t c = 0; c < 4; ++c)
773 {
774 #if USE_SIMD16_FRONTEND
775 simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
776
777 // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
778 simdscalar t =
779 useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
780 verts[v].v[c] = t;
781 #else
782 verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
783 #endif
784
785 // move base to next component
786 pBase += SIMD_WIDTH;
787 }
788 }
789
790 // compute the implied 4th vertex, v3
791 if (this->binTopology == TOP_RECT_LIST)
792 {
793 for (uint32_t c = 0; c < 4; ++c)
794 {
795 // v1, v3 = v1 + v2 - v0, v2
796 // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
797 simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
798 temp = _simd16_sub_ps(temp, verts[1].v[c]);
799 temp = _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
800 verts[1].v[c] = _simd16_extract_ps(temp, 0);
801 }
802 }
803
804 return true;
805 }
806
807 #if ENABLE_AVX512_SIMD16
AssemblePA_STATE_CUT808 bool Assemble(uint32_t slot, simd16vector verts[])
809 {
810 // process any outstanding verts
811 ProcessVerts();
812
813 // return false if we don't have enough prims assembled
814 if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
815 {
816 return false;
817 }
818
819 // cache off gather offsets given the current SIMD set of indices the first time we get an
820 // assemble
821 if (this->needOffsets)
822 {
823 ComputeOffsets();
824 this->needOffsets = false;
825 }
826
827 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
828 {
829 SIMDSCALARI offsets = this->vOffsets[v];
830
831 // step to attribute
832 #if USE_SIMD16_FRONTEND
833 offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
834 #else
835 offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
836 #endif
837
838 float* pBase = (float*)this->pStreamBase;
839 for (uint32_t c = 0; c < 4; ++c)
840 {
841 #if USE_SIMD16_FRONTEND
842 verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
843 #else
844 verts[v].v[c] = _simd16_insert_ps(
845 _simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
846 #endif
847
848 // move base to next component
849 pBase += SIMD_WIDTH;
850 }
851 }
852
853 // compute the implied 4th vertex, v3
854 if (this->binTopology == TOP_RECT_LIST)
855 {
856 for (uint32_t c = 0; c < 4; ++c)
857 {
858 // v1, v3 = v1 + v2 - v0, v2
859 // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
860 simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
861 temp = _simd16_sub_ps(temp, verts[1].v[c]);
862 verts[1].v[c] =
863 _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
864 }
865 }
866
867 return true;
868 }
869
870 #endif
AssembleSinglePA_STATE_CUT871 void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
872 {
873 // move to slot
874 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
875 {
876 uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
877 #if USE_SIMD16_FRONTEND
878 uint32_t offset =
879 useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
880 #else
881 uint32_t offset = pOffset[triIndex];
882 #endif
883 offset += sizeof(SIMDVECTOR) * slot;
884 float* pVert = (float*)&tri[v];
885 for (uint32_t c = 0; c < 4; ++c)
886 {
887 float* pComponent = (float*)(this->pStreamBase + offset);
888 pVert[c] = *pComponent;
889 offset += SIMD_WIDTH * sizeof(float);
890 }
891 }
892
893 // compute the implied 4th vertex, v3
894 if ((this->binTopology == TOP_RECT_LIST) && (triIndex % 2 == 1))
895 {
896 // v1, v3 = v1 + v2 - v0, v2
897 // v1 stored in tri[0], v0 stored in tri[1], v2 stored in tri[2]
898 float* pVert0 = (float*)&tri[1];
899 float* pVert1 = (float*)&tri[0];
900 float* pVert2 = (float*)&tri[2];
901 float* pVert3 = (float*)&tri[1];
902 for (uint32_t c = 0; c < 4; ++c)
903 {
904 pVert3[c] = pVert1[c] + pVert2[c] - pVert0[c];
905 }
906 }
907 }
908
NumPrimsPA_STATE_CUT909 uint32_t NumPrims() { return this->numPrimsAssembled; }
910
911 // Per-topology functions
ProcessVertTriStripPA_STATE_CUT912 void ProcessVertTriStrip(uint32_t index, bool finish)
913 {
914 this->vert[this->curIndex] = index;
915 this->curIndex++;
916 if (this->curIndex == 3)
917 {
918 // assembled enough verts for prim, add to gather indices
919 this->indices[0][this->numPrimsAssembled] = this->vert[0];
920 if (reverseWinding)
921 {
922 this->indices[1][this->numPrimsAssembled] = this->vert[2];
923 this->indices[2][this->numPrimsAssembled] = this->vert[1];
924 }
925 else
926 {
927 this->indices[1][this->numPrimsAssembled] = this->vert[1];
928 this->indices[2][this->numPrimsAssembled] = this->vert[2];
929 }
930
931 // increment numPrimsAssembled
932 this->numPrimsAssembled++;
933
934 // set up next prim state
935 this->vert[0] = this->vert[1];
936 this->vert[1] = this->vert[2];
937 this->curIndex = 2;
938 this->reverseWinding ^= 1;
939 }
940 }
941
942 template <bool gsEnabled>
AssembleTriStripAdjPA_STATE_CUT943 void AssembleTriStripAdj()
944 {
945 if (!gsEnabled)
946 {
947 this->vert[1] = this->vert[2];
948 this->vert[2] = this->vert[4];
949
950 this->indices[0][this->numPrimsAssembled] = this->vert[0];
951 this->indices[1][this->numPrimsAssembled] = this->vert[1];
952 this->indices[2][this->numPrimsAssembled] = this->vert[2];
953
954 this->vert[4] = this->vert[2];
955 this->vert[2] = this->vert[1];
956 }
957 else
958 {
959 this->indices[0][this->numPrimsAssembled] = this->vert[0];
960 this->indices[1][this->numPrimsAssembled] = this->vert[1];
961 this->indices[2][this->numPrimsAssembled] = this->vert[2];
962 this->indices[3][this->numPrimsAssembled] = this->vert[3];
963 this->indices[4][this->numPrimsAssembled] = this->vert[4];
964 this->indices[5][this->numPrimsAssembled] = this->vert[5];
965 }
966 this->numPrimsAssembled++;
967 }
968
969 template <bool gsEnabled>
ProcessVertTriStripAdjPA_STATE_CUT970 void ProcessVertTriStripAdj(uint32_t index, bool finish)
971 {
972 // handle last primitive of tristrip
973 if (finish && this->adjExtraVert != -1)
974 {
975 this->vert[3] = this->adjExtraVert;
976 AssembleTriStripAdj<gsEnabled>();
977 this->adjExtraVert = -1;
978 return;
979 }
980
981 switch (this->curIndex)
982 {
983 case 0:
984 case 1:
985 case 2:
986 case 4:
987 this->vert[this->curIndex] = index;
988 this->curIndex++;
989 break;
990 case 3:
991 this->vert[5] = index;
992 this->curIndex++;
993 break;
994 case 5:
995 if (this->adjExtraVert == -1)
996 {
997 this->adjExtraVert = index;
998 }
999 else
1000 {
1001 this->vert[3] = index;
1002 if (!gsEnabled)
1003 {
1004 AssembleTriStripAdj<gsEnabled>();
1005
1006 uint32_t nextTri[6];
1007 if (this->reverseWinding)
1008 {
1009 nextTri[0] = this->vert[4];
1010 nextTri[1] = this->vert[0];
1011 nextTri[2] = this->vert[2];
1012 nextTri[4] = this->vert[3];
1013 nextTri[5] = this->adjExtraVert;
1014 }
1015 else
1016 {
1017 nextTri[0] = this->vert[2];
1018 nextTri[1] = this->adjExtraVert;
1019 nextTri[2] = this->vert[3];
1020 nextTri[4] = this->vert[4];
1021 nextTri[5] = this->vert[0];
1022 }
1023 for (uint32_t i = 0; i < 6; ++i)
1024 {
1025 this->vert[i] = nextTri[i];
1026 }
1027
1028 this->adjExtraVert = -1;
1029 this->reverseWinding ^= 1;
1030 }
1031 else
1032 {
1033 this->curIndex++;
1034 }
1035 }
1036 break;
1037 case 6:
1038 SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
1039 AssembleTriStripAdj<gsEnabled>();
1040
1041 uint32_t nextTri[6];
1042 if (this->reverseWinding)
1043 {
1044 nextTri[0] = this->vert[4];
1045 nextTri[1] = this->vert[0];
1046 nextTri[2] = this->vert[2];
1047 nextTri[4] = this->vert[3];
1048 nextTri[5] = this->adjExtraVert;
1049 }
1050 else
1051 {
1052 nextTri[0] = this->vert[2];
1053 nextTri[1] = this->adjExtraVert;
1054 nextTri[2] = this->vert[3];
1055 nextTri[4] = this->vert[4];
1056 nextTri[5] = this->vert[0];
1057 }
1058 for (uint32_t i = 0; i < 6; ++i)
1059 {
1060 this->vert[i] = nextTri[i];
1061 }
1062 this->reverseWinding ^= 1;
1063 this->adjExtraVert = index;
1064 this->curIndex--;
1065 break;
1066 }
1067 }
1068
ProcessVertTriListPA_STATE_CUT1069 void ProcessVertTriList(uint32_t index, bool finish)
1070 {
1071 this->vert[this->curIndex] = index;
1072 this->curIndex++;
1073 if (this->curIndex == 3)
1074 {
1075 // assembled enough verts for prim, add to gather indices
1076 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1077 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1078 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1079
1080 // increment numPrimsAssembled
1081 this->numPrimsAssembled++;
1082
1083 // set up next prim state
1084 this->curIndex = 0;
1085 }
1086 }
1087
ProcessVertTriListAdjPA_STATE_CUT1088 void ProcessVertTriListAdj(uint32_t index, bool finish)
1089 {
1090 this->vert[this->curIndex] = index;
1091 this->curIndex++;
1092 if (this->curIndex == 6)
1093 {
1094 // assembled enough verts for prim, add to gather indices
1095 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1096 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1097 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1098 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1099 this->indices[4][this->numPrimsAssembled] = this->vert[4];
1100 this->indices[5][this->numPrimsAssembled] = this->vert[5];
1101
1102 // increment numPrimsAssembled
1103 this->numPrimsAssembled++;
1104
1105 // set up next prim state
1106 this->curIndex = 0;
1107 }
1108 }
1109
ProcessVertTriListAdjNoGsPA_STATE_CUT1110 void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
1111 {
1112 this->vert[this->curIndex] = index;
1113 this->curIndex++;
1114 if (this->curIndex == 6)
1115 {
1116 // assembled enough verts for prim, add to gather indices
1117 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1118 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1119 this->indices[2][this->numPrimsAssembled] = this->vert[4];
1120
1121 // increment numPrimsAssembled
1122 this->numPrimsAssembled++;
1123
1124 // set up next prim state
1125 this->curIndex = 0;
1126 }
1127 }
1128
ProcessVertLineListPA_STATE_CUT1129 void ProcessVertLineList(uint32_t index, bool finish)
1130 {
1131 this->vert[this->curIndex] = index;
1132 this->curIndex++;
1133 if (this->curIndex == 2)
1134 {
1135 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1136 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1137
1138 this->numPrimsAssembled++;
1139 this->curIndex = 0;
1140 }
1141 }
1142
ProcessVertLineStripPA_STATE_CUT1143 void ProcessVertLineStrip(uint32_t index, bool finish)
1144 {
1145 this->vert[this->curIndex] = index;
1146 this->curIndex++;
1147 if (this->curIndex == 2)
1148 {
1149 // assembled enough verts for prim, add to gather indices
1150 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1151 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1152
1153 // increment numPrimsAssembled
1154 this->numPrimsAssembled++;
1155
1156 // set up next prim state
1157 this->vert[0] = this->vert[1];
1158 this->curIndex = 1;
1159 }
1160 }
1161
ProcessVertLineStripAdjPA_STATE_CUT1162 void ProcessVertLineStripAdj(uint32_t index, bool finish)
1163 {
1164 this->vert[this->curIndex] = index;
1165 this->curIndex++;
1166 if (this->curIndex == 4)
1167 {
1168 // assembled enough verts for prim, add to gather indices
1169 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1170 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1171 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1172 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1173
1174 // increment numPrimsAssembled
1175 this->numPrimsAssembled++;
1176
1177 // set up next prim state
1178 this->vert[0] = this->vert[1];
1179 this->vert[1] = this->vert[2];
1180 this->vert[2] = this->vert[3];
1181 this->curIndex = 3;
1182 }
1183 }
1184
ProcessVertLineStripAdjNoGsPA_STATE_CUT1185 void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
1186 {
1187 this->vert[this->curIndex] = index;
1188 this->curIndex++;
1189 if (this->curIndex == 4)
1190 {
1191 // assembled enough verts for prim, add to gather indices
1192 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1193 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1194
1195 // increment numPrimsAssembled
1196 this->numPrimsAssembled++;
1197
1198 // set up next prim state
1199 this->vert[0] = this->vert[1];
1200 this->vert[1] = this->vert[2];
1201 this->vert[2] = this->vert[3];
1202 this->curIndex = 3;
1203 }
1204 }
1205
ProcessVertLineListAdjPA_STATE_CUT1206 void ProcessVertLineListAdj(uint32_t index, bool finish)
1207 {
1208 this->vert[this->curIndex] = index;
1209 this->curIndex++;
1210 if (this->curIndex == 4)
1211 {
1212 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1213 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1214 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1215 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1216
1217 this->numPrimsAssembled++;
1218 this->curIndex = 0;
1219 }
1220 }
1221
ProcessVertLineListAdjNoGsPA_STATE_CUT1222 void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
1223 {
1224 this->vert[this->curIndex] = index;
1225 this->curIndex++;
1226 if (this->curIndex == 4)
1227 {
1228 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1229 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1230
1231 this->numPrimsAssembled++;
1232 this->curIndex = 0;
1233 }
1234 }
1235
ProcessVertPointListPA_STATE_CUT1236 void ProcessVertPointList(uint32_t index, bool finish)
1237 {
1238 this->vert[this->curIndex] = index;
1239 this->curIndex++;
1240 if (this->curIndex == 1)
1241 {
1242 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1243 this->numPrimsAssembled++;
1244 this->curIndex = 0;
1245 }
1246 }
1247
ProcessVertRectListPA_STATE_CUT1248 void ProcessVertRectList(uint32_t index, bool finish)
1249 {
1250 this->vert[this->curIndex] = index;
1251 this->curIndex++;
1252 if (this->curIndex == 3)
1253 {
1254 // assembled enough verts for prim, add to gather indices
1255 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1256 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1257 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1258
1259 // second triangle in the rectangle
1260 // v1, v3 = v1 + v2 - v0, v2
1261 this->indices[0][this->numPrimsAssembled + 1] = this->vert[1];
1262 this->indices[1][this->numPrimsAssembled + 1] = this->vert[0];
1263 this->indices[2][this->numPrimsAssembled + 1] = this->vert[2];
1264
1265 // increment numPrimsAssembled
1266 this->numPrimsAssembled += 2;
1267
1268 // set up next prim state
1269 this->curIndex = 0;
1270 }
1271 }
1272 };
1273
1274 // Primitive Assembly for data output from the DomainShader.
1275 struct PA_TESS : PA_STATE
1276 {
1277 PA_TESS(DRAW_CONTEXT* in_pDC,
1278 const SIMDSCALAR* in_pVertData,
1279 uint32_t in_attributeStrideInVectors,
1280 uint32_t in_vertexStride,
1281 uint32_t in_numAttributes,
1282 uint32_t* (&in_ppIndices)[3],
1283 uint32_t in_numPrims,
1284 PRIMITIVE_TOPOLOGY in_binTopology,
1285 uint32_t numVertsPerPrim,
1286 bool SOA = true) :
1287
1288 PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
1289 m_pVertexData(in_pVertData), m_attributeStrideInVectors(in_attributeStrideInVectors),
1290 m_numAttributes(in_numAttributes), m_numPrims(in_numPrims), m_SOA(SOA)
1291 {
1292 #if USE_SIMD16_FRONTEND
1293 m_vPrimId = _simd16_setzero_si();
1294 #else
1295 m_vPrimId = _simd_setzero_si();
1296 #endif
1297 binTopology = in_binTopology;
1298 m_ppIndices[0] = in_ppIndices[0];
1299 m_ppIndices[1] = in_ppIndices[1];
1300 m_ppIndices[2] = in_ppIndices[2];
1301
1302 switch (binTopology)
1303 {
1304 case TOP_POINT_LIST:
1305 m_numVertsPerPrim = 1;
1306 break;
1307
1308 case TOP_LINE_LIST:
1309 m_numVertsPerPrim = 2;
1310 break;
1311
1312 case TOP_TRIANGLE_LIST:
1313 m_numVertsPerPrim = 3;
1314 break;
1315
1316 default:
1317 SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
1318 break;
1319 }
1320 }
1321
HasWorkPA_TESS1322 bool HasWork() { return m_numPrims != 0; }
1323
GetSimdVectorPA_TESS1324 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
1325 {
1326 SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
1327 return junkVector;
1328 }
1329
1330 #if ENABLE_AVX512_SIMD16
GetSimdVector_simd16PA_TESS1331 simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
1332 {
1333 SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
1334 return junkVector_simd16;
1335 }
1336
1337 #endif
GenPrimMaskPA_TESS1338 static SIMDSCALARI GenPrimMask(uint32_t numPrims)
1339 {
1340 SWR_ASSERT(numPrims <= SIMD_WIDTH);
1341 #if USE_SIMD16_FRONTEND
1342 static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = {
1343 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1344 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1345
1346 return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
1347 #else
1348 static const OSALIGNLINE(int32_t)
1349 maskGen[SIMD_WIDTH * 2] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
1350
1351 return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
1352 #endif
1353 }
1354
AssemblePA_TESS1355 bool Assemble(uint32_t slot, simdvector verts[])
1356 {
1357 SWR_ASSERT(slot < m_numAttributes);
1358
1359 uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
1360 if (0 == numPrimsToAssemble)
1361 {
1362 return false;
1363 }
1364
1365 SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
1366
1367 const float* pBaseAttrib;
1368 if (m_SOA)
1369 {
1370 pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1371 }
1372 else
1373 {
1374 const float* pVertData = (const float*)m_pVertexData;
1375 pBaseAttrib = pVertData + slot * 4;
1376 }
1377
1378 for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1379 {
1380 #if USE_SIMD16_FRONTEND
1381 SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
1382 #else
1383 SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
1384 #endif
1385
1386 const float* pBase = pBaseAttrib;
1387 for (uint32_t c = 0; c < 4; ++c)
1388 {
1389 #if USE_SIMD16_FRONTEND
1390 simd16scalar temp =
1391 _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
1392 pBase,
1393 indices,
1394 _simd16_castsi_ps(mask),
1395 4 /* gcc doesn't like sizeof(float) */);
1396
1397 verts[i].v[c] =
1398 useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
1399 #else
1400 verts[i].v[c] = _simd_mask_i32gather_ps(_simd_setzero_ps(),
1401 pBase,
1402 indices,
1403 _simd_castsi_ps(mask),
1404 4); // gcc doesn't like sizeof(float)
1405 #endif
1406 if (m_SOA)
1407 {
1408 pBase += m_attributeStrideInVectors * SIMD_WIDTH;
1409 }
1410 else
1411 {
1412 pBase += sizeof(float);
1413 }
1414 }
1415 }
1416
1417 return true;
1418 }
1419
1420 #if ENABLE_AVX512_SIMD16
AssemblePA_TESS1421 bool Assemble(uint32_t slot, simd16vector verts[])
1422 {
1423 SWR_ASSERT(slot < m_numAttributes);
1424
1425 uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
1426 if (0 == numPrimsToAssemble)
1427 {
1428 return false;
1429 }
1430
1431 SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
1432
1433 const float* pBaseAttrib;
1434 if (m_SOA)
1435 {
1436 pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1437 }
1438 else
1439 {
1440 const float* pVertData = (const float*)m_pVertexData;
1441 pBaseAttrib = pVertData + slot * 4;
1442 }
1443
1444 for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1445 {
1446 #if USE_SIMD16_FRONTEND
1447 SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
1448 if (!m_SOA)
1449 {
1450 indices = _simd16_mullo_epi32(indices, _simd16_set1_epi32(vertexStride / 4));
1451 }
1452 #else
1453 SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
1454 #endif
1455
1456 const float* pBase = pBaseAttrib;
1457 for (uint32_t c = 0; c < 4; ++c)
1458 {
1459 #if USE_SIMD16_FRONTEND
1460 verts[i].v[c] = _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
1461 pBase,
1462 indices,
1463 _simd16_castsi_ps(mask),
1464 4 /* gcc doesn't like sizeof(float) */);
1465 #else
1466 simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(),
1467 pBase,
1468 indices,
1469 _simd_castsi_ps(mask),
1470 4 /* gcc doesn't like sizeof(float) */);
1471 verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
1472 #endif
1473 if (m_SOA)
1474 {
1475 pBase += m_attributeStrideInVectors * SIMD_WIDTH;
1476 }
1477 else
1478 {
1479 pBase++;
1480 }
1481 }
1482 }
1483
1484 return true;
1485 }
1486
1487 #endif
AssembleSinglePA_TESS1488 void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1489 {
1490 SWR_ASSERT(slot < m_numAttributes);
1491
1492
1493 SWR_ASSERT(primIndex < PA_TESS::NumPrims());
1494
1495 const float* pVertDataBase;
1496 if (m_SOA)
1497 {
1498 pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1499 }
1500 else
1501 {
1502 const float* pVertData = (const float*)m_pVertexData;
1503 pVertDataBase = pVertData + slot * 4;
1504 };
1505 for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1506 {
1507 #if USE_SIMD16_FRONTEND
1508 uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2]
1509 : m_ppIndices[i][primIndex];
1510 if (!m_SOA)
1511 {
1512 index *= (vertexStride / 4);
1513 }
1514 #else
1515 uint32_t index = m_ppIndices[i][primIndex];
1516 #endif
1517 const float* pVertData = pVertDataBase;
1518 float* pVert = (float*)&verts[i];
1519
1520 for (uint32_t c = 0; c < 4; ++c)
1521 {
1522 pVert[c] = pVertData[index];
1523 if (m_SOA)
1524 {
1525 pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
1526 }
1527 else
1528 {
1529 pVertData++;
1530 }
1531 }
1532
1533 }
1534 }
1535
NextPrimPA_TESS1536 bool NextPrim()
1537 {
1538 uint32_t numPrims = PA_TESS::NumPrims();
1539 m_numPrims -= numPrims;
1540 m_ppIndices[0] += numPrims;
1541 m_ppIndices[1] += numPrims;
1542 m_ppIndices[2] += numPrims;
1543
1544 return HasWork();
1545 }
1546
GetNextVsOutputPA_TESS1547 SIMDVERTEX& GetNextVsOutput()
1548 {
1549 SWR_NOT_IMPL;
1550 return junkVertex;
1551 }
1552
GetNextStreamOutputPA_TESS1553 bool GetNextStreamOutput()
1554 {
1555 SWR_NOT_IMPL;
1556 return false;
1557 }
1558
GetNextVsIndicesPA_TESS1559 SIMDMASK& GetNextVsIndices()
1560 {
1561 SWR_NOT_IMPL;
1562 return junkIndices;
1563 }
1564
NumPrimsPA_TESS1565 uint32_t NumPrims() { return std::min<uint32_t>(m_numPrims, SIMD_WIDTH); }
1566
ResetPA_TESS1567 void Reset() { SWR_NOT_IMPL; }
1568
GetPrimIDPA_TESS1569 SIMDSCALARI GetPrimID(uint32_t startID)
1570 {
1571 #if USE_SIMD16_FRONTEND
1572 return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
1573 #else
1574 return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
1575 #endif
1576 }
1577
1578 private:
1579 const SIMDSCALAR* m_pVertexData = nullptr;
1580 uint32_t m_attributeStrideInVectors = 0;
1581 uint32_t m_numAttributes = 0;
1582 uint32_t m_numPrims = 0;
1583 uint32_t* m_ppIndices[3];
1584
1585 uint32_t m_numVertsPerPrim = 0;
1586
1587 SIMDSCALARI m_vPrimId;
1588
1589 simdvector junkVector; // junk simdvector for unimplemented API
1590 #if ENABLE_AVX512_SIMD16
1591 simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
1592 #endif
1593 SIMDVERTEX junkVertex; // junk SIMDVERTEX for unimplemented API
1594 SIMDMASK junkIndices; // temporary index store for unused virtual function
1595
1596 bool m_SOA;
1597 };
1598
1599 // Primitive Assembler factory class, responsible for creating and initializing the correct
1600 // assembler based on state.
1601 template <typename IsIndexedT, typename IsCutIndexEnabledT>
1602 struct PA_FACTORY
1603 {
PA_FACTORYPA_FACTORY1604 PA_FACTORY(DRAW_CONTEXT* pDC,
1605 PRIMITIVE_TOPOLOGY in_topo,
1606 uint32_t numVerts,
1607 PA_STATE::SIMDVERTEX* pVertexStore,
1608 uint32_t vertexStoreSize,
1609 uint32_t vertexStride,
1610 uint32_t numVertsPerPrim) :
1611 topo(in_topo)
1612 {
1613 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1614 const API_STATE& state = GetApiState(pDC);
1615 if ((IsIndexedT::value && IsCutIndexEnabledT::value &&
1616 (topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || topo == TOP_LINE_LIST ||
1617 topo == TOP_LINE_STRIP || topo == TOP_TRIANGLE_LIST)) ||
1618
1619 // non-indexed draws with adjacency topologies must use cut-aware PA until we add
1620 // support for them in the optimized PA
1621 (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ ||
1622 topo == TOP_TRI_STRIP_ADJ))
1623 {
1624 memset(&indexStore, 0, sizeof(indexStore));
1625 uint32_t numAttribs = state.feNumAttributes;
1626
1627 new (&this->paCut) PA_STATE_CUT(pDC,
1628 reinterpret_cast<uint8_t*>(pVertexStore),
1629 vertexStoreSize * PA_STATE::SIMD_WIDTH,
1630 vertexStride,
1631 &this->indexStore[0],
1632 numVerts,
1633 numAttribs,
1634 state.topology,
1635 false,
1636 numVertsPerPrim);
1637 cutPA = true;
1638 }
1639 else
1640 #endif
1641 {
1642 uint32_t numPrims = GetNumPrims(in_topo, numVerts);
1643 new (&this->paOpt) PA_STATE_OPT(pDC,
1644 numPrims,
1645 reinterpret_cast<uint8_t*>(pVertexStore),
1646 vertexStoreSize * PA_STATE::SIMD_WIDTH,
1647 vertexStride,
1648 false,
1649 numVertsPerPrim);
1650 cutPA = false;
1651 }
1652 }
1653
GetPAPA_FACTORY1654 PA_STATE& GetPA()
1655 {
1656 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1657 if (cutPA)
1658 {
1659 return this->paCut;
1660 }
1661 else
1662 #endif
1663 {
1664 return this->paOpt;
1665 }
1666 }
1667
1668 PA_STATE_OPT paOpt;
1669 PA_STATE_CUT paCut;
1670
1671 bool cutPA{false};
1672
1673 PRIMITIVE_TOPOLOGY topo{TOP_UNKNOWN};
1674
1675 PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
1676 };
1677