• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa.h
24 *
25 * @brief Definitions for primitive assembly.
26 *        N primitives are assembled at a time, where N is the SIMD width.
27 *        A state machine, that is specific for a given topology, drives the
28 *        assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #pragma once
32 
33 #include "frontend.h"
34 
35 struct PA_STATE
36 {
37     DRAW_CONTEXT *pDC{ nullptr };              // draw context
38     uint8_t* pStreamBase{ nullptr };           // vertex stream
39     uint32_t streamSizeInVerts{ 0 };     // total size of the input stream in verts
40 
41     // The topology the binner will use. In some cases the FE changes the topology from the api state.
42     PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
43 
PA_STATEPA_STATE44     PA_STATE() {}
PA_STATEPA_STATE45     PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
46         pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}
47 
48     virtual bool HasWork() = 0;
49     virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
50     virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
51     virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
52     virtual bool NextPrim() = 0;
53     virtual simdvertex& GetNextVsOutput() = 0;
54     virtual bool GetNextStreamOutput() = 0;
55     virtual simdmask& GetNextVsIndices() = 0;
56     virtual uint32_t NumPrims() = 0;
57     virtual void Reset() = 0;
58     virtual simdscalari GetPrimID(uint32_t startID) = 0;
59 };
60 
61 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
62 // output. Here is the sequence
63 //    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
64 //    2. Execute PA function to assemble and bin triangles.
65 //        a.    The PA function is a set of functions that collectively make up the
66 //            state machine for a given topology.
67 //                1.    We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
69 //                1.    We call this the current and previous simd vertex.
70 //                2.    The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
71 //                    order to assemble the second triangle, for a triangle list, we'll need the
72 //                    last vertex from the previous simd and the first 2 vertices from the current simd.
73 //                3. At times the PA can assemble multiple triangles from the 2 simd vertices.
74 //
75 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
76 // cuts
77 struct PA_STATE_OPT : public PA_STATE
78 {
79     simdvertex leadingVertex;            // For tri-fan
80     uint32_t numPrims{ 0 };              // Total number of primitives for draw.
81     uint32_t numPrimsComplete{ 0 };      // Total number of complete primitives.
82 
83     uint32_t numSimdPrims{ 0 };          // Number of prims in current simd.
84 
85     uint32_t cur{ 0 };                   // index to current VS output.
86     uint32_t prev{ 0 };                  // index to prev VS output. Not really needed in the state.
87     uint32_t first{ 0 };                 // index to first VS output. Used for trifan.
88 
89     uint32_t counter{ 0 };               // state counter
90     bool reset{ false };                 // reset state
91 
92     uint32_t primIDIncr{ 0 };            // how much to increment for each vector (typically vector / {1, 2})
93     simdscalari primID;
94 
95     typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
96     typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
97 
98     PFN_PA_FUNC        pfnPaFunc{ nullptr };        // PA state machine function for assembling 4 triangles.
99     PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
100     PFN_PA_FUNC        pfnPaFuncReset{ nullptr };   // initial state to set on reset
101 
102     // state used to advance the PA when Next is called
103     PFN_PA_FUNC        pfnPaNextFunc{ nullptr };
104     uint32_t           nextNumSimdPrims{ 0 };
105     uint32_t           nextNumPrimsIncrement{ 0 };
106     bool               nextReset{ false };
107     bool               isStreaming{ false };
108 
109     simdmask tmpIndices{ 0 };            // temporary index store for unused virtual function
110 
PA_STATE_OPTPA_STATE_OPT111     PA_STATE_OPT() {}
112     PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
113         bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
114 
HasWorkPA_STATE_OPT115     bool HasWork()
116     {
117         return (this->numPrimsComplete < this->numPrims) ? true : false;
118     }
119 
GetSimdVectorPA_STATE_OPT120     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
121     {
122         simdvertex* pVertex = (simdvertex*)pStreamBase;
123         return pVertex[index].attrib[slot];
124     }
125 
126     // Assembles 4 triangles. Each simdvector is a single vertex from 4
127     // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
AssemblePA_STATE_OPT128     bool Assemble(uint32_t slot, simdvector verts[])
129     {
130         return this->pfnPaFunc(*this, slot, verts);
131     }
132 
133     // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
AssembleSinglePA_STATE_OPT134     void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
135     {
136         return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
137     }
138 
NextPrimPA_STATE_OPT139     bool NextPrim()
140     {
141         this->pfnPaFunc = this->pfnPaNextFunc;
142         this->numSimdPrims = this->nextNumSimdPrims;
143         this->numPrimsComplete += this->nextNumPrimsIncrement;
144         this->reset = this->nextReset;
145 
146         if (this->isStreaming)
147         {
148             this->reset = false;
149         }
150 
151         bool morePrims = false;
152 
153         if (this->numSimdPrims > 0)
154         {
155             morePrims = true;
156             this->numSimdPrims--;
157         }
158         else
159         {
160             this->counter = (this->reset) ? 0 : (this->counter + 1);
161             this->reset = false;
162         }
163 
164         this->pfnPaFunc = this->pfnPaNextFunc;
165 
166         if (!HasWork())
167         {
168             morePrims = false;    // no more to do
169         }
170 
171         return morePrims;
172     }
173 
GetNextVsOutputPA_STATE_OPT174     simdvertex& GetNextVsOutput()
175     {
176         // increment cur and prev indices
177         const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH;
178         this->prev = this->cur;  // prev is undefined for first state.
179         this->cur = this->counter % numSimdVerts;
180 
181         simdvertex* pVertex = (simdvertex*)pStreamBase;
182         return pVertex[this->cur];
183     }
184 
GetNextVsIndicesPA_STATE_OPT185     simdmask& GetNextVsIndices()
186     {
187         // unused in optimized PA, pass tmp buffer back
188         return tmpIndices;
189     }
190 
GetNextStreamOutputPA_STATE_OPT191     bool GetNextStreamOutput()
192     {
193         this->prev = this->cur;
194         this->cur = this->counter;
195 
196         return HasWork();
197     }
198 
NumPrimsPA_STATE_OPT199     uint32_t NumPrims()
200     {
201         return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
202             (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH;
203     }
204 
205     void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
206         PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
207         uint32_t numSimdPrims = 0,
208         uint32_t numPrimsIncrement = 0,
209         bool reset = false)
210     {
211         this->pfnPaNextFunc = pfnPaNextFunc;
212         this->nextNumSimdPrims = numSimdPrims;
213         this->nextNumPrimsIncrement = numPrimsIncrement;
214         this->nextReset = reset;
215 
216         this->pfnPaSingleFunc = pfnPaNextSingleFunc;
217     }
218 
ResetPA_STATE_OPT219     void Reset()
220     {
221         this->pfnPaFunc = this->pfnPaFuncReset;
222         this->numPrimsComplete = 0;
223         this->numSimdPrims = 0;
224         this->cur = 0;
225         this->prev = 0;
226         this->first = 0;
227         this->counter = 0;
228         this->reset = false;
229     }
230 
GetPrimIDPA_STATE_OPT231     simdscalari GetPrimID(uint32_t startID)
232     {
233         return _simd_add_epi32(this->primID,
234             _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH)));
235     }
236 };
237 
238 // helper C wrappers to avoid having to rewrite all the PA topology state functions
239 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
240     PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
241     uint32_t numSimdPrims = 0,
242     uint32_t numPrimsIncrement = 0,
243     bool reset = false)
244 {
245     return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
246 }
PaGetSimdVector(PA_STATE & pa,uint32_t index,uint32_t slot)247 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
248 {
249     return pa.GetSimdVector(index, slot);
250 }
251 
swizzleLane0(const simdvector & a)252 INLINE __m128 swizzleLane0(const simdvector &a)
253 {
254     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
255     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
256     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
257 }
258 
swizzleLane1(const simdvector & a)259 INLINE __m128 swizzleLane1(const simdvector &a)
260 {
261     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
262     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
263     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
264 }
265 
swizzleLane2(const simdvector & a)266 INLINE __m128 swizzleLane2(const simdvector &a)
267 {
268     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
269     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
270     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
271 }
272 
swizzleLane3(const simdvector & a)273 INLINE __m128 swizzleLane3(const simdvector &a)
274 {
275     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
276     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
277     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
278 }
279 
swizzleLane4(const simdvector & a)280 INLINE __m128 swizzleLane4(const simdvector &a)
281 {
282     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
283     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
284     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
285 
286 }
287 
swizzleLane5(const simdvector & a)288 INLINE __m128 swizzleLane5(const simdvector &a)
289 {
290     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
291     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
292     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
293 }
294 
swizzleLane6(const simdvector & a)295 INLINE __m128 swizzleLane6(const simdvector &a)
296 {
297     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
298     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
299     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
300 }
301 
swizzleLane7(const simdvector & a)302 INLINE __m128 swizzleLane7(const simdvector &a)
303 {
304     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
305     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
306     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
307 }
308 
swizzleLaneN(const simdvector & a,int lane)309 INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
310 {
311     switch (lane) {
312     case 0:
313         return swizzleLane0(a);
314     case 1:
315         return swizzleLane1(a);
316     case 2:
317         return swizzleLane2(a);
318     case 3:
319         return swizzleLane3(a);
320     case 4:
321         return swizzleLane4(a);
322     case 5:
323         return swizzleLane5(a);
324     case 6:
325         return swizzleLane6(a);
326     case 7:
327         return swizzleLane7(a);
328     default:
329         return _mm_setzero_ps();
330     }
331 }
332 
333 // Cut-aware primitive assembler.
334 struct PA_STATE_CUT : public PA_STATE
335 {
336     simdmask* pCutIndices{ nullptr };    // cut indices buffer, 1 bit per vertex
337     uint32_t numVerts{ 0 };              // number of vertices available in buffer store
338     uint32_t numAttribs{ 0 };            // number of attributes
339     int32_t numRemainingVerts{ 0 };      // number of verts remaining to be assembled
340     uint32_t numVertsToAssemble{ 0 };    // total number of verts to assemble for the draw
341     OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH];    // current index buffer for gather
342     simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM];           // byte offsets for currently assembling simd
343     uint32_t numPrimsAssembled{ 0 };     // number of primitives that are fully assembled
344     uint32_t headVertex{ 0 };            // current unused vertex slot in vertex buffer store
345     uint32_t tailVertex{ 0 };            // beginning vertex currently assembling
346     uint32_t curVertex{ 0 };             // current unprocessed vertex
347     uint32_t startPrimId{ 0 };           // starting prim id
348     simdscalari vPrimId;                 // vector of prim ID
349     bool needOffsets{ false };           // need to compute gather offsets for current SIMD
350     uint32_t vertsPerPrim{ 0 };
351     simdvertex tmpVertex;                // temporary simdvertex for unimplemented API
352     bool processCutVerts{ false };       // vertex indices with cuts should be processed as normal, otherwise they
353                                          // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
354                                          // while the GS sends valid verts for every index
355     // Topology state tracking
356     uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
357     uint32_t curIndex{ 0 };
358     bool reverseWinding{ false };        // indicates reverse winding for strips
359     int32_t adjExtraVert{ 0 };           // extra vert uses for tristrip w/ adj
360 
361     typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
362     PFN_PA_FUNC pfnPa{ nullptr };        // per-topology function that processes a single vert
363 
PA_STATE_CUTPA_STATE_CUT364     PA_STATE_CUT() {}
PA_STATE_CUTPA_STATE_CUT365     PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
366         uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
367         : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
368     {
369         numVerts = in_streamSizeInVerts;
370         numAttribs = in_numAttribs;
371         binTopology = topo;
372         needOffsets = false;
373         processCutVerts = in_processCutVerts;
374 
375         numVertsToAssemble = numRemainingVerts = in_numVerts;
376         numPrimsAssembled = 0;
377         headVertex = tailVertex = curVertex = 0;
378 
379         curIndex = 0;
380         pCutIndices = in_pIndices;
381         memset(indices, 0, sizeof(indices));
382         vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
383         reverseWinding = false;
384         adjExtraVert = -1;
385 
386         bool gsEnabled = pDC->pState->state.gsState.gsEnable;
387         vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
388 
389         switch (topo)
390         {
391         case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
392         case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
393         case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
394         case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
395                                     {
396                                         pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
397                                     }
398                                     else
399                                     {
400                                         pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
401                                     }
402                                     break;
403 
404         case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
405         case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
406         case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
407         case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
408         case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
409         default: assert(0 && "Unimplemented topology");
410         }
411     }
412 
GetNextVsOutputPA_STATE_CUT413     simdvertex& GetNextVsOutput()
414     {
415         uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
416         this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts;
417         this->needOffsets = true;
418         return ((simdvertex*)pStreamBase)[vertexIndex];
419     }
420 
GetNextVsIndicesPA_STATE_CUT421     simdmask& GetNextVsIndices()
422     {
423         uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
424         simdmask* pCurCutIndex = this->pCutIndices + vertexIndex;
425         return *pCurCutIndex;
426     }
427 
GetSimdVectorPA_STATE_CUT428     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
429     {
430         // unused
431         SWR_ASSERT(0 && "Not implemented");
432         return this->tmpVertex.attrib[0];
433     }
434 
GetNextStreamOutputPA_STATE_CUT435     bool GetNextStreamOutput()
436     {
437         this->headVertex += KNOB_SIMD_WIDTH;
438         this->needOffsets = true;
439         return HasWork();
440     }
441 
GetPrimIDPA_STATE_CUT442     simdscalari GetPrimID(uint32_t startID)
443     {
444         return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
445     }
446 
ResetPA_STATE_CUT447     void Reset()
448     {
449         this->numRemainingVerts = this->numVertsToAssemble;
450         this->numPrimsAssembled = 0;
451         this->curIndex = 0;
452         this->curVertex = 0;
453         this->tailVertex = 0;
454         this->headVertex = 0;
455         this->reverseWinding = false;
456         this->adjExtraVert = -1;
457         this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
458     }
459 
HasWorkPA_STATE_CUT460     bool HasWork()
461     {
462         return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
463     }
464 
IsVertexStoreFullPA_STATE_CUT465     bool IsVertexStoreFull()
466     {
467         return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex;
468     }
469 
RestartTopologyPA_STATE_CUT470     void RestartTopology()
471     {
472         this->curIndex = 0;
473         this->reverseWinding = false;
474         this->adjExtraVert = -1;
475     }
476 
IsCutIndexPA_STATE_CUT477     bool IsCutIndex(uint32_t vertex)
478     {
479         uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH;
480         uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1);
481         return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
482     }
483 
484     // iterates across the unprocessed verts until we hit the end or we
485     // have assembled SIMD prims
ProcessVertsPA_STATE_CUT486     void ProcessVerts()
487     {
488         while (this->numPrimsAssembled != KNOB_SIMD_WIDTH &&
489             this->numRemainingVerts > 0 &&
490             this->curVertex != this->headVertex)
491         {
492             // if cut index, restart topology
493             if (IsCutIndex(this->curVertex))
494             {
495                 if (this->processCutVerts)
496                 {
497                     (this->*pfnPa)(this->curVertex, false);
498                 }
499                 // finish off tri strip w/ adj before restarting topo
500                 if (this->adjExtraVert != -1)
501                 {
502                     (this->*pfnPa)(this->curVertex, true);
503                 }
504                 RestartTopology();
505             }
506             else
507             {
508                 (this->*pfnPa)(this->curVertex, false);
509             }
510 
511             this->curVertex++;
512             if (this->curVertex >= this->numVerts) {
513                this->curVertex = 0;
514             }
515             this->numRemainingVerts--;
516         }
517 
518         // special case last primitive for tri strip w/ adj
519         if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
520         {
521             (this->*pfnPa)(this->curVertex, true);
522         }
523     }
524 
AdvancePA_STATE_CUT525     void Advance()
526     {
527         // done with current batch
528         // advance tail to the current unsubmitted vertex
529         this->tailVertex = this->curVertex;
530         this->numPrimsAssembled = 0;
531         this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH));
532     }
533 
NextPrimPA_STATE_CUT534     bool NextPrim()
535     {
536         // if we've assembled enough prims, we can advance to the next set of verts
537         if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0)
538         {
539             Advance();
540         }
541         return false;
542     }
543 
ComputeOffsetsPA_STATE_CUT544     void ComputeOffsets()
545     {
546         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
547         {
548             simdscalari vIndices = *(simdscalari*)&this->indices[v][0];
549 
550             // step to simdvertex batch
551             const uint32_t simdShift = 3; // @todo make knob
552             simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
553             this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex)));
554 
555             // step to index
556             const uint32_t simdMask = 0x7; // @todo make knob
557             simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
558             this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
559         }
560     }
561 
AssemblePA_STATE_CUT562     bool Assemble(uint32_t slot, simdvector result[])
563     {
564         // process any outstanding verts
565         ProcessVerts();
566 
567         // return false if we don't have enough prims assembled
568         if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0)
569         {
570             return false;
571         }
572 
573         // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
574         if (this->needOffsets)
575         {
576             ComputeOffsets();
577             this->needOffsets = false;
578         }
579 
580         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
581         {
582             simdscalari offsets = this->vOffsets[v];
583 
584             // step to attribute
585             offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
586 
587             float* pBase = (float*)this->pStreamBase;
588             for (uint32_t c = 0; c < 4; ++c)
589             {
590                 result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
591 
592                 // move base to next component
593                 pBase += KNOB_SIMD_WIDTH;
594             }
595         }
596 
597         return true;
598     }
599 
AssembleSinglePA_STATE_CUT600     void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
601     {
602         // move to slot
603         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
604         {
605             uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
606             uint32_t offset = pOffset[triIndex];
607             offset += sizeof(simdvector) * slot;
608             float* pVert = (float*)&tri[v];
609             for (uint32_t c = 0; c < 4; ++c)
610             {
611                 float* pComponent = (float*)(this->pStreamBase + offset);
612                 pVert[c] = *pComponent;
613                 offset += KNOB_SIMD_WIDTH * sizeof(float);
614             }
615         }
616     }
617 
NumPrimsPA_STATE_CUT618     uint32_t NumPrims()
619     {
620         return this->numPrimsAssembled;
621     }
622 
623     // Per-topology functions
ProcessVertTriStripPA_STATE_CUT624     void ProcessVertTriStrip(uint32_t index, bool finish)
625     {
626         this->vert[this->curIndex] = index;
627         this->curIndex++;
628         if (this->curIndex == 3)
629         {
630             // assembled enough verts for prim, add to gather indices
631             this->indices[0][this->numPrimsAssembled] = this->vert[0];
632             if (reverseWinding)
633             {
634                 this->indices[1][this->numPrimsAssembled] = this->vert[2];
635                 this->indices[2][this->numPrimsAssembled] = this->vert[1];
636             }
637             else
638             {
639                 this->indices[1][this->numPrimsAssembled] = this->vert[1];
640                 this->indices[2][this->numPrimsAssembled] = this->vert[2];
641             }
642 
643             // increment numPrimsAssembled
644             this->numPrimsAssembled++;
645 
646             // set up next prim state
647             this->vert[0] = this->vert[1];
648             this->vert[1] = this->vert[2];
649             this->curIndex = 2;
650             this->reverseWinding ^= 1;
651         }
652     }
653 
654     template<bool gsEnabled>
AssembleTriStripAdjPA_STATE_CUT655     void AssembleTriStripAdj()
656     {
657         if (!gsEnabled)
658         {
659             this->vert[1] = this->vert[2];
660             this->vert[2] = this->vert[4];
661 
662             this->indices[0][this->numPrimsAssembled] = this->vert[0];
663             this->indices[1][this->numPrimsAssembled] = this->vert[1];
664             this->indices[2][this->numPrimsAssembled] = this->vert[2];
665 
666             this->vert[4] = this->vert[2];
667             this->vert[2] = this->vert[1];
668         }
669         else
670         {
671             this->indices[0][this->numPrimsAssembled] = this->vert[0];
672             this->indices[1][this->numPrimsAssembled] = this->vert[1];
673             this->indices[2][this->numPrimsAssembled] = this->vert[2];
674             this->indices[3][this->numPrimsAssembled] = this->vert[3];
675             this->indices[4][this->numPrimsAssembled] = this->vert[4];
676             this->indices[5][this->numPrimsAssembled] = this->vert[5];
677         }
678         this->numPrimsAssembled++;
679     }
680 
681 
682     template<bool gsEnabled>
ProcessVertTriStripAdjPA_STATE_CUT683     void ProcessVertTriStripAdj(uint32_t index, bool finish)
684     {
685         // handle last primitive of tristrip
686         if (finish && this->adjExtraVert != -1)
687         {
688             this->vert[3] = this->adjExtraVert;
689             AssembleTriStripAdj<gsEnabled>();
690             this->adjExtraVert = -1;
691             return;
692         }
693 
694         switch (this->curIndex)
695         {
696         case 0:
697         case 1:
698         case 2:
699         case 4:
700             this->vert[this->curIndex] = index;
701             this->curIndex++;
702             break;
703         case 3:
704             this->vert[5] = index;
705             this->curIndex++;
706             break;
707         case 5:
708             if (this->adjExtraVert == -1)
709             {
710                 this->adjExtraVert = index;
711             }
712             else
713             {
714                 this->vert[3] = index;
715                 if (!gsEnabled)
716                 {
717                     AssembleTriStripAdj<gsEnabled>();
718 
719                     uint32_t nextTri[6];
720                     if (this->reverseWinding)
721                     {
722                         nextTri[0] = this->vert[4];
723                         nextTri[1] = this->vert[0];
724                         nextTri[2] = this->vert[2];
725                         nextTri[4] = this->vert[3];
726                         nextTri[5] = this->adjExtraVert;
727                     }
728                     else
729                     {
730                         nextTri[0] = this->vert[2];
731                         nextTri[1] = this->adjExtraVert;
732                         nextTri[2] = this->vert[3];
733                         nextTri[4] = this->vert[4];
734                         nextTri[5] = this->vert[0];
735                     }
736                     for (uint32_t i = 0; i < 6; ++i)
737                     {
738                         this->vert[i] = nextTri[i];
739                     }
740 
741                     this->adjExtraVert = -1;
742                     this->reverseWinding ^= 1;
743                 }
744                 else
745                 {
746                     this->curIndex++;
747                 }
748             }
749             break;
750         case 6:
751             SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
752             AssembleTriStripAdj<gsEnabled>();
753 
754             uint32_t nextTri[6];
755             if (this->reverseWinding)
756             {
757                 nextTri[0] = this->vert[4];
758                 nextTri[1] = this->vert[0];
759                 nextTri[2] = this->vert[2];
760                 nextTri[4] = this->vert[3];
761                 nextTri[5] = this->adjExtraVert;
762             }
763             else
764             {
765                 nextTri[0] = this->vert[2];
766                 nextTri[1] = this->adjExtraVert;
767                 nextTri[2] = this->vert[3];
768                 nextTri[4] = this->vert[4];
769                 nextTri[5] = this->vert[0];
770             }
771             for (uint32_t i = 0; i < 6; ++i)
772             {
773                 this->vert[i] = nextTri[i];
774             }
775             this->reverseWinding ^= 1;
776             this->adjExtraVert = index;
777             this->curIndex--;
778             break;
779         }
780     }
781 
ProcessVertTriListPA_STATE_CUT782     void ProcessVertTriList(uint32_t index, bool finish)
783     {
784         this->vert[this->curIndex] = index;
785         this->curIndex++;
786         if (this->curIndex == 3)
787         {
788             // assembled enough verts for prim, add to gather indices
789             this->indices[0][this->numPrimsAssembled] = this->vert[0];
790             this->indices[1][this->numPrimsAssembled] = this->vert[1];
791             this->indices[2][this->numPrimsAssembled] = this->vert[2];
792 
793             // increment numPrimsAssembled
794             this->numPrimsAssembled++;
795 
796             // set up next prim state
797             this->curIndex = 0;
798         }
799     }
800 
ProcessVertTriListAdjPA_STATE_CUT801     void ProcessVertTriListAdj(uint32_t index, bool finish)
802     {
803         this->vert[this->curIndex] = index;
804         this->curIndex++;
805         if (this->curIndex == 6)
806         {
807             // assembled enough verts for prim, add to gather indices
808             this->indices[0][this->numPrimsAssembled] = this->vert[0];
809             this->indices[1][this->numPrimsAssembled] = this->vert[1];
810             this->indices[2][this->numPrimsAssembled] = this->vert[2];
811             this->indices[3][this->numPrimsAssembled] = this->vert[3];
812             this->indices[4][this->numPrimsAssembled] = this->vert[4];
813             this->indices[5][this->numPrimsAssembled] = this->vert[5];
814 
815             // increment numPrimsAssembled
816             this->numPrimsAssembled++;
817 
818             // set up next prim state
819             this->curIndex = 0;
820         }
821     }
822 
ProcessVertTriListAdjNoGsPA_STATE_CUT823     void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
824     {
825         this->vert[this->curIndex] = index;
826         this->curIndex++;
827         if (this->curIndex == 6)
828         {
829             // assembled enough verts for prim, add to gather indices
830             this->indices[0][this->numPrimsAssembled] = this->vert[0];
831             this->indices[1][this->numPrimsAssembled] = this->vert[2];
832             this->indices[2][this->numPrimsAssembled] = this->vert[4];
833 
834             // increment numPrimsAssembled
835             this->numPrimsAssembled++;
836 
837             // set up next prim state
838             this->curIndex = 0;
839         }
840     }
841 
842 
ProcessVertLineListPA_STATE_CUT843     void ProcessVertLineList(uint32_t index, bool finish)
844     {
845         this->vert[this->curIndex] = index;
846         this->curIndex++;
847         if (this->curIndex == 2)
848         {
849             this->indices[0][this->numPrimsAssembled] = this->vert[0];
850             this->indices[1][this->numPrimsAssembled] = this->vert[1];
851 
852             this->numPrimsAssembled++;
853             this->curIndex = 0;
854         }
855     }
856 
ProcessVertLineStripPA_STATE_CUT857     void ProcessVertLineStrip(uint32_t index, bool finish)
858     {
859         this->vert[this->curIndex] = index;
860         this->curIndex++;
861         if (this->curIndex == 2)
862         {
863             // assembled enough verts for prim, add to gather indices
864             this->indices[0][this->numPrimsAssembled] = this->vert[0];
865             this->indices[1][this->numPrimsAssembled] = this->vert[1];
866 
867             // increment numPrimsAssembled
868             this->numPrimsAssembled++;
869 
870             // set up next prim state
871             this->vert[0] = this->vert[1];
872             this->curIndex = 1;
873         }
874     }
875 
ProcessVertLineStripAdjPA_STATE_CUT876     void ProcessVertLineStripAdj(uint32_t index, bool finish)
877     {
878         this->vert[this->curIndex] = index;
879         this->curIndex++;
880         if (this->curIndex == 4)
881         {
882             // assembled enough verts for prim, add to gather indices
883             this->indices[0][this->numPrimsAssembled] = this->vert[0];
884             this->indices[1][this->numPrimsAssembled] = this->vert[1];
885             this->indices[2][this->numPrimsAssembled] = this->vert[2];
886             this->indices[3][this->numPrimsAssembled] = this->vert[3];
887 
888             // increment numPrimsAssembled
889             this->numPrimsAssembled++;
890 
891             // set up next prim state
892             this->vert[0] = this->vert[1];
893             this->vert[1] = this->vert[2];
894             this->vert[2] = this->vert[3];
895             this->curIndex = 3;
896         }
897     }
898 
ProcessVertLineStripAdjNoGsPA_STATE_CUT899     void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
900     {
901         this->vert[this->curIndex] = index;
902         this->curIndex++;
903         if (this->curIndex == 4)
904         {
905             // assembled enough verts for prim, add to gather indices
906             this->indices[0][this->numPrimsAssembled] = this->vert[1];
907             this->indices[1][this->numPrimsAssembled] = this->vert[2];
908 
909             // increment numPrimsAssembled
910             this->numPrimsAssembled++;
911 
912             // set up next prim state
913             this->vert[0] = this->vert[1];
914             this->vert[1] = this->vert[2];
915             this->vert[2] = this->vert[3];
916             this->curIndex = 3;
917         }
918     }
919 
ProcessVertLineListAdjPA_STATE_CUT920     void ProcessVertLineListAdj(uint32_t index, bool finish)
921     {
922         this->vert[this->curIndex] = index;
923         this->curIndex++;
924         if (this->curIndex == 4)
925         {
926             this->indices[0][this->numPrimsAssembled] = this->vert[0];
927             this->indices[1][this->numPrimsAssembled] = this->vert[1];
928             this->indices[2][this->numPrimsAssembled] = this->vert[2];
929             this->indices[3][this->numPrimsAssembled] = this->vert[3];
930 
931             this->numPrimsAssembled++;
932             this->curIndex = 0;
933         }
934     }
935 
ProcessVertLineListAdjNoGsPA_STATE_CUT936     void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
937     {
938         this->vert[this->curIndex] = index;
939         this->curIndex++;
940         if (this->curIndex == 4)
941         {
942             this->indices[0][this->numPrimsAssembled] = this->vert[1];
943             this->indices[1][this->numPrimsAssembled] = this->vert[2];
944 
945             this->numPrimsAssembled++;
946             this->curIndex = 0;
947         }
948     }
949 
ProcessVertPointListPA_STATE_CUT950     void ProcessVertPointList(uint32_t index, bool finish)
951     {
952         this->vert[this->curIndex] = index;
953         this->curIndex++;
954         if (this->curIndex == 1)
955         {
956             this->indices[0][this->numPrimsAssembled] = this->vert[0];
957             this->numPrimsAssembled++;
958             this->curIndex = 0;
959         }
960     }
961 };
962 
963 // Primitive Assembly for data output from the DomainShader.
964 struct PA_TESS : PA_STATE
965 {
PA_TESSPA_TESS966     PA_TESS(
967         DRAW_CONTEXT *in_pDC,
968         const simdscalar* in_pVertData,
969         uint32_t in_attributeStrideInVectors,
970         uint32_t in_numAttributes,
971         uint32_t* (&in_ppIndices)[3],
972         uint32_t in_numPrims,
973         PRIMITIVE_TOPOLOGY in_binTopology) :
974 
975         PA_STATE(in_pDC, nullptr, 0),
976         m_pVertexData(in_pVertData),
977         m_attributeStrideInVectors(in_attributeStrideInVectors),
978         m_numAttributes(in_numAttributes),
979         m_numPrims(in_numPrims)
980     {
981         m_vPrimId = _simd_setzero_si();
982         binTopology = in_binTopology;
983         m_ppIndices[0] = in_ppIndices[0];
984         m_ppIndices[1] = in_ppIndices[1];
985         m_ppIndices[2] = in_ppIndices[2];
986 
987         switch (binTopology)
988         {
989         case TOP_POINT_LIST:
990             m_numVertsPerPrim = 1;
991             break;
992 
993         case TOP_LINE_LIST:
994             m_numVertsPerPrim = 2;
995             break;
996 
997         case TOP_TRIANGLE_LIST:
998             m_numVertsPerPrim = 3;
999             break;
1000 
1001         default:
1002             SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
1003             break;
1004         }
1005     }
1006 
HasWorkPA_TESS1007     bool HasWork()
1008     {
1009         return m_numPrims != 0;
1010     }
1011 
GetSimdVectorPA_TESS1012     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
1013     {
1014         SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
1015         static simdvector junk;
1016         return junk;
1017     }
1018 
GenPrimMaskPA_TESS1019     static simdscalari GenPrimMask(uint32_t numPrims)
1020     {
1021         SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH);
1022 #if KNOB_SIMD_WIDTH == 8
1023         static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
1024         {
1025             -1, -1, -1, -1, -1, -1, -1, -1,
1026              0,  0,  0,  0,  0,  0,  0,  0
1027         };
1028 #elif KNOB_SIMD_WIDTH == 16
1029         static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
1030         {
1031             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1032              0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
1033         };
1034 #else
1035 #error "Help, help, I can't get up!"
1036 #endif
1037 
1038         return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]);
1039     }
1040 
AssemblePA_TESS1041     bool Assemble(uint32_t slot, simdvector verts[])
1042     {
1043         static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented");
1044         SWR_ASSERT(slot < m_numAttributes);
1045 
1046         uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
1047         if (0 == numPrimsToAssemble)
1048         {
1049             return false;
1050         }
1051 
1052         simdscalari mask = GenPrimMask(numPrimsToAssemble);
1053 
1054         const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1055         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1056         {
1057             simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]);
1058 
1059             const float* pBase = pBaseAttrib;
1060             for (uint32_t c = 0; c < 4; ++c)
1061             {
1062                 verts[i].v[c] = _simd_mask_i32gather_ps(
1063                     _simd_setzero_ps(),
1064                     pBase,
1065                     indices,
1066                     _simd_castsi_ps(mask),
1067                     4 /* gcc doesn't like sizeof(float) */);
1068                 pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
1069             }
1070         }
1071 
1072         return true;
1073     }
1074 
AssembleSinglePA_TESS1075     void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
1076     {
1077         SWR_ASSERT(slot < m_numAttributes);
1078         SWR_ASSERT(primIndex < PA_TESS::NumPrims());
1079 
1080         const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1081         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1082         {
1083             uint32_t index = m_ppIndices[i][primIndex];
1084             const float* pVertData = pVertDataBase;
1085             float* pVert = (float*)&verts[i];
1086 
1087             for (uint32_t c = 0; c < 4; ++c)
1088             {
1089                 pVert[c] = pVertData[index];
1090                 pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
1091             }
1092         }
1093     }
1094 
NextPrimPA_TESS1095     bool NextPrim()
1096     {
1097         uint32_t numPrims = PA_TESS::NumPrims();
1098         m_numPrims -= numPrims;
1099         m_ppIndices[0] += numPrims;
1100         m_ppIndices[1] += numPrims;
1101         m_ppIndices[2] += numPrims;
1102 
1103         return HasWork();
1104     }
1105 
GetNextVsOutputPA_TESS1106     simdvertex& GetNextVsOutput()
1107     {
1108         SWR_ASSERT(0, "%s", __FUNCTION__);
1109         static simdvertex junk;
1110         return junk;
1111     }
1112 
GetNextStreamOutputPA_TESS1113     bool GetNextStreamOutput()
1114     {
1115         SWR_ASSERT(0, "%s", __FUNCTION__);
1116         return false;
1117     }
1118 
GetNextVsIndicesPA_TESS1119     simdmask& GetNextVsIndices()
1120     {
1121         SWR_ASSERT(0, "%s", __FUNCTION__);
1122         static simdmask junk;
1123         return junk;
1124     }
1125 
NumPrimsPA_TESS1126     uint32_t NumPrims()
1127     {
1128         return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH);
1129     }
1130 
ResetPA_TESS1131     void Reset() { SWR_ASSERT(0); };
1132 
GetPrimIDPA_TESS1133     simdscalari GetPrimID(uint32_t startID)
1134     {
1135         return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
1136     }
1137 
1138 private:
1139     const simdscalar*   m_pVertexData = nullptr;
1140     uint32_t            m_attributeStrideInVectors = 0;
1141     uint32_t            m_numAttributes = 0;
1142     uint32_t            m_numPrims = 0;
1143     uint32_t*           m_ppIndices[3];
1144 
1145     uint32_t            m_numVertsPerPrim = 0;
1146 
1147     simdscalari         m_vPrimId;
1148 };
1149 
1150 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1151 // based on state.
1152 template <typename IsIndexedT, typename IsCutIndexEnabledT>
1153 struct PA_FACTORY
1154 {
PA_FACTORYPA_FACTORY1155     PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
1156     {
1157 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1158         const API_STATE& state = GetApiState(pDC);
1159         if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
1160             topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
1161             topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
1162             topo == TOP_TRIANGLE_LIST)) ||
1163 
1164             // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
1165             // for them in the optimized PA
1166             (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
1167         {
1168             memset(&indexStore, 0, sizeof(indexStore));
1169             uint32_t numAttribs = state.feNumAttributes;
1170 
1171             new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH,
1172                 &this->indexStore[0], numVerts, numAttribs, state.topology, false);
1173             cutPA = true;
1174         }
1175         else
1176 #endif
1177         {
1178             uint32_t numPrims = GetNumPrims(in_topo, numVerts);
1179             new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false);
1180             cutPA = false;
1181         }
1182 
1183     }
1184 
GetPAPA_FACTORY1185     PA_STATE& GetPA()
1186     {
1187 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1188         if (cutPA)
1189         {
1190             return this->paCut;
1191         }
1192         else
1193 #endif
1194         {
1195             return this->paOpt;
1196         }
1197     }
1198 
1199     PA_STATE_OPT paOpt;
1200     PA_STATE_CUT paCut;
1201     bool cutPA{ false };
1202 
1203     PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
1204 
1205     simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM];
1206     simdmask indexStore[MAX_NUM_VERTS_PER_PRIM];
1207 };
1208