• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 *        primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29 
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42 
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Helper macro to generate a bitmask
GenMask(uint32_t numBits)45 static INLINE uint32_t GenMask(uint32_t numBits)
46 {
47     SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
48     return ((1U << numBits) - 1);
49 }
50 
51 //////////////////////////////////////////////////////////////////////////
52 /// @brief FE handler for SwrSync.
53 /// @param pContext - pointer to SWR context.
54 /// @param pDC - pointer to draw context.
55 /// @param workerId - thread's worker id. Even thread has a unique id.
56 /// @param pUserData - Pointer to user data passed back to sync callback.
57 /// @todo This should go away when we switch this to use compute threading.
ProcessSync(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)58 void ProcessSync(
59     SWR_CONTEXT *pContext,
60     DRAW_CONTEXT *pDC,
61     uint32_t workerId,
62     void *pUserData)
63 {
64     BE_WORK work;
65     work.type = SYNC;
66     work.pfnWork = ProcessSyncBE;
67 
68     MacroTileMgr *pTileMgr = pDC->pTileMgr;
69     pTileMgr->enqueue(0, 0, &work);
70 }
71 
72 //////////////////////////////////////////////////////////////////////////
73 /// @brief FE handler for SwrDestroyContext.
74 /// @param pContext - pointer to SWR context.
75 /// @param pDC - pointer to draw context.
76 /// @param workerId - thread's worker id. Even thread has a unique id.
77 /// @param pUserData - Pointer to user data passed back to sync callback.
ProcessShutdown(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)78 void ProcessShutdown(
79     SWR_CONTEXT *pContext,
80     DRAW_CONTEXT *pDC,
81     uint32_t workerId,
82     void *pUserData)
83 {
84     BE_WORK work;
85     work.type = SHUTDOWN;
86     work.pfnWork = ProcessShutdownBE;
87 
88     MacroTileMgr *pTileMgr = pDC->pTileMgr;
89     // Enqueue at least 1 work item for each worker thread
90     // account for number of numa nodes
91     uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
92 
93     for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
94     {
95         for (uint32_t n = 0; n < numNumaNodes; ++n)
96         {
97             pTileMgr->enqueue(i, n, &work);
98         }
99     }
100 }
101 
102 //////////////////////////////////////////////////////////////////////////
103 /// @brief FE handler for SwrClearRenderTarget.
104 /// @param pContext - pointer to SWR context.
105 /// @param pDC - pointer to draw context.
106 /// @param workerId - thread's worker id. Even thread has a unique id.
107 /// @param pUserData - Pointer to user data passed back to clear callback.
108 /// @todo This should go away when we switch this to use compute threading.
ProcessClear(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)109 void ProcessClear(
110     SWR_CONTEXT *pContext,
111     DRAW_CONTEXT *pDC,
112     uint32_t workerId,
113     void *pUserData)
114 {
115     CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
116     MacroTileMgr *pTileMgr = pDC->pTileMgr;
117 
118     // queue a clear to each macro tile
119     // compute macro tile bounds for the specified rect
120     uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
121     uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
122     uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
123     uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
124 
125     BE_WORK work;
126     work.type = CLEAR;
127     work.pfnWork = ProcessClearBE;
128     work.desc.clear = *pDesc;
129 
130     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
131     {
132         for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
133         {
134             pTileMgr->enqueue(x, y, &work);
135         }
136     }
137 }
138 
139 //////////////////////////////////////////////////////////////////////////
140 /// @brief FE handler for SwrStoreTiles.
141 /// @param pContext - pointer to SWR context.
142 /// @param pDC - pointer to draw context.
143 /// @param workerId - thread's worker id. Even thread has a unique id.
144 /// @param pUserData - Pointer to user data passed back to callback.
145 /// @todo This should go away when we switch this to use compute threading.
ProcessStoreTiles(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)146 void ProcessStoreTiles(
147     SWR_CONTEXT *pContext,
148     DRAW_CONTEXT *pDC,
149     uint32_t workerId,
150     void *pUserData)
151 {
152     AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
153     MacroTileMgr *pTileMgr = pDC->pTileMgr;
154     STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
155 
156     // queue a store to each macro tile
157     // compute macro tile bounds for the specified rect
158     uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
159     uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
160     uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
161     uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
162 
163     // store tiles
164     BE_WORK work;
165     work.type = STORETILES;
166     work.pfnWork = ProcessStoreTilesBE;
167     work.desc.storeTiles = *pDesc;
168 
169     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
170     {
171         for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
172         {
173             pTileMgr->enqueue(x, y, &work);
174         }
175     }
176 
177     AR_END(FEProcessStoreTiles, 0);
178 }
179 
180 //////////////////////////////////////////////////////////////////////////
181 /// @brief FE handler for SwrInvalidateTiles.
182 /// @param pContext - pointer to SWR context.
183 /// @param pDC - pointer to draw context.
184 /// @param workerId - thread's worker id. Even thread has a unique id.
185 /// @param pUserData - Pointer to user data passed back to callback.
186 /// @todo This should go away when we switch this to use compute threading.
ProcessDiscardInvalidateTiles(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)187 void ProcessDiscardInvalidateTiles(
188     SWR_CONTEXT *pContext,
189     DRAW_CONTEXT *pDC,
190     uint32_t workerId,
191     void *pUserData)
192 {
193     AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
194     DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
195     MacroTileMgr *pTileMgr = pDC->pTileMgr;
196 
197     // compute macro tile bounds for the specified rect
198     uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
199     uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
200     uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
201     uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
202 
203     if (pDesc->fullTilesOnly == false)
204     {
205         // include partial tiles
206         macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
207         macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
208         macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
209         macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
210     }
211 
212     SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
213     SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
214 
215     macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
216     macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
217 
218     // load tiles
219     BE_WORK work;
220     work.type = DISCARDINVALIDATETILES;
221     work.pfnWork = ProcessDiscardInvalidateTilesBE;
222     work.desc.discardInvalidateTiles = *pDesc;
223 
224     for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
225     {
226         for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
227         {
228             pTileMgr->enqueue(x, y, &work);
229         }
230     }
231 
232     AR_END(FEProcessInvalidateTiles, 0);
233 }
234 
235 //////////////////////////////////////////////////////////////////////////
236 /// @brief Computes the number of primitives given the number of verts.
237 /// @param mode - primitive topology for draw operation.
238 /// @param numPrims - number of vertices or indices for draw.
239 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
GetNumPrims(PRIMITIVE_TOPOLOGY mode,uint32_t numPrims)240 uint32_t GetNumPrims(
241     PRIMITIVE_TOPOLOGY mode,
242     uint32_t numPrims)
243 {
244     switch (mode)
245     {
246     case TOP_POINT_LIST: return numPrims;
247     case TOP_TRIANGLE_LIST: return numPrims / 3;
248     case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
249     case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
250     case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
251     case TOP_QUAD_LIST: return numPrims / 4;
252     case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
253     case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
254     case TOP_LINE_LIST: return numPrims / 2;
255     case TOP_LINE_LOOP: return numPrims;
256     case TOP_RECT_LIST: return numPrims / 3;
257     case TOP_LINE_LIST_ADJ: return numPrims / 4;
258     case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
259     case TOP_TRI_LIST_ADJ: return numPrims / 6;
260     case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
261 
262     case TOP_PATCHLIST_1:
263     case TOP_PATCHLIST_2:
264     case TOP_PATCHLIST_3:
265     case TOP_PATCHLIST_4:
266     case TOP_PATCHLIST_5:
267     case TOP_PATCHLIST_6:
268     case TOP_PATCHLIST_7:
269     case TOP_PATCHLIST_8:
270     case TOP_PATCHLIST_9:
271     case TOP_PATCHLIST_10:
272     case TOP_PATCHLIST_11:
273     case TOP_PATCHLIST_12:
274     case TOP_PATCHLIST_13:
275     case TOP_PATCHLIST_14:
276     case TOP_PATCHLIST_15:
277     case TOP_PATCHLIST_16:
278     case TOP_PATCHLIST_17:
279     case TOP_PATCHLIST_18:
280     case TOP_PATCHLIST_19:
281     case TOP_PATCHLIST_20:
282     case TOP_PATCHLIST_21:
283     case TOP_PATCHLIST_22:
284     case TOP_PATCHLIST_23:
285     case TOP_PATCHLIST_24:
286     case TOP_PATCHLIST_25:
287     case TOP_PATCHLIST_26:
288     case TOP_PATCHLIST_27:
289     case TOP_PATCHLIST_28:
290     case TOP_PATCHLIST_29:
291     case TOP_PATCHLIST_30:
292     case TOP_PATCHLIST_31:
293     case TOP_PATCHLIST_32:
294         return numPrims / (mode - TOP_PATCHLIST_BASE);
295 
296     case TOP_POLYGON:
297     case TOP_POINT_LIST_BF:
298     case TOP_LINE_STRIP_CONT:
299     case TOP_LINE_STRIP_BF:
300     case TOP_LINE_STRIP_CONT_BF:
301     case TOP_TRIANGLE_FAN_NOSTIPPLE:
302     case TOP_TRI_STRIP_REVERSE:
303     case TOP_PATCHLIST_BASE:
304     case TOP_UNKNOWN:
305         SWR_ASSERT(false, "Unsupported topology: %d", mode);
306         return 0;
307     }
308 
309     return 0;
310 }
311 
312 //////////////////////////////////////////////////////////////////////////
313 /// @brief Computes the number of verts given the number of primitives.
314 /// @param mode - primitive topology for draw operation.
315 /// @param numPrims - number of primitives for draw.
GetNumVerts(PRIMITIVE_TOPOLOGY mode,uint32_t numPrims)316 uint32_t GetNumVerts(
317     PRIMITIVE_TOPOLOGY mode,
318     uint32_t numPrims)
319 {
320     switch (mode)
321     {
322     case TOP_POINT_LIST: return numPrims;
323     case TOP_TRIANGLE_LIST: return numPrims * 3;
324     case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
325     case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
326     case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
327     case TOP_QUAD_LIST: return numPrims * 4;
328     case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
329     case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
330     case TOP_LINE_LIST: return numPrims * 2;
331     case TOP_LINE_LOOP: return numPrims;
332     case TOP_RECT_LIST: return numPrims * 3;
333     case TOP_LINE_LIST_ADJ: return numPrims * 4;
334     case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
335     case TOP_TRI_LIST_ADJ: return numPrims * 6;
336     case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
337 
338     case TOP_PATCHLIST_1:
339     case TOP_PATCHLIST_2:
340     case TOP_PATCHLIST_3:
341     case TOP_PATCHLIST_4:
342     case TOP_PATCHLIST_5:
343     case TOP_PATCHLIST_6:
344     case TOP_PATCHLIST_7:
345     case TOP_PATCHLIST_8:
346     case TOP_PATCHLIST_9:
347     case TOP_PATCHLIST_10:
348     case TOP_PATCHLIST_11:
349     case TOP_PATCHLIST_12:
350     case TOP_PATCHLIST_13:
351     case TOP_PATCHLIST_14:
352     case TOP_PATCHLIST_15:
353     case TOP_PATCHLIST_16:
354     case TOP_PATCHLIST_17:
355     case TOP_PATCHLIST_18:
356     case TOP_PATCHLIST_19:
357     case TOP_PATCHLIST_20:
358     case TOP_PATCHLIST_21:
359     case TOP_PATCHLIST_22:
360     case TOP_PATCHLIST_23:
361     case TOP_PATCHLIST_24:
362     case TOP_PATCHLIST_25:
363     case TOP_PATCHLIST_26:
364     case TOP_PATCHLIST_27:
365     case TOP_PATCHLIST_28:
366     case TOP_PATCHLIST_29:
367     case TOP_PATCHLIST_30:
368     case TOP_PATCHLIST_31:
369     case TOP_PATCHLIST_32:
370         return numPrims * (mode - TOP_PATCHLIST_BASE);
371 
372     case TOP_POLYGON:
373     case TOP_POINT_LIST_BF:
374     case TOP_LINE_STRIP_CONT:
375     case TOP_LINE_STRIP_BF:
376     case TOP_LINE_STRIP_CONT_BF:
377     case TOP_TRIANGLE_FAN_NOSTIPPLE:
378     case TOP_TRI_STRIP_REVERSE:
379     case TOP_PATCHLIST_BASE:
380     case TOP_UNKNOWN:
381         SWR_ASSERT(false, "Unsupported topology: %d", mode);
382         return 0;
383     }
384 
385     return 0;
386 }
387 
388 //////////////////////////////////////////////////////////////////////////
389 /// @brief Return number of verts per primitive.
390 /// @param topology - topology
391 /// @param includeAdjVerts - include adjacent verts in primitive vertices
NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology,bool includeAdjVerts)392 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
393 {
394     uint32_t numVerts = 0;
395     switch (topology)
396     {
397     case TOP_POINT_LIST:
398     case TOP_POINT_LIST_BF:
399         numVerts = 1;
400         break;
401     case TOP_LINE_LIST:
402     case TOP_LINE_STRIP:
403     case TOP_LINE_LIST_ADJ:
404     case TOP_LINE_LOOP:
405     case TOP_LINE_STRIP_CONT:
406     case TOP_LINE_STRIP_BF:
407     case TOP_LISTSTRIP_ADJ:
408         numVerts = 2;
409         break;
410     case TOP_TRIANGLE_LIST:
411     case TOP_TRIANGLE_STRIP:
412     case TOP_TRIANGLE_FAN:
413     case TOP_TRI_LIST_ADJ:
414     case TOP_TRI_STRIP_ADJ:
415     case TOP_TRI_STRIP_REVERSE:
416     case TOP_RECT_LIST:
417         numVerts = 3;
418         break;
419     case TOP_QUAD_LIST:
420     case TOP_QUAD_STRIP:
421         numVerts = 4;
422         break;
423     case TOP_PATCHLIST_1:
424     case TOP_PATCHLIST_2:
425     case TOP_PATCHLIST_3:
426     case TOP_PATCHLIST_4:
427     case TOP_PATCHLIST_5:
428     case TOP_PATCHLIST_6:
429     case TOP_PATCHLIST_7:
430     case TOP_PATCHLIST_8:
431     case TOP_PATCHLIST_9:
432     case TOP_PATCHLIST_10:
433     case TOP_PATCHLIST_11:
434     case TOP_PATCHLIST_12:
435     case TOP_PATCHLIST_13:
436     case TOP_PATCHLIST_14:
437     case TOP_PATCHLIST_15:
438     case TOP_PATCHLIST_16:
439     case TOP_PATCHLIST_17:
440     case TOP_PATCHLIST_18:
441     case TOP_PATCHLIST_19:
442     case TOP_PATCHLIST_20:
443     case TOP_PATCHLIST_21:
444     case TOP_PATCHLIST_22:
445     case TOP_PATCHLIST_23:
446     case TOP_PATCHLIST_24:
447     case TOP_PATCHLIST_25:
448     case TOP_PATCHLIST_26:
449     case TOP_PATCHLIST_27:
450     case TOP_PATCHLIST_28:
451     case TOP_PATCHLIST_29:
452     case TOP_PATCHLIST_30:
453     case TOP_PATCHLIST_31:
454     case TOP_PATCHLIST_32:
455         numVerts = topology - TOP_PATCHLIST_BASE;
456         break;
457     default:
458         SWR_ASSERT(false, "Unsupported topology: %d", topology);
459         break;
460     }
461 
462     if (includeAdjVerts)
463     {
464         switch (topology)
465         {
466         case TOP_LISTSTRIP_ADJ:
467         case TOP_LINE_LIST_ADJ: numVerts = 4; break;
468         case TOP_TRI_STRIP_ADJ:
469         case TOP_TRI_LIST_ADJ: numVerts = 6; break;
470         default: break;
471         }
472     }
473 
474     return numVerts;
475 }
476 
477 //////////////////////////////////////////////////////////////////////////
478 /// @brief Generate mask from remaining work.
479 /// @param numWorkItems - Number of items being worked on by a SIMD.
GenerateMask(uint32_t numItemsRemaining)480 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
481 {
482     uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
483     uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
484     return _simd_castps_si(vMask(mask));
485 }
486 
487 //////////////////////////////////////////////////////////////////////////
488 /// @brief StreamOut - Streams vertex data out to SO buffers.
489 ///        Generally, we are only streaming out a SIMDs worth of triangles.
490 /// @param pDC - pointer to draw context.
491 /// @param workerId - thread's worker id. Even thread has a unique id.
492 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
StreamOut(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,uint32_t * pPrimData,uint32_t streamIndex)493 static void StreamOut(
494     DRAW_CONTEXT* pDC,
495     PA_STATE& pa,
496     uint32_t workerId,
497     uint32_t* pPrimData,
498     uint32_t streamIndex)
499 {
500     SWR_CONTEXT *pContext = pDC->pContext;
501 
502     AR_BEGIN(FEStreamout, pDC->drawId);
503 
504     const API_STATE& state = GetApiState(pDC);
505     const SWR_STREAMOUT_STATE &soState = state.soState;
506 
507     uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
508 
509     // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
510     uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
511 
512     SWR_STREAMOUT_CONTEXT soContext = { 0 };
513 
514     // Setup buffer state pointers.
515     for (uint32_t i = 0; i < 4; ++i)
516     {
517         soContext.pBuffer[i] = &state.soBuffer[i];
518     }
519 
520     uint32_t numPrims = pa.NumPrims();
521     for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
522     {
523         DWORD slot = 0;
524         uint32_t soMask = soState.streamMasks[streamIndex];
525 
526         // Write all entries into primitive data buffer for SOS.
527         while (_BitScanForward(&slot, soMask))
528         {
529             __m128 attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
530             uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
531             pa.AssembleSingle(paSlot, primIndex, attrib);
532 
533             // Attribute offset is relative offset from start of vertex.
534             // Note that attributes start at slot 1 in the PA buffer. We need to write this
535             // to prim data starting at slot 0. Which is why we do (slot - 1).
536             // Also note: GL works slightly differently, and needs slot 0
537             uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
538 
539             // Store each vertex's attrib at appropriate locations in pPrimData buffer.
540             for (uint32_t v = 0; v < soVertsPerPrim; ++v)
541             {
542                 uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
543 
544                 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
545             }
546             soMask &= ~(1 << slot);
547         }
548 
549         // Update pPrimData pointer
550         soContext.pPrimData = pPrimData;
551 
552         // Call SOS
553         SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
554         state.pfnSoFunc[streamIndex](soContext);
555     }
556 
557     // Update SO write offset. The driver provides memory for the update.
558     for (uint32_t i = 0; i < 4; ++i)
559     {
560         if (state.soBuffer[i].pWriteOffset)
561         {
562             *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
563         }
564 
565         if (state.soBuffer[i].soWriteEnable)
566         {
567             pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
568             pDC->dynState.SoWriteOffsetDirty[i] = true;
569         }
570     }
571 
572     UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
573     UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
574 
575     AR_END(FEStreamout, 1);
576 }
577 
578 //////////////////////////////////////////////////////////////////////////
579 /// @brief Computes number of invocations. The current index represents
580 ///        the start of the SIMD. The max index represents how much work
581 ///        items are remaining. If there is less then a SIMD's xmin of work
582 ///        then return the remaining amount of work.
583 /// @param curIndex - The start index for the SIMD.
584 /// @param maxIndex - The last index for all work items.
GetNumInvocations(uint32_t curIndex,uint32_t maxIndex)585 static INLINE uint32_t GetNumInvocations(
586     uint32_t curIndex,
587     uint32_t maxIndex)
588 {
589     uint32_t remainder = (maxIndex - curIndex);
590     return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
591 }
592 
593 //////////////////////////////////////////////////////////////////////////
594 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
595 ///        The geometry shader will loop over each active streamout buffer, assembling
596 ///        primitives for the downstream stages. When multistream output is enabled,
597 ///        the generated stream ID buffer from the GS needs to be converted to a cut
598 ///        buffer for the primitive assembler.
599 /// @param stream - stream id to generate the cut buffer for
600 /// @param pStreamIdBase - pointer to the stream ID buffer
601 /// @param numEmittedVerts - Number of total verts emitted by the GS
602 /// @param pCutBuffer - output buffer to write cuts to
ProcessStreamIdBuffer(uint32_t stream,uint8_t * pStreamIdBase,uint32_t numEmittedVerts,uint8_t * pCutBuffer)603 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
604 {
605     SWR_ASSERT(stream < MAX_SO_STREAMS);
606 
607     uint32_t numInputBytes = (numEmittedVerts * 2  + 7) / 8;
608     uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
609 
610     for (uint32_t b = 0; b < numOutputBytes; ++b)
611     {
612         uint8_t curInputByte = pStreamIdBase[2*b];
613         uint8_t outByte = 0;
614         for (uint32_t i = 0; i < 4; ++i)
615         {
616             if ((curInputByte & 0x3) != stream)
617             {
618                 outByte |= (1 << i);
619             }
620             curInputByte >>= 2;
621         }
622 
623         curInputByte = pStreamIdBase[2 * b + 1];
624         for (uint32_t i = 0; i < 4; ++i)
625         {
626             if ((curInputByte & 0x3) != stream)
627             {
628                 outByte |= (1 << (i + 4));
629             }
630             curInputByte >>= 2;
631         }
632 
633         *pCutBuffer++ = outByte;
634     }
635 }
636 
// Geometry shader context that GeometryShaderStage fills in (output stream,
// cut/stream-id buffer, primitive id, input verts, instance id, mask) and
// passes to the GS function.
// NOTE(review): THREAD presumably expands to a thread-local storage
// specifier, giving each worker its own context -- confirm.
637 THREAD SWR_GS_CONTEXT tlsGsContext;
638 
639 //////////////////////////////////////////////////////////////////////////
640 /// @brief Implements GS stage.
641 /// @param pDC - pointer to draw context.
642 /// @param workerId - thread's worker id. Every thread has a unique id.
643 /// @param pa - The primitive assembly object.
644 /// @param pGsOut - output stream for GS
645 template <
646     typename HasStreamOutT,
647     typename HasRastT>
GeometryShaderStage(DRAW_CONTEXT * pDC,uint32_t workerId,PA_STATE & pa,void * pGsOut,void * pCutBuffer,void * pStreamCutBuffer,uint32_t * pSoPrimData,simdscalari primID)648 static void GeometryShaderStage(
649     DRAW_CONTEXT *pDC,
650     uint32_t workerId,
651     PA_STATE& pa,
652     void* pGsOut,
653     void* pCutBuffer,
654     void* pStreamCutBuffer,
655     uint32_t* pSoPrimData,
656     simdscalari primID)
657 {
658     SWR_CONTEXT *pContext = pDC->pContext;
659 
660     AR_BEGIN(FEGeometryShader, pDC->drawId);
661 
662     const API_STATE& state = GetApiState(pDC);
663     const SWR_GS_STATE* pState = &state.gsState;
664 
665     SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
666     SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
667 
668     tlsGsContext.pStream = (uint8_t*)pGsOut;
669     tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
670     tlsGsContext.PrimitiveID = primID;
671 
672     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
673     simdvector attrib[MAX_ATTRIBUTES];
674 
675     // assemble all attributes for the input primitive
676     for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
677     {
678         uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
679         pa.Assemble(attribSlot, attrib);
680 
681         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
682         {
683             tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
684         }
685     }
686 
687     // assemble position
688     pa.Assemble(VERTEX_POSITION_SLOT, attrib);
689     for (uint32_t i = 0; i < numVertsPerPrim; ++i)
690     {
691         tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
692     }
693 
694     const uint32_t vertexStride = sizeof(simdvertex);
695     const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
696     const uint32_t inputPrimStride = numSimdBatches * vertexStride;
697     const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
698     uint32_t cutPrimStride;
699     uint32_t cutInstanceStride;
700 
701     if (pState->isSingleStream)
702     {
703         cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
704         cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
705     }
706     else
707     {
708         cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
709         cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
710     }
711 
712     // record valid prims from the frontend to avoid over binning the newly generated
713     // prims from the GS
714     uint32_t numInputPrims = pa.NumPrims();
715 
716     for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
717     {
718         tlsGsContext.InstanceID = instance;
719         tlsGsContext.mask = GenerateMask(numInputPrims);
720 
721         // execute the geometry shader
722         state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
723 
724         tlsGsContext.pStream += instanceStride;
725         tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
726     }
727 
728     // set up new binner and state for the GS output topology
729     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
730     if (HasRastT::value)
731     {
732         switch (pState->outputTopology)
733         {
734         case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
735         case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
736         case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
737         default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
738         }
739     }
740 
741     // foreach input prim:
742     // - setup a new PA based on the emitted verts for that prim
743     // - loop over the new verts, calling PA to assemble each prim
744     uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
745     uint32_t* pPrimitiveId = (uint32_t*)&primID;
746 
747     uint32_t totalPrimsGenerated = 0;
748     for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
749     {
750         uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
751         uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
752         for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
753         {
754             uint32_t numEmittedVerts = pVertexCount[inputPrim];
755             if (numEmittedVerts == 0)
756             {
757                 continue;
758             }
759 
760             uint8_t* pBase = pInstanceBase + instance * instanceStride;
761             uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
762 
763             uint32_t numAttribs = state.feNumAttributes;
764 
765             for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
766             {
767                 bool processCutVerts = false;
768 
769                 uint8_t* pCutBuffer = pCutBase;
770 
771                 // assign default stream ID, only relevant when GS is outputting a single stream
772                 uint32_t streamID = 0;
773                 if (pState->isSingleStream)
774                 {
775                     processCutVerts = true;
776                     streamID = pState->singleStreamID;
777                     if (streamID != stream) continue;
778                 }
779                 else
780                 {
781                     // early exit if this stream is not enabled for streamout
782                     if (HasStreamOutT::value && !state.soState.streamEnable[stream])
783                     {
784                         continue;
785                     }
786 
787                     // multi-stream output, need to translate StreamID buffer to a cut buffer
788                     ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
789                     pCutBuffer = (uint8_t*)pStreamCutBuffer;
790                     processCutVerts = false;
791                 }
792 
793                 PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
794 
795                 while (gsPa.GetNextStreamOutput())
796                 {
797                     do
798                     {
799                         bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
800 
801                         if (assemble)
802                         {
803                             totalPrimsGenerated += gsPa.NumPrims();
804 
805                             if (HasStreamOutT::value)
806                             {
807                                 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
808                             }
809 
810                             if (HasRastT::value && state.soState.streamToRasterizer == stream)
811                             {
812                                 simdscalari vPrimId;
813                                 // pull primitiveID from the GS output if available
814                                 if (state.gsState.emitsPrimitiveID)
815                                 {
816                                     simdvector primIdAttrib[3];
817                                     gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
818                                     vPrimId = _simd_castps_si(primIdAttrib[0].x);
819                                 }
820                                 else
821                                 {
822                                     vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
823                                 }
824 
825                                 // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
826                                 simdscalari vViewPortIdx;
827                                 if (state.gsState.emitsViewportArrayIndex)
828                                 {
829                                     simdvector vpiAttrib[3];
830                                     gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
831 
832                                     // OOB indices => forced to zero.
833                                     simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
834                                     simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
835                                     vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);
836 
837                                     vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
838                                 }
839                                 else
840                                 {
841                                     vViewPortIdx = _simd_set1_epi32(0);
842                                 }
843 
844                                 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
845                             }
846                         }
847                     } while (gsPa.NextPrim());
848                 }
849             }
850         }
851     }
852 
853     // update GS pipeline stats
854     UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
855     UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
856 	AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
857     AR_END(FEGeometryShader, 1);
858 }
859 
860 //////////////////////////////////////////////////////////////////////////
861 /// @brief Allocate GS buffers
862 /// @param pDC - pointer to draw context.
863 /// @param state - API state
864 /// @param ppGsOut - pointer to GS output buffer allocation
865 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
AllocateGsBuffers(DRAW_CONTEXT * pDC,const API_STATE & state,void ** ppGsOut,void ** ppCutBuffer,void ** ppStreamCutBuffer)866 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
867     void **ppStreamCutBuffer)
868 {
869     auto pArena = pDC->pArena;
870     SWR_ASSERT(pArena != nullptr);
871     SWR_ASSERT(state.gsState.gsEnable);
872     // allocate arena space to hold GS output verts
873     // @todo pack attribs
874     // @todo support multiple streams
875     const uint32_t vertexStride = sizeof(simdvertex);
876     const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
877     uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
878     *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
879 
880     const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
881     const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
882     const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
883     const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
884 
885     // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
886     // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
887 
888     // allocate space for temporary per-stream cut buffer if multi-stream is enabled
889     if (state.gsState.isSingleStream)
890     {
891         *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
892         *ppStreamCutBuffer = nullptr;
893     }
894     else
895     {
896         *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
897         *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
898     }
899 
900 }
901 
902 //////////////////////////////////////////////////////////////////////////
903 /// @brief Contains all data generated by the HS and passed to the
904 /// tessellator and DS.
/// The structure is zero-filled with memset when it is allocated, so it must
/// stay a plain aggregate.
905 struct TessellationThreadLocalData
906 {
907     SWR_HS_CONTEXT hsContext;                 ///< context passed to the hull shader function
908     ScalarPatch patchData[KNOB_SIMD_WIDTH];   ///< HS output patches; hsContext.pCPout is pointed at this array
909     void* pTxCtx;                             ///< memory block handed to TSInitCtx for the tessellator context
910     size_t tsCtxSize;                         ///< size value handed to TSInitCtx together with pTxCtx
911 
912     simdscalar* pDSOutput;                    ///< domain shader output buffer, reallocated when too small
913     size_t numDSOutputVectors;                ///< capacity of pDSOutput, counted in output vectors
914 };
915 
/// Tessellation data for the calling thread; stays nullptr until
/// AllocateTessellationData() has run.
916 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
917 
918 //////////////////////////////////////////////////////////////////////////
919 /// @brief Allocate tessellation data for this worker thread.
920 INLINE
AllocateTessellationData(SWR_CONTEXT * pContext)921 static void AllocateTessellationData(SWR_CONTEXT* pContext)
922 {
923     /// @TODO - Don't use thread local storage.  Use Worker local storage instead.
924     if (gt_pTessellationThreadData == nullptr)
925     {
926         gt_pTessellationThreadData = (TessellationThreadLocalData*)
927             AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
928         memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
929     }
930 }
931 
932 //////////////////////////////////////////////////////////////////////////
933 /// @brief Implements Tessellation Stages.
934 /// @param pDC - pointer to draw context.
935 /// @param workerId - thread's worker id. Even thread has a unique id.
936 /// @param pa - The primitive assembly object.
937 /// @param pGsOut - output stream for GS
938 template <
939     typename HasGeometryShaderT,
940     typename HasStreamOutT,
941     typename HasRastT>
TessellationStages(DRAW_CONTEXT * pDC,uint32_t workerId,PA_STATE & pa,void * pGsOut,void * pCutBuffer,void * pCutStreamBuffer,uint32_t * pSoPrimData,simdscalari primID)942 static void TessellationStages(
943     DRAW_CONTEXT *pDC,
944     uint32_t workerId,
945     PA_STATE& pa,
946     void* pGsOut,
947     void* pCutBuffer,
948     void* pCutStreamBuffer,
949     uint32_t* pSoPrimData,
950     simdscalari primID)
951 {
952     SWR_CONTEXT *pContext = pDC->pContext;
953     const API_STATE& state = GetApiState(pDC);
954     const SWR_TS_STATE& tsState = state.tsState;
955 
956     SWR_ASSERT(gt_pTessellationThreadData);
957 
958     HANDLE tsCtx = TSInitCtx(
959         tsState.domain,
960         tsState.partitioning,
961         tsState.tsOutputTopology,
962         gt_pTessellationThreadData->pTxCtx,
963         gt_pTessellationThreadData->tsCtxSize);
964     if (tsCtx == nullptr)
965     {
966         gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
967         tsCtx = TSInitCtx(
968             tsState.domain,
969             tsState.partitioning,
970             tsState.tsOutputTopology,
971             gt_pTessellationThreadData->pTxCtx,
972             gt_pTessellationThreadData->tsCtxSize);
973     }
974     SWR_ASSERT(tsCtx);
975 
976     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
977     if (HasRastT::value)
978     {
979         switch (tsState.postDSTopology)
980         {
981         case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
982         case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
983         case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
984         default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
985         }
986     }
987 
988     SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
989     hsContext.pCPout = gt_pTessellationThreadData->patchData;
990     hsContext.PrimitiveID = primID;
991 
992     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
993     // Max storage for one attribute for an entire simdprimitive
994     simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
995 
996     // assemble all attributes for the input primitives
997     for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
998     {
999         uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
1000         pa.Assemble(attribSlot, simdattrib);
1001 
1002         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1003         {
1004             hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
1005         }
1006     }
1007 
1008 #if defined(_DEBUG)
1009     memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1010 #endif
1011 
1012     uint32_t numPrims = pa.NumPrims();
1013     hsContext.mask = GenerateMask(numPrims);
1014 
1015     // Run the HS
1016     AR_BEGIN(FEHullShader, pDC->drawId);
1017     state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
1018     AR_END(FEHullShader, 0);
1019 
1020     UPDATE_STAT_FE(HsInvocations, numPrims);
1021 
1022     const uint32_t* pPrimId = (const uint32_t*)&primID;
1023 
1024     for (uint32_t p = 0; p < numPrims; ++p)
1025     {
1026         // Run Tessellator
1027         SWR_TS_TESSELLATED_DATA tsData = { 0 };
1028         AR_BEGIN(FETessellation, pDC->drawId);
1029         TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1030 		AR_EVENT(TessPrimCount(1));
1031         AR_END(FETessellation, 0);
1032 
1033         if (tsData.NumPrimitives == 0)
1034         {
1035             continue;
1036         }
1037         SWR_ASSERT(tsData.NumDomainPoints);
1038 
1039         // Allocate DS Output memory
1040         uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1041         size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
1042         size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1043         if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
1044         {
1045             AlignedFree(gt_pTessellationThreadData->pDSOutput);
1046             gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1047             gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
1048         }
1049         SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1050         SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
1051 
1052 #if defined(_DEBUG)
1053         memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1054 #endif
1055 
1056         // Run Domain Shader
1057         SWR_DS_CONTEXT dsContext;
1058         dsContext.PrimitiveID = pPrimId[p];
1059         dsContext.pCpIn = &hsContext.pCPout[p];
1060         dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1061         dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1062         dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1063         dsContext.vectorStride = requiredDSVectorInvocations;
1064 
1065         uint32_t dsInvocations = 0;
1066 
1067         for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
1068         {
1069             dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1070 
1071             AR_BEGIN(FEDomainShader, pDC->drawId);
1072             state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
1073             AR_END(FEDomainShader, 0);
1074 
1075             dsInvocations += KNOB_SIMD_WIDTH;
1076         }
1077         UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
1078 
1079         PA_TESS tessPa(
1080             pDC,
1081             dsContext.pOutputData,
1082             dsContext.vectorStride,
1083             tsState.numDsOutputAttribs,
1084             tsData.ppIndices,
1085             tsData.NumPrimitives,
1086             tsState.postDSTopology);
1087 
1088         while (tessPa.HasWork())
1089         {
1090             if (HasGeometryShaderT::value)
1091             {
1092                 GeometryShaderStage<HasStreamOutT, HasRastT>(
1093                     pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
1094                     _simd_set1_epi32(dsContext.PrimitiveID));
1095             }
1096             else
1097             {
1098                 if (HasStreamOutT::value)
1099                 {
1100                     StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1101                 }
1102 
1103                 if (HasRastT::value)
1104                 {
1105                     simdvector prim[3]; // Only deal with triangles, lines, or points
1106                     AR_BEGIN(FEPAAssemble, pDC->drawId);
1107 #if SWR_ENABLE_ASSERTS
1108                     bool assemble =
1109 #endif
1110                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1111                     AR_END(FEPAAssemble, 1);
1112                     SWR_ASSERT(assemble);
1113 
1114                     SWR_ASSERT(pfnClipFunc);
1115                     pfnClipFunc(pDC, tessPa, workerId, prim,
1116                         GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
1117                 }
1118             }
1119 
1120             tessPa.NextPrim();
1121 
1122         } // while (tessPa.HasWork())
1123     } // for (uint32_t p = 0; p < numPrims; ++p)
1124 
1125     TSDestroyCtx(tsCtx);
1126 }
1127 
1128 //////////////////////////////////////////////////////////////////////////
1129 /// @brief FE handler for SwrDraw.
///        For each instance it fetches and vertex-shades one SIMD of vertices
///        at a time, assembles primitives, and passes them to the tessellation
///        stages, the geometry shader stage, stream-out and/or the primitive
///        processing function, depending on the template arguments.
1130 /// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is the cut index enabled (passed on to PA_FACTORY)
1131 /// @tparam HasTessellationT - Is tessellation enabled
1132 /// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
1133 /// @tparam HasStreamOutT - Is stream-out enabled
1134 /// @tparam HasRastT - Is rasterization enabled
1135 /// @param pContext - pointer to SWR context.
1136 /// @param pDC - pointer to draw context.
1137 /// @param workerId - thread's worker id.
1138 /// @param pUserData - Pointer to DRAW_WORK
1139 template <
1140     typename IsIndexedT,
1141     typename IsCutIndexEnabledT,
1142     typename HasTessellationT,
1143     typename HasGeometryShaderT,
1144     typename HasStreamOutT,
1145     typename HasRastT>
ProcessDraw(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)1146 void ProcessDraw(
1147     SWR_CONTEXT *pContext,
1148     DRAW_CONTEXT *pDC,
1149     uint32_t workerId,
1150     void *pUserData)
1151 {
1152 
1153 #if KNOB_ENABLE_TOSS_POINTS
    // toss point: skip all front-end work for this draw
1154     if (KNOB_TOSS_QUEUE_FE)
1155     {
1156         return;
1157     }
1158 #endif
1159 
1160     AR_BEGIN(FEProcessDraw, pDC->drawId);
1161 
1162     DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
1163     const API_STATE&    state = GetApiState(pDC);
    // per-lane offsets 0..7, added to the starting vertex ID for non-indexed draws
1164     __m256i             vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1165     SWR_VS_CONTEXT      vsContext;
1166     simdvertex          vin;
1167 
1168     int indexSize = 0;
1169     uint32_t endVertex = work.numVerts;
1170 
    // address of the index element at position endVertex, i.e. just past the
    // last index this draw requests; computed per index type below
1171     const int32_t* pLastRequestedIndex = nullptr;
1172     if (IsIndexedT::value)
1173     {
1174         switch (work.type)
1175         {
1176         case R32_UINT:
1177             indexSize = sizeof(uint32_t);
1178             pLastRequestedIndex = &(work.pIB[endVertex]);
1179             break;
1180         case R16_UINT:
1181             indexSize = sizeof(uint16_t);
1182             // nasty address offset to last index
1183             pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
1184             break;
1185         case R8_UINT:
1186             indexSize = sizeof(uint8_t);
1187             // nasty address offset to last index
1188             pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
1189             break;
1190         default:
            // unsupported index type; indexSize and pLastRequestedIndex keep their initial values
1191             SWR_ASSERT(0);
1192         }
1193     }
1194     else
1195     {
1196         // No cuts, prune partial primitives.
1197         endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1198     }
1199 
1200     SWR_FETCH_CONTEXT fetchInfo = { 0 };
1201     fetchInfo.pStreams = &state.vertexBuffers[0];
1202     fetchInfo.StartInstance = work.startInstance;
1203     fetchInfo.StartVertex = 0;
1204 
    // the vertex shader reads its input from the fetch output 'vin'
1205     vsContext.pVin = &vin;
1206 
1207     if (IsIndexedT::value)
1208     {
1209         fetchInfo.BaseVertex = work.baseVertex;
1210 
1211         // if the entire index buffer isn't being consumed, set the last index
1212         // so that fetches < a SIMD wide will be masked off
1213         fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1214         if (pLastRequestedIndex < fetchInfo.pLastIndex)
1215         {
1216             fetchInfo.pLastIndex = pLastRequestedIndex;
1217         }
1218     }
1219     else
1220     {
1221         fetchInfo.StartVertex = work.startVertex;
1222     }
1223 
    // numPrims is only declared in these configurations; its only use in this
    // function is the AR_END(FEProcessDraw, ...) at the bottom
1224 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1225     uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1226 #endif
1227 
1228     void* pGsOut = nullptr;
1229     void* pCutBuffer = nullptr;
1230     void* pStreamCutBuffer = nullptr;
1231     if (HasGeometryShaderT::value)
1232     {
1233         AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
1234     }
1235 
    // the template argument and the API state must agree about tessellation
1236     if (HasTessellationT::value)
1237     {
1238         SWR_ASSERT(state.tsState.tsEnable == true);
1239         SWR_ASSERT(state.pfnHsFunc != nullptr);
1240         SWR_ASSERT(state.pfnDsFunc != nullptr);
1241 
1242         AllocateTessellationData(pContext);
1243     }
1244     else
1245     {
1246         SWR_ASSERT(state.tsState.tsEnable == false);
1247         SWR_ASSERT(state.pfnHsFunc == nullptr);
1248         SWR_ASSERT(state.pfnDsFunc == nullptr);
1249     }
1250 
1251     // allocate space for streamout input prim data
    // NOTE(review): fixed 4096-byte, 16-byte aligned arena block; the sizing rationale is not visible here
1252     uint32_t* pSoPrimData = nullptr;
1253     if (HasStreamOutT::value)
1254     {
1255         pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1256     }
1257 
1258     // choose primitive assembler
1259     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
1260     PA_STATE& pa = paFactory.GetPA();
1261 
1262     /// @todo: temporarily move instance loop in the FE to ensure SO ordering
1263     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1264     {
1265         simdscalari vIndex;
        // i counts the vertices consumed so far for this instance, in SIMD-width steps
1266         uint32_t  i = 0;
1267 
1268         if (IsIndexedT::value)
1269         {
            // restart at the beginning of the index buffer for every instance
1270             fetchInfo.pIndices = work.pIB;
1271         }
1272         else
1273         {
            // non-indexed: generate sequential vertex IDs in vIndex and let the fetch read them as indices
1274             vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
1275             fetchInfo.pIndices = (const int32_t*)&vIndex;
1276         }
1277 
1278         fetchInfo.CurInstance = instanceNum;
1279         vsContext.InstanceID = instanceNum;
1280 
1281         while (pa.HasWork())
1282         {
1283             // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
1284             // So we need to keep this outside of (i < endVertex) check.
1285             simdmask* pvCutIndices = nullptr;
1286             if (IsIndexedT::value)
1287             {
1288                 pvCutIndices = &pa.GetNextVsIndices();
1289             }
1290 
            // the vertex shader writes directly into the PA's next output slot
1291             simdvertex& vout = pa.GetNextVsOutput();
1292             vsContext.pVout = &vout;
1293 
            // once i reaches endVertex no more fetch/VS work is issued; the loop
            // keeps running only while the PA still reports work
1294             if (i < endVertex)
1295             {
1296 
1297                 // 1. Execute FS/VS for a single SIMD.
1298                 AR_BEGIN(FEFetchShader, pDC->drawId);
1299                 state.pfnFetchFunc(fetchInfo, vin);
1300                 AR_END(FEFetchShader, 0);
1301 
1302                 // forward fetch generated vertex IDs to the vertex shader
1303                 vsContext.VertexID = fetchInfo.VertexID;
1304 
1305                 // Setup active mask for vertex shader.
1306                 vsContext.mask = GenerateMask(endVertex - i);
1307 
1308                 // forward cut mask to the PA
1309                 if (IsIndexedT::value)
1310                 {
1311                     *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
1312                 }
1313 
1314                 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1315 
1316 #if KNOB_ENABLE_TOSS_POINTS
1317                 if (!KNOB_TOSS_FETCH)
1318 #endif
1319                 {
1320                     AR_BEGIN(FEVertexShader, pDC->drawId);
1321                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
1322                     AR_END(FEVertexShader, 0);
1323 
1324                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1325                 }
1326             }
1327 
1328             // 2. Assemble primitives given the last two SIMD.
1329             do
1330             {
1331                 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
1332                 // PaAssemble returns false if there is not enough verts to assemble.
1333                 AR_BEGIN(FEPAAssemble, pDC->drawId);
1334                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
1335                 AR_END(FEPAAssemble, 1);
1336 
                // the toss points below drop the assembled primitives before any
                // downstream stage sees them
1337 #if KNOB_ENABLE_TOSS_POINTS
1338                 if (!KNOB_TOSS_FETCH)
1339 #endif
1340                 {
1341 #if KNOB_ENABLE_TOSS_POINTS
1342                     if (!KNOB_TOSS_VS)
1343 #endif
1344                     {
1345                         if (assemble)
1346                         {
1347                             UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1348 
                            // exactly one downstream path is taken: tessellation (which runs the GS
                            // itself when enabled), GS only, or stream-out / primitive processing
1349                             if (HasTessellationT::value)
1350                             {
1351                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
1352                                     pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1353                             }
1354                             else if (HasGeometryShaderT::value)
1355                             {
1356                                 GeometryShaderStage<HasStreamOutT, HasRastT>(
1357                                     pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1358                             }
1359                             else
1360                             {
1361                                 // If streamout is enabled then stream vertices out to memory.
1362                                 if (HasStreamOutT::value)
1363                                 {
1364                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1365                                 }
1366 
1367                                 if (HasRastT::value)
1368                                 {
1369                                     SWR_ASSERT(pDC->pState->pfnProcessPrims);
1370                                     pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
1371                                         GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
1372                                 }
1373                             }
1374                         }
1375                     }
1376                 }
1377             } while (pa.NextPrim());
1378 
            // advance to the next SIMD of vertices: step the index pointer by
            // KNOB_SIMD_WIDTH elements, or bump the generated vertex IDs
1379             i += KNOB_SIMD_WIDTH;
1380             if (IsIndexedT::value)
1381             {
1382                 fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
1383             }
1384             else
1385             {
1386                 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
1387             }
1388         }
        // reset the primitive assembler before the next instance
1389         pa.Reset();
1390     }
1391 
1392 
1393     AR_END(FEProcessDraw, numPrims * work.numInstances);
1394 }
1395 
1396 struct FEDrawChooser
1397 {
1398     typedef PFN_FE_WORK_FUNC FuncType;
1399 
1400     template <typename... ArgsB>
GetFuncFEDrawChooser1401     static FuncType GetFunc()
1402     {
1403         return ProcessDraw<ArgsB...>;
1404     }
1405 };
1406 
1407 
1408 // Selector for correct templated Draw front-end function
GetProcessDrawFunc(bool IsIndexed,bool IsCutIndexEnabled,bool HasTessellation,bool HasGeometryShader,bool HasStreamOut,bool HasRasterization)1409 PFN_FE_WORK_FUNC GetProcessDrawFunc(
1410     bool IsIndexed,
1411     bool IsCutIndexEnabled,
1412     bool HasTessellation,
1413     bool HasGeometryShader,
1414     bool HasStreamOut,
1415     bool HasRasterization)
1416 {
1417     return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
1418 }