/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file frontend.cpp
 *
 * @brief Implementation for Frontend which handles vertex processing,
 *        primitive assembly, clipping, binning, etc.
 *
 ******************************************************************************/

#include "api.h"
#include "frontend.h"
#include "backend.h"
#include "context.h"
#include "rdtsc_core.h"
#include "utils.h"
#include "threads.h"
#include "pa.h"
#include "clip.h"
#include "tilemgr.h"
#include "tessellator.h"
#include <limits>
#include <iostream>

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrSync.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    BE_WORK work;
    work.type = SYNC;
    work.pfnWork = ProcessSyncBE;

    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    pTileMgr->enqueue(0, 0, &work);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDestroyContext.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    BE_WORK work;
    work.type = SHUTDOWN;
    work.pfnWork = ProcessShutdownBE;

    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    // Enqueue at least 1 work item for each worker thread,
    // accounting for the number of numa nodes
    uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;

    for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
    {
        for (uint32_t n = 0; n < numNumaNodes; ++n)
        {
            pTileMgr->enqueue(i, n, &work);
        }
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrClearRenderTarget.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to clear callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    CLEAR_DESC* pDesc = (CLEAR_DESC*)pUserData;
    MacroTileMgr* pTileMgr = pDC->pTileMgr;

    // queue a clear to each macro tile
    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    BE_WORK work;
    work.type = CLEAR;
    work.pfnWork = ProcessClearBE;
    work.desc.clear = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrStoreTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessStoreTiles, pDC->drawId);
    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;

    // queue a store to each macro tile
    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    // store tiles
    BE_WORK work;
    work.type = STORETILES;
    work.pfnWork = ProcessStoreTilesBE;
    work.desc.storeTiles = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(pContext->pBucketMgr, FEProcessStoreTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext,
                                   DRAW_CONTEXT* pDC,
                                   uint32_t workerId,
                                   void* pUserData)
{
    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC* pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr* pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(pContext->pBucketMgr, FEProcessInvalidateTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for draw.
/// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST:
        return numPrims;
    case TOP_TRIANGLE_LIST:
        return numPrims / 3;
    case TOP_TRIANGLE_STRIP:
        return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN:
        return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC:
        return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST:
        return numPrims / 4;
    case TOP_QUAD_STRIP:
        return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP:
        return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST:
        return numPrims / 2;
    case TOP_LINE_LOOP:
        return numPrims;
    case TOP_RECT_LIST:
        return numPrims / 3;
    case TOP_LINE_LIST_ADJ:
        return numPrims / 4;
    case TOP_LISTSTRIP_ADJ:
        return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ:
        return numPrims / 6;
    case TOP_TRI_STRIP_ADJ:
        return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
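
// For illustration (not part of the original source): a few sample
// evaluations, with partial primitives pruned by the integer division:
//   GetNumPrims(TOP_TRIANGLE_LIST,  7) == 2   // 7 / 3
//   GetNumPrims(TOP_TRIANGLE_STRIP, 7) == 5   // 7 - 2
//   GetNumPrims(TOP_PATCHLIST_4,    7) == 1   // 7 / 4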

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of verts given the number of primitives.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST:
        return numPrims;
    case TOP_TRIANGLE_LIST:
        return numPrims * 3;
    case TOP_TRIANGLE_STRIP:
        return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN:
        return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC:
        return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST:
        return numPrims * 4;
    case TOP_QUAD_STRIP:
        return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP:
        return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST:
        return numPrims * 2;
    case TOP_LINE_LOOP:
        return numPrims;
    case TOP_RECT_LIST:
        return numPrims * 3;
    case TOP_LINE_LIST_ADJ:
        return numPrims * 4;
    case TOP_LISTSTRIP_ADJ:
        return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ:
        return numPrims * 6;
    case TOP_TRI_STRIP_ADJ:
        return numPrims ? (numPrims + 2) * 2 : 0;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
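
// For illustration (not part of the original source): GetNumVerts inverts
// GetNumPrims for well-formed counts, e.g.
//   GetNumVerts(TOP_TRIANGLE_STRIP, 5) == 7 and
//   GetNumPrims(TOP_TRIANGLE_STRIP, 7) == 5.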

//////////////////////////////////////////////////////////////////////////
/// @brief Return number of verts per primitive.
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_INVALID("Unsupported topology: %d", topology);
        break;
    }

    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ:
            numVerts = 4;
            break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ:
            numVerts = 6;
            break;
        default:
            break;
        }
    }

    return numVerts;
}
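
// For illustration (not part of the original source):
// NumVertsPerPrim(TOP_TRI_LIST_ADJ, false) == 3 (just the rasterized
// triangle), while with includeAdjVerts == true it returns 6 (the triangle
// plus its adjacent vertices, as a geometry shader consumes them).
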
483
484 //////////////////////////////////////////////////////////////////////////
485 /// @brief Generate mask from remaining work.
486 /// @param numWorkItems - Number of items being worked on by a SIMD.
GenerateMask(uint32_t numItemsRemaining)487 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
488 {
489 uint32_t numActive =
490 (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
491 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
492 return _simd_castps_si(_simd_vmask_ps(mask));
493 }
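
// For illustration (not part of the original source): with
// KNOB_SIMD_WIDTH == 8, GenerateMask(3) yields lane mask 0b00000111 (three
// active lanes), and GenerateMask(11) saturates to 0b11111111, a full
// SIMD's worth.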

static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
{
    uint32_t numActive =
        (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd16_castps_si(_simd16_vmask_ps(mask));
}

//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMD's worth of triangles.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
static void StreamOut(
    DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE& soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
    // vertex.
    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = {0};

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();

    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        unsigned long slot = 0;
        uint64_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward64(&slot, soMask))
        {
            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is the relative offset from the start of the vertex.
            // Each slot holds one 4-component float attribute, so the dword offset
            // for this attribute is simply slot * 4.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib =
                    pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }

            soMask &= ~(uint64_t(1) << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
                   "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            bool nullTileAccessed = false;
            void* pWriteOffset = pDC->pContext->pfnTranslateGfxptrForWrite(
                GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed, pWorkerData);
            *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    pDC->dynState.soPrims += soContext.numPrimsWritten;

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_END(pDC->pContext->pBucketMgr, FEStreamout, 1);
}

#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// Is value an even number (a multiple of two)
///
template <typename T>
INLINE static bool IsEven(T value)
{
    return (value & 1) == 0;
}

//////////////////////////////////////////////////////////////////////////
/// Round up value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundUpEven(T value)
{
    return (value + 1) & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Round down value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundDownEven(T value)
{
    return value & ~1;
}
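
// For illustration (not part of the original source): IsEven(6) == true,
// RoundUpEven(7) == 8, RoundDownEven(7) == 6; even inputs pass through both
// rounding helpers unchanged.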

//////////////////////////////////////////////////////////////////////////
/// Pack pairs of simdvertexes into simd16vertexes; assumes the source and
/// destination do not overlap
///
/// vertexCount is in terms of the source simdvertexes and must be even
///
/// attribCount will limit the vector copies to those attribs specified
///
/// note: the stride between vertices is determined by SWR_VTX_NUM_SLOTS
///
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex* vertex_simd16,
                                           const simdvertex* vertex,
                                           uint32_t vertexCount,
                                           uint32_t attribCount)
{
    SWR_ASSERT(vertex);
    SWR_ASSERT(vertex_simd16);
    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);

    simd16vertex temp;

    for (uint32_t i = 0; i < vertexCount; i += 2)
    {
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            for (uint32_t k = 0; k < 4; k += 1)
            {
                temp.attrib[j][k] =
                    _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);

                if ((i + 1) < vertexCount)
                {
                    temp.attrib[j][k] =
                        _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                }
            }
        }

        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
        }
    }
}

#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
///        the start of the SIMD. The max index represents how many work
///        items are remaining. If there is less than a SIMD's worth of work
///        then return the remaining amount of work.
/// @param curIndex - The start index for the SIMD.
/// @param maxIndex - The last index for all work items.
static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex)
{
    uint32_t remainder = (maxIndex - curIndex);
#if USE_SIMD16_FRONTEND
    return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
#else
    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
#endif
}
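
// For illustration (not part of the original source): GetNumInvocations(16, 19)
// returns 3, the tail of a partial SIMD batch; once the remainder reaches the
// SIMD width it is clamped to one full batch.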

//////////////////////////////////////////////////////////////////////////
/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
///        The geometry shader will loop over each active streamout buffer, assembling
///        primitives for the downstream stages. When multistream output is enabled,
///        the generated stream ID buffer from the GS needs to be converted to a cut
///        buffer for the primitive assembler.
/// @param stream - stream id to generate the cut buffer for
/// @param pStreamIdBase - pointer to the stream ID buffer
/// @param numEmittedVerts - Number of total verts emitted by the GS
/// @param pCutBuffer - output buffer to write cuts to
void ProcessStreamIdBuffer(uint32_t stream,
                           uint8_t* pStreamIdBase,
                           uint32_t numEmittedVerts,
                           uint8_t* pCutBuffer)
{
    SWR_ASSERT(stream < MAX_SO_STREAMS);

    uint32_t numOutputBytes = AlignUp(numEmittedVerts, 8) / 8;

    for (uint32_t b = 0; b < numOutputBytes; ++b)
    {
        uint8_t curInputByte = pStreamIdBase[2 * b];
        uint8_t outByte = 0;
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << i);
            }
            curInputByte >>= 2;
        }

        curInputByte = pStreamIdBase[2 * b + 1];
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << (i + 4));
            }
            curInputByte >>= 2;
        }

        *pCutBuffer++ = outByte;
    }
}
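
// Worked example (not part of the original source): each input byte packs
// four 2-bit stream IDs, LSB first, so two input bytes cover the eight
// vertices of one output byte. For stream 0 and an input byte of 0b00011100
// (vertex IDs 0, 3, 1, 0), the vertices with IDs 3 and 1 belong to other
// streams, so bits 1 and 2 are set: a low nibble of 0b0110 marking those
// vertices as cuts.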

// Buffers that are allocated if GS is enabled
struct GsBuffers
{
    uint8_t* pGsIn;                   // assembled SIMD input vertices for the GS
    uint8_t* pGsOut[KNOB_SIMD_WIDTH]; // per-lane GS output streams
    uint8_t* pGsTransposed;           // GS output transposed for the primitive assembler
    void* pStreamCutBuffer;           // scratch cut buffer for multi-stream GS output
};

//////////////////////////////////////////////////////////////////////////
/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
///               assembler
/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
/// @param numVerts - Number of vertices output by the GS
/// @param numAttribs - Number of attributes per vertex
template <typename SIMD_T, uint32_t SimdWidth>
void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
{
    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
    uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;

    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

    for (uint32_t i = 0; i < SimdWidth; ++i)
    {
        gatherOffsets[i] = srcVertexStride * i;
    }
    auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);

    uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
    uint32_t remainingVerts = numVerts;

    for (uint32_t s = 0; s < numSimd; ++s)
    {
        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
        uint8_t* pDstBase = pDst + s * dstVertexStride;

        // Compute mask to prevent src overflow
        uint32_t mask = std::min(remainingVerts, SimdWidth);
        mask = GenMask(mask);
        auto vMask = SIMD_T::vmask_ps(mask);
        auto viMask = SIMD_T::castps_si(vMask);

        for (uint32_t a = 0; a < numAttribs; ++a)
        {
            auto attribGatherX = SIMD_T::mask_i32gather_ps(
                SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
            auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                                           (const float*)(pSrcBase + sizeof(float)),
                                                           vGatherOffsets,
                                                           vMask);
            auto attribGatherZ =
                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                          (const float*)(pSrcBase + sizeof(float) * 2),
                                          vGatherOffsets,
                                          vMask);
            auto attribGatherW =
                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                          (const float*)(pSrcBase + sizeof(float) * 3),
                                          vGatherOffsets,
                                          vMask);

            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
            SIMD_T::maskstore_ps(
                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
            SIMD_T::maskstore_ps(
                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);

            pSrcBase += sizeof(float) * 4;
            pDstBase += sizeof(Float<SIMD_T>) * 4;
        }
        remainingVerts -= SimdWidth;
    }
}
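
// Layout sketch (not part of the original source): each gather above pulls
// one component (x, y, z or w) from SimdWidth consecutive source vertices
// into a single SIMD register, and the masked stores write it out
// contiguously so the primitive assembler sees one component-vector per
// attribute per batch. The mask keeps the final partial batch from reading
// or writing past numVerts.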

//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsBuffers - GS input/output buffers
template <typename HasStreamOutT, typename HasRastT>
static void GeometryShaderStage(DRAW_CONTEXT* pDC,
                                uint32_t workerId,
                                PA_STATE& pa,
                                GsBuffers* pGsBuffers,
                                uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
                                uint32_t numPrims_simd8,
#endif
                                simdscalari const& primID)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEGeometryShader, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;
    SWR_GS_CONTEXT gsContext;

    static uint8_t sNullBuffer[128] = {0};

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
    }
    gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
    gsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitive
    gsContext.inputVertStride = pState->inputVertStride;
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribOffset = slot + pState->vertexAttribOffset;
        pa.Assemble(attribOffset, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            gsContext.pVerts[attribOffset + pState->inputVertStride * i] = attrib[i];
        }
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
#if USE_SIMD16_FRONTEND
    uint32_t numInputPrims = numPrims_simd8;
#else
    uint32_t numInputPrims = pa.NumPrims();
#endif

    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        gsContext.InstanceID = instance;
        gsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
        AR_EVENT(GSStats((HANDLE)&gsContext.stats));

        for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
        {
            gsContext.pStreams[i] += pState->allocationSize;
        }
    }

    // set up new binner and state for the GS output topology
#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_RECT_LIST:
            pfnClipFunc = ClipRectangles_simd16;
            break;
        case TOP_TRIANGLE_STRIP:
            pfnClipFunc = ClipTriangles_simd16;
            break;
        case TOP_LINE_STRIP:
            pfnClipFunc = ClipLines_simd16;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints_simd16;
            break;
        default:
            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_RECT_LIST:
            pfnClipFunc = ClipRectangles;
            break;
        case TOP_TRIANGLE_STRIP:
            pfnClipFunc = ClipTriangles;
            break;
        case TOP_LINE_STRIP:
            pfnClipFunc = ClipLines;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints;
            break;
        default:
            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#endif
    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];

        // Vertex count is either emitted by shader or static
        uint32_t vertexCount = 0;
        if (pState->staticVertexCount)
        {
            vertexCount = pState->staticVertexCount;
        }
        else
        {
            // If emitted in the shader, it should be stored in the first dword of the output buffer
            vertexCount = *(uint32_t*)pInstanceBase;
        }

        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = vertexCount;
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
            uint8_t* pCutBase =
                pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
            uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;

#if USE_SIMD16_FRONTEND
            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
                                                          pVertexBaseAOS,
                                                          vertexCount,
                                                          pState->outputVertexSize);
#else
            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
                                                        pVertexBaseAOS,
                                                        vertexCount,
                                                        pState->outputVertexSize);
#endif

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream)
                        continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(
                        stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
                    processCutVerts = false;
                }

#if USE_SIMD16_FRONTEND
                PA_STATE_CUT gsPa(pDC,
                                  (uint8_t*)pGsBuffers->pGsTransposed,
                                  numEmittedVerts,
                                  pState->outputVertexSize,
                                  reinterpret_cast<simd16mask*>(pCutBuffer),
                                  numEmittedVerts,
                                  numAttribs,
                                  pState->outputTopology,
                                  processCutVerts,
                                  pa.numVertsPerPrim);

#else
                PA_STATE_CUT gsPa(pDC,
                                  (uint8_t*)pGsBuffers->pGsTransposed,
                                  numEmittedVerts,
                                  pState->outputVertexSize,
                                  pCutBuffer,
                                  numEmittedVerts,
                                  numAttribs,
                                  pState->outputTopology,
                                  processCutVerts,
                                  pa.numVertsPerPrim);

#endif
                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
#if USE_SIMD16_FRONTEND
                        simd16vector attrib_simd16[3];

                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);

#else
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

#endif
                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
#if ENABLE_AVX512_SIMD16
                                gsPa.useAlternateOffset = false;
#endif
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
#if USE_SIMD16_FRONTEND
                                simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simd16scalari vViewportIdx = SIMD16::setzero_si();
                                simd16scalari vRtIdx = SIMD16::setzero_si();
                                SIMD16::Vec4 svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx =
                                        SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                {
                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx =
                                        SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                                    simd16scalari vNumViewports =
                                        SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simd16scalari vClearMask =
                                        SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                                    gsPa.useAlternateOffset = false;
                                    pfnClipFunc(pDC,
                                                gsPa,
                                                workerId,
                                                attrib_simd16,
                                                GenMask(gsPa.NumPrims()),
                                                vPrimId,
                                                vViewportIdx,
                                                vRtIdx);
                                }
#else
                                simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simdscalari vViewportIdx = SIMD::setzero_si();
                                simdscalari vRtIdx = SIMD::setzero_si();
                                SIMD::Vec4 svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx =
                                        SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx =
                                        SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                                    simdscalari vNumViewports =
                                        SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask =
                                        SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                pfnClipFunc(pDC,
                                            gsPa,
                                            workerId,
                                            attrib,
                                            GenMask(gsPa.NumPrims()),
                                            vPrimId,
                                            vViewportIdx,
                                            vRtIdx);
#endif
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
    RDTSC_END(pDC->pContext->pBucketMgr, FEGeometryShader, 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
/// @param state - API state
/// @param vertsPerPrim - number of vertices per input primitive
/// @param pGsBuffers - GS buffer allocations to fill in
template <typename SIMD_T, uint32_t SIMD_WIDTH>
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC,
                                     const API_STATE& state,
                                     uint32_t vertsPerPrim,
                                     GsBuffers* pGsBuffers)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);

    const SWR_GS_STATE& gsState = state.gsState;

    // Allocate storage for vertex inputs
    uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
    pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);

    // Allocate arena space to hold GS output verts
    const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
    }

    // Allocate storage for transposed GS output
    uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
    uint32_t transposedBufferSize =
        numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
    pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);

    // Allocate storage to hold temporary stream->cut buffer, if necessary
    if (state.gsState.isSingleStream)
    {
        pGsBuffers->pStreamCutBuffer = nullptr;
    }
    else
    {
        pGsBuffers->pStreamCutBuffer =
            (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
///        tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;
    void* pTxCtx;
    size_t tsCtxSize;

    uint8_t* pHSOutput;
    size_t hsOutputAllocSize;

    simdscalar* pDSOutput;
    size_t dsOutputAllocSize;
};

THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate tessellation data for this worker thread.
INLINE
static void AllocateTessellationData(SWR_CONTEXT* pContext)
{
    /// @TODO - Don't use thread local storage. Use Worker local storage instead.
    if (gt_pTessellationThreadData == nullptr)
    {
        gt_pTessellationThreadData =
            (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
        memset((void*)gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsBuffers - GS buffers, used when the GS stage runs after tessellation
template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT>
static void TessellationStages(DRAW_CONTEXT* pDC,
                               uint32_t workerId,
                               PA_STATE& pa,
                               GsBuffers* pGsBuffers,
                               uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
                               uint32_t numPrims_simd8,
#endif
                               simdscalari const& primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    SWR_ASSERT(gt_pTessellationThreadData);

    HANDLE tsCtx = TSInitCtx(tsState.domain,
                             tsState.partitioning,
                             tsState.tsOutputTopology,
                             gt_pTessellationThreadData->pTxCtx,
                             gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx =
            AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(tsState.domain,
                          tsState.partitioning,
                          tsState.tsOutputTopology,
                          gt_pTessellationThreadData->pTxCtx,
                          gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST:
            pfnClipFunc = ClipTriangles_simd16;
            break;
        case TOP_LINE_LIST:
            pfnClipFunc = ClipLines_simd16;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints_simd16;
            break;
        default:
            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST:
            pfnClipFunc = ClipTriangles;
            break;
        case TOP_LINE_LIST:
            pfnClipFunc = ClipLines;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints;
            break;
        default:
            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#endif
    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.PrimitiveID = primID;
    hsContext.outputSize = tsState.hsAllocationSize;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // Assemble position separately
    // TESS_TODO: this could be avoided - fix it
    pa.Assemble(VERTEX_POSITION_SLOT, simdattrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        hsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = simdattrib[i];
    }

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = tsState.srcVertexAttribOffset + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i];
        }
    }

    // Allocate HS output storage
    uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize;

    if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize)
    {
        AlignedFree(gt_pTessellationThreadData->pHSOutput);
        gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64);
        gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize;
    }

    hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput;

    // Scribble the HS output storage so stale data is easy to spot
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);

#if USE_SIMD16_FRONTEND
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext);
    RDTSC_END(pDC->pContext->pBucketMgr, FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);
    AR_EVENT(HSStats((HANDLE)&hsContext.stats));

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    for (uint32_t p = 0; p < numPrims; ++p)
    {
        ScalarPatch* pCPout =
            (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p);

        SWR_TESSELLATION_FACTORS tessFactors = hsContext.pCPout[p].tessFactors;

        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = {0};
        RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId);
        TSTessellate(tsCtx, tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations =
            AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
#if USE_SIMD16_FRONTEND
        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) *
                                   tsState.dsAllocationSize; // simd8 -> simd16, padding
#else
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
#endif
        if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput =
                (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = pCPout;
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
#if USE_SIMD16_FRONTEND
        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
#else
        dsContext.vectorStride = requiredDSVectorInvocations;
#endif

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations;
             ++dsContext.vectorOffset)
        {
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext);
            RDTSC_END(pDC->pContext->pBucketMgr, FEDomainShader, 0);

            AR_EVENT(DSStats((HANDLE)&dsContext.stats));

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

#if USE_SIMD16_FRONTEND
        SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16

#endif
        PA_TESS tessPa(
            pDC,
#if USE_SIMD16_FRONTEND
            reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16
            dsContext.vectorStride / 2,                                   // simd8 -> simd16
#else
            dsContext.pOutputData,
            dsContext.vectorStride,
#endif
            SWR_VTX_NUM_SLOTS,
            tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology,
            NumVertsPerPrim(tsState.postDSTopology, false));

        while (tessPa.HasWork())
        {
#if USE_SIMD16_FRONTEND
            const uint32_t numPrims = tessPa.NumPrims();
            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
            const uint32_t numPrims_hi =
                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

            const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
            const simdscalari primID_hi = _simd16_extract_si(primID, 1);

#endif
            if (HasGeometryShaderT::value)
            {
#if USE_SIMD16_FRONTEND
                tessPa.useAlternateOffset = false;
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                if (numPrims_hi)
                {
                    tessPa.useAlternateOffset = true;
                    GeometryShaderStage<HasStreamOutT, HasRastT>(
                        pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                }
#else
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC,
                    workerId,
                    tessPa,
                    pGsBuffers,
                    pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
#endif
            }
            else
            {
                if (HasStreamOutT::value)
                {
#if ENABLE_AVX512_SIMD16
                    tessPa.useAlternateOffset = false;
#endif
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
#if USE_SIMD16_FRONTEND
                    simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
#else
                    simdvector prim[3]; // Only deal with triangles, lines, or points
#endif
                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                    bool assemble =
#if USE_SIMD16_FRONTEND
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
#else
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
#endif
                    RDTSC_END(pDC->pContext->pBucketMgr, FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
                    // Gather data from the SGV if provided.
                    simd16scalari vViewportIdx = SIMD16::setzero_si();
                    simd16scalari vRtIdx = SIMD16::setzero_si();
                    SIMD16::Vec4 svgAttrib[4] = {SIMD16::setzero_ps()};

                    if (state.backendState.readViewportArrayIndex ||
                        state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }

                    {
                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                        simd16scalari vNumViewports =
                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                        tessPa.useAlternateOffset = false;
                        pfnClipFunc(pDC,
                                    tessPa,
                                    workerId,
                                    prim_simd16,
                                    GenMask(numPrims),
                                    primID,
                                    vViewportIdx,
                                    vRtIdx);
                    }
#else
                    // Gather data from the SGV if provided.
                    simdscalari vViewportIdx = SIMD::setzero_si();
                    simdscalari vRtIdx = SIMD::setzero_si();
                    SIMD::Vec4 svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex ||
                        state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                        simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }
                    pfnClipFunc(pDC,
                                tessPa,
                                workerId,
                                prim,
                                GenMask(tessPa.NumPrims()),
                                _simd_set1_epi32(dsContext.PrimitiveID),
                                vViewportIdx,
                                vRtIdx);
#endif
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    }     // for (uint32_t p = 0; p < numPrims; ++p)

#if USE_SIMD16_FRONTEND
    if (gt_pTessellationThreadData->pDSOutput != nullptr)
    {
        AlignedFree(gt_pTessellationThreadData->pDSOutput);
        gt_pTessellationThreadData->pDSOutput = nullptr;
    }
    gt_pTessellationThreadData->dsOutputAllocSize = 0;

#endif
    TSDestroyCtx(tsCtx);
}
1649
1650 THREAD PA_STATE::SIMDVERTEX* gpVertexStore = nullptr;
1651 THREAD uint32_t gVertexStoreSize = 0;
1652
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is the cut index enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <typename IsIndexedT,
          typename IsCutIndexEnabledT,
          typename HasTessellationT,
          typename HasGeometryShaderT,
          typename HasStreamOutT,
          typename HasRastT>
void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessDraw, pDC->drawId);

    void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);

    uint32_t indexSize = 0;
    uint32_t endVertex = work.numVerts;

    gfxptr_t xpLastRequestedIndex = 0;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            break;
        default:
            SWR_INVALID("Invalid work.type: %d", work.type);
        }
        xpLastRequestedIndex = work.xpIB + endVertex * indexSize;
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
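        // e.g. with TRIANGLELIST and work.numVerts == 8, GetNumPrims() yields 2
        // complete triangles, so endVertex is pruned to 6 and the two dangling
        // vertices are never fetched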
    }

#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    GsBuffers gsBuffers;
    if (HasGeometryShaderT::value)
    {
#if USE_SIMD16_FRONTEND
        AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(
            pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#else
        AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(
            pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#endif
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
#if USE_SIMD16_FRONTEND
    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
#else
    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
#endif

    SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);

    // Compute storage requirements for vertex store
    // TODO: allocation needs to be rethought for better cut support
    uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
    uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;

    // grow the vertex store for the PA as necessary
    if (gVertexStoreSize < vertexStoreSize)
    {
        if (gpVertexStore != nullptr)
        {
            AlignedFree(gpVertexStore);
            gpVertexStore = nullptr;
        }

        SWR_ASSERT(gpVertexStore == nullptr);

        gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX*>(AlignedMalloc(vertexStoreSize, 64));
        gVertexStoreSize = vertexStoreSize;

        SWR_ASSERT(gpVertexStore != nullptr);
    }
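
    // Note: the vertex store is thread-local and grow-only, so once a worker
    // has seen the largest simdvertex footprint in a workload, subsequent
    // draws reuse the allocation with no further malloc/free traffic.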

    // choose primitive assembler

    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC,
                                                         state.topology,
                                                         work.numVerts,
                                                         gpVertexStore,
                                                         numVerts,
                                                         state.frontendState.vsVertexSize,
                                                         GetNumVerts(state.topology, 1));
    PA_STATE& pa = paFactory.GetPA();

#if USE_SIMD16_FRONTEND
#if USE_SIMD16_SHADERS
    simd16vertex vin;
#else
    simdvertex vin_lo;
    simdvertex vin_hi;
#endif
    SWR_VS_CONTEXT vsContext_lo;
    SWR_VS_CONTEXT vsContext_hi;

#if USE_SIMD16_SHADERS
    vsContext_lo.pVin = reinterpret_cast<simdvertex*>(&vin);
    vsContext_hi.pVin = reinterpret_cast<simdvertex*>(&vin);
#else
    vsContext_lo.pVin = &vin_lo;
    vsContext_hi.pVin = &vin_hi;
#endif
    vsContext_lo.AlternateOffset = 0;
    vsContext_hi.AlternateOffset = 1;
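
    // The SIMD16 frontend drives two 8-wide shader halves per 16-wide vertex
    // group: vsContext_lo covers lanes 0..7 and vsContext_hi covers lanes
    // 8..15, with AlternateOffset distinguishing which half of the simd16
    // output each 8-wide invocation writes.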

    SWR_FETCH_CONTEXT fetchInfo_lo = {0};

    fetchInfo_lo.pStreams = &state.vertexBuffers[0];
    fetchInfo_lo.StartInstance = work.startInstance;
    fetchInfo_lo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo_lo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches of less than a full SIMD width are masked off
        fetchInfo_lo.xpLastIndex = state.indexBuffer.xpIndices + state.indexBuffer.size;
        if (xpLastRequestedIndex < fetchInfo_lo.xpLastIndex)
        {
            fetchInfo_lo.xpLastIndex = xpLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo_lo.StartVertex = work.startVertex;
    }

    SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;

    const simd16scalari vScale =
        _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
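
    // vScale holds the per-lane offsets 0..15; for non-indexed draws it is
    // added to the base vertex id each iteration to synthesize the 16 linear
    // vertex indices consumed by the fetch shader.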

    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        uint32_t i = 0;

        simd16scalari vIndex;

        if (IsIndexedT::value)
        {
            fetchInfo_lo.xpIndices = work.xpIB;
            fetchInfo_hi.xpIndices =
                fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH
        }
        else
        {
            vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);

            fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);

            int32_t* sysAddr = reinterpret_cast<int32_t*>(&vIndex);
            sysAddr += KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH

            fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), sysAddr);
        }

        fetchInfo_lo.CurInstance = instanceNum;
        fetchInfo_hi.CurInstance = instanceNum;

        vsContext_lo.InstanceID = instanceNum;
        vsContext_hi.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine
            // state. So we need to keep this outside of the (i < endVertex) check.

            simdmask* pvCutIndices_lo = nullptr;
            simdmask* pvCutIndices_hi = nullptr;

            if (IsIndexedT::value)
            {
                // simd16mask <=> simdmask[2]

                pvCutIndices_lo = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[0];
                pvCutIndices_hi = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[1];
            }

            simd16vertex& vout = pa.GetNextVsOutput();

            vsContext_lo.pVout = reinterpret_cast<simdvertex*>(&vout);
            vsContext_hi.pVout = reinterpret_cast<simdvertex*>(&vout);

            if (i < endVertex)
            {
                if (!IsIndexedT::value)
                {
                    fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
                    uint32_t offset;
                    offset = std::min(endVertex - i, (uint32_t)KNOB_SIMD16_WIDTH);
                    offset *= 4; // convert from index count to byte offset
#if USE_SIMD16_SHADERS
                    fetchInfo_lo.xpLastIndex += offset;
#else
                    // offset is in bytes, so the per-half clamps must use the
                    // byte-scaled SIMD widths as well
                    fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t)KNOB_SIMD_WIDTH * 4);
                    uint32_t offset2 =
                        std::min(offset, (uint32_t)KNOB_SIMD16_WIDTH * 4) - KNOB_SIMD_WIDTH * 4;
                    fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
                    fetchInfo_hi.xpLastIndex += offset2;
#endif
                }
                // 1. Execute FS/VS for a single SIMD.
                RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
#if USE_SIMD16_SHADERS
                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin);
#else
                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo);

                if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
                {
                    state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi);
                }
#endif
                RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
#if USE_SIMD16_SHADERS
#if USE_SIMD16_VS
                vsContext_lo.VertexID16 =
                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
                vsContext_lo.VertexID16 =
                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
#else
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
#endif
#else
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_hi.VertexID;
#endif

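                // GenerateMask(n) enables the low min(n, SIMD width) lanes, so
                // lanes past endVertex in the final, partially-filled group
                // are masked off.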
                // Setup active mask for vertex shader.
#if USE_SIMD16_VS
                vsContext_lo.mask16 = GenerateMask16(endVertex - i);
#else
                vsContext_lo.mask = GenerateMask(endVertex - i);
                vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
#endif

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
#if USE_SIMD16_SHADERS
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
#else
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
#endif
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
#if USE_SIMD16_VS
                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
                    AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
#else
                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
                    AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));

                    if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
                    {
                        state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
                        AR_EVENT(VSStats((HANDLE)&vsContext_hi.stats));
                    }
#endif
                    RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];

                RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
                RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            const uint32_t numPrims = pa.NumPrims();
                            const uint32_t numPrims_lo =
                                std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
                            const uint32_t numPrims_hi =
                                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
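                            // the 16-wide primitive group is split into two
                            // 8-wide halves; e.g. numPrims == 11 gives
                            // numPrims_lo == 8 and numPrims_hi == 3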

                            const simd16scalari primID = pa.GetPrimID(work.startPrimID);
                            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
                            const simdscalari primID_hi = _simd16_extract_si(primID, 1);

                            if (HasTessellationT::value)
                            {
                                pa.useAlternateOffset = false;
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC,
                                    workerId,
                                    pa,
                                    &gsBuffers,
                                    pSoPrimData,
                                    numPrims_lo,
                                    primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                        pDC,
                                        workerId,
                                        pa,
                                        &gsBuffers,
                                        pSoPrimData,
                                        numPrims_hi,
                                        primID_hi);
                                }
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                pa.useAlternateOffset = false;
                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
                                                                             workerId,
                                                                             pa,
                                                                             &gsBuffers,
                                                                             pSoPrimData,
                                                                             numPrims_lo,
                                                                             primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
                                                                                 workerId,
                                                                                 pa,
                                                                                 &gsBuffers,
                                                                                 pSoPrimData,
                                                                                 numPrims_hi,
                                                                                 primID_hi);
                                }
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    pa.useAlternateOffset = false;
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
                                    // Gather data from the SGV if provided.
                                    simd16scalari vpai = SIMD16::setzero_si();
                                    simd16scalari rtai = SIMD16::setzero_si();
                                    SIMD16::Vec4 svgAttrib[4];

                                    if (state.backendState.readViewportArrayIndex ||
                                        state.backendState.readRenderTargetArrayIndex)
                                    {
                                        pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                    }

                                    if (state.backendState.readViewportArrayIndex)
                                    {
                                        vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                        pa.viewportArrayActive = true;
                                    }
                                    if (state.backendState.readRenderTargetArrayIndex)
                                    {
                                        rtai =
                                            SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                        pa.rtArrayActive = true;
                                    }

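                                    // Clamp sequence below: max_epi32 forces
                                    // negative indices to zero, cmplt_epi32
                                    // yields an all-ones lane mask for indices
                                    // in [0, KNOB_NUM_VIEWPORTS_SCISSORS), and
                                    // and_si zeroes out-of-bounds lanes.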
                                    {
                                        // OOB VPAI indices => forced to zero.
                                        vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
                                        simd16scalari vNumViewports =
                                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                        simd16scalari vClearMask =
                                            SIMD16::cmplt_epi32(vpai, vNumViewports);
                                        vpai = SIMD16::and_si(vClearMask, vpai);

                                        pa.useAlternateOffset = false;
                                        pDC->pState->pfnProcessPrims_simd16(pDC,
                                                                            pa,
                                                                            workerId,
                                                                            prim_simd16,
                                                                            GenMask(numPrims),
                                                                            primID,
                                                                            vpai,
                                                                            rtai);
                                    }
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
                fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
            }
            else
            {
                vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
            }

            i += KNOB_SIMD16_WIDTH;
        }

        pa.Reset();
    }

#else
    SWR_VS_CONTEXT vsContext;
    SWR_FETCH_CONTEXT fetchInfo = {0};

    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches of less than a full SIMD width are masked off
        // (this path treats gfxptr_t values as host addresses)
        fetchInfo.pLastIndex =
            (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (xpLastRequestedIndex < (gfxptr_t)fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = (const int32_t*)xpLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

    const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine
            // state. So we need to keep this outside of the (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVin = &vout;
            vsContext.pVout = &vout;

            if (i < endVertex)
            {
                // 1. Execute FS/VS for a single SIMD.
                RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
                RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext);
                    RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                    AR_EVENT(VSStats((HANDLE)&vsContext.stats));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there are not enough verts to assemble.
                RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC,
                                    workerId,
                                    pa,
                                    &gsBuffers,
                                    pSoPrimData,
                                    pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC,
                                    workerId,
                                    pa,
                                    &gsBuffers,
                                    pSoPrimData,
                                    pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);

                                    // Gather data from the SGV if provided.
                                    simdscalari vViewportIdx = SIMD::setzero_si();
                                    simdscalari vRtIdx = SIMD::setzero_si();
                                    SIMD::Vec4 svgAttrib[4];

                                    if (state.backendState.readViewportArrayIndex ||
                                        state.backendState.readRenderTargetArrayIndex)
                                    {
                                        pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                    }

                                    if (state.backendState.readViewportArrayIndex)
                                    {
                                        vViewportIdx =
                                            SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                        // OOB VPAI indices => forced to zero.
                                        vViewportIdx =
                                            SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                                        simdscalari vNumViewports =
                                            SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                        simdscalari vClearMask =
                                            SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                        vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                        pa.viewportArrayActive = true;
                                    }
                                    if (state.backendState.readRenderTargetArrayIndex)
                                    {
                                        vRtIdx =
                                            SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                        pa.rtArrayActive = true;
                                    }

                                    pDC->pState->pfnProcessPrims(pDC,
                                                                 pa,
                                                                 workerId,
                                                                 prim,
                                                                 GenMask(pa.NumPrims()),
                                                                 pa.GetPrimID(work.startPrimID),
                                                                 vViewportIdx,
                                                                 vRtIdx);
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo.pIndices =
                    (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }

            i += KNOB_SIMD_WIDTH;
        }
        pa.Reset();
    }

#endif

    RDTSC_END(pContext->pBucketMgr, FEProcessDraw, numPrims * work.numInstances);
}

struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};

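// TemplateArgUnroller converts each runtime bool below into a
// std::true_type / std::false_type template argument and returns the matching
// fully-specialized ProcessDraw instantiation, so the per-draw dispatch cost
// is a single indirect call.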
// Selector for correct templated Draw front-end function
PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
                                    bool IsCutIndexEnabled,
                                    bool HasTessellation,
                                    bool HasGeometryShader,
                                    bool HasStreamOut,
                                    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed,
                                                       IsCutIndexEnabled,
                                                       HasTessellation,
                                                       HasGeometryShader,
                                                       HasStreamOut,
                                                       HasRasterization);
}
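
// Example usage (hypothetical call site; variable names are illustrative):
//
//   PFN_FE_WORK_FUNC pfnFE = GetProcessDrawFunc(
//       true,   // IsIndexed
//       false,  // IsCutIndexEnabled
//       false,  // HasTessellation
//       false,  // HasGeometryShader
//       false,  // HasStreamOut
//       true);  // HasRasterization
//   pfnFE(pContext, pDC, workerId, &drawWork);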