/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file frontend.cpp
 *
 * @brief Implementation for Frontend which handles vertex processing,
 *        primitive assembly, clipping, binning, etc.
 *
 ******************************************************************************/

#include "api.h"
#include "frontend.h"
#include "backend.h"
#include "context.h"
#include "rdtsc_core.h"
#include "utils.h"
#include "threads.h"
#include "pa.h"
#include "clip.h"
#include "tilemgr.h"
#include "tessellator.h"
#include <limits>
#include <iostream>

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrSync.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    BE_WORK work;
    work.type = SYNC;
    work.pfnWork = ProcessSyncBE;

    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    pTileMgr->enqueue(0, 0, &work);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDestroyContext.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    BE_WORK work;
    work.type = SHUTDOWN;
    work.pfnWork = ProcessShutdownBE;

    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    // Enqueue at least 1 work item for each worker thread,
    // accounting for the number of NUMA nodes
    uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;

    for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
    {
        for (uint32_t n = 0; n < numNumaNodes; ++n)
        {
            pTileMgr->enqueue(i, n, &work);
        }
    }
}
86
87 //////////////////////////////////////////////////////////////////////////
88 /// @brief FE handler for SwrClearRenderTarget.
89 /// @param pContext - pointer to SWR context.
90 /// @param pDC - pointer to draw context.
91 /// @param workerId - thread's worker id. Even thread has a unique id.
92 /// @param pUserData - Pointer to user data passed back to clear callback.
93 /// @todo This should go away when we switch this to use compute threading.
ProcessClear(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)94 void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
95 {
96 CLEAR_DESC* pDesc = (CLEAR_DESC*)pUserData;
97 MacroTileMgr* pTileMgr = pDC->pTileMgr;
98
99 // queue a clear to each macro tile
100 // compute macro tile bounds for the specified rect
101 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
102 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
103 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
104 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
105
106 BE_WORK work;
107 work.type = CLEAR;
108 work.pfnWork = ProcessClearBE;
109 work.desc.clear = *pDesc;
110
111 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
112 {
113 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
114 {
115 pTileMgr->enqueue(x, y, &work);
116 }
117 }
118 }

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrStoreTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessStoreTiles, pDC->drawId);
    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;

    // queue a store to each macro tile
    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    // store tiles
    BE_WORK work;
    work.type = STORETILES;
    work.pfnWork = ProcessStoreTilesBE;
    work.desc.storeTiles = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(pContext->pBucketMgr, FEProcessStoreTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext,
                                   DRAW_CONTEXT* pDC,
                                   uint32_t workerId,
                                   void* pUserData)
{
    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC* pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr* pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(pContext->pBucketMgr, FEProcessInvalidateTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for draw.
/// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
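    // Simple topology math: e.g. a triangle strip of N verts yields N - 2 tris,
    // and a tri strip with adjacency yields (N / 2) - 2 tris.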
    switch (mode)
    {
    case TOP_POINT_LIST:
        return numPrims;
    case TOP_TRIANGLE_LIST:
        return numPrims / 3;
    case TOP_TRIANGLE_STRIP:
        return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN:
        return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC:
        return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST:
        return numPrims / 4;
    case TOP_QUAD_STRIP:
        return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP:
        return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST:
        return numPrims / 2;
    case TOP_LINE_LOOP:
        return numPrims;
    case TOP_RECT_LIST:
        return numPrims / 3;
    case TOP_LINE_LIST_ADJ:
        return numPrims / 4;
    case TOP_LISTSTRIP_ADJ:
        return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ:
        return numPrims / 6;
    case TOP_TRI_STRIP_ADJ:
        return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of verts given the number of primitives.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
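    // Inverse of GetNumPrims: compute the vertex count needed to assemble
    // numPrims primitives of the given topology.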
    switch (mode)
    {
    case TOP_POINT_LIST:
        return numPrims;
    case TOP_TRIANGLE_LIST:
        return numPrims * 3;
    case TOP_TRIANGLE_STRIP:
        return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN:
        return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC:
        return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST:
        return numPrims * 4;
    case TOP_QUAD_STRIP:
        return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP:
        return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST:
        return numPrims * 2;
    case TOP_LINE_LOOP:
        return numPrims;
    case TOP_RECT_LIST:
        return numPrims * 3;
    case TOP_LINE_LIST_ADJ:
        return numPrims * 4;
    case TOP_LISTSTRIP_ADJ:
        return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ:
        return numPrims * 6;
    case TOP_TRI_STRIP_ADJ:
        return numPrims ? (numPrims + 2) * 2 : 0;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Return number of verts per primitive.
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_INVALID("Unsupported topology: %d", topology);
        break;
    }

    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ:
            numVerts = 4;
            break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ:
            numVerts = 6;
            break;
        default:
            break;
        }
    }

    return numVerts;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Generate mask from remaining work.
/// @param numItemsRemaining - Number of items still to be worked on by a SIMD.
static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
{
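    // Build an execution mask with only the low numActive lanes enabled; lanes
    // beyond the remaining work are masked off.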
    uint32_t numActive =
        (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd_castps_si(_simd_vmask_ps(mask));
}

static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
{
    uint32_t numActive =
        (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd16_castps_si(_simd16_vmask_ps(mask));
}

//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMD's worth of triangles.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
static void StreamOut(
    DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE& soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
    // vertex.
    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
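    // (SWR_VTX_NUM_SLOTS slots x 4 floats per slot, expressed in dwords)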

    SWR_STREAMOUT_CONTEXT soContext = {0};

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();

    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        unsigned long slot = 0;
        uint64_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward64(&slot, soMask))
        {
            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is a relative offset from the start of the vertex.
            // Note: attributes start at slot 1 in the PA buffer for DX, which would
            // imply (slot - 1) here; GL works slightly differently and needs slot 0,
            // which is why the offset below uses slot directly.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib =
                    pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }

            soMask &= ~(uint64_t(1) << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
                   "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            bool nullTileAccessed = false;
            void* pWriteOffset = pDC->pContext->pfnTranslateGfxptrForWrite(
                GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed, pWorkerData);
            *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    pDC->dynState.soPrims += soContext.numPrimsWritten;

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_END(pDC->pContext->pBucketMgr, FEStreamout, 1);
}

#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// Is value an even number (a multiple of two)
///
template <typename T>
INLINE static bool IsEven(T value)
{
    return (value & 1) == 0;
}

//////////////////////////////////////////////////////////////////////////
/// Round up value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundUpEven(T value)
{
    return (value + 1) & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Round down value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundDownEven(T value)
{
    return value & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
///
/// vertexCount is in terms of the source simdvertexes and must be even
///
/// attribCount will limit the vector copies to those attribs specified
///
/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
///
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex* vertex_simd16,
                                           const simdvertex* vertex,
                                           uint32_t vertexCount,
                                           uint32_t attribCount)
{
    SWR_ASSERT(vertex);
    SWR_ASSERT(vertex_simd16);
    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);

    simd16vertex temp;

    for (uint32_t i = 0; i < vertexCount; i += 2)
    {
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            for (uint32_t k = 0; k < 4; k += 1)
            {
                temp.attrib[j][k] =
                    _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);

                if ((i + 1) < vertexCount)
                {
                    temp.attrib[j][k] =
                        _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                }
            }
        }

        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
        }
    }
}

#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
///        the start of the SIMD. The max index represents how many work
///        items remain. If there is less than a SIMD's worth of work,
///        return the remaining amount of work.
/// @param curIndex - The start index for the SIMD.
/// @param maxIndex - The last index for all work items.
static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex)
{
    uint32_t remainder = (maxIndex - curIndex);
#if USE_SIMD16_FRONTEND
    return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
#else
    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
#endif
}

//////////////////////////////////////////////////////////////////////////
/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
///        The geometry shader will loop over each active streamout buffer, assembling
///        primitives for the downstream stages. When multistream output is enabled,
///        the generated stream ID buffer from the GS needs to be converted to a cut
///        buffer for the primitive assembler.
/// @param stream - stream id to generate the cut buffer for
/// @param pStreamIdBase - pointer to the stream ID buffer
/// @param numEmittedVerts - Number of total verts emitted by the GS
/// @param pCutBuffer - output buffer to write cuts to
void ProcessStreamIdBuffer(uint32_t stream,
                           uint8_t* pStreamIdBase,
                           uint32_t numEmittedVerts,
                           uint8_t* pCutBuffer)
{
    SWR_ASSERT(stream < MAX_SO_STREAMS);

    uint32_t numOutputBytes = AlignUp(numEmittedVerts, 8) / 8;

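    // Each emitted vert carries a 2-bit stream ID, so one input byte holds the IDs
    // of 4 verts and each output (cut) byte covers 8 verts -- two input bytes are
    // consumed per output byte. A set bit marks a vert that does NOT belong to
    // 'stream' (i.e. a cut).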
    for (uint32_t b = 0; b < numOutputBytes; ++b)
    {
        uint8_t curInputByte = pStreamIdBase[2 * b];
        uint8_t outByte = 0;
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << i);
            }
            curInputByte >>= 2;
        }

        curInputByte = pStreamIdBase[2 * b + 1];
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << (i + 4));
            }
            curInputByte >>= 2;
        }

        *pCutBuffer++ = outByte;
    }
}

// Buffers that are allocated if GS is enabled
struct GsBuffers
{
    uint8_t* pGsIn;
    uint8_t* pGsOut[KNOB_SIMD_WIDTH];
    uint8_t* pGsTransposed;
    void* pStreamCutBuffer;
};

//////////////////////////////////////////////////////////////////////////
/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
///               assembler
/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
/// @param numVerts - Number of vertices output by the GS
/// @param numAttribs - Number of attributes per vertex
template <typename SIMD_T, uint32_t SimdWidth>
void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
{
    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
    uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;

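    // Per-lane gather offsets: lane i reads from the start of source vertex i.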
    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

    for (uint32_t i = 0; i < SimdWidth; ++i)
    {
        gatherOffsets[i] = srcVertexStride * i;
    }
    auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);

    uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
    uint32_t remainingVerts = numVerts;

    for (uint32_t s = 0; s < numSimd; ++s)
    {
        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
        uint8_t* pDstBase = pDst + s * dstVertexStride;

        // Compute mask to prevent src overflow
        uint32_t mask = std::min(remainingVerts, SimdWidth);
        mask = GenMask(mask);
        auto vMask = SIMD_T::vmask_ps(mask);
        auto viMask = SIMD_T::castps_si(vMask);

        for (uint32_t a = 0; a < numAttribs; ++a)
        {
            auto attribGatherX = SIMD_T::mask_i32gather_ps(
                SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
            auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                                           (const float*)(pSrcBase + sizeof(float)),
                                                           vGatherOffsets,
                                                           vMask);
            auto attribGatherZ =
                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                          (const float*)(pSrcBase + sizeof(float) * 2),
                                          vGatherOffsets,
                                          vMask);
            auto attribGatherW =
                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                          (const float*)(pSrcBase + sizeof(float) * 3),
                                          vGatherOffsets,
                                          vMask);

            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
            SIMD_T::maskstore_ps(
                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
            SIMD_T::maskstore_ps(
                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);

            pSrcBase += sizeof(float) * 4;
            pDstBase += sizeof(Float<SIMD_T>) * 4;
        }
        remainingVerts -= SimdWidth;
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
template <typename HasStreamOutT, typename HasRastT>
static void GeometryShaderStage(DRAW_CONTEXT* pDC,
                                uint32_t workerId,
                                PA_STATE& pa,
                                GsBuffers* pGsBuffers,
                                uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
                                uint32_t numPrims_simd8,
#endif
                                simdscalari const& primID)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEGeometryShader, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;
    SWR_GS_CONTEXT gsContext;

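    // Zeroed scratch buffer, used as an empty cut buffer when the GS emits no
    // control (cut) data.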
    static uint8_t sNullBuffer[128] = {0};

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
    }
    gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
    gsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitive
    gsContext.inputVertStride = pState->inputVertStride;
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribOffset = slot + pState->vertexAttribOffset;
        pa.Assemble(attribOffset, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            gsContext.pVerts[attribOffset + pState->inputVertStride * i] = attrib[i];
        }
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
#if USE_SIMD16_FRONTEND
    uint32_t numInputPrims = numPrims_simd8;
#else
    uint32_t numInputPrims = pa.NumPrims();
#endif

    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        gsContext.InstanceID = instance;
        gsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
        AR_EVENT(GSStats((HANDLE)&gsContext.stats));

        for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
        {
            gsContext.pStreams[i] += pState->allocationSize;
        }
    }

    // set up new binner and state for the GS output topology
#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_RECT_LIST:
            pfnClipFunc = ClipRectangles_simd16;
            break;
        case TOP_TRIANGLE_STRIP:
            pfnClipFunc = ClipTriangles_simd16;
            break;
        case TOP_LINE_STRIP:
            pfnClipFunc = ClipLines_simd16;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints_simd16;
            break;
        default:
            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_RECT_LIST:
            pfnClipFunc = ClipRectangles;
            break;
        case TOP_TRIANGLE_STRIP:
            pfnClipFunc = ClipTriangles;
            break;
        case TOP_LINE_STRIP:
            pfnClipFunc = ClipLines;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints;
            break;
        default:
            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#endif
    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];

        // Vertex count is either emitted by shader or static
        uint32_t vertexCount = 0;
        if (pState->staticVertexCount)
        {
            vertexCount = pState->staticVertexCount;
        }
        else
        {
            // If emitted in shader, it should be stored in the first dword of the output buffer
            vertexCount = *(uint32_t*)pInstanceBase;
        }

        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = vertexCount;
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
            uint8_t* pCutBase =
                pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
            uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;

#if USE_SIMD16_FRONTEND
            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
                                                          pVertexBaseAOS,
                                                          vertexCount,
                                                          pState->outputVertexSize);
#else
            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
                                                        pVertexBaseAOS,
                                                        vertexCount,
                                                        pState->outputVertexSize);
#endif

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream)
                        continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(
                        stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
                    processCutVerts = false;
                }

#if USE_SIMD16_FRONTEND
                PA_STATE_CUT gsPa(pDC,
                                  (uint8_t*)pGsBuffers->pGsTransposed,
                                  numEmittedVerts,
                                  pState->outputVertexSize,
                                  reinterpret_cast<simd16mask*>(pCutBuffer),
                                  numEmittedVerts,
                                  numAttribs,
                                  pState->outputTopology,
                                  processCutVerts,
                                  pa.numVertsPerPrim);

#else
                PA_STATE_CUT gsPa(pDC,
                                  (uint8_t*)pGsBuffers->pGsTransposed,
                                  numEmittedVerts,
                                  pState->outputVertexSize,
                                  pCutBuffer,
                                  numEmittedVerts,
                                  numAttribs,
                                  pState->outputTopology,
                                  processCutVerts,
                                  pa.numVertsPerPrim);

#endif
                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
#if USE_SIMD16_FRONTEND
                        simd16vector attrib_simd16[3];

                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);

#else
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

#endif
                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
#if ENABLE_AVX512_SIMD16
                                gsPa.useAlternateOffset = false;
#endif
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
#if USE_SIMD16_FRONTEND
                                simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simd16scalari vViewportIdx = SIMD16::setzero_si();
                                simd16scalari vRtIdx = SIMD16::setzero_si();
                                SIMD16::Vec4 svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx =
                                        SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                {
                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx =
                                        SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                                    simd16scalari vNumViewports =
                                        SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simd16scalari vClearMask =
                                        SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                                    gsPa.useAlternateOffset = false;
                                    pfnClipFunc(pDC,
                                                gsPa,
                                                workerId,
                                                attrib_simd16,
                                                GenMask(gsPa.NumPrims()),
                                                vPrimId,
                                                vViewportIdx,
                                                vRtIdx);
                                }
#else
                                simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simdscalari vViewportIdx = SIMD::setzero_si();
                                simdscalari vRtIdx = SIMD::setzero_si();
                                SIMD::Vec4 svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx =
                                        SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx =
                                        SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                                    simdscalari vNumViewports =
                                        SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask =
                                        SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                pfnClipFunc(pDC,
                                            gsPa,
                                            workerId,
                                            attrib,
                                            GenMask(gsPa.NumPrims()),
                                            vPrimId,
                                            vViewportIdx,
                                            vRtIdx);
#endif
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
    RDTSC_END(pDC->pContext->pBucketMgr, FEGeometryShader, 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
/// @param state - API state
/// @param ppGsOut - pointer to GS output buffer allocation
/// @param ppCutBuffer - pointer to GS output cut buffer allocation
template <typename SIMD_T, uint32_t SIMD_WIDTH>
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC,
                                     const API_STATE& state,
                                     uint32_t vertsPerPrim,
                                     GsBuffers* pGsBuffers)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);

    const SWR_GS_STATE& gsState = state.gsState;

    // Allocate storage for vertex inputs
    uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
    pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);

    // Allocate arena space to hold GS output verts
    const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
    }

    // Allocate storage for transposed GS output
    uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
    uint32_t transposedBufferSize =
        numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
    pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);

    // Allocate storage to hold temporary stream->cut buffer, if necessary
    if (state.gsState.isSingleStream)
    {
        pGsBuffers->pStreamCutBuffer = nullptr;
    }
    else
    {
        pGsBuffers->pStreamCutBuffer =
            (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
///        tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;
    void* pTxCtx;
    size_t tsCtxSize;

    uint8_t* pHSOutput;
    size_t hsOutputAllocSize;

    simdscalar* pDSOutput;
    size_t dsOutputAllocSize;
};

THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate tessellation data for this worker thread.
INLINE
static void AllocateTessellationData(SWR_CONTEXT* pContext)
{
    /// @TODO - Don't use thread local storage. Use Worker local storage instead.
    if (gt_pTessellationThreadData == nullptr)
    {
        gt_pTessellationThreadData =
            (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
        memset((void*)gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT>
static void TessellationStages(DRAW_CONTEXT* pDC,
                               uint32_t workerId,
                               PA_STATE& pa,
                               GsBuffers* pGsBuffers,
                               uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
                               uint32_t numPrims_simd8,
#endif
                               simdscalari const& primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    SWR_ASSERT(gt_pTessellationThreadData);

    HANDLE tsCtx = TSInitCtx(tsState.domain,
                             tsState.partitioning,
                             tsState.tsOutputTopology,
                             gt_pTessellationThreadData->pTxCtx,
                             gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx =
            AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(tsState.domain,
                          tsState.partitioning,
                          tsState.tsOutputTopology,
                          gt_pTessellationThreadData->pTxCtx,
                          gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST:
            pfnClipFunc = ClipTriangles_simd16;
            break;
        case TOP_LINE_LIST:
            pfnClipFunc = ClipLines_simd16;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints_simd16;
            break;
        default:
            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST:
            pfnClipFunc = ClipTriangles;
            break;
        case TOP_LINE_LIST:
            pfnClipFunc = ClipLines;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints;
            break;
        default:
            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#endif
    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.PrimitiveID = primID;
    hsContext.outputSize = tsState.hsAllocationSize;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // Assemble position separately
    // TESS_TODO: this could be avoided - fix it
    pa.Assemble(VERTEX_POSITION_SLOT, simdattrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        hsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = simdattrib[i];
    }

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = tsState.srcVertexAttribOffset + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i];
        }
    }

    // Allocate HS output storage
    uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize;

    if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize)
    {
        AlignedFree(gt_pTessellationThreadData->pHSOutput);
        gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64);
        gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize;
    }

    hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput;

#if defined(_DEBUG)
    //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);

#if USE_SIMD16_FRONTEND
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext);
    RDTSC_END(pDC->pContext->pBucketMgr, FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);
    AR_EVENT(HSStats((HANDLE)&hsContext.stats));

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    for (uint32_t p = 0; p < numPrims; ++p)
    {
        ScalarPatch* pCPout =
            (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p);

        SWR_TESSELLATION_FACTORS tessFactors;
        tessFactors = hsContext.pCPout[p].tessFactors;

        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = {0};
        RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId);
        TSTessellate(tsCtx, tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations =
            AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
#if USE_SIMD16_FRONTEND
        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) *
                                   tsState.dsAllocationSize; // simd8 -> simd16, padding
#else
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
#endif
        if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput =
                (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = pCPout;
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
#if USE_SIMD16_FRONTEND
        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
#else
        dsContext.vectorStride = requiredDSVectorInvocations;
#endif

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations;
             ++dsContext.vectorOffset)
        {
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext);
            RDTSC_END(pDC->pContext->pBucketMgr, FEDomainShader, 0);

            AR_EVENT(DSStats((HANDLE)&dsContext.stats));

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

#if USE_SIMD16_FRONTEND
        SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16

#endif
        PA_TESS tessPa(
            pDC,
#if USE_SIMD16_FRONTEND
            reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16
            dsContext.vectorStride / 2,                                   // simd8 -> simd16
#else
            dsContext.pOutputData,
            dsContext.vectorStride,
#endif
            SWR_VTX_NUM_SLOTS,
            tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology,
            NumVertsPerPrim(tsState.postDSTopology, false));

        while (tessPa.HasWork())
        {
#if USE_SIMD16_FRONTEND
            const uint32_t numPrims = tessPa.NumPrims();
            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
            const uint32_t numPrims_hi =
                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

            const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
            const simdscalari primID_hi = _simd16_extract_si(primID, 1);

#endif
            if (HasGeometryShaderT::value)
            {
#if USE_SIMD16_FRONTEND
                tessPa.useAlternateOffset = false;
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                if (numPrims_hi)
                {
                    tessPa.useAlternateOffset = true;
                    GeometryShaderStage<HasStreamOutT, HasRastT>(
                        pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                }
#else
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC,
                    workerId,
                    tessPa,
                    pGsBuffers,
                    pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
#endif
            }
            else
            {
                if (HasStreamOutT::value)
                {
#if ENABLE_AVX512_SIMD16
                    tessPa.useAlternateOffset = false;
#endif
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
#if USE_SIMD16_FRONTEND
                    simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
#else
                    simdvector prim[3]; // Only deal with triangles, lines, or points
#endif
                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                    bool assemble =
#if USE_SIMD16_FRONTEND
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
#else
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
#endif
                    RDTSC_END(pDC->pContext->pBucketMgr, FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
                    // Gather data from the SGV if provided.
                    simd16scalari vViewportIdx = SIMD16::setzero_si();
                    simd16scalari vRtIdx = SIMD16::setzero_si();
                    SIMD16::Vec4 svgAttrib[4] = {SIMD16::setzero_ps()};

                    if (state.backendState.readViewportArrayIndex ||
                        state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }


                    {
                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                        simd16scalari vNumViewports =
                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                        tessPa.useAlternateOffset = false;
                        pfnClipFunc(pDC,
                                    tessPa,
                                    workerId,
                                    prim_simd16,
                                    GenMask(numPrims),
                                    primID,
                                    vViewportIdx,
                                    vRtIdx);
                    }
#else
                    // Gather data from the SGV if provided.
                    simdscalari vViewportIdx = SIMD::setzero_si();
                    simdscalari vRtIdx = SIMD::setzero_si();
                    SIMD::Vec4 svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex ||
                        state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                        simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }
                    pfnClipFunc(pDC,
                                tessPa,
                                workerId,
                                prim,
                                GenMask(tessPa.NumPrims()),
                                _simd_set1_epi32(dsContext.PrimitiveID),
                                vViewportIdx,
                                vRtIdx);
#endif
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

#if USE_SIMD16_FRONTEND
    if (gt_pTessellationThreadData->pDSOutput != nullptr)
    {
        AlignedFree(gt_pTessellationThreadData->pDSOutput);
        gt_pTessellationThreadData->pDSOutput = nullptr;
    }
    gt_pTessellationThreadData->dsOutputAllocSize = 0;

#endif
    TSDestroyCtx(tsCtx);
}

THREAD PA_STATE::SIMDVERTEX* gpVertexStore = nullptr;
THREAD uint32_t gVertexStoreSize = 0;

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is cut-index (primitive restart) handling enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <typename IsIndexedT,
          typename IsCutIndexEnabledT,
          typename HasTessellationT,
          typename HasGeometryShaderT,
          typename HasStreamOutT,
          typename HasRastT>
void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessDraw, pDC->drawId);

    void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);

    uint32_t indexSize = 0;
    uint32_t endVertex = work.numVerts;

    gfxptr_t xpLastRequestedIndex = 0;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            break;
        default:
            SWR_INVALID("Invalid work.type: %d", work.type);
        }
        xpLastRequestedIndex = work.xpIB + endVertex * indexSize;
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
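        // Example: a triangle list with work.numVerts == 8 has
        // GetNumPrims() == 2 complete triangles, so endVertex becomes
        // GetNumVerts(topology, 2) == 6 and the trailing partial
        // primitive's vertices are never fetched.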
    }

#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    GsBuffers gsBuffers;
    if (HasGeometryShaderT::value)
    {
#if USE_SIMD16_FRONTEND
        AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(
            pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#else
        AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(
            pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#endif
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
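        // Note: arena allocations are presumed released when the draw
        // context retires, so no explicit free is needed for this buffer.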
    }

    const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
#if USE_SIMD16_FRONTEND
    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
#else
    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
#endif

    SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);

    // Compute storage requirements for vertex store
    // TODO: allocation needs to be rethought for better cut support
    uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
    uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
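    // Example: a triangle topology (vertexCount == 3) with vsVertexSize == 4
    // needs (3 + 2) SIMD-wide vertices of 4 * sizeof(simdvector) bytes each
    // (simd16vector under the SIMD16 front end).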

    // grow the vertex store for the PA as necessary
    if (gVertexStoreSize < vertexStoreSize)
    {
        if (gpVertexStore != nullptr)
        {
            AlignedFree(gpVertexStore);
            gpVertexStore = nullptr;
        }

        SWR_ASSERT(gpVertexStore == nullptr);

        gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX*>(AlignedMalloc(vertexStoreSize, 64));
        gVertexStoreSize = vertexStoreSize;

        SWR_ASSERT(gpVertexStore != nullptr);
    }

    // choose primitive assembler

    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC,
                                                         state.topology,
                                                         work.numVerts,
                                                         gpVertexStore,
                                                         numVerts,
                                                         state.frontendState.vsVertexSize,
                                                         GetNumVerts(state.topology, 1));
    PA_STATE& pa = paFactory.GetPA();
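    // The factory holds both the optimized and the cut-aware assembler paths;
    // GetPA() returns whichever implementation matches this draw's topology
    // and index/cut configuration.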

#if USE_SIMD16_FRONTEND
#if USE_SIMD16_SHADERS
    simd16vertex vin;
#else
    simdvertex vin_lo;
    simdvertex vin_hi;
#endif
    SWR_VS_CONTEXT vsContext_lo;
    SWR_VS_CONTEXT vsContext_hi;

#if USE_SIMD16_SHADERS
    vsContext_lo.pVin = reinterpret_cast<simdvertex*>(&vin);
    vsContext_hi.pVin = reinterpret_cast<simdvertex*>(&vin);
#else
    vsContext_lo.pVin = &vin_lo;
    vsContext_hi.pVin = &vin_hi;
#endif
    vsContext_lo.AlternateOffset = 0;
    vsContext_hi.AlternateOffset = 1;
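    // With a simd8 vertex shader, each 16-wide fetch is shaded as two 8-wide
    // halves: vsContext_lo covers lanes 0-7 and vsContext_hi lanes 8-15, with
    // AlternateOffset steering each half into its portion of the simd16
    // output vertex.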

    SWR_FETCH_CONTEXT fetchInfo_lo = {0};

    fetchInfo_lo.pStreams = &state.vertexBuffers[0];
    fetchInfo_lo.StartInstance = work.startInstance;
    fetchInfo_lo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo_lo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo_lo.xpLastIndex = state.indexBuffer.xpIndices + state.indexBuffer.size;
        if (xpLastRequestedIndex < fetchInfo_lo.xpLastIndex)
        {
            fetchInfo_lo.xpLastIndex = xpLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo_lo.StartVertex = work.startVertex;
    }

    SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;

    const simd16scalari vScale =
        _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
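    // vScale holds lane offsets 0..15; it is added to startVertexID below to
    // synthesize per-lane vertex indices for the non-indexed path.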

    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        uint32_t i = 0;

        simd16scalari vIndex;

        if (IsIndexedT::value)
        {
            fetchInfo_lo.xpIndices = work.xpIB;
            fetchInfo_hi.xpIndices =
                fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH
        }
        else
        {
            vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);

            fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);
            // Advance into vIndex by bytes for the upper half; plain pointer
            // arithmetic on &vIndex would step in whole simd16scalari units.
            fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr(
                GetPrivateState(pDC),
                reinterpret_cast<uint8_t*>(&vIndex) +
                    KNOB_SIMD_WIDTH * sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH
        }

        fetchInfo_lo.CurInstance = instanceNum;
        fetchInfo_hi.CurInstance = instanceNum;

        vsContext_lo.InstanceID = instanceNum;
        vsContext_hi.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine
            // state. So we need to keep this outside of (i < endVertex) check.

            simdmask* pvCutIndices_lo = nullptr;
            simdmask* pvCutIndices_hi = nullptr;

            if (IsIndexedT::value)
            {
                // simd16mask <=> simdmask[2]

                pvCutIndices_lo = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[0];
                pvCutIndices_hi = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[1];
            }

            simd16vertex& vout = pa.GetNextVsOutput();

            vsContext_lo.pVout = reinterpret_cast<simdvertex*>(&vout);
            vsContext_hi.pVout = reinterpret_cast<simdvertex*>(&vout);

            if (i < endVertex)
            {
                if (!IsIndexedT::value)
                {
                    fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
                    uint32_t offset;
                    offset = std::min(endVertex - i, (uint32_t)KNOB_SIMD16_WIDTH);
                    offset *= 4; // convert from index count to byte offset
#if USE_SIMD16_SHADERS
                    fetchInfo_lo.xpLastIndex += offset;
#else
                    // Compare in bytes to match 'offset'; the hi half covers only
                    // what extends past the lo simd8 fetch, clamped at zero so the
                    // unsigned subtraction cannot wrap.
                    fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t)(KNOB_SIMD_WIDTH * 4));
                    uint32_t offset2 =
                        (offset > KNOB_SIMD_WIDTH * 4) ? (offset - KNOB_SIMD_WIDTH * 4) : 0;
                    fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
                    fetchInfo_hi.xpLastIndex += offset2;
#endif
                }
                // 1. Execute FS/VS for a single SIMD.
                RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
#if USE_SIMD16_SHADERS
                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin);
#else
                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo);

                if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
                {
                    state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi);
                }
#endif
                RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
#if USE_SIMD16_SHADERS
#if USE_SIMD16_VS
                vsContext_lo.VertexID16 =
                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
                vsContext_lo.VertexID16 =
                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
#else
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
#endif
#else
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_hi.VertexID;
#endif

                // Setup active mask for vertex shader.
#if USE_SIMD16_VS
                vsContext_lo.mask16 = GenerateMask16(endVertex - i);
#else
                vsContext_lo.mask = GenerateMask(endVertex - i);
                vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
#endif

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
#if USE_SIMD16_SHADERS
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
#else
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
#endif
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
#if USE_SIMD16_VS
                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
                    AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
#else
                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
                    AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));

                    if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
                    {
                        state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
                        AR_EVENT(VSStats((HANDLE)&vsContext_hi.stats));
                    }
#endif
                    RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];

                RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
                RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            const uint32_t numPrims = pa.NumPrims();
                            const uint32_t numPrims_lo =
                                std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
                            const uint32_t numPrims_hi =
                                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

                            const simd16scalari primID = pa.GetPrimID(work.startPrimID);
                            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
                            const simdscalari primID_hi = _simd16_extract_si(primID, 1);
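                            // The 16 assembled prims are forwarded to the simd8
                            // downstream stages in two batches: lanes 0-7 first,
                            // then lanes 8-15 via pa.useAlternateOffset.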

                            if (HasTessellationT::value)
                            {
                                pa.useAlternateOffset = false;
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC,
                                    workerId,
                                    pa,
                                    &gsBuffers,
                                    pSoPrimData,
                                    numPrims_lo,
                                    primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                        pDC,
                                        workerId,
                                        pa,
                                        &gsBuffers,
                                        pSoPrimData,
                                        numPrims_hi,
                                        primID_hi);
                                }
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                pa.useAlternateOffset = false;
                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
                                                                             workerId,
                                                                             pa,
                                                                             &gsBuffers,
                                                                             pSoPrimData,
                                                                             numPrims_lo,
                                                                             primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
                                                                                 workerId,
                                                                                 pa,
                                                                                 &gsBuffers,
                                                                                 pSoPrimData,
                                                                                 numPrims_hi,
                                                                                 primID_hi);
                                }
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    pa.useAlternateOffset = false;
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
                                    // Gather data from the SGV if provided.
                                    simd16scalari vpai = SIMD16::setzero_si();
                                    simd16scalari rtai = SIMD16::setzero_si();
                                    SIMD16::Vec4 svgAttrib[4];

                                    if (state.backendState.readViewportArrayIndex ||
                                        state.backendState.readRenderTargetArrayIndex)
                                    {
                                        pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                    }

                                    if (state.backendState.readViewportArrayIndex)
                                    {
                                        vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                        pa.viewportArrayActive = true;
                                    }
                                    if (state.backendState.readRenderTargetArrayIndex)
                                    {
                                        rtai =
                                            SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                        pa.rtArrayActive = true;
                                    }

                                    {
                                        // OOB VPAI indices => forced to zero.
                                        vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
                                        simd16scalari vNumViewports =
                                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                        simd16scalari vClearMask =
                                            SIMD16::cmplt_epi32(vpai, vNumViewports);
                                        vpai = SIMD16::and_si(vClearMask, vpai);

                                        pa.useAlternateOffset = false;
                                        pDC->pState->pfnProcessPrims_simd16(pDC,
                                                                            pa,
                                                                            workerId,
                                                                            prim_simd16,
                                                                            GenMask(numPrims),
                                                                            primID,
                                                                            vpai,
                                                                            rtai);
                                    }
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
                fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
            }
            else
            {
                vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
            }

            i += KNOB_SIMD16_WIDTH;
        }

        pa.Reset();
    }

#else
    SWR_VS_CONTEXT vsContext;
    SWR_FETCH_CONTEXT fetchInfo = {0};

    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex =
            (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        // xpLastRequestedIndex holds a gfx address; this path assumes gfx and
        // CPU addresses coincide, so compare/assign through explicit casts.
        if (xpLastRequestedIndex < (gfxptr_t)fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = (const int32_t*)xpLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

    const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine
            // state. So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVin = &vout;
            vsContext.pVout = &vout;

            if (i < endVertex)
            {
                // 1. Execute FS/VS for a single SIMD.
                RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
                RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext);
                    RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                    AR_EVENT(VSStats((HANDLE)&vsContext.stats));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there are not enough verts to assemble.
                RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC,
                                    workerId,
                                    pa,
                                    &gsBuffers,
                                    pSoPrimData,
                                    pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC,
                                    workerId,
                                    pa,
                                    &gsBuffers,
                                    pSoPrimData,
                                    pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);

                                    // Gather data from the SGV if provided.
                                    simdscalari vViewportIdx = SIMD::setzero_si();
                                    simdscalari vRtIdx = SIMD::setzero_si();
                                    SIMD::Vec4 svgAttrib[4];

                                    if (state.backendState.readViewportArrayIndex ||
                                        state.backendState.readRenderTargetArrayIndex)
                                    {
                                        pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                    }

                                    if (state.backendState.readViewportArrayIndex)
                                    {
                                        vViewportIdx =
                                            SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                        // OOB VPAI indices => forced to zero.
                                        vViewportIdx =
                                            SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                                        simdscalari vNumViewports =
                                            SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                        simdscalari vClearMask =
                                            SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                        vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                        pa.viewportArrayActive = true;
                                    }
                                    if (state.backendState.readRenderTargetArrayIndex)
                                    {
                                        vRtIdx =
                                            SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                        pa.rtArrayActive = true;
                                    }

                                    pDC->pState->pfnProcessPrims(pDC,
                                                                 pa,
                                                                 workerId,
                                                                 prim,
                                                                 GenMask(pa.NumPrims()),
                                                                 pa.GetPrimID(work.startPrimID),
                                                                 vViewportIdx,
                                                                 vRtIdx);
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo.pIndices =
                    (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }

            i += KNOB_SIMD_WIDTH;
        }
        pa.Reset();
    }

#endif

    RDTSC_END(pContext->pBucketMgr, FEProcessDraw, numPrims * work.numInstances);
}

struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};

// Selector for correct templated Draw front-end function
PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
                                    bool IsCutIndexEnabled,
                                    bool HasTessellation,
                                    bool HasGeometryShader,
                                    bool HasStreamOut,
                                    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed,
                                                       IsCutIndexEnabled,
                                                       HasTessellation,
                                                       HasGeometryShader,
                                                       HasStreamOut,
                                                       HasRasterization);
}

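// Illustrative (hypothetical) call site: the unroller maps the runtime
// booleans onto the matching ProcessDraw<> instantiation once per draw, so
// the hot loop pays no per-primitive branching for these modes. A sketch,
// assuming the pipeline-state flags are already in hand:
//
//     PFN_FE_WORK_FUNC pfnFE = GetProcessDrawFunc(
//         isIndexed, cutIndexEnabled, tsEnabled, gsEnabled, soEnabled, rastEnabled);
//     pfnFE(pContext, pDC, workerId, &drawWork);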