/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file frontend.cpp
*
* @brief Implementation for Frontend which handles vertex processing,
*        primitive assembly, clipping, binning, etc.
*
******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42 #include <iostream>
43
//////////////////////////////////////////////////////////////////////////
/// @brief Helper function to generate a bitmask of numBits set bits.
static INLINE uint32_t GenMask(uint32_t numBits)
{
    SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
    return ((1U << numBits) - 1);
}
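
// Illustrative example (comment only, not part of the pipeline):
//   GenMask(0) == 0x0, GenMask(3) == 0b111 (0x7), GenMask(8) == 0xFF.
// Callers use the result as a lane-enable mask for partial SIMD batches.
// Note that numBits == 32 would shift 1U by the full register width, which
// is undefined behavior in C++, so callers are expected to stay below that.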

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrSync.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessSync(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    BE_WORK work;
    work.type = SYNC;
    work.pfnWork = ProcessSyncBE;

    MacroTileMgr *pTileMgr = pDC->pTileMgr;
    pTileMgr->enqueue(0, 0, &work);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDestroyContext.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
void ProcessShutdown(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    BE_WORK work;
    work.type = SHUTDOWN;
    work.pfnWork = ProcessShutdownBE;

    MacroTileMgr *pTileMgr = pDC->pTileMgr;
    // Enqueue at least 1 work item for each worker thread,
    // accounting for the number of NUMA nodes.
    uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;

    for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
    {
        for (uint32_t n = 0; n < numNumaNodes; ++n)
        {
            pTileMgr->enqueue(i, n, &work);
        }
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrClearRenderTarget.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to clear callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessClear(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // Queue a clear to each macro tile.
    // Compute macro tile bounds for the specified rect.
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    BE_WORK work;
    work.type = CLEAR;
    work.pfnWork = ProcessClearBE;
    work.desc.clear = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }
}
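
// Worked example (comment only; the actual tile sizes are build-time knobs):
// assuming KNOB_MACROTILE_X_DIM == KNOB_MACROTILE_Y_DIM == 32, a clear rect
// of (0,0)-(70,40) maps to macro tiles x in [0, (70-1)/32] = [0,2] and
// y in [0, (40-1)/32] = [0,1]. The (max - 1) keeps an exclusive rect edge
// that lands exactly on a tile boundary from enqueueing an extra tile.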

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrStoreTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessStoreTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
    MacroTileMgr *pTileMgr = pDC->pTileMgr;
    STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;

    // Queue a store to each macro tile.
    // Compute macro tile bounds for the specified rect.
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    // store tiles
    BE_WORK work;
    work.type = STORETILES;
    work.pfnWork = ProcessStoreTilesBE;
    work.desc.storeTiles = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    AR_END(FEProcessStoreTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // Compute macro tile bounds for the specified rect; by default only
    // tiles fully covered by the rect are included.
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // discard/invalidate tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    AR_END(FEProcessInvalidateTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for the draw.
/// @todo Frontend needs to be refactored. This will go in the appropriate place then.
uint32_t GetNumPrims(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims / 3;
    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST: return numPrims / 4;
    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST: return numPrims / 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims / 3;
    case TOP_LINE_LIST_ADJ: return numPrims / 4;
    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ: return numPrims / 6;
    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
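
// Illustrative examples (comments only):
//   GetNumPrims(TOP_TRIANGLE_LIST, 7)  == 2  (the trailing partial prim is dropped)
//   GetNumPrims(TOP_TRIANGLE_STRIP, 5) == 3
//   GetNumPrims(TOP_LINE_STRIP, 1)     == 0
//   GetNumPrims(TOP_PATCHLIST_4, 8)    == 2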

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of verts given the number of primitives.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
uint32_t GetNumVerts(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims * 3;
    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST: return numPrims * 4;
    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST: return numPrims * 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims * 3;
    case TOP_LINE_LIST_ADJ: return numPrims * 4;
    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ: return numPrims * 6;
    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
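
// Illustrative examples (comments only): GetNumVerts is the inverse direction
// of GetNumPrims, e.g. GetNumVerts(TOP_TRIANGLE_STRIP, 3) == 5. ProcessDraw
// below composes the two -- GetNumVerts(mode, GetNumPrims(mode, n)) -- to
// prune partial primitives from a non-indexed draw of n vertices.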

//////////////////////////////////////////////////////////////////////////
/// @brief Return number of verts per primitive.
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_INVALID("Unsupported topology: %d", topology);
        break;
    }

    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
        default: break;
        }
    }

    return numVerts;
}
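
// Illustrative examples (comments only):
//   NumVertsPerPrim(TOP_TRIANGLE_LIST, false) == 3
//   NumVertsPerPrim(TOP_TRI_LIST_ADJ, false)  == 3   (adjacency stripped)
//   NumVertsPerPrim(TOP_TRI_LIST_ADJ, true)   == 6   (adjacent verts included)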

//////////////////////////////////////////////////////////////////////////
/// @brief Generate mask from remaining work.
/// @param numItemsRemaining - Number of items being worked on by a SIMD.
static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
{
    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd_castps_si(_simd_vmask_ps(mask));
}

static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
{
    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd16_castps_si(_simd16_vmask_ps(mask));
}
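
// Illustrative example (comment only): with KNOB_SIMD_WIDTH == 8,
// GenerateMask(3) enables lanes 0-2 (scalar mask 0b0000'0111) and
// GenerateMask(11) saturates to all 8 lanes. The vector form is what the
// shader stages consume as an execution mask for a trailing partial batch.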

//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMD's worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pPrimData - scratch buffer that receives the assembled per-prim attributes.
/// @param streamIndex - index of the stream to write out.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEStreamout, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();

    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward(&slot, soMask))
        {
            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // Note that attributes start at slot 1 in the PA buffer. We need to
            // write this to prim data starting at slot 0, which is why we do
            // (slot - 1). Also note: GL works slightly differently and needs slot 0.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }

            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    AR_END(FEStreamout, 1);
}

#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// Is value an even number (a multiple of two)
///
template <typename T>
INLINE static bool IsEven(T value)
{
    return (value & 1) == 0;
}

//////////////////////////////////////////////////////////////////////////
/// Round up value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundUpEven(T value)
{
    return (value + 1) & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Round down value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundDownEven(T value)
{
    return value & ~1;
}
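
// Illustrative examples (comments only): IsEven(6) == true, IsEven(7) == false;
// RoundUpEven(3) == 4, RoundUpEven(4) == 4; RoundDownEven(5) == 4.
// These helpers pad or trim simd8 counts when pairing them into simd16 batches.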

//////////////////////////////////////////////////////////////////////////
/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
///
/// vertexCount is in terms of the source simdvertexes; with an odd count the
/// upper half of the last simd16vertex is left zeroed
///
/// attribCount will limit the vector copies to those attribs specified
///
/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
///
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
{
    SWR_ASSERT(vertex);
    SWR_ASSERT(vertex_simd16);
    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);

    simd16vertex temp;

    for (uint32_t i = 0; i < vertexCount; i += 2)
    {
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            for (uint32_t k = 0; k < 4; k += 1)
            {
                temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);

                if ((i + 1) < vertexCount)
                {
                    temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                }
            }
        }

        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
        }
    }
}
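
// Illustrative example (comment only, assuming an 8-wide simdvertex): packing
// vertex[0] and vertex[1] produces vertex_simd16[0], where lanes 0-7 of each
// attribute component come from vertex[0] and lanes 8-15 from vertex[1].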

#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
///        the start of the SIMD. The max index represents how many work
///        items remain. If less than a SIMD's worth of work remains,
///        return the remaining amount of work.
/// @param curIndex - The start index for the SIMD.
/// @param maxIndex - The last index for all work items.
static INLINE uint32_t GetNumInvocations(
    uint32_t curIndex,
    uint32_t maxIndex)
{
    uint32_t remainder = (maxIndex - curIndex);
#if USE_SIMD16_FRONTEND
    return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
#else
    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
#endif
}
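
// Illustrative example (comment only): with an 8-wide SIMD, curIndex == 24 and
// maxIndex == 29 yields 5 invocations for the final, partially filled batch;
// earlier batches return the full SIMD width.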

//////////////////////////////////////////////////////////////////////////
/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
///        The geometry shader will loop over each active streamout buffer, assembling
///        primitives for the downstream stages. When multistream output is enabled,
///        the generated stream ID buffer from the GS needs to be converted to a cut
///        buffer for the primitive assembler.
/// @param stream - stream id to generate the cut buffer for
/// @param pStreamIdBase - pointer to the stream ID buffer
/// @param numEmittedVerts - Number of total verts emitted by the GS
/// @param pCutBuffer - output buffer to write cuts to
void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
{
    SWR_ASSERT(stream < MAX_SO_STREAMS);

    uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
    uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);

    for (uint32_t b = 0; b < numOutputBytes; ++b)
    {
        uint8_t curInputByte = pStreamIdBase[2 * b];
        uint8_t outByte = 0;
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << i);
            }
            curInputByte >>= 2;
        }

        curInputByte = pStreamIdBase[2 * b + 1];
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << (i + 4));
            }
            curInputByte >>= 2;
        }

        *pCutBuffer++ = outByte;
    }
}
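
// Worked example (comment only): stream IDs are packed 2 bits per vertex, so
// one input byte holds 4 vertices and two input bytes fold into one cut byte.
// For stream == 0 and input bytes 0x10 (vertex IDs 0,0,1,0) and 0x03
// (vertex IDs 3,0,0,0), the cut byte is 0b0001'0100: a set bit marks a vertex
// that belongs to a different stream, cutting it from primitive assembly.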

// Buffers that are allocated if GS is enabled
struct GsBuffers
{
    uint8_t* pGsIn;
    uint8_t* pGsOut[KNOB_SIMD_WIDTH];
    uint8_t* pGsTransposed;
    void* pStreamCutBuffer;
};

//////////////////////////////////////////////////////////////////////////
/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler
/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
/// @param numVerts - Number of vertices output by the GS
/// @param numAttribs - Number of attributes per vertex
template<typename SIMD_T, uint32_t SimdWidth>
void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
{
    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
    uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;

    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

    for (uint32_t i = 0; i < SimdWidth; ++i)
    {
        gatherOffsets[i] = srcVertexStride * i;
    }
    auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]);

    uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
    uint32_t remainingVerts = numVerts;

    for (uint32_t s = 0; s < numSimd; ++s)
    {
        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
        uint8_t* pDstBase = pDst + s * dstVertexStride;

        // Compute mask to prevent src overflow
        uint32_t mask = std::min(remainingVerts, SimdWidth);
        mask = GenMask(mask);
        auto vMask = SIMD_T::vmask_ps(mask);
        auto viMask = SIMD_T::castps_si(vMask);

        for (uint32_t a = 0; a < numAttribs; ++a)
        {
            auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
            auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);
            auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);
            auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);

            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW);

            pSrcBase += sizeof(float) * 4;
            pDstBase += sizeof(typename SIMD_T::Float) * 4;
        }
        remainingVerts -= SimdWidth;
    }
}
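
// Illustrative layout example (comment only, assuming an 8-wide SIMD): pSrc
// holds scalar vertices back to back (v0.attr0.xyzw, v0.attr1.xyzw, ...,
// v1.attr0.xyzw, ...). Each masked gather pulls one component (say attr0.x)
// from 8 consecutive vertices into one SIMD register, so pDst receives, per
// attribute, an X register, then Y, Z, and W -- the simdvector layout the
// primitive assembler expects. The mask keeps the final partial batch from
// reading past the end of pSrc.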


//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsBuffers - GS input/output buffers for this draw.
/// @param pSoPrimData - scratch buffer for streamout prim data.
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    GsBuffers* pGsBuffers,
    uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
    uint32_t numPrims_simd8,
#endif
    simdscalari const &primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEGeometryShader, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;
    SWR_GS_CONTEXT gsContext;

    static uint8_t sNullBuffer[128] = { 0 };

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
    }
    gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
    gsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitive
    gsContext.inputVertStride = pState->inputVertStride;
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
        uint32_t attribSlot = pState->vertexAttribOffset + slot;
        pa.Assemble(srcAttribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
    }

    // record valid prims from the frontend to avoid over-binning the newly
    // generated prims from the GS
#if USE_SIMD16_FRONTEND
    uint32_t numInputPrims = numPrims_simd8;
#else
    uint32_t numInputPrims = pa.NumPrims();
#endif

    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        gsContext.InstanceID = instance;
        gsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &gsContext);

        for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
        {
            gsContext.pStreams[i] += pState->allocationSize;
        }
    }

    // set up new binner and state for the GS output topology
#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles_simd16; break;
        case TOP_LINE_STRIP:     pfnClipFunc = ClipLines_simd16;     break;
        case TOP_POINT_LIST:     pfnClipFunc = ClipPoints_simd16;    break;
        default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP:     pfnClipFunc = ClipLines;     break;
        case TOP_POINT_LIST:     pfnClipFunc = ClipPoints;    break;
        default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

#endif
    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];

        // Vertex count is either emitted by shader or static
        uint32_t vertexCount = 0;
        if (pState->staticVertexCount)
        {
            vertexCount = pState->staticVertexCount;
        }
        else
        {
            // If emitted in shader, it should be stored in the first dword of the output buffer
            vertexCount = *(uint32_t*)pInstanceBase;
        }

        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = vertexCount;
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
            uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
            uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;

#if USE_SIMD16_FRONTEND
            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
#else
            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
#endif

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
                    processCutVerts = false;
                }

#if USE_SIMD16_FRONTEND
                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);

#else
                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);

#endif
                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
#if USE_SIMD16_FRONTEND
                        simd16vector attrib_simd16[3];

                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);

#else
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

#endif
                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
#if ENABLE_AVX512_SIMD16
                                gsPa.useAlternateOffset = false;
#endif
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
#if USE_SIMD16_FRONTEND
                                simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SVG if provided.
                                simd16scalari vViewportIdx = SIMD16::setzero_si();
                                simd16scalari vRtIdx = SIMD16::setzero_si();
                                SIMD16::Vec4 svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                {
                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                                    simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                                    gsPa.useAlternateOffset = false;
                                    pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx);
                                }
#else
                                simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SVG if provided.
                                simdscalari vViewportIdx = SIMD8::setzero_si();
                                simdscalari vRtIdx = SIMD8::setzero_si();
                                SIMD8::Vec4 svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si());
                                    simdscalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx);
#endif
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
    AR_END(FEGeometryShader, 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
/// @param state - API state
/// @param vertsPerPrim - number of vertices per input primitive
/// @param pGsBuffers - GS buffer allocations to populate
template<typename SIMD_T, uint32_t SIMD_WIDTH>
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);

    const SWR_GS_STATE& gsState = state.gsState;

    // Allocate storage for vertex inputs
    uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
    pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);

    // Allocate arena space to hold GS output verts
    const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
    }

    // Allocate storage for transposed GS output
    uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
    uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(typename SIMD_T::Vec4);
    pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);

    // Allocate storage to hold temporary stream->cut buffer, if necessary
    if (state.gsState.isSingleStream)
    {
        pGsBuffers->pStreamCutBuffer = nullptr;
    }
    else
    {
        pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
///        tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;
    ScalarPatch patchData[KNOB_SIMD_WIDTH];
    void* pTxCtx;
    size_t tsCtxSize;

    simdscalar* pDSOutput;
    size_t dsOutputAllocSize;
};

THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate tessellation data for this worker thread.
INLINE
static void AllocateTessellationData(SWR_CONTEXT* pContext)
{
    /// @TODO - Don't use thread local storage. Use Worker local storage instead.
    if (gt_pTessellationThreadData == nullptr)
    {
        gt_pTessellationThreadData = (TessellationThreadLocalData*)
            AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
        memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsBuffers - GS buffers, used when the GS stage follows tessellation.
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    GsBuffers* pGsBuffers,
    uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
    uint32_t numPrims_simd8,
#endif
    simdscalari const &primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;

    SWR_ASSERT(gt_pTessellationThreadData);

    // If the cached per-thread context storage is insufficient (or null),
    // TSInitCtx returns null; allocate using the size it reported and retry.
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
        case TOP_LINE_LIST:     pfnClipFunc = ClipLines_simd16;     break;
        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints_simd16;    break;
        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST:     pfnClipFunc = ClipLines;     break;
        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints;    break;
        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

#endif
    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = tsState.vertexAttribOffset + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

#if USE_SIMD16_FRONTEND
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    AR_BEGIN(FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    AR_END(FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        AR_BEGIN(FETessellation, pDC->drawId);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        AR_END(FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
#if USE_SIMD16_FRONTEND
        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.dsAllocationSize; // simd8 -> simd16, padding
#else
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
#endif
        if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
#if USE_SIMD16_FRONTEND
        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
#else
        dsContext.vectorStride = requiredDSVectorInvocations;
#endif

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            AR_BEGIN(FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            AR_END(FEDomainShader, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

#if USE_SIMD16_FRONTEND
        SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16

#endif
        PA_TESS tessPa(
            pDC,
#if USE_SIMD16_FRONTEND
            reinterpret_cast<const simd16scalar *>(dsContext.pOutputData), // simd8 -> simd16
            dsContext.vectorStride / 2,                                    // simd8 -> simd16
#else
            dsContext.pOutputData,
            dsContext.vectorStride,
#endif
            SWR_VTX_NUM_SLOTS,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology,
            numVertsPerPrim);

        while (tessPa.HasWork())
        {
#if USE_SIMD16_FRONTEND
            const uint32_t numPrims = tessPa.NumPrims();
            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

            const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
            const simdscalari primID_hi = _simd16_extract_si(primID, 1);

#endif
            if (HasGeometryShaderT::value)
            {
#if USE_SIMD16_FRONTEND
                tessPa.useAlternateOffset = false;
                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                if (numPrims_hi)
                {
                    tessPa.useAlternateOffset = true;
                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                }
#else
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID));
#endif
            }
            else
            {
                if (HasStreamOutT::value)
                {
#if ENABLE_AVX512_SIMD16
                    tessPa.useAlternateOffset = false;
#endif
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
#if USE_SIMD16_FRONTEND
                    simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
#else
                    simdvector prim[3]; // Only deal with triangles, lines, or points
#endif
                    AR_BEGIN(FEPAAssemble, pDC->drawId);
                    bool assemble =
#if USE_SIMD16_FRONTEND
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
#else
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
#endif
                    AR_END(FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
                    // Gather data from the SVG if provided.
                    simd16scalari vViewportIdx = SIMD16::setzero_si();
                    simd16scalari vRtIdx = SIMD16::setzero_si();
                    SIMD16::Vec4 svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }

                    {
                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                        simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                        tessPa.useAlternateOffset = false;
                        pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, vViewportIdx, vRtIdx);
                    }
#else
                    // Gather data from the SVG if provided.
                    simdscalari vViewportIdx = SIMD8::setzero_si();
                    simdscalari vRtIdx = SIMD8::setzero_si();
                    SIMD8::Vec4 svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si());
                        simdscalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simdscalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), vViewportIdx, vRtIdx);
#endif
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

#if USE_SIMD16_FRONTEND
    if (gt_pTessellationThreadData->pDSOutput != nullptr)
    {
        AlignedFree(gt_pTessellationThreadData->pDSOutput);
        gt_pTessellationThreadData->pDSOutput = nullptr;
    }
    gt_pTessellationThreadData->dsOutputAllocSize = 0;

#endif
    TSDestroyCtx(tsCtx);
}

THREAD PA_STATE::SIMDVERTEX *gpVertexStore = nullptr;
THREAD uint32_t gVertexStoreSize = 0;

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is the cut index enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif
1522
1523 AR_BEGIN(FEProcessDraw, pDC->drawId);
1524
1525 DRAW_WORK& work = *(DRAW_WORK*)pUserData;
1526 const API_STATE& state = GetApiState(pDC);
1527
1528 uint32_t indexSize = 0;
1529 uint32_t endVertex = work.numVerts;
1530
1531 const int32_t* pLastRequestedIndex = nullptr;
1532 if (IsIndexedT::value)
1533 {
1534 switch (work.type)
1535 {
1536 case R32_UINT:
1537 indexSize = sizeof(uint32_t);
1538 pLastRequestedIndex = &(work.pIB[endVertex]);
1539 break;
1540 case R16_UINT:
1541 indexSize = sizeof(uint16_t);
1542 // nasty address offset to last index
1543 pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
1544 break;
1545 case R8_UINT:
1546 indexSize = sizeof(uint8_t);
1547 // nasty address offset to last index
1548 pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
1549 break;
1550 default:
1551 SWR_INVALID("Invalid work.type: %d", work.type);
1552 }
1553 }
1554 else
1555 {
1556 // No cuts, prune partial primitives.
1557 endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1558 }

#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    GsBuffers gsBuffers;
    if (HasGeometryShaderT::value)
    {
#if USE_SIMD16_FRONTEND
        AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#else
        AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#endif
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
#if USE_SIMD16_FRONTEND
    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
#else
    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
#endif

    SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);

    // Compute storage requirements for vertex store
    // TODO: allocation needs to be rethought for better cut support
    uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
    uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
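
    // Sizing example: a triangle list needs vertexCount == 3, so numVerts == 5
    // SIMD-wide slots; with vsVertexSize == 4 attributes that is
    // 5 * 4 * sizeof(simd16vector) bytes under the SIMD16 frontend.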

    // grow the vertex store for the PA as necessary
    if (gVertexStoreSize < vertexStoreSize)
    {
        if (gpVertexStore != nullptr)
        {
            AlignedFree(gpVertexStore);
            gpVertexStore = nullptr;
        }

        SWR_ASSERT(gpVertexStore == nullptr);

        gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64));
        gVertexStoreSize = vertexStoreSize;

        SWR_ASSERT(gpVertexStore != nullptr);
    }

    // choose primitive assembler

    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize, GetNumVerts(state.topology, 1));
    PA_STATE& pa = paFactory.GetPA();

#if USE_SIMD16_FRONTEND
#if USE_SIMD16_SHADERS
    simd16vertex vin;
#else
    simdvertex vin_lo;
    simdvertex vin_hi;
#endif
    SWR_VS_CONTEXT vsContext_lo;
    SWR_VS_CONTEXT vsContext_hi;

#if USE_SIMD16_SHADERS
    vsContext_lo.pVin = reinterpret_cast<simdvertex *>(&vin);
    vsContext_hi.pVin = reinterpret_cast<simdvertex *>(&vin);
#else
    vsContext_lo.pVin = &vin_lo;
    vsContext_hi.pVin = &vin_hi;
#endif
    vsContext_lo.AlternateOffset = 0;
    vsContext_hi.AlternateOffset = 1;
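
    // The lo/hi contexts each shade one simd8 half of the simd16 batch into a
    // shared simd16-wide output; AlternateOffset == 1 indicates the hi half's
    // results belong in the upper lanes.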

    SWR_FETCH_CONTEXT fetchInfo_lo = { 0 };

    fetchInfo_lo.pStreams = &state.vertexBuffers[0];
    fetchInfo_lo.StartInstance = work.startInstance;
    fetchInfo_lo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo_lo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches of less than a full SIMD width will be masked off
        fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
        {
            fetchInfo_lo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo_lo.StartVertex = work.startVertex;
    }

    SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;

    const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        uint32_t i = 0;

        simd16scalari vIndex;

        if (IsIndexedT::value)
        {
            fetchInfo_lo.pIndices = work.pIB;
            fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize); // 1/2 of KNOB_SIMD16_WIDTH
        }
        else
        {
            vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);

            fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
            fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH
        }
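
        // For non-indexed draws the index buffer is synthesized in registers:
        // vIndex holds startVertexID + {0..15} and pIndices points at it, so
        // the fetch shader takes the same path for indexed and auto-generated
        // vertex IDs.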

        fetchInfo_lo.CurInstance = instanceNum;
        fetchInfo_hi.CurInstance = instanceNum;

        vsContext_lo.InstanceID = instanceNum;
        vsContext_hi.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine state,
            // so we need to keep this outside of the (i < endVertex) check.

            simdmask *pvCutIndices_lo = nullptr;
            simdmask *pvCutIndices_hi = nullptr;

            if (IsIndexedT::value)
            {
                // simd16mask <=> simdmask[2]

                pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
                pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
            }

            simd16vertex &vout = pa.GetNextVsOutput();

            vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
            vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);

            if (i < endVertex)
            {
                if (!IsIndexedT::value)
                {
                    fetchInfo_lo.pLastIndex = fetchInfo_lo.pIndices;
                    uint32_t offset;
                    offset = std::min(endVertex - i, (uint32_t) KNOB_SIMD16_WIDTH);
#if USE_SIMD16_SHADERS
                    fetchInfo_lo.pLastIndex += offset;
#else
                    fetchInfo_lo.pLastIndex += std::min(offset, (uint32_t) KNOB_SIMD_WIDTH);
                    // Clamp at zero rather than letting the unsigned subtraction wrap
                    // when fewer than KNOB_SIMD_WIDTH vertices remain; the hi half is
                    // not fetched in that case, but its pLastIndex stays sane.
                    uint32_t offset2 = (offset > (uint32_t) KNOB_SIMD_WIDTH) ? (offset - KNOB_SIMD_WIDTH) : 0;
                    fetchInfo_hi.pLastIndex = fetchInfo_hi.pIndices;
                    fetchInfo_hi.pLastIndex += offset2;
#endif
                }
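                // e.g. with 10 vertices remaining, offset == 10: the lo half is
                // bounded after 8 lanes and the hi half after 2, masking the
                // tail fetch to valid indices only.
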
                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
#if USE_SIMD16_SHADERS
                state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin);
#else
                state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin_lo);

                if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
                {
                    state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_hi, vin_hi);
                }
#endif
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
#if USE_SIMD16_SHADERS
#if USE_SIMD16_VS
                vsContext_lo.VertexID16 = _simd16_insert_si(
                    vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
                vsContext_lo.VertexID16 = _simd16_insert_si(
                    vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
#else
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
#endif
#else
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_hi.VertexID;
#endif

                // Setup active mask for vertex shader.
#if USE_SIMD16_VS
                vsContext_lo.mask16 = GenerateMask16(endVertex - i);
#else
                vsContext_lo.mask = GenerateMask(endVertex - i);
                vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
#endif
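                // e.g. with three vertices remaining, GenerateMask(3) enables
                // only lanes 0..2, so the partial final SIMD is handled safely.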

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
#if USE_SIMD16_SHADERS
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
#else
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
#endif
                }
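                // _simd_movemask_ps packs each lane's cut-flag sign bit into a
                // scalar bitmask the PA uses to restart primitives at cuts.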

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
#if USE_SIMD16_VS
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
#else
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);

                    if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
                    {
                        state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
                    }
#endif
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMDs of vertices.
            do
            {
                simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];

                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            const uint32_t numPrims = pa.NumPrims();
                            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
                            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

                            const simd16scalari primID = pa.GetPrimID(work.startPrimID);
                            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
                            const simdscalari primID_hi = _simd16_extract_si(primID, 1);
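                            // Split the simd16 batch into two simd8 halves: e.g.
                            // numPrims == 11 gives numPrims_lo == 8 and numPrims_hi == 3,
                            // with useAlternateOffset routing downstream stages to the
                            // upper half.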

                            if (HasTessellationT::value)
                            {
                                pa.useAlternateOffset = false;
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                                }
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                pa.useAlternateOffset = false;
                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                                }
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    pa.useAlternateOffset = false;
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
                                    // Gather data from the SVG if provided.
                                    simd16scalari vpai = SIMD16::setzero_si();
                                    simd16scalari rtai = SIMD16::setzero_si();
                                    SIMD16::Vec4 svgAttrib[4];

                                    if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                                    {
                                        pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                    }

                                    if (state.backendState.readViewportArrayIndex)
                                    {
                                        vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                        pa.viewportArrayActive = true;
                                    }
                                    if (state.backendState.readRenderTargetArrayIndex)
                                    {
                                        rtai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                        pa.rtArrayActive = true;
                                    }

                                    {
                                        // OOB VPAI indices => forced to zero.
                                        vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
                                        simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                        simd16scalari vClearMask = SIMD16::cmplt_epi32(vpai, vNumViewports);
                                        vpai = SIMD16::and_si(vClearMask, vpai);
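                                        // e.g. a lane whose index is at or above
                                        // KNOB_NUM_VIEWPORTS_SCISSORS fails the
                                        // cmplt test, so the and_si above forces
                                        // that lane back to viewport 0.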

                                        pa.useAlternateOffset = false;
                                        pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, vpai, rtai);
                                    }
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
                fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
            }

            i += KNOB_SIMD16_WIDTH;
        }

        pa.Reset();
    }

#else
    SWR_VS_CONTEXT vsContext;
    SWR_FETCH_CONTEXT fetchInfo = { 0 };

    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches of less than a full SIMD width will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

    const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
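    // _mm256_set_epi32 takes its arguments high-to-low, so lane i of vScale
    // holds i; adding work.startVertexID below yields the per-lane vertex IDs.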

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine state,
            // so we need to keep this outside of the (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVin = &vout;
            vsContext.pVout = &vout;

            if (i < endVertex)
            {
                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo, vout);
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMDs of vertices.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // pa.Assemble() returns false if there are not enough verts to assemble.
                AR_BEGIN(FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                AR_END(FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);

                                    // Gather data from the SVG if provided.
                                    simdscalari vViewportIdx = SIMD8::setzero_si();
                                    simdscalari vRtIdx = SIMD8::setzero_si();
                                    SIMD8::Vec4 svgAttrib[4];

                                    if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
                                    {
                                        pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                    }

                                    if (state.backendState.readViewportArrayIndex)
                                    {
                                        vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                        // OOB VPAI indices => forced to zero.
                                        vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si());
                                        simdscalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                        simdscalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports);
                                        vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx);
                                        pa.viewportArrayActive = true;
                                    }
                                    if (state.backendState.readRenderTargetArrayIndex)
                                    {
                                        vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                        pa.rtArrayActive = true;
                                    }

                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), vViewportIdx, vRtIdx);
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }

            i += KNOB_SIMD_WIDTH;
        }
        pa.Reset();
    }

#endif

    AR_END(FEProcessDraw, numPrims * work.numInstances);
}

struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
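
//////////////////////////////////////////////////////////////////////////
/// FEDrawChooser plugs into TemplateArgUnroller, which converts each runtime
/// bool into std::true_type / std::false_type and finally calls GetFunc(),
/// so every combination of draw features resolves to its own fully
/// specialized ProcessDraw instantiation at compile time.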

// Selector for correct templated Draw front-end function
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
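
// For example, an indexed, rasterized draw with no cut indices, tessellation,
// GS, or stream-out resolves to:
//   ProcessDraw<std::true_type, std::false_type, std::false_type,
//               std::false_type, std::false_type, std::true_type>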