1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32 #include <new>
33
34 #include "core/api.h"
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44 #include "core/utils.h"
45 #include "core/tileset.h"
46
47 #include "common/os.h"
48
/// Rectangle initialized from {0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y}.
/// SetupMacroTileScissors intersects every scissor/viewport rect with it.
static const SWR_RECT g_MaxScissorRect = {0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y};

// Forward declaration; the definition appears later in this file and is
// called from SwrCreateContext.
void SetupDefaultState(SWR_CONTEXT* pContext);
52
GetContext(HANDLE hContext)53 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
54 {
55 return (SWR_CONTEXT*)hContext;
56 }
57
WakeAllThreads(SWR_CONTEXT * pContext)58 void WakeAllThreads(SWR_CONTEXT* pContext)
59 {
60 pContext->FifosNotEmpty.notify_all();
61 }
62
63 //////////////////////////////////////////////////////////////////////////
64 /// @brief Create SWR Context.
65 /// @param pCreateInfo - pointer to creation info.
SwrCreateContext(SWR_CREATECONTEXT_INFO * pCreateInfo)66 HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo)
67 {
68 void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
69 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
70 SWR_CONTEXT* pContext = new (pContextMem) SWR_CONTEXT();
71
72 pContext->privateStateSize = pCreateInfo->privateStateSize;
73
74 // initialize callback functions
75 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
76 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
77 pContext->pfnTranslateGfxptrForRead = pCreateInfo->pfnTranslateGfxptrForRead;
78 pContext->pfnTranslateGfxptrForWrite = pCreateInfo->pfnTranslateGfxptrForWrite;
79 pContext->pfnMakeGfxPtr = pCreateInfo->pfnMakeGfxPtr;
80 pContext->pfnCreateMemoryContext = pCreateInfo->pfnCreateMemoryContext;
81 pContext->pfnDestroyMemoryContext = pCreateInfo->pfnDestroyMemoryContext;
82 pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset;
83 pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats;
84 pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE;
85 pContext->pfnUpdateStreamOut = pCreateInfo->pfnUpdateStreamOut;
86
87
88 pContext->hExternalMemory = pCreateInfo->hExternalMemory;
89
90 pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT;
91 if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0)
92 {
93 pContext->MAX_DRAWS_IN_FLIGHT = pCreateInfo->MAX_DRAWS_IN_FLIGHT;
94 }
95
96 pContext->dcRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
97 pContext->dsRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
98
99 pContext->pMacroTileManagerArray =
100 (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
101 pContext->pDispatchQueueArray =
102 (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
103
104 for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
105 {
106 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
107 new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
108 new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
109
110 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
111 }
112
113 if (pCreateInfo->pThreadInfo)
114 {
115 pContext->threadInfo = *pCreateInfo->pThreadInfo;
116 }
117 else
118 {
119 pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS;
120 pContext->threadInfo.BASE_NUMA_NODE = KNOB_BASE_NUMA_NODE;
121 pContext->threadInfo.BASE_CORE = KNOB_BASE_CORE;
122 pContext->threadInfo.BASE_THREAD = KNOB_BASE_THREAD;
123 pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
124 pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
125 pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE;
126 pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED;
127 }
128
129 if (pCreateInfo->pApiThreadInfo)
130 {
131 pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo;
132 }
133 else
134 {
135 pContext->apiThreadInfo.bindAPIThread0 = true;
136 pContext->apiThreadInfo.numAPIReservedThreads = 1;
137 pContext->apiThreadInfo.numAPIThreadsPerCore = 1;
138 }
139
140 if (pCreateInfo->pWorkerPrivateState)
141 {
142 pContext->workerPrivateState = *pCreateInfo->pWorkerPrivateState;
143 }
144
145 memset((void*)&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
146 memset((void*)&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
147 new (&pContext->WaitLock) std::mutex();
148 new (&pContext->FifosNotEmpty) std::condition_variable();
149
150 CreateThreadPool(pContext, &pContext->threadPool);
151
152 if (pContext->apiThreadInfo.bindAPIThread0)
153 {
154 BindApiThread(pContext, 0);
155 }
156
157 if (pContext->threadInfo.SINGLE_THREADED)
158 {
159 pContext->pSingleThreadLockedTiles = new TileSet();
160 }
161
162 pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
163 pContext->pStats =
164 (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
165
166 #if defined(KNOB_ENABLE_AR)
167 // Setup ArchRast thread contexts which includes +1 for API thread.
168 pContext->pArContext = new HANDLE[pContext->NumWorkerThreads + 1];
169 pContext->pArContext[pContext->NumWorkerThreads] =
170 ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API);
171 #endif
172
173 #if defined(KNOB_ENABLE_RDTSC)
174 pContext->pBucketMgr = new BucketManager(pCreateInfo->contextName);
175 RDTSC_RESET(pContext->pBucketMgr);
176 RDTSC_INIT(pContext->pBucketMgr, 0);
177 #endif
178
179 // Allocate scratch space for workers.
180 ///@note We could lazily allocate this but its rather small amount of memory.
181 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
182 {
183 #if defined(_WIN32)
184 uint32_t numaNode =
185 pContext->threadPool.pThreadData ? pContext->threadPool.pThreadData[i].numaId : 0;
186 pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(GetCurrentProcess(),
187 nullptr,
188 KNOB_WORKER_SCRATCH_SPACE_SIZE,
189 MEM_RESERVE | MEM_COMMIT,
190 PAGE_READWRITE,
191 numaNode);
192 #else
193 pContext->ppScratch[i] =
194 (uint8_t*)AlignedMalloc(KNOB_WORKER_SCRATCH_SPACE_SIZE, KNOB_SIMD_WIDTH * 4);
195 #endif
196
197 #if defined(KNOB_ENABLE_AR)
198 // Initialize worker thread context for ArchRast.
199 pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER);
200
201 SWR_WORKER_DATA* pWorkerData = (SWR_WORKER_DATA*)pContext->threadPool.pThreadData[i].pWorkerPrivateData;
202 pWorkerData->hArContext = pContext->pArContext[i];
203 #endif
204
205
206 }
207
208 #if defined(KNOB_ENABLE_AR)
209 // cache the API thread event manager, for use with sim layer
210 pCreateInfo->hArEventManager = pContext->pArContext[pContext->NumWorkerThreads];
211 #endif
212
213 // State setup AFTER context is fully initialized
214 SetupDefaultState(pContext);
215
216 // initialize hot tile manager
217 pContext->pHotTileMgr = new HotTileMgr();
218
219 // pass pointer to bucket manager back to caller
220 #ifdef KNOB_ENABLE_RDTSC
221 pCreateInfo->pBucketMgr = pContext->pBucketMgr;
222 #endif
223
224 pCreateInfo->contextSaveSize = sizeof(API_STATE);
225
226 StartThreadPool(pContext, &pContext->threadPool);
227
228 return (HANDLE)pContext;
229 }
230
CopyState(DRAW_STATE & dst,const DRAW_STATE & src)231 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
232 {
233 memcpy((void*)&dst.state, (void*)&src.state, sizeof(API_STATE));
234 }
235
236 template <bool IsDraw>
QueueWork(SWR_CONTEXT * pContext)237 void QueueWork(SWR_CONTEXT* pContext)
238 {
239 DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
240 uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
241
242 if (IsDraw)
243 {
244 pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
245 pDC->pTileMgr->initialize();
246 }
247
248 // Each worker thread looks at a DC for both FE and BE work at different times and so we
249 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
250 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
251 // then moved on if all work is done.)
252 pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
253
254 if (IsDraw)
255 {
256 InterlockedIncrement(&pContext->drawsOutstandingFE);
257 }
258
259 _ReadWriteBarrier();
260 {
261 std::unique_lock<std::mutex> lock(pContext->WaitLock);
262 pContext->dcRing.Enqueue();
263 }
264
265 if (pContext->threadInfo.SINGLE_THREADED)
266 {
267 uint32_t mxcsr = SetOptimalVectorCSR();
268
269 if (IsDraw)
270 {
271 uint32_t curDraw[2] = {pContext->pCurDrawContext->drawId,
272 pContext->pCurDrawContext->drawId};
273 WorkOnFifoFE(pContext, 0, curDraw[0]);
274 WorkOnFifoBE(pContext, 0, curDraw[1], *pContext->pSingleThreadLockedTiles, 0, 0);
275 }
276 else
277 {
278 uint32_t curDispatch = pContext->pCurDrawContext->drawId;
279 WorkOnCompute(pContext, 0, curDispatch);
280 }
281
282 // Dequeue the work here, if not already done, since we're single threaded (i.e. no
283 // workers).
284 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0)
285 {
286 }
287
288 // restore csr
289 RestoreVectorCSR(mxcsr);
290 }
291 else
292 {
293 RDTSC_BEGIN(pContext->pBucketMgr, APIDrawWakeAllThreads, pDC->drawId);
294 WakeAllThreads(pContext);
295 RDTSC_END(pContext->pBucketMgr, APIDrawWakeAllThreads, 1);
296 }
297
298 // Set current draw context to NULL so that next state call forces a new draw context to be
299 // created and populated.
300 pContext->pPrevDrawContext = pContext->pCurDrawContext;
301 pContext->pCurDrawContext = nullptr;
302 }
303
QueueDraw(SWR_CONTEXT * pContext)304 INLINE void QueueDraw(SWR_CONTEXT* pContext)
305 {
306 QueueWork<true>(pContext);
307 }
308
QueueDispatch(SWR_CONTEXT * pContext)309 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
310 {
311 QueueWork<false>(pContext);
312 }
313
GetDrawContext(SWR_CONTEXT * pContext,bool isSplitDraw=false)314 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT* pContext, bool isSplitDraw = false)
315 {
316 RDTSC_BEGIN(pContext->pBucketMgr, APIGetDrawContext, 0);
317 // If current draw context is null then need to obtain a new draw context to use from ring.
318 if (pContext->pCurDrawContext == nullptr)
319 {
320 // Need to wait for a free entry.
321 while (pContext->dcRing.IsFull())
322 {
323 _mm_pause();
324 }
325
326 uint64_t curDraw = pContext->dcRing.GetHead();
327 uint32_t dcIndex = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
328
329 if ((pContext->frameCount - pContext->lastFrameChecked) > 2 ||
330 (curDraw - pContext->lastDrawChecked) > 0x10000)
331 {
332 // Take this opportunity to clean-up old arena allocations
333 pContext->cachingArenaAllocator.FreeOldBlocks();
334
335 pContext->lastFrameChecked = pContext->frameCount;
336 pContext->lastDrawChecked = curDraw;
337 }
338
339 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
340 pContext->pCurDrawContext = pCurDrawContext;
341
342 // Assign next available entry in DS ring to this DC.
343 uint32_t dsIndex = pContext->curStateId % pContext->MAX_DRAWS_IN_FLIGHT;
344 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
345
346 // Copy previous state to current state.
347 if (pContext->pPrevDrawContext)
348 {
349 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
350
351 // If we're splitting our draw then we can just use the same state from the previous
352 // draw. In this case, we won't increment the DS ring index so the next non-split
353 // draw can receive the state.
354 if (isSplitDraw == false)
355 {
356 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
357
358 // Should have been cleaned up previously
359 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
360
361 pCurDrawContext->pState->pPrivateState = nullptr;
362
363 pContext->curStateId++; // Progress state ring index forward.
364 }
365 else
366 {
367 // If its a split draw then just copy the state pointer over
368 // since its the same draw.
369 pCurDrawContext->pState = pPrevDrawContext->pState;
370 SWR_ASSERT(pPrevDrawContext->cleanupState == false);
371 }
372 }
373 else
374 {
375 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
376 pContext->curStateId++; // Progress state ring index forward.
377 }
378
379 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
380
381 // Reset dependency
382 pCurDrawContext->dependent = false;
383 pCurDrawContext->dependentFE = false;
384
385 pCurDrawContext->pContext = pContext;
386 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
387
388 pCurDrawContext->doneFE = false;
389 pCurDrawContext->FeLock = 0;
390 pCurDrawContext->threadsDone = 0;
391 pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr;
392
393 pCurDrawContext->dynState.Reset(pContext->NumWorkerThreads);
394
395 // Assign unique drawId for this DC
396 pCurDrawContext->drawId = pContext->dcRing.GetHead();
397
398 pCurDrawContext->cleanupState = true;
399 }
400 else
401 {
402 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
403 }
404
405 RDTSC_END(pContext->pBucketMgr, APIGetDrawContext, 0);
406 return pContext->pCurDrawContext;
407 }
408
GetDrawState(SWR_CONTEXT * pContext)409 API_STATE* GetDrawState(SWR_CONTEXT* pContext)
410 {
411 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
412 SWR_ASSERT(pDC->pState != nullptr);
413
414 return &pDC->pState->state;
415 }
416
SwrDestroyContext(HANDLE hContext)417 void SwrDestroyContext(HANDLE hContext)
418 {
419 SWR_CONTEXT* pContext = GetContext(hContext);
420 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
421
422 pDC->FeWork.type = SHUTDOWN;
423 pDC->FeWork.pfnWork = ProcessShutdown;
424
425 // enqueue
426 QueueDraw(pContext);
427
428 DestroyThreadPool(pContext, &pContext->threadPool);
429
430 // free the fifos
431 for (uint32_t i = 0; i < pContext->MAX_DRAWS_IN_FLIGHT; ++i)
432 {
433 AlignedFree(pContext->dcRing[i].dynState.pStats);
434 delete pContext->dcRing[i].pArena;
435 delete pContext->dsRing[i].pArena;
436 pContext->pMacroTileManagerArray[i].~MacroTileMgr();
437 pContext->pDispatchQueueArray[i].~DispatchQueue();
438 }
439
440 AlignedFree(pContext->pDispatchQueueArray);
441 AlignedFree(pContext->pMacroTileManagerArray);
442
443 // Free scratch space.
444 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
445 {
446 #if defined(_WIN32)
447 VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE);
448 #else
449 AlignedFree(pContext->ppScratch[i]);
450 #endif
451
452 #if defined(KNOB_ENABLE_AR)
453 ArchRast::DestroyThreadContext(pContext->pArContext[i]);
454 #endif
455 }
456
457 #if defined(KNOB_ENABLE_RDTSC)
458 delete pContext->pBucketMgr;
459 #endif
460
461 delete[] pContext->ppScratch;
462 AlignedFree(pContext->pStats);
463
464 delete pContext->pHotTileMgr;
465 delete pContext->pSingleThreadLockedTiles;
466
467 pContext->~SWR_CONTEXT();
468 AlignedFree(GetContext(hContext));
469 }
470
SwrBindApiThread(HANDLE hContext,uint32_t apiThreadId)471 void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId)
472 {
473 SWR_CONTEXT* pContext = GetContext(hContext);
474 BindApiThread(pContext, apiThreadId);
475 }
476
SwrSaveState(HANDLE hContext,void * pOutputStateBlock,size_t memSize)477 void SWR_API SwrSaveState(HANDLE hContext, void* pOutputStateBlock, size_t memSize)
478 {
479 SWR_CONTEXT* pContext = GetContext(hContext);
480 auto pSrc = GetDrawState(pContext);
481 assert(pOutputStateBlock && memSize >= sizeof(*pSrc));
482
483 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
484 }
485
SwrRestoreState(HANDLE hContext,const void * pStateBlock,size_t memSize)486 void SWR_API SwrRestoreState(HANDLE hContext, const void* pStateBlock, size_t memSize)
487 {
488 SWR_CONTEXT* pContext = GetContext(hContext);
489 auto pDst = GetDrawState(pContext);
490 assert(pStateBlock && memSize >= sizeof(*pDst));
491
492 memcpy((void*)pDst, (void*)pStateBlock, sizeof(*pDst));
493 }
494
SetupDefaultState(SWR_CONTEXT * pContext)495 void SetupDefaultState(SWR_CONTEXT* pContext)
496 {
497 API_STATE* pState = GetDrawState(pContext);
498
499 pState->rastState.cullMode = SWR_CULLMODE_NONE;
500 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
501
502 pState->depthBoundsState.depthBoundsTestEnable = false;
503 pState->depthBoundsState.depthBoundsTestMinValue = 0.0f;
504 pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f;
505 }
506
SwrSync(HANDLE hContext,PFN_CALLBACK_FUNC pfnFunc,uint64_t userData,uint64_t userData2,uint64_t userData3)507 void SWR_API SwrSync(HANDLE hContext,
508 PFN_CALLBACK_FUNC pfnFunc,
509 uint64_t userData,
510 uint64_t userData2,
511 uint64_t userData3)
512 {
513 SWR_ASSERT(pfnFunc != nullptr);
514
515 SWR_CONTEXT* pContext = GetContext(hContext);
516 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
517
518 RDTSC_BEGIN(pContext->pBucketMgr, APISync, 0);
519
520 pDC->FeWork.type = SYNC;
521 pDC->FeWork.pfnWork = ProcessSync;
522
523 // Setup callback function
524 pDC->retireCallback.pfnCallbackFunc = pfnFunc;
525 pDC->retireCallback.userData = userData;
526 pDC->retireCallback.userData2 = userData2;
527 pDC->retireCallback.userData3 = userData3;
528
529 AR_API_EVENT(SwrSyncEvent(pDC->drawId));
530
531 // enqueue
532 QueueDraw(pContext);
533
534 RDTSC_END(pContext->pBucketMgr, APISync, 1);
535 }
536
SwrStallBE(HANDLE hContext)537 void SwrStallBE(HANDLE hContext)
538 {
539 SWR_CONTEXT* pContext = GetContext(hContext);
540 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
541
542 pDC->dependent = true;
543 }
544
SwrWaitForIdle(HANDLE hContext)545 void SwrWaitForIdle(HANDLE hContext)
546 {
547 SWR_CONTEXT* pContext = GetContext(hContext);
548
549 RDTSC_BEGIN(pContext->pBucketMgr, APIWaitForIdle, 0);
550
551 while (!pContext->dcRing.IsEmpty())
552 {
553 _mm_pause();
554 }
555
556 RDTSC_END(pContext->pBucketMgr, APIWaitForIdle, 1);
557 }
558
SwrWaitForIdleFE(HANDLE hContext)559 void SwrWaitForIdleFE(HANDLE hContext)
560 {
561 SWR_CONTEXT* pContext = GetContext(hContext);
562
563 RDTSC_BEGIN(pContext->pBucketMgr, APIWaitForIdle, 0);
564
565 while (pContext->drawsOutstandingFE > 0)
566 {
567 _mm_pause();
568 }
569
570 RDTSC_END(pContext->pBucketMgr, APIWaitForIdle, 1);
571 }
572
SwrSetVertexBuffers(HANDLE hContext,uint32_t numBuffers,const SWR_VERTEX_BUFFER_STATE * pVertexBuffers)573 void SwrSetVertexBuffers(HANDLE hContext,
574 uint32_t numBuffers,
575 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
576 {
577 API_STATE* pState = GetDrawState(GetContext(hContext));
578
579 for (uint32_t i = 0; i < numBuffers; ++i)
580 {
581 const SWR_VERTEX_BUFFER_STATE* pVB = &pVertexBuffers[i];
582 pState->vertexBuffers[pVB->index] = *pVB;
583 }
584 }
585
SwrSetIndexBuffer(HANDLE hContext,const SWR_INDEX_BUFFER_STATE * pIndexBuffer)586 void SwrSetIndexBuffer(HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
587 {
588 API_STATE* pState = GetDrawState(GetContext(hContext));
589
590 pState->indexBuffer = *pIndexBuffer;
591 }
592
SwrSetFetchFunc(HANDLE hContext,PFN_FETCH_FUNC pfnFetchFunc)593 void SwrSetFetchFunc(HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc)
594 {
595 API_STATE* pState = GetDrawState(GetContext(hContext));
596
597 pState->pfnFetchFunc = pfnFetchFunc;
598 }
599
SwrSetSoFunc(HANDLE hContext,PFN_SO_FUNC pfnSoFunc,uint32_t streamIndex)600 void SwrSetSoFunc(HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex)
601 {
602 API_STATE* pState = GetDrawState(GetContext(hContext));
603
604 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
605
606 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
607 }
608
SwrSetSoState(HANDLE hContext,SWR_STREAMOUT_STATE * pSoState)609 void SwrSetSoState(HANDLE hContext, SWR_STREAMOUT_STATE* pSoState)
610 {
611 API_STATE* pState = GetDrawState(GetContext(hContext));
612
613 pState->soState = *pSoState;
614 }
615
SwrSetSoBuffers(HANDLE hContext,SWR_STREAMOUT_BUFFER * pSoBuffer,uint32_t slot)616 void SwrSetSoBuffers(HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot)
617 {
618 API_STATE* pState = GetDrawState(GetContext(hContext));
619
620 SWR_ASSERT((slot < MAX_SO_STREAMS), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
621
622 // remember buffer status in case of future resume StreamOut
623 if ((pState->soBuffer[slot].pBuffer != 0) && (pSoBuffer->pBuffer == 0))
624 pState->soPausedBuffer[slot] = pState->soBuffer[slot];
625
626 // resume
627 if (pState->soPausedBuffer[slot].pBuffer == pSoBuffer->pBuffer)
628 pState->soBuffer[slot] = pState->soPausedBuffer[slot];
629 else
630 pState->soBuffer[slot] = *pSoBuffer;
631 }
632
SwrSetVertexFunc(HANDLE hContext,PFN_VERTEX_FUNC pfnVertexFunc)633 void SwrSetVertexFunc(HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc)
634 {
635 API_STATE* pState = GetDrawState(GetContext(hContext));
636
637 pState->pfnVertexFunc = pfnVertexFunc;
638 }
639
SwrSetFrontendState(HANDLE hContext,SWR_FRONTEND_STATE * pFEState)640 void SwrSetFrontendState(HANDLE hContext, SWR_FRONTEND_STATE* pFEState)
641 {
642 API_STATE* pState = GetDrawState(GetContext(hContext));
643 pState->frontendState = *pFEState;
644 }
645
SwrSetGsState(HANDLE hContext,SWR_GS_STATE * pGSState)646 void SwrSetGsState(HANDLE hContext, SWR_GS_STATE* pGSState)
647 {
648 API_STATE* pState = GetDrawState(GetContext(hContext));
649 pState->gsState = *pGSState;
650 }
651
SwrSetGsFunc(HANDLE hContext,PFN_GS_FUNC pfnGsFunc)652 void SwrSetGsFunc(HANDLE hContext, PFN_GS_FUNC pfnGsFunc)
653 {
654 API_STATE* pState = GetDrawState(GetContext(hContext));
655 pState->pfnGsFunc = pfnGsFunc;
656 }
657
SwrSetCsFunc(HANDLE hContext,PFN_CS_FUNC pfnCsFunc,uint32_t totalThreadsInGroup,uint32_t totalSpillFillSize,uint32_t scratchSpaceSizePerWarp,uint32_t numWarps)658 void SwrSetCsFunc(HANDLE hContext,
659 PFN_CS_FUNC pfnCsFunc,
660 uint32_t totalThreadsInGroup,
661 uint32_t totalSpillFillSize,
662 uint32_t scratchSpaceSizePerWarp,
663 uint32_t numWarps)
664 {
665 API_STATE* pState = GetDrawState(GetContext(hContext));
666 pState->pfnCsFunc = pfnCsFunc;
667 pState->totalThreadsInGroup = totalThreadsInGroup;
668 pState->totalSpillFillSize = totalSpillFillSize;
669 pState->scratchSpaceSizePerWarp = scratchSpaceSizePerWarp;
670 pState->scratchSpaceNumWarps = numWarps;
671 }
672
SwrSetTsState(HANDLE hContext,SWR_TS_STATE * pState)673 void SwrSetTsState(HANDLE hContext, SWR_TS_STATE* pState)
674 {
675 API_STATE* pApiState = GetDrawState(GetContext(hContext));
676 pApiState->tsState = *pState;
677 }
678
SwrSetHsFunc(HANDLE hContext,PFN_HS_FUNC pfnFunc)679 void SwrSetHsFunc(HANDLE hContext, PFN_HS_FUNC pfnFunc)
680 {
681 API_STATE* pApiState = GetDrawState(GetContext(hContext));
682 pApiState->pfnHsFunc = pfnFunc;
683 }
684
SwrSetDsFunc(HANDLE hContext,PFN_DS_FUNC pfnFunc)685 void SwrSetDsFunc(HANDLE hContext, PFN_DS_FUNC pfnFunc)
686 {
687 API_STATE* pApiState = GetDrawState(GetContext(hContext));
688 pApiState->pfnDsFunc = pfnFunc;
689 }
690
SwrSetDepthStencilState(HANDLE hContext,SWR_DEPTH_STENCIL_STATE * pDSState)691 void SwrSetDepthStencilState(HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pDSState)
692 {
693 API_STATE* pState = GetDrawState(GetContext(hContext));
694
695 pState->depthStencilState = *pDSState;
696 }
697
SwrSetBackendState(HANDLE hContext,SWR_BACKEND_STATE * pBEState)698 void SwrSetBackendState(HANDLE hContext, SWR_BACKEND_STATE* pBEState)
699 {
700 API_STATE* pState = GetDrawState(GetContext(hContext));
701
702 pState->backendState = *pBEState;
703 }
704
SwrSetDepthBoundsState(HANDLE hContext,SWR_DEPTH_BOUNDS_STATE * pDBState)705 void SwrSetDepthBoundsState(HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pDBState)
706 {
707 API_STATE* pState = GetDrawState(GetContext(hContext));
708
709 pState->depthBoundsState = *pDBState;
710 }
711
SwrSetPixelShaderState(HANDLE hContext,SWR_PS_STATE * pPSState)712 void SwrSetPixelShaderState(HANDLE hContext, SWR_PS_STATE* pPSState)
713 {
714 API_STATE* pState = GetDrawState(GetContext(hContext));
715 pState->psState = *pPSState;
716 }
717
SwrSetBlendState(HANDLE hContext,SWR_BLEND_STATE * pBlendState)718 void SwrSetBlendState(HANDLE hContext, SWR_BLEND_STATE* pBlendState)
719 {
720 API_STATE* pState = GetDrawState(GetContext(hContext));
721 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
722 }
723
SwrSetBlendFunc(HANDLE hContext,uint32_t renderTarget,PFN_BLEND_JIT_FUNC pfnBlendFunc)724 void SwrSetBlendFunc(HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc)
725 {
726 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
727 API_STATE* pState = GetDrawState(GetContext(hContext));
728 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
729 }
730
731 // update guardband multipliers for the viewport
updateGuardbands(API_STATE * pState)732 void updateGuardbands(API_STATE* pState)
733 {
734 uint32_t numGbs = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
735
736 for (uint32_t i = 0; i < numGbs; ++i)
737 {
738 // guardband center is viewport center
739 pState->gbState.left[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
740 pState->gbState.right[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
741 pState->gbState.top[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
742 pState->gbState.bottom[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
743 }
744 }
745
SwrSetRastState(HANDLE hContext,const SWR_RASTSTATE * pRastState)746 void SwrSetRastState(HANDLE hContext, const SWR_RASTSTATE* pRastState)
747 {
748 SWR_CONTEXT* pContext = GetContext(hContext);
749 API_STATE* pState = GetDrawState(pContext);
750
751 memcpy((void*)&pState->rastState, (void*)pRastState, sizeof(SWR_RASTSTATE));
752 }
753
SwrSetViewports(HANDLE hContext,uint32_t numViewports,const SWR_VIEWPORT * pViewports,const SWR_VIEWPORT_MATRICES * pMatrices)754 void SwrSetViewports(HANDLE hContext,
755 uint32_t numViewports,
756 const SWR_VIEWPORT* pViewports,
757 const SWR_VIEWPORT_MATRICES* pMatrices)
758 {
759 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of viewports.");
760
761 SWR_CONTEXT* pContext = GetContext(hContext);
762 API_STATE* pState = GetDrawState(pContext);
763
764 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
765 // @todo Faster to copy portions of the SOA or just copy all of it?
766 memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES));
767 }
768
SwrSetScissorRects(HANDLE hContext,uint32_t numScissors,const SWR_RECT * pScissors)769 void SwrSetScissorRects(HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors)
770 {
771 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of scissor rects.");
772
773 API_STATE* pState = GetDrawState(GetContext(hContext));
774 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(pScissors[0]));
775 };
776
SetupMacroTileScissors(DRAW_CONTEXT * pDC)777 void SetupMacroTileScissors(DRAW_CONTEXT* pDC)
778 {
779 API_STATE* pState = &pDC->pState->state;
780 uint32_t numScissors =
781 pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
782 pState->scissorsTileAligned = true;
783
784 for (uint32_t index = 0; index < numScissors; ++index)
785 {
786 SWR_RECT& scissorInFixedPoint = pState->scissorsInFixedPoint[index];
787
788 // Set up scissor dimensions based on scissor or viewport
789 if (pState->rastState.scissorEnable)
790 {
791 scissorInFixedPoint = pState->scissorRects[index];
792 }
793 else
794 {
795 // the vp width and height must be added to origin un-rounded then the result round to
796 // -inf. The cast to int works for rounding assuming all [left, right, top, bottom] are
797 // positive.
798 scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x;
799 scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width);
800 scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y;
801 scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height);
802 }
803
804 // Clamp to max rect
805 scissorInFixedPoint &= g_MaxScissorRect;
806
807 // Test for tile alignment
808 bool tileAligned;
809 tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
810 tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0;
811 tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0;
812 tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0;
813
814 pState->scissorsTileAligned &= tileAligned;
815
816 // Scale to fixed point
817 scissorInFixedPoint.xmin *= FIXED_POINT_SCALE;
818 scissorInFixedPoint.xmax *= FIXED_POINT_SCALE;
819 scissorInFixedPoint.ymin *= FIXED_POINT_SCALE;
820 scissorInFixedPoint.ymax *= FIXED_POINT_SCALE;
821
822 // Make scissor inclusive
823 scissorInFixedPoint.xmax -= 1;
824 scissorInFixedPoint.ymax -= 1;
825 }
826 }
827
828
829 // templated backend function tables
830
SetupPipeline(DRAW_CONTEXT * pDC)831 void SetupPipeline(DRAW_CONTEXT* pDC)
832 {
833 DRAW_STATE* pState = pDC->pState;
834 const SWR_RASTSTATE& rastState = pState->state.rastState;
835 const SWR_PS_STATE& psState = pState->state.psState;
836 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
837
838 // setup backend
839 if (psState.pfnPixelShader == nullptr)
840 {
841 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
842 }
843 else
844 {
845 const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
846 const bool bMultisampleEnable =
847 ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0;
848 const uint32_t centroid =
849 ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
850 const uint32_t canEarlyZ =
851 (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 1 : 0;
852 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
853
854 // select backend function
855 switch (psState.shadingRate)
856 {
857 case SWR_SHADING_RATE_PIXEL:
858 if (bMultisampleEnable)
859 {
860 // always need to generate I & J per sample for Z interpolation
861 barycentricsMask =
862 (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
863 backendFuncs.pfnBackend =
864 gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern]
865 [psState.inputCoverage][centroid][forcedSampleCount]
866 [canEarlyZ]
867 ;
868 }
869 else
870 {
871 // always need to generate I & J per pixel for Z interpolation
872 barycentricsMask =
873 (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
874 backendFuncs.pfnBackend =
875 gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ];
876 }
877 break;
878 case SWR_SHADING_RATE_SAMPLE:
879 SWR_ASSERT(rastState.bIsCenterPattern != true);
880 // always need to generate I & J per sample for Z interpolation
881 barycentricsMask =
882 (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
883 backendFuncs.pfnBackend =
884 gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid]
885 [canEarlyZ];
886 break;
887 default:
888 SWR_ASSERT(0 && "Invalid shading rate");
889 break;
890 }
891 }
892
893 SWR_ASSERT(backendFuncs.pfnBackend);
894
895 PFN_PROCESS_PRIMS pfnBinner;
896 #if USE_SIMD16_FRONTEND
897 PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;
898 #endif
899 switch (pState->state.topology)
900 {
901 case TOP_POINT_LIST:
902 pState->pfnProcessPrims = ClipPoints;
903 pfnBinner = BinPoints;
904 #if USE_SIMD16_FRONTEND
905 pState->pfnProcessPrims_simd16 = ClipPoints_simd16;
906 pfnBinner_simd16 = BinPoints_simd16;
907 #endif
908 break;
909 case TOP_LINE_LIST:
910 case TOP_LINE_STRIP:
911 case TOP_LINE_LOOP:
912 case TOP_LINE_LIST_ADJ:
913 case TOP_LISTSTRIP_ADJ:
914 pState->pfnProcessPrims = ClipLines;
915 pfnBinner = BinLines;
916 #if USE_SIMD16_FRONTEND
917 pState->pfnProcessPrims_simd16 = ClipLines_simd16;
918 pfnBinner_simd16 = BinLines_simd16;
919 #endif
920 break;
921 default:
922 pState->pfnProcessPrims = ClipTriangles;
923 pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0));
924 #if USE_SIMD16_FRONTEND
925 pState->pfnProcessPrims_simd16 = ClipTriangles_simd16;
926 pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0));
927 #endif
928 break;
929 };
930
931
932 // Disable clipper if viewport transform is disabled or if clipper is disabled
933 if (pState->state.frontendState.vpTransformDisable || !pState->state.rastState.clipEnable)
934 {
935 pState->pfnProcessPrims = pfnBinner;
936 #if USE_SIMD16_FRONTEND
937 pState->pfnProcessPrims_simd16 = pfnBinner_simd16;
938 #endif
939 }
940
941 // Disable rasterizer and backend if no pixel, no depth/stencil, and no attributes
942 if ((pState->state.psState.pfnPixelShader == nullptr) &&
943 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
944 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
945 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
946 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
947 (pState->state.backendState.numAttributes == 0))
948 {
949 pState->pfnProcessPrims = nullptr;
950 #if USE_SIMD16_FRONTEND
951 pState->pfnProcessPrims_simd16 = nullptr;
952 #endif
953 }
954
955 if (pState->state.soState.rasterizerDisable == true)
956 {
957 pState->pfnProcessPrims = nullptr;
958 #if USE_SIMD16_FRONTEND
959 pState->pfnProcessPrims_simd16 = nullptr;
960 #endif
961 }
962
963
964 // set up the frontend attribute count
965 pState->state.feNumAttributes = 0;
966 const SWR_BACKEND_STATE& backendState = pState->state.backendState;
967 if (backendState.swizzleEnable)
968 {
969 // attribute swizzling is enabled, iterate over the map and record the max attribute used
970 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
971 {
972 pState->state.feNumAttributes =
973 std::max(pState->state.feNumAttributes,
974 (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1);
975 }
976 }
977 else
978 {
979 pState->state.feNumAttributes = pState->state.backendState.numAttributes;
980 }
981
982 if (pState->state.soState.soEnable)
983 {
984 uint64_t streamMasks = 0;
985 for (uint32_t i = 0; i < 4; ++i)
986 {
987 streamMasks |= pState->state.soState.streamMasks[i];
988 }
989
990 unsigned long maxAttrib;
991 if (_BitScanReverse64(&maxAttrib, streamMasks))
992 {
993 pState->state.feNumAttributes =
994 std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1));
995 }
996 }
997
998 // complicated logic to test for cases where we don't need backing hottile memory for a draw
999 // have to check for the special case where depth/stencil test is enabled but depthwrite is
1000 // disabled.
1001 pState->state.depthHottileEnable =
1002 ((!(pState->state.depthStencilState.depthTestEnable &&
1003 !pState->state.depthStencilState.depthWriteEnable &&
1004 !pState->state.depthBoundsState.depthBoundsTestEnable &&
1005 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
1006 (pState->state.depthStencilState.depthTestEnable ||
1007 pState->state.depthStencilState.depthWriteEnable ||
1008 pState->state.depthBoundsState.depthBoundsTestEnable))
1009 ? true
1010 : false;
1011
1012 pState->state.stencilHottileEnable =
1013 (((!(pState->state.depthStencilState.stencilTestEnable &&
1014 !pState->state.depthStencilState.stencilWriteEnable &&
1015 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
1016 // for stencil we have to check the double sided state as well
1017 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
1018 !pState->state.depthStencilState.stencilWriteEnable &&
1019 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
1020 (pState->state.depthStencilState.stencilTestEnable ||
1021 pState->state.depthStencilState.stencilWriteEnable))
1022 ? true
1023 : false;
1024
1025 uint32_t hotTileEnable = pState->state.psState.renderTargetMask;
1026
1027 // Disable hottile for surfaces with no writes
1028 if (psState.pfnPixelShader != nullptr)
1029 {
1030 unsigned long rt;
1031 uint32_t rtMask = pState->state.psState.renderTargetMask;
1032 while (_BitScanForward(&rt, rtMask))
1033 {
1034 rtMask &= ~(1 << rt);
1035
1036 if (pState->state.blendState.renderTarget[rt].writeDisableAlpha &&
1037 pState->state.blendState.renderTarget[rt].writeDisableRed &&
1038 pState->state.blendState.renderTarget[rt].writeDisableGreen &&
1039 pState->state.blendState.renderTarget[rt].writeDisableBlue)
1040 {
1041 hotTileEnable &= ~(1 << rt);
1042 }
1043 }
1044 }
1045
1046 pState->state.colorHottileEnable = hotTileEnable;
1047
1048 // Setup depth quantization function
1049 if (pState->state.depthHottileEnable)
1050 {
1051 switch (pState->state.rastState.depthFormat)
1052 {
1053 case R32_FLOAT_X8X24_TYPELESS:
1054 pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT_X8X24_TYPELESS>;
1055 break;
1056 case R32_FLOAT:
1057 pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
1058 break;
1059 case R24_UNORM_X8_TYPELESS:
1060 pState->state.pfnQuantizeDepth = QuantizeDepth<R24_UNORM_X8_TYPELESS>;
1061 break;
1062 case R16_UNORM:
1063 pState->state.pfnQuantizeDepth = QuantizeDepth<R16_UNORM>;
1064 break;
1065 default:
1066 SWR_INVALID("Unsupported depth format for depth quantization.");
1067 pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
1068 }
1069 }
1070 else
1071 {
1072 // set up pass-through quantize if depth isn't enabled
1073 pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
1074 }
1075
1076 // Generate guardbands
1077 updateGuardbands(&pState->state);
1078 }
1079
1080 //////////////////////////////////////////////////////////////////////////
1081 /// @brief InitDraw
1082 /// @param pDC - Draw context to initialize for this draw.
InitDraw(DRAW_CONTEXT * pDC,bool isSplitDraw)1083 void InitDraw(DRAW_CONTEXT* pDC, bool isSplitDraw)
1084 {
1085 // We don't need to re-setup the scissors/pipeline state again for split draw.
1086 if (isSplitDraw == false)
1087 {
1088 SetupMacroTileScissors(pDC);
1089 SetupPipeline(pDC);
1090 }
1091
1092 }
1093
1094 //////////////////////////////////////////////////////////////////////////
1095 /// @brief We can split the draw for certain topologies for better performance.
1096 /// @param totalVerts - Total vertices for draw
1097 /// @param topology - Topology used for draw
MaxVertsPerDraw(DRAW_CONTEXT * pDC,uint32_t totalVerts,PRIMITIVE_TOPOLOGY topology)1098 uint32_t MaxVertsPerDraw(DRAW_CONTEXT* pDC, uint32_t totalVerts, PRIMITIVE_TOPOLOGY topology)
1099 {
1100 API_STATE& state = pDC->pState->state;
1101
1102 // We can not split draws that have streamout enabled because there is no practical way
1103 // to support multiple threads generating SO data for a single set of buffers.
1104 if (state.soState.soEnable)
1105 {
1106 return totalVerts;
1107 }
1108
1109 // The Primitive Assembly code can only handle 1 RECT at a time. Specified with only 3 verts.
1110 if (topology == TOP_RECT_LIST)
1111 {
1112 return 3;
1113 }
1114
1115 // Is split drawing disabled?
1116 if (KNOB_DISABLE_SPLIT_DRAW)
1117 {
1118 return totalVerts;
1119 }
1120
1121 uint32_t vertsPerDraw = totalVerts;
1122
1123 switch (topology)
1124 {
1125 case TOP_POINT_LIST:
1126 case TOP_TRIANGLE_LIST:
1127 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
1128 break;
1129
1130 case TOP_PATCHLIST_1:
1131 case TOP_PATCHLIST_2:
1132 case TOP_PATCHLIST_3:
1133 case TOP_PATCHLIST_4:
1134 case TOP_PATCHLIST_5:
1135 case TOP_PATCHLIST_6:
1136 case TOP_PATCHLIST_7:
1137 case TOP_PATCHLIST_8:
1138 case TOP_PATCHLIST_9:
1139 case TOP_PATCHLIST_10:
1140 case TOP_PATCHLIST_11:
1141 case TOP_PATCHLIST_12:
1142 case TOP_PATCHLIST_13:
1143 case TOP_PATCHLIST_14:
1144 case TOP_PATCHLIST_15:
1145 case TOP_PATCHLIST_16:
1146 case TOP_PATCHLIST_17:
1147 case TOP_PATCHLIST_18:
1148 case TOP_PATCHLIST_19:
1149 case TOP_PATCHLIST_20:
1150 case TOP_PATCHLIST_21:
1151 case TOP_PATCHLIST_22:
1152 case TOP_PATCHLIST_23:
1153 case TOP_PATCHLIST_24:
1154 case TOP_PATCHLIST_25:
1155 case TOP_PATCHLIST_26:
1156 case TOP_PATCHLIST_27:
1157 case TOP_PATCHLIST_28:
1158 case TOP_PATCHLIST_29:
1159 case TOP_PATCHLIST_30:
1160 case TOP_PATCHLIST_31:
1161 case TOP_PATCHLIST_32:
1162 if (pDC->pState->state.tsState.tsEnable)
1163 {
1164 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
1165 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
1166 }
1167 break;
1168 default:
1169 // We are not splitting up draws for other topologies.
1170 break;
1171 }
1172
1173 return vertsPerDraw;
1174 }
1175
1176 //////////////////////////////////////////////////////////////////////////
1177 /// @brief DrawInstanced
1178 /// @param hContext - Handle passed back from SwrCreateContext
1179 /// @param topology - Specifies topology for draw.
1180 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1181 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1182 /// @param numInstances - How many instances to render.
1183 /// @param startInstance - Which instance to start sequentially fetching from in each buffer
1184 /// (instanced data)
DrawInstanced(HANDLE hContext,PRIMITIVE_TOPOLOGY topology,uint32_t numVertices,uint32_t startVertex,uint32_t numInstances=1,uint32_t startInstance=0)1185 void DrawInstanced(HANDLE hContext,
1186 PRIMITIVE_TOPOLOGY topology,
1187 uint32_t numVertices,
1188 uint32_t startVertex,
1189 uint32_t numInstances = 1,
1190 uint32_t startInstance = 0)
1191 {
1192 if (KNOB_TOSS_DRAW)
1193 {
1194 return;
1195 }
1196
1197 SWR_CONTEXT* pContext = GetContext(hContext);
1198 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1199
1200 RDTSC_BEGIN(pContext->pBucketMgr, APIDraw, pDC->drawId);
1201
1202 uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1203 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1204 uint32_t remainingVerts = numVertices;
1205
1206 API_STATE* pState = &pDC->pState->state;
1207 pState->topology = topology;
1208 pState->forceFront = false;
1209
1210 // disable culling for points/lines
1211 uint32_t oldCullMode = pState->rastState.cullMode;
1212 if (topology == TOP_POINT_LIST)
1213 {
1214 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1215 pState->forceFront = true;
1216 }
1217 else if (topology == TOP_RECT_LIST)
1218 {
1219 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1220 }
1221
1222 int draw = 0;
1223 while (remainingVerts)
1224 {
1225 uint32_t numVertsForDraw =
1226 (remainingVerts < maxVertsPerDraw) ? remainingVerts : maxVertsPerDraw;
1227
1228 bool isSplitDraw = (draw > 0) ? !KNOB_DISABLE_SPLIT_DRAW : false;
1229 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1230 InitDraw(pDC, isSplitDraw);
1231
1232 pDC->FeWork.type = DRAW;
1233 pDC->FeWork.pfnWork = GetProcessDrawFunc(false, // IsIndexed
1234 false, // bEnableCutIndex
1235 pState->tsState.tsEnable,
1236 pState->gsState.gsEnable,
1237 pState->soState.soEnable,
1238 pDC->pState->pfnProcessPrims != nullptr);
1239 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1240 pDC->FeWork.desc.draw.startVertex = startVertex;
1241 pDC->FeWork.desc.draw.numInstances = numInstances;
1242 pDC->FeWork.desc.draw.startInstance = startInstance;
1243 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1244 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1245
1246 pDC->cleanupState = (remainingVerts == numVertsForDraw);
1247
1248 // enqueue DC
1249 QueueDraw(pContext);
1250
1251 AR_API_EVENT(DrawInstancedEvent(pDC->drawId,
1252 topology,
1253 numVertsForDraw,
1254 startVertex,
1255 numInstances,
1256 startInstance,
1257 pState->tsState.tsEnable,
1258 pState->gsState.gsEnable,
1259 pState->soState.soEnable,
1260 pState->gsState.outputTopology,
1261 draw));
1262
1263 remainingVerts -= numVertsForDraw;
1264 draw++;
1265 }
1266
1267 // restore culling state
1268 pDC = GetDrawContext(pContext);
1269 pDC->pState->state.rastState.cullMode = oldCullMode;
1270
1271 RDTSC_END(pContext->pBucketMgr, APIDraw, numVertices * numInstances);
1272 }
1273
1274 //////////////////////////////////////////////////////////////////////////
1275 /// @brief SwrDraw
1276 /// @param hContext - Handle passed back from SwrCreateContext
1277 /// @param topology - Specifies topology for draw.
1278 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1279 /// @param primCount - Number of vertices.
SwrDraw(HANDLE hContext,PRIMITIVE_TOPOLOGY topology,uint32_t startVertex,uint32_t numVertices)1280 void SwrDraw(HANDLE hContext,
1281 PRIMITIVE_TOPOLOGY topology,
1282 uint32_t startVertex,
1283 uint32_t numVertices)
1284 {
1285 DrawInstanced(hContext, topology, numVertices, startVertex);
1286 }
1287
1288 //////////////////////////////////////////////////////////////////////////
1289 /// @brief SwrDrawInstanced
1290 /// @param hContext - Handle passed back from SwrCreateContext
1291 /// @param topology - Specifies topology for draw.
1292 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1293 /// @param numInstances - How many instances to render.
1294 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1295 /// @param startInstance - Which instance to start sequentially fetching from in each buffer
1296 /// (instanced data)
SwrDrawInstanced(HANDLE hContext,PRIMITIVE_TOPOLOGY topology,uint32_t numVertsPerInstance,uint32_t numInstances,uint32_t startVertex,uint32_t startInstance)1297 void SwrDrawInstanced(HANDLE hContext,
1298 PRIMITIVE_TOPOLOGY topology,
1299 uint32_t numVertsPerInstance,
1300 uint32_t numInstances,
1301 uint32_t startVertex,
1302 uint32_t startInstance)
1303 {
1304 DrawInstanced(
1305 hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1306 }
1307
1308 //////////////////////////////////////////////////////////////////////////
1309 /// @brief DrawIndexedInstanced
1310 /// @param hContext - Handle passed back from SwrCreateContext
1311 /// @param topology - Specifies topology for draw.
1312 /// @param numIndices - Number of indices to read sequentially from index buffer.
1313 /// @param indexOffset - Starting index into index buffer.
1314 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1315 /// @param numInstances - Number of instances to render.
1316 /// @param startInstance - Which instance to start sequentially fetching from in each buffer
1317 /// (instanced data)
DrawIndexedInstance(HANDLE hContext,PRIMITIVE_TOPOLOGY topology,uint32_t numIndices,uint32_t indexOffset,int32_t baseVertex,uint32_t numInstances=1,uint32_t startInstance=0)1318 void DrawIndexedInstance(HANDLE hContext,
1319 PRIMITIVE_TOPOLOGY topology,
1320 uint32_t numIndices,
1321 uint32_t indexOffset,
1322 int32_t baseVertex,
1323 uint32_t numInstances = 1,
1324 uint32_t startInstance = 0)
1325 {
1326 if (KNOB_TOSS_DRAW)
1327 {
1328 return;
1329 }
1330
1331 SWR_CONTEXT* pContext = GetContext(hContext);
1332 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1333 API_STATE* pState = &pDC->pState->state;
1334
1335 RDTSC_BEGIN(pContext->pBucketMgr, APIDrawIndexed, pDC->drawId);
1336
1337 uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1338 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1339 uint32_t remainingIndices = numIndices;
1340
1341 uint32_t indexSize = 0;
1342 switch (pState->indexBuffer.format)
1343 {
1344 case R32_UINT:
1345 indexSize = sizeof(uint32_t);
1346 break;
1347 case R16_UINT:
1348 indexSize = sizeof(uint16_t);
1349 break;
1350 case R8_UINT:
1351 indexSize = sizeof(uint8_t);
1352 break;
1353 default:
1354 SWR_INVALID("Invalid index buffer format: %d", pState->indexBuffer.format);
1355 }
1356
1357 int draw = 0;
1358 gfxptr_t xpIB = pState->indexBuffer.xpIndices;
1359 xpIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1360
1361 pState->topology = topology;
1362 pState->forceFront = false;
1363
1364 // disable culling for points/lines
1365 uint32_t oldCullMode = pState->rastState.cullMode;
1366 if (topology == TOP_POINT_LIST)
1367 {
1368 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1369 pState->forceFront = true;
1370 }
1371 else if (topology == TOP_RECT_LIST)
1372 {
1373 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1374 }
1375
1376 while (remainingIndices)
1377 {
1378 uint32_t numIndicesForDraw =
1379 (remainingIndices < maxIndicesPerDraw) ? remainingIndices : maxIndicesPerDraw;
1380
1381 // When breaking up draw, we need to obtain new draw context for each iteration.
1382 bool isSplitDraw = (draw > 0) ? !KNOB_DISABLE_SPLIT_DRAW : false;
1383
1384 pDC = GetDrawContext(pContext, isSplitDraw);
1385 InitDraw(pDC, isSplitDraw);
1386
1387 pDC->FeWork.type = DRAW;
1388 pDC->FeWork.pfnWork = GetProcessDrawFunc(true, // IsIndexed
1389 pState->frontendState.bEnableCutIndex,
1390 pState->tsState.tsEnable,
1391 pState->gsState.gsEnable,
1392 pState->soState.soEnable,
1393 pDC->pState->pfnProcessPrims != nullptr);
1394 pDC->FeWork.desc.draw.pDC = pDC;
1395 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1396 pDC->FeWork.desc.draw.xpIB = xpIB;
1397 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1398
1399 pDC->FeWork.desc.draw.numInstances = numInstances;
1400 pDC->FeWork.desc.draw.startInstance = startInstance;
1401 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1402 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1403
1404 pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1405
1406 // enqueue DC
1407 QueueDraw(pContext);
1408
1409 AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId,
1410 topology,
1411 numIndicesForDraw,
1412 indexOffset,
1413 baseVertex,
1414 numInstances,
1415 startInstance,
1416 pState->tsState.tsEnable,
1417 pState->gsState.gsEnable,
1418 pState->soState.soEnable,
1419 pState->gsState.outputTopology,
1420 draw));
1421
1422 xpIB += maxIndicesPerDraw * indexSize;
1423 remainingIndices -= numIndicesForDraw;
1424 draw++;
1425 }
1426
1427 // Restore culling state
1428 pDC = GetDrawContext(pContext);
1429 pDC->pState->state.rastState.cullMode = oldCullMode;
1430
1431 RDTSC_END(pContext->pBucketMgr, APIDrawIndexed, numIndices * numInstances);
1432 }
1433
1434 //////////////////////////////////////////////////////////////////////////
1435 /// @brief DrawIndexed
1436 /// @param hContext - Handle passed back from SwrCreateContext
1437 /// @param topology - Specifies topology for draw.
1438 /// @param numIndices - Number of indices to read sequentially from index buffer.
1439 /// @param indexOffset - Starting index into index buffer.
1440 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
SwrDrawIndexed(HANDLE hContext,PRIMITIVE_TOPOLOGY topology,uint32_t numIndices,uint32_t indexOffset,int32_t baseVertex)1441 void SwrDrawIndexed(HANDLE hContext,
1442 PRIMITIVE_TOPOLOGY topology,
1443 uint32_t numIndices,
1444 uint32_t indexOffset,
1445 int32_t baseVertex)
1446 {
1447 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1448 }
1449
1450 //////////////////////////////////////////////////////////////////////////
1451 /// @brief SwrDrawIndexedInstanced
1452 /// @param hContext - Handle passed back from SwrCreateContext
1453 /// @param topology - Specifies topology for draw.
1454 /// @param numIndices - Number of indices to read sequentially from index buffer.
1455 /// @param numInstances - Number of instances to render.
1456 /// @param indexOffset - Starting index into index buffer.
1457 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1458 /// @param startInstance - Which instance to start sequentially fetching from in each buffer
1459 /// (instanced data)
SwrDrawIndexedInstanced(HANDLE hContext,PRIMITIVE_TOPOLOGY topology,uint32_t numIndices,uint32_t numInstances,uint32_t indexOffset,int32_t baseVertex,uint32_t startInstance)1460 void SwrDrawIndexedInstanced(HANDLE hContext,
1461 PRIMITIVE_TOPOLOGY topology,
1462 uint32_t numIndices,
1463 uint32_t numInstances,
1464 uint32_t indexOffset,
1465 int32_t baseVertex,
1466 uint32_t startInstance)
1467 {
1468 DrawIndexedInstance(
1469 hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1470 }
1471
1472 //////////////////////////////////////////////////////////////////////////
1473 /// @brief SwrInvalidateTiles
1474 /// @param hContext - Handle passed back from SwrCreateContext
1475 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to
1476 /// invalidate.
1477 /// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to
1478 /// be hottile size-aligned.
SwrInvalidateTiles(HANDLE hContext,uint32_t attachmentMask,const SWR_RECT & invalidateRect)1479 void SWR_API SwrInvalidateTiles(HANDLE hContext,
1480 uint32_t attachmentMask,
1481 const SWR_RECT& invalidateRect)
1482 {
1483 if (KNOB_TOSS_DRAW)
1484 {
1485 return;
1486 }
1487
1488 SWR_CONTEXT* pContext = GetContext(hContext);
1489 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1490
1491 pDC->FeWork.type = DISCARDINVALIDATETILES;
1492 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1493 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1494 pDC->FeWork.desc.discardInvalidateTiles.rect = invalidateRect;
1495 pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
1496 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1497 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1498 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1499
1500 // enqueue
1501 QueueDraw(pContext);
1502
1503 AR_API_EVENT(SwrInvalidateTilesEvent(pDC->drawId));
1504 }
1505
1506 //////////////////////////////////////////////////////////////////////////
1507 /// @brief SwrDiscardRect
1508 /// @param hContext - Handle passed back from SwrCreateContext
1509 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1510 /// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be
1511 /// discarded.
SwrDiscardRect(HANDLE hContext,uint32_t attachmentMask,const SWR_RECT & rect)1512 void SWR_API SwrDiscardRect(HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect)
1513 {
1514 if (KNOB_TOSS_DRAW)
1515 {
1516 return;
1517 }
1518
1519 SWR_CONTEXT* pContext = GetContext(hContext);
1520 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1521
1522 // Queue a load to the hottile
1523 pDC->FeWork.type = DISCARDINVALIDATETILES;
1524 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1525 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1526 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1527 pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
1528 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1529 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1530 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1531
1532 // enqueue
1533 QueueDraw(pContext);
1534
1535 AR_API_EVENT(SwrDiscardRectEvent(pDC->drawId));
1536 }
1537
1538 //////////////////////////////////////////////////////////////////////////
1539 /// @brief SwrDispatch
1540 /// @param hContext - Handle passed back from SwrCreateContext
1541 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1542 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1543 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
SwrDispatch(HANDLE hContext,uint32_t threadGroupCountX,uint32_t threadGroupCountY,uint32_t threadGroupCountZ)1544 void SwrDispatch(HANDLE hContext,
1545 uint32_t threadGroupCountX,
1546 uint32_t threadGroupCountY,
1547 uint32_t threadGroupCountZ
1548
1549 )
1550 {
1551 if (KNOB_TOSS_DRAW)
1552 {
1553 return;
1554 }
1555
1556 SWR_CONTEXT* pContext = GetContext(hContext);
1557 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1558
1559 RDTSC_BEGIN(pContext->pBucketMgr, APIDispatch, pDC->drawId);
1560 AR_API_EVENT(
1561 DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ));
1562 pDC->isCompute = true; // This is a compute context.
1563
1564 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1565
1566 pTaskData->threadGroupCountX = threadGroupCountX;
1567 pTaskData->threadGroupCountY = threadGroupCountY;
1568 pTaskData->threadGroupCountZ = threadGroupCountZ;
1569
1570 pTaskData->enableThreadDispatch = false;
1571
1572 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1573 uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
1574 pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
1575 pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE);
1576
1577 QueueDispatch(pContext);
1578 RDTSC_END(pContext->pBucketMgr,
1579 APIDispatch,
1580 threadGroupCountX * threadGroupCountY * threadGroupCountZ);
1581 }
1582
1583 // Deswizzles, converts and stores current contents of the hot tiles to surface
1584 // described by pState
SwrStoreTiles(HANDLE hContext,uint32_t attachmentMask,SWR_TILE_STATE postStoreTileState,const SWR_RECT & storeRect)1585 void SWR_API SwrStoreTiles(HANDLE hContext,
1586 uint32_t attachmentMask,
1587 SWR_TILE_STATE postStoreTileState,
1588 const SWR_RECT& storeRect)
1589 {
1590 if (KNOB_TOSS_DRAW)
1591 {
1592 return;
1593 }
1594
1595 SWR_CONTEXT* pContext = GetContext(hContext);
1596 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1597
1598 RDTSC_BEGIN(pContext->pBucketMgr, APIStoreTiles, pDC->drawId);
1599
1600 pDC->FeWork.type = STORETILES;
1601 pDC->FeWork.pfnWork = ProcessStoreTiles;
1602 pDC->FeWork.desc.storeTiles.attachmentMask = attachmentMask;
1603 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1604 pDC->FeWork.desc.storeTiles.rect = storeRect;
1605 pDC->FeWork.desc.storeTiles.rect &= g_MaxScissorRect;
1606
1607 // enqueue
1608 QueueDraw(pContext);
1609
1610 AR_API_EVENT(SwrStoreTilesEvent(pDC->drawId));
1611
1612 RDTSC_END(pContext->pBucketMgr, APIStoreTiles, 1);
1613 }
1614
1615 //////////////////////////////////////////////////////////////////////////
1616 /// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
1617 /// @param hContext - Handle passed back from SwrCreateContext
1618 /// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear
1619 /// @param renderTargetArrayIndex - the RT array index to clear
1620 /// @param clearColor - color use for clearing render targets
1621 /// @param z - depth value use for clearing depth buffer
1622 /// @param stencil - stencil value used for clearing stencil buffer
1623 /// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
SwrClearRenderTarget(HANDLE hContext,uint32_t attachmentMask,uint32_t renderTargetArrayIndex,const float clearColor[4],float z,uint8_t stencil,const SWR_RECT & clearRect)1624 void SWR_API SwrClearRenderTarget(HANDLE hContext,
1625 uint32_t attachmentMask,
1626 uint32_t renderTargetArrayIndex,
1627 const float clearColor[4],
1628 float z,
1629 uint8_t stencil,
1630 const SWR_RECT& clearRect)
1631 {
1632 if (KNOB_TOSS_DRAW)
1633 {
1634 return;
1635 }
1636
1637 SWR_CONTEXT* pContext = GetContext(hContext);
1638 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1639
1640 RDTSC_BEGIN(pContext->pBucketMgr, APIClearRenderTarget, pDC->drawId);
1641
1642 pDC->FeWork.type = CLEAR;
1643 pDC->FeWork.pfnWork = ProcessClear;
1644 pDC->FeWork.desc.clear.rect = clearRect;
1645 pDC->FeWork.desc.clear.rect &= g_MaxScissorRect;
1646 pDC->FeWork.desc.clear.attachmentMask = attachmentMask;
1647 pDC->FeWork.desc.clear.renderTargetArrayIndex = renderTargetArrayIndex;
1648 pDC->FeWork.desc.clear.clearDepth = z;
1649 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1650 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1651 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1652 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1653 pDC->FeWork.desc.clear.clearStencil = stencil;
1654
1655 // enqueue draw
1656 QueueDraw(pContext);
1657
1658 RDTSC_END(pContext->pBucketMgr, APIClearRenderTarget, 1);
1659 }
1660
1661 //////////////////////////////////////////////////////////////////////////
1662 /// @brief Returns a pointer to the private context state for the current
1663 /// draw operation. This is used for external componets such as the
1664 /// sampler.
1665 /// SWR is responsible for the allocation of the private context state.
1666 /// @param hContext - Handle passed back from SwrCreateContext
SwrGetPrivateContextState(HANDLE hContext)1667 VOID* SwrGetPrivateContextState(HANDLE hContext)
1668 {
1669 SWR_CONTEXT* pContext = GetContext(hContext);
1670 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1671 DRAW_STATE* pState = pDC->pState;
1672
1673 if (pState->pPrivateState == nullptr)
1674 {
1675 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize,
1676 KNOB_SIMD_WIDTH * sizeof(float));
1677 }
1678
1679 return pState->pPrivateState;
1680 }
1681
1682 //////////////////////////////////////////////////////////////////////////
1683 /// @brief Clients can use this to allocate memory for draw/dispatch
1684 /// operations. The memory will automatically be freed once operation
1685 /// has completed. Client can use this to allocate binding tables,
1686 /// etc. needed for shader execution.
1687 /// @param hContext - Handle passed back from SwrCreateContext
1688 /// @param size - Size of allocation
1689 /// @param align - Alignment needed for allocation.
SwrAllocDrawContextMemory(HANDLE hContext,uint32_t size,uint32_t align)1690 VOID* SwrAllocDrawContextMemory(HANDLE hContext, uint32_t size, uint32_t align)
1691 {
1692 SWR_CONTEXT* pContext = GetContext(hContext);
1693 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1694
1695 return pDC->pState->pArena->AllocAligned(size, align);
1696 }
1697
1698 //////////////////////////////////////////////////////////////////////////
1699 /// @brief Enables stats counting
1700 /// @param hContext - Handle passed back from SwrCreateContext
1701 /// @param enable - If true then counts are incremented.
SwrEnableStatsFE(HANDLE hContext,bool enable)1702 void SwrEnableStatsFE(HANDLE hContext, bool enable)
1703 {
1704 SWR_CONTEXT* pContext = GetContext(hContext);
1705 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1706
1707 pDC->pState->state.enableStatsFE = enable;
1708 }
1709
1710 //////////////////////////////////////////////////////////////////////////
1711 /// @brief Enables stats counting
1712 /// @param hContext - Handle passed back from SwrCreateContext
1713 /// @param enable - If true then counts are incremented.
SwrEnableStatsBE(HANDLE hContext,bool enable)1714 void SwrEnableStatsBE(HANDLE hContext, bool enable)
1715 {
1716 SWR_CONTEXT* pContext = GetContext(hContext);
1717 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1718
1719 pDC->pState->state.enableStatsBE = enable;
1720 }
1721
1722 //////////////////////////////////////////////////////////////////////////
1723 /// @brief Mark end of frame - used for performance profiling
1724 /// @param hContext - Handle passed back from SwrCreateContext
SwrEndFrame(HANDLE hContext)1725 void SWR_API SwrEndFrame(HANDLE hContext)
1726 {
1727 SWR_CONTEXT* pContext = GetContext(hContext);
1728 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1729 (void)pDC; // var used
1730
1731 RDTSC_ENDFRAME(pContext->pBucketMgr);
1732 AR_API_EVENT(FrameEndEvent(pContext->frameCount, pDC->drawId));
1733
1734 pContext->frameCount++;
1735 }
1736
// Forward declarations of table initializers whose definitions are not in
// this part of the file.

// Sim load/store/clear tile table initializers.
void InitSimLoadTilesTable();
void InitSimStoreTilesTable();
void InitSimClearTilesTable();

// Initializers invoked from SwrInit().
void InitClearTilesTable();
void InitBackendFuncTables();
1743
1744 //////////////////////////////////////////////////////////////////////////
1745 /// @brief Initialize swr backend and memory internal tables
SwrInit()1746 void SwrInit()
1747 {
1748 InitClearTilesTable();
1749 InitBackendFuncTables();
1750 InitRasterizerFunctions();
1751 }
1752
SwrGetInterface(SWR_INTERFACE & out_funcs)1753 void SwrGetInterface(SWR_INTERFACE& out_funcs)
1754 {
1755 out_funcs.pfnSwrCreateContext = SwrCreateContext;
1756 out_funcs.pfnSwrDestroyContext = SwrDestroyContext;
1757 out_funcs.pfnSwrBindApiThread = SwrBindApiThread;
1758 out_funcs.pfnSwrSaveState = SwrSaveState;
1759 out_funcs.pfnSwrRestoreState = SwrRestoreState;
1760 out_funcs.pfnSwrSync = SwrSync;
1761 out_funcs.pfnSwrStallBE = SwrStallBE;
1762 out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle;
1763 out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE;
1764 out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers;
1765 out_funcs.pfnSwrSetIndexBuffer = SwrSetIndexBuffer;
1766 out_funcs.pfnSwrSetFetchFunc = SwrSetFetchFunc;
1767 out_funcs.pfnSwrSetSoFunc = SwrSetSoFunc;
1768 out_funcs.pfnSwrSetSoState = SwrSetSoState;
1769 out_funcs.pfnSwrSetSoBuffers = SwrSetSoBuffers;
1770 out_funcs.pfnSwrSetVertexFunc = SwrSetVertexFunc;
1771 out_funcs.pfnSwrSetFrontendState = SwrSetFrontendState;
1772 out_funcs.pfnSwrSetGsState = SwrSetGsState;
1773 out_funcs.pfnSwrSetGsFunc = SwrSetGsFunc;
1774 out_funcs.pfnSwrSetCsFunc = SwrSetCsFunc;
1775 out_funcs.pfnSwrSetTsState = SwrSetTsState;
1776 out_funcs.pfnSwrSetHsFunc = SwrSetHsFunc;
1777 out_funcs.pfnSwrSetDsFunc = SwrSetDsFunc;
1778 out_funcs.pfnSwrSetDepthStencilState = SwrSetDepthStencilState;
1779 out_funcs.pfnSwrSetBackendState = SwrSetBackendState;
1780 out_funcs.pfnSwrSetDepthBoundsState = SwrSetDepthBoundsState;
1781 out_funcs.pfnSwrSetPixelShaderState = SwrSetPixelShaderState;
1782 out_funcs.pfnSwrSetBlendState = SwrSetBlendState;
1783 out_funcs.pfnSwrSetBlendFunc = SwrSetBlendFunc;
1784 out_funcs.pfnSwrDraw = SwrDraw;
1785 out_funcs.pfnSwrDrawInstanced = SwrDrawInstanced;
1786 out_funcs.pfnSwrDrawIndexed = SwrDrawIndexed;
1787 out_funcs.pfnSwrDrawIndexedInstanced = SwrDrawIndexedInstanced;
1788 out_funcs.pfnSwrInvalidateTiles = SwrInvalidateTiles;
1789 out_funcs.pfnSwrDiscardRect = SwrDiscardRect;
1790 out_funcs.pfnSwrDispatch = SwrDispatch;
1791 out_funcs.pfnSwrStoreTiles = SwrStoreTiles;
1792 out_funcs.pfnSwrClearRenderTarget = SwrClearRenderTarget;
1793 out_funcs.pfnSwrSetRastState = SwrSetRastState;
1794 out_funcs.pfnSwrSetViewports = SwrSetViewports;
1795 out_funcs.pfnSwrSetScissorRects = SwrSetScissorRects;
1796 out_funcs.pfnSwrGetPrivateContextState = SwrGetPrivateContextState;
1797 out_funcs.pfnSwrAllocDrawContextMemory = SwrAllocDrawContextMemory;
1798 out_funcs.pfnSwrEnableStatsFE = SwrEnableStatsFE;
1799 out_funcs.pfnSwrEnableStatsBE = SwrEnableStatsBE;
1800 out_funcs.pfnSwrEndFrame = SwrEndFrame;
1801 out_funcs.pfnSwrInit = SwrInit;
1802 }
1803