1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23
24 #include <stdio.h>
25 #include <thread>
26 #include <algorithm>
27 #include <float.h>
28 #include <vector>
29 #include <utility>
30 #include <fstream>
31 #include <string>
32
33 #if defined(__linux__) || defined(__gnu_linux__) || defined(__APPLE__)
34 #include <pthread.h>
35 #include <sched.h>
36 #include <unistd.h>
37 #endif
38
39 #ifdef __APPLE__
40 #include <sys/types.h>
41 #include <sys/sysctl.h>
42 #endif
43
44 #include "common/os.h"
45 #include "core/api.h"
46 #include "context.h"
47 #include "frontend.h"
48 #include "backend.h"
49 #include "rasterizer.h"
50 #include "rdtsc_core.h"
51 #include "tilemgr.h"
52 #include "tileset.h"
53
54
55 // ThreadId
56 struct Core
57 {
58 uint32_t procGroup = 0;
59 std::vector<uint32_t> threadIds;
60 };
61
62 struct NumaNode
63 {
64 uint32_t numaId;
65 std::vector<Core> cores;
66 };
67
68 typedef std::vector<NumaNode> CPUNumaNodes;
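
// Illustrative example (hypothetical 2-socket machine, 2 cores per socket, 2 HW threads per core):
//   nodes[0].cores[0].threadIds = {0, 1};  nodes[0].cores[1].threadIds = {2, 3};
//   nodes[1].cores[0].threadIds = {4, 5};  nodes[1].cores[1].threadIds = {6, 7};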
69
void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
71 {
72 out_nodes.clear();
73 out_numThreadsPerProcGroup = 0;
74
75 #if defined(_WIN32)
76
77 std::vector<KAFFINITY> threadMaskPerProcGroup;
78
79 static std::mutex m;
80 std::lock_guard<std::mutex> l(m);
81
82 DWORD bufSize = 0;
83
84 BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
85 SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);
86
87 PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem =
88 (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
89 SWR_ASSERT(pBufferMem);
90
91 ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
92 SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
93
94 uint32_t count = bufSize / pBufferMem->Size;
95 PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;
96
97 for (uint32_t i = 0; i < count; ++i)
98 {
99 SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
100 for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
101 {
102 auto& gmask = pBuffer->Processor.GroupMask[g];
103 uint32_t threadId = 0;
104 uint32_t procGroup = gmask.Group;
105
106 Core* pCore = nullptr;
107
108 while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
109 {
110 // clear mask
111 KAFFINITY threadMask = KAFFINITY(1) << threadId;
112 gmask.Mask &= ~threadMask;
113
114 if (procGroup >= threadMaskPerProcGroup.size())
115 {
116 threadMaskPerProcGroup.resize(procGroup + 1);
117 }
118
119 if (threadMaskPerProcGroup[procGroup] & threadMask)
120 {
121 // Already seen this mask. This means that we are in 32-bit mode and
122 // have seen more than 32 HW threads for this procGroup
123 // Don't use it
124 #if defined(_WIN64)
125 SWR_INVALID("Shouldn't get here in 64-bit mode");
126 #endif
127 continue;
128 }
129
130 threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);
131
132 // Find Numa Node
133 uint32_t numaId = 0;
134 PROCESSOR_NUMBER procNum = {};
135 procNum.Group = WORD(procGroup);
136 procNum.Number = UCHAR(threadId);
137
138 ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
139 SWR_ASSERT(ret);
140
141 // Store data
142 if (out_nodes.size() <= numaId)
143 {
144 out_nodes.resize(numaId + 1);
145 }
146 auto& numaNode = out_nodes[numaId];
147 numaNode.numaId = numaId;
148
149 if (nullptr == pCore)
150 {
151 numaNode.cores.push_back(Core());
152 pCore = &numaNode.cores.back();
153 pCore->procGroup = procGroup;
154 }
155 pCore->threadIds.push_back(threadId);
156 if (procGroup == 0)
157 {
158 out_numThreadsPerProcGroup++;
159 }
160 }
161 }
162 pBuffer = PtrAdd(pBuffer, pBuffer->Size);
163 }
164
165 free(pBufferMem);
166
167 #elif defined(__linux__) || defined(__gnu_linux__)
168
169 // Parse /proc/cpuinfo to get full topology
170 std::ifstream input("/proc/cpuinfo");
171 std::string line;
172 char* c;
173 uint32_t procId = uint32_t(-1);
174 uint32_t coreId = uint32_t(-1);
175 uint32_t physId = uint32_t(-1);
176
177 while (std::getline(input, line))
178 {
179 if (line.find("processor") != std::string::npos)
180 {
181 auto data_start = line.find(": ") + 2;
182 procId = std::strtoul(&line.c_str()[data_start], &c, 10);
183 continue;
184 }
185 if (line.find("core id") != std::string::npos)
186 {
187 auto data_start = line.find(": ") + 2;
188 coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
189 continue;
190 }
191 if (line.find("physical id") != std::string::npos)
192 {
193 auto data_start = line.find(": ") + 2;
194 physId = std::strtoul(&line.c_str()[data_start], &c, 10);
195 continue;
196 }
197 if (line.length() == 0)
198 {
199 if (physId + 1 > out_nodes.size())
200 out_nodes.resize(physId + 1);
201 auto& numaNode = out_nodes[physId];
202 numaNode.numaId = physId;
203
204 if (coreId + 1 > numaNode.cores.size())
205 numaNode.cores.resize(coreId + 1);
206 auto& core = numaNode.cores[coreId];
207 core.procGroup = coreId;
208 core.threadIds.push_back(procId);
209 }
210 }
211
212 out_numThreadsPerProcGroup = 0;
213 for (auto& node : out_nodes)
214 {
215 for (auto& core : node.cores)
216 {
217 out_numThreadsPerProcGroup += core.threadIds.size();
218 }
219 }
220
221 #elif defined(__APPLE__)
222
223 auto numProcessors = 0;
224 auto numCores = 0;
225 auto numPhysicalIds = 0;
226
227 int value;
228 size_t size = sizeof(value);
229
230 int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
231 SWR_ASSERT(result == 0);
232 numPhysicalIds = value;
233
234 result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
235 SWR_ASSERT(result == 0);
236 numProcessors = value;
237
238 result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
239 SWR_ASSERT(result == 0);
240 numCores = value;
241
242 out_nodes.resize(numPhysicalIds);
243
244 for (auto physId = 0; physId < numPhysicalIds; ++physId)
245 {
246 auto& numaNode = out_nodes[physId];
247 auto procId = 0;
248
249 numaNode.cores.resize(numCores);
250
251 while (procId < numProcessors)
252 {
253 for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId)
254 {
255 auto& core = numaNode.cores[coreId];
256
257 core.procGroup = coreId;
258 core.threadIds.push_back(procId);
259 }
260 }
261 }
262
263 out_numThreadsPerProcGroup = 0;
264
265 for (auto& node : out_nodes)
266 {
267 for (auto& core : node.cores)
268 {
269 out_numThreadsPerProcGroup += core.threadIds.size();
270 }
271 }
272
273 #else
274
275 #error Unsupported platform
276
277 #endif
278
279 // Prune empty cores and numa nodes
280 for (auto node_it = out_nodes.begin(); node_it != out_nodes.end();)
281 {
282 // Erase empty cores (first)
283 for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end();)
284 {
285 if (core_it->threadIds.size() == 0)
286 {
287 core_it = node_it->cores.erase(core_it);
288 }
289 else
290 {
291 ++core_it;
292 }
293 }
294
295 // Erase empty numa nodes (second)
296 if (node_it->cores.size() == 0)
297 {
298 node_it = out_nodes.erase(node_it);
299 }
300 else
301 {
302 ++node_it;
303 }
304 }
305 }
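
// Usage sketch (mirroring what CreateThreadPool does further below): query the topology once,
// then derive the total HW thread count from it.
//
//     CPUNumaNodes nodes;
//     uint32_t numThreadsPerProcGroup = 0;
//     CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
//
//     uint32_t numHWThreads = 0;
//     for (auto const& node : nodes)
//         for (auto const& core : node.cores)
//             numHWThreads += (uint32_t)core.threadIds.size();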
306
void bindThread(SWR_CONTEXT* pContext,
308 uint32_t threadId,
309 uint32_t procGroupId = 0,
310 bool bindProcGroup = false)
311 {
312 // Only bind threads when MAX_WORKER_THREADS isn't set.
313 if (pContext->threadInfo.SINGLE_THREADED ||
314 (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false))
315 {
316 return;
317 }
318
319 #if defined(_WIN32)
320
321 GROUP_AFFINITY affinity = {};
322 affinity.Group = procGroupId;
323
324 #if !defined(_WIN64)
325 if (threadId >= 32)
326 {
327 // Hopefully we don't get here. Logic in CreateThreadPool should prevent this.
328 SWR_INVALID("Shouldn't get here");
329
330 // In a 32-bit process on Windows it is impossible to bind
331 // to logical processors 32-63 within a processor group.
332 // In this case set the mask to 0 and let the system assign
333 // the processor. Hopefully it will make smart choices.
334 affinity.Mask = 0;
335 }
336 else
337 #endif
338 {
        // If MAX_WORKER_THREADS is set, only bind to the proc group,
        // not the individual HW thread.
341 if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
342 {
343 affinity.Mask = KAFFINITY(1) << threadId;
344 }
345 else
346 {
347 affinity.Mask = KAFFINITY(0);
348 }
349 }
350
351 if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
352 {
353 SWR_INVALID("Failed to set Thread Affinity");
354 }
355
356 #elif defined(__linux__) || defined(__gnu_linux__)
357
358 cpu_set_t cpuset;
359 pthread_t thread = pthread_self();
360 CPU_ZERO(&cpuset);
361 CPU_SET(threadId, &cpuset);
362
363 int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
364 if (err != 0)
365 {
366 fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", threadId, strerror(err));
367 }
368
369 #endif
370 }
371
372 INLINE
uint32_t GetEnqueuedDraw(SWR_CONTEXT* pContext)
374 {
375 return pContext->dcRing.GetHead();
376 }
377
378 INLINE
DRAW_CONTEXT* GetDC(SWR_CONTEXT* pContext, uint32_t drawId)
380 {
381 return &pContext->dcRing[(drawId - 1) % pContext->MAX_DRAWS_IN_FLIGHT];
382 }
383
384 INLINE
bool IDComparesLess(uint32_t a, uint32_t b)
386 {
387 // Use signed delta to ensure that wrap-around to 0 is correctly handled.
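    // Worked example: IDComparesLess(0xFFFFFFFEu, 2u) computes int32_t(0xFFFFFFFCu) == -4 < 0,
    // so the pre-wrap id still compares as "less than" the id that has wrapped past zero.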
388 int32_t delta = int32_t(a - b);
389 return (delta < 0);
390 }
391
392 // returns true if dependency not met
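// i.e. the draw is flagged as dependent and the immediately preceding draw (drawId - 1) has not
// retired yet, so this draw cannot be worked on yet.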
393 INLINE
bool CheckDependency(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
395 {
396 return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
397 }
398
bool CheckDependencyFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
400 {
401 return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
402 }
403
404 //////////////////////////////////////////////////////////////////////////
405 /// @brief Update client stats.
INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
407 {
408 if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false))
409 {
410 return;
411 }
412
413 DRAW_DYNAMIC_STATE& dynState = pDC->dynState;
414 OSALIGNLINE(SWR_STATS) stats{0};
415
416 // Sum up stats across all workers before sending to client.
417 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
418 {
419 stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
420 stats.PsInvocations += dynState.pStats[i].PsInvocations;
421 stats.CsInvocations += dynState.pStats[i].CsInvocations;
422
423 }
424
425
426 pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
427 }
428
INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
430 {
431 UpdateClientStats(pContext, workerId, pDC);
432
433 if (pDC->retireCallback.pfnCallbackFunc)
434 {
435 pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
436 pDC->retireCallback.userData2,
437 pDC->retireCallback.userData3);
438
439 // Callbacks to external code *could* change floating point control state
440 // Reset our optimal flags
441 SetOptimalVectorCSR();
442 }
443 }
444
445 // inlined-only version
INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
447 {
448 int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
449 SWR_ASSERT(result >= 0);
450
451 AR_FLUSH(pDC->drawId);
452
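    // The last worker to decrement threadsDone (result == 0) performs the retirement work below.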
453 if (result == 0)
454 {
455 ExecuteCallbacks(pContext, workerId, pDC);
456
457
458 // Cleanup memory allocations
459 pDC->pArena->Reset(true);
460 if (!pDC->isCompute)
461 {
462 pDC->pTileMgr->initialize();
463 }
464 if (pDC->cleanupState)
465 {
466 pDC->pState->pArena->Reset(true);
467 }
468
469 _ReadWriteBarrier();
470
471 pContext->dcRing.Dequeue(); // Remove from tail
472 }
473
474 return result;
475 }
476
477 // available to other translation modules
int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
479 {
480 return CompleteDrawContextInl(pContext, 0, pDC);
481 }
482
INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext,
484 uint32_t workerId,
485 uint32_t& curDrawBE,
486 uint32_t& drawEnqueued)
487 {
488 // increment our current draw id to the first incomplete draw
489 drawEnqueued = GetEnqueuedDraw(pContext);
490 while (IDComparesLess(curDrawBE, drawEnqueued))
491 {
492 DRAW_CONTEXT* pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT];
493
        // If it's not compute and FE is not done then break out of the loop.
495 if (!pDC->doneFE && !pDC->isCompute)
496 break;
497
498 bool isWorkComplete =
499 pDC->isCompute ? pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
500
501 if (isWorkComplete)
502 {
503 curDrawBE++;
504 CompleteDrawContextInl(pContext, workerId, pDC);
505 }
506 else
507 {
508 break;
509 }
510 }
511
512 // If there are no more incomplete draws then return false.
513 return IDComparesLess(curDrawBE, drawEnqueued);
514 }
515
516 //////////////////////////////////////////////////////////////////////////
517 /// @brief If there is any BE work then go work on it.
518 /// @param pContext - pointer to SWR context.
519 /// @param workerId - The unique worker ID that is assigned to this thread.
520 /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
521 /// thread
522 /// has its own curDrawBE counter and this ensures that each worker processes all
523 /// the draws in order.
/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
///                      own set, and each time it fails to lock a macrotile because it's already
///                      locked, it will add that tile to the lockedTiles set. As a worker
///                      begins to work on future draws, the lockedTiles set ensures that it doesn't
///                      work on tiles that may still have work pending in a previous draw.
///                      Additionally, lockedTiles is a heuristic that can steer a worker back to the
///                      same macrotile that it had been working on in a previous draw.
531 /// @returns true if worker thread should shutdown
bool WorkOnFifoBE(SWR_CONTEXT* pContext,
533 uint32_t workerId,
534 uint32_t& curDrawBE,
535 TileSet& lockedTiles,
536 uint32_t numaNode,
537 uint32_t numaMask)
538 {
539 bool bShutdown = false;
540
541 // Find the first incomplete draw that has pending work. If no such draw is found then
542 // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
543 uint32_t drawEnqueued = 0;
544 if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
545 {
546 return false;
547 }
548
549 uint32_t lastRetiredDraw =
550 pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
551
552 // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
553 lockedTiles.clear();
554
555 // Try to work on each draw in order of the available draws in flight.
556 // 1. If we're on curDrawBE, we can work on any macrotile that is available.
557 // 2. If we're trying to work on draws after curDrawBE, we are restricted to
558 // working on those macrotiles that are known to be complete in the prior draw to
    //    maintain order. The locked tiles set provides the history to ensure this.
560 for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
561 {
562 DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
563
564 if (pDC->isCompute)
565 return false; // We don't look at compute work.
566
567 // First wait for FE to be finished with this draw. This keeps threading model simple
568 // but if there are lots of bubbles between draws then serializing FE and BE may
569 // need to be revisited.
570 if (!pDC->doneFE)
571 return false;
572
573 // If this draw is dependent on a previous draw then we need to bail.
574 if (CheckDependency(pContext, pDC, lastRetiredDraw))
575 {
576 return false;
577 }
578
579 // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
580 auto& macroTiles = pDC->pTileMgr->getDirtyTiles();
581
582 for (auto tile : macroTiles)
583 {
584 uint32_t tileID = tile->mId;
585
586 // Only work on tiles for this numa node
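            // ((x ^ y) & numaMask) spreads macrotiles across NUMA nodes in a checkerboard-like
            // pattern when the node count is a power of two (numaMask is set up in CreateThreadPool).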
587 uint32_t x, y;
588 pDC->pTileMgr->getTileIndices(tileID, x, y);
589 if (((x ^ y) & numaMask) != numaNode)
590 {
591 _mm_pause();
592 continue;
593 }
594
595 if (!tile->getNumQueued())
596 {
597 _mm_pause();
598 continue;
599 }
600
            // can only work on this tile if it's not in use by other threads
602 if (lockedTiles.get(tileID))
603 {
604 _mm_pause();
605 continue;
606 }
607
608 if (tile->tryLock())
609 {
610 BE_WORK* pWork;
611
612 RDTSC_BEGIN(pContext->pBucketMgr, WorkerFoundWork, pDC->drawId);
613
614 uint32_t numWorkItems = tile->getNumQueued();
615 SWR_ASSERT(numWorkItems);
616
617 pWork = tile->peek();
618 SWR_ASSERT(pWork);
619 if (pWork->type == DRAW)
620 {
621 pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
622 }
623 else if (pWork->type == SHUTDOWN)
624 {
625 bShutdown = true;
626 }
627
628 while ((pWork = tile->peek()) != nullptr)
629 {
630 pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
631 tile->dequeue();
632 }
633 RDTSC_END(pContext->pBucketMgr, WorkerFoundWork, numWorkItems);
634
635 _ReadWriteBarrier();
636
637 pDC->pTileMgr->markTileComplete(tileID);
638
639 // Optimization: If the draw is complete and we're the last one to have worked on it
640 // then we can reset the locked list as we know that all previous draws before the
641 // next are guaranteed to be complete.
642 if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
643 {
644 // We can increment the current BE and safely move to next draw since we know
645 // this draw is complete.
646 curDrawBE++;
647 CompleteDrawContextInl(pContext, workerId, pDC);
648
649 lastRetiredDraw++;
650
651 lockedTiles.clear();
652 break;
653 }
654
655 if (bShutdown)
656 {
657 break;
658 }
659 }
660 else
661 {
662 // This tile is already locked. So let's add it to our locked tiles set. This way we
663 // don't try locking this one again.
664 lockedTiles.set(tileID);
665 _mm_pause();
666 }
667 }
668 }
669
670 return bShutdown;
671 }
672
673 //////////////////////////////////////////////////////////////////////////
674 /// @brief Called when FE work is complete for this DC.
INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
676 {
677 if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
678 {
679 SWR_STATS_FE& stats = pDC->dynState.statsFE;
680
681 AR_EVENT(FrontendStatsEvent(pDC->drawId,
682 stats.IaVertices,
683 stats.IaPrimitives,
684 stats.VsInvocations,
685 stats.HsInvocations,
686 stats.DsInvocations,
687 stats.GsInvocations,
688 stats.GsPrimitives,
689 stats.CInvocations,
690 stats.CPrimitives,
691 stats.SoPrimStorageNeeded[0],
692 stats.SoPrimStorageNeeded[1],
693 stats.SoPrimStorageNeeded[2],
694 stats.SoPrimStorageNeeded[3],
695 stats.SoNumPrimsWritten[0],
696 stats.SoNumPrimsWritten[1],
697 stats.SoNumPrimsWritten[2],
698 stats.SoNumPrimsWritten[3]));
699 AR_EVENT(FrontendDrawEndEvent(pDC->drawId));
700
701 pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
702 }
703
704 if (pContext->pfnUpdateSoWriteOffset)
705 {
706 for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
707 {
708 if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
709 (pDC->pState->state.soBuffer[i].soWriteEnable))
710 {
711 pContext->pfnUpdateSoWriteOffset(
712 GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
713 }
714 }
715 }
716
717 if (pContext->pfnUpdateStreamOut)
718 pContext->pfnUpdateStreamOut(GetPrivateState(pDC), pDC->dynState.soPrims);
719
720 // Ensure all streaming writes are globally visible before marking this FE done
721 _mm_mfence();
722 pDC->doneFE = true;
723
724 InterlockedDecrement(&pContext->drawsOutstandingFE);
725 }
726
void WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE)
728 {
729 // Try to grab the next DC from the ring
730 uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
731 while (IDComparesLess(curDrawFE, drawEnqueued))
732 {
733 uint32_t dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT;
734 DRAW_CONTEXT* pDC = &pContext->dcRing[dcSlot];
735 if (pDC->isCompute || pDC->doneFE)
736 {
737 CompleteDrawContextInl(pContext, workerId, pDC);
738 curDrawFE++;
739 }
740 else
741 {
742 break;
743 }
744 }
745
746 uint32_t lastRetiredFE = curDrawFE - 1;
747 uint32_t curDraw = curDrawFE;
748 while (IDComparesLess(curDraw, drawEnqueued))
749 {
750 uint32_t dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
751 DRAW_CONTEXT* pDC = &pContext->dcRing[dcSlot];
752
753 if (!pDC->FeLock && !pDC->isCompute)
754 {
755 if (CheckDependencyFE(pContext, pDC, lastRetiredFE))
756 {
757 return;
758 }
759
760 uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
761 if (initial == 0)
762 {
763 // successfully grabbed the DC, now run the FE
764 pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);
765
766 CompleteDrawFE(pContext, workerId, pDC);
767 }
768 else
769 {
770 _mm_pause();
771 }
772 }
773 else
774 {
775 _mm_pause();
776 }
777
778 curDraw++;
779 }
780 }
781
782 //////////////////////////////////////////////////////////////////////////
783 /// @brief If there is any compute work then go work on it.
784 /// @param pContext - pointer to SWR context.
785 /// @param workerId - The unique worker ID that is assigned to this thread.
786 /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
787 /// thread
788 /// has its own curDrawBE counter and this ensures that each worker processes all
789 /// the draws in order.
void WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE)
791 {
792 uint32_t drawEnqueued = 0;
793 if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
794 {
795 return;
796 }
797
798 uint32_t lastRetiredDraw =
799 pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
800
801 for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
802 {
803 DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
804 if (pDC->isCompute == false)
805 return;
806
807 // check dependencies
808 if (CheckDependency(pContext, pDC, lastRetiredDraw))
809 {
810 return;
811 }
812
813 SWR_ASSERT(pDC->pDispatch != nullptr);
814 DispatchQueue& queue = *pDC->pDispatch;
815
816 // Is there any work remaining?
817 if (queue.getNumQueued() > 0)
818 {
819 void* pSpillFillBuffer = nullptr;
820 void* pScratchSpace = nullptr;
821 uint32_t threadGroupId = 0;
822 while (queue.getWork(threadGroupId))
823 {
824 queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
825 queue.finishedWork();
826 }
827
828 // Ensure all streaming writes are globally visible before moving onto the next draw
829 _mm_mfence();
830 }
831 }
832 }
833
void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId)
835 {
836 if (nullptr == pContext)
837 {
838 return;
839 }
840
841 if (apiThreadId >= pContext->threadPool.numReservedThreads)
842 {
843 if (pContext->threadPool.numReservedThreads)
844 {
845 const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[0];
846 // Just bind to the process group used for API thread 0
847 bindThread(pContext, 0, threadData.procGroupId, true);
848 }
849 return;
850 }
851
852 const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[apiThreadId];
853
854 bindThread(
855 pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
856 }
857
858 template <bool IsFEThread, bool IsBEThread>
DWORD workerThreadMain(LPVOID pData)
860 {
861 THREAD_DATA* pThreadData = (THREAD_DATA*)pData;
862 SWR_CONTEXT* pContext = pThreadData->pContext;
863 uint32_t threadId = pThreadData->threadId;
864 uint32_t workerId = pThreadData->workerId;
865
866 bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
867
868 {
869 char threadName[64];
870 sprintf_s(threadName,
871 #if defined(_WIN32)
872 "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
873 #else
874 // linux pthread name limited to 16 chars (including \0)
875 "w%03d-n%d-c%03d-t%d",
876 #endif
877 workerId,
878 pThreadData->numaId,
879 pThreadData->coreId,
880 pThreadData->htId);
881 SetCurrentThreadName(threadName);
882 }
883
884 RDTSC_INIT(pContext->pBucketMgr, threadId);
885
    // Only the NUMA index offset from the base node is needed for correct masking
887 uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
888 uint32_t numaMask = pContext->threadPool.numaMask;
889
890 SetOptimalVectorCSR();
891
    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
    // locked then we'll add it to this list so that we don't try to lock it again.
894 TileSet lockedTiles;
895
    // Each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. The data associated with a draw is guaranteed to be active as
    // long as a worker hasn't signaled that it has moved on to the next draw, which it
    // only does once it determines there is no more work to do. The API thread will not
    // increment the head of the dc ring until all workers have moved past the current head.
    // The logic to determine what to work on is:
    // 1- Try to work on the FE of any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process it in parallel. Eventually
    //    we'll need dependency tracking to force serialization on FEs. The worker will try
    //    to pick an FE by atomically incrementing a counter in the swr context. It'll keep
    //    trying until it reaches the tail.
    // 2- BE work must be done in strict order. We accomplish this today by pulling work off
    //    the oldest draw (i.e. the head) of the dcRing. The worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of completed
    //    work items. If they are equal, then there is no more work to do for this draw, and
    //    the worker can safely increment its oldestDraw counter and move on to the next draw.
913 std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
914
915 auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
916
917 uint32_t curDrawBE = 0;
918 uint32_t curDrawFE = 0;
919
920 bool bShutdown = false;
921
922 while (true)
923 {
924 if (bShutdown && !threadHasWork(curDrawBE))
925 {
926 break;
927 }
928
929 uint32_t loop = 0;
930 while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
931 {
932 _mm_pause();
933 }
934
935 if (!threadHasWork(curDrawBE))
936 {
937 lock.lock();
938
939 // check for thread idle condition again under lock
940 if (threadHasWork(curDrawBE))
941 {
942 lock.unlock();
943 continue;
944 }
945
946 pContext->FifosNotEmpty.wait(lock);
947 lock.unlock();
948 }
949
950 if (IsBEThread)
951 {
952 RDTSC_BEGIN(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
953 bShutdown |=
954 WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
955 RDTSC_END(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
956
957 WorkOnCompute(pContext, workerId, curDrawBE);
958 }
959
960 if (IsFEThread)
961 {
962 WorkOnFifoFE(pContext, workerId, curDrawFE);
963
964 if (!IsBEThread)
965 {
966 curDrawBE = curDrawFE;
967 }
968 }
969 }
970
971 return 0;
972 }
973 template <>
974 DWORD workerThreadMain<false, false>(LPVOID) = delete;
975
976 template <bool IsFEThread, bool IsBEThread>
DWORD workerThreadInit(LPVOID pData)
978 {
979 #if defined(_MSC_VER)
980 __try
981 #endif // _WIN32
982 {
983 return workerThreadMain<IsFEThread, IsBEThread>(pData);
984 }
985
986 #if defined(_MSC_VER)
987 __except (EXCEPTION_CONTINUE_SEARCH)
988 {
989 }
990
991 #endif // _WIN32
992
993 return 1;
994 }
995 template <>
996 DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
997
static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
999 {
1000 // Initialize DRAW_CONTEXT's per-thread stats
1001 for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
1002 {
1003 pContext->dcRing[dc].dynState.pStats =
1004 (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
1005 memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
1006 }
1007 }
1008
1009 //////////////////////////////////////////////////////////////////////////
1010 /// @brief Creates thread pool info but doesn't launch threads.
1011 /// @param pContext - pointer to context
1012 /// @param pPool - pointer to thread pool object.
void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1014 {
1015 CPUNumaNodes nodes;
1016 uint32_t numThreadsPerProcGroup = 0;
1017 CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
1018 assert(numThreadsPerProcGroup > 0);
1019
    // Assumption: for asymmetric topologies, multi-threaded cores will appear
    // in the list before single-threaded cores. This appears to be true for
    // Windows when the total number of HW threads is limited to 64.
1023 uint32_t numHWNodes = (uint32_t)nodes.size();
1024 uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
1025 uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
1026
1027 #if defined(_WIN32) && !defined(_WIN64)
1028 if (!pContext->threadInfo.MAX_WORKER_THREADS)
1029 {
1030 // Limit 32-bit windows to bindable HW threads only
1031 if ((numHWCoresPerNode * numHWHyperThreads) > 32)
1032 {
1033 numHWCoresPerNode = 32 / numHWHyperThreads;
1034 }
1035 }
1036 #endif
1037
1038 // Calculate num HW threads. Due to asymmetric topologies, this is not
1039 // a trivial multiplication.
1040 uint32_t numHWThreads = 0;
1041 for (auto const& node : nodes)
1042 {
1043 for (auto const& core : node.cores)
1044 {
1045 numHWThreads += (uint32_t)core.threadIds.size();
1046 }
1047 }
1048
1049 uint32_t numNodes = numHWNodes;
1050 uint32_t numCoresPerNode = numHWCoresPerNode;
1051 uint32_t numHyperThreads = numHWHyperThreads;
1052
1053 // Calc used threads per-core
1054 if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
1055 {
1056 numHyperThreads -= pContext->threadInfo.BASE_THREAD;
1057 }
1058 else
1059 {
1060 SWR_ASSERT(false,
1061 "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
1062 pContext->threadInfo.BASE_THREAD,
1063 numHyperThreads);
1064 pContext->threadInfo.BASE_THREAD = 0;
1065 }
1066
1067 if (pContext->threadInfo.MAX_THREADS_PER_CORE)
1068 {
1069 numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
1070 }
1071
1072 // Prune any cores that don't support the number of threads
1073 if (numHyperThreads > 1)
1074 {
1075 for (auto& node : nodes)
1076 {
1077 uint32_t numUsableCores = 0;
1078 for (auto& core : node.cores)
1079 {
1080 numUsableCores += (core.threadIds.size() >= numHyperThreads);
1081 }
1082 numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
1083 }
1084 }
1085
1086 // Calc used cores per NUMA node
1087 if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
1088 {
1089 numCoresPerNode -= pContext->threadInfo.BASE_CORE;
1090 }
1091 else
1092 {
1093 SWR_ASSERT(false,
1094 "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
1095 pContext->threadInfo.BASE_CORE,
1096 numCoresPerNode);
1097 pContext->threadInfo.BASE_CORE = 0;
1098 }
1099
1100 if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
1101 {
1102 numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
1103 }
1104
1105 // Calc used NUMA nodes
1106 if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
1107 {
1108 numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
1109 }
1110 else
1111 {
1112 SWR_ASSERT(
1113 false,
1114 "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
1115 pContext->threadInfo.BASE_NUMA_NODE,
1116 numNodes);
1117 pContext->threadInfo.BASE_NUMA_NODE = 0;
1118 }
1119
1120 if (pContext->threadInfo.MAX_NUMA_NODES)
1121 {
1122 numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
1123 }
1124
1125 // Calculate numThreads - at this point everything should be symmetric
1126 uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
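    // Example: 2 NUMA nodes x 8 cores per node x 2 HW threads per core (with no knob limits
    // applied) yields numThreads == 32 workers.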
1127 SWR_REL_ASSERT(numThreads <= numHWThreads);
1128
1129 uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
1130 uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore;
1131 uint32_t numRemovedThreads = 0;
1132
1133 if (pContext->threadInfo.SINGLE_THREADED)
1134 {
1135 numAPIReservedThreads = 0;
1136 numThreads = 1;
1137 pContext->NumWorkerThreads = 1;
1138 pContext->NumFEThreads = 1;
1139 pContext->NumBEThreads = 1;
1140 pPool->numThreads = 0;
1141 }
1142 else if (pContext->threadInfo.MAX_WORKER_THREADS)
1143 {
1144 numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
1145 pContext->threadInfo.BASE_NUMA_NODE = 0;
1146 pContext->threadInfo.BASE_CORE = 0;
1147 pContext->threadInfo.BASE_THREAD = 0;
1148 numAPIReservedThreads = 0;
1149 }
1150 else
1151 {
1152 if (numAPIReservedThreads >= numThreads)
1153 {
1154 numAPIReservedThreads = 0;
1155 }
1156 else if (numAPIReservedThreads)
1157 {
1158 numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);
1159
1160 if (0 == numAPIThreadsPerCore)
1161 {
1162 numAPIThreadsPerCore = numHWHyperThreads;
1163 }
1164
1165 numRemovedThreads = numAPIReservedThreads;
1166 if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
1167 {
1168 // Adjust removed threads to make logic below work
1169 numRemovedThreads =
1170 std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
1171 }
1172
1173 numThreads -= numRemovedThreads;
1174 }
1175 }
1176
1177 InitPerThreadStats(pContext, numThreads);
1178
1179 if (pContext->threadInfo.SINGLE_THREADED)
1180 {
1181 numAPIReservedThreads = 0;
1182 numThreads = 1;
1183 }
1184
1185 if (numAPIReservedThreads)
1186 {
1187 pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
1188 SWR_ASSERT(pPool->pApiThreadData);
1189 if (!pPool->pApiThreadData)
1190 {
1191 numAPIReservedThreads = 0;
1192 }
1193 else
1194 {
1195 memset(pPool->pApiThreadData, 0, sizeof(THREAD_DATA) * numAPIReservedThreads);
1196 }
1197 }
1198 pPool->numReservedThreads = numAPIReservedThreads;
1199
1200 pPool->numThreads = numThreads;
1201 pContext->NumWorkerThreads = pPool->numThreads;
1202
1203 pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
1204 assert(pPool->pThreadData);
1205 memset(pPool->pThreadData, 0, sizeof(THREAD_DATA) * pPool->numThreads);
1206 pPool->numaMask = 0;
1207
1208 // Allocate worker private data
1209 pPool->pWorkerPrivateDataArray = nullptr;
1210 if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0)
1211 {
1212 pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA);
1213 pContext->workerPrivateState.pfnInitWorkerData = nullptr;
1214 pContext->workerPrivateState.pfnFinishWorkerData = nullptr;
1215 }
1216
1217 // initialize contents of SWR_WORKER_DATA
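    // Each worker's private slice is rounded up to a 64-byte (cache line) boundary, which helps
    // avoid false sharing between adjacent workers' private data.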
1218 size_t perWorkerSize =
1219 AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
1220 size_t totalSize = perWorkerSize * pPool->numThreads;
1221 if (totalSize)
1222 {
1223 pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
1224 SWR_ASSERT(pPool->pWorkerPrivateDataArray);
1225
1226 void* pWorkerData = pPool->pWorkerPrivateDataArray;
1227 for (uint32_t i = 0; i < pPool->numThreads; ++i)
1228 {
1229 pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
1230 if (pContext->workerPrivateState.pfnInitWorkerData)
1231 {
1232 pContext->workerPrivateState.pfnInitWorkerData(pContext, pWorkerData, i);
1233 }
1234 pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
1235 }
1236 }
1237
1238 if (pContext->threadInfo.SINGLE_THREADED)
1239 {
1240 return;
1241 }
1242
1243 pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
1244 assert(pPool->pThreads);
1245
1246 if (pContext->threadInfo.MAX_WORKER_THREADS)
1247 {
1248 bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
1249 uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
1250 // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads
1251 // But Windows will still require binding to specific process groups
1252 for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
1253 {
1254 pPool->pThreadData[workerId].workerId = workerId;
1255 pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
1256 pPool->pThreadData[workerId].threadId = 0;
1257 pPool->pThreadData[workerId].numaId = 0;
1258 pPool->pThreadData[workerId].coreId = 0;
1259 pPool->pThreadData[workerId].htId = 0;
1260 pPool->pThreadData[workerId].pContext = pContext;
1261 pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
1262
1263 pContext->NumBEThreads++;
1264 pContext->NumFEThreads++;
1265 }
1266 }
1267 else
1268 {
1269 // numa distribution assumes workers on all nodes
1270 bool useNuma = true;
1271 if (numCoresPerNode * numHyperThreads == 1)
1272 {
1273 useNuma = false;
1274 }
1275
1276 if (useNuma)
1277 {
1278 pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
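            // e.g. numNodes == 4 gives numaMask == 0x3; WorkOnFifoBE then only picks up macrotiles
            // whose ((x ^ y) & numaMask) matches the worker's offset NUMA id.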
1279 }
1280 else
1281 {
1282 pPool->numaMask = 0;
1283 }
1284
1285 uint32_t workerId = 0;
1286 uint32_t numReservedThreads = numAPIReservedThreads;
1287 for (uint32_t n = 0; n < numNodes; ++n)
1288 {
1289 if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
1290 {
1291 break;
1292 }
1293 auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
1294 uint32_t numCores = numCoresPerNode;
1295 for (uint32_t c = 0; c < numCores; ++c)
1296 {
1297 if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
1298 {
1299 break;
1300 }
1301
1302 auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
1303 for (uint32_t t = 0; t < numHyperThreads; ++t)
1304 {
1305 if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
1306 {
1307 break;
1308 }
1309
1310 if (numRemovedThreads)
1311 {
1312 --numRemovedThreads;
1313 assert(numReservedThreads);
1314 --numReservedThreads;
1315 pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
1316 pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
1317 pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t];
1318 pPool->pApiThreadData[numReservedThreads].numaId =
1319 useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1320 pPool->pApiThreadData[numReservedThreads].coreId =
1321 c + pContext->threadInfo.BASE_CORE;
1322 pPool->pApiThreadData[numReservedThreads].htId =
1323 t + pContext->threadInfo.BASE_THREAD;
1324 pPool->pApiThreadData[numReservedThreads].pContext = pContext;
1325 pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
1326
1327 if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
1328 {
1329 --numReservedThreads;
1330 pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
1331 pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
1332 pPool->pApiThreadData[numReservedThreads].threadId =
1333 core.threadIds[t + 1];
1334 pPool->pApiThreadData[numReservedThreads].numaId =
1335 useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1336 pPool->pApiThreadData[numReservedThreads].coreId =
1337 c + pContext->threadInfo.BASE_CORE;
1338 pPool->pApiThreadData[numReservedThreads].htId =
1339 t + pContext->threadInfo.BASE_THREAD;
1340 pPool->pApiThreadData[numReservedThreads].pContext = pContext;
1341 pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
1342 }
1343
1344 continue;
1345 }
1346
1347 SWR_ASSERT(workerId < numThreads);
1348
1349 pPool->pThreadData[workerId].workerId = workerId;
1350 pPool->pThreadData[workerId].procGroupId = core.procGroup;
1351 pPool->pThreadData[workerId].threadId =
1352 core.threadIds[t + pContext->threadInfo.BASE_THREAD];
1353 pPool->pThreadData[workerId].numaId =
1354 useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1355 pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE;
1356 pPool->pThreadData[workerId].htId = t + pContext->threadInfo.BASE_THREAD;
1357 pPool->pThreadData[workerId].pContext = pContext;
1358 pPool->pThreadData[workerId].forceBindProcGroup = false;
1359
1360 pContext->NumBEThreads++;
1361 pContext->NumFEThreads++;
1362
1363 ++workerId;
1364 }
1365 }
1366 }
1367 SWR_ASSERT(workerId == pContext->NumWorkerThreads);
1368 }
1369 }
1370
1371 //////////////////////////////////////////////////////////////////////////
1372 /// @brief Launches worker threads in thread pool.
1373 /// @param pContext - pointer to context
1374 /// @param pPool - pointer to thread pool object.
void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1376 {
1377 if (pContext->threadInfo.SINGLE_THREADED)
1378 {
1379 return;
1380 }
1381
1382 for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId)
1383 {
1384 pPool->pThreads[workerId] =
1385 new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
1386 }
1387 }
1388
1389 //////////////////////////////////////////////////////////////////////////
1390 /// @brief Destroys thread pool.
1391 /// @param pContext - pointer to context
1392 /// @param pPool - pointer to thread pool object.
void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1394 {
1395 // Wait for all threads to finish
1396 SwrWaitForIdle(pContext);
1397
1398 // Wait for threads to finish and destroy them
1399 for (uint32_t t = 0; t < pPool->numThreads; ++t)
1400 {
1401 if (!pContext->threadInfo.SINGLE_THREADED)
1402 {
            // Detach from the thread. Cannot join() due to the possibility (on Windows) of code
            // in some DllMain (DLL_THREAD_DETACH case) blocking the thread until after this returns.
1405 pPool->pThreads[t]->detach();
1406 delete (pPool->pThreads[t]);
1407 }
1408
1409 if (pContext->workerPrivateState.pfnFinishWorkerData)
1410 {
1411 pContext->workerPrivateState.pfnFinishWorkerData(
1412 pContext, pPool->pThreadData[t].pWorkerPrivateData, t);
1413 }
1414 }
1415
1416 delete[] pPool->pThreads;
1417
1418 // Clean up data used by threads
1419 delete[] pPool->pThreadData;
1420 delete[] pPool->pApiThreadData;
1421
1422 AlignedFree(pPool->pWorkerPrivateDataArray);
1423 }
1424