1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file context.h
24 *
25 * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
26 * The SWR_CONTEXT is our global context and contains the DC ring,
27 * thread state, etc.
28 *
29 * The DRAW_CONTEXT contains all state associated with a draw operation.
30 *
31 ******************************************************************************/
32 #pragma once
33
34 #include <condition_variable>
35 #include <algorithm>
36
37 #include "core/api.h"
38 #include "core/utils.h"
39 #include "core/arena.h"
40 #include "core/fifo.hpp"
41 #include "core/knobs.h"
42 #include "common/intrin.h"
43 #include "common/rdtsc_buckets.h"
44 #include "core/threads.h"
45 #include "ringbuffer.h"
46 #include "archrast/archrast.h"
47
48 // x.8 fixed point precision values
// SCALE is 2^SHIFT (256 == 1 << 8); keep the pair in sync if either changes.
49 #define FIXED_POINT_SHIFT 8
50 #define FIXED_POINT_SCALE 256
51
52 // x.16 fixed point precision values
// SCALE is 2^SHIFT (65536 == 1 << 16); keep the pair in sync if either changes.
53 #define FIXED_POINT16_SHIFT 16
54 #define FIXED_POINT16_SCALE 65536
55
// Forward declarations: both structs are defined further down in this header,
// but the function-pointer typedefs and work descriptors need their names first.
56 struct SWR_CONTEXT;
57 struct DRAW_CONTEXT;
58
// Flags and indices attached to a triangle; embedded by value in both
// SWR_TRIANGLE_DESC and TRIANGLE_WORK_DESC.
// The four bitfields together declare exactly 32 bits: 1 + 1 + the tile-area
// bits, with `reserved` sized to take up the remainder.
59 struct TRI_FLAGS
60 {
61 uint32_t frontFacing : 1;
62 uint32_t yMajor : 1;
// Width is the SIMD tile area (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM).
63 uint32_t coverageMask : (SIMD_TILE_X_DIM* SIMD_TILE_Y_DIM);
// Whatever is left of the 32 bits after the three fields above.
64 uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
65 float pointSize;
66 uint32_t renderTargetArrayIndex;
67 uint32_t viewportIndex;
68 };
69
70 //////////////////////////////////////////////////////////////////////////
71 /// SWR_TRIANGLE_DESC
/// Description of one triangle: scalar coefficient triples, pointers to
/// attribute / sample-position / user-clip data, coverage masks and TRI_FLAGS.
/// It is the SWR_TRIANGLE_DESC& parameter of PFN_BACKEND_FUNC.
72 /////////////////////////////////////////////////////////////////////////
73 struct SWR_TRIANGLE_DESC
74 {
// NOTE(review): I/J/Z/OneOverW look like the scalar counterparts of the
// a/b/c coefficient members of BarycentricCoeffs, and recipDet of its
// vRecipDet - confirm against the code that fills this struct.
75 float I[3];
76 float J[3];
77 float Z[3];
78 float OneOverW[3];
79 float recipDet;
80
// Raw float buffers. This header shows neither their element counts nor who
// owns the memory.
81 float* pRecipW;
82 float* pAttribs;
83 float* pPerspAttribs;
84 float* pSamplePos;
85 float* pUserClipBuffer;
86
// One 64-bit mask per multisample slot (array sized by SWR_MAX_NUM_MULTISAMPLES).
87 uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
88 uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if
89 // entire pixel is covered
90 uint64_t anyCoveredSamples;
91
92 TRI_FLAGS triFlags;
93 };
94
// Triangle payload of a backend work item; stored as BE_WORK::desc.tri.
95 struct TRIANGLE_WORK_DESC
96 {
97 float* pTriBuffer;
98 float* pAttribs;
99 float* pUserClipBuffer;
// NOTE(review): presumably the number of attributes reachable through
// pAttribs - confirm against the producer.
100 uint32_t numAttribs;
101 TRI_FLAGS triFlags;
102 };
103
// Parameters of a clear operation; a member of both the BE_WORK and FE_WORK
// descriptor unions (`desc.clear`).
104 struct CLEAR_DESC
105 {
106 SWR_RECT rect;
// NOTE(review): presumably a bitmask selecting which attachments to clear;
// the bit layout is not defined in this header.
107 uint32_t attachmentMask;
108 uint32_t renderTargetArrayIndex;
109 float clearRTColor[4]; // RGBA_32F
110 float clearDepth; // [0..1]
111 uint8_t clearStencil;
112 };
113
// Parameters of a discard/invalidate-tiles operation; a member of both the
// BE_WORK and FE_WORK descriptor unions (`desc.discardInvalidateTiles`).
114 struct DISCARD_INVALIDATE_TILES_DESC
115 {
116 uint32_t attachmentMask;
117 SWR_RECT rect;
// Tile state to apply; SWR_TILE_STATE is declared outside this header.
118 SWR_TILE_STATE newTileState;
119 bool createNewTiles;
// NOTE(review): name suggests only tiles fully inside `rect` are affected
// when set - confirm in the implementation.
120 bool fullTilesOnly;
121 };
122
// A callback plus three opaque 64-bit user values. Used as the `sync` member of
// the BE_WORK / FE_WORK descriptor unions and as DRAW_CONTEXT::retireCallback.
123 struct SYNC_DESC
124 {
125 PFN_CALLBACK_FUNC pfnCallbackFunc;
// NOTE(review): the userData* values are presumably handed to
// pfnCallbackFunc; the callback signature is declared outside this header.
126 uint64_t userData;
127 uint64_t userData2;
128 uint64_t userData3;
129 };
130
// Parameters of a store-tiles operation; a member of both the BE_WORK and
// FE_WORK descriptor unions (`desc.storeTiles`).
131 struct STORE_TILES_DESC
132 {
133 uint32_t attachmentMask;
// Tile state to apply after the store, per the field name.
134 SWR_TILE_STATE postStoreTileState;
135 SWR_RECT rect;
136 };
137
// Thread-group counts for a compute dispatch in X, Y and Z.
// Note: unlike the other *_DESC structs, this one is not a member of the
// BE_WORK or FE_WORK unions declared in this header.
138 struct COMPUTE_DESC
139 {
140 uint32_t threadGroupCountX;
141 uint32_t threadGroupCountY;
142 uint32_t threadGroupCountZ;
143 bool enableThreadDispatch;
144 };
145
// Signature of a backend work handler; stored in BE_WORK::pfnWork.
// pDesc is untyped here. NOTE(review): presumably it points at the BE_WORK
// descriptor matching the work item's type - confirm at the call sites.
146 typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC,
147 uint32_t workerId,
148 uint32_t macroTile,
149 void* pDesc);
150
// Tag stored in BE_WORK::type and FE_WORK::type.
// NOTE(review): presumably identifies which member of the `desc` union is
// valid. For DRAW the union member is `tri` in BE_WORK and `draw` in FE_WORK;
// SHUTDOWN has no matching descriptor in either union.
151 enum WORK_TYPE
152 {
153 SYNC,
154 DRAW,
155 CLEAR,
156 DISCARDINVALIDATETILES,
157 STORETILES,
158 SHUTDOWN,
159 };
160
// One backend work item: a type tag, the handler to invoke, and a union of
// the per-type descriptors. Declared through OSALIGNSIMD(struct), an
// alignment-decorating macro defined outside this header.
// NOTE(review): the text in front of the listing number on the next line is
// duplicated anchor residue from the source listing, not code.
OSALIGNSIMD(struct)161 OSALIGNSIMD(struct) BE_WORK
162 {
163 WORK_TYPE type;
164 PFN_WORK_FUNC pfnWork;
// Only one member is meaningful at a time; nothing in this struct enforces
// agreement with `type`.
165 union
166 {
167 SYNC_DESC sync;
168 TRIANGLE_WORK_DESC tri;
169 CLEAR_DESC clear;
170 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
171 STORE_TILES_DESC storeTiles;
172 } desc;
173 };
174
// Parameters of one draw batch; stored as FE_WORK::desc.draw.
// The two anonymous unions overlay the indexed (DrawIndexed) and non-indexed
// (Draw) variants of the same slot, as the member comments describe.
175 struct DRAW_WORK
176 {
177 DRAW_CONTEXT* pDC;
178 union
179 {
180 uint32_t numIndices; // DrawIndexed: Number of indices for draw.
181 uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc)
182 };
// The two members differ in type (gfxptr_t vs uint32_t), so the union is as
// large as the bigger of the two.
183 union
184 {
185 gfxptr_t xpIB; // DrawIndexed: App supplied int32 indices
186 uint32_t startVertex; // Draw: Starting vertex in VB to render from.
187 };
// Signed, unlike the other counters here.
188 int32_t baseVertex;
189 uint32_t numInstances; // Number of instances
190 uint32_t startInstance; // Instance offset
191 uint32_t startPrimID; // starting primitiveID for this draw batch
192 uint32_t
193 startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws)
194 SWR_FORMAT type; // index buffer type
195 };
196
// Signature of a frontend work handler; stored in FE_WORK::pfnWork.
// Differs from PFN_WORK_FUNC by taking the SWR_CONTEXT and no macrotile index.
197 typedef void (*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext,
198 DRAW_CONTEXT* pDC,
199 uint32_t workerId,
200 void* pDesc);
// One frontend work item: a type tag, the handler to invoke, and a union of
// the per-type descriptors. Mirrors BE_WORK, except that the draw payload is a
// DRAW_WORK rather than a TRIANGLE_WORK_DESC. Embedded by value in DRAW_CONTEXT
// as FeWork.
201 struct FE_WORK
202 {
203 WORK_TYPE type;
204 PFN_FE_WORK_FUNC pfnWork;
// Only one member is meaningful at a time; nothing in this struct enforces
// agreement with `type`.
205 union
206 {
207 SYNC_DESC sync;
208 DRAW_WORK draw;
209 CLEAR_DESC clear;
210 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
211 STORE_TILES_DESC storeTiles;
212 } desc;
213 };
214
// Four guardband extents, each held as one float per viewport/scissor slot
// (arrays sized by KNOB_NUM_VIEWPORTS_SCISSORS). Stored in API_STATE::gbState.
// Units and coordinate space are not shown in this header.
215 struct GUARDBANDS
216 {
217 float left[KNOB_NUM_VIEWPORTS_SCISSORS];
218 float right[KNOB_NUM_VIEWPORTS_SCISSORS];
219 float top[KNOB_NUM_VIEWPORTS_SCISSORS];
220 float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
221 };
222
// Forward declaration; PA_STATE is only used by reference in the typedefs below.
223 struct PA_STATE;
224
225 // function signature for pipeline stages that execute after primitive assembly
// SIMD (simdvector / simdscalari) variant. primMask is a plain 32-bit mask;
// primID, viewportIdx and rtIdx are passed as SIMD integer vectors.
// Stored in DRAW_STATE::pfnProcessPrims.
226 typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT* pDC,
227 PA_STATE& pa,
228 uint32_t workerId,
229 simdvector prims[],
230 uint32_t primMask,
231 simdscalari const& primID,
232 simdscalari const& viewportIdx,
233 simdscalari const& rtIdx);
234
235 // SIMD16 function signature for pipeline stages that execute after primitive assembly
// Same parameter list as PFN_PROCESS_PRIMS but with simd16 vector types, and
// declared with the SIMDCALL calling-convention macro. Stored in
// DRAW_STATE::pfnProcessPrims_simd16 when USE_SIMD16_FRONTEND is enabled.
236 typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC,
237 PA_STATE& pa,
238 uint32_t workerId,
239 simd16vector prims[],
240 uint32_t primMask,
241 simd16scalari const& primID,
242 simd16scalari const& viewportIdx,
243 simd16scalari const& rtIdx);
244
// Aggregate of all pipeline state for a draw: vertex/index buffers, shader
// entry points, frontend, streamout, tessellation, rasterizer,
// viewport/scissor, backend, depth/stencil and blend state. Embedded by value
// in DRAW_STATE::state and handed out read-only by GetApiState().
// Declared through OSALIGNLINE(struct), an alignment-decorating macro defined
// outside this header.
// NOTE(review): the text in front of the listing number on the next line is
// duplicated anchor residue from the source listing, not code.
OSALIGNLINE(struct)245 OSALIGNLINE(struct) API_STATE
246 {
247 // Vertex Buffers
248 SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
249
250 // GS - Geometry Shader State
251 SWR_GS_STATE gsState;
252 PFN_GS_FUNC pfnGsFunc;
253
254 // FS - Fetch Shader State
255 PFN_FETCH_FUNC pfnFetchFunc;
256
257 // VS - Vertex Shader State
258 PFN_VERTEX_FUNC pfnVertexFunc;
259
260 // Index Buffer
261 SWR_INDEX_BUFFER_STATE indexBuffer;
262
263 // CS - Compute Shader
264 PFN_CS_FUNC pfnCsFunc;
265 uint32_t totalThreadsInGroup;
266 uint32_t totalSpillFillSize;
267 uint32_t scratchSpaceSizePerWarp;
268 uint32_t scratchSpaceNumWarps;
269
270 // FE - Frontend State
271 SWR_FRONTEND_STATE frontendState;
272
273 // SOS - Streamout Shader State
274 PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];
275
276 // Streamout state
277 SWR_STREAMOUT_STATE soState;
// `mutable`: these two arrays stay writable through the const API_STATE&
// returned by GetApiState().
278 mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
279 mutable SWR_STREAMOUT_BUFFER soPausedBuffer[MAX_SO_STREAMS];
280
281 // Tessellation State
282 PFN_HS_FUNC pfnHsFunc;
283 PFN_DS_FUNC pfnDsFunc;
284 SWR_TS_STATE tsState;
285
286 // Number of attributes used by the frontend (vs, so, gs)
287 uint32_t feNumAttributes;
288
289 // RS - Rasterizer State
290 SWR_RASTSTATE rastState;
291 // floating point multisample offsets
// Two floats per multisample slot. NOTE(review): presumably an x/y pair per
// sample; whether they are interleaved is not shown here.
292 float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
293
294 GUARDBANDS gbState;
295
296 SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS];
297 SWR_VIEWPORT_MATRICES vpMatrices;
298
// Scissor rectangles are kept twice, one array per representation; the code
// that keeps them in sync is outside this header.
299 SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
300 SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
301 bool scissorsTileAligned;
302
303 bool forceFront;
304 PRIMITIVE_TOPOLOGY topology;
305
306
307 // Backend state
// Carries its own OSALIGNLINE alignment request on top of the struct's.
308 OSALIGNLINE(SWR_BACKEND_STATE) backendState;
309
310 SWR_DEPTH_BOUNDS_STATE depthBoundsState;
311
312 // PS - Pixel shader state
313 SWR_PS_STATE psState;
314
315 SWR_DEPTH_STENCIL_STATE depthStencilState;
316
317 // OM - Output Merger State
318 SWR_BLEND_STATE blendState;
319 PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
320
// Anonymous bitfield struct (12 bits declared); its members are accessed
// directly on API_STATE, e.g. GetApiState(pDC).enableStatsBE.
321 struct
322 {
323 uint32_t enableStatsFE : 1; // Enable frontend pipeline stats
324 uint32_t enableStatsBE : 1; // Enable backend pipeline stats
325 uint32_t colorHottileEnable : 8; // Bitmask of enabled color hottiles
326 uint32_t depthHottileEnable : 1; // Enable depth buffer hottile
327 uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile
328 };
329
330 PFN_QUANTIZE_DEPTH pfnQuantizeDepth;
331 };
332
// Forward declarations: these classes are only referenced through pointers in
// this header (RenderOutputBuffers, DRAW_CONTEXT, SWR_CONTEXT).
333 class MacroTileMgr;
334 class DispatchQueue;
335 class HOTTILE;
336
// Output targets handed to a backend function (the RenderOutputBuffers&
// parameter of PFN_BACKEND_FUNC): raw byte pointers for each color target,
// depth and stencil, plus the matching HOTTILE pointers.
// This header does not show who owns the memory.
337 struct RenderOutputBuffers
338 {
339 uint8_t* pColor[SWR_NUM_RENDERTARGETS];
340 uint8_t* pDepth;
341 uint8_t* pStencil;
342
343 HOTTILE* pColorHotTile[SWR_NUM_RENDERTARGETS];
344 HOTTILE* pDepthHotTile;
345 HOTTILE* pStencilHotTile;
346 };
347
348 // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
// Every member is a simdscalar. Passed by const reference to the
// PFN_CALC_*_BARYCENTRICS function types declared in this header.
349 struct BarycentricCoeffs
350 {
// a/b/c coefficients for I.
351 simdscalar vIa;
352 simdscalar vIb;
353 simdscalar vIc;
354
// a/b/c coefficients for J.
355 simdscalar vJa;
356 simdscalar vJb;
357 simdscalar vJc;
358
// a/b/c coefficients for Z.
359 simdscalar vZa;
360 simdscalar vZb;
361 simdscalar vZc;
362
363 simdscalar vRecipDet;
364
// NOTE(review): presumably 1/w for the three triangle vertices A, B, C -
// confirm against the code that fills this struct.
365 simdscalar vAOneOverW;
366 simdscalar vBOneOverW;
367 simdscalar vCOneOverW;
368 };
369
370 // pipeline function pointer types
// Backend entry point; stored in BACKEND_FUNCS::pfnBackend. The three uint32_t
// parameters are unnamed here - see the implementations for their meaning.
371 typedef void (*PFN_BACKEND_FUNC)(
372 DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
// Output-merger stage. Takes references to fixed-size arrays (one entry per
// render target) for both the color pointers and the blend JIT functions.
373 typedef void (*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT&,
374 uint8_t* (&)[SWR_NUM_RENDERTARGETS],
375 uint32_t,
376 const SWR_BLEND_STATE*,
377 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS],
378 simdscalar&,
379 simdscalar const&);
// Barycentric evaluators: read BarycentricCoeffs, write into the SWR_PS_CONTEXT.
380 typedef void (*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
381 typedef void (*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
// The centroid variant takes four more unnamed inputs: a uint64_t pointer, a
// uint32_t and two simdscalars.
382 typedef void (*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&,
383 SWR_PS_CONTEXT&,
384 const uint64_t* const,
385 const uint32_t,
386 simdscalar const&,
387 simdscalar const&);
388
// Backend function table, currently a single entry. Embedded in DRAW_STATE as
// backendFuncs.
389 struct BACKEND_FUNCS
390 {
391 PFN_BACKEND_FUNC pfnBackend;
392 };
393
394 // Draw State
// The API_STATE plus the pipeline function pointers and private driver state
// that go with it. These objects live in SWR_CONTEXT::dsRing and are
// referenced from DRAW_CONTEXT::pState.
395 struct DRAW_STATE
396 {
397 API_STATE state;
398
399 void* pPrivateState; // It is required that the driver sets this up for each draw.
400
401 // pipeline function pointers, filled in by API thread when setting up the draw
402 BACKEND_FUNCS backendFuncs;
403 PFN_PROCESS_PRIMS pfnProcessPrims;
// Present only when USE_SIMD16_FRONTEND evaluates to non-zero, so the struct
// layout depends on that setting.
404 #if USE_SIMD16_FRONTEND
405 PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16;
406 #endif
407
408 CachingArena* pArena; // This should only be used by API thread.
409 };
410
411 struct DRAW_DYNAMIC_STATE
412 {
ResetDRAW_DYNAMIC_STATE413 void Reset(uint32_t numThreads)
414 {
415 SWR_STATS* pSavePtr = pStats;
416 memset(this, 0, sizeof(*this));
417 pStats = pSavePtr;
418 memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
419 }
420 ///@todo Currently assumes only a single FE can do stream output for a draw.
421 uint32_t SoWriteOffset[4];
422 bool SoWriteOffsetDirty[4];
423
424 SWR_STATS_FE statsFE; // Only one FE thread per DC.
425 SWR_STATS* pStats;
426 uint64_t soPrims; // number of primitives written to StremOut buffer
427 };
428
429 // Draw Context
430 // The api thread sets up a draw context that exists for the life of the draw.
431 // This draw context maintains all of the state needed for the draw operation.
// These objects live in SWR_CONTEXT::dcRing. A static_assert after the struct
// requires sizeof(DRAW_CONTEXT) to be a multiple of 64 bytes.
432 struct DRAW_CONTEXT
433 {
434 SWR_CONTEXT* pContext;
// Both members share storage. NOTE(review): per the comment on pDispatch,
// isCompute presumably tells which one is valid - confirm.
435 union
436 {
437 MacroTileMgr* pTileMgr;
438 DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
439 };
440 DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread.
441 CachingArena* pArena;
442
443 uint32_t drawId;
444 bool dependentFE; // Frontend work is dependent on all previous FE
445 bool dependent; // Backend work is dependent on all previous BE
446 bool isCompute; // Is this DC a compute context?
447 bool cleanupState; // True if this is the last draw using an entry in the state ring.
448
449 FE_WORK FeWork;
450
451 SYNC_DESC retireCallback; // Call this func when this DC is retired.
452
453 DRAW_DYNAMIC_STATE dynState;
454
// Each of the three fields below is volatile and carries its own OSALIGNLINE
// alignment request. NOTE(review): presumably they are touched by several
// threads and aligned apart to avoid false sharing; `volatile` alone gives no
// atomicity, so the accessing code must provide it.
455 volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
456 volatile OSALIGNLINE(uint32_t) FeLock;
457 volatile OSALIGNLINE(uint32_t) threadsDone;
458 };
459
// Compile-time layout check: sizeof(DRAW_CONTEXT) must be a multiple of 64
// bytes (the low six bits of the size must be zero).
460 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
461
GetApiState(const DRAW_CONTEXT * pDC)462 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
463 {
464 SWR_ASSERT(pDC != nullptr);
465 SWR_ASSERT(pDC->pState != nullptr);
466
467 return pDC->pState->state;
468 }
469
GetPrivateState(const DRAW_CONTEXT * pDC)470 INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
471 {
472 SWR_ASSERT(pDC != nullptr);
473 SWR_ASSERT(pDC->pState != nullptr);
474
475 return pDC->pState->pPrivateState;
476 }
477
// Forward declaration; SWR_CONTEXT only holds a pointer to the hot tile manager.
478 class HotTileMgr;
479
// Global rasterizer context: the draw-context and draw-state rings, thread
// pool and threading configuration, driver-supplied callbacks, global stats,
// per-worker scratch space and instrumentation handles.
480 struct SWR_CONTEXT
481 {
482 // Draw Context Ring
483 // Each draw needs its own state in order to support multiple draws in flight across multiple
484 // threads. We maintain N draw contexts configured as a ring. The size of the ring limits the
485 // maximum number of draws that can be in flight at any given time.
486 //
487 // Description:
488 // 1. State - When an application first sets state we'll request a new draw context to use.
489 // a. If there are no available draw contexts then we'll have to wait until one becomes free.
490 // b. If one is available then set pCurDrawContext to point to it and mark it in use.
491 // c. All state calls set state on pCurDrawContext.
492 // 2. Draw - Creates and submits a work item that is associated with the current draw context.
493 // a. Set pPrevDrawContext = pCurDrawContext
494 // b. Set pCurDrawContext to NULL.
495 // 3. State - When an application sets state after a draw
496 // a. Same as step 1.
497 // b. State is copied from prev draw context to current.
498 RingBuffer<DRAW_CONTEXT> dcRing;
499
500 DRAW_CONTEXT* pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
501 DRAW_CONTEXT* pPrevDrawContext; // This points to DC entry for the previous context submitted
502 // that we can copy state from.
503
// NOTE(review): the names suggest these point at arrays; the element count is
// not shown in this header.
504 MacroTileMgr* pMacroTileManagerArray;
505 DispatchQueue* pDispatchQueueArray;
506
507 // Draw State Ring
508 // When draws are very large (lots of primitives) then the API thread will break these up.
509 // These split draws all have identical state. So instead of storing the state directly
510 // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
511 // to reference a single entry in the DS ring.
512 RingBuffer<DRAW_STATE> dsRing;
513
514 uint32_t curStateId; // Current index to the next available entry in the DS ring.
515
516 uint32_t NumWorkerThreads;
517 uint32_t NumFEThreads;
518 uint32_t NumBEThreads;
519
520 THREAD_POOL threadPool; // Thread pool associated with this context
521 SWR_THREADING_INFO threadInfo;
522 SWR_API_THREADING_INFO apiThreadInfo;
523 SWR_WORKER_PRIVATE_STATE workerPrivateState;
524
// A runtime member despite the macro-style upper-case name.
525 uint32_t MAX_DRAWS_IN_FLIGHT;
526
// NOTE(review): presumably a condition-variable / mutex pair used together
// for worker wake-up - confirm in the threading code.
527 std::condition_variable FifosNotEmpty;
528 std::mutex WaitLock;
529
530 uint32_t privateStateSize;
531
532 HotTileMgr* pHotTileMgr;
533
534 // Callback functions, passed in at create context time
535 PFN_LOAD_TILE pfnLoadTile;
536 PFN_STORE_TILE pfnStoreTile;
537 PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead;
538 PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
539 PFN_MAKE_GFXPTR pfnMakeGfxPtr;
540 PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext;
541 PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext;
542 PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
543 PFN_UPDATE_STATS pfnUpdateStats;
544 PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
545 PFN_UPDATE_STREAMOUT pfnUpdateStreamOut;
546
547
548 // Global Stats
549 SWR_STATS* pStats;
550
551 // Scratch space for workers.
552 uint8_t** ppScratch;
553
// volatile and given its own OSALIGNLINE alignment request, like the counters
// at the end of DRAW_CONTEXT.
554 volatile OSALIGNLINE(uint32_t) drawsOutstandingFE;
555
556 OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
557 uint32_t frameCount;
558
559 uint32_t lastFrameChecked;
560 uint64_t lastDrawChecked;
561 TileSet* pSingleThreadLockedTiles;
562
563 // ArchRast thread contexts.
// Indexed by worker id (AR_WORKER_CTX) and by NumWorkerThreads for the API
// thread (AR_API_CTX).
564 HANDLE* pArContext;
565
566 // handle to external memory for worker data to create memory contexts
567 HANDLE hExternalMemory;
568
569 BucketManager *pBucketMgr;
570 };
571
// Pipeline-statistics helpers.
//
// Both macros expect `pDC` (a DRAW_CONTEXT*) in the calling scope;
// UPDATE_STAT_BE also expects `workerId`. `name` is a statistics member name
// and `count` the amount to add. The addition only happens when the matching
// enableStatsBE / enableStatsFE bit is set in the draw's API state.
//
// The bodies are wrapped in do { ... } while (0) so that an invocation is a
// single statement and can be used safely in an unbraced if/else; terminate
// each invocation with a semicolon. `count` is parenthesized so that any
// argument expression is added as one value.
#define UPDATE_STAT_BE(name, count)                         \
    do                                                      \
    {                                                       \
        if (GetApiState(pDC).enableStatsBE)                 \
        {                                                   \
            pDC->dynState.pStats[workerId].name += (count); \
        }                                                   \
    } while (0)
#define UPDATE_STAT_FE(name, count)                \
    do                                             \
    {                                              \
        if (GetApiState(pDC).enableStatsFE)        \
        {                                          \
            pDC->dynState.statsFE.name += (count); \
        }                                          \
    } while (0)
582
583 // ArchRast instrumentation framework
// Both context macros expand to an entry of SWR_CONTEXT::pArContext and rely
// on names from the calling scope: AR_WORKER_CTX needs `pDC` and `workerId`;
// AR_API_CTX needs `pDC` AND a separate `pContext` (used for the index).
584 #define AR_WORKER_CTX pDC->pContext->pArContext[workerId]
585 #define AR_API_CTX pDC->pContext->pArContext[pContext->NumWorkerThreads]
586
// RDTSC bucket timing. With KNOB_ENABLE_RDTSC defined these forward to
// RDTSC_START / RDTSC_STOP; otherwise they expand to nothing. The `drawid`
// argument of RDTSC_BEGIN is never used, and RDTSC_END always passes 0 as the
// last argument of RDTSC_STOP.
587 #ifdef KNOB_ENABLE_RDTSC
588 #define RDTSC_BEGIN(pBucketMgr, type, drawid) RDTSC_START(pBucketMgr, type)
589 #define RDTSC_END(pBucketMgr, type, count) RDTSC_STOP(pBucketMgr, type, count, 0)
590 #else
591 #define RDTSC_BEGIN(pBucketMgr, type, drawid)
592 #define RDTSC_END(pBucketMgr, type, count)
593 #endif
594
// ArchRast event helpers. With KNOB_ENABLE_AR defined they call
// ArchRast::Dispatch / ArchRast::FlushDraw; otherwise they expand to nothing,
// so their arguments are not evaluated in that configuration.
// NOTE(review): identifiers that start with an underscore followed by an
// upper-case letter are reserved for the implementation.
595 #ifdef KNOB_ENABLE_AR
596 #define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event)
597 #define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id)
598 #else
599 #define _AR_EVENT(ctx, event)
600 #define _AR_FLUSH(ctx, id)
601 #endif
602
603 // Use these macros for api thread.
604 #define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)
605
606 // Use these macros for worker threads.
607 #define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
608 #define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id)
609