1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file context.h
24 *
25 * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
26 * The SWR_CONTEXT is our global context and contains the DC ring,
27 * thread state, etc.
28 *
29 * The DRAW_CONTEXT contains all state associated with a draw operation.
30 *
31 ******************************************************************************/
32 #pragma once
33
34 #include <condition_variable>
35 #include <algorithm>
36
37 #include "core/api.h"
38 #include "core/utils.h"
39 #include "core/arena.h"
40 #include "core/fifo.hpp"
41 #include "core/knobs.h"
42 #include "common/simdintrin.h"
43 #include "core/threads.h"
44 #include "ringbuffer.h"
45 #include "archrast/archrast.h"
46
// x.8 fixed point precision values.
// SCALE is 2^SHIFT, i.e. the fixed-point representation of 1.0.
#define FIXED_POINT_SHIFT 8
#define FIXED_POINT_SCALE 256

// x.16 fixed point precision values.
// SCALE is 2^SHIFT, i.e. the fixed-point representation of 1.0.
#define FIXED_POINT16_SHIFT 16
#define FIXED_POINT16_SCALE 65536
54
// Forward declarations; both types are defined later in this header.
struct SWR_CONTEXT;
struct DRAW_CONTEXT;
57
58 struct TRI_FLAGS
59 {
60 uint32_t frontFacing : 1;
61 uint32_t yMajor : 1;
62 uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
63 uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
64 float pointSize;
65 uint32_t primID;
66 uint32_t renderTargetArrayIndex;
67 uint32_t viewportIndex;
68 };
69
70 //////////////////////////////////////////////////////////////////////////
71 /// SWR_TRIANGLE_DESC
72 /////////////////////////////////////////////////////////////////////////
73 struct SWR_TRIANGLE_DESC
74 {
75 float I[3];
76 float J[3];
77 float Z[3];
78 float OneOverW[3];
79 float recipDet;
80
81 float *pRecipW;
82 float *pAttribs;
83 float *pPerspAttribs;
84 float *pSamplePos;
85 float *pUserClipBuffer;
86
87 uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
88 uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered
89 uint64_t anyCoveredSamples;
90
91 TRI_FLAGS triFlags;
92 };
93
94 struct TRIANGLE_WORK_DESC
95 {
96 float *pTriBuffer;
97 float *pAttribs;
98 float *pUserClipBuffer;
99 uint32_t numAttribs;
100 TRI_FLAGS triFlags;
101 };
102
103 struct CLEAR_DESC
104 {
105 SWR_RECT rect;
106 uint32_t attachmentMask;
107 uint32_t renderTargetArrayIndex;
108 float clearRTColor[4]; // RGBA_32F
109 float clearDepth; // [0..1]
110 uint8_t clearStencil;
111 };
112
113 struct DISCARD_INVALIDATE_TILES_DESC
114 {
115 uint32_t attachmentMask;
116 SWR_RECT rect;
117 SWR_TILE_STATE newTileState;
118 bool createNewTiles;
119 bool fullTilesOnly;
120 };
121
122 struct SYNC_DESC
123 {
124 PFN_CALLBACK_FUNC pfnCallbackFunc;
125 uint64_t userData;
126 uint64_t userData2;
127 uint64_t userData3;
128 };
129
130 struct STORE_TILES_DESC
131 {
132 uint32_t attachmentMask;
133 SWR_TILE_STATE postStoreTileState;
134 SWR_RECT rect;
135 };
136
//////////////////////////////////////////////////////////////////////////
/// COMPUTE_DESC
/// Thread group counts along X, Y and Z.
/////////////////////////////////////////////////////////////////////////
struct COMPUTE_DESC
{
    uint32_t threadGroupCountX;
    uint32_t threadGroupCountY;
    uint32_t threadGroupCountZ;
};
143
144 typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc);
145
// Tag identifying the kind of work held by an FE_WORK / BE_WORK item.
// Enumerators take the default values 0..5 in declaration order.
enum WORK_TYPE
{
    SYNC,
    DRAW,
    CLEAR,
    DISCARDINVALIDATETILES,
    STORETILES,
    SHUTDOWN,
};
155
//////////////////////////////////////////////////////////////////////////
/// BE_WORK
/// Work item with a type tag, the function to run, and a union holding
/// the descriptor for that work. Declared through OSALIGNSIMD.
/////////////////////////////////////////////////////////////////////////
OSALIGNSIMD(struct) BE_WORK
{
    WORK_TYPE type;
    PFN_WORK_FUNC pfnWork;
    union
    {
        SYNC_DESC sync;
        TRIANGLE_WORK_DESC tri;
        CLEAR_DESC clear;
        DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
        STORE_TILES_DESC storeTiles;
    } desc;
};
169
170 struct DRAW_WORK
171 {
172 DRAW_CONTEXT* pDC;
173 union
174 {
175 uint32_t numIndices; // DrawIndexed: Number of indices for draw.
176 uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc)
177 };
178 union
179 {
180 const int32_t* pIB; // DrawIndexed: App supplied indices
181 uint32_t startVertex; // Draw: Starting vertex in VB to render from.
182 };
183 int32_t baseVertex;
184 uint32_t numInstances; // Number of instances
185 uint32_t startInstance; // Instance offset
186 uint32_t startPrimID; // starting primitiveID for this draw batch
187 uint32_t startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws)
188 SWR_FORMAT type; // index buffer type
189 };
190
191 typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc);
192 struct FE_WORK
193 {
194 WORK_TYPE type;
195 PFN_FE_WORK_FUNC pfnWork;
196 union
197 {
198 SYNC_DESC sync;
199 DRAW_WORK draw;
200 CLEAR_DESC clear;
201 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
202 STORE_TILES_DESC storeTiles;
203 } desc;
204 };
205
206 struct GUARDBANDS
207 {
208 float left[KNOB_NUM_VIEWPORTS_SCISSORS];
209 float right[KNOB_NUM_VIEWPORTS_SCISSORS];
210 float top[KNOB_NUM_VIEWPORTS_SCISSORS];
211 float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
212 };
213
214 struct PA_STATE;
215
216 // function signature for pipeline stages that execute after primitive assembly
217 typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
218 uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
219
//////////////////////////////////////////////////////////////////////////
/// API_STATE
/// Pipeline state for a draw. It is stored inside DRAW_STATE and read
/// back through GetApiState(). Declared through OSALIGNLINE.
/////////////////////////////////////////////////////////////////////////
OSALIGNLINE(struct) API_STATE
{
    // Vertex Buffers
    SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];

    // Index Buffer
    SWR_INDEX_BUFFER_STATE indexBuffer;

    // FS - Fetch Shader State
    PFN_FETCH_FUNC pfnFetchFunc;

    // VS - Vertex Shader State
    PFN_VERTEX_FUNC pfnVertexFunc;

    // GS - Geometry Shader State
    PFN_GS_FUNC pfnGsFunc;
    SWR_GS_STATE gsState;

    // CS - Compute Shader
    PFN_CS_FUNC pfnCsFunc;
    uint32_t totalThreadsInGroup;
    uint32_t totalSpillFillSize;

    // FE - Frontend State
    SWR_FRONTEND_STATE frontendState;

    // SOS - Streamout Shader State
    PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];

    // Streamout state
    SWR_STREAMOUT_STATE soState;
    // 'mutable' so the buffers can be written through a const API_STATE&.
    mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];

    // Tessellation State
    PFN_HS_FUNC pfnHsFunc;
    PFN_DS_FUNC pfnDsFunc;
    SWR_TS_STATE tsState;

    // Number of attributes used by the frontend (vs, so, gs)
    uint32_t feNumAttributes;

    PRIMITIVE_TOPOLOGY topology;
    bool forceFront;

    // RS - Rasterizer State
    SWR_RASTSTATE rastState;
    // floating point multisample offsets (two floats per sample)
    float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];

    GUARDBANDS gbState;

    SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS];
    SWR_VIEWPORT_MATRICES vpMatrices;

    SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
    SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
    bool scissorsTileAligned;

    // Backend state
    SWR_BACKEND_STATE backendState;

    SWR_DEPTH_BOUNDS_STATE depthBoundsState;

    // PS - Pixel shader state
    SWR_PS_STATE psState;

    SWR_DEPTH_STENCIL_STATE depthStencilState;

    // OM - Output Merger State
    SWR_BLEND_STATE blendState;
    PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];

    // Anonymous struct: members are accessed directly on API_STATE
    // (e.g. state.enableStatsBE, as used by UPDATE_STAT_BE).
    struct
    {
        uint32_t enableStatsFE : 1;         // Enable frontend pipeline stats
        uint32_t enableStatsBE : 1;         // Enable backend pipeline stats
        uint32_t colorHottileEnable : 8;    // Bitmask of enabled color hottiles
        uint32_t depthHottileEnable: 1;     // Enable depth buffer hottile
        uint32_t stencilHottileEnable : 1;  // Enable stencil buffer hottile
    };

    PFN_QUANTIZE_DEPTH pfnQuantizeDepth;
};
303
// Forward declarations; only pointers to these classes are used in this header.
class MacroTileMgr;
class DispatchQueue;
306
307 struct RenderOutputBuffers
308 {
309 uint8_t* pColor[SWR_NUM_RENDERTARGETS];
310 uint8_t* pDepth;
311 uint8_t* pStencil;
312 };
313
314 // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
315 struct BarycentricCoeffs
316 {
317 simdscalar vIa;
318 simdscalar vIb;
319 simdscalar vIc;
320
321 simdscalar vJa;
322 simdscalar vJb;
323 simdscalar vJc;
324
325 simdscalar vZa;
326 simdscalar vZb;
327 simdscalar vZc;
328
329 simdscalar vRecipDet;
330
331 simdscalar vAOneOverW;
332 simdscalar vBOneOverW;
333 simdscalar vCOneOverW;
334 };
335
336 // pipeline function pointer types
337 typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
338 typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
339 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar);
340 typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
341 typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
342 typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
343 const simdscalar, const simdscalar);
344
345 struct BACKEND_FUNCS
346 {
347 PFN_BACKEND_FUNC pfnBackend;
348 };
349
350 // Draw State
351 struct DRAW_STATE
352 {
353 API_STATE state;
354
355 void* pPrivateState; // Its required the driver sets this up for each draw.
356
357 // pipeline function pointers, filled in by API thread when setting up the draw
358 BACKEND_FUNCS backendFuncs;
359 PFN_PROCESS_PRIMS pfnProcessPrims;
360
361 CachingArena* pArena; // This should only be used by API thread.
362 };
363
364 struct DRAW_DYNAMIC_STATE
365 {
ResetDRAW_DYNAMIC_STATE366 void Reset(uint32_t numThreads)
367 {
368 SWR_STATS* pSavePtr = pStats;
369 memset(this, 0, sizeof(*this));
370 pStats = pSavePtr;
371 memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
372 }
373 ///@todo Currently assumes only a single FE can do stream output for a draw.
374 uint32_t SoWriteOffset[4];
375 bool SoWriteOffsetDirty[4];
376
377 SWR_STATS_FE statsFE; // Only one FE thread per DC.
378 SWR_STATS* pStats;
379 };
380
381 // Draw Context
382 // The api thread sets up a draw context that exists for the life of the draw.
383 // This draw context maintains all of the state needed for the draw operation.
384 struct DRAW_CONTEXT
385 {
386 SWR_CONTEXT* pContext;
387 union
388 {
389 MacroTileMgr* pTileMgr;
390 DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
391 };
392 DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread.
393 DRAW_DYNAMIC_STATE dynState;
394
395 CachingArena* pArena;
396
397 uint32_t drawId;
398 bool dependentFE; // Frontend work is dependent on all previous FE
399 bool dependent; // Backend work is dependent on all previous BE
400 bool isCompute; // Is this DC a compute context?
401 bool cleanupState; // True if this is the last draw using an entry in the state ring.
402 volatile bool doneFE; // Is FE work done for this draw?
403
404 FE_WORK FeWork;
405
406 volatile OSALIGNLINE(uint32_t) FeLock;
407 volatile int32_t threadsDone;
408
409 SYNC_DESC retireCallback; // Call this func when this DC is retired.
410
411
412 };
413
414 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
415
GetApiState(const DRAW_CONTEXT * pDC)416 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
417 {
418 SWR_ASSERT(pDC != nullptr);
419 SWR_ASSERT(pDC->pState != nullptr);
420
421 return pDC->pState->state;
422 }
423
GetPrivateState(const DRAW_CONTEXT * pDC)424 INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
425 {
426 SWR_ASSERT(pDC != nullptr);
427 SWR_ASSERT(pDC->pState != nullptr);
428
429 return pDC->pState->pPrivateState;
430 }
431
432 class HotTileMgr;
433
434 struct SWR_CONTEXT
435 {
436 // Draw Context Ring
437 // Each draw needs its own state in order to support mulitple draws in flight across multiple threads.
438 // We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
439 // of draws that can be in flight at any given time.
440 //
441 // Description:
442 // 1. State - When an application first sets state we'll request a new draw context to use.
443 // a. If there are no available draw contexts then we'll have to wait until one becomes free.
444 // b. If one is available then set pCurDrawContext to point to it and mark it in use.
445 // c. All state calls set state on pCurDrawContext.
446 // 2. Draw - Creates submits a work item that is associated with current draw context.
447 // a. Set pPrevDrawContext = pCurDrawContext
448 // b. Set pCurDrawContext to NULL.
449 // 3. State - When an applications sets state after draw
450 // a. Same as step 1.
451 // b. State is copied from prev draw context to current.
452 RingBuffer<DRAW_CONTEXT> dcRing;
453
454 DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
455 DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from.
456
457 MacroTileMgr* pMacroTileManagerArray;
458 DispatchQueue* pDispatchQueueArray;
459
460 // Draw State Ring
461 // When draw are very large (lots of primitives) then the API thread will break these up.
462 // These split draws all have identical state. So instead of storing the state directly
463 // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
464 // to reference a single entry in the DS ring.
465 RingBuffer<DRAW_STATE> dsRing;
466
467 uint32_t curStateId; // Current index to the next available entry in the DS ring.
468
469 uint32_t NumWorkerThreads;
470 uint32_t NumFEThreads;
471 uint32_t NumBEThreads;
472
473 THREAD_POOL threadPool; // Thread pool associated with this context
474 SWR_THREADING_INFO threadInfo;
475
476 std::condition_variable FifosNotEmpty;
477 std::mutex WaitLock;
478
479 uint32_t privateStateSize;
480
481 HotTileMgr *pHotTileMgr;
482
483 // Callback functions, passed in at create context time
484 PFN_LOAD_TILE pfnLoadTile;
485 PFN_STORE_TILE pfnStoreTile;
486 PFN_CLEAR_TILE pfnClearTile;
487 PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
488 PFN_UPDATE_STATS pfnUpdateStats;
489 PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
490
491
492 // Global Stats
493 SWR_STATS* pStats;
494
495 // Scratch space for workers.
496 uint8_t** ppScratch;
497
498 volatile int32_t drawsOutstandingFE;
499
500 CachingAllocator cachingArenaAllocator;
501 uint32_t frameCount;
502
503 uint32_t lastFrameChecked;
504 uint64_t lastDrawChecked;
505 TileSet singleThreadLockedTiles;
506
507 // ArchRast thread contexts.
508 HANDLE* pArContext;
509 };
510
// Statistics helpers. Both expect 'pDC' in scope at the point of use;
// UPDATE_STAT_BE additionally expects 'workerId'.
// Each adds 'count' to the named counter only when the matching enable bit
// in the API state is set. The do { } while (0) wrapper makes the expansion
// a single statement, so the macros are safe inside unbraced if/else;
// callers must terminate the invocation with a semicolon.
#define UPDATE_STAT_BE(name, count) do { if (GetApiState(pDC).enableStatsBE) { pDC->dynState.pStats[workerId].name += (count); } } while (0)
#define UPDATE_STAT_FE(name, count) do { if (GetApiState(pDC).enableStatsFE) { pDC->dynState.statsFE.name += (count); } } while (0)
513
// ArchRast instrumentation framework
// AR_WORKER_CTX expects 'pContext' and 'workerId' in scope; AR_API_CTX
// expects 'pContext' and uses the slot after the last worker thread.
#define AR_WORKER_CTX pContext->pArContext[workerId]
#define AR_API_CTX pContext->pArContext[pContext->NumWorkerThreads]

// Three build configurations:
//   KNOB_ENABLE_AR     - events are dispatched through ArchRast.
//   KNOB_ENABLE_RDTSC  - begin/end map to RDTSC_START / RDTSC_STOP; events are dropped.
//   neither            - begin only references ctx; end and events expand to nothing.
#ifdef KNOB_ENABLE_AR
    #define _AR_BEGIN(ctx, type, id)    ArchRast::Dispatch(ctx, ArchRast::Start(ArchRast::type, id))
    #define _AR_END(ctx, type, count)   ArchRast::Dispatch(ctx, ArchRast::End(ArchRast::type, count))
    #define _AR_EVENT(ctx, event)       ArchRast::Dispatch(ctx, ArchRast::event)
#else
    #ifdef KNOB_ENABLE_RDTSC
        // NOTE(review): this _AR_BEGIN expands to two statements, so it must not be
        // used as the body of an unbraced if/else. Left as-is because the
        // definition of RDTSC_START is not visible here.
        #define _AR_BEGIN(ctx, type, id)    (void)ctx; RDTSC_START(type)
        #define _AR_END(ctx, type, count)   RDTSC_STOP(type, count, 0)
    #else
        #define _AR_BEGIN(ctx, type, id)    (void)ctx
        #define _AR_END(ctx, type, count)
    #endif
    #define _AR_EVENT(ctx, event)
#endif

// Use these macros for api thread.
#define AR_API_BEGIN(type, id) _AR_BEGIN(AR_API_CTX, type, id)
#define AR_API_END(type, count) _AR_END(AR_API_CTX, type, count)
#define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)

// Use these macros for worker threads.
#define AR_BEGIN(type, id) _AR_BEGIN(AR_WORKER_CTX, type, id)
#define AR_END(type, count) _AR_END(AR_WORKER_CTX, type, count)
#define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
542