• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file context.h
24 *
25 * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
26 *        The SWR_CONTEXT is our global context and contains the DC ring,
27 *        thread state, etc.
28 *
29 *        The DRAW_CONTEXT contains all state associated with a draw operation.
30 *
31 ******************************************************************************/
32 #pragma once
33 
34 #include <condition_variable>
35 #include <algorithm>
36 
37 #include "core/api.h"
38 #include "core/utils.h"
39 #include "core/arena.h"
40 #include "core/fifo.hpp"
41 #include "core/knobs.h"
42 #include "common/simdintrin.h"
43 #include "core/threads.h"
44 #include "ringbuffer.h"
45 #include "archrast/archrast.h"
46 
47 // x.8 fixed point precision values: 8 fractional bits, scale = 1 << 8 = 256
48 #define FIXED_POINT_SHIFT 8
49 #define FIXED_POINT_SCALE 256
50 
51 // x.16 fixed point precision values: 16 fractional bits, scale = 1 << 16 = 65536
52 #define FIXED_POINT16_SHIFT 16
53 #define FIXED_POINT16_SCALE 65536
54 
// Forward declarations: both structs are defined in full later in this header, but the
// work-function typedefs and work descriptors in between already need pointers to them.
55 struct SWR_CONTEXT;
56 struct DRAW_CONTEXT;
57 
// Flags and per-primitive values carried along with a triangle. Embedded as `triFlags`
// in both SWR_TRIANGLE_DESC and TRIANGLE_WORK_DESC.
//
// The four bit-fields are sized to total exactly 32 bits: 1 + 1 + the coverage mask,
// with `reserved` taking whatever is left. This requires
// SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM to be smaller than 30, otherwise `reserved` would
// have a non-positive width.
58 struct TRI_FLAGS
59 {
60     uint32_t frontFacing : 1;
61     uint32_t yMajor : 1;
62     uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); // width = number of positions in one SIMD tile
63     uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); // pads the bit-field group out to 32 bits
64     float pointSize;
65     uint32_t primID;
66     uint32_t renderTargetArrayIndex;
67     uint32_t viewportIndex;
68 };
69 
70 //////////////////////////////////////////////////////////////////////////
71 /// SWR_TRIANGLE_DESC
///  Per-triangle data handed by reference to the backend function
///  (see PFN_BACKEND_FUNC later in this header).
72 /////////////////////////////////////////////////////////////////////////
73 struct SWR_TRIANGLE_DESC
74 {
    // Three floats each. NOTE(review): presumably the A/B/C plane-equation coefficients
    // that BarycentricCoeffs (below) broadcasts into SIMD registers - confirm.
75     float I[3];
76     float J[3];
77     float Z[3];
78     float OneOverW[3];
79     float recipDet;
80 
    // Buffers referenced by the descriptor; who allocates and frees them is not
    // visible in this header.
81     float *pRecipW;
82     float *pAttribs;
83     float *pPerspAttribs;
84     float *pSamplePos;
85     float *pUserClipBuffer;
86 
87     uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; // one 64-bit mask per multisample slot
88     uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered
89     uint64_t anyCoveredSamples;
90 
91     TRI_FLAGS triFlags;
92 };
93 
// Triangle payload of a backend work item; stored as the `tri` member of the
// BE_WORK::desc union.
94 struct TRIANGLE_WORK_DESC
95 {
96     float *pTriBuffer;
97     float *pAttribs;
98     float *pUserClipBuffer;
99     uint32_t numAttribs;      // NOTE(review): presumably the attribute count behind pAttribs - confirm
100     TRI_FLAGS triFlags;
101 };
102 
// Parameters of a clear operation; stored as the `clear` member of both the
// BE_WORK::desc and FE_WORK::desc unions.
103 struct CLEAR_DESC
104 {
105     SWR_RECT rect;
106     uint32_t attachmentMask;
107     uint32_t renderTargetArrayIndex;
108     float clearRTColor[4];  // RGBA_32F
109     float clearDepth;   // [0..1]
110     uint8_t clearStencil;
111 };
112 
// Parameters of a discard/invalidate-tiles operation; stored as the
// `discardInvalidateTiles` member of both the BE_WORK::desc and FE_WORK::desc unions.
113 struct DISCARD_INVALIDATE_TILES_DESC
114 {
115     uint32_t attachmentMask;
116     SWR_RECT rect;
117     SWR_TILE_STATE newTileState;
118     bool createNewTiles;
119     bool fullTilesOnly;
120 };
121 
// A callback plus three opaque 64-bit user values. Stored as the `sync` member of the
// BE_WORK::desc and FE_WORK::desc unions and as DRAW_CONTEXT::retireCallback.
// NOTE(review): the userData values are presumably passed to pfnCallbackFunc when it is
// invoked; the call site is not in this header - confirm.
122 struct SYNC_DESC
123 {
124     PFN_CALLBACK_FUNC pfnCallbackFunc;
125     uint64_t userData;
126     uint64_t userData2;
127     uint64_t userData3;
128 };
129 
// Parameters of a store-tiles operation; stored as the `storeTiles` member of both the
// BE_WORK::desc and FE_WORK::desc unions.
130 struct STORE_TILES_DESC
131 {
132     uint32_t attachmentMask;
133     SWR_TILE_STATE postStoreTileState;
134     SWR_RECT rect;
135 };
136 
// Thread-group counts along X, Y and Z for a compute dispatch.
// Not referenced by any other declaration in this header.
137 struct COMPUTE_DESC
138 {
139     uint32_t threadGroupCountX;
140     uint32_t threadGroupCountY;
141     uint32_t threadGroupCountZ;
142 };
143 
// Signature of the function stored in BE_WORK::pfnWork. Receives the draw context, a
// worker id, a macrotile id and an untyped pointer to the work descriptor.
144 typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc);
145 
// Kind of work item; used as the `type` field of both BE_WORK and FE_WORK.
// Enumerators take the default values 0 (SYNC) through 5 (SHUTDOWN).
146 enum WORK_TYPE
147 {
148     SYNC,
149     DRAW,
150     CLEAR,
151     DISCARDINVALIDATETILES,
152     STORETILES,
153     SHUTDOWN,
154 };
155 
// Backend work item: a work type, the function that processes it, and a union holding
// the descriptor for that work.
// NOTE(review): `type` presumably identifies which member of `desc` is valid; nothing
// in this header enforces that - confirm against the code that fills in BE_WORK.
// OSALIGNSIMD is defined outside this file; presumably it applies SIMD-width alignment.
// NOTE(review): the text before the second "OSALIGNSIMD(struct)" on the next line looks
// like code-browser residue (symbol label + listing line number), not source.
OSALIGNSIMD(struct)156 OSALIGNSIMD(struct) BE_WORK
157 {
158     WORK_TYPE type;
159     PFN_WORK_FUNC pfnWork;
160     union
161     {
162         SYNC_DESC sync;
163         TRIANGLE_WORK_DESC tri;
164         CLEAR_DESC clear;
165         DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
166         STORE_TILES_DESC storeTiles;
167     } desc;
168 };
169 
// Parameters of one draw; stored as the `draw` member of the FE_WORK::desc union.
// The two anonymous unions overlay the indexed and non-indexed variants of the same
// slot, as the per-field comments describe. Nothing in this struct records which
// variant is active.
170 struct DRAW_WORK
171 {
172     DRAW_CONTEXT*   pDC;
173     union
174     {
175         uint32_t   numIndices;      // DrawIndexed: Number of indices for draw.
176         uint32_t   numVerts;        // Draw: Number of verts (triangles, lines, etc)
177     };
178     union
179     {
180         const int32_t* pIB;        // DrawIndexed: App supplied indices
181         uint32_t   startVertex;    // Draw: Starting vertex in VB to render from.
182     };
183     int32_t    baseVertex;
184     uint32_t   numInstances;        // Number of instances
185     uint32_t   startInstance;       // Instance offset
186     uint32_t   startPrimID;         // starting primitiveID for this draw batch
187     uint32_t   startVertexID;       // starting VertexID for this draw batch (only needed for non-indexed draws)
188     SWR_FORMAT type;                // index buffer type
189 };
190 
// Signature of the function stored in FE_WORK::pfnWork. Unlike PFN_WORK_FUNC it also
// receives the SWR_CONTEXT and takes no macrotile id.
191 typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc);
// Frontend work item: same layout idea as BE_WORK, but the union carries a DRAW_WORK
// (`draw`) where BE_WORK carries a TRIANGLE_WORK_DESC. One FE_WORK is embedded in every
// DRAW_CONTEXT as `FeWork`.
// NOTE(review): `type` presumably identifies which member of `desc` is valid - confirm.
192 struct FE_WORK
193 {
194     WORK_TYPE type;
195     PFN_FE_WORK_FUNC pfnWork;
196     union
197     {
198         SYNC_DESC sync;
199         DRAW_WORK draw;
200         CLEAR_DESC clear;
201         DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
202         STORE_TILES_DESC storeTiles;
203     } desc;
204 };
205 
// Four float arrays (left/right/top/bottom), each with KNOB_NUM_VIEWPORTS_SCISSORS
// entries. Stored in API_STATE as `gbState`.
// NOTE(review): presumably indexed by viewport index, matching API_STATE::vp - confirm.
206 struct GUARDBANDS
207 {
208     float left[KNOB_NUM_VIEWPORTS_SCISSORS];
209     float right[KNOB_NUM_VIEWPORTS_SCISSORS];
210     float top[KNOB_NUM_VIEWPORTS_SCISSORS];
211     float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
212 };
213 
// Forward declaration: PA_STATE is only used by reference in the typedef below and is
// defined outside this header.
214 struct PA_STATE;
215 
216 // function signature for pipeline stages that execute after primitive assembly
//   The selected function is stored per draw in DRAW_STATE::pfnProcessPrims.
217 typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
218     uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
219 
// Aggregate of the pipeline state for a draw, grouped by stage (fetch, VS, GS, CS,
// frontend, streamout, tessellation, rasterizer, backend, PS, output merger).
// Stored by value as DRAW_STATE::state and read through GetApiState(), which returns it
// by const reference.
// OSALIGNLINE is defined outside this file; presumably it applies cache-line alignment.
// NOTE(review): the text before the second "OSALIGNLINE(struct)" on the next line looks
// like code-browser residue (symbol label + listing line number), not source.
OSALIGNLINE(struct)220 OSALIGNLINE(struct) API_STATE
221 {
222     // Vertex Buffers
223     SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
224 
225     // Index Buffer
226     SWR_INDEX_BUFFER_STATE  indexBuffer;
227 
228     // FS - Fetch Shader State
229     PFN_FETCH_FUNC          pfnFetchFunc;
230 
231     // VS - Vertex Shader State
232     PFN_VERTEX_FUNC         pfnVertexFunc;
233 
234     // GS - Geometry Shader State
235     PFN_GS_FUNC             pfnGsFunc;
236     SWR_GS_STATE            gsState;
237 
238     // CS - Compute Shader
239     PFN_CS_FUNC             pfnCsFunc;
240     uint32_t                totalThreadsInGroup;
241     uint32_t                totalSpillFillSize;
242 
243     // FE - Frontend State
244     SWR_FRONTEND_STATE      frontendState;
245 
246     // SOS - Streamout Shader State
247     PFN_SO_FUNC             pfnSoFunc[MAX_SO_STREAMS];
248 
249     // Streamout state
250     SWR_STREAMOUT_STATE     soState;
    // `mutable` lets soBuffer be written through the const API_STATE& that
    // GetApiState() returns.
251     mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
252 
253     // Tessellation State
254     PFN_HS_FUNC             pfnHsFunc;
255     PFN_DS_FUNC             pfnDsFunc;
256     SWR_TS_STATE            tsState;
257 
258     // Number of attributes used by the frontend (vs, so, gs)
259     uint32_t                feNumAttributes;
260 
261     PRIMITIVE_TOPOLOGY      topology;
262     bool                    forceFront;
263 
264     // RS - Rasterizer State
265     SWR_RASTSTATE           rastState;
266     // floating point multisample offsets
    //   Two floats per sample slot. NOTE(review): presumably an x/y pair - confirm.
267     float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
268 
269     GUARDBANDS               gbState;
270 
271     SWR_VIEWPORT            vp[KNOB_NUM_VIEWPORTS_SCISSORS];
272     SWR_VIEWPORT_MATRICES   vpMatrices;
273 
274     SWR_RECT                scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
275     SWR_RECT                scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
276     bool                    scissorsTileAligned;
277 
278     // Backend state
279     SWR_BACKEND_STATE       backendState;
280 
281     SWR_DEPTH_BOUNDS_STATE  depthBoundsState;
282 
283     // PS - Pixel shader state
284     SWR_PS_STATE            psState;
285 
286     SWR_DEPTH_STENCIL_STATE depthStencilState;
287 
288     // OM - Output Merger State
289     SWR_BLEND_STATE         blendState;
290     PFN_BLEND_JIT_FUNC      pfnBlendFunc[SWR_NUM_RENDERTARGETS];
291 
    // Anonymous struct of enable bits (12 bits used in total). The two enableStats*
    // bits are what the UPDATE_STAT_BE / UPDATE_STAT_FE macros test.
292     struct
293     {
294         uint32_t enableStatsFE : 1;             // Enable frontend pipeline stats
295         uint32_t enableStatsBE : 1;             // Enable backend pipeline stats
296         uint32_t colorHottileEnable : 8;        // Bitmask of enabled color hottiles
297         uint32_t depthHottileEnable: 1;         // Enable depth buffer hottile
298         uint32_t stencilHottileEnable : 1;      // Enable stencil buffer hottile
299     };
300 
301     PFN_QUANTIZE_DEPTH      pfnQuantizeDepth;
302 };
303 
// Forward declarations: only pointers to these classes are used in this header
// (DRAW_CONTEXT and SWR_CONTEXT); the definitions live elsewhere.
304 class MacroTileMgr;
305 class DispatchQueue;
306 
// Byte pointers to the output surfaces: one per render target, plus depth and stencil.
// Passed by reference as the last argument of PFN_BACKEND_FUNC.
307 struct RenderOutputBuffers
308 {
309     uint8_t* pColor[SWR_NUM_RENDERTARGETS];
310     uint8_t* pDepth;
311     uint8_t* pStencil;
312 };
313 
314 // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
//   Thirteen simdscalar members: a/b/c triples for I, J and Z, the reciprocal
//   determinant, and three 1/W terms. Consumed through const reference by the
//   PFN_CALC_*_BARYCENTRICS function types declared below.
//   NOTE(review): each member is presumably a scalar broadcast across all SIMD lanes,
//   mirroring the I/J/Z/OneOverW/recipDet floats of SWR_TRIANGLE_DESC - confirm.
315 struct BarycentricCoeffs
316 {
317     simdscalar vIa;
318     simdscalar vIb;
319     simdscalar vIc;
320 
321     simdscalar vJa;
322     simdscalar vJb;
323     simdscalar vJc;
324 
325     simdscalar vZa;
326     simdscalar vZb;
327     simdscalar vZc;
328 
329     simdscalar vRecipDet;
330 
331     simdscalar vAOneOverW;
332     simdscalar vBOneOverW;
333     simdscalar vCOneOverW;
334 };
335 
336 // pipeline function pointer types
//   Parameter names are omitted in the declarations below, so the meaning of the
//   unnamed uint32_t / simdscalar arguments has to be taken from the implementations.
//   PFN_BACKEND_FUNC: draw context, three uint32_t values, the triangle descriptor and
//   the output buffers.
337 typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
//   PFN_OUTPUT_MERGER: takes the render-target pointer array and the blend-function
//   array by reference-to-array, so both must have exactly SWR_NUM_RENDERTARGETS entries.
338 typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
339                                  const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar);
//   PFN_CALC_*_BARYCENTRICS: read the coefficients (const) and take the PS context by
//   non-const reference.
340 typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
341 typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
342 typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
343                                               const simdscalar, const simdscalar);
344 
// Backend entry point selected for a draw; stored as DRAW_STATE::backendFuncs.
// Currently wraps a single function pointer.
345 struct BACKEND_FUNCS
346 {
347     PFN_BACKEND_FUNC pfnBackend;
348 };
349 
350 // Draw State
//   One entry of the draw-state ring (SWR_CONTEXT::dsRing). DRAW_CONTEXT::pState
//   points at an entry, so several draw contexts can share one DRAW_STATE.
351 struct DRAW_STATE
352 {
353     API_STATE state;
354 
355     void* pPrivateState;  // The driver is required to set this up for each draw. Returned by GetPrivateState().
356 
357     // pipeline function pointers, filled in by API thread when setting up the draw
358     BACKEND_FUNCS backendFuncs;
359     PFN_PROCESS_PRIMS pfnProcessPrims;
360 
361     CachingArena* pArena;     // This should only be used by API thread.
362 };
363 
// Per-draw state that changes while the draw executes (stream-out write offsets and
// statistics). Embedded in DRAW_CONTEXT as `dynState`.
364 struct DRAW_DYNAMIC_STATE
365 {
    // Zeroes every member of this struct except the pStats pointer, which is saved and
    // restored around the memset, then zeroes the first numThreads SWR_STATS entries
    // that pStats points to.
    // The statement order matters: the pointer must be saved before the struct is
    // cleared. pStats is dereferenced without a null check, so it must already point
    // to at least numThreads entries.
    // NOTE(review): "ResetDRAW_DYNAMIC_STATE366" at the start of the next line looks
    // like code-browser residue (symbol label + listing line number), not source.
ResetDRAW_DYNAMIC_STATE366     void Reset(uint32_t numThreads)
367     {
368         SWR_STATS* pSavePtr = pStats;
369         memset(this, 0, sizeof(*this));
370         pStats = pSavePtr;
371         memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
372     }
373     ///@todo Currently assumes only a single FE can do stream output for a draw.
374     uint32_t SoWriteOffset[4];
375     bool     SoWriteOffsetDirty[4];
376 
377     SWR_STATS_FE statsFE;   // Only one FE thread per DC.
378     SWR_STATS*   pStats;    // Indexed by workerId in UPDATE_STAT_BE; Reset() clears numThreads entries.
379 };
380 
381 // Draw Context
382 //    The api thread sets up a draw context that exists for the life of the draw.
383 //    This draw context maintains all of the state needed for the draw operation.
//    Instances live in the SWR_CONTEXT::dcRing ring buffer. The static_assert after the
//    struct requires sizeof(DRAW_CONTEXT) to be a multiple of 64 bytes, so any change to
//    the members has to preserve that.
384 struct DRAW_CONTEXT
385 {
386     SWR_CONTEXT*    pContext;
    // The two pointers share storage.
    // NOTE(review): presumably pDispatch is the active member when isCompute is true
    // and pTileMgr otherwise - confirm.
387     union
388     {
389         MacroTileMgr*   pTileMgr;
390         DispatchQueue*  pDispatch;      // Queue for thread groups. (isCompute)
391     };
392     DRAW_STATE*     pState;             // Read-only state. Core should not update this outside of API thread.
393     DRAW_DYNAMIC_STATE dynState;
394 
395     CachingArena*   pArena;
396 
397     uint32_t        drawId;
398     bool            dependentFE;    // Frontend work is dependent on all previous FE
399     bool            dependent;      // Backend work is dependent on all previous BE
400     bool            isCompute;      // Is this DC a compute context?
401     bool            cleanupState;   // True if this is the last draw using an entry in the state ring.
402     volatile bool   doneFE;         // Is FE work done for this draw?
403 
404     FE_WORK         FeWork;
405 
    // FeLock carries its own OSALIGNLINE alignment request (macro defined elsewhere).
406     volatile OSALIGNLINE(uint32_t)   FeLock;
407     volatile int32_t    threadsDone;
408 
409     SYNC_DESC       retireCallback; // Call this func when this DC is retired.
410 
411 
412 };
413 
// Fails the build unless sizeof(DRAW_CONTEXT) is a multiple of 64 bytes.
414 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
415 
GetApiState(const DRAW_CONTEXT * pDC)416 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
417 {
418     SWR_ASSERT(pDC != nullptr);
419     SWR_ASSERT(pDC->pState != nullptr);
420 
421     return pDC->pState->state;
422 }
423 
GetPrivateState(const DRAW_CONTEXT * pDC)424 INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
425 {
426     SWR_ASSERT(pDC != nullptr);
427     SWR_ASSERT(pDC->pState != nullptr);
428 
429     return pDC->pState->pPrivateState;
430 }
431 
// Forward declaration: SWR_CONTEXT only holds a pointer to the hot tile manager.
432 class HotTileMgr;
433 
// Global rasterizer context: owns the draw-context and draw-state rings, the thread
// pool, the driver callbacks and global statistics (see the file header comment).
434 struct SWR_CONTEXT
435 {
436     // Draw Context Ring
437     //  Each draw needs its own state in order to support multiple draws in flight across multiple threads.
438     //  We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
439     //  of draws that can be in flight at any given time.
440     //
441     //  Description:
442     //  1. State - When an application first sets state we'll request a new draw context to use.
443     //     a. If there are no available draw contexts then we'll have to wait until one becomes free.
444     //     b. If one is available then set pCurDrawContext to point to it and mark it in use.
445     //     c. All state calls set state on pCurDrawContext.
446     //  2. Draw - Creates and submits a work item that is associated with the current draw context.
447     //     a. Set pPrevDrawContext = pCurDrawContext
448     //     b. Set pCurDrawContext to NULL.
449     //  3. State - When an application sets state after a draw
450     //     a. Same as step 1.
451     //     b. State is copied from prev draw context to current.
452     RingBuffer<DRAW_CONTEXT> dcRing;
453 
454     DRAW_CONTEXT *pCurDrawContext;    // This points to DC entry in ring for an unsubmitted draw.
455     DRAW_CONTEXT *pPrevDrawContext;   // This points to DC entry for the previous context submitted that we can copy state from.
456 
    // NOTE(review): the "Array" suffix suggests one manager/queue per draw context in
    // the ring (DRAW_CONTEXT::pTileMgr / pDispatch point at single elements) - confirm.
457     MacroTileMgr* pMacroTileManagerArray;
458     DispatchQueue* pDispatchQueueArray;
459 
460     // Draw State Ring
461     //  When draws are very large (lots of primitives) then the API thread will break these up.
462     //  These split draws all have identical state. So instead of storing the state directly
463     //  in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
464     //  to reference a single entry in the DS ring.
465     RingBuffer<DRAW_STATE> dsRing;
466 
467     uint32_t curStateId;               // Current index to the next available entry in the DS ring.
468 
469     uint32_t NumWorkerThreads;         // Also the pArContext index used by AR_API_CTX.
470     uint32_t NumFEThreads;
471     uint32_t NumBEThreads;
472 
473     THREAD_POOL threadPool; // Thread pool associated with this context
474     SWR_THREADING_INFO threadInfo;
475 
    // Condition variable and mutex declared as a pair.
    // NOTE(review): presumably workers wait on FifosNotEmpty under WaitLock until work
    // is queued; the wait/notify sites are not in this header - confirm.
476     std::condition_variable FifosNotEmpty;
477     std::mutex WaitLock;
478 
479     uint32_t privateStateSize;
480 
481     HotTileMgr *pHotTileMgr;
482 
483     // Callback functions, passed in at create context time
484     PFN_LOAD_TILE               pfnLoadTile;
485     PFN_STORE_TILE              pfnStoreTile;
486     PFN_CLEAR_TILE              pfnClearTile;
487     PFN_UPDATE_SO_WRITE_OFFSET  pfnUpdateSoWriteOffset;
488     PFN_UPDATE_STATS            pfnUpdateStats;
489     PFN_UPDATE_STATS_FE         pfnUpdateStatsFE;
490 
491 
492     // Global Stats
493     SWR_STATS* pStats;
494 
495     // Scratch space for workers.
496     uint8_t** ppScratch;
497 
498     volatile int32_t  drawsOutstandingFE;
499 
500     CachingAllocator cachingArenaAllocator;
501     uint32_t frameCount;
502 
503     uint32_t lastFrameChecked;
504     uint64_t lastDrawChecked;
505     TileSet singleThreadLockedTiles;
506 
507     // ArchRast thread contexts.
    //   Indexed by workerId for worker threads (AR_WORKER_CTX) and by NumWorkerThreads
    //   for the API thread (AR_API_CTX), so it needs at least NumWorkerThreads + 1
    //   entries.
508     HANDLE* pArContext;
509 };
510 
// Statistics helpers. Each adds `count` to the named counter only when the matching
// enableStats bit of the current API state is set. They expect a DRAW_CONTEXT pointer
// named `pDC` in scope at the expansion site; UPDATE_STAT_BE also expects `workerId`,
// which indexes the per-worker pStats array. UPDATE_STAT_FE writes the single per-DC
// statsFE block.
// NOTE(review): both expand to a bare `if (...) { ... }`, so an `else` written directly
// after an unbraced use would bind to the macro's own `if`. Call sites are not visible
// from this header, so the expansion is left unchanged; use the macros only as complete
// statements or inside braces.
511 #define UPDATE_STAT_BE(name, count) if (GetApiState(pDC).enableStatsBE) { pDC->dynState.pStats[workerId].name += count; }
512 #define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStatsFE) { pDC->dynState.statsFE.name += count; }
513 
514 // ArchRast instrumentation framework
//   Context selectors. Both expect an SWR_CONTEXT pointer named `pContext` in scope;
//   AR_WORKER_CTX also expects `workerId`. The API thread uses the slot at index
//   NumWorkerThreads, i.e. the one after the slots indexed by worker id.
515 #define AR_WORKER_CTX  pContext->pArContext[workerId]
516 #define AR_API_CTX     pContext->pArContext[pContext->NumWorkerThreads]
517 
//   Three build configurations:
//     KNOB_ENABLE_AR defined    - begin/end/event are dispatched through ArchRast.
//     KNOB_ENABLE_RDTSC defined - begin/end map to RDTSC_START/RDTSC_STOP; events
//                                 expand to nothing.
//     neither defined           - begin only evaluates (void)ctx; end and event expand
//                                 to nothing.
518 #ifdef KNOB_ENABLE_AR
519     #define _AR_BEGIN(ctx, type, id)    ArchRast::Dispatch(ctx, ArchRast::Start(ArchRast::type, id))
520     #define _AR_END(ctx, type, count)   ArchRast::Dispatch(ctx, ArchRast::End(ArchRast::type, count))
521     #define _AR_EVENT(ctx, event)       ArchRast::Dispatch(ctx, ArchRast::event)
522 #else
523     #ifdef KNOB_ENABLE_RDTSC
        // NOTE(review): this _AR_BEGIN expands to two statements, so under an unbraced
        // `if` only the (void)ctx part would be conditional. Left unchanged because
        // RDTSC_START's definition and the call sites are not visible here.
        // The third parameter of _AR_END is named `id` in these branches but receives
        // the `count` argument of AR_END / AR_API_END.
524         #define _AR_BEGIN(ctx, type, id) (void)ctx; RDTSC_START(type)
525         #define _AR_END(ctx, type, id)   RDTSC_STOP(type, id, 0)
526     #else
527         #define _AR_BEGIN(ctx, type, id) (void)ctx
528         #define _AR_END(ctx, type, id)
529     #endif
530     #define _AR_EVENT(ctx, event)
531 #endif
532 
533 // Use these macros for api thread.
534 #define AR_API_BEGIN(type, id) _AR_BEGIN(AR_API_CTX, type, id)
535 #define AR_API_END(type, count) _AR_END(AR_API_CTX, type, count)
536 #define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)
537 
538 // Use these macros for worker threads.
539 #define AR_BEGIN(type, id) _AR_BEGIN(AR_WORKER_CTX, type, id)
540 #define AR_END(type, count) _AR_END(AR_WORKER_CTX, type, count)
541 #define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
542