1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 * operations.
27 *
28 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "backend_impl.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37 #include "backends/gen_BackendPixelRate.hpp"
38
39 #include <algorithm>
40
41
42 //////////////////////////////////////////////////////////////////////////
43 /// @brief Process compute work.
44 /// @param pDC - pointer to draw context (dispatch).
45 /// @param workerId - The unique worker ID that is assigned to this thread.
46 /// @param threadGroupId - the linear index for the thread group within the dispatch.
ProcessComputeBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t threadGroupId,void * & pSpillFillBuffer,void * & pScratchSpace)47 void ProcessComputeBE(DRAW_CONTEXT* pDC,
48 uint32_t workerId,
49 uint32_t threadGroupId,
50 void*& pSpillFillBuffer,
51 void*& pScratchSpace)
52 {
53 SWR_CONTEXT* pContext = pDC->pContext;
54
55 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEDispatch, pDC->drawId);
56
57 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
58 SWR_ASSERT(pTaskData != nullptr);
59
60 // Ensure spill fill memory has been allocated.
61 size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
62 if (spillFillSize && pSpillFillBuffer == nullptr)
63 {
64 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD16_BYTES);
65 }
66
67 size_t scratchSpaceSize =
68 pDC->pState->state.scratchSpaceSizePerWarp * pDC->pState->state.scratchSpaceNumWarps;
69 if (scratchSpaceSize && pScratchSpace == nullptr)
70 {
71 pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD16_BYTES);
72 }
73
74 const API_STATE& state = GetApiState(pDC);
75
76 SWR_CS_CONTEXT csContext{0};
77 csContext.tileCounter = threadGroupId;
78 csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
79 csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
80 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
81 csContext.pTGSM = pContext->ppScratch[workerId];
82 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
83 csContext.pScratchSpace = (uint8_t*)pScratchSpace;
84 csContext.scratchSpacePerWarp = pDC->pState->state.scratchSpaceSizePerWarp;
85
86 state.pfnCsFunc(GetPrivateState(pDC),
87 pContext->threadPool.pThreadData[workerId].pWorkerPrivateData,
88 &csContext);
89
90 UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
91 AR_EVENT(CSStats((HANDLE)&csContext.stats));
92
93 RDTSC_END(pDC->pContext->pBucketMgr, BEDispatch, 1);
94 }
95
96 //////////////////////////////////////////////////////////////////////////
97 /// @brief Process shutdown.
98 /// @param pDC - pointer to draw context (dispatch).
99 /// @param workerId - The unique worker ID that is assigned to this thread.
100 /// @param threadGroupId - the linear index for the thread group within the dispatch.
ProcessShutdownBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pUserData)101 void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
102 {
103 // Dummy function
104 }
105
ProcessSyncBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pUserData)106 void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
107 {
108 uint32_t x, y;
109 MacroTileMgr::getTileIndices(macroTile, x, y);
110 SWR_ASSERT(x == 0 && y == 0);
111 }
112
ProcessStoreTileBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,STORE_TILES_DESC * pDesc,SWR_RENDERTARGET_ATTACHMENT attachment)113 void ProcessStoreTileBE(DRAW_CONTEXT* pDC,
114 uint32_t workerId,
115 uint32_t macroTile,
116 STORE_TILES_DESC* pDesc,
117 SWR_RENDERTARGET_ATTACHMENT attachment)
118 {
119 SWR_CONTEXT* pContext = pDC->pContext;
120 HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
121
122 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEStoreTiles, pDC->drawId);
123
124 SWR_FORMAT srcFormat;
125 switch (attachment)
126 {
127 case SWR_ATTACHMENT_COLOR0:
128 case SWR_ATTACHMENT_COLOR1:
129 case SWR_ATTACHMENT_COLOR2:
130 case SWR_ATTACHMENT_COLOR3:
131 case SWR_ATTACHMENT_COLOR4:
132 case SWR_ATTACHMENT_COLOR5:
133 case SWR_ATTACHMENT_COLOR6:
134 case SWR_ATTACHMENT_COLOR7:
135 srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
136 break;
137 case SWR_ATTACHMENT_DEPTH:
138 srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT;
139 break;
140 case SWR_ATTACHMENT_STENCIL:
141 srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT;
142 break;
143 default:
144 SWR_INVALID("Unknown attachment: %d", attachment);
145 srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
146 break;
147 }
148
149 uint32_t x, y;
150 MacroTileMgr::getTileIndices(macroTile, x, y);
151
152 // Only need to store the hottile if it's been rendered to...
153 HOTTILE* pHotTile =
154 pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
155 if (pHotTile)
156 {
157 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
158 if (pHotTile->state == HOTTILE_CLEAR)
159 {
160 PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
161 SWR_ASSERT(pfnClearTiles != nullptr);
162
163 pfnClearTiles(pDC,
164 hWorkerPrivateData,
165 attachment,
166 macroTile,
167 pHotTile->renderTargetArrayIndex,
168 pHotTile->clearData,
169 pDesc->rect);
170 }
171
172 if (pHotTile->state == HOTTILE_DIRTY ||
173 pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
174 {
175 int32_t destX = KNOB_MACROTILE_X_DIM * x;
176 int32_t destY = KNOB_MACROTILE_Y_DIM * y;
177
178 pContext->pfnStoreTile(pDC,
179 hWorkerPrivateData,
180 srcFormat,
181 attachment,
182 destX,
183 destY,
184 pHotTile->renderTargetArrayIndex,
185 pHotTile->pBuffer);
186 }
187
188 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
189 {
190 if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY &&
191 pHotTile->state == HOTTILE_RESOLVED))
192 {
193 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
194 }
195 }
196 }
197 RDTSC_END(pDC->pContext->pBucketMgr, BEStoreTiles, 1);
198 }
199
ProcessStoreTilesBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)200 void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
201 {
202 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pData;
203
204 unsigned long rt = 0;
205 uint32_t mask = pDesc->attachmentMask;
206 while (_BitScanForward(&rt, mask))
207 {
208 mask &= ~(1 << rt);
209 ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
210 }
211 }
212
ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)213 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
214 uint32_t workerId,
215 uint32_t macroTile,
216 void* pData)
217 {
218 DISCARD_INVALIDATE_TILES_DESC* pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pData;
219 SWR_CONTEXT* pContext = pDC->pContext;
220
221 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
222
223 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
224 {
225 if (pDesc->attachmentMask & (1 << i))
226 {
227 HOTTILE* pHotTile =
228 pContext->pHotTileMgr->GetHotTileNoLoad(pContext,
229 pDC,
230 macroTile,
231 (SWR_RENDERTARGET_ATTACHMENT)i,
232 pDesc->createNewTiles,
233 numSamples);
234 if (pHotTile)
235 {
236 HOTTILE_STATE newState = (HOTTILE_STATE)pDesc->newTileState;;
237 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_CLEAR)
238 {
239 if (newState == HOTTILE_INVALID)
240 {
241 // This is OK for APIs that explicitly allow discards
242 // (for e.g. depth / stencil data)
243 //SWR_INVALID("Discarding valid data!");
244 }
245 }
246 pHotTile->state = newState;
247 }
248 }
249 }
250 }
251
//////////////////////////////////////////////////////////////////////////
/// @brief Backend that performs depth bounds, user clip, depth/stencil test
///        and depth/stencil write for one tile. No pixel shader is invoked
///        in this function.
/// @tparam sampleCountT - multisample count selector; not referenced in the
///         body below. The active samples are taken from the blend state
///         sample mask instead.
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param x - starting x coordinate of the tile; the inner loop covers
///            [x, x + KNOB_TILE_X_DIM).
/// @param y - starting y coordinate of the tile; the outer loop covers
///            [y, y + KNOB_TILE_Y_DIM).
/// @param work - triangle descriptor. Its per-sample coverage masks are
///               consumed (shifted right) as SIMD tiles are processed.
/// @param renderBuffers - source of the depth and stencil buffer pointers.
template <uint32_t sampleCountT>
void BackendNullPS(DRAW_CONTEXT*        pDC,
                   uint32_t             workerId,
                   uint32_t             x,
                   uint32_t             y,
                   SWR_TRIANGLE_DESC&   work,
                   RenderOutputBuffers& renderBuffers)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BENullBackend, pDC->drawId);
    ///@todo: handle center multisample pattern
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    // Only depth and stencil pointers are requested; the first argument is NULL.
    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);

    // psContext is intentionally not fully set up; only the fields written
    // below (vX/vY sample positions, barycentrics, vZ) are used here.
    SWR_PS_CONTEXT psContext;
    // skip SetupPixelShaderContext(&psContext, ...); // not needed here

    RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);

    // Y positions for the first SIMD tile row, offset by the tile's y.
    simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    // Step applied to the Y positions after each SIMD tile row.
    const simdscalar           dy        = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        // X positions restart at the tile's x for every row.
        simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        // Step applied to the X positions after each SIMD tile.
        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
            // iterate over active samples (set bits of the blend state sample mask,
            // lowest bit first)
            unsigned long sample     = 0;
            uint32_t      sampleMask = state.blendState.sampleMask;
            while (_BitScanForward(&sample, sampleMask))
            {
                // Clear the bit for the sample being processed.
                sampleMask &= ~(1 << sample);

                // Coverage for the current SIMD tile of this sample.
                // NOTE(review): MASK is defined outside this file section.
                simdmask coverageMask = work.coverageMask[sample] & MASK;

                if (coverageMask)
                {
                    // offset depth/stencil buffers current sample
                    uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
                    uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);

                    // Depth bounds test: compares the depth values already stored in the
                    // depth buffer against [minz, maxz] and narrows the coverage mask.
                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                    {
                        // The float load below requires an R32_FLOAT depth hot tile.
                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
                                      "Unsupported depth hot tile format");

                        const simdscalar z =
                            _simd_load_ps(reinterpret_cast<const float*>(pDepthSample));

                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                    }

                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);

                    // calculate per sample positions
                    psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
                    psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));

                    CalcSampleBarycentrics(coeffs, psContext);

                    // interpolate and quantize z
                    psContext.vZ = vplaneps(coeffs.vZa,
                                            coeffs.vZb,
                                            coeffs.vZc,
                                            psContext.vI.sample,
                                            psContext.vJ.sample);
                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);

                    RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);

                    // interpolate user clip distance if available; lanes flagged by
                    // ComputeUserClipMask are removed from the coverage mask
                    if (state.backendState.clipDistanceMask)
                    {
                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
                                                             work.pUserClipBuffer,
                                                             psContext.vI.sample,
                                                             psContext.vJ.sample);
                    }

                    // Expand the bit mask into a per-lane vector mask. The stencil pass
                    // mask starts as the coverage and is updated by DepthStencilTest.
                    simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
                    simdscalar stencilPassMask = vCoverageMask;

                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
                    simdscalar depthPassMask = DepthStencilTest(&state,
                                                                work.triFlags.frontFacing,
                                                                work.triFlags.viewportIndex,
                                                                psContext.vZ,
                                                                pDepthSample,
                                                                vCoverageMask,
                                                                pStencilSample,
                                                                &stencilPassMask);
                    AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask),
                                                         _simd_movemask_ps(stencilPassMask),
                                                         _simd_movemask_ps(vCoverageMask)));
                    // Write depth/stencil using the masks produced by the test above.
                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                      &state.depthStencilState,
                                      work.triFlags.frontFacing,
                                      psContext.vZ,
                                      pDepthSample,
                                      depthPassMask,
                                      vCoverageMask,
                                      pStencilSample,
                                      stencilPassMask);
                    RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);

                    // Count the lanes that passed the depth test for the statistics.
                    uint32_t statMask  = _simd_movemask_ps(depthPassMask);
                    uint32_t statCount = _mm_popcnt_u32(statMask);
                    UPDATE_STAT_BE(DepthPassCount, statCount);
                }

            // No goto in this function targets this label; ATTR_UNUSED marks it as unused.
            Endtile:
                ATTR_UNUSED;
                // Consume this SIMD tile's bits so the next tile's coverage is in the low bits.
                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }

            // Advance both buffers by one SIMD tile's worth of bytes in their own format.
            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
            pStencilBuffer +=
                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;

            vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
        }

        vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
    }

    RDTSC_END(pDC->pContext->pBucketMgr, BENullBackend, 0);
}
393
// Clear-tile functions, one slot per SWR format. ProcessStoreTileBE indexes this
// table with the hot tile source format.
PFN_CLEAR_TILES  gClearTilesTable[NUM_SWR_FORMATS] = {};
// BackendNullPS instantiations, one slot per multisample type; filled in by
// InitBackendFuncTables().
PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
// Single-sample backend functions; passed to InitBackendSingleFuncTable().
PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid
                                     [2]                           // canEarlyZ
    = {};
// Pixel-rate backend functions.
PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
                                       [SWR_INPUT_COVERAGE_COUNT][2]   // centroid
                                       [2]                             // forcedSampleCount
                                       [2]                             // canEarlyZ
    = {};
// Sample-rate backend functions; passed to InitBackendSampleFuncTable().
PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT]
                                        [2] // centroid
                                        [2] // canEarlyZ
    = {};
408
InitBackendFuncTables()409 void InitBackendFuncTables()
410 {
411 InitBackendPixelRate();
412 InitBackendSingleFuncTable(gBackendSingleSample);
413 InitBackendSampleFuncTable(gBackendSampleRateTable);
414
415 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS<SWR_MULTISAMPLE_1X>;
416 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS<SWR_MULTISAMPLE_2X>;
417 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS<SWR_MULTISAMPLE_4X>;
418 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS<SWR_MULTISAMPLE_8X>;
419 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS<SWR_MULTISAMPLE_16X>;
420 }
421