1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file rasterizer.cpp
24 *
25 * @brief Implementation for the rasterizer.
26 *
27 ******************************************************************************/
28
#include <algorithm>
#include <cstring>
#include <vector>

#include "rasterizer.h"
#include "rdtsc_core.h"
#include "backend.h"
#include "utils.h"
#include "frontend.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
39
// Table of rasterizer work functions; only declared here, the definition is not in
// this section. The dimensions are, in order: multisample type, two 2-entry
// selectors, input coverage mode, valid-triangle-edge state and a final 2-entry
// selector.
// NOTE(review): the meaning of the three 2-entry selectors is not visible here —
// confirm against the table's definition.
40 extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT]
41 [STATE_VALID_TRI_EDGE_COUNT][2];
42
// Forward declarations of hot-tile helpers. Only the signatures are visible in this
// section; the definitions live elsewhere.
43 template <uint32_t numSamples = 1>
44 void GetRenderHotTiles(DRAW_CONTEXT* pDC,
45 uint32_t workerId,
46 uint32_t macroID,
47 uint32_t x,
48 uint32_t y,
49 RenderOutputBuffers& renderBuffers,
50 uint32_t renderTargetArrayIndex);
51 template <typename RT>
52 void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers);
53 template <typename RT>
54 void StepRasterTileY(uint32_t colorHotTileMask,
55 RenderOutputBuffers& buffers,
56 RenderOutputBuffers& startBufferRow);
57
58 #define MASKTOVEC(i3, i2, i1, i0) \
59 { \
60 -i0, -i1, -i2, -i3 \
61 }
62 static const __m256d gMaskToVecpd[] = {
63 MASKTOVEC(0, 0, 0, 0),
64 MASKTOVEC(0, 0, 0, 1),
65 MASKTOVEC(0, 0, 1, 0),
66 MASKTOVEC(0, 0, 1, 1),
67 MASKTOVEC(0, 1, 0, 0),
68 MASKTOVEC(0, 1, 0, 1),
69 MASKTOVEC(0, 1, 1, 0),
70 MASKTOVEC(0, 1, 1, 1),
71 MASKTOVEC(1, 0, 0, 0),
72 MASKTOVEC(1, 0, 0, 1),
73 MASKTOVEC(1, 0, 1, 0),
74 MASKTOVEC(1, 0, 1, 1),
75 MASKTOVEC(1, 1, 0, 0),
76 MASKTOVEC(1, 1, 0, 1),
77 MASKTOVEC(1, 1, 1, 0),
78 MASKTOVEC(1, 1, 1, 1),
79 };
80
/// @brief Integer 2D position (x, y).
struct POS
{
    int32_t x;
    int32_t y;
};
85
86 struct EDGE
87 {
88 double a, b; // a, b edge coefficients in fix8
89 double stepQuadX; // step to adjacent horizontal quad in fix16
90 double stepQuadY; // step to adjacent vertical quad in fix16
91 double stepRasterTileX; // step to adjacent horizontal raster tile in fix16
92 double stepRasterTileY; // step to adjacent vertical raster tile in fix16
93
94 __m256d vQuadOffsets; // offsets for 4 samples of a quad
95 __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
96 };
97
98 //////////////////////////////////////////////////////////////////////////
99 /// @brief Rasterize a raster tile partially covered by the triangle.
100 /// @param pDC - draw context (not referenced inside this function body)
101 /// @param startEdges - one starting edge value per edge; each is broadcast and offset by
102 /// that edge's vQuadOffsets to obtain the values at the 4 samples of the first quad
103 /// @param pRastEdges - per-edge data; vQuadOffsets, stepQuadX and stepQuadY are read here
104 /// @return 64-bit coverage mask holding 4 bits per 2x2 quad
/// @tparam NumEdges - number of edges evaluated
/// @tparam EdgeMaskT - edge-validity trait used by the unrolled 8x8 path.
///         NOTE(review): presumably UnrollerLMask only invokes the lambda for edges
///         enabled in EdgeMaskT::value — confirm against its definition.
///
/// A quad bit is set when the sign bit of every tested edge value is set for that
/// sample (the per-edge masks come from _mm256_movemask_pd and are ANDed together).
105 template <uint32_t NumEdges, typename EdgeMaskT>
rasterizePartialTile(DRAW_CONTEXT * pDC,double startEdges[NumEdges],EDGE * pRastEdges)106 INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT* pDC,
107 double startEdges[NumEdges],
108 EDGE* pRastEdges)
109 {
110 uint64_t coverageMask = 0;
111
112 __m256d vEdges[NumEdges];
113 __m256d vStepX[NumEdges];
114 __m256d vStepY[NumEdges];
115
116 for (uint32_t e = 0; e < NumEdges; ++e)
117 {
118 // Step to the pixel sample locations of the 1st quad
119 vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
120
121 // broadcast the precomputed per-edge steps to the next quad in x and in y
122 vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
123 vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
124 }
125
126 // fast unrolled version for 8x8 tile
127 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
128 int edgeMask[NumEdges];
129 uint64_t mask;
130
// Per-edge operations handed to UnrollerLMask: extract the sign-bit mask, AND it into
// the running quad mask, and step the edge values by +x, +y or -x.
131 auto eval_lambda = [&](int e) { edgeMask[e] = _mm256_movemask_pd(vEdges[e]); };
132 auto update_lambda = [&](int e) { mask &= edgeMask[e]; };
133 auto incx_lambda = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); };
134 auto incy_lambda = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]); };
135 auto decx_lambda = [&](int e) { vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]); };
136
// NOTE: the helper macros below are defined inside the function body and are not
// #undef'd within this function.
137 // evaluate which pixels in the quad are covered
138 #define EVAL UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
139
140 // update coverage mask
141 // if edge 0 is degenerate and will be skipped; init the mask
// (E1E2ValidT / NoEdgesValidT start from 0xf, i.e. all 4 samples of the quad set;
// every other trait starts from edge 0's mask). The quad mask is then shifted to
// bit position 'bit' and ORed into coverageMask.
142 #define UPDATE_MASK(bit) \
143 if (std::is_same<EdgeMaskT, E1E2ValidT>::value || \
144 std::is_same<EdgeMaskT, NoEdgesValidT>::value) \
145 { \
146 mask = 0xf; \
147 } \
148 else \
149 { \
150 mask = edgeMask[0]; \
151 } \
152 UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
153 coverageMask |= (mask << bit);
154
155 // step in the +x direction to the next quad
156 #define INCX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
157
158 // step in the +y direction to the next quad
159 #define INCY UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
160
161 // step in the -x direction to the next quad
162 #define DECX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
163
164 // sweep 2x2 quad back and forth through the raster tile,
165 // computing coverage masks for the entire tile
166
167 // raster tile
168 // 0 1 2 3 4 5 6 7
169 // x x
170 // x x ------------------>
171 // x x |
172 // <-----------------x x V
173 // ..
174
// Quad rows 0 and 2 are walked left to right, rows 1 and 3 right to left. The bit
// offsets passed to UPDATE_MASK descend on the right-to-left rows, so each quad's 4
// bits still land at (quadRow * 16 + quadColumn * 4).
175 // row 0
176 EVAL;
177 UPDATE_MASK(0);
178 INCX;
179 EVAL;
180 UPDATE_MASK(4);
181 INCX;
182 EVAL;
183 UPDATE_MASK(8);
184 INCX;
185 EVAL;
186 UPDATE_MASK(12);
187 INCY;
188
189 // row 1 (right to left)
190 EVAL;
191 UPDATE_MASK(28);
192 DECX;
193 EVAL;
194 UPDATE_MASK(24);
195 DECX;
196 EVAL;
197 UPDATE_MASK(20);
198 DECX;
199 EVAL;
200 UPDATE_MASK(16);
201 INCY;
202
203 // row 2
204 EVAL;
205 UPDATE_MASK(32);
206 INCX;
207 EVAL;
208 UPDATE_MASK(36);
209 INCX;
210 EVAL;
211 UPDATE_MASK(40);
212 INCX;
213 EVAL;
214 UPDATE_MASK(44);
215 INCY;
216
217 // row 3 (right to left)
218 EVAL;
219 UPDATE_MASK(60);
220 DECX;
221 EVAL;
222 UPDATE_MASK(56);
223 DECX;
224 EVAL;
225 UPDATE_MASK(52);
226 DECX;
227 EVAL;
228 UPDATE_MASK(48);
229 #else
// Generic path for other tile sizes: plain nested loops over quads, always left to
// right. Unlike the unrolled path it does not consult EdgeMaskT; all NumEdges edge
// masks are ANDed together.
230 uint32_t bit = 0;
231 for (uint32_t y = 0; y < KNOB_TILE_Y_DIM / 2; ++y)
232 {
// remember the edge values at the start of the row so the y step is applied from there
233 __m256d vStartOfRowEdge[NumEdges];
234 for (uint32_t e = 0; e < NumEdges; ++e)
235 {
236 vStartOfRowEdge[e] = vEdges[e];
237 }
238
239 for (uint32_t x = 0; x < KNOB_TILE_X_DIM / 2; ++x)
240 {
241 int edgeMask[NumEdges];
242 for (uint32_t e = 0; e < NumEdges; ++e)
243 {
244 edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
245 }
246
247 uint64_t mask = edgeMask[0];
248 for (uint32_t e = 1; e < NumEdges; ++e)
249 {
250 mask &= edgeMask[e];
251 }
252 coverageMask |= (mask << bit);
253
254 // step to the next quad in the x direction
255 for (uint32_t e = 0; e < NumEdges; ++e)
256 {
257 vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
258 }
259 bit += 4;
260 }
261
262 // step to the next row
263 for (uint32_t e = 0; e < NumEdges; ++e)
264 {
265 vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
266 }
267 }
268 #endif
269 return coverageMask;
270 }
271 // Top left rule:
272 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
273 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it
274 // is a 'left' edge Top left: a sample is in if it is a top or left edge. Out: !(horizontal &&
275 // above) = !horizontal && below Out: !horizontal && left = !(!horizontal && left) = horizontal and
276 // right
adjustTopLeftRuleIntFix16(const __m128i vA,const __m128i vB,__m256d & vEdge)277 INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d& vEdge)
278 {
279 // if vA < 0, vC--
280 // if vA == 0 && vB < 0, vC--
281
282 __m256d vEdgeOut = vEdge;
283 __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
284
285 // if vA < 0 (line is not horizontal and below)
286 int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
287
288 // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
289 __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
290 int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
291 msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
292
293 // if either of these are true and we're on the line (edge == 0), bump it outside the line
294 vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
295 }
296
//////////////////////////////////////////////////////////////////////////
/// @brief Number of fractional bits by which the result of the manh
///        calculation exceeds the edge precision, taken from compile time
///        trait values.
/// @tparam RT: rasterizer traits; must provide PrecisionT, ConservativePrecisionT
///         and EdgePrecisionT, each exposing BitsT::value
/// @return (PrecisionT bits + ConservativePrecisionT bits) - EdgePrecisionT bits
template <typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    typedef typename RT::PrecisionT::BitsT             RastBitsT;
    typedef typename RT::ConservativePrecisionT::BitsT ConservativeBitsT;
    typedef typename RT::EdgePrecisionT::BitsT         EdgeBitsT;

    // the manh product must carry at least as many fractional bits as the edge
    static_assert(RastBitsT::value + ConservativeBitsT::value >= EdgeBitsT::value,
                  "Inadequate precision of result of manh calculation ");

    return ((RastBitsT::value + ConservativeBitsT::value) - EdgeBitsT::value);
}
309
310 //////////////////////////////////////////////////////////////////////////
311 /// @struct adjustEdgeConservative
312 /// @brief Primary template definition used for partially specializing
313 /// the adjustEdgeConservative function. This struct should never
314 /// be instantiated.
315 /// @tparam RT: rasterizer traits
316 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
317 template <typename RT, typename ConservativeEdgeOffsetT>
318 struct adjustEdgeConservative
319 {
320 //////////////////////////////////////////////////////////////////////////
321 /// @brief Performs calculations to adjust each edge of a triangle away
322 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
323 /// direction.
324 ///
325 /// Uncertainty regions arise from fixed point rounding, which
326 /// can snap a vertex +/- by min fixed point value.
327 /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
328 /// This allows the rasterizer to test for coverage only at the pixel center,
329 /// instead of having to test individual pixel corners for conservative coverage
adjustEdgeConservativeadjustEdgeConservative330 INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
331 {
332 // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge
333 // away from the pixel center (in the direction of the edge normal A/B)
334
335 // edge = Ax + Bx + C - (manh/e)
336 // manh = manhattan distance = abs(A) + abs(B)
337 // e = absolute rounding error from snapping from float to fixed point precision
338
339 // 'fixed point' multiply (in double to be avx1 friendly)
340 // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
341 __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)),
342 vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
343 __m256d manh =
344 _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
345 _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
346
347 static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >=
348 RT::EdgePrecisionT::BitsT::value,
349 "Inadequate precision of result of manh calculation ");
350
351 // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the
352 // same precision since we're doing fixed math in double format, multiply by multiples of
353 // 1/2 instead of a bit shift right
354 manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
355
356 // move the edge away from the pixel center by the required conservative precision + 1/2
357 // pixel this allows the rasterizer to do a single conservative coverage test to see if the
358 // primitive intersects the pixel at all
359 vEdge = _mm256_sub_pd(vEdge, manh);
360 };
361 };
362
363 //////////////////////////////////////////////////////////////////////////
364 /// @brief adjustEdgeConservative specialization where no edge offset is needed
365 template <typename RT>
366 struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
367 {
368 INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge){};
369 };
370
371 //////////////////////////////////////////////////////////////////////////
372 /// @brief calculates the distance a degenerate BBox needs to be adjusted
373 /// for conservative rast based on compile time trait values
374 template <typename RT>
375 constexpr int64_t ConservativeScissorOffset()
376 {
377 static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0,
378 "Rasterizer precision > conservative precision");
379 // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox
380 // when calculating scissor edges
381 typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1>
382 DegenerateEdgeOffsetT;
383 // 1/2 pixel edge offset + conservative offset - degenerateTriangle
384 return RT::ConservativeEdgeOffsetT::value -
385 (DegenerateEdgeOffsetT::value
386 << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
387 }
388
389 //////////////////////////////////////////////////////////////////////////
390 /// @brief Performs calculations to adjust each a vector of evaluated edges out
391 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
392 /// direction.
393 template <typename RT>
394 INLINE void adjustScissorEdge(const double a, const double b, __m256d& vEdge)
395 {
396 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
397 int64_t manh =
398 ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >>
399 ManhToEdgePrecisionAdjust<RT>();
400 vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
401 };
402
403 //////////////////////////////////////////////////////////////////////////
404 /// @brief Performs calculations to adjust each a scalar evaluated edge out
405 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
406 /// direction.
407 template <typename RT, typename OffsetT>
408 INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
409 {
410 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
411 int64_t manh =
412 ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
413 return (Edge - manh);
414 };
415
416 //////////////////////////////////////////////////////////////////////////
417 /// @brief Perform any needed adjustments to evaluated triangle edges
418 template <typename RT, typename EdgeOffsetT>
419 struct adjustEdgesFix16
420 {
421 INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
422 {
423 static_assert(
424 std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
425 "Edge equation expected to be in x.16 fixed point");
426
427 static_assert(RT::IsConservativeT::value,
428 "Edge offset assumes conservative rasterization is enabled");
429
430 // need to apply any edge offsets before applying the top-left rule
431 adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
432
433 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
434 }
435 };
436
437 //////////////////////////////////////////////////////////////////////////
438 /// @brief Perform top left adjustments to evaluated triangle edges
439 template <typename RT>
440 struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
441 {
442 INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
443 {
444 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
445 }
446 };
447
448 // max(abs(dz/dx), abs(dz,dy)
449 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
450 {
451 /*
452 // evaluate i,j at (0,0)
453 float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
454 float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
455
456 // evaluate i,j at (1,0)
457 float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
458 float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
459
460 // compute dz/dx
461 float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
462 float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
463 float dzdx = abs(d10 - d00);
464
465 // evaluate i,j at (0,1)
466 float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
467 float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
468
469 float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
470 float dzdy = abs(d01 - d00);
471 */
472
473 // optimized version of above
474 float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
475 float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
476
477 return std::max(dzdx, dzdy);
478 }
479
480 INLINE float
481 ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
482 {
483 if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
484 {
485 return (1.0f / (1 << 24));
486 }
487 else if (pState->depthFormat == R16_UNORM)
488 {
489 return (1.0f / (1 << 16));
490 }
491 else
492 {
493 SWR_ASSERT(pState->depthFormat == R32_FLOAT);
494
495 // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
496 float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
497 uint32_t zMaxInt = *(uint32_t*)&zMax;
498 zMaxInt &= 0x7f800000;
499 zMax = *(float*)&zMaxInt;
500
501 return zMax * (1.0f / (1 << 23));
502 }
503 }
504
505 INLINE float
506 ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
507 {
508 if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
509 {
510 return 0.0f;
511 }
512
513 float scale = pState->slopeScaledDepthBias;
514 if (scale != 0.0f)
515 {
516 scale *= ComputeMaxDepthSlope(pTri);
517 }
518
519 float bias = pState->depthBias;
520 if (!pState->depthBiasPreAdjusted)
521 {
522 bias *= ComputeBiasFactor(pState, pTri, z);
523 }
524 bias += scale;
525
526 if (pState->depthBiasClamp > 0.0f)
527 {
528 bias = std::min(bias, pState->depthBiasClamp);
529 }
530 else if (pState->depthBiasClamp < 0.0f)
531 {
532 bias = std::max(bias, pState->depthBiasClamp);
533 }
534
535 return bias;
536 }
537
538 // Prevent DCE by writing coverage mask from rasterizer to volatile
539 #if KNOB_ENABLE_TOSS_POINTS
540 __declspec(thread) volatile uint64_t gToss;
541 #endif
542
543 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
544 // try to avoid _chkstk insertions; make this thread local
545 static THREAD
546 OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
547
548 INLINE
549 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
550 {
551 edge.a = a;
552 edge.b = b;
553
554 // compute constant steps to adjacent quads
555 edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
556 edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
557
558 // compute constant steps to adjacent raster tiles
559 edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
560 edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
561
562 // compute quad offsets
563 const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
564 const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
565
566 __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
567 __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
568 edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
569
570 // compute raster tile offsets
571 const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd(
572 (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0);
573 const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd(
574 (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, 0, 0);
575
576 __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
577 __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
578 edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
579 }
580
581 INLINE
582 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
583 {
584 ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
585 }
586
587 //////////////////////////////////////////////////////////////////////////
588 /// @brief Primary template definition used for partially specializing
589 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
590 /// corner to sample position, and test for coverage
591 /// @tparam sampleCount: multisample count
592 template <typename NumSamplesT>
593 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3],
594 const __m256d* vEdgeFix16,
595 int32_t& mask0,
596 int32_t& mask1,
597 int32_t& mask2)
598 {
599 __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
600 // evaluate edge equations at the tile multisample bounding box
601 vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
602 vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
603 vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
604 mask0 = _mm256_movemask_pd(vSampleBboxTest0);
605 mask1 = _mm256_movemask_pd(vSampleBboxTest1);
606 mask2 = _mm256_movemask_pd(vSampleBboxTest2);
607 }
608
609 //////////////////////////////////////////////////////////////////////////
610 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
611 /// when only rasterizing a single coverage test point
612 template <>
613 INLINE void UpdateEdgeMasks<SingleSampleT>(
614 const __m256d (&)[3], const __m256d* vEdgeFix16, int32_t& mask0, int32_t& mask1, int32_t& mask2)
615 {
616 mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
617 mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
618 mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
619 }
620
621 //////////////////////////////////////////////////////////////////////////
622 /// @struct ComputeScissorEdges
623 /// @brief Primary template definition. Allows the function to be generically
624 /// called. When paired with below specializations, will result in an empty
625 /// inlined function if scissor is not enabled
626 /// @tparam RasterScissorEdgesT: is scissor enabled?
627 /// @tparam IsConservativeT: is conservative rast enabled?
628 /// @tparam RT: rasterizer traits
629 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
630 struct ComputeScissorEdges
631 {
632 INLINE ComputeScissorEdges(const SWR_RECT& triBBox,
633 const SWR_RECT& scissorBBox,
634 const int32_t x,
635 const int32_t y,
636 EDGE (&rastEdges)[RT::NumEdgesT::value],
637 __m256d (&vEdgeFix16)[7]){};
638 };
639
640 //////////////////////////////////////////////////////////////////////////
641 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
642 /// specialization. Instantiated when conservative rast and scissor are enabled
643 template <typename RT>
644 struct ComputeScissorEdges<std::true_type, std::true_type, RT>
645 {
646 //////////////////////////////////////////////////////////////////////////
647 /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
648 /// evaluate edge equations and offset them away from pixel center.
649 INLINE ComputeScissorEdges(const SWR_RECT& triBBox,
650 const SWR_RECT& scissorBBox,
651 const int32_t x,
652 const int32_t y,
653 EDGE (&rastEdges)[RT::NumEdgesT::value],
654 __m256d (&vEdgeFix16)[7])
655 {
656 // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
657 SWR_RECT scissor;
658 scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
659 scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
660 scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
661 scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
662
663 POS topLeft{scissor.xmin, scissor.ymin};
664 POS bottomLeft{scissor.xmin, scissor.ymax};
665 POS topRight{scissor.xmax, scissor.ymin};
666 POS bottomRight{scissor.xmax, scissor.ymax};
667
668 // construct 4 scissor edges in ccw direction
669 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
670 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
671 ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
672 ComputeEdgeData(topRight, topLeft, rastEdges[6]);
673
674 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) +
675 (rastEdges[3].b * (y - scissor.ymin)));
676 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) +
677 (rastEdges[4].b * (y - scissor.ymax)));
678 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) +
679 (rastEdges[5].b * (y - scissor.ymax)));
680 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) +
681 (rastEdges[6].b * (y - scissor.ymin)));
682
683 // if conservative rasterizing, need to bump the scissor edges out by the conservative
684 // uncertainty distance, else do nothing
685 adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
686 adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
687 adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
688 adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
689
690 // Upper left rule for scissor
691 vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
692 vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
693 }
694 };
695
696 //////////////////////////////////////////////////////////////////////////
697 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
698 /// specialization. Instantiated when scissor is enabled and conservative rast
699 /// is disabled.
700 template <typename RT>
701 struct ComputeScissorEdges<std::true_type, std::false_type, RT>
702 {
703 //////////////////////////////////////////////////////////////////////////
704 /// @brief Compute scissor edge vectors and evaluate edge equations
705 INLINE ComputeScissorEdges(const SWR_RECT&,
706 const SWR_RECT& scissorBBox,
707 const int32_t x,
708 const int32_t y,
709 EDGE (&rastEdges)[RT::NumEdgesT::value],
710 __m256d (&vEdgeFix16)[7])
711 {
712 const SWR_RECT& scissor = scissorBBox;
713 POS topLeft{scissor.xmin, scissor.ymin};
714 POS bottomLeft{scissor.xmin, scissor.ymax};
715 POS topRight{scissor.xmax, scissor.ymin};
716 POS bottomRight{scissor.xmax, scissor.ymax};
717
718 // construct 4 scissor edges in ccw direction
719 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
720 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
721 ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
722 ComputeEdgeData(topRight, topLeft, rastEdges[6]);
723
724 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) +
725 (rastEdges[3].b * (y - scissor.ymin)));
726 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) +
727 (rastEdges[4].b * (y - scissor.ymax)));
728 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) +
729 (rastEdges[5].b * (y - scissor.ymax)));
730 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) +
731 (rastEdges[6].b * (y - scissor.ymin)));
732
733 // Upper left rule for scissor
734 vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
735 vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
736 }
737 };
738
739 //////////////////////////////////////////////////////////////////////////
740 /// @brief Primary function template for TrivialRejectTest. Should
741 /// never be called, but TemplateUnroller instantiates a few unused values,
742 /// so it calls a runtime assert instead of a static_assert.
743 template <typename ValidEdgeMaskT>
744 INLINE bool TrivialRejectTest(const int, const int, const int)
745 {
746 SWR_INVALID("Primary templated function should never be called");
747 return false;
748 };
749
750 //////////////////////////////////////////////////////////////////////////
751 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
752 /// and edge 1 for trivial coverage reject
753 template <>
754 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
755 {
756 return (!(mask0 && mask1)) ? true : false;
757 };
758
759 //////////////////////////////////////////////////////////////////////////
760 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
761 /// and edge 2 for trivial coverage reject
762 template <>
763 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
764 {
765 return (!(mask0 && mask2)) ? true : false;
766 };
767
768 //////////////////////////////////////////////////////////////////////////
769 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
770 /// and edge 2 for trivial coverage reject
771 template <>
772 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
773 {
774 return (!(mask1 && mask2)) ? true : false;
775 };
776
777 //////////////////////////////////////////////////////////////////////////
778 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
779 /// primitive edges for trivial coverage reject
780 template <>
781 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
782 {
783 return (!(mask0 && mask1 && mask2)) ? true : false;
784 ;
785 };
786
787 //////////////////////////////////////////////////////////////////////////
788 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
789 /// point, so return false and rasterize against conservative BBox
790 template <>
791 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
792 {
793 return false;
794 };
795
796 //////////////////////////////////////////////////////////////////////////
797 /// @brief Primary function template for TrivialAcceptTest. Always returns
798 /// false, since it will only be called for degenerate tris, and as such
799 /// will never cover the entire raster tile
800 template <typename ScissorEnableT>
801 INLINE bool TrivialAcceptTest(const int, const int, const int)
802 {
803 return false;
804 };
805
806 //////////////////////////////////////////////////////////////////////////
807 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
808 /// edge masks for a fully covered raster tile
809 template <>
810 INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
811 {
812 return ((mask0 & mask1 & mask2) == 0xf);
813 };
814
815 //////////////////////////////////////////////////////////////////////////
816 /// @brief Primary function template for GenerateSVInnerCoverage. Results
817 /// in an empty function call if SVInnerCoverage isn't requested
818 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
819 struct GenerateSVInnerCoverage
820 {
821 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t&){};
822 };
823
824 //////////////////////////////////////////////////////////////////////////
825 /// @brief Specialization of GenerateSVInnerCoverage where all edges
826 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
827 /// edge values from OuterConservative to InnerConservative and rasterizes.
828 template <typename RT>
829 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
830 {
831 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC,
832 uint32_t workerId,
833 EDGE* pRastEdges,
834 double* pStartQuadEdges,
835 uint64_t& innerCoverageMask)
836 {
837 double startQuadEdgesAdj[RT::NumEdgesT::value];
838 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
839 {
840 startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(
841 pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
842 }
843
844 // not trivial accept or reject, must rasterize full tile
845 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizePartial, pDC->drawId);
846 innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
847 pDC, startQuadEdgesAdj, pRastEdges);
848 RDTSC_END(pDC->pContext->pBucketMgr, BERasterizePartial, 0);
849 }
850 };
851
852 //////////////////////////////////////////////////////////////////////////
853 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
854 /// in an empty function call if SVInnerCoverage isn't requested
855 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
856 struct UpdateEdgeMasksInnerConservative
857 {
858 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
859 const __m256d*,
860 const __m128i,
861 const __m128i,
862 int32_t&,
863 int32_t&,
864 int32_t&){};
865 };
866
867 //////////////////////////////////////////////////////////////////////////
868 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
869 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
870 /// evaluated at raster tile corners to inner conservative position and
871 /// updates edge masks
872 template <typename RT>
873 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
874 {
875 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
876 const __m256d* vEdgeFix16,
877 const __m128i vAi,
878 const __m128i vBi,
879 int32_t& mask0,
880 int32_t& mask1,
881 int32_t& mask2)
882 {
883 __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
884
885 // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
886 // conservative evaluated edge when adjusting the edge in for inner conservative tests
887 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
888 vAi, vBi, vTempEdge[0]);
889 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
890 vAi, vBi, vTempEdge[1]);
891 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
892 vAi, vBi, vTempEdge[2]);
893
894 UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(
895 vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
896 }
897 };
898
899 //////////////////////////////////////////////////////////////////////////
900 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
901 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
902 /// cover an entire raster tile, set mask0 to 0 to force it down the
903 /// rastierizePartialTile path
904 template <typename RT, typename ValidEdgeMaskT>
905 struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
906 {
907 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3],
908 const __m256d*,
909 const __m128i,
910 const __m128i,
911 int32_t& mask0,
912 int32_t&,
913 int32_t&)
914 {
915 // set one mask to zero to force the triangle down the rastierizePartialTile path
916 mask0 = 0;
917 }
918 };
919
920 template <typename RT>
921 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
922 {
923 const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
924 #if KNOB_ENABLE_TOSS_POINTS
925 if (KNOB_TOSS_BIN_TRIS)
926 {
927 return;
928 }
929 #endif
930 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeTriangle, pDC->drawId);
931 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BETriangleSetup, pDC->drawId);
932
933 const API_STATE& state = GetApiState(pDC);
934 const SWR_RASTSTATE& rastState = state.rastState;
935 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
936
937 OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
938 triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
939
940 __m128 vX, vY, vZ, vRecipW;
941
942 // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
943 // eg: vX = [x0 x1 x2 dc]
944 vX = _mm_load_ps(workDesc.pTriBuffer);
945 vY = _mm_load_ps(workDesc.pTriBuffer + 4);
946 vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
947 vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
948
949 // convert to fixed point
950 static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value,
951 "Rasterizer expects 16.8 fixed point precision");
952 __m128i vXi = fpToFixedPoint(vX);
953 __m128i vYi = fpToFixedPoint(vY);
954
955 // quantize floating point position to fixed point precision
956 // to prevent attribute creep around the triangle vertices
957 vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
958 vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
959
960 // triangle setup - A and B edge equation coefs
961 __m128 vA, vB;
962 triangleSetupAB(vX, vY, vA, vB);
963
964 __m128i vAi, vBi;
965 triangleSetupABInt(vXi, vYi, vAi, vBi);
966
967 // determinant
968 float det = calcDeterminantInt(vAi, vBi);
969
970 // Verts in Pixel Coordinate Space at this point
971 // Det > 0 = CW winding order
972 // Convert CW triangles to CCW
973 if (det > 0.0)
974 {
975 vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
976 vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
977 vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
978 vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
979 det = -det;
980 }
981
982 __m128 vC;
983 // Finish triangle setup - C edge coef
984 triangleSetupC(vX, vY, vA, vB, vC);
985
986 if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
987 {
988 // If we have degenerate edge(s) to rasterize, set I and J coefs
989 // to 0 for constant interpolation of attributes
990 triDesc.I[0] = 0.0f;
991 triDesc.I[1] = 0.0f;
992 triDesc.I[2] = 0.0f;
993 triDesc.J[0] = 0.0f;
994 triDesc.J[1] = 0.0f;
995 triDesc.J[2] = 0.0f;
996
997 // Degenerate triangles have no area
998 triDesc.recipDet = 0.0f;
999 }
1000 else
1001 {
1002 // only extract coefs for 2 of the barycentrics; the 3rd can be
1003 // determined from the barycentric equation:
1004 // i + j + k = 1 <=> k = 1 - j - i
1005 _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
1006 _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
1007 _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
1008 _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
1009 _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
1010 _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
1011
1012 // compute recipDet, used to calculate barycentric i and j in the backend
1013 triDesc.recipDet = 1.0f / det;
1014 }
1015
1016 OSALIGNSIMD(float) oneOverW[4];
1017 _mm_store_ps(oneOverW, vRecipW);
1018 triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
1019 triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
1020 triDesc.OneOverW[2] = oneOverW[2];
1021
1022 // calculate perspective correct coefs per vertex attrib
1023 float* pPerspAttribs = perspAttribsTLS;
1024 float* pAttribs = workDesc.pAttribs;
1025 triDesc.pPerspAttribs = pPerspAttribs;
1026 triDesc.pAttribs = pAttribs;
1027 float* pRecipW = workDesc.pTriBuffer + 12;
1028 triDesc.pRecipW = pRecipW;
1029 __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
1030 __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW += 1);
1031 __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW += 1);
1032 for (uint32_t i = 0; i < workDesc.numAttribs; i++)
1033 {
1034 __m128 attribA = _mm_load_ps(pAttribs);
1035 __m128 attribB = _mm_load_ps(pAttribs += 4);
1036 __m128 attribC = _mm_load_ps(pAttribs += 4);
1037 pAttribs += 4;
1038
1039 attribA = _mm_mul_ps(attribA, vOneOverWV0);
1040 attribB = _mm_mul_ps(attribB, vOneOverWV1);
1041 attribC = _mm_mul_ps(attribC, vOneOverWV2);
1042
1043 _mm_store_ps(pPerspAttribs, attribA);
1044 _mm_store_ps(pPerspAttribs += 4, attribB);
1045 _mm_store_ps(pPerspAttribs += 4, attribC);
1046 pPerspAttribs += 4;
1047 }
1048
1049 // compute bary Z
1050 // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
1051 OSALIGNSIMD(float) a[4];
1052 _mm_store_ps(a, vZ);
1053 triDesc.Z[0] = a[0] - a[2];
1054 triDesc.Z[1] = a[1] - a[2];
1055 triDesc.Z[2] = a[2];
1056
1057 // add depth bias
1058 triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
1059
1060 // Calc bounding box of triangle
1061 OSALIGNSIMD(SWR_RECT) bbox;
1062 calcBoundingBoxInt(vXi, vYi, bbox);
1063
1064 const SWR_RECT& scissorInFixedPoint =
1065 state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
1066
1067 if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
1068 {
1069 // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is
1070 // valid
1071 bbox.xmin--;
1072 bbox.xmax++;
1073 bbox.ymin--;
1074 bbox.ymax++;
1075 SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
1076 "Conservative rast degenerate handling requires a valid scissor rect");
1077 }
1078
1079 // Intersect with scissor/viewport
1080 OSALIGNSIMD(SWR_RECT) intersect;
1081 intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
1082 intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
1083 intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
1084 intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
1085
1086 triDesc.triFlags = workDesc.triFlags;
1087
1088 // further constrain backend to intersecting bounding box of macro tile and scissored triangle
1089 // bbox
1090 uint32_t macroX, macroY;
1091 MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
1092 int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
1093 int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
1094 int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
1095 int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
1096
1097 intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
1098 intersect.ymin = std::max(intersect.ymin, macroBoxTop);
1099 intersect.xmax = std::min(intersect.xmax, macroBoxRight);
1100 intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
1101
1102 SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax &&
1103 intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 &&
1104 intersect.ymax >= 0);
1105
1106 RDTSC_END(pDC->pContext->pBucketMgr, BETriangleSetup, 0);
1107
1108 // update triangle desc
1109 uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1110 uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1111 uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1112 uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1113 uint32_t numTilesX = maxTileX - minTileX + 1;
1114 uint32_t numTilesY = maxTileY - minTileY + 1;
1115
1116 if (numTilesX == 0 || numTilesY == 0)
1117 {
1118 RDTSC_EVENT(pDC->pContext->pBucketMgr, BEEmptyTriangle, 1, 0);
1119 RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeTriangle, 1);
1120 return;
1121 }
1122
1123 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEStepSetup, pDC->drawId);
1124
1125 // Step to pixel center of top-left pixel of the triangle bbox
1126 // Align intersect bbox (top/left) to raster tile's (top/left).
1127 int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
1128 int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
1129
1130 // convenience typedef
1131 typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
1132
1133 // single sample rasterization evaluates edges at pixel center,
1134 // multisample evaluates edges UL pixel corner and steps to each sample position
1135 if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
1136 {
1137 // Add 0.5, in fixed point, to offset to pixel center
1138 x += (FIXED_POINT_SCALE / 2);
1139 y += (FIXED_POINT_SCALE / 2);
1140 }
1141
1142 __m128i vTopLeftX = _mm_set1_epi32(x);
1143 __m128i vTopLeftY = _mm_set1_epi32(y);
1144
1145 // evaluate edge equations at top-left pixel using 64bit math
1146 //
1147 // line = Ax + By + C
1148 // solving for C:
1149 // C = -Ax - By
1150 // we know x0 and y0 are on the line; plug them in:
1151 // C = -Ax0 - By0
1152 // plug C back into line equation:
1153 // line = Ax - By - Ax0 - By0
1154 // line = A(x - x0) + B(y - y0)
1155 // dX = (x-x0), dY = (y-y0)
1156 // so all this simplifies to
1157 // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
1158
1159 __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
1160 __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
1161
1162 // evaluate A(dx) and B(dY) for all points
1163 __m256d vAipd = _mm256_cvtepi32_pd(vAi);
1164 __m256d vBipd = _mm256_cvtepi32_pd(vBi);
1165 __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
1166 __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
1167
1168 __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
1169 __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
1170 __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
1171
1172 // apply any edge adjustments(top-left, crast, etc)
1173 adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
1174
1175 // broadcast respective edge results to all lanes
1176 double* pEdge = (double*)&vEdge;
1177 __m256d vEdgeFix16[7];
1178 vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
1179 vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
1180 vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
1181
1182 OSALIGNSIMD(int32_t) aAi[4], aBi[4];
1183 _mm_store_si128((__m128i*)aAi, vAi);
1184 _mm_store_si128((__m128i*)aBi, vBi);
1185 EDGE rastEdges[RT::NumEdgesT::value];
1186
1187 // Compute and store triangle edge data
1188 ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
1189 ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
1190 ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
1191
1192 // Compute and store triangle edge data if scissor needs to rasterized
1193 ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>(
1194 bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
1195
1196 // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
1197 // used to for testing if entire raster tile is inside a triangle
1198 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1199 {
1200 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
1201 }
1202
1203 // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
1204 // step sample positions to the raster tile bbox of multisample points
1205 // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples)
1206 // | |
1207 // | |
1208 // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
1209 __m256d vEdgeTileBbox[3];
1210 if (NumCoverageSamplesT::value > 1)
1211 {
1212 const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions;
1213 const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
1214 const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
1215
1216 __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
1217 __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
1218
1219 // step edge equation tests from Tile
1220 // used to for testing if entire raster tile is inside a triangle
1221 for (uint32_t e = 0; e < 3; ++e)
1222 {
1223 __m256d vResultAxFix16 =
1224 _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
1225 __m256d vResultByFix16 =
1226 _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
1227 vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1228
1229 // adjust for msaa tile bbox edges outward for conservative rast, if enabled
1230 adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(
1231 vAi, vBi, vEdgeTileBbox[e]);
1232 }
1233 }
1234
1235 RDTSC_END(pDC->pContext->pBucketMgr, BEStepSetup, 0);
1236
1237 uint32_t tY = minTileY;
1238 uint32_t tX = minTileX;
1239 uint32_t maxY = maxTileY;
1240 uint32_t maxX = maxTileX;
1241
1242 RenderOutputBuffers renderBuffers, currentRenderBufferRow;
1243 GetRenderHotTiles<RT::MT::numSamples>(pDC,
1244 workerId,
1245 macroTile,
1246 minTileX,
1247 minTileY,
1248 renderBuffers,
1249 triDesc.triFlags.renderTargetArrayIndex);
1250 currentRenderBufferRow = renderBuffers;
1251
1252 // rasterize and generate coverage masks per sample
1253 for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
1254 {
1255 __m256d vStartOfRowEdge[RT::NumEdgesT::value];
1256 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1257 {
1258 vStartOfRowEdge[e] = vEdgeFix16[e];
1259 }
1260
1261 for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
1262 {
1263 triDesc.anyCoveredSamples = 0;
1264
1265 // is the corner of the edge outside of the raster tile? (vEdge < 0)
1266 int mask0, mask1, mask2;
1267 UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
1268
1269 for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
1270 {
1271 // trivial reject, at least one edge has all 4 corners of raster tile outside
1272 bool trivialReject =
1273 TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
1274
1275 if (!trivialReject)
1276 {
1277 // trivial accept mask
1278 triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
1279
1280 // Update the raster tile edge masks based on inner conservative edge offsets,
1281 // if enabled
1282 UpdateEdgeMasksInnerConservative<RT,
1283 typename RT::ValidEdgeMaskT,
1284 typename RT::InputCoverageT>(
1285 vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
1286
1287 // @todo Make this a bit smarter to allow use of trivial accept when:
1288 // 1) scissor/vp intersection rect is raster tile aligned
1289 // 2) raster tile is entirely within scissor/vp intersection rect
1290 if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
1291 {
1292 // trivial accept, all 4 corners of all 3 edges are negative
1293 // i.e. raster tile completely inside triangle
1294 triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
1295 if (std::is_same<typename RT::InputCoverageT,
1296 InnerConservativeCoverageT>::value)
1297 {
1298 triDesc.innerCoverageMask = 0xffffffffffffffffULL;
1299 }
1300 RDTSC_EVENT(pDC->pContext->pBucketMgr, BETrivialAccept, 1, 0);
1301 }
1302 else
1303 {
1304 __m256d vEdgeAtSample[RT::NumEdgesT::value];
1305 if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
1306 {
1307 // should get optimized out for single sample case (global value
1308 // numbering or copy propagation)
1309 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1310 {
1311 vEdgeAtSample[e] = vEdgeFix16[e];
1312 }
1313 }
1314 else
1315 {
1316 const SWR_MULTISAMPLE_POS& samplePos = rastState.samplePositions;
1317 __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
1318 __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
1319 __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
1320 __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
1321
1322 // step edge equation tests from UL tile corner to pixel sample position
1323 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1324 {
1325 __m256d vResultAxFix16 =
1326 _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
1327 __m256d vResultByFix16 =
1328 _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
1329 vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1330 vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
1331 }
1332 }
1333
1334 double startQuadEdges[RT::NumEdgesT::value];
1335 const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
1336 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1337 {
1338 _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
1339 }
1340
1341 // not trivial accept or reject, must rasterize full tile
1342 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizePartial, pDC->drawId);
1343 triDesc.coverageMask[sampleNum] =
1344 rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
1345 pDC, startQuadEdges, rastEdges);
1346 RDTSC_END(pDC->pContext->pBucketMgr, BERasterizePartial, 0);
1347
1348 triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
1349
1350 // Output SV InnerCoverage, if needed
1351 GenerateSVInnerCoverage<RT,
1352 typename RT::ValidEdgeMaskT,
1353 typename RT::InputCoverageT>(
1354 pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
1355 }
1356 }
1357 else
1358 {
1359 // if we're calculating coverage per sample, need to store it off. otherwise no
1360 // covered samples, don't need to do anything
1361 if (NumCoverageSamplesT::value > 1)
1362 {
1363 triDesc.coverageMask[sampleNum] = 0;
1364 }
1365 RDTSC_EVENT(pDC->pContext->pBucketMgr, BETrivialReject, 1, 0);
1366 }
1367 }
1368
1369 #if KNOB_ENABLE_TOSS_POINTS
1370 if (KNOB_TOSS_RS)
1371 {
1372 gToss = triDesc.coverageMask[0];
1373 }
1374 else
1375 #endif
1376 if (triDesc.anyCoveredSamples)
1377 {
1378 // if conservative rast and MSAA are enabled, conservative coverage for a pixel
1379 // means all samples in that pixel are covered copy conservative coverage result to
1380 // all samples
1381 if (RT::IsConservativeT::value)
1382 {
1383 auto copyCoverage = [&](int sample) {
1384 triDesc.coverageMask[sample] = triDesc.coverageMask[0];
1385 };
1386 UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
1387 }
1388
1389 // Track rasterized subspans
1390 AR_EVENT(RasterTileCount(pDC->drawId, 1));
1391
1392 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelBackend, pDC->drawId);
1393 backendFuncs.pfnBackend(pDC,
1394 workerId,
1395 tileX << KNOB_TILE_X_DIM_SHIFT,
1396 tileY << KNOB_TILE_Y_DIM_SHIFT,
1397 triDesc,
1398 renderBuffers);
1399 RDTSC_END(pDC->pContext->pBucketMgr, BEPixelBackend, 0);
1400 }
1401
1402 // step to the next tile in X
1403 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1404 {
1405 vEdgeFix16[e] =
1406 _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
1407 }
1408 StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
1409 }
1410
1411 // step to the next tile in Y
1412 for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1413 {
1414 vEdgeFix16[e] =
1415 _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
1416 }
1417 StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
1418 }
1419
1420 RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeTriangle, 1);
1421 }
1422
1423 // Get pointers to hot tile memory for color RT, depth, stencil
1424 template <uint32_t numSamples>
1425 void GetRenderHotTiles(DRAW_CONTEXT* pDC,
1426 uint32_t workerId,
1427 uint32_t macroID,
1428 uint32_t tileX,
1429 uint32_t tileY,
1430 RenderOutputBuffers& renderBuffers,
1431 uint32_t renderTargetArrayIndex)
1432 {
1433 const API_STATE& state = GetApiState(pDC);
1434 SWR_CONTEXT* pContext = pDC->pContext;
1435 HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
1436
1437 uint32_t mx, my;
1438 MacroTileMgr::getTileIndices(macroID, mx, my);
1439 tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
1440 tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
1441
1442 // compute tile offset for active hottile buffers
1443 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
1444 uint32_t offset = ComputeTileOffset2D<
1445 TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp>>(
1446 pitch, tileX, tileY);
1447 offset *= numSamples;
1448
1449 unsigned long rtSlot = 0;
1450 uint32_t colorHottileEnableMask = state.colorHottileEnable;
1451 while (_BitScanForward(&rtSlot, colorHottileEnableMask))
1452 {
1453 HOTTILE* pColor = pContext->pHotTileMgr->GetHotTile(
1454 pContext,
1455 pDC,
1456 hWorkerPrivateData,
1457 macroID,
1458 (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
1459 true,
1460 numSamples,
1461 renderTargetArrayIndex);
1462 renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
1463 renderBuffers.pColorHotTile[rtSlot] = pColor;
1464
1465 colorHottileEnableMask &= ~(1 << rtSlot);
1466 }
1467 if (state.depthHottileEnable)
1468 {
1469 const uint32_t pitch =
1470 KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
1471 uint32_t offset = ComputeTileOffset2D<
1472 TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp>>(
1473 pitch, tileX, tileY);
1474 offset *= numSamples;
1475 HOTTILE* pDepth = pContext->pHotTileMgr->GetHotTile(pContext,
1476 pDC,
1477 hWorkerPrivateData,
1478 macroID,
1479 SWR_ATTACHMENT_DEPTH,
1480 true,
1481 numSamples,
1482 renderTargetArrayIndex);
1483 pDepth->state = HOTTILE_DIRTY;
1484 SWR_ASSERT(pDepth->pBuffer != nullptr);
1485 renderBuffers.pDepth = pDepth->pBuffer + offset;
1486 renderBuffers.pDepthHotTile = pDepth;
1487 }
1488 if (state.stencilHottileEnable)
1489 {
1490 const uint32_t pitch =
1491 KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
1492 uint32_t offset = ComputeTileOffset2D<
1493 TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp>>(
1494 pitch, tileX, tileY);
1495 offset *= numSamples;
1496 HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext,
1497 pDC,
1498 hWorkerPrivateData,
1499 macroID,
1500 SWR_ATTACHMENT_STENCIL,
1501 true,
1502 numSamples,
1503 renderTargetArrayIndex);
1504 pStencil->state = HOTTILE_DIRTY;
1505 SWR_ASSERT(pStencil->pBuffer != nullptr);
1506 renderBuffers.pStencil = pStencil->pBuffer + offset;
1507 renderBuffers.pStencilHotTile = pStencil;
1508 }
1509 }
1510
1511 template <typename RT>
1512 INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers)
1513 {
1514 unsigned long rt = 0;
1515 while (_BitScanForward(&rt, colorHotTileMask))
1516 {
1517 colorHotTileMask &= ~(1 << rt);
1518 buffers.pColor[rt] += RT::colorRasterTileStep;
1519 }
1520
1521 buffers.pDepth += RT::depthRasterTileStep;
1522 buffers.pStencil += RT::stencilRasterTileStep;
1523 }
1524
1525 template <typename RT>
1526 INLINE void StepRasterTileY(uint32_t colorHotTileMask,
1527 RenderOutputBuffers& buffers,
1528 RenderOutputBuffers& startBufferRow)
1529 {
1530 unsigned long rt = 0;
1531 while (_BitScanForward(&rt, colorHotTileMask))
1532 {
1533 colorHotTileMask &= ~(1 << rt);
1534 startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
1535 buffers.pColor[rt] = startBufferRow.pColor[rt];
1536 }
1537 startBufferRow.pDepth += RT::depthRasterTileRowStep;
1538 buffers.pDepth = startBufferRow.pDepth;
1539
1540 startBufferRow.pStencil += RT::stencilRasterTileRowStep;
1541 buffers.pStencil = startBufferRow.pStencil;
1542 }
1543