• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file rasterizer.cpp
24 *
25 * @brief Implementation for the rasterizer.
26 *
27 ******************************************************************************/
28 
29 #include <vector>
30 #include <algorithm>
31 
32 #include "rasterizer.h"
33 #include "rdtsc_core.h"
34 #include "backend.h"
35 #include "utils.h"
36 #include "frontend.h"
37 #include "tilemgr.h"
38 #include "memory/tilingtraits.h"
39 
40 template <uint32_t numSamples = 1>
41 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
42 template <typename RT>
43 void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers);
44 template <typename RT>
45 void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
46 
// Expands four 0/1 flags, written most-significant first, into a 4-element
// initializer whose element 0 comes from i0. A set flag becomes -1 and a clear
// flag becomes 0, so a set flag yields a value with its sign bit set.
#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}

// Lookup table indexed by a 4-bit lane mask. Entry N has the sign bit set in
// exactly the lanes whose bit is set in N, which makes it usable as the
// selector operand of _mm256_blendv_pd.
const __m256d gMaskToVecpd[] =
{
    MASKTOVEC(0, 0, 0, 0),  // 0x0
    MASKTOVEC(0, 0, 0, 1),  // 0x1
    MASKTOVEC(0, 0, 1, 0),  // 0x2
    MASKTOVEC(0, 0, 1, 1),  // 0x3
    MASKTOVEC(0, 1, 0, 0),  // 0x4
    MASKTOVEC(0, 1, 0, 1),  // 0x5
    MASKTOVEC(0, 1, 1, 0),  // 0x6
    MASKTOVEC(0, 1, 1, 1),  // 0x7
    MASKTOVEC(1, 0, 0, 0),  // 0x8
    MASKTOVEC(1, 0, 0, 1),  // 0x9
    MASKTOVEC(1, 0, 1, 0),  // 0xA
    MASKTOVEC(1, 0, 1, 1),  // 0xB
    MASKTOVEC(1, 1, 0, 0),  // 0xC
    MASKTOVEC(1, 1, 0, 1),  // 0xD
    MASKTOVEC(1, 1, 1, 0),  // 0xE
    MASKTOVEC(1, 1, 1, 1),  // 0xF
};
67 
/// @brief Integer 2D position (x, y).
struct POS
{
    int32_t x;
    int32_t y;
};
72 
/// @brief Per-edge data consumed while sweeping an edge equation over a tile:
/// the A/B coefficients, constant steps between quads and raster tiles, and
/// precomputed per-lane offsets.
struct EDGE
{
    double a;                   // edge coefficient A, in fix8
    double b;                   // edge coefficient B, in fix8
    double stepQuadX;           // step to the adjacent horizontal quad, in fix16
    double stepQuadY;           // step to the adjacent vertical quad, in fix16
    double stepRasterTileX;     // step to the adjacent horizontal raster tile, in fix16
    double stepRasterTileY;     // step to the adjacent vertical raster tile, in fix16

    __m256d vQuadOffsets;       // offsets for the 4 samples of a quad
    __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
};
84 
85 //////////////////////////////////////////////////////////////////////////
86 /// @brief rasterize a raster tile partially covered by the triangle
87 /// @param pDC - draw context; not referenced by the visible body
88 /// @param startEdges - one starting edge value per edge; each is broadcast and offset by
89 ///        EDGE::vQuadOffsets to reach the 4 sample positions of the tile's first 2x2 quad
90 /// @param pRastEdges - per-edge EDGE data; vQuadOffsets, stepQuadX and stepQuadY are read here
/// @tparam NumEdges - number of entries of startEdges / pRastEdges that are set up
/// @tparam EdgeMaskT - edge-valid mask type; only consulted by the unrolled 8x8 path
/// @return coverage mask assembled from one 4-bit mask per 2x2 quad; a bit is set when the
///         sign bit of every evaluated edge is set for that sample
91 template<uint32_t NumEdges, typename EdgeMaskT>
rasterizePartialTile(DRAW_CONTEXT * pDC,double startEdges[NumEdges],EDGE * pRastEdges)92 INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
93 {
94     uint64_t coverageMask = 0;
95 
96     __m256d vEdges[NumEdges];
97     __m256d vStepX[NumEdges];
98     __m256d vStepY[NumEdges];
99 
100     for (uint32_t e = 0; e < NumEdges; ++e)
101     {
102         // Step to the pixel sample locations of the 1st quad
103         vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
104 
105         // broadcast the precomputed per-quad steps in x and y
106         vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
107         vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
108     }
109 
110     // fast unrolled version for 8x8 tile
111 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
112     int edgeMask[NumEdges];
113     uint64_t mask;
114 
    // per-edge operations; UnrollerLMask is handed EdgeMaskT::value and decides
    // for which edge indices each lambda is invoked
115     auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
116     auto update_lambda = [&](int e){mask &= edgeMask[e];};
117     auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
118     auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
119     auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
120 
    // NOTE(review): the helper macros below are never #undef'ed in the visible code
121 // evaluate which pixels in the quad are covered
122 #define EVAL \
123             UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
124 
125     // update coverage mask
126     // if edge 0 is degenerate it is skipped, so seed the mask with all 4 samples set
    // instead of reading edgeMask[0]
127 #define UPDATE_MASK(bit) \
128             if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
129                 mask = 0xf;\
130             }\
131             else{\
132                 mask = edgeMask[0]; \
133             }\
134             UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
135             coverageMask |= (mask << bit);
136 
137     // step in the +x direction to the next quad
138 #define INCX \
139             UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
140 
141     // step in the +y direction to the next quad
142 #define INCY \
143             UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
144 
145     // step in the -x direction to the next quad
146 #define DECX \
147             UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
148 
149     // sweep 2x2 quad back and forth through the raster tile,
150     // computing coverage masks for the entire tile
151 
152     // raster tile
153     // 0  1  2  3  4  5  6  7
154     // x  x
155     // x  x ------------------>
156     //                   x  x  |
157     // <-----------------x  x  V
158     // ..
159 
    // The quad at (column c, row r) is written to bit 4 * (4 * r + c). Odd rows are
    // swept right to left, which is why their bit offsets descend.
160     // row 0
161     EVAL;
162     UPDATE_MASK(0);
163     INCX;
164     EVAL;
165     UPDATE_MASK(4);
166     INCX;
167     EVAL;
168     UPDATE_MASK(8);
169     INCX;
170     EVAL;
171     UPDATE_MASK(12);
172     INCY;
173 
174     //row 1
175     EVAL;
176     UPDATE_MASK(28);
177     DECX;
178     EVAL;
179     UPDATE_MASK(24);
180     DECX;
181     EVAL;
182     UPDATE_MASK(20);
183     DECX;
184     EVAL;
185     UPDATE_MASK(16);
186     INCY;
187 
188     // row 2
189     EVAL;
190     UPDATE_MASK(32);
191     INCX;
192     EVAL;
193     UPDATE_MASK(36);
194     INCX;
195     EVAL;
196     UPDATE_MASK(40);
197     INCX;
198     EVAL;
199     UPDATE_MASK(44);
200     INCY;
201 
202     // row 3
203     EVAL;
204     UPDATE_MASK(60);
205     DECX;
206     EVAL;
207     UPDATE_MASK(56);
208     DECX;
209     EVAL;
210     UPDATE_MASK(52);
211     DECX;
212     EVAL;
213     UPDATE_MASK(48);
214 #else
    // generic path for other tile sizes: every row is swept left to right and the
    // quad masks are packed in visiting order, 4 bits per quad. All NumEdges edges
    // are evaluated here; EdgeMaskT is not consulted.
215     uint32_t bit = 0;
216     for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
217     {
        // remember the row start so the y step is applied from the left edge of the tile
218         __m256d vStartOfRowEdge[NumEdges];
219         for (uint32_t e = 0; e < NumEdges; ++e)
220         {
221             vStartOfRowEdge[e] = vEdges[e];
222         }
223 
224         for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
225         {
226             int edgeMask[NumEdges];
227             for (uint32_t e = 0; e < NumEdges; ++e)
228             {
229                 edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
230             }
231 
            // a sample is covered only if it passes every edge
232             uint64_t mask = edgeMask[0];
233             for (uint32_t e = 1; e < NumEdges; ++e)
234             {
235                 mask &= edgeMask[e];
236             }
237             coverageMask |= (mask << bit);
238 
239             // step to the next quad in x
240             for (uint32_t e = 0; e < NumEdges; ++e)
241             {
242                 vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
243             }
244             bit+=4;
245         }
246 
247         // step to the next row
248         for (uint32_t e = 0; e < NumEdges; ++e)
249         {
250             vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
251         }
252     }
253 #endif
254     return coverageMask;
255 
256 }
257 // Top left rule:
258 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
259 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
260 // Top left: a sample is in if it is a top or left edge.
261 // Out: !(horizontal && above) = !horizontal && below
262 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right
adjustTopLeftRuleIntFix16(const __m128i vA,const __m128i vB,__m256d & vEdge)263 INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
264 {
265     // if vA < 0, vC--
266     // if vA == 0 && vB < 0, vC--
267 
268     __m256d vEdgeOut = vEdge;
269     __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
270 
271     // if vA < 0 (line is not horizontal and below)
272     int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
273 
274     // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
275     __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
276     int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
277     msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
278 
279     // if either of these are true and we're on the line (edge == 0), bump it outside the line
280     vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
281 }
282 
//////////////////////////////////////////////////////////////////////////
/// @brief Number of fractional bits by which the manh product (vertex
/// precision bits + conservative precision bits) exceeds the edge precision.
/// Computed entirely from compile time trait values.
/// @tparam RT: rasterizer traits; must provide PrecisionT, ConservativePrecisionT
/// and EdgePrecisionT, each with a nested BitsT integral constant
/// @return (PrecisionT bits + ConservativePrecisionT bits) - EdgePrecisionT bits
template<typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    typedef typename RT::PrecisionT::BitsT             VertexBitsT;
    typedef typename RT::ConservativePrecisionT::BitsT ConservativeBitsT;
    typedef typename RT::EdgePrecisionT::BitsT         EdgeBitsT;

    // the product must carry at least as many fractional bits as the edge format
    static_assert(VertexBitsT::value + ConservativeBitsT::value >= EdgeBitsT::value,
                  "Inadequate precision of result of manh calculation ");

    return ((VertexBitsT::value + ConservativeBitsT::value) - EdgeBitsT::value);
}
293 
294 //////////////////////////////////////////////////////////////////////////
295 /// @struct adjustEdgeConservative
296 /// @brief Primary template definition used for partially specializing
297 /// the adjustEdgeConservative function. This struct should never
298 /// be instantiated.
299 /// @tparam RT: rasterizer traits
300 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
301 template <typename RT, typename ConservativeEdgeOffsetT>
302 struct adjustEdgeConservative
303 {
304     //////////////////////////////////////////////////////////////////////////
305     /// @brief Performs calculations to adjust each edge of a triangle away
306     /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
307     /// direction.
308     ///
309     /// Uncertainty regions arise from fixed point rounding, which
310     /// can snap a vertex +/- by min fixed point value.
311     /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
312     /// This allows the rasterizer to test for coverage only at the pixel center,
313     /// instead of having to test individual pixel corners for conservative coverage
adjustEdgeConservativeadjustEdgeConservative314     INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
315     {
316         // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
317         // from the pixel center (in the direction of the edge normal A/B)
318 
319         // edge = Ax + Bx + C - (manh/e)
320         // manh = manhattan distance = abs(A) + abs(B)
321         // e = absolute rounding error from snapping from float to fixed point precision
322 
323         // 'fixed point' multiply (in double to be avx1 friendly)
324         // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
325         __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
326         __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
327                                      _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
328 
329         static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
330                       "Inadequate precision of result of manh calculation ");
331 
332         // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
333         // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
334         manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
335 
336         // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
337         // this allows the rasterizer to do a single conservative coverage test to see if the primitive
338         // intersects the pixel at all
339         vEdge = _mm256_sub_pd(vEdge, manh);
340     };
341 };
342 
343 //////////////////////////////////////////////////////////////////////////
344 /// @brief adjustEdgeConservative specialization where no edge offset is needed
345 template <typename RT>
346 struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
347 {
adjustEdgeConservativeadjustEdgeConservative348     INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
349 };
350 
351 //////////////////////////////////////////////////////////////////////////
352 /// @brief calculates the distance a degenerate BBox needs to be adjusted
353 /// for conservative rast based on compile time trait values
354 template<typename RT>
ConservativeScissorOffset()355 constexpr int64_t ConservativeScissorOffset()
356 {
357     static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
358     // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
359     typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
360     // 1/2 pixel edge offset + conservative offset - degenerateTriangle
361     return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
362 }
363 
364 //////////////////////////////////////////////////////////////////////////
365 /// @brief Performs calculations to adjust each a vector of evaluated edges out
366 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
367 /// direction.
368 template <typename RT>
adjustScissorEdge(const double a,const double b,__m256d & vEdge)369 INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
370 {
371     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
372     int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
373     vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
374 };
375 
376 //////////////////////////////////////////////////////////////////////////
377 /// @brief Performs calculations to adjust each a scalar evaluated edge out
378 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
379 /// direction.
380 template <typename RT, typename OffsetT>
adjustScalarEdge(const double a,const double b,const double Edge)381 INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
382 {
383     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
384     int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
385     return (Edge - manh);
386 };
387 
388 //////////////////////////////////////////////////////////////////////////
389 /// @brief Perform any needed adjustments to evaluated triangle edges
390 template <typename RT, typename EdgeOffsetT>
391 struct adjustEdgesFix16
392 {
adjustEdgesFix16adjustEdgesFix16393     INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
394     {
395         static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
396                       "Edge equation expected to be in x.16 fixed point");
397 
398         static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
399 
400         // need to apply any edge offsets before applying the top-left rule
401         adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
402 
403         adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
404     }
405 };
406 
407 //////////////////////////////////////////////////////////////////////////
408 /// @brief Perform top left adjustments to evaluated triangle edges
409 template <typename RT>
410 struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
411 {
adjustEdgesFix16adjustEdgesFix16412     INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
413     {
414         adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
415     }
416 };
417 
418 // max(abs(dz/dx), abs(dz,dy)
ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC * pDesc)419 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
420 {
421     /*
422     // evaluate i,j at (0,0)
423     float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
424     float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
425 
426     // evaluate i,j at (1,0)
427     float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
428     float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
429 
430     // compute dz/dx
431     float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
432     float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
433     float dzdx = abs(d10 - d00);
434 
435     // evaluate i,j at (0,1)
436     float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
437     float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
438 
439     float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
440     float dzdy = abs(d01 - d00);
441     */
442 
443     // optimized version of above
444     float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
445     float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
446 
447     return std::max(dzdx, dzdy);
448 }
449 
ComputeBiasFactor(const SWR_RASTSTATE * pState,const SWR_TRIANGLE_DESC * pDesc,const float * z)450 INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
451 {
452     if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
453     {
454         return (1.0f / (1 << 24));
455     }
456     else if (pState->depthFormat == R16_UNORM)
457     {
458         return (1.0f / (1 << 16));
459     }
460     else
461     {
462         SWR_ASSERT(pState->depthFormat == R32_FLOAT);
463 
464         // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
465         float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
466         uint32_t zMaxInt = *(uint32_t*)&zMax;
467         zMaxInt &= 0x7f800000;
468         zMax = *(float*)&zMaxInt;
469 
470         return zMax * (1.0f / (1 << 23));
471     }
472 }
473 
ComputeDepthBias(const SWR_RASTSTATE * pState,const SWR_TRIANGLE_DESC * pTri,const float * z)474 INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
475 {
476     if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
477     {
478         return 0.0f;
479     }
480 
481     float scale = pState->slopeScaledDepthBias;
482     if (scale != 0.0f)
483     {
484         scale *= ComputeMaxDepthSlope(pTri);
485     }
486 
487     float bias = pState->depthBias;
488     if (!pState->depthBiasPreAdjusted)
489     {
490         bias *= ComputeBiasFactor(pState, pTri, z);
491     }
492     bias += scale;
493 
494     if (pState->depthBiasClamp > 0.0f)
495     {
496         bias = std::min(bias, pState->depthBiasClamp);
497     }
498     else if (pState->depthBiasClamp < 0.0f)
499     {
500         bias = std::max(bias, pState->depthBiasClamp);
501     }
502 
503     return bias;
504 }
505 
506 // Prevent DCE by writing coverage mask from rasterizer to volatile
507 #if KNOB_ENABLE_TOSS_POINTS
508 __declspec(thread) volatile uint64_t gToss;
509 #endif
510 
511 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
512 // try to avoid _chkstk insertions; make this thread local
513 static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib];
514 
515 INLINE
ComputeEdgeData(int32_t a,int32_t b,EDGE & edge)516 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
517 {
518     edge.a = a;
519     edge.b = b;
520 
521     // compute constant steps to adjacent quads
522     edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
523     edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
524 
525     // compute constant steps to adjacent raster tiles
526     edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
527     edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
528 
529     // compute quad offsets
530     const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
531     const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
532 
533     __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
534     __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
535     edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
536 
537     // compute raster tile offsets
538     const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
539     const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
540 
541     __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
542     __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
543     edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
544 }
545 
546 INLINE
ComputeEdgeData(const POS & p0,const POS & p1,EDGE & edge)547 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
548 {
549     ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
550 }
551 
552 //////////////////////////////////////////////////////////////////////////
553 /// @brief Primary template definition used for partially specializing
554 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
555 /// corner to sample position, and test for coverage
556 /// @tparam sampleCount: multisample count
557 template <typename NumSamplesT>
UpdateEdgeMasks(const __m256d (& vEdgeTileBbox)[3],const __m256d * vEdgeFix16,int32_t & mask0,int32_t & mask1,int32_t & mask2)558 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
559                             int32_t &mask0, int32_t &mask1, int32_t &mask2)
560 {
561     __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
562     // evaluate edge equations at the tile multisample bounding box
563     vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
564     vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
565     vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
566     mask0 = _mm256_movemask_pd(vSampleBboxTest0);
567     mask1 = _mm256_movemask_pd(vSampleBboxTest1);
568     mask2 = _mm256_movemask_pd(vSampleBboxTest2);
569 }
570 
571 //////////////////////////////////////////////////////////////////////////
572 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
573 /// when only rasterizing a single coverage test point
574 template <>
UpdateEdgeMasks(const __m256d (&)[3],const __m256d * vEdgeFix16,int32_t & mask0,int32_t & mask1,int32_t & mask2)575 INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
576                                            int32_t &mask0, int32_t &mask1, int32_t &mask2)
577 {
578     mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
579     mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
580     mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
581 }
582 
583 //////////////////////////////////////////////////////////////////////////
584 /// @struct ComputeScissorEdges
585 /// @brief Primary template definition. Allows the function to be generically
586 /// called. When paired with below specializations, will result in an empty
587 /// inlined function if scissor is not enabled
588 /// @tparam RasterScissorEdgesT: is scissor enabled?
589 /// @tparam IsConservativeT: is conservative rast enabled?
590 /// @tparam RT: rasterizer traits
591 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
592 struct ComputeScissorEdges
593 {
ComputeScissorEdgesComputeScissorEdges594     INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
595                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
596 };
597 
598 //////////////////////////////////////////////////////////////////////////
599 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
600 /// specialization. Instantiated when conservative rast and scissor are enabled
601 template <typename RT>
602 struct ComputeScissorEdges<std::true_type, std::true_type, RT>
603 {
604     //////////////////////////////////////////////////////////////////////////
605     /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
606     /// evaluate edge equations and offset them away from pixel center.
ComputeScissorEdgesComputeScissorEdges607     INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
608                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
609     {
610         // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
611         SWR_RECT scissor;
612         scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
613         scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
614         scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
615         scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
616 
617         POS topLeft{scissor.xmin, scissor.ymin};
618         POS bottomLeft{scissor.xmin, scissor.ymax};
619         POS topRight{scissor.xmax, scissor.ymin};
620         POS bottomRight{scissor.xmax, scissor.ymax};
621 
622         // construct 4 scissor edges in ccw direction
623         ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
624         ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
625         ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
626         ComputeEdgeData(topRight, topLeft, rastEdges[6]);
627 
628         vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
629         vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
630         vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
631         vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
632 
633         // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
634         adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
635         adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
636         adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
637         adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
638 
639         // Upper left rule for scissor
640         vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
641         vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
642     }
643 };
644 
645 //////////////////////////////////////////////////////////////////////////
646 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
647 /// specialization. Instantiated when scissor is enabled and conservative rast
648 /// is disabled.
649 template <typename RT>
650 struct ComputeScissorEdges<std::true_type, std::false_type, RT>
651 {
652     //////////////////////////////////////////////////////////////////////////
653     /// @brief Compute scissor edge vectors and evaluate edge equations
ComputeScissorEdgesComputeScissorEdges654     INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
655                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
656     {
657         const SWR_RECT &scissor = scissorBBox;
658         POS topLeft{scissor.xmin, scissor.ymin};
659         POS bottomLeft{scissor.xmin, scissor.ymax};
660         POS topRight{scissor.xmax, scissor.ymin};
661         POS bottomRight{scissor.xmax, scissor.ymax};
662 
663         // construct 4 scissor edges in ccw direction
664         ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
665         ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
666         ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
667         ComputeEdgeData(topRight, topLeft, rastEdges[6]);
668 
669         vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
670         vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
671         vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
672         vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
673 
674         // Upper left rule for scissor
675         vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
676         vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
677     }
678 };
679 
680 //////////////////////////////////////////////////////////////////////////
681 /// @brief Primary function template for TrivialRejectTest. Should
682 /// never be called, but TemplateUnroller instantiates a few unused values,
683 /// so it calls a runtime assert instead of a static_assert.
684 template <typename ValidEdgeMaskT>
TrivialRejectTest(const int,const int,const int)685 INLINE bool TrivialRejectTest(const int, const int, const int)
686 {
687     SWR_ASSERT(0, "Primary templated function should never be called");
688     return false;
689 };
690 
691 //////////////////////////////////////////////////////////////////////////
692 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
693 /// and edge 1 for trivial coverage reject
694 template <>
TrivialRejectTest(const int mask0,const int mask1,const int)695 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
696 {
697     return (!(mask0 && mask1)) ? true : false;
698 };
699 
700 //////////////////////////////////////////////////////////////////////////
701 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
702 /// and edge 2 for trivial coverage reject
703 template <>
TrivialRejectTest(const int mask0,const int,const int mask2)704 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
705 {
706     return (!(mask0 && mask2)) ? true : false;
707 };
708 
709 //////////////////////////////////////////////////////////////////////////
710 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
711 /// and edge 2 for trivial coverage reject
712 template <>
TrivialRejectTest(const int,const int mask1,const int mask2)713 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
714 {
715     return (!(mask1 && mask2)) ? true : false;
716 };
717 
718 //////////////////////////////////////////////////////////////////////////
719 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
720 /// primitive edges for trivial coverage reject
721 template <>
TrivialRejectTest(const int mask0,const int mask1,const int mask2)722 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
723 {
724     return (!(mask0 && mask1 && mask2)) ? true : false;;
725 };
726 
727 //////////////////////////////////////////////////////////////////////////
728 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
729 /// point, so return false and rasterize against conservative BBox
730 template <>
TrivialRejectTest(const int,const int,const int)731 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
732 {
733     return false;
734 };
735 
736 //////////////////////////////////////////////////////////////////////////
737 /// @brief Primary function template for TrivialAcceptTest. Always returns
738 /// false, since it will only be called for degenerate tris, and as such
739 /// will never cover the entire raster tile
740 template <typename ScissorEnableT>
TrivialAcceptTest(const int,const int,const int)741 INLINE bool TrivialAcceptTest(const int, const int, const int)
742 {
743     return false;
744 };
745 
746 //////////////////////////////////////////////////////////////////////////
747 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
748 /// edge masks for a fully covered raster tile
749 template <>
TrivialAcceptTest(const int mask0,const int mask1,const int mask2)750 INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
751 {
752     return ((mask0 & mask1 & mask2) == 0xf);
753 };
754 
755 //////////////////////////////////////////////////////////////////////////
756 /// @brief Primary function template for GenerateSVInnerCoverage. Results
757 /// in an empty function call if SVInnerCoverage isn't requested
758 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
759 struct GenerateSVInnerCoverage
760 {
GenerateSVInnerCoverageGenerateSVInnerCoverage761     INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*,  uint64_t &){};
762 };
763 
764 //////////////////////////////////////////////////////////////////////////
765 /// @brief Specialization of GenerateSVInnerCoverage where all edges
766 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
767 /// edge values from OuterConservative to InnerConservative and rasterizes.
768 template <typename RT>
769 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
770 {
GenerateSVInnerCoverageGenerateSVInnerCoverage771     INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges,  uint64_t &innerCoverageMask)
772     {
773         SWR_CONTEXT *pContext = pDC->pContext;
774 
775         double startQuadEdgesAdj[RT::NumEdgesT::value];
776         for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
777         {
778             startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
779         }
780 
781         // not trivial accept or reject, must rasterize full tile
782         AR_BEGIN(BERasterizePartial, pDC->drawId);
783         innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
784         AR_END(BERasterizePartial, 0);
785     }
786 };
787 
788 //////////////////////////////////////////////////////////////////////////
789 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
790 /// in an empty function call if SVInnerCoverage isn't requested
791 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
792 struct UpdateEdgeMasksInnerConservative
793 {
UpdateEdgeMasksInnerConservativeUpdateEdgeMasksInnerConservative794     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
795                                            const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
796 };
797 
798 //////////////////////////////////////////////////////////////////////////
799 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
800 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
801 /// evaluated at raster tile corners to inner conservative position and
802 /// updates edge masks
803 template <typename RT>
804 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
805 {
UpdateEdgeMasksInnerConservativeUpdateEdgeMasksInnerConservative806     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
807                                            const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
808     {
809         __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
810 
811         // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
812         // conservative evaluated edge when adjusting the edge in for inner conservative tests
813         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
814         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
815         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
816 
817         UpdateEdgeMasks<typename RT::NumRasterSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
818     }
819 };
820 
821 //////////////////////////////////////////////////////////////////////////
822 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
823 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
824 /// cover an entire raster tile, set mask0 to 0 to force it down the
825 /// rastierizePartialTile path
826 template <typename RT, typename ValidEdgeMaskT>
827 struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
828 {
UpdateEdgeMasksInnerConservativeUpdateEdgeMasksInnerConservative829     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
830                                    const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
831     {
832         // set one mask to zero to force the triangle down the rastierizePartialTile path
833         mask0 = 0;
834     }
835 };
836 
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes one binned triangle against a single macrotile.
/// Performs triangle setup (fixed point positions, A/B/C edge coefficients,
/// barycentric I/J coefficients, 1/w, Z with depth bias, attributes
/// premultiplied by 1/w), intersects the triangle bbox with the scissor rect
/// and the macrotile, then walks the covered raster tiles: each tile is
/// trivially rejected, trivially accepted or partially rasterized per sample,
/// and the backend is invoked for tiles that have any covered samples.
/// @tparam RT - rasterizer traits; this function reads PrecisionT,
///              ValidEdgeMaskT, NumEdgesT, NumRasterSamplesT, IsConservativeT,
///              InputCoverageT, RasterizeScissorEdgesT, ConservativeEdgeOffsetT
///              and MT from it
/// @param pDC - draw context
/// @param workerId - worker id, forwarded to the backend and to
///                   GenerateSVInnerCoverage
/// @param macroTile - macrotile id, decoded with MacroTileMgr::getTileIndices
/// @param pDesc - pointer to the TRIANGLE_WORK_DESC describing the triangle
837 template <typename RT>
RasterizeTriangle(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pDesc)838 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
839 {
840     SWR_CONTEXT *pContext = pDC->pContext;
841     const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
        // toss point: when KNOB_TOSS_BIN_TRIS is set, return before doing any rasterization work
842 #if KNOB_ENABLE_TOSS_POINTS
843     if (KNOB_TOSS_BIN_TRIS)
844     {
845         return;
846     }
847 #endif
848     AR_BEGIN(BERasterizeTriangle, pDC->drawId);
849     AR_BEGIN(BETriangleSetup, pDC->drawId);
850 
851     const API_STATE &state = GetApiState(pDC);
852     const SWR_RASTSTATE &rastState = state.rastState;
853     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
854 
855     OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
856     triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
857 
858     __m128 vX, vY, vZ, vRecipW;
859 
860     // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
861     // eg: vX = [x0 x1 x2 dc]
862     vX = _mm_load_ps(workDesc.pTriBuffer);
863     vY = _mm_load_ps(workDesc.pTriBuffer + 4);
864     vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
865     vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
866 
867     // convert to fixed point
868     static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
869     __m128i vXi = fpToFixedPoint(vX);
870     __m128i vYi = fpToFixedPoint(vY);
871 
872     // quantize floating point position to fixed point precision
873     // to prevent attribute creep around the triangle vertices
874     vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
875     vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
876 
877     // triangle setup - A and B edge equation coefs
878     __m128 vA, vB;
879     triangleSetupAB(vX, vY, vA, vB);
880 
881     __m128i vAi, vBi;
882     triangleSetupABInt(vXi, vYi, vAi, vBi);
883 
884     // determinant
885     float det = calcDeterminantInt(vAi, vBi);
886 
887     // Verts in Pixel Coordinate Space at this point
888     // Det > 0 = CW winding order
889     // Convert CW triangles to CCW
890     if (det > 0.0)
891     {
892         vA  = _mm_mul_ps(vA, _mm_set1_ps(-1));
893         vB  = _mm_mul_ps(vB, _mm_set1_ps(-1));
894         vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
895         vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
896         det = -det;
897     }
898 
899     __m128 vC;
900     // Finish triangle setup - C edge coef
901     triangleSetupC(vX, vY, vA, vB, vC);
902 
903     if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
904     {
905         // If we have degenerate edge(s) to rasterize, set I and J coefs
906         // to 0 for constant interpolation of attributes
907         triDesc.I[0] = 0.0f;
908         triDesc.I[1] = 0.0f;
909         triDesc.I[2] = 0.0f;
910         triDesc.J[0] = 0.0f;
911         triDesc.J[1] = 0.0f;
912         triDesc.J[2] = 0.0f;
913 
914         // Degenerate triangles have no area
915         triDesc.recipDet = 0.0f;
916     }
917     else
918     {
919         // only extract coefs for 2 of the barycentrics; the 3rd can be
920         // determined from the barycentric equation:
921         // i + j + k = 1 <=> k = 1 - j - i
922         _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
923         _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
924         _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
925         _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
926         _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
927         _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
928 
929         // compute recipDet, used to calculate barycentric i and j in the backend
930         triDesc.recipDet = 1.0f/det;
931     }
932 
        // 1/w of verts 0 and 1 is stored relative to vert 2, the same form used for Z further down
933     OSALIGNSIMD(float) oneOverW[4];
934     _mm_store_ps(oneOverW, vRecipW);
935     triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
936     triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
937     triDesc.OneOverW[2] = oneOverW[2];
938 
939     // calculate perspective correct coefs per vertex attrib
940     float* pPerspAttribs = perspAttribsTLS;
941     float* pAttribs = workDesc.pAttribs;
942     triDesc.pPerspAttribs = pPerspAttribs;
943     triDesc.pAttribs = pAttribs;
944     float *pRecipW = workDesc.pTriBuffer + 12;
945     triDesc.pRecipW = pRecipW;
        // pRecipW is advanced in place below; triDesc.pRecipW keeps the address of vertex 0's 1/w
946     __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
947     __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
948     __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
        // each attribute is read as 3 consecutive 4-float vectors and scaled by the matching 1/w broadcast
949     for(uint32_t i = 0; i < workDesc.numAttribs; i++)
950     {
951         __m128 attribA = _mm_load_ps(pAttribs);
952         __m128 attribB = _mm_load_ps(pAttribs+=4);
953         __m128 attribC = _mm_load_ps(pAttribs+=4);
954         pAttribs+=4;
955 
956         attribA = _mm_mul_ps(attribA, vOneOverWV0);
957         attribB = _mm_mul_ps(attribB, vOneOverWV1);
958         attribC = _mm_mul_ps(attribC, vOneOverWV2);
959 
960         _mm_store_ps(pPerspAttribs, attribA);
961         _mm_store_ps(pPerspAttribs+=4, attribB);
962         _mm_store_ps(pPerspAttribs+=4, attribC);
963         pPerspAttribs+=4;
964     }
965 
966     // compute bary Z
967     // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
968     OSALIGNSIMD(float) a[4];
969     _mm_store_ps(a, vZ);
970     triDesc.Z[0] = a[0] - a[2];
971     triDesc.Z[1] = a[1] - a[2];
972     triDesc.Z[2] = a[2];
973 
974     // add depth bias
975     triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
976 
977     // Calc bounding box of triangle
978     OSALIGNSIMD(SWR_RECT) bbox;
979     calcBoundingBoxInt(vXi, vYi, bbox);
980 
981     const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
982 
983     if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
984     {
985         // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
986         bbox.xmin--;    bbox.xmax++;    bbox.ymin--;    bbox.ymax++;
987         SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
988                    "Conservative rast degenerate handling requires a valid scissor rect");
989     }
990 
991     // Intersect with scissor/viewport
992     OSALIGNSIMD(SWR_RECT) intersect;
993     intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
994     intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
995     intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
996     intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
997 
998     triDesc.triFlags = workDesc.triFlags;
999 
1000     // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
1001     uint32_t macroX, macroY;
1002     MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
1003     int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
1004     int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
1005     int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
1006     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
1007 
1008     intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
1009     intersect.ymin = std::max(intersect.ymin, macroBoxTop);
1010     intersect.xmax = std::min(intersect.xmax, macroBoxRight);
1011     intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
1012 
1013     SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
1014 
1015     AR_END(BETriangleSetup, 0);
1016 
1017     // update triangle desc
1018     uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1019     uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1020     uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1021     uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1022     uint32_t numTilesX = maxTileX - minTileX + 1;
1023     uint32_t numTilesY = maxTileY - minTileY + 1;
1024 
         // NOTE(review): the tile counts are unsigned, so a count is zero only when max == min - 1;
         // a smaller max wraps to a large count instead. Confirm this early-out covers every
         // empty-intersection case when the assert above is compiled out.
1025     if (numTilesX == 0 || numTilesY == 0)
1026     {
1027         RDTSC_EVENT(BEEmptyTriangle, 1, 0);
1028         AR_END(BERasterizeTriangle, 1);
1029         return;
1030     }
1031 
1032     AR_BEGIN(BEStepSetup, pDC->drawId);
1033 
1034     // Step to pixel center of top-left pixel of the triangle bbox
1035     // Align intersect bbox (top/left) to raster tile's (top/left).
1036     int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
1037     int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
1038 
1039     // convenience typedef
1040     typedef typename RT::NumRasterSamplesT NumRasterSamplesT;
1041 
1042     // single sample rasterization evaluates edges at pixel center,
1043     // multisample evaluates edges UL pixel corner and steps to each sample position
1044     if(std::is_same<NumRasterSamplesT, SingleSampleT>::value)
1045     {
1046         // Add 0.5, in fixed point, to offset to pixel center
1047         x += (FIXED_POINT_SCALE / 2);
1048         y += (FIXED_POINT_SCALE / 2);
1049     }
1050 
1051     __m128i vTopLeftX = _mm_set1_epi32(x);
1052     __m128i vTopLeftY = _mm_set1_epi32(y);
1053 
1054     // evaluate edge equations at top-left pixel using 64bit math
1055     //
1056     // line = Ax + By + C
1057     // solving for C:
1058     // C = -Ax - By
1059     // we know x0 and y0 are on the line; plug them in:
1060     // C = -Ax0 - By0
1061     // plug C back into line equation:
1062     // line = Ax + By - Ax0 - By0
1063     // line = A(x - x0) + B(y - y0)
1064     // dX = (x-x0), dY = (y-y0)
1065     // so all this simplifies to
1066     // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
1067 
1068     __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
1069     __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
1070 
1071     // evaluate A(dx) and B(dY) for all points
1072     __m256d vAipd = _mm256_cvtepi32_pd(vAi);
1073     __m256d vBipd = _mm256_cvtepi32_pd(vBi);
1074     __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
1075     __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
1076 
1077     __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
1078     __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
1079     __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
1080 
1081     // apply any edge adjustments(top-left, crast, etc)
1082     adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
1083 
1084     // broadcast respective edge results to all lanes
1085     double* pEdge = (double*)&vEdge;
1086     __m256d vEdgeFix16[7];
1087     vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
1088     vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
1089     vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
1090 
1091     OSALIGNSIMD(int32_t) aAi[4], aBi[4];
1092     _mm_store_si128((__m128i*)aAi, vAi);
1093     _mm_store_si128((__m128i*)aBi, vBi);
1094     EDGE rastEdges[RT::NumEdgesT::value];
1095 
1096     // Compute and store triangle edge data
1097     ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
1098     ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
1099     ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
1100 
1101     // Compute and store scissor edge data if the scissor needs to be rasterized
1102     ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
1103                        (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
1104 
1105     // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
1106     // used for testing if entire raster tile is inside a triangle
1107     for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1108     {
1109         vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
1110     }
1111 
1112     // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
1113     // step sample positions to the raster tile bbox of multisample points
1114     // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
1115     //                             |      |
1116     //                             |      |
1117     // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
         // NOTE: vEdgeTileBbox is only written when NumRasterSamplesT::value > 1
1118     __m256d vEdgeTileBbox[3];
1119     if (NumRasterSamplesT::value > 1)
1120     {
1121         __m128i vTileSampleBBoxXh = RT::MT::TileSampleOffsetsX();
1122         __m128i vTileSampleBBoxYh = RT::MT::TileSampleOffsetsY();
1123 
1124         __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
1125         __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
1126 
1127         // step edge equation tests from Tile
1128         // used for testing if entire raster tile is inside a triangle
1129         for (uint32_t e = 0; e < 3; ++e)
1130         {
1131             __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
1132             __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
1133             vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1134 
1135             // adjust for msaa tile bbox edges outward for conservative rast, if enabled
1136             adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
1137         }
1138     }
1139 
1140     AR_END(BEStepSetup, 0);
1141 
1142     uint32_t tY = minTileY;
1143     uint32_t tX = minTileX;
1144     uint32_t maxY = maxTileY;
1145     uint32_t maxX = maxTileX;
1146 
1147     RenderOutputBuffers renderBuffers, currentRenderBufferRow;
1148     GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
1149     currentRenderBufferRow = renderBuffers;
1150 
1151     // rasterize and generate coverage masks per sample
1152     for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
1153     {
             // snapshot of the edge values at the first tile of this row; the Y step below restarts from it
1154         __m256d vStartOfRowEdge[RT::NumEdgesT::value];
1155         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1156         {
1157             vStartOfRowEdge[e] = vEdgeFix16[e];
1158         }
1159 
1160         for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
1161         {
1162             triDesc.anyCoveredSamples = 0;
1163 
1164             // is the corner of the edge outside of the raster tile? (vEdge < 0)
1165             int mask0, mask1, mask2;
1166             UpdateEdgeMasks<NumRasterSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
1167 
1168             for (uint32_t sampleNum = 0; sampleNum < NumRasterSamplesT::value; sampleNum++)
1169             {
1170                 // trivial reject, at least one edge has all 4 corners of raster tile outside
1171                 bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
1172 
1173                 if (!trivialReject)
1174                 {
1175                     // trivial accept mask
1176                     triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
1177 
1178                     // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
                         // NOTE(review): mask0/1/2 are passed by reference and may be modified here, so later
                         // iterations of the sample loop test the adjusted masks - confirm this is intended
1179                     UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
1180                         (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
1181 
1182                     // @todo Make this a bit smarter to allow use of trivial accept when:
1183                     //   1) scissor/vp intersection rect is raster tile aligned
1184                     //   2) raster tile is entirely within scissor/vp intersection rect
1185                     if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
1186                     {
1187                         // trivial accept, all 4 corners of all 3 edges are negative
1188                         // i.e. raster tile completely inside triangle
1189                         triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
1190                         if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
1191                         {
1192                             triDesc.innerCoverageMask = 0xffffffffffffffffULL;
1193                         }
1194                         RDTSC_EVENT(BETrivialAccept, 1, 0);
1195                     }
1196                     else
1197                     {
1198                         __m256d vEdgeAtSample[RT::NumEdgesT::value];
1199                         if(std::is_same<NumRasterSamplesT, SingleSampleT>::value)
1200                         {
1201                             // should get optimized out for single sample case (global value numbering or copy propagation)
1202                             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1203                             {
1204                                 vEdgeAtSample[e] = vEdgeFix16[e];
1205                             }
1206                         }
1207                         else
1208                         {
1209                             __m128i vSampleOffsetXh = RT::MT::vXi(sampleNum);
1210                             __m128i vSampleOffsetYh = RT::MT::vYi(sampleNum);
1211                             __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
1212                             __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
1213 
1214                             // step edge equation tests from UL tile corner to pixel sample position
1215                             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1216                             {
1217                                 __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
1218                                 __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
1219                                 vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1220                                 vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
1221                             }
1222                         }
1223 
1224                         double startQuadEdges[RT::NumEdgesT::value];
                             // _mm256_set_epi32 lists elements from highest to lowest, so the two -1 words form the
                             // lowest 64-bit element: only lane 0 of each edge vector is stored below
1225                         const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
1226                         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1227                         {
1228                             _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
1229                         }
1230 
1231                         // not trivial accept or reject, must rasterize full tile
1232                         AR_BEGIN(BERasterizePartial, pDC->drawId);
1233                         triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
1234                         AR_END(BERasterizePartial, 0);
1235 
1236                         triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
1237 
1238                         // Output SV InnerCoverage, if needed
1239                         GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
1240                     }
1241                 }
1242                 else
1243                 {
1244                     // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
1245                     if(NumRasterSamplesT::value > 1)
1246                     {
1247                         triDesc.coverageMask[sampleNum] = 0;
1248                     }
1249                     RDTSC_EVENT(BETrivialReject, 1, 0);
1250                 }
1251             }
1252 
1253 #if KNOB_ENABLE_TOSS_POINTS
1254             if(KNOB_TOSS_RS)
1255             {
1256                 gToss = triDesc.coverageMask[0];
1257             }
1258             else
1259 #endif
1260             if(triDesc.anyCoveredSamples)
1261             {
1262                 // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
1263                 // copy conservative coverage result to all samples
1264                 if(RT::IsConservativeT::value)
1265                 {
1266                     auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
1267                     UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
1268                 }
1269 
1270                 AR_BEGIN(BEPixelBackend, pDC->drawId);
1271                 backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
1272                 AR_END(BEPixelBackend, 0);
1273             }
1274 
1275             // step to the next tile in X
1276             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1277             {
1278                 vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
1279             }
1280             StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers);
1281         }
1282 
1283         // step to the next tile in Y
             // (restarts from the values saved at the beginning of this row)
1284         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1285         {
1286             vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
1287         }
1288         StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
1289     }
1290 
1291     AR_END(BERasterizeTriangle, 1);
1292 }
1293 
RasterizeTriPoint(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)1294 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
1295 {
1296     const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
1297     const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
1298     const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1299 
1300     bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
1301 
1302     // load point vertex
1303     float x = *workDesc.pTriBuffer;
1304     float y = *(workDesc.pTriBuffer + 1);
1305     float z = *(workDesc.pTriBuffer + 2);
1306 
1307     // create a copy of the triangle buffer to write our adjusted vertices to
1308     OSALIGNSIMD(float) newTriBuffer[4 * 4];
1309     TRIANGLE_WORK_DESC newWorkDesc = workDesc;
1310     newWorkDesc.pTriBuffer = &newTriBuffer[0];
1311 
1312     // create a copy of the attrib buffer to write our adjusted attribs to
1313     OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES];
1314     newWorkDesc.pAttribs = &newAttribBuffer[0];
1315 
1316     newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
1317     newWorkDesc.numAttribs = workDesc.numAttribs;
1318     newWorkDesc.triFlags = workDesc.triFlags;
1319 
1320     // construct two tris by bloating point by point size
1321     float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
1322     float lowerX = x - halfPointSize;
1323     float upperX = x + halfPointSize;
1324     float lowerY = y - halfPointSize;
1325     float upperY = y + halfPointSize;
1326 
1327     // tri 0
1328     float *pBuf = &newTriBuffer[0];
1329     *pBuf++ = lowerX;
1330     *pBuf++ = lowerX;
1331     *pBuf++ = upperX;
1332     pBuf++;
1333     *pBuf++ = lowerY;
1334     *pBuf++ = upperY;
1335     *pBuf++ = upperY;
1336     pBuf++;
1337     _mm_store_ps(pBuf, _mm_set1_ps(z));
1338     _mm_store_ps(pBuf+=4, _mm_set1_ps(1.0f));
1339 
1340     // setup triangle rasterizer function
1341     PFN_WORK_FUNC pfnTriRast;
1342     // for center sample pattern, all samples are at pixel center; calculate coverage
1343     // once at center and broadcast the results in the backend
1344     uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
1345     // conservative rast not supported for points/lines
1346     pfnTriRast = GetRasterizerFunc(sampleCount, false, SWR_INPUT_COVERAGE_NONE, ALL_EDGES_VALID, (pDC->pState->state.scissorsTileAligned == false));
1347 
1348     // overwrite texcoords for point sprites
1349     if (isPointSpriteTexCoordEnabled)
1350     {
1351         // copy original attribs
1352         memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
1353         newWorkDesc.pAttribs = &newAttribBuffer[0];
1354 
1355         // overwrite texcoord for point sprites
1356         uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
1357         DWORD texCoordAttrib = 0;
1358 
1359         while (_BitScanForward(&texCoordAttrib, texCoordMask))
1360         {
1361             texCoordMask &= ~(1 << texCoordAttrib);
1362             __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
1363             if (rastState.pointSpriteTopOrigin)
1364             {
1365                 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
1366                 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
1367                 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
1368             }
1369             else
1370             {
1371                 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
1372                 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
1373                 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
1374             }
1375         }
1376     }
1377     else
1378     {
1379         // no texcoord overwrite, can reuse the attrib buffer from frontend
1380         newWorkDesc.pAttribs = workDesc.pAttribs;
1381     }
1382 
1383     pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1384 
1385     // tri 1
1386     pBuf = &newTriBuffer[0];
1387     *pBuf++ = lowerX;
1388     *pBuf++ = upperX;
1389     *pBuf++ = upperX;
1390     pBuf++;
1391     *pBuf++ = lowerY;
1392     *pBuf++ = upperY;
1393     *pBuf++ = lowerY;
1394     // z, w unchanged
1395 
1396     if (isPointSpriteTexCoordEnabled)
1397     {
1398         uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
1399         DWORD texCoordAttrib = 0;
1400 
1401         while (_BitScanForward(&texCoordAttrib, texCoordMask))
1402         {
1403             texCoordMask &= ~(1 << texCoordAttrib);
1404             __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
1405             if (rastState.pointSpriteTopOrigin)
1406             {
1407                 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
1408                 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
1409                 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
1410 
1411             }
1412             else
1413             {
1414                 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
1415                 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
1416                 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
1417             }
1418         }
1419     }
1420 
1421     pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1422 }
1423 
RasterizeSimplePoint(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)1424 void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
1425 {
1426     SWR_CONTEXT *pContext = pDC->pContext;
1427 
1428 #if KNOB_ENABLE_TOSS_POINTS
1429     if (KNOB_TOSS_BIN_TRIS)
1430     {
1431         return;
1432     }
1433 #endif
1434 
1435     const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
1436     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
1437 
1438     // map x,y relative offsets from start of raster tile to bit position in
1439     // coverage mask for the point
1440     static const uint32_t coverageMap[8][8] = {
1441         { 0, 1, 4, 5, 8, 9, 12, 13 },
1442         { 2, 3, 6, 7, 10, 11, 14, 15 },
1443         { 16, 17, 20, 21, 24, 25, 28, 29 },
1444         { 18, 19, 22, 23, 26, 27, 30, 31 },
1445         { 32, 33, 36, 37, 40, 41, 44, 45 },
1446         { 34, 35, 38, 39, 42, 43, 46, 47 },
1447         { 48, 49, 52, 53, 56, 57, 60, 61 },
1448         { 50, 51, 54, 55, 58, 59, 62, 63 }
1449     };
1450 
1451     OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
1452 
1453     // pull point information from triangle buffer
1454     // @todo use structs for readability
1455     uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
1456     uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
1457     float z = *(workDesc.pTriBuffer + 2);
1458 
1459     // construct triangle descriptor for point
1460     // no interpolation, set up i,j for constant interpolation of z and attribs
1461     // @todo implement an optimized backend that doesn't require triangle information
1462 
1463     // compute coverage mask from x,y packed into the coverageMask flag
1464     // mask indices by the maximum valid index for x/y of coveragemap.
1465     uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
1466     uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
1467     // todo: multisample points?
1468     triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
1469 
1470     // no persp divide needed for points
1471     triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
1472     triDesc.triFlags = workDesc.triFlags;
1473     triDesc.recipDet = 1.0f;
1474     triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
1475     triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
1476     triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
1477     triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
1478 
1479     RenderOutputBuffers renderBuffers;
1480     GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
1481         renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
1482 
1483     AR_BEGIN(BEPixelBackend, pDC->drawId);
1484     backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
1485     AR_END(BEPixelBackend, 0);
1486 }
1487 
1488 // Get pointers to hot tile memory for color RT, depth, stencil
1489 template <uint32_t numSamples>
GetRenderHotTiles(DRAW_CONTEXT * pDC,uint32_t macroID,uint32_t tileX,uint32_t tileY,RenderOutputBuffers & renderBuffers,uint32_t renderTargetArrayIndex)1490 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
1491 {
1492     const API_STATE& state = GetApiState(pDC);
1493     SWR_CONTEXT *pContext = pDC->pContext;
1494 
1495     uint32_t mx, my;
1496     MacroTileMgr::getTileIndices(macroID, mx, my);
1497     tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
1498     tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
1499 
1500     // compute tile offset for active hottile buffers
1501     const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
1502     uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1503     offset*=numSamples;
1504 
1505     unsigned long rtSlot = 0;
1506     uint32_t colorHottileEnableMask = state.colorHottileEnable;
1507     while(_BitScanForward(&rtSlot, colorHottileEnableMask))
1508     {
1509         HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
1510             numSamples, renderTargetArrayIndex);
1511         pColor->state = HOTTILE_DIRTY;
1512         renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
1513 
1514         colorHottileEnableMask &= ~(1 << rtSlot);
1515     }
1516     if(state.depthHottileEnable)
1517     {
1518         const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
1519         uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1520         offset*=numSamples;
1521         HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
1522             numSamples, renderTargetArrayIndex);
1523         pDepth->state = HOTTILE_DIRTY;
1524         SWR_ASSERT(pDepth->pBuffer != nullptr);
1525         renderBuffers.pDepth = pDepth->pBuffer + offset;
1526     }
1527     if(state.stencilHottileEnable)
1528     {
1529         const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
1530         uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1531         offset*=numSamples;
1532         HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
1533             numSamples, renderTargetArrayIndex);
1534         pStencil->state = HOTTILE_DIRTY;
1535         SWR_ASSERT(pStencil->pBuffer != nullptr);
1536         renderBuffers.pStencil = pStencil->pBuffer + offset;
1537     }
1538 }
1539 
1540 template <typename RT>
StepRasterTileX(uint32_t NumRT,RenderOutputBuffers & buffers)1541 INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers)
1542 {
1543     for(uint32_t rt = 0; rt < NumRT; ++rt)
1544     {
1545         buffers.pColor[rt] += RT::colorRasterTileStep;
1546     }
1547 
1548     buffers.pDepth += RT::depthRasterTileStep;
1549     buffers.pStencil += RT::stencilRasterTileStep;
1550 }
1551 
1552 template <typename RT>
StepRasterTileY(uint32_t NumRT,RenderOutputBuffers & buffers,RenderOutputBuffers & startBufferRow)1553 INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
1554 {
1555     for(uint32_t rt = 0; rt < NumRT; ++rt)
1556     {
1557         startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
1558         buffers.pColor[rt] = startBufferRow.pColor[rt];
1559     }
1560     startBufferRow.pDepth += RT::depthRasterTileRowStep;
1561     buffers.pDepth = startBufferRow.pDepth;
1562 
1563     startBufferRow.pStencil += RT::stencilRasterTileRowStep;
1564     buffers.pStencil = startBufferRow.pStencil;
1565 }
1566 
RasterizeLine(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)1567 void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
1568 {
1569     SWR_CONTEXT *pContext = pDC->pContext;
1570     const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
1571 #if KNOB_ENABLE_TOSS_POINTS
1572     if (KNOB_TOSS_BIN_TRIS)
1573     {
1574         return;
1575     }
1576 #endif
1577 
1578     // bloat line to two tris and call the triangle rasterizer twice
1579     AR_BEGIN(BERasterizeLine, pDC->drawId);
1580 
1581     const API_STATE &state = GetApiState(pDC);
1582     const SWR_RASTSTATE &rastState = state.rastState;
1583 
1584     // macrotile dimensioning
1585     uint32_t macroX, macroY;
1586     MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
1587     int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
1588     int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
1589     int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
1590     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
1591 
1592     const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
1593 
1594     // create a copy of the triangle buffer to write our adjusted vertices to
1595     OSALIGNSIMD(float) newTriBuffer[4 * 4];
1596     TRIANGLE_WORK_DESC newWorkDesc = workDesc;
1597     newWorkDesc.pTriBuffer = &newTriBuffer[0];
1598 
1599     // create a copy of the attrib buffer to write our adjusted attribs to
1600     OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES];
1601     newWorkDesc.pAttribs = &newAttribBuffer[0];
1602 
1603     const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
1604     const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
1605 
1606     __m128 vX, vY, vZ, vRecipW;
1607 
1608     vX = _mm_load_ps(workDesc.pTriBuffer);
1609     vY = _mm_load_ps(workDesc.pTriBuffer + 4);
1610     vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
1611     vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
1612 
1613     // triangle 0
1614     // v0,v1 -> v0,v0,v1
1615     __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
1616     __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
1617     __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
1618     __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
1619 
1620     __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
1621     __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
1622     if (workDesc.triFlags.yMajor)
1623     {
1624         vXa = _mm_add_ps(vAdjust, vXa);
1625     }
1626     else
1627     {
1628         vYa = _mm_add_ps(vAdjust, vYa);
1629     }
1630 
1631     // Store triangle description for rasterizer
1632     _mm_store_ps((float*)&newTriBuffer[0], vXa);
1633     _mm_store_ps((float*)&newTriBuffer[4], vYa);
1634     _mm_store_ps((float*)&newTriBuffer[8], vZa);
1635     _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
1636 
1637     // binner bins 3 edges for lines as v0, v1, v1
1638     // tri0 needs v0, v0, v1
1639     for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
1640     {
1641         __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]);
1642         __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]);
1643 
1644         _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0);
1645         _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0);
1646         _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1);
1647     }
1648 
1649     // Store user clip distances for triangle 0
1650     float newClipBuffer[3 * 8];
1651     uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask);
1652     if (numClipDist)
1653     {
1654         newWorkDesc.pUserClipBuffer = newClipBuffer;
1655 
1656         float* pOldBuffer = workDesc.pUserClipBuffer;
1657         float* pNewBuffer = newClipBuffer;
1658         for (uint32_t i = 0; i < numClipDist; ++i)
1659         {
1660             // read barycentric coeffs from binner
1661             float a = *(pOldBuffer++);
1662             float b = *(pOldBuffer++);
1663 
1664             // reconstruct original clip distance at vertices
1665             float c0 = a + b;
1666             float c1 = b;
1667 
1668             // construct triangle barycentrics
1669             *(pNewBuffer++) = c0 - c1;
1670             *(pNewBuffer++) = c0 - c1;
1671             *(pNewBuffer++) = c1;
1672         }
1673     }
1674 
1675     // setup triangle rasterizer function
1676     PFN_WORK_FUNC pfnTriRast;
1677     uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
1678     // conservative rast not supported for points/lines
1679     pfnTriRast = GetRasterizerFunc(sampleCount, false, SWR_INPUT_COVERAGE_NONE, ALL_EDGES_VALID, (pDC->pState->state.scissorsTileAligned == false));
1680 
1681     // make sure this macrotile intersects the triangle
1682     __m128i vXai = fpToFixedPoint(vXa);
1683     __m128i vYai = fpToFixedPoint(vYa);
1684     OSALIGNSIMD(SWR_RECT) bboxA;
1685     calcBoundingBoxInt(vXai, vYai, bboxA);
1686 
1687     if (!(bboxA.xmin > macroBoxRight ||
1688           bboxA.xmin > scissorInFixedPoint.xmax ||
1689           bboxA.xmax - 1 < macroBoxLeft ||
1690           bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
1691           bboxA.ymin > macroBoxBottom ||
1692           bboxA.ymin > scissorInFixedPoint.ymax ||
1693           bboxA.ymax - 1 < macroBoxTop ||
1694           bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
1695         // rasterize triangle
1696         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1697     }
1698 
1699     // triangle 1
1700     // v0,v1 -> v1,v1,v0
1701     vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
1702     vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
1703     vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
1704     vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
1705 
1706     vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
1707     if (workDesc.triFlags.yMajor)
1708     {
1709         vXa = _mm_add_ps(vAdjust, vXa);
1710     }
1711     else
1712     {
1713         vYa = _mm_add_ps(vAdjust, vYa);
1714     }
1715 
1716     // Store triangle description for rasterizer
1717     _mm_store_ps((float*)&newTriBuffer[0], vXa);
1718     _mm_store_ps((float*)&newTriBuffer[4], vYa);
1719     _mm_store_ps((float*)&newTriBuffer[8], vZa);
1720     _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
1721 
1722     // binner bins 3 edges for lines as v0, v1, v1
1723     // tri1 needs v1, v1, v0
1724     for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
1725     {
1726         __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
1727         __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
1728 
1729         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
1730         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
1731         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
1732     }
1733 
1734     // store user clip distance for triangle 1
1735     if (numClipDist)
1736     {
1737         float* pOldBuffer = workDesc.pUserClipBuffer;
1738         float* pNewBuffer = newClipBuffer;
1739         for (uint32_t i = 0; i < numClipDist; ++i)
1740         {
1741             // read barycentric coeffs from binner
1742             float a = *(pOldBuffer++);
1743             float b = *(pOldBuffer++);
1744 
1745             // reconstruct original clip distance at vertices
1746             float c0 = a + b;
1747             float c1 = b;
1748 
1749             // construct triangle barycentrics
1750             *(pNewBuffer++) = c1 - c0;
1751             *(pNewBuffer++) = c1 - c0;
1752             *(pNewBuffer++) = c0;
1753         }
1754     }
1755 
1756     vXai = fpToFixedPoint(vXa);
1757     vYai = fpToFixedPoint(vYa);
1758     calcBoundingBoxInt(vXai, vYai, bboxA);
1759 
1760     if (!(bboxA.xmin > macroBoxRight ||
1761           bboxA.xmin > scissorInFixedPoint.xmax ||
1762           bboxA.xmax - 1 < macroBoxLeft ||
1763           bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
1764           bboxA.ymin > macroBoxBottom ||
1765           bboxA.ymin > scissorInFixedPoint.ymax ||
1766           bboxA.ymax - 1 < macroBoxTop ||
1767           bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
1768         // rasterize triangle
1769         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
1770     }
1771 
1772     AR_END(BERasterizeLine, 1);
1773 }
1774 
1775 struct RasterizerChooser
1776 {
1777     typedef PFN_WORK_FUNC FuncType;
1778 
1779     template <typename... ArgsB>
GetFuncRasterizerChooser1780     static FuncType GetFunc()
1781     {
1782         return RasterizeTriangle<RasterizerTraits<ArgsB...>>;
1783     }
1784 };
1785 
1786 // Selector for correct templated RasterizeTriangle function
GetRasterizerFunc(uint32_t numSamples,bool IsConservative,uint32_t InputCoverage,uint32_t EdgeEnable,bool RasterizeScissorEdges)1787 PFN_WORK_FUNC GetRasterizerFunc(
1788     uint32_t numSamples,
1789     bool IsConservative,
1790     uint32_t InputCoverage,
1791     uint32_t EdgeEnable,
1792     bool RasterizeScissorEdges
1793 )
1794 {
1795     return TemplateArgUnroller<RasterizerChooser>::GetFunc(
1796         IntArg<SWR_MULTISAMPLE_1X,SWR_MULTISAMPLE_TYPE_COUNT-1>{numSamples},
1797         IsConservative,
1798         IntArg<SWR_INPUT_COVERAGE_NONE, SWR_INPUT_COVERAGE_COUNT-1>{InputCoverage},
1799         IntArg<0, VALID_TRI_EDGE_COUNT-1>{EdgeEnable},
1800         RasterizeScissorEdges);
1801 }
1802