• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file binner.cpp
24 *
25 * @brief Implementation for the macrotile binner
26 *
27 ******************************************************************************/
28 
29 #include "context.h"
30 #include "frontend.h"
31 #include "conservativeRast.h"
32 #include "pa.h"
33 #include "rasterizer.h"
34 #include "rdtsc_core.h"
35 #include "tilemgr.h"
36 
37 // Function Prototype
38 void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
39 
40 //////////////////////////////////////////////////////////////////////////
41 /// @brief Offsets added to post-viewport vertex positions based on
42 /// raster state.
43 static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
44 {
45     _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
46     _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
47 };
48 
49 //////////////////////////////////////////////////////////////////////////
50 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
51 /// Point precision from FP32.
52 template <typename PT = FixedPointTraits<Fixed_16_8>>
fpToFixedPointVertical(const simdscalar vIn)53 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
54 {
55     simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
56     return _simd_cvtps_epi32(vFixed);
57 }
58 
59 //////////////////////////////////////////////////////////////////////////
60 /// @brief Helper function to set the X,Y coords of a triangle to the
61 /// requested Fixed Point precision from FP32.
62 /// @param tri: simdvector[3] of FP triangle verts
63 /// @param vXi: fixed point X coords of tri verts
64 /// @param vYi: fixed point Y coords of tri verts
FPToFixedPoint(const simdvector * const tri,simdscalari (& vXi)[3],simdscalari (& vYi)[3])65 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari(&vXi)[3], simdscalari(&vYi)[3])
66 {
67     vXi[0] = fpToFixedPointVertical(tri[0].x);
68     vYi[0] = fpToFixedPointVertical(tri[0].y);
69     vXi[1] = fpToFixedPointVertical(tri[1].x);
70     vYi[1] = fpToFixedPointVertical(tri[1].y);
71     vXi[2] = fpToFixedPointVertical(tri[2].x);
72     vYi[2] = fpToFixedPointVertical(tri[2].y);
73 }
74 
75 //////////////////////////////////////////////////////////////////////////
76 /// @brief Calculate bounding box for current triangle
77 /// @tparam CT: ConservativeRastFETraits type
78 /// @param vX: fixed point X position for triangle verts
79 /// @param vY: fixed point Y position for triangle verts
80 /// @param bbox: fixed point bbox
81 /// *Note*: expects vX, vY to be in the correct precision for the type
82 /// of rasterization. This avoids unnecessary FP->fixed conversions.
83 template <typename CT>
calcBoundingBoxIntVertical(const simdvector * const tri,simdscalari (& vX)[3],simdscalari (& vY)[3],simdBBox & bbox)84 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
85 {
86     simdscalari vMinX = vX[0];
87     vMinX = _simd_min_epi32(vMinX, vX[1]);
88     vMinX = _simd_min_epi32(vMinX, vX[2]);
89 
90     simdscalari vMaxX = vX[0];
91     vMaxX = _simd_max_epi32(vMaxX, vX[1]);
92     vMaxX = _simd_max_epi32(vMaxX, vX[2]);
93 
94     simdscalari vMinY = vY[0];
95     vMinY = _simd_min_epi32(vMinY, vY[1]);
96     vMinY = _simd_min_epi32(vMinY, vY[2]);
97 
98     simdscalari vMaxY = vY[0];
99     vMaxY = _simd_max_epi32(vMaxY, vY[1]);
100     vMaxY = _simd_max_epi32(vMaxY, vY[2]);
101 
102     bbox.xmin = vMinX;
103     bbox.xmax = vMaxX;
104     bbox.ymin = vMinY;
105     bbox.ymax = vMaxY;
106 }
107 
108 //////////////////////////////////////////////////////////////////////////
109 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
110 /// Offsets BBox for conservative rast
111 template <>
calcBoundingBoxIntVertical(const simdvector * const tri,simdscalari (& vX)[3],simdscalari (& vY)[3],simdBBox & bbox)112 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
113 {
114     // FE conservative rast traits
115     typedef FEConservativeRastT CT;
116 
117     simdscalari vMinX = vX[0];
118     vMinX = _simd_min_epi32(vMinX, vX[1]);
119     vMinX = _simd_min_epi32(vMinX, vX[2]);
120 
121     simdscalari vMaxX = vX[0];
122     vMaxX = _simd_max_epi32(vMaxX, vX[1]);
123     vMaxX = _simd_max_epi32(vMaxX, vX[2]);
124 
125     simdscalari vMinY = vY[0];
126     vMinY = _simd_min_epi32(vMinY, vY[1]);
127     vMinY = _simd_min_epi32(vMinY, vY[2]);
128 
129     simdscalari vMaxY = vY[0];
130     vMaxY = _simd_max_epi32(vMaxY, vY[1]);
131     vMaxY = _simd_max_epi32(vMaxY, vY[2]);
132 
133     /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
134     /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
135     bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
136     bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
137     bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
138     bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
139 }
140 
141 //////////////////////////////////////////////////////////////////////////
142 /// @brief Processes attributes for the backend based on linkage mask and
143 ///        linkage map.  Essentially just doing an SOA->AOS conversion and pack.
144 /// @param pDC - Draw context
145 /// @param pa - Primitive Assembly state
146 /// @param linkageMask - Specifies which VS outputs are routed to PS.
147 /// @param pLinkageMap - maps VS attribute slot to PS slot
148 /// @param triIndex - Triangle to process attributes for
149 /// @param pBuffer - Output result
150 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
ProcessAttributes(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t triIndex,uint32_t primId,float * pBuffer)151 INLINE void ProcessAttributes(
152     DRAW_CONTEXT *pDC,
153     PA_STATE&pa,
154     uint32_t triIndex,
155     uint32_t primId,
156     float *pBuffer)
157 {
158     static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
159     const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
160     // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
161     LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
162     const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
163     const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
164 
165     static const float constTable[3][4] = {
166         { 0.0f, 0.0f, 0.0f, 0.0f },
167         { 0.0f, 0.0f, 0.0f, 1.0f },
168         { 1.0f, 1.0f, 1.0f, 1.0f }
169     };
170 
171     for (uint32_t i = 0; i < backendState.numAttributes; ++i)
172     {
173         uint32_t inputSlot;
174         if (IsSwizzledT::value)
175         {
176             SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
177             inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
178 
179         }
180         else
181         {
182             inputSlot = VERTEX_ATTRIB_START_SLOT + i;
183         }
184 
185         __m128 attrib[3];    // triangle attribs (always 4 wide)
186         float* pAttribStart = pBuffer;
187 
188         if (HasConstantInterpT::value || IsDegenerate::value)
189         {
190             if (_bittest(&constantInterpMask, i))
191             {
192                 uint32_t vid;
193                 uint32_t adjustedTriIndex;
194                 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
195                 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
196                 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
197                 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
198                 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
199 
200                 switch (topo) {
201                 case TOP_QUAD_LIST:
202                     adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
203                     vid = quadProvokingVertex[triIndex & 1][provokingVertex];
204                     break;
205                 case TOP_QUAD_STRIP:
206                     adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
207                     vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
208                     break;
209                 case TOP_TRIANGLE_STRIP:
210                     adjustedTriIndex = triIndex;
211                     vid = (triIndex & 1)
212                         ? tristripProvokingVertex[provokingVertex]
213                         : provokingVertex;
214                     break;
215                 default:
216                     adjustedTriIndex = triIndex;
217                     vid = provokingVertex;
218                     break;
219                 }
220 
221                 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
222 
223                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
224                 {
225                     _mm_store_ps(pBuffer, attrib[vid]);
226                     pBuffer += 4;
227                 }
228             }
229             else
230             {
231                 pa.AssembleSingle(inputSlot, triIndex, attrib);
232 
233                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
234                 {
235                     _mm_store_ps(pBuffer, attrib[i]);
236                     pBuffer += 4;
237                 }
238             }
239         }
240         else
241         {
242             pa.AssembleSingle(inputSlot, triIndex, attrib);
243 
244             for (uint32_t i = 0; i < NumVertsT::value; ++i)
245             {
246                 _mm_store_ps(pBuffer, attrib[i]);
247                 pBuffer += 4;
248             }
249         }
250 
251         // pad out the attrib buffer to 3 verts to ensure the triangle
252         // interpolation code in the pixel shader works correctly for the
253         // 3 topologies - point, line, tri.  This effectively zeros out the
254         // effect of the missing vertices in the triangle interpolation.
255         for (uint32_t v = NumVertsT::value; v < 3; ++v)
256         {
257             _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
258             pBuffer += 4;
259         }
260 
261         // check for constant source overrides
262         if (IsSwizzledT::value)
263         {
264             uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
265             if (mask)
266             {
267                 DWORD comp;
268                 while (_BitScanForward(&comp, mask))
269                 {
270                     mask &= ~(1 << comp);
271 
272                     float constantValue = 0.0f;
273                     switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
274                     {
275                     case SWR_CONSTANT_SOURCE_CONST_0000:
276                     case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
277                     case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
278                         constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
279                         break;
280                     case SWR_CONSTANT_SOURCE_PRIM_ID:
281                         constantValue = *(float*)&primId;
282                         break;
283                     }
284 
285                     // apply constant value to all 3 vertices
286                     for (uint32_t v = 0; v < 3; ++v)
287                     {
288                         pAttribStart[comp + v * 4] = constantValue;
289                     }
290                 }
291             }
292         }
293     }
294 }
295 
296 //////////////////////////////////////////////////////////////////////////
297 /// @brief  Gather scissor rect data based on per-prim viewport indices.
298 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
299 /// @param pViewportIndex - array of per-primitive vewport indexes.
300 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
301 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
302 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
303 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
304 //
305 /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
306 template<size_t SimdWidth>
307 struct GatherScissors
308 {
GatherGatherScissors309     static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
310         simdscalari &scisXmin, simdscalari &scisYmin,
311         simdscalari &scisXmax, simdscalari &scisYmax)
312     {
313         SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
314     }
315 };
316 
317 template<>
318 struct GatherScissors<8>
319 {
GatherGatherScissors320     static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
321         simdscalari &scisXmin, simdscalari &scisYmin,
322         simdscalari &scisXmax, simdscalari &scisYmax)
323     {
324         scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
325             pScissorsInFixedPoint[pViewportIndex[1]].xmin,
326             pScissorsInFixedPoint[pViewportIndex[2]].xmin,
327             pScissorsInFixedPoint[pViewportIndex[3]].xmin,
328             pScissorsInFixedPoint[pViewportIndex[4]].xmin,
329             pScissorsInFixedPoint[pViewportIndex[5]].xmin,
330             pScissorsInFixedPoint[pViewportIndex[6]].xmin,
331             pScissorsInFixedPoint[pViewportIndex[7]].xmin);
332         scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
333             pScissorsInFixedPoint[pViewportIndex[1]].ymin,
334             pScissorsInFixedPoint[pViewportIndex[2]].ymin,
335             pScissorsInFixedPoint[pViewportIndex[3]].ymin,
336             pScissorsInFixedPoint[pViewportIndex[4]].ymin,
337             pScissorsInFixedPoint[pViewportIndex[5]].ymin,
338             pScissorsInFixedPoint[pViewportIndex[6]].ymin,
339             pScissorsInFixedPoint[pViewportIndex[7]].ymin);
340         scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
341             pScissorsInFixedPoint[pViewportIndex[1]].xmax,
342             pScissorsInFixedPoint[pViewportIndex[2]].xmax,
343             pScissorsInFixedPoint[pViewportIndex[3]].xmax,
344             pScissorsInFixedPoint[pViewportIndex[4]].xmax,
345             pScissorsInFixedPoint[pViewportIndex[5]].xmax,
346             pScissorsInFixedPoint[pViewportIndex[6]].xmax,
347             pScissorsInFixedPoint[pViewportIndex[7]].xmax);
348         scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
349             pScissorsInFixedPoint[pViewportIndex[1]].ymax,
350             pScissorsInFixedPoint[pViewportIndex[2]].ymax,
351             pScissorsInFixedPoint[pViewportIndex[3]].ymax,
352             pScissorsInFixedPoint[pViewportIndex[4]].ymax,
353             pScissorsInFixedPoint[pViewportIndex[5]].ymax,
354             pScissorsInFixedPoint[pViewportIndex[6]].ymax,
355             pScissorsInFixedPoint[pViewportIndex[7]].ymax);
356     }
357 };
358 
359 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
360 
361 struct ProcessAttributesChooser
362 {
363     typedef PFN_PROCESS_ATTRIBUTES FuncType;
364 
365     template <typename... ArgsB>
GetFuncProcessAttributesChooser366     static FuncType GetFunc()
367     {
368         return ProcessAttributes<ArgsB...>;
369     }
370 };
371 
GetProcessAttributesFunc(uint32_t NumVerts,bool IsSwizzled,bool HasConstantInterp,bool IsDegenerate=false)372 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
373 {
374     return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
375 }
376 
377 //////////////////////////////////////////////////////////////////////////
378 /// @brief Processes enabled user clip distances. Loads the active clip
379 ///        distances from the PA, sets up barycentric equations, and
380 ///        stores the results to the output buffer
381 /// @param pa - Primitive Assembly state
382 /// @param primIndex - primitive index to process
383 /// @param clipDistMask - mask of enabled clip distances
384 /// @param pUserClipBuffer - buffer to store results
385 template<uint32_t NumVerts>
ProcessUserClipDist(PA_STATE & pa,uint32_t primIndex,uint8_t clipDistMask,float * pRecipW,float * pUserClipBuffer)386 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float *pRecipW, float* pUserClipBuffer)
387 {
388     DWORD clipDist;
389     while (_BitScanForward(&clipDist, clipDistMask))
390     {
391         clipDistMask &= ~(1 << clipDist);
392         uint32_t clipSlot = clipDist >> 2;
393         uint32_t clipComp = clipDist & 0x3;
394         uint32_t clipAttribSlot = clipSlot == 0 ?
395             VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
396 
397         __m128 primClipDist[3];
398         pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
399 
400         float vertClipDist[NumVerts];
401         for (uint32_t e = 0; e < NumVerts; ++e)
402         {
403             OSALIGNSIMD(float) aVertClipDist[4];
404             _mm_store_ps(aVertClipDist, primClipDist[e]);
405             vertClipDist[e] = aVertClipDist[clipComp];
406         };
407 
408         // setup plane equations for barycentric interpolation in the backend
409         float baryCoeff[NumVerts];
410         float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
411         for (uint32_t e = 0; e < NumVerts - 1; ++e)
412         {
413             baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
414         }
415         baryCoeff[NumVerts - 1] = last;
416 
417         for (uint32_t e = 0; e < NumVerts; ++e)
418         {
419             *(pUserClipBuffer++) = baryCoeff[e];
420         }
421     }
422 }
423 
424 //////////////////////////////////////////////////////////////////////////
425 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
426 ///        culling, viewport transform, etc.
427 /// @param pDC - pointer to draw context.
428 /// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
430 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
431 /// @param primID - Primitive ID for each triangle.
432 /// @param viewportIdx - viewport array index for each triangle.
433 /// @tparam CT - ConservativeRastFETraits
434 template <typename CT>
BinTriangles(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector tri[3],uint32_t triMask,simdscalari primID,simdscalari viewportIdx)435 void BinTriangles(
436     DRAW_CONTEXT *pDC,
437     PA_STATE& pa,
438     uint32_t workerId,
439     simdvector tri[3],
440     uint32_t triMask,
441     simdscalari primID,
442     simdscalari viewportIdx)
443 {
444     SWR_CONTEXT *pContext = pDC->pContext;
445 
446     AR_BEGIN(FEBinTriangles, pDC->drawId);
447 
448     const API_STATE& state = GetApiState(pDC);
449     const SWR_RASTSTATE& rastState = state.rastState;
450     const SWR_FRONTEND_STATE& feState = state.frontendState;
451     const SWR_GS_STATE& gsState = state.gsState;
452     MacroTileMgr *pTileMgr = pDC->pTileMgr;
453 
454     simdscalar vRecipW0 = _simd_set1_ps(1.0f);
455     simdscalar vRecipW1 = _simd_set1_ps(1.0f);
456     simdscalar vRecipW2 = _simd_set1_ps(1.0f);
457 
458     if (feState.vpTransformDisable)
459     {
460         // RHW is passed in directly when VP transform is disabled
461         vRecipW0 = tri[0].v[3];
462         vRecipW1 = tri[1].v[3];
463         vRecipW2 = tri[2].v[3];
464     }
465     else
466     {
467         // Perspective divide
468         vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
469         vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
470         vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
471 
472         tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
473         tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
474         tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
475 
476         tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
477         tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
478         tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
479 
480         tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
481         tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
482         tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
483 
484         // Viewport transform to screen space coords
485         if (state.gsState.emitsViewportArrayIndex)
486         {
487             viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
488         }
489         else
490         {
491             viewportTransform<3>(tri, state.vpMatrices);
492         }
493     }
494 
495     // Adjust for pixel center location
496     simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
497     tri[0].x = _simd_add_ps(tri[0].x, offset);
498     tri[0].y = _simd_add_ps(tri[0].y, offset);
499 
500     tri[1].x = _simd_add_ps(tri[1].x, offset);
501     tri[1].y = _simd_add_ps(tri[1].y, offset);
502 
503     tri[2].x = _simd_add_ps(tri[2].x, offset);
504     tri[2].y = _simd_add_ps(tri[2].y, offset);
505 
506     simdscalari vXi[3], vYi[3];
507     // Set vXi, vYi to required fixed point precision
508     FPToFixedPoint(tri, vXi, vYi);
509 
510     // triangle setup
511     simdscalari vAi[3], vBi[3];
512     triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
513 
514     // determinant
515     simdscalari vDet[2];
516     calcDeterminantIntVertical(vAi, vBi, vDet);
517 
518     // cull zero area
519     int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
520     int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
521 
522     int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
523 
524     uint32_t origTriMask = triMask;
525     // don't cull degenerate triangles if we're conservatively rasterizing
526     if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
527     {
528         triMask &= ~cullZeroAreaMask;
529     }
530 
531     // determine front winding tris
532     // CW  +det
533     // CCW det < 0;
534     // 0 area triangles are marked as backfacing regardless of winding order,
535     // which is required behavior for conservative rast and wireframe rendering
536     uint32_t frontWindingTris;
537     if (rastState.frontWinding == SWR_FRONTWINDING_CW)
538     {
539         maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
540         maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
541     }
542     else
543     {
544         maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[0])));
545         maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[1])));
546     }
547     frontWindingTris = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
548 
549     // cull
550     uint32_t cullTris;
551     switch ((SWR_CULLMODE)rastState.cullMode)
552     {
553     case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
554     case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
555     case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
556         // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
557     case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
558     default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
559     }
560 
561     triMask &= ~cullTris;
562 
563     if (origTriMask ^ triMask)
564     {
565         RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
566     }
567 
568     // Simple non-conformant wireframe mode, useful for debugging
569     if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
570     {
571         // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
572         simdvector line[2];
573         simdscalar recipW[2];
574         line[0] = tri[0];
575         line[1] = tri[1];
576         recipW[0] = vRecipW0;
577         recipW[1] = vRecipW1;
578         BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
579 
580         line[0] = tri[1];
581         line[1] = tri[2];
582         recipW[0] = vRecipW1;
583         recipW[1] = vRecipW2;
584         BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
585 
586         line[0] = tri[2];
587         line[1] = tri[0];
588         recipW[0] = vRecipW2;
589         recipW[1] = vRecipW0;
590         BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
591 
592         AR_END(FEBinTriangles, 1);
593         return;
594     }
595 
    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
597     // compute per tri backface
598     uint32_t frontFaceMask = frontWindingTris;
599     uint32_t *pPrimID = (uint32_t *)&primID;
600     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
601     DWORD triIndex = 0;
602     // for center sample pattern, all samples are at pixel center; calculate coverage
603     // once at center and broadcast the results in the backend
604     const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
605     uint32_t edgeEnable;
606     PFN_WORK_FUNC pfnWork;
607     if (CT::IsConservativeT::value)
608     {
609         // determine which edges of the degenerate tri, if any, are valid to rasterize.
610         // used to call the appropriate templated rasterizer function
611         if (cullZeroAreaMask > 0)
612         {
613             // e0 = v1-v0
614             simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
615             simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
616             uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
617 
618             // e1 = v2-v1
619             simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
620             simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
621             uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
622 
623             // e2 = v0-v2
624             // if v0 == v1 & v1 == v2, v0 == v2
625             uint32_t e2Mask = e0Mask & e1Mask;
626             SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
627 
628             // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
629             // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
630             e0Mask = pdep_u32(e0Mask, 0x00249249);
631             // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
632             e1Mask = pdep_u32(e1Mask, 0x00492492);
633             // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
634             e2Mask = pdep_u32(e2Mask, 0x00924924);
635 
636             edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
637         }
638         else
639         {
640             edgeEnable = 0x00FFFFFF;
641         }
642     }
643     else
644     {
645         // degenerate triangles won't be sent to rasterizer; just enable all edges
646         pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
647             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
648             (state.scissorsTileAligned == false));
649     }
650 
651     if (!triMask)
652     {
653         goto endBinTriangles;
654     }
655 
656     // Calc bounding box of triangles
657     simdBBox bbox;
658     calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
659 
660     // determine if triangle falls between pixel centers and discard
661     // only discard for non-MSAA case and when conservative rast is disabled
662     // (xmin + 127) & ~255
663     // (xmax + 128) & ~255
664     if (rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
665     {
666         origTriMask = triMask;
667 
668         int cullCenterMask;
669         {
670             simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
671             xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
672             simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
673             xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));
674 
675             simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);
676 
677             simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
678             ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
679             simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
680             ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));
681 
682             simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
683             vMaskV = _simd_or_si(vMaskH, vMaskV);
684             cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
685         }
686 
687         triMask &= ~cullCenterMask;
688 
689         if (origTriMask ^ triMask)
690         {
691             RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
692         }
693     }
694 
695     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
696     // Gather the AOS effective scissor rects based on the per-prim VP index.
697     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
698     simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
699     if (state.gsState.emitsViewportArrayIndex)
700     {
701         GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
702             scisXmin, scisYmin, scisXmax, scisYmax);
703     }
704     else // broadcast fast path for non-VPAI case.
705     {
706         scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
707         scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
708         scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
709         scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
710     }
711 
712     bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
713     bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
714     bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
715     bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
716 
717     if (CT::IsConservativeT::value)
718     {
719         // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
720         // some area. Bump the xmax/ymax edges out
721         simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
722         bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
723         simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
724         bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
725     }
726 
727     // Cull tris completely outside scissor
728     {
729         simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
730         simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
731         simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
732         uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
733         triMask = triMask & ~maskOutsideScissor;
734     }
735 
736     if (!triMask)
737     {
738         goto endBinTriangles;
739     }
740 
741     // Convert triangle bbox to macrotile units.
742     bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
743     bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
744     bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
745     bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
746 
747     OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
748     _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
749     _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
750     _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
751     _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
752 
753     // transpose verts needed for backend
754     /// @todo modify BE to take non-transformed verts
755     __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
756     vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
757     vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
758     vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
759     vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
760 
761     // store render target array index
762     OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
763     if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
764     {
765         simdvector vRtai[3];
766         pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
767         simdscalari vRtaii;
768         vRtaii = _simd_castps_si(vRtai[0].x);
769         _simd_store_si((simdscalari*)aRTAI, vRtaii);
770     }
771     else
772     {
773         _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
774     }
775 
776 endBinTriangles:
777 
778     // scan remaining valid triangles and bin each separately
779     while (_BitScanForward(&triIndex, triMask))
780     {
781         uint32_t linkageCount = state.backendState.numAttributes;
782         uint32_t numScalarAttribs = linkageCount * 4;
783 
784         BE_WORK work;
785         work.type = DRAW;
786 
787         bool isDegenerate;
788         if (CT::IsConservativeT::value)
789         {
790             // only rasterize valid edges if we have a degenerate primitive
791             int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
792             work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
793                 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
794                 (state.scissorsTileAligned == false));
795 
796             // Degenerate triangles are required to be constant interpolated
797             isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
798         }
799         else
800         {
801             isDegenerate = false;
802             work.pfnWork = pfnWork;
803         }
804 
805         // Select attribute processor
806         PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
807             state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
808 
809         TRIANGLE_WORK_DESC &desc = work.desc.tri;
810 
811         desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
812         desc.triFlags.primID = pPrimID[triIndex];
813         desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
814         desc.triFlags.viewportIndex = pViewportIndex[triIndex];
815 
816         auto pArena = pDC->pArena;
817         SWR_ASSERT(pArena != nullptr);
818 
819         // store active attribs
820         float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
821         desc.pAttribs = pAttribs;
822         desc.numAttribs = linkageCount;
823         pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
824 
825         // store triangle vertex data
826         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
827 
828         _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
829         _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
830         _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
831         _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
832 
833         // store user clip distances
834         if (rastState.clipDistanceMask)
835         {
836             uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
837             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
838             ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
839         }
840 
841         for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
842         {
843             for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
844             {
845 #if KNOB_ENABLE_TOSS_POINTS
846                 if (!KNOB_TOSS_SETUP_TRIS)
847 #endif
848                 {
849                     pTileMgr->enqueue(x, y, &work);
850                 }
851             }
852         }
853                      triMask &= ~(1 << triIndex);
854     }
855 
856     AR_END(FEBinTriangles, 1);
857 }
858 
859 struct FEBinTrianglesChooser
860 {
861     typedef PFN_PROCESS_PRIMS FuncType;
862 
863     template <typename... ArgsB>
GetFuncFEBinTrianglesChooser864     static FuncType GetFunc()
865     {
866         return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
867     }
868 };
869 
870 // Selector for correct templated BinTrinagles function
GetBinTrianglesFunc(bool IsConservative)871 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
872 {
873     return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
874 }
875 
876 
877 //////////////////////////////////////////////////////////////////////////
878 /// @brief Bin SIMD points to the backend.  Only supports point size of 1
879 /// @param pDC - pointer to draw context.
880 /// @param pa - The primitive assembly object.
881 /// @param workerId - thread's worker id. Even thread has a unique id.
882 /// @param tri - Contains point position data for SIMDs worth of points.
883 /// @param primID - Primitive ID for each point.
BinPoints(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[3],uint32_t primMask,simdscalari primID,simdscalari viewportIdx)884 void BinPoints(
885     DRAW_CONTEXT *pDC,
886     PA_STATE& pa,
887     uint32_t workerId,
888     simdvector prim[3],
889     uint32_t primMask,
890     simdscalari primID,
891     simdscalari viewportIdx)
892 {
893     SWR_CONTEXT *pContext = pDC->pContext;
894 
895     AR_BEGIN(FEBinPoints, pDC->drawId);
896 
897     simdvector& primVerts = prim[0];
898 
899     const API_STATE& state = GetApiState(pDC);
900     const SWR_FRONTEND_STATE& feState = state.frontendState;
901     const SWR_GS_STATE& gsState = state.gsState;
902     const SWR_RASTSTATE& rastState = state.rastState;
903     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
904 
905     // Select attribute processor
906     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
907         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
908 
909     if (!feState.vpTransformDisable)
910     {
911         // perspective divide
912         simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
913         primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
914         primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
915         primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
916 
917         // viewport transform to screen coords
918         if (state.gsState.emitsViewportArrayIndex)
919         {
920             viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
921         }
922         else
923         {
924             viewportTransform<1>(&primVerts, state.vpMatrices);
925         }
926     }
927 
928     // adjust for pixel center location
929     simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
930     primVerts.x = _simd_add_ps(primVerts.x, offset);
931     primVerts.y = _simd_add_ps(primVerts.y, offset);
932 
933     // convert to fixed point
934     simdscalari vXi, vYi;
935     vXi = fpToFixedPointVertical(primVerts.x);
936     vYi = fpToFixedPointVertical(primVerts.y);
937 
938     if (CanUseSimplePoints(pDC))
939     {
940         // adjust for ymin-xmin rule
941         vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
942         vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
943 
944         // cull points off the ymin-xmin edge of the viewport
945         primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
946         primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
947 
948         // compute macro tile coordinates
949         simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
950         simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
951 
952         OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
953         _simd_store_si((simdscalari*)aMacroX, macroX);
954         _simd_store_si((simdscalari*)aMacroY, macroY);
955 
956         // compute raster tile coordinates
957         simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
958         simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
959 
960         // compute raster tile relative x,y for coverage mask
961         simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
962         simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
963 
964         simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
965         simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
966 
967         OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
968         OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
969         _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
970         _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
971 
972         OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
973         OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
974         _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
975         _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
976 
977         OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
978         _simd_store_ps((float*)aZ, primVerts.z);
979 
980         // store render target array index
981         OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
982         if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
983         {
984             simdvector vRtai;
985             pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
986             simdscalari vRtaii = _simd_castps_si(vRtai.x);
987             _simd_store_si((simdscalari*)aRTAI, vRtaii);
988         }
989         else
990         {
991             _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
992         }
993 
994         uint32_t *pPrimID = (uint32_t *)&primID;
995         DWORD primIndex = 0;
996 
997         const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
998 
999         // scan remaining valid triangles and bin each separately
1000         while (_BitScanForward(&primIndex, primMask))
1001         {
1002             uint32_t linkageCount = backendState.numAttributes;
1003             uint32_t numScalarAttribs = linkageCount * 4;
1004 
1005             BE_WORK work;
1006             work.type = DRAW;
1007 
1008             TRIANGLE_WORK_DESC &desc = work.desc.tri;
1009 
1010             // points are always front facing
1011             desc.triFlags.frontFacing = 1;
1012             desc.triFlags.primID = pPrimID[primIndex];
1013             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1014             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1015 
1016             work.pfnWork = RasterizeSimplePoint;
1017 
1018             auto pArena = pDC->pArena;
1019             SWR_ASSERT(pArena != nullptr);
1020 
1021             // store attributes
1022             float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1023             desc.pAttribs = pAttribs;
1024             desc.numAttribs = linkageCount;
1025 
1026             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1027 
1028             // store raster tile aligned x, y, perspective correct z
1029             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1030             desc.pTriBuffer = pTriBuffer;
1031             *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1032             *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1033             *pTriBuffer = aZ[primIndex];
1034 
1035             uint32_t tX = aTileRelativeX[primIndex];
1036             uint32_t tY = aTileRelativeY[primIndex];
1037 
1038             // pack the relative x,y into the coverageMask, the rasterizer will
1039             // generate the true coverage mask from it
1040             work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1041 
1042             // bin it
1043             MacroTileMgr *pTileMgr = pDC->pTileMgr;
1044 #if KNOB_ENABLE_TOSS_POINTS
1045             if (!KNOB_TOSS_SETUP_TRIS)
1046 #endif
1047             {
1048                 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1049             }
1050             primMask &= ~(1 << primIndex);
1051         }
1052     }
1053     else
1054     {
1055         // non simple points need to be potentially binned to multiple macro tiles
1056         simdscalar vPointSize;
1057         if (rastState.pointParam)
1058         {
1059             simdvector size[3];
1060             pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
1061             vPointSize = size[0].x;
1062         }
1063         else
1064         {
1065             vPointSize = _simd_set1_ps(rastState.pointSize);
1066         }
1067 
1068         // bloat point to bbox
1069         simdBBox bbox;
1070         bbox.xmin = bbox.xmax = vXi;
1071         bbox.ymin = bbox.ymax = vYi;
1072 
1073         simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
1074         simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
1075         bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
1076         bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
1077         bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
1078         bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
1079 
1080         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1081         // Gather the AOS effective scissor rects based on the per-prim VP index.
1082         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
1083         simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
1084         if (state.gsState.emitsViewportArrayIndex)
1085         {
1086             GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1087                 scisXmin, scisYmin, scisXmax, scisYmax);
1088         }
1089         else // broadcast fast path for non-VPAI case.
1090         {
1091             scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1092             scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1093             scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1094             scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1095         }
1096 
1097         bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
1098         bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
1099         bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
1100         bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
1101 
1102         // Cull bloated points completely outside scissor
1103         simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
1104         simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
1105         simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1106         uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1107         primMask = primMask & ~maskOutsideScissor;
1108 
1109         // Convert bbox to macrotile units.
1110         bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1111         bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1112         bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1113         bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1114 
1115         OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
1116         _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
1117         _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
1118         _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
1119         _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
1120 
1121         // store render target array index
1122         OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1123         if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1124         {
1125             simdvector vRtai[2];
1126             pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
1127             simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
1128             _simd_store_si((simdscalari*)aRTAI, vRtaii);
1129         }
1130         else
1131         {
1132             _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1133         }
1134 
1135         OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
1136         _simd_store_ps((float*)aPointSize, vPointSize);
1137 
1138         uint32_t *pPrimID = (uint32_t *)&primID;
1139 
1140         OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
1141         OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
1142         OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
1143 
1144         _simd_store_ps((float*)aPrimVertsX, primVerts.x);
1145         _simd_store_ps((float*)aPrimVertsY, primVerts.y);
1146         _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
1147 
1148         // scan remaining valid prims and bin each separately
1149         const SWR_BACKEND_STATE& backendState = state.backendState;
1150         DWORD primIndex;
1151         while (_BitScanForward(&primIndex, primMask))
1152         {
1153             uint32_t linkageCount = backendState.numAttributes;
1154             uint32_t numScalarAttribs = linkageCount * 4;
1155 
1156             BE_WORK work;
1157             work.type = DRAW;
1158 
1159             TRIANGLE_WORK_DESC &desc = work.desc.tri;
1160 
1161             desc.triFlags.frontFacing = 1;
1162             desc.triFlags.primID = pPrimID[primIndex];
1163             desc.triFlags.pointSize = aPointSize[primIndex];
1164             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1165             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1166 
1167             work.pfnWork = RasterizeTriPoint;
1168 
1169             auto pArena = pDC->pArena;
1170             SWR_ASSERT(pArena != nullptr);
1171 
1172             // store active attribs
1173             desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1174             desc.numAttribs = linkageCount;
1175             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1176 
1177             // store point vertex data
1178             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1179             desc.pTriBuffer = pTriBuffer;
1180             *pTriBuffer++ = aPrimVertsX[primIndex];
1181             *pTriBuffer++ = aPrimVertsY[primIndex];
1182             *pTriBuffer = aPrimVertsZ[primIndex];
1183 
1184             // store user clip distances
1185             if (rastState.clipDistanceMask)
1186             {
1187                 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1188                 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1189                 float dists[8];
1190                 float one = 1.0f;
1191                 ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists);
1192                 for (uint32_t i = 0; i < numClipDist; i++) {
1193                     desc.pUserClipBuffer[3*i + 0] = 0.0f;
1194                     desc.pUserClipBuffer[3*i + 1] = 0.0f;
1195                     desc.pUserClipBuffer[3*i + 2] = dists[i];
1196                 }
1197             }
1198 
1199             MacroTileMgr *pTileMgr = pDC->pTileMgr;
1200             for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1201             {
1202                 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1203                 {
1204 #if KNOB_ENABLE_TOSS_POINTS
1205                     if (!KNOB_TOSS_SETUP_TRIS)
1206 #endif
1207                     {
1208                         pTileMgr->enqueue(x, y, &work);
1209                     }
1210                 }
1211             }
1212 
1213             primMask &= ~(1 << primIndex);
1214         }
1215     }
1216 
1217     AR_END(FEBinPoints, 1);
1218 }
1219 
1220 //////////////////////////////////////////////////////////////////////////
1221 /// @brief Bin SIMD lines to the backend.
1222 /// @param pDC - pointer to draw context.
1223 /// @param pa - The primitive assembly object.
1224 /// @param workerId - thread's worker id. Even thread has a unique id.
1225 /// @param tri - Contains line position data for SIMDs worth of points.
1226 /// @param primID - Primitive ID for each line.
1227 /// @param viewportIdx - Viewport Array Index for each line.
BinPostSetupLines(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[],simdscalar recipW[],uint32_t primMask,simdscalari primID,simdscalari viewportIdx)1228 void BinPostSetupLines(
1229     DRAW_CONTEXT *pDC,
1230     PA_STATE& pa,
1231     uint32_t workerId,
1232     simdvector prim[],
1233     simdscalar recipW[],
1234     uint32_t primMask,
1235     simdscalari primID,
1236     simdscalari viewportIdx)
1237 {
1238     SWR_CONTEXT *pContext = pDC->pContext;
1239 
1240     AR_BEGIN(FEBinLines, pDC->drawId);
1241 
1242     const API_STATE& state = GetApiState(pDC);
1243     const SWR_RASTSTATE& rastState = state.rastState;
1244     const SWR_FRONTEND_STATE& feState = state.frontendState;
1245     const SWR_GS_STATE& gsState = state.gsState;
1246 
1247     // Select attribute processor
1248     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1249         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1250 
1251     simdscalar& vRecipW0 = recipW[0];
1252     simdscalar& vRecipW1 = recipW[1];
1253 
1254     // convert to fixed point
1255     simdscalari vXi[2], vYi[2];
1256     vXi[0] = fpToFixedPointVertical(prim[0].x);
1257     vYi[0] = fpToFixedPointVertical(prim[0].y);
1258     vXi[1] = fpToFixedPointVertical(prim[1].x);
1259     vYi[1] = fpToFixedPointVertical(prim[1].y);
1260 
1261     // compute x-major vs y-major mask
1262     simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
1263     simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
1264     simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
1265     uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
1266 
1267     // cull zero-length lines
1268     simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
1269     vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
1270 
1271     primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
1272 
1273     uint32_t *pPrimID = (uint32_t *)&primID;
1274     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1275 
1276     simdscalar vUnused = _simd_setzero_ps();
1277 
1278     // Calc bounding box of lines
1279     simdBBox bbox;
1280     bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
1281     bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
1282     bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
1283     bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
1284 
1285     // bloat bbox by line width along minor axis
1286     simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
1287     simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
1288     simdBBox bloatBox;
1289     bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
1290     bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
1291     bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
1292     bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
1293 
1294     bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1295     bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1296     bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1297     bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1298 
1299     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1300     simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
1301     if (state.gsState.emitsViewportArrayIndex)
1302     {
1303         GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1304             scisXmin, scisYmin, scisXmax, scisYmax);
1305     }
1306     else // broadcast fast path for non-VPAI case.
1307     {
1308         scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1309         scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1310         scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1311         scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1312     }
1313 
1314     bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
1315     bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
1316     bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
1317     bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
1318 
1319     // Cull prims completely outside scissor
1320     {
1321         simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
1322         simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
1323         simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1324         uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1325         primMask = primMask & ~maskOutsideScissor;
1326     }
1327 
1328     if (!primMask)
1329     {
1330         goto endBinLines;
1331     }
1332 
1333     // Convert triangle bbox to macrotile units.
1334     bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1335     bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1336     bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1337     bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1338 
1339     OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
1340     _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
1341     _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
1342     _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
1343     _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
1344 
1345     // transpose verts needed for backend
1346     /// @todo modify BE to take non-transformed verts
1347     __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
1348     vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
1349     vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
1350     vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
1351     vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
1352 
1353     // store render target array index
1354     OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1355     if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1356     {
1357         simdvector vRtai[2];
1358         pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
1359         simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
1360         _simd_store_si((simdscalari*)aRTAI, vRtaii);
1361     }
1362     else
1363     {
1364         _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1365     }
1366 
1367     // scan remaining valid prims and bin each separately
1368     DWORD primIndex;
1369     while (_BitScanForward(&primIndex, primMask))
1370     {
1371         uint32_t linkageCount = state.backendState.numAttributes;
1372         uint32_t numScalarAttribs = linkageCount * 4;
1373 
1374         BE_WORK work;
1375         work.type = DRAW;
1376 
1377         TRIANGLE_WORK_DESC &desc = work.desc.tri;
1378 
1379         desc.triFlags.frontFacing = 1;
1380         desc.triFlags.primID = pPrimID[primIndex];
1381         desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
1382         desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1383         desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1384 
1385         work.pfnWork = RasterizeLine;
1386 
1387         auto pArena = pDC->pArena;
1388         SWR_ASSERT(pArena != nullptr);
1389 
1390         // store active attribs
1391         desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1392         desc.numAttribs = linkageCount;
1393         pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1394 
1395         // store line vertex data
1396         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1397         _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
1398         _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
1399         _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
1400         _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1401 
1402         // store user clip distances
1403         if (rastState.clipDistanceMask)
1404         {
1405             uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1406             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1407             ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1408         }
1409 
1410         MacroTileMgr *pTileMgr = pDC->pTileMgr;
1411         for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1412         {
1413             for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1414             {
1415 #if KNOB_ENABLE_TOSS_POINTS
1416                 if (!KNOB_TOSS_SETUP_TRIS)
1417 #endif
1418                 {
1419                     pTileMgr->enqueue(x, y, &work);
1420                 }
1421             }
1422         }
1423 
1424         primMask &= ~(1 << primIndex);
1425     }
1426 
1427 endBinLines:
1428 
1429     AR_END(FEBinLines, 1);
1430 }
1431 
1432 //////////////////////////////////////////////////////////////////////////
1433 /// @brief Bin SIMD lines to the backend.
1434 /// @param pDC - pointer to draw context.
1435 /// @param pa - The primitive assembly object.
1436 /// @param workerId - thread's worker id. Even thread has a unique id.
1437 /// @param tri - Contains line position data for SIMDs worth of points.
1438 /// @param primID - Primitive ID for each line.
1439 /// @param viewportIdx - Viewport Array Index for each line.
BinLines(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[],uint32_t primMask,simdscalari primID,simdscalari viewportIdx)1440 void BinLines(
1441     DRAW_CONTEXT *pDC,
1442     PA_STATE& pa,
1443     uint32_t workerId,
1444     simdvector prim[],
1445     uint32_t primMask,
1446     simdscalari primID,
1447     simdscalari viewportIdx)
1448 {
1449     SWR_CONTEXT *pContext = pDC->pContext;
1450 
1451     const API_STATE& state = GetApiState(pDC);
1452     const SWR_RASTSTATE& rastState = state.rastState;
1453     const SWR_FRONTEND_STATE& feState = state.frontendState;
1454     const SWR_GS_STATE& gsState = state.gsState;
1455 
1456     // Select attribute processor
1457     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1458         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1459 
1460     simdscalar vRecipW[2] = { _simd_set1_ps(1.0f), _simd_set1_ps(1.0f) };
1461 
1462     if (!feState.vpTransformDisable)
1463     {
1464         // perspective divide
1465         vRecipW[0] = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
1466         vRecipW[1] = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
1467 
1468         prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW[0]);
1469         prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW[1]);
1470 
1471         prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW[0]);
1472         prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW[1]);
1473 
1474         prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW[0]);
1475         prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW[1]);
1476 
1477         // viewport transform to screen coords
1478         if (state.gsState.emitsViewportArrayIndex)
1479         {
1480             viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1481         }
1482         else
1483         {
1484             viewportTransform<2>(prim, state.vpMatrices);
1485         }
1486     }
1487 
1488     // adjust for pixel center location
1489     simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
1490     prim[0].x = _simd_add_ps(prim[0].x, offset);
1491     prim[0].y = _simd_add_ps(prim[0].y, offset);
1492 
1493     prim[1].x = _simd_add_ps(prim[1].x, offset);
1494     prim[1].y = _simd_add_ps(prim[1].y, offset);
1495 
1496     BinPostSetupLines(
1497         pDC,
1498         pa,
1499         workerId,
1500         prim,
1501         vRecipW,
1502         primMask,
1503         primID,
1504         viewportIdx);
1505 }
1506