1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file binner.cpp
24 *
25 * @brief Implementation for the macrotile binner
26 *
27 ******************************************************************************/
28
29 #include "context.h"
30 #include "frontend.h"
31 #include "conservativeRast.h"
32 #include "pa.h"
33 #include "rasterizer.h"
34 #include "rdtsc_core.h"
35 #include "tilemgr.h"
36
37 // Function Prototype
38 void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
39
40 //////////////////////////////////////////////////////////////////////////
41 /// @brief Offsets added to post-viewport vertex positions based on
42 /// raster state.
43 static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
44 {
45 _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
46 _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
47 };
48
49 //////////////////////////////////////////////////////////////////////////
50 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
51 /// Point precision from FP32.
52 template <typename PT = FixedPointTraits<Fixed_16_8>>
fpToFixedPointVertical(const simdscalar vIn)53 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
54 {
55 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
56 return _simd_cvtps_epi32(vFixed);
57 }
58
59 //////////////////////////////////////////////////////////////////////////
60 /// @brief Helper function to set the X,Y coords of a triangle to the
61 /// requested Fixed Point precision from FP32.
62 /// @param tri: simdvector[3] of FP triangle verts
63 /// @param vXi: fixed point X coords of tri verts
64 /// @param vYi: fixed point Y coords of tri verts
FPToFixedPoint(const simdvector * const tri,simdscalari (& vXi)[3],simdscalari (& vYi)[3])65 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari(&vXi)[3], simdscalari(&vYi)[3])
66 {
67 vXi[0] = fpToFixedPointVertical(tri[0].x);
68 vYi[0] = fpToFixedPointVertical(tri[0].y);
69 vXi[1] = fpToFixedPointVertical(tri[1].x);
70 vYi[1] = fpToFixedPointVertical(tri[1].y);
71 vXi[2] = fpToFixedPointVertical(tri[2].x);
72 vYi[2] = fpToFixedPointVertical(tri[2].y);
73 }
74
75 //////////////////////////////////////////////////////////////////////////
76 /// @brief Calculate bounding box for current triangle
77 /// @tparam CT: ConservativeRastFETraits type
78 /// @param vX: fixed point X position for triangle verts
79 /// @param vY: fixed point Y position for triangle verts
80 /// @param bbox: fixed point bbox
81 /// *Note*: expects vX, vY to be in the correct precision for the type
82 /// of rasterization. This avoids unnecessary FP->fixed conversions.
83 template <typename CT>
calcBoundingBoxIntVertical(const simdvector * const tri,simdscalari (& vX)[3],simdscalari (& vY)[3],simdBBox & bbox)84 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
85 {
86 simdscalari vMinX = vX[0];
87 vMinX = _simd_min_epi32(vMinX, vX[1]);
88 vMinX = _simd_min_epi32(vMinX, vX[2]);
89
90 simdscalari vMaxX = vX[0];
91 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
92 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
93
94 simdscalari vMinY = vY[0];
95 vMinY = _simd_min_epi32(vMinY, vY[1]);
96 vMinY = _simd_min_epi32(vMinY, vY[2]);
97
98 simdscalari vMaxY = vY[0];
99 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
100 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
101
102 bbox.xmin = vMinX;
103 bbox.xmax = vMaxX;
104 bbox.ymin = vMinY;
105 bbox.ymax = vMaxY;
106 }
107
108 //////////////////////////////////////////////////////////////////////////
109 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
110 /// Offsets BBox for conservative rast
111 template <>
calcBoundingBoxIntVertical(const simdvector * const tri,simdscalari (& vX)[3],simdscalari (& vY)[3],simdBBox & bbox)112 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
113 {
114 // FE conservative rast traits
115 typedef FEConservativeRastT CT;
116
117 simdscalari vMinX = vX[0];
118 vMinX = _simd_min_epi32(vMinX, vX[1]);
119 vMinX = _simd_min_epi32(vMinX, vX[2]);
120
121 simdscalari vMaxX = vX[0];
122 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
123 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
124
125 simdscalari vMinY = vY[0];
126 vMinY = _simd_min_epi32(vMinY, vY[1]);
127 vMinY = _simd_min_epi32(vMinY, vY[2]);
128
129 simdscalari vMaxY = vY[0];
130 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
131 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
132
133 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
134 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
135 bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
136 bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
137 bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
138 bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
139 }
140
141 //////////////////////////////////////////////////////////////////////////
142 /// @brief Processes attributes for the backend based on linkage mask and
143 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
144 /// @param pDC - Draw context
145 /// @param pa - Primitive Assembly state
146 /// @param linkageMask - Specifies which VS outputs are routed to PS.
147 /// @param pLinkageMap - maps VS attribute slot to PS slot
148 /// @param triIndex - Triangle to process attributes for
149 /// @param pBuffer - Output result
150 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
ProcessAttributes(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t triIndex,uint32_t primId,float * pBuffer)151 INLINE void ProcessAttributes(
152 DRAW_CONTEXT *pDC,
153 PA_STATE&pa,
154 uint32_t triIndex,
155 uint32_t primId,
156 float *pBuffer)
157 {
158 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
159 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
160 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
161 LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
162 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
163 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
164
165 static const float constTable[3][4] = {
166 { 0.0f, 0.0f, 0.0f, 0.0f },
167 { 0.0f, 0.0f, 0.0f, 1.0f },
168 { 1.0f, 1.0f, 1.0f, 1.0f }
169 };
170
171 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
172 {
173 uint32_t inputSlot;
174 if (IsSwizzledT::value)
175 {
176 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
177 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
178
179 }
180 else
181 {
182 inputSlot = VERTEX_ATTRIB_START_SLOT + i;
183 }
184
185 __m128 attrib[3]; // triangle attribs (always 4 wide)
186 float* pAttribStart = pBuffer;
187
188 if (HasConstantInterpT::value || IsDegenerate::value)
189 {
190 if (_bittest(&constantInterpMask, i))
191 {
192 uint32_t vid;
193 uint32_t adjustedTriIndex;
194 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
195 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
196 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
197 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
198 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
199
200 switch (topo) {
201 case TOP_QUAD_LIST:
202 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
203 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
204 break;
205 case TOP_QUAD_STRIP:
206 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
207 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
208 break;
209 case TOP_TRIANGLE_STRIP:
210 adjustedTriIndex = triIndex;
211 vid = (triIndex & 1)
212 ? tristripProvokingVertex[provokingVertex]
213 : provokingVertex;
214 break;
215 default:
216 adjustedTriIndex = triIndex;
217 vid = provokingVertex;
218 break;
219 }
220
221 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
222
223 for (uint32_t i = 0; i < NumVertsT::value; ++i)
224 {
225 _mm_store_ps(pBuffer, attrib[vid]);
226 pBuffer += 4;
227 }
228 }
229 else
230 {
231 pa.AssembleSingle(inputSlot, triIndex, attrib);
232
233 for (uint32_t i = 0; i < NumVertsT::value; ++i)
234 {
235 _mm_store_ps(pBuffer, attrib[i]);
236 pBuffer += 4;
237 }
238 }
239 }
240 else
241 {
242 pa.AssembleSingle(inputSlot, triIndex, attrib);
243
244 for (uint32_t i = 0; i < NumVertsT::value; ++i)
245 {
246 _mm_store_ps(pBuffer, attrib[i]);
247 pBuffer += 4;
248 }
249 }
250
251 // pad out the attrib buffer to 3 verts to ensure the triangle
252 // interpolation code in the pixel shader works correctly for the
253 // 3 topologies - point, line, tri. This effectively zeros out the
254 // effect of the missing vertices in the triangle interpolation.
255 for (uint32_t v = NumVertsT::value; v < 3; ++v)
256 {
257 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
258 pBuffer += 4;
259 }
260
261 // check for constant source overrides
262 if (IsSwizzledT::value)
263 {
264 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
265 if (mask)
266 {
267 DWORD comp;
268 while (_BitScanForward(&comp, mask))
269 {
270 mask &= ~(1 << comp);
271
272 float constantValue = 0.0f;
273 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
274 {
275 case SWR_CONSTANT_SOURCE_CONST_0000:
276 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
277 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
278 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
279 break;
280 case SWR_CONSTANT_SOURCE_PRIM_ID:
281 constantValue = *(float*)&primId;
282 break;
283 }
284
285 // apply constant value to all 3 vertices
286 for (uint32_t v = 0; v < 3; ++v)
287 {
288 pAttribStart[comp + v * 4] = constantValue;
289 }
290 }
291 }
292 }
293 }
294 }
295
296 //////////////////////////////////////////////////////////////////////////
297 /// @brief Gather scissor rect data based on per-prim viewport indices.
298 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
299 /// @param pViewportIndex - array of per-primitive vewport indexes.
300 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
301 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
302 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
303 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
304 //
305 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
306 template<size_t SimdWidth>
307 struct GatherScissors
308 {
GatherGatherScissors309 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
310 simdscalari &scisXmin, simdscalari &scisYmin,
311 simdscalari &scisXmax, simdscalari &scisYmax)
312 {
313 SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
314 }
315 };
316
317 template<>
318 struct GatherScissors<8>
319 {
GatherGatherScissors320 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
321 simdscalari &scisXmin, simdscalari &scisYmin,
322 simdscalari &scisXmax, simdscalari &scisYmax)
323 {
324 scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
325 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
326 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
327 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
328 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
329 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
330 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
331 pScissorsInFixedPoint[pViewportIndex[7]].xmin);
332 scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
333 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
334 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
335 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
336 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
337 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
338 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
339 pScissorsInFixedPoint[pViewportIndex[7]].ymin);
340 scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
341 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
342 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
343 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
344 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
345 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
346 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
347 pScissorsInFixedPoint[pViewportIndex[7]].xmax);
348 scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
349 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
350 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
351 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
352 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
353 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
354 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
355 pScissorsInFixedPoint[pViewportIndex[7]].ymax);
356 }
357 };
358
359 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
360
361 struct ProcessAttributesChooser
362 {
363 typedef PFN_PROCESS_ATTRIBUTES FuncType;
364
365 template <typename... ArgsB>
GetFuncProcessAttributesChooser366 static FuncType GetFunc()
367 {
368 return ProcessAttributes<ArgsB...>;
369 }
370 };
371
GetProcessAttributesFunc(uint32_t NumVerts,bool IsSwizzled,bool HasConstantInterp,bool IsDegenerate=false)372 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
373 {
374 return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
375 }
376
377 //////////////////////////////////////////////////////////////////////////
378 /// @brief Processes enabled user clip distances. Loads the active clip
379 /// distances from the PA, sets up barycentric equations, and
380 /// stores the results to the output buffer
381 /// @param pa - Primitive Assembly state
382 /// @param primIndex - primitive index to process
383 /// @param clipDistMask - mask of enabled clip distances
384 /// @param pUserClipBuffer - buffer to store results
385 template<uint32_t NumVerts>
ProcessUserClipDist(PA_STATE & pa,uint32_t primIndex,uint8_t clipDistMask,float * pRecipW,float * pUserClipBuffer)386 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float *pRecipW, float* pUserClipBuffer)
387 {
388 DWORD clipDist;
389 while (_BitScanForward(&clipDist, clipDistMask))
390 {
391 clipDistMask &= ~(1 << clipDist);
392 uint32_t clipSlot = clipDist >> 2;
393 uint32_t clipComp = clipDist & 0x3;
394 uint32_t clipAttribSlot = clipSlot == 0 ?
395 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
396
397 __m128 primClipDist[3];
398 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
399
400 float vertClipDist[NumVerts];
401 for (uint32_t e = 0; e < NumVerts; ++e)
402 {
403 OSALIGNSIMD(float) aVertClipDist[4];
404 _mm_store_ps(aVertClipDist, primClipDist[e]);
405 vertClipDist[e] = aVertClipDist[clipComp];
406 };
407
408 // setup plane equations for barycentric interpolation in the backend
409 float baryCoeff[NumVerts];
410 float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
411 for (uint32_t e = 0; e < NumVerts - 1; ++e)
412 {
413 baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
414 }
415 baryCoeff[NumVerts - 1] = last;
416
417 for (uint32_t e = 0; e < NumVerts; ++e)
418 {
419 *(pUserClipBuffer++) = baryCoeff[e];
420 }
421 }
422 }
423
424 //////////////////////////////////////////////////////////////////////////
425 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
426 /// culling, viewport transform, etc.
427 /// @param pDC - pointer to draw context.
428 /// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
430 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
431 /// @param primID - Primitive ID for each triangle.
432 /// @param viewportIdx - viewport array index for each triangle.
433 /// @tparam CT - ConservativeRastFETraits
434 template <typename CT>
BinTriangles(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector tri[3],uint32_t triMask,simdscalari primID,simdscalari viewportIdx)435 void BinTriangles(
436 DRAW_CONTEXT *pDC,
437 PA_STATE& pa,
438 uint32_t workerId,
439 simdvector tri[3],
440 uint32_t triMask,
441 simdscalari primID,
442 simdscalari viewportIdx)
443 {
444 SWR_CONTEXT *pContext = pDC->pContext;
445
446 AR_BEGIN(FEBinTriangles, pDC->drawId);
447
448 const API_STATE& state = GetApiState(pDC);
449 const SWR_RASTSTATE& rastState = state.rastState;
450 const SWR_FRONTEND_STATE& feState = state.frontendState;
451 const SWR_GS_STATE& gsState = state.gsState;
452 MacroTileMgr *pTileMgr = pDC->pTileMgr;
453
454 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
455 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
456 simdscalar vRecipW2 = _simd_set1_ps(1.0f);
457
458 if (feState.vpTransformDisable)
459 {
460 // RHW is passed in directly when VP transform is disabled
461 vRecipW0 = tri[0].v[3];
462 vRecipW1 = tri[1].v[3];
463 vRecipW2 = tri[2].v[3];
464 }
465 else
466 {
467 // Perspective divide
468 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
469 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
470 vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
471
472 tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
473 tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
474 tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
475
476 tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
477 tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
478 tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
479
480 tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
481 tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
482 tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
483
484 // Viewport transform to screen space coords
485 if (state.gsState.emitsViewportArrayIndex)
486 {
487 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
488 }
489 else
490 {
491 viewportTransform<3>(tri, state.vpMatrices);
492 }
493 }
494
495 // Adjust for pixel center location
496 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
497 tri[0].x = _simd_add_ps(tri[0].x, offset);
498 tri[0].y = _simd_add_ps(tri[0].y, offset);
499
500 tri[1].x = _simd_add_ps(tri[1].x, offset);
501 tri[1].y = _simd_add_ps(tri[1].y, offset);
502
503 tri[2].x = _simd_add_ps(tri[2].x, offset);
504 tri[2].y = _simd_add_ps(tri[2].y, offset);
505
506 simdscalari vXi[3], vYi[3];
507 // Set vXi, vYi to required fixed point precision
508 FPToFixedPoint(tri, vXi, vYi);
509
510 // triangle setup
511 simdscalari vAi[3], vBi[3];
512 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
513
514 // determinant
515 simdscalari vDet[2];
516 calcDeterminantIntVertical(vAi, vBi, vDet);
517
518 // cull zero area
519 int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
520 int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
521
522 int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
523
524 uint32_t origTriMask = triMask;
525 // don't cull degenerate triangles if we're conservatively rasterizing
526 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
527 {
528 triMask &= ~cullZeroAreaMask;
529 }
530
531 // determine front winding tris
532 // CW +det
533 // CCW det < 0;
534 // 0 area triangles are marked as backfacing regardless of winding order,
535 // which is required behavior for conservative rast and wireframe rendering
536 uint32_t frontWindingTris;
537 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
538 {
539 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
540 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
541 }
542 else
543 {
544 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[0])));
545 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[1])));
546 }
547 frontWindingTris = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
548
549 // cull
550 uint32_t cullTris;
551 switch ((SWR_CULLMODE)rastState.cullMode)
552 {
553 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
554 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
555 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
556 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
557 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
558 default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
559 }
560
561 triMask &= ~cullTris;
562
563 if (origTriMask ^ triMask)
564 {
565 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
566 }
567
568 // Simple non-conformant wireframe mode, useful for debugging
569 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
570 {
571 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
572 simdvector line[2];
573 simdscalar recipW[2];
574 line[0] = tri[0];
575 line[1] = tri[1];
576 recipW[0] = vRecipW0;
577 recipW[1] = vRecipW1;
578 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
579
580 line[0] = tri[1];
581 line[1] = tri[2];
582 recipW[0] = vRecipW1;
583 recipW[1] = vRecipW2;
584 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
585
586 line[0] = tri[2];
587 line[1] = tri[0];
588 recipW[0] = vRecipW2;
589 recipW[1] = vRecipW0;
590 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
591
592 AR_END(FEBinTriangles, 1);
593 return;
594 }
595
/// Note: these variable initializations must stay above any 'goto endBinTriangles'
597 // compute per tri backface
598 uint32_t frontFaceMask = frontWindingTris;
599 uint32_t *pPrimID = (uint32_t *)&primID;
600 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
601 DWORD triIndex = 0;
602 // for center sample pattern, all samples are at pixel center; calculate coverage
603 // once at center and broadcast the results in the backend
604 const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
605 uint32_t edgeEnable;
606 PFN_WORK_FUNC pfnWork;
607 if (CT::IsConservativeT::value)
608 {
609 // determine which edges of the degenerate tri, if any, are valid to rasterize.
610 // used to call the appropriate templated rasterizer function
611 if (cullZeroAreaMask > 0)
612 {
613 // e0 = v1-v0
614 simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
615 simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
616 uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
617
618 // e1 = v2-v1
619 simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
620 simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
621 uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
622
623 // e2 = v0-v2
624 // if v0 == v1 & v1 == v2, v0 == v2
625 uint32_t e2Mask = e0Mask & e1Mask;
626 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
627
628 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
629 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
630 e0Mask = pdep_u32(e0Mask, 0x00249249);
631 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
632 e1Mask = pdep_u32(e1Mask, 0x00492492);
633 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
634 e2Mask = pdep_u32(e2Mask, 0x00924924);
635
636 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
637 }
638 else
639 {
640 edgeEnable = 0x00FFFFFF;
641 }
642 }
643 else
644 {
645 // degenerate triangles won't be sent to rasterizer; just enable all edges
646 pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
647 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
648 (state.scissorsTileAligned == false));
649 }
650
651 if (!triMask)
652 {
653 goto endBinTriangles;
654 }
655
656 // Calc bounding box of triangles
657 simdBBox bbox;
658 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
659
660 // determine if triangle falls between pixel centers and discard
661 // only discard for non-MSAA case and when conservative rast is disabled
662 // (xmin + 127) & ~255
663 // (xmax + 128) & ~255
664 if (rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
665 {
666 origTriMask = triMask;
667
668 int cullCenterMask;
669 {
670 simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
671 xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
672 simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
673 xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));
674
675 simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);
676
677 simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
678 ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
679 simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
680 ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));
681
682 simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
683 vMaskV = _simd_or_si(vMaskH, vMaskV);
684 cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
685 }
686
687 triMask &= ~cullCenterMask;
688
689 if (origTriMask ^ triMask)
690 {
691 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
692 }
693 }
694
695 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
696 // Gather the AOS effective scissor rects based on the per-prim VP index.
697 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
698 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
699 if (state.gsState.emitsViewportArrayIndex)
700 {
701 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
702 scisXmin, scisYmin, scisXmax, scisYmax);
703 }
704 else // broadcast fast path for non-VPAI case.
705 {
706 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
707 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
708 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
709 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
710 }
711
712 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
713 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
714 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
715 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
716
717 if (CT::IsConservativeT::value)
718 {
719 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
720 // some area. Bump the xmax/ymax edges out
721 simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
722 bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
723 simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
724 bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
725 }
726
727 // Cull tris completely outside scissor
728 {
729 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
730 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
731 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
732 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
733 triMask = triMask & ~maskOutsideScissor;
734 }
735
736 if (!triMask)
737 {
738 goto endBinTriangles;
739 }
740
741 // Convert triangle bbox to macrotile units.
742 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
743 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
744 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
745 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
746
747 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
748 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
749 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
750 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
751 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
752
753 // transpose verts needed for backend
754 /// @todo modify BE to take non-transformed verts
755 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
756 vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
757 vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
758 vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
759 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
760
761 // store render target array index
762 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
763 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
764 {
765 simdvector vRtai[3];
766 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
767 simdscalari vRtaii;
768 vRtaii = _simd_castps_si(vRtai[0].x);
769 _simd_store_si((simdscalari*)aRTAI, vRtaii);
770 }
771 else
772 {
773 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
774 }
775
776 endBinTriangles:
777
778 // scan remaining valid triangles and bin each separately
779 while (_BitScanForward(&triIndex, triMask))
780 {
781 uint32_t linkageCount = state.backendState.numAttributes;
782 uint32_t numScalarAttribs = linkageCount * 4;
783
784 BE_WORK work;
785 work.type = DRAW;
786
787 bool isDegenerate;
788 if (CT::IsConservativeT::value)
789 {
790 // only rasterize valid edges if we have a degenerate primitive
791 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
792 work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
793 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
794 (state.scissorsTileAligned == false));
795
796 // Degenerate triangles are required to be constant interpolated
797 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
798 }
799 else
800 {
801 isDegenerate = false;
802 work.pfnWork = pfnWork;
803 }
804
805 // Select attribute processor
806 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
807 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
808
809 TRIANGLE_WORK_DESC &desc = work.desc.tri;
810
811 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
812 desc.triFlags.primID = pPrimID[triIndex];
813 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
814 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
815
816 auto pArena = pDC->pArena;
817 SWR_ASSERT(pArena != nullptr);
818
819 // store active attribs
820 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
821 desc.pAttribs = pAttribs;
822 desc.numAttribs = linkageCount;
823 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
824
825 // store triangle vertex data
826 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
827
828 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
829 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
830 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
831 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
832
833 // store user clip distances
834 if (rastState.clipDistanceMask)
835 {
836 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
837 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
838 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
839 }
840
841 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
842 {
843 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
844 {
845 #if KNOB_ENABLE_TOSS_POINTS
846 if (!KNOB_TOSS_SETUP_TRIS)
847 #endif
848 {
849 pTileMgr->enqueue(x, y, &work);
850 }
851 }
852 }
853 triMask &= ~(1 << triIndex);
854 }
855
856 AR_END(FEBinTriangles, 1);
857 }
858
859 struct FEBinTrianglesChooser
860 {
861 typedef PFN_PROCESS_PRIMS FuncType;
862
863 template <typename... ArgsB>
GetFuncFEBinTrianglesChooser864 static FuncType GetFunc()
865 {
866 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
867 }
868 };
869
870 // Selector for correct templated BinTrinagles function
GetBinTrianglesFunc(bool IsConservative)871 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
872 {
873 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
874 }
875
876
877 //////////////////////////////////////////////////////////////////////////
878 /// @brief Bin SIMD points to the backend. Only supports point size of 1
879 /// @param pDC - pointer to draw context.
880 /// @param pa - The primitive assembly object.
881 /// @param workerId - thread's worker id. Even thread has a unique id.
882 /// @param tri - Contains point position data for SIMDs worth of points.
883 /// @param primID - Primitive ID for each point.
BinPoints(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[3],uint32_t primMask,simdscalari primID,simdscalari viewportIdx)884 void BinPoints(
885 DRAW_CONTEXT *pDC,
886 PA_STATE& pa,
887 uint32_t workerId,
888 simdvector prim[3],
889 uint32_t primMask,
890 simdscalari primID,
891 simdscalari viewportIdx)
892 {
893 SWR_CONTEXT *pContext = pDC->pContext;
894
895 AR_BEGIN(FEBinPoints, pDC->drawId);
896
897 simdvector& primVerts = prim[0];
898
899 const API_STATE& state = GetApiState(pDC);
900 const SWR_FRONTEND_STATE& feState = state.frontendState;
901 const SWR_GS_STATE& gsState = state.gsState;
902 const SWR_RASTSTATE& rastState = state.rastState;
903 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
904
905 // Select attribute processor
906 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
907 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
908
909 if (!feState.vpTransformDisable)
910 {
911 // perspective divide
912 simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
913 primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
914 primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
915 primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
916
917 // viewport transform to screen coords
918 if (state.gsState.emitsViewportArrayIndex)
919 {
920 viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
921 }
922 else
923 {
924 viewportTransform<1>(&primVerts, state.vpMatrices);
925 }
926 }
927
928 // adjust for pixel center location
929 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
930 primVerts.x = _simd_add_ps(primVerts.x, offset);
931 primVerts.y = _simd_add_ps(primVerts.y, offset);
932
933 // convert to fixed point
934 simdscalari vXi, vYi;
935 vXi = fpToFixedPointVertical(primVerts.x);
936 vYi = fpToFixedPointVertical(primVerts.y);
937
938 if (CanUseSimplePoints(pDC))
939 {
940 // adjust for ymin-xmin rule
941 vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
942 vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
943
944 // cull points off the ymin-xmin edge of the viewport
945 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
946 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
947
948 // compute macro tile coordinates
949 simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
950 simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
951
952 OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
953 _simd_store_si((simdscalari*)aMacroX, macroX);
954 _simd_store_si((simdscalari*)aMacroY, macroY);
955
956 // compute raster tile coordinates
957 simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
958 simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
959
960 // compute raster tile relative x,y for coverage mask
961 simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
962 simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
963
964 simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
965 simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
966
967 OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
968 OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
969 _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
970 _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
971
972 OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
973 OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
974 _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
975 _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
976
977 OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
978 _simd_store_ps((float*)aZ, primVerts.z);
979
980 // store render target array index
981 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
982 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
983 {
984 simdvector vRtai;
985 pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
986 simdscalari vRtaii = _simd_castps_si(vRtai.x);
987 _simd_store_si((simdscalari*)aRTAI, vRtaii);
988 }
989 else
990 {
991 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
992 }
993
994 uint32_t *pPrimID = (uint32_t *)&primID;
995 DWORD primIndex = 0;
996
997 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
998
999 // scan remaining valid triangles and bin each separately
1000 while (_BitScanForward(&primIndex, primMask))
1001 {
1002 uint32_t linkageCount = backendState.numAttributes;
1003 uint32_t numScalarAttribs = linkageCount * 4;
1004
1005 BE_WORK work;
1006 work.type = DRAW;
1007
1008 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1009
1010 // points are always front facing
1011 desc.triFlags.frontFacing = 1;
1012 desc.triFlags.primID = pPrimID[primIndex];
1013 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1014 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1015
1016 work.pfnWork = RasterizeSimplePoint;
1017
1018 auto pArena = pDC->pArena;
1019 SWR_ASSERT(pArena != nullptr);
1020
1021 // store attributes
1022 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1023 desc.pAttribs = pAttribs;
1024 desc.numAttribs = linkageCount;
1025
1026 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1027
1028 // store raster tile aligned x, y, perspective correct z
1029 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1030 desc.pTriBuffer = pTriBuffer;
1031 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1032 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1033 *pTriBuffer = aZ[primIndex];
1034
1035 uint32_t tX = aTileRelativeX[primIndex];
1036 uint32_t tY = aTileRelativeY[primIndex];
1037
1038 // pack the relative x,y into the coverageMask, the rasterizer will
1039 // generate the true coverage mask from it
1040 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1041
1042 // bin it
1043 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1044 #if KNOB_ENABLE_TOSS_POINTS
1045 if (!KNOB_TOSS_SETUP_TRIS)
1046 #endif
1047 {
1048 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1049 }
1050 primMask &= ~(1 << primIndex);
1051 }
1052 }
1053 else
1054 {
1055 // non simple points need to be potentially binned to multiple macro tiles
1056 simdscalar vPointSize;
1057 if (rastState.pointParam)
1058 {
1059 simdvector size[3];
1060 pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
1061 vPointSize = size[0].x;
1062 }
1063 else
1064 {
1065 vPointSize = _simd_set1_ps(rastState.pointSize);
1066 }
1067
1068 // bloat point to bbox
1069 simdBBox bbox;
1070 bbox.xmin = bbox.xmax = vXi;
1071 bbox.ymin = bbox.ymax = vYi;
1072
1073 simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
1074 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
1075 bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
1076 bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
1077 bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
1078 bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
1079
1080 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1081 // Gather the AOS effective scissor rects based on the per-prim VP index.
1082 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1083 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
1084 if (state.gsState.emitsViewportArrayIndex)
1085 {
1086 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1087 scisXmin, scisYmin, scisXmax, scisYmax);
1088 }
1089 else // broadcast fast path for non-VPAI case.
1090 {
1091 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1092 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1093 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1094 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1095 }
1096
1097 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
1098 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
1099 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
1100 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
1101
1102 // Cull bloated points completely outside scissor
1103 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
1104 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
1105 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1106 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1107 primMask = primMask & ~maskOutsideScissor;
1108
1109 // Convert bbox to macrotile units.
1110 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1111 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1112 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1113 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1114
1115 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
1116 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
1117 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
1118 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
1119 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
1120
1121 // store render target array index
1122 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1123 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1124 {
1125 simdvector vRtai[2];
1126 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
1127 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
1128 _simd_store_si((simdscalari*)aRTAI, vRtaii);
1129 }
1130 else
1131 {
1132 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1133 }
1134
1135 OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
1136 _simd_store_ps((float*)aPointSize, vPointSize);
1137
1138 uint32_t *pPrimID = (uint32_t *)&primID;
1139
1140 OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
1141 OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
1142 OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
1143
1144 _simd_store_ps((float*)aPrimVertsX, primVerts.x);
1145 _simd_store_ps((float*)aPrimVertsY, primVerts.y);
1146 _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
1147
1148 // scan remaining valid prims and bin each separately
1149 const SWR_BACKEND_STATE& backendState = state.backendState;
1150 DWORD primIndex;
1151 while (_BitScanForward(&primIndex, primMask))
1152 {
1153 uint32_t linkageCount = backendState.numAttributes;
1154 uint32_t numScalarAttribs = linkageCount * 4;
1155
1156 BE_WORK work;
1157 work.type = DRAW;
1158
1159 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1160
1161 desc.triFlags.frontFacing = 1;
1162 desc.triFlags.primID = pPrimID[primIndex];
1163 desc.triFlags.pointSize = aPointSize[primIndex];
1164 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1165 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1166
1167 work.pfnWork = RasterizeTriPoint;
1168
1169 auto pArena = pDC->pArena;
1170 SWR_ASSERT(pArena != nullptr);
1171
1172 // store active attribs
1173 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1174 desc.numAttribs = linkageCount;
1175 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1176
1177 // store point vertex data
1178 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1179 desc.pTriBuffer = pTriBuffer;
1180 *pTriBuffer++ = aPrimVertsX[primIndex];
1181 *pTriBuffer++ = aPrimVertsY[primIndex];
1182 *pTriBuffer = aPrimVertsZ[primIndex];
1183
1184 // store user clip distances
1185 if (rastState.clipDistanceMask)
1186 {
1187 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1188 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1189 float dists[8];
1190 float one = 1.0f;
1191 ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists);
1192 for (uint32_t i = 0; i < numClipDist; i++) {
1193 desc.pUserClipBuffer[3*i + 0] = 0.0f;
1194 desc.pUserClipBuffer[3*i + 1] = 0.0f;
1195 desc.pUserClipBuffer[3*i + 2] = dists[i];
1196 }
1197 }
1198
1199 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1200 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1201 {
1202 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1203 {
1204 #if KNOB_ENABLE_TOSS_POINTS
1205 if (!KNOB_TOSS_SETUP_TRIS)
1206 #endif
1207 {
1208 pTileMgr->enqueue(x, y, &work);
1209 }
1210 }
1211 }
1212
1213 primMask &= ~(1 << primIndex);
1214 }
1215 }
1216
1217 AR_END(FEBinPoints, 1);
1218 }
1219
1220 //////////////////////////////////////////////////////////////////////////
1221 /// @brief Bin SIMD lines to the backend.
1222 /// @param pDC - pointer to draw context.
1223 /// @param pa - The primitive assembly object.
1224 /// @param workerId - thread's worker id. Even thread has a unique id.
1225 /// @param tri - Contains line position data for SIMDs worth of points.
1226 /// @param primID - Primitive ID for each line.
1227 /// @param viewportIdx - Viewport Array Index for each line.
BinPostSetupLines(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[],simdscalar recipW[],uint32_t primMask,simdscalari primID,simdscalari viewportIdx)1228 void BinPostSetupLines(
1229 DRAW_CONTEXT *pDC,
1230 PA_STATE& pa,
1231 uint32_t workerId,
1232 simdvector prim[],
1233 simdscalar recipW[],
1234 uint32_t primMask,
1235 simdscalari primID,
1236 simdscalari viewportIdx)
1237 {
1238 SWR_CONTEXT *pContext = pDC->pContext;
1239
1240 AR_BEGIN(FEBinLines, pDC->drawId);
1241
1242 const API_STATE& state = GetApiState(pDC);
1243 const SWR_RASTSTATE& rastState = state.rastState;
1244 const SWR_FRONTEND_STATE& feState = state.frontendState;
1245 const SWR_GS_STATE& gsState = state.gsState;
1246
1247 // Select attribute processor
1248 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1249 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1250
1251 simdscalar& vRecipW0 = recipW[0];
1252 simdscalar& vRecipW1 = recipW[1];
1253
1254 // convert to fixed point
1255 simdscalari vXi[2], vYi[2];
1256 vXi[0] = fpToFixedPointVertical(prim[0].x);
1257 vYi[0] = fpToFixedPointVertical(prim[0].y);
1258 vXi[1] = fpToFixedPointVertical(prim[1].x);
1259 vYi[1] = fpToFixedPointVertical(prim[1].y);
1260
1261 // compute x-major vs y-major mask
1262 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
1263 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
1264 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
1265 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
1266
1267 // cull zero-length lines
1268 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
1269 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
1270
1271 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
1272
1273 uint32_t *pPrimID = (uint32_t *)&primID;
1274 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1275
1276 simdscalar vUnused = _simd_setzero_ps();
1277
1278 // Calc bounding box of lines
1279 simdBBox bbox;
1280 bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
1281 bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
1282 bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
1283 bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
1284
1285 // bloat bbox by line width along minor axis
1286 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
1287 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
1288 simdBBox bloatBox;
1289 bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
1290 bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
1291 bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
1292 bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
1293
1294 bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1295 bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1296 bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1297 bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1298
1299 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1300 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
1301 if (state.gsState.emitsViewportArrayIndex)
1302 {
1303 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1304 scisXmin, scisYmin, scisXmax, scisYmax);
1305 }
1306 else // broadcast fast path for non-VPAI case.
1307 {
1308 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1309 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1310 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1311 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1312 }
1313
1314 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
1315 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
1316 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
1317 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
1318
1319 // Cull prims completely outside scissor
1320 {
1321 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
1322 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
1323 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1324 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1325 primMask = primMask & ~maskOutsideScissor;
1326 }
1327
1328 if (!primMask)
1329 {
1330 goto endBinLines;
1331 }
1332
1333 // Convert triangle bbox to macrotile units.
1334 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1335 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1336 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1337 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1338
1339 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
1340 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
1341 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
1342 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
1343 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
1344
1345 // transpose verts needed for backend
1346 /// @todo modify BE to take non-transformed verts
1347 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
1348 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
1349 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
1350 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
1351 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
1352
1353 // store render target array index
1354 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1355 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1356 {
1357 simdvector vRtai[2];
1358 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
1359 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
1360 _simd_store_si((simdscalari*)aRTAI, vRtaii);
1361 }
1362 else
1363 {
1364 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1365 }
1366
1367 // scan remaining valid prims and bin each separately
1368 DWORD primIndex;
1369 while (_BitScanForward(&primIndex, primMask))
1370 {
1371 uint32_t linkageCount = state.backendState.numAttributes;
1372 uint32_t numScalarAttribs = linkageCount * 4;
1373
1374 BE_WORK work;
1375 work.type = DRAW;
1376
1377 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1378
1379 desc.triFlags.frontFacing = 1;
1380 desc.triFlags.primID = pPrimID[primIndex];
1381 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
1382 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1383 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1384
1385 work.pfnWork = RasterizeLine;
1386
1387 auto pArena = pDC->pArena;
1388 SWR_ASSERT(pArena != nullptr);
1389
1390 // store active attribs
1391 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1392 desc.numAttribs = linkageCount;
1393 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1394
1395 // store line vertex data
1396 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1397 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
1398 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
1399 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
1400 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1401
1402 // store user clip distances
1403 if (rastState.clipDistanceMask)
1404 {
1405 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1406 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1407 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1408 }
1409
1410 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1411 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1412 {
1413 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1414 {
1415 #if KNOB_ENABLE_TOSS_POINTS
1416 if (!KNOB_TOSS_SETUP_TRIS)
1417 #endif
1418 {
1419 pTileMgr->enqueue(x, y, &work);
1420 }
1421 }
1422 }
1423
1424 primMask &= ~(1 << primIndex);
1425 }
1426
1427 endBinLines:
1428
1429 AR_END(FEBinLines, 1);
1430 }
1431
1432 //////////////////////////////////////////////////////////////////////////
1433 /// @brief Bin SIMD lines to the backend.
1434 /// @param pDC - pointer to draw context.
1435 /// @param pa - The primitive assembly object.
1436 /// @param workerId - thread's worker id. Even thread has a unique id.
1437 /// @param tri - Contains line position data for SIMDs worth of points.
1438 /// @param primID - Primitive ID for each line.
1439 /// @param viewportIdx - Viewport Array Index for each line.
BinLines(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[],uint32_t primMask,simdscalari primID,simdscalari viewportIdx)1440 void BinLines(
1441 DRAW_CONTEXT *pDC,
1442 PA_STATE& pa,
1443 uint32_t workerId,
1444 simdvector prim[],
1445 uint32_t primMask,
1446 simdscalari primID,
1447 simdscalari viewportIdx)
1448 {
1449 SWR_CONTEXT *pContext = pDC->pContext;
1450
1451 const API_STATE& state = GetApiState(pDC);
1452 const SWR_RASTSTATE& rastState = state.rastState;
1453 const SWR_FRONTEND_STATE& feState = state.frontendState;
1454 const SWR_GS_STATE& gsState = state.gsState;
1455
1456 // Select attribute processor
1457 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1458 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1459
1460 simdscalar vRecipW[2] = { _simd_set1_ps(1.0f), _simd_set1_ps(1.0f) };
1461
1462 if (!feState.vpTransformDisable)
1463 {
1464 // perspective divide
1465 vRecipW[0] = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
1466 vRecipW[1] = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
1467
1468 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW[0]);
1469 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW[1]);
1470
1471 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW[0]);
1472 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW[1]);
1473
1474 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW[0]);
1475 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW[1]);
1476
1477 // viewport transform to screen coords
1478 if (state.gsState.emitsViewportArrayIndex)
1479 {
1480 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1481 }
1482 else
1483 {
1484 viewportTransform<2>(prim, state.vpMatrices);
1485 }
1486 }
1487
1488 // adjust for pixel center location
1489 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
1490 prim[0].x = _simd_add_ps(prim[0].x, offset);
1491 prim[0].y = _simd_add_ps(prim[0].y, offset);
1492
1493 prim[1].x = _simd_add_ps(prim[1].x, offset);
1494 prim[1].y = _simd_add_ps(prim[1].y, offset);
1495
1496 BinPostSetupLines(
1497 pDC,
1498 pa,
1499 workerId,
1500 prim,
1501 vRecipW,
1502 primMask,
1503 primID,
1504 viewportIdx);
1505 }
1506