1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
27 ******************************************************************************/
28 #pragma once
29
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34
35 // Temp storage used by the clipper
36 extern THREAD simdvertex tlsTempVertices[7];
37
// Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
#define CLIPCODE_SHIFT 23

// Clip code bits. One bit per frustum plane plus NEGW, all placed above the
// float mantissa by CLIPCODE_SHIFT.
enum SWR_CLIPCODES
{
    // x/y frustum planes
    FRUSTUM_LEFT     = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP      = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT    = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM   = (0x08 << CLIPCODE_SHIFT),

    // depth planes
    FRUSTUM_NEAR     = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR      = (0x20 << CLIPCODE_SHIFT),

    // w <= 0
    NEGW             = (0x40 << CLIPCODE_SHIFT),

    // Guardband is able to use a single high-bit with 4 separate LSBs, because it
    // computes a union, rather than intersection, of clipcodes.
    GUARDBAND_LEFT   = ((0x80 << CLIPCODE_SHIFT) | 0x1),
    GUARDBAND_TOP    = ((0x80 << CLIPCODE_SHIFT) | 0x2),
    GUARDBAND_RIGHT  = ((0x80 << CLIPCODE_SHIFT) | 0x4),
    GUARDBAND_BOTTOM = ((0x80 << CLIPCODE_SHIFT) | 0x8)
};
58
// All six view frustum plane codes.
#define FRUSTUM_CLIP_MASK   \
    (FRUSTUM_LEFT | FRUSTUM_TOP | FRUSTUM_RIGHT | FRUSTUM_BOTTOM | \
     FRUSTUM_NEAR | FRUSTUM_FAR)

// Codes that send a primitive to the clipper: the depth planes, the four
// guardband edges and NEGW.
#define GUARDBAND_CLIP_MASK \
    (FRUSTUM_NEAR | FRUSTUM_FAR | \
     GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | GUARDBAND_BOTTOM | \
     NEGW)
61
// Clip entry point; only the declaration is visible in this header.
// NOTE(review): parameter roles are not documented here - confirm against the definition.
void Clip(const float* pTriangle,
          const float* pAttribs,
          int          numAttribs,
          float*       pOutTriangles,
          int*         numVerts,
          float*       pOutAttribs);
64
65 INLINE
ComputeClipCodes(const API_STATE & state,const simdvector & vertex,simdscalar & clipCodes,simdscalari viewportIndexes)66 void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari viewportIndexes)
67 {
68 clipCodes = _simd_setzero_ps();
69
70 // -w
71 simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f));
72
73 // FRUSTUM_LEFT
74 simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW);
75 clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT)));
76
77 // FRUSTUM_TOP
78 vRes = _simd_cmplt_ps(vertex.y, vNegW);
79 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP))));
80
81 // FRUSTUM_RIGHT
82 vRes = _simd_cmpgt_ps(vertex.x, vertex.w);
83 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT))));
84
85 // FRUSTUM_BOTTOM
86 vRes = _simd_cmpgt_ps(vertex.y, vertex.w);
87 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM))));
88
89 if (state.rastState.depthClipEnable)
90 {
91 // FRUSTUM_NEAR
92 // DX clips depth [0..w], GL clips [-w..w]
93 if (state.rastState.clipHalfZ)
94 {
95 vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps());
96 }
97 else
98 {
99 vRes = _simd_cmplt_ps(vertex.z, vNegW);
100 }
101 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR))));
102
103 // FRUSTUM_FAR
104 vRes = _simd_cmpgt_ps(vertex.z, vertex.w);
105 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR))));
106 }
107
108 // NEGW
109 vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps());
110 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW))));
111
112 // GUARDBAND_LEFT
113 simdscalar gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4));
114 vRes = _simd_cmplt_ps(vertex.x, gbMult);
115 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT))));
116
117 // GUARDBAND_TOP
118 gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4));
119 vRes = _simd_cmplt_ps(vertex.y, gbMult);
120 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP))));
121
122 // GUARDBAND_RIGHT
123 gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4));
124 vRes = _simd_cmpgt_ps(vertex.x, gbMult);
125 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT))));
126
127 // GUARDBAND_BOTTOM
128 gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4));
129 vRes = _simd_cmpgt_ps(vertex.y, gbMult);
130 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM))));
131 }
132
133 template<uint32_t NumVertsPerPrim>
134 class Clipper
135 {
136 public:
/// @brief Binds the clipper to a worker and draw context; the API state
///        reference is obtained from the draw context via GetApiState.
/// @param in_workerId - id of the worker running this clipper
/// @param in_pDC - draw context being processed
Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC)
    : workerId(in_workerId)
    , pDC(in_pDC)
    , state(GetApiState(in_pDC))
{
    // only points (1), lines (2) and triangles (3) are supported
    static_assert((NumVertsPerPrim >= 1) && (NumVertsPerPrim <= 3), "Invalid NumVertsPerPrim");
}
142
ComputeClipCodes(simdvector vertex[],simdscalari viewportIndexes)143 void ComputeClipCodes(simdvector vertex[], simdscalari viewportIndexes)
144 {
145 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
146 {
147 ::ComputeClipCodes(this->state, vertex[i], this->clipCodes[i], viewportIndexes);
148 }
149 }
150
ComputeClipCodeIntersection()151 simdscalar ComputeClipCodeIntersection()
152 {
153 simdscalar result = this->clipCodes[0];
154 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
155 {
156 result = _simd_and_ps(result, this->clipCodes[i]);
157 }
158 return result;
159 }
160
ComputeClipCodeUnion()161 simdscalar ComputeClipCodeUnion()
162 {
163 simdscalar result = this->clipCodes[0];
164 for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
165 {
166 result = _simd_or_ps(result, this->clipCodes[i]);
167 }
168 return result;
169 }
170
ComputeNegWMask()171 int ComputeNegWMask()
172 {
173 simdscalar clipCodeUnion = ComputeClipCodeUnion();
174 clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
175 return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
176 }
177
ComputeClipMask()178 int ComputeClipMask()
179 {
180 simdscalar clipUnion = ComputeClipCodeUnion();
181 clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
182 return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps()));
183 }
184
185 // clipper is responsible for culling any prims with NAN coordinates
ComputeNaNMask(simdvector prim[])186 int ComputeNaNMask(simdvector prim[])
187 {
188 simdscalar vNanMask = _simd_setzero_ps();
189 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
190 {
191 simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
192 vNanMask = _simd_or_ps(vNanMask, vNan01);
193 simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
194 vNanMask = _simd_or_ps(vNanMask, vNan23);
195 }
196
197 return _simd_movemask_ps(vNanMask);
198 }
199
ComputeUserClipCullMask(PA_STATE & pa,simdvector prim[])200 int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
201 {
202 uint8_t cullMask = this->state.rastState.cullDistanceMask;
203 simdscalar vClipCullMask = _simd_setzero_ps();
204 DWORD index;
205
206 simdvector vClipCullDistLo[3];
207 simdvector vClipCullDistHi[3];
208
209 pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
210 pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
211 while (_BitScanForward(&index, cullMask))
212 {
213 cullMask &= ~(1 << index);
214 uint32_t slot = index >> 2;
215 uint32_t component = index & 0x3;
216
217 simdscalar vCullMaskElem = _simd_set1_ps(-1.0f);
218 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
219 {
220 simdscalar vCullComp;
221 if (slot == 0)
222 {
223 vCullComp = vClipCullDistLo[e][component];
224 }
225 else
226 {
227 vCullComp = vClipCullDistHi[e][component];
228 }
229
230 // cull if cull distance < 0 || NAN
231 simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ);
232 vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull);
233 }
234 vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem);
235 }
236
237 // clipper should also discard any primitive with NAN clip distance
238 uint8_t clipMask = this->state.rastState.clipDistanceMask;
239 while (_BitScanForward(&index, clipMask))
240 {
241 clipMask &= ~(1 << index);
242 uint32_t slot = index >> 2;
243 uint32_t component = index & 0x3;
244
245 for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
246 {
247 simdscalar vClipComp;
248 if (slot == 0)
249 {
250 vClipComp = vClipCullDistLo[e][component];
251 }
252 else
253 {
254 vClipComp = vClipCullDistHi[e][component];
255 }
256
257 simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
258 vClipCullMask = _simd_or_ps(vClipCullMask, vClip);
259 }
260 }
261
262 return _simd_movemask_ps(vClipCullMask);
263 }
264
265 // clip SIMD primitives
ClipSimd(const simdscalar & vPrimMask,const simdscalar & vClipMask,PA_STATE & pa,const simdscalari & vPrimId,const simdscalari & vViewportIdx)266 void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId, const simdscalari& vViewportIdx)
267 {
268 // input/output vertex store for clipper
269 simdvertex vertices[7]; // maximum 7 verts generated per triangle
270
271 LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
272 uint32_t provokingVertex = 0;
273 if(pa.binTopology == TOP_TRIANGLE_FAN)
274 {
275 provokingVertex = this->state.frontendState.provokingVertex.triFan;
276 }
277 ///@todo: line topology for wireframe?
278
279 // assemble pos
280 simdvector tmpVector[NumVertsPerPrim];
281 pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
282 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
283 {
284 vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
285 }
286
287 // assemble attribs
288 const SWR_BACKEND_STATE& backendState = this->state.backendState;
289
290 int32_t maxSlot = -1;
291 for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
292 {
293 // Compute absolute attrib slot in vertex array
294 uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
295 maxSlot = std::max<int32_t>(maxSlot, mapSlot);
296 uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot;
297
298 pa.Assemble(inputSlot, tmpVector);
299
300 // if constant interpolation enabled for this attribute, assign the provoking
301 // vertex values to all edges
302 if (_bittest(&constantInterpMask, slot))
303 {
304 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
305 {
306 vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
307 }
308 }
309 else
310 {
311 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
312 {
313 vertices[i].attrib[inputSlot] = tmpVector[i];
314 }
315 }
316 }
317
318 // assemble user clip distances if enabled
319 if (this->state.rastState.clipDistanceMask & 0xf)
320 {
321 pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
322 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
323 {
324 vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
325 }
326 }
327
328 if (this->state.rastState.clipDistanceMask & 0xf0)
329 {
330 pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
331 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
332 {
333 vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
334 }
335 }
336
337 uint32_t numAttribs = maxSlot + 1;
338
339 simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
340
341 // set up new PA for binning clipped primitives
342 PFN_PROCESS_PRIMS pfnBinFunc = nullptr;
343 PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
344 if (NumVertsPerPrim == 3)
345 {
346 pfnBinFunc = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
347 clipTopology = TOP_TRIANGLE_FAN;
348
349 // so that the binner knows to bloat wide points later
350 if (pa.binTopology == TOP_POINT_LIST)
351 clipTopology = TOP_POINT_LIST;
352
353 }
354 else if (NumVertsPerPrim == 2)
355 {
356 pfnBinFunc = BinLines;
357 clipTopology = TOP_LINE_LIST;
358 }
359 else
360 {
361 SWR_ASSERT(0 && "Unexpected points in clipper.");
362 }
363
364 uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
365 uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
366 uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx;
367
368 const simdscalari vOffsets = _mm256_set_epi32(
369 0 * sizeof(simdvertex), // unused lane
370 6 * sizeof(simdvertex),
371 5 * sizeof(simdvertex),
372 4 * sizeof(simdvertex),
373 3 * sizeof(simdvertex),
374 2 * sizeof(simdvertex),
375 1 * sizeof(simdvertex),
376 0 * sizeof(simdvertex));
377
378 // only need to gather 7 verts
379 // @todo dynamic mask based on actual # of verts generated per lane
380 const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
381
382 uint32_t numClippedPrims = 0;
383 for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
384 {
385 uint32_t numEmittedVerts = pVertexCount[inputPrim];
386 if (numEmittedVerts < NumVertsPerPrim)
387 {
388 continue;
389 }
390 SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
391
392 uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
393 numClippedPrims += numEmittedPrims;
394
395 // tranpose clipper output so that each lane's vertices are in SIMD order
396 // set aside space for 2 vertices, as the PA will try to read up to 16 verts
397 // for triangle fan
398 simdvertex transposedPrims[2];
399
400 // transpose pos
401 uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
402 for (uint32_t c = 0; c < 4; ++c)
403 {
404 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
405 pBase += sizeof(simdscalar);
406 }
407
408 // transpose attribs
409 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
410 for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
411 {
412 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
413 for (uint32_t c = 0; c < 4; ++c)
414 {
415 transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
416 pBase += sizeof(simdscalar);
417 }
418 }
419
420 // transpose user clip distances if enabled
421 if (this->state.rastState.clipDistanceMask & 0xf)
422 {
423 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
424 for (uint32_t c = 0; c < 4; ++c)
425 {
426 transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
427 pBase += sizeof(simdscalar);
428 }
429 }
430
431 if (this->state.rastState.clipDistanceMask & 0xf0)
432 {
433 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
434 for (uint32_t c = 0; c < 4; ++c)
435 {
436 transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
437 pBase += sizeof(simdscalar);
438 }
439 }
440
441 PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
442
443 while (clipPa.GetNextStreamOutput())
444 {
445 do
446 {
447 simdvector attrib[NumVertsPerPrim];
448 bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
449 if (assemble)
450 {
451 static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
452 pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
453 }
454 } while (clipPa.NextPrim());
455 }
456 }
457
458 // update global pipeline stat
459 UPDATE_STAT_FE(CPrimitives, numClippedPrims);
460 }
461
462 // execute the clipper stage
ExecuteStage(PA_STATE & pa,simdvector prim[],uint32_t primMask,simdscalari primId,simdscalari viewportIdx)463 void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
464 {
465 SWR_ASSERT(pa.pDC != nullptr);
466 SWR_CONTEXT* pContext = pa.pDC->pContext;
467
468 // set up binner based on PA state
469 PFN_PROCESS_PRIMS pfnBinner;
470 switch (pa.binTopology)
471 {
472 case TOP_POINT_LIST:
473 pfnBinner = BinPoints;
474 break;
475 case TOP_LINE_LIST:
476 case TOP_LINE_STRIP:
477 case TOP_LINE_LOOP:
478 case TOP_LINE_LIST_ADJ:
479 case TOP_LISTSTRIP_ADJ:
480 pfnBinner = BinLines;
481 break;
482 default:
483 pfnBinner = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
484 break;
485 };
486
487 // update clipper invocations pipeline stat
488 uint32_t numInvoc = _mm_popcnt_u32(primMask);
489 UPDATE_STAT_FE(CInvocations, numInvoc);
490
491 ComputeClipCodes(prim, viewportIdx);
492
493 // cull prims with NAN coords
494 primMask &= ~ComputeNaNMask(prim);
495
496 // user cull distance cull
497 if (this->state.rastState.cullDistanceMask)
498 {
499 primMask &= ~ComputeUserClipCullMask(pa, prim);
500 }
501
502 // cull prims outside view frustum
503 simdscalar clipIntersection = ComputeClipCodeIntersection();
504 int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps()));
505
506 // skip clipping for points
507 uint32_t clipMask = 0;
508 if (NumVertsPerPrim != 1)
509 {
510 clipMask = primMask & ComputeClipMask();
511 }
512
513 if (clipMask)
514 {
515 AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
516 // we have to clip tris, execute the clipper, which will also
517 // call the binner
518 ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx);
519 AR_END(FEGuardbandClip, 1);
520 }
521 else if (validMask)
522 {
523 // update CPrimitives pipeline state
524 UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
525
526 // forward valid prims directly to binner
527 pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx);
528 }
529 }
530
531 private:
ComputeInterpFactor(simdscalar boundaryCoord0,simdscalar boundaryCoord1)532 inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1)
533 {
534 return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1));
535 }
536
ComputeOffsets(uint32_t attrib,simdscalari vIndices,uint32_t component)537 inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component)
538 {
539 const uint32_t simdVertexStride = sizeof(simdvertex);
540 const uint32_t componentStride = sizeof(simdscalar);
541 const uint32_t attribStride = sizeof(simdvector);
542 const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
543 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
544
545 // step to the simdvertex
546 simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride));
547
548 // step to the attribute and component
549 vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component));
550
551 // step to the lane
552 vOffsets = _simd_add_epi32(vOffsets, vElemOffset);
553
554 return vOffsets;
555 }
556
557 // gathers a single component for a given attribute for each SIMD lane
GatherComponent(const float * pBuffer,uint32_t attrib,simdscalar vMask,simdscalari vIndices,uint32_t component)558 inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component)
559 {
560 simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
561 simdscalar vSrc = _mm256_undefined_ps();
562 return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
563 }
564
ScatterComponent(const float * pBuffer,uint32_t attrib,simdscalar vMask,simdscalari vIndices,uint32_t component,simdscalar vSrc)565 inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc)
566 {
567 simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
568
569 uint32_t* pOffsets = (uint32_t*)&vOffsets;
570 float* pSrc = (float*)&vSrc;
571 uint32_t mask = _simd_movemask_ps(vMask);
572 DWORD lane;
573 while (_BitScanForward(&lane, mask))
574 {
575 mask &= ~(1 << lane);
576 uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
577 *(float*)pBuf = pSrc[lane];
578 }
579 }
580
581 template<SWR_CLIPCODES ClippingPlane>
intersect(const simdscalar & vActiveMask,const simdscalari & s,const simdscalari & p,const simdvector & v1,const simdvector & v2,simdscalari & outIndex,const float * pInVerts,uint32_t numInAttribs,float * pOutVerts)582 inline void intersect(
583 const simdscalar& vActiveMask, // active lanes to operate on
584 const simdscalari& s, // index to first edge vertex v0 in pInPts.
585 const simdscalari& p, // index to second edge vertex v1 in pInPts.
586 const simdvector& v1, // vertex 0 position
587 const simdvector& v2, // vertex 1 position
588 simdscalari& outIndex, // output index.
589 const float *pInVerts, // array of all the input positions.
590 uint32_t numInAttribs, // number of attributes per vertex.
591 float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
592 {
593 // compute interpolation factor
594 simdscalar t;
595 switch (ClippingPlane)
596 {
597 case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break;
598 case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break;
599 case FRUSTUM_TOP: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break;
600 case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break;
601 case FRUSTUM_NEAR:
602 // DX Znear plane is 0, GL is -w
603 if (this->state.rastState.clipHalfZ)
604 {
605 t = ComputeInterpFactor(v1[2], v2[2]);
606 }
607 else
608 {
609 t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2]));
610 }
611 break;
612 case FRUSTUM_FAR: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break;
613 default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
614 };
615
616 // interpolate position and store
617 for (uint32_t c = 0; c < 4; ++c)
618 {
619 simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]);
620 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
621 }
622
623 // interpolate attributes and store
624 for (uint32_t a = 0; a < numInAttribs; ++a)
625 {
626 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
627 for (uint32_t c = 0; c < 4; ++c)
628 {
629 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
630 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
631 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
632 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
633 }
634 }
635
636 // interpolate clip distance if enabled
637 if (this->state.rastState.clipDistanceMask & 0xf)
638 {
639 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
640 for (uint32_t c = 0; c < 4; ++c)
641 {
642 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
643 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
644 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
645 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
646 }
647 }
648
649 if (this->state.rastState.clipDistanceMask & 0xf0)
650 {
651 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
652 for (uint32_t c = 0; c < 4; ++c)
653 {
654 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
655 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
656 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
657 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
658 }
659 }
660 }
661
662 template<SWR_CLIPCODES ClippingPlane>
inside(const simdvector & v)663 inline simdscalar inside(const simdvector& v)
664 {
665 switch (ClippingPlane)
666 {
667 case FRUSTUM_LEFT: return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
668 case FRUSTUM_RIGHT: return _simd_cmple_ps(v[0], v[3]);
669 case FRUSTUM_TOP: return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
670 case FRUSTUM_BOTTOM: return _simd_cmple_ps(v[1], v[3]);
671 case FRUSTUM_NEAR: return _simd_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
672 case FRUSTUM_FAR: return _simd_cmple_ps(v[2], v[3]);
673 default:
674 SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
675 return _simd_setzero_ps();
676 }
677 }
678
679 template<SWR_CLIPCODES ClippingPlane>
ClipTriToPlane(const float * pInVerts,const simdscalari & vNumInPts,uint32_t numInAttribs,float * pOutVerts)680 simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
681 {
682 simdscalari vCurIndex = _simd_setzero_si();
683 simdscalari vOutIndex = _simd_setzero_si();
684 simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
685
686 while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
687 {
688 simdscalari s = vCurIndex;
689 simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
690 simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p);
691 p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask)));
692
693 // gather position
694 simdvector vInPos0, vInPos1;
695 for (uint32_t c = 0; c < 4; ++c)
696 {
697 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
698 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
699 }
700
701 // compute inside mask
702 simdscalar s_in = inside<ClippingPlane>(vInPos0);
703 simdscalar p_in = inside<ClippingPlane>(vInPos1);
704
705 // compute intersection mask (s_in != p_in)
706 simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
707 intersectMask = _simd_and_ps(intersectMask, vActiveMask);
708
709 // store s if inside
710 s_in = _simd_and_ps(s_in, vActiveMask);
711 if (!_simd_testz_ps(s_in, s_in))
712 {
713 // store position
714 for (uint32_t c = 0; c < 4; ++c)
715 {
716 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
717 }
718
719 // store attribs
720 for (uint32_t a = 0; a < numInAttribs; ++a)
721 {
722 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
723 for (uint32_t c = 0; c < 4; ++c)
724 {
725 simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
726 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
727 }
728 }
729
730 // store clip distance if enabled
731 if (this->state.rastState.clipDistanceMask & 0xf)
732 {
733 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
734 for (uint32_t c = 0; c < 4; ++c)
735 {
736 simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
737 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
738 }
739 }
740
741 if (this->state.rastState.clipDistanceMask & 0xf0)
742 {
743 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
744 for (uint32_t c = 0; c < 4; ++c)
745 {
746 simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
747 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
748 }
749 }
750
751 // increment outIndex
752 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
753 }
754
755 // compute and store intersection
756 if (!_simd_testz_ps(intersectMask, intersectMask))
757 {
758 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
759
760 // increment outIndex for active lanes
761 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
762 }
763
764 // increment loop index and update active mask
765 vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1));
766 vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
767 }
768
769 return vOutIndex;
770 }
771
772 template<SWR_CLIPCODES ClippingPlane>
ClipLineToPlane(const float * pInVerts,const simdscalari & vNumInPts,uint32_t numInAttribs,float * pOutVerts)773 simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
774 {
775 simdscalari vCurIndex = _simd_setzero_si();
776 simdscalari vOutIndex = _simd_setzero_si();
777 simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
778
779 if (!_simd_testz_ps(vActiveMask, vActiveMask))
780 {
781 simdscalari s = vCurIndex;
782 simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
783
784 // gather position
785 simdvector vInPos0, vInPos1;
786 for (uint32_t c = 0; c < 4; ++c)
787 {
788 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
789 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
790 }
791
792 // compute inside mask
793 simdscalar s_in = inside<ClippingPlane>(vInPos0);
794 simdscalar p_in = inside<ClippingPlane>(vInPos1);
795
796 // compute intersection mask (s_in != p_in)
797 simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
798 intersectMask = _simd_and_ps(intersectMask, vActiveMask);
799
800 // store s if inside
801 s_in = _simd_and_ps(s_in, vActiveMask);
802 if (!_simd_testz_ps(s_in, s_in))
803 {
804 for (uint32_t c = 0; c < 4; ++c)
805 {
806 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
807 }
808
809 // interpolate attributes and store
810 for (uint32_t a = 0; a < numInAttribs; ++a)
811 {
812 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
813 for (uint32_t c = 0; c < 4; ++c)
814 {
815 simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
816 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
817 }
818 }
819
820 // increment outIndex
821 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
822 }
823
824 // compute and store intersection
825 if (!_simd_testz_ps(intersectMask, intersectMask))
826 {
827 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
828
829 // increment outIndex for active lanes
830 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
831 }
832
833 // store p if inside
834 p_in = _simd_and_ps(p_in, vActiveMask);
835 if (!_simd_testz_ps(p_in, p_in))
836 {
837 for (uint32_t c = 0; c < 4; ++c)
838 {
839 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
840 }
841
842 // copy the attributes of p to the output unchanged (no interpolation on this path)
843 for (uint32_t a = 0; a < numInAttribs; ++a)
844 {
845 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
846 for (uint32_t c = 0; c < 4; ++c)
847 {
848 simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
849 ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
850 }
851 }
852
853 // increment outIndex
854 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in);
855 }
856 }
857
858 return vOutIndex;
859 }
860
861 //////////////////////////////////////////////////////////////////////////
862 /// @brief Vertical clipper. Clips SIMD primitives at a time
863 /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
864 /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
865 /// @param numAttribs - number of valid input attribs, including position
ClipPrims(float * pVertices,const simdscalar & vPrimMask,const simdscalar & vClipMask,int numAttribs)866 simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
867 {
868 // temp storage
869 float* pTempVerts = (float*)&tlsTempVertices[0];
870
871 // zero out num input verts for non-active lanes
872 simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
873 vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask);
874
875 // clip prims to frustum
876 simdscalari vNumOutPts;
877 if (NumVertsPerPrim == 3)
878 {
879 vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
880 vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
881 vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
882 vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
883 vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
884 vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
885 }
886 else
887 {
888 SWR_ASSERT(NumVertsPerPrim == 2);
889 vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
890 vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
891 vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
892 vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
893 vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
894 vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
895 }
896
897 // restore num verts for non-clipped, active lanes
898 simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask);
899 vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask);
900
901 return vNumOutPts;
902 }
903
904 const uint32_t workerId{ 0 }; // worker id; const member, 0 unless a constructor initializer overrides the default
905 DRAW_CONTEXT* pDC{ nullptr }; // draw context pointer; nullptr unless a constructor initializer overrides the default
906 const API_STATE& state; // read-only reference to API state; has no default, so a constructor must bind it
907 simdscalar clipCodes[NumVertsPerPrim]; // one SIMD value per vertex of the primitive; presumably the per-lane clip codes - confirm against the code that fills it
908 };
909
910
911 // pipeline stage functions
// Declarations only; the definitions are not part of this section of the file.
// The three entry points share one parameter list:
//   pDC         - draw context
//   pa          - primitive assembly state, passed by non-const reference
//   workerId    - id of the calling worker
//   prims       - array of SIMD vectors describing the primitives
//   primMask    - scalar mask of primitives (presumably one bit per SIMD lane - confirm)
//   primId      - SIMD integer of primitive ids
//   viewportIdx - SIMD integer of viewport indices
912 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
913 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
914 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
915