1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.h
24 *
25 * @brief Definitions for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29 #pragma once
30 #include "context.h"
31 #include <type_traits>
32
33 // Calculates the A and B coefficients for the 3 edges of the triangle
34 //
35 // maths for edge equations:
36 // standard form of a line in 2d
37 // Ax + By + C = 0
38 // A = y0 - y1
39 // B = x1 - x0
40 // C = x0y1 - x1y0
41 INLINE
triangleSetupAB(const __m128 vX,const __m128 vY,__m128 & vA,__m128 & vB)42 void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB)
43 {
44 // vYsub = y1 y2 y0 dc
45 __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
46 // vY = y0 y1 y2 dc
47 vA = _mm_sub_ps(vY, vYsub);
48
49 // Result:
50 // A[0] = y0 - y1
51 // A[1] = y1 - y2
52 // A[2] = y2 - y0
53
54 // vXsub = x1 x2 x0 dc
55 __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));
56 // vX = x0 x1 x2 dc
57 vB = _mm_sub_ps(vXsub, vX);
58
59 // Result:
60 // B[0] = x1 - x0
61 // B[1] = x2 - x1
62 // B[2] = x0 - x2
63 }
64
65 INLINE
triangleSetupABVertical(const simdscalar vX[3],const simdscalar vY[3],simdscalar (& vA)[3],simdscalar (& vB)[3])66 void triangleSetupABVertical(const simdscalar vX[3], const simdscalar vY[3], simdscalar (&vA)[3], simdscalar (&vB)[3])
67 {
68 // generate edge equations
69 // A = y0 - y1
70 // B = x1 - x0
71 vA[0] = _simd_sub_ps(vY[0], vY[1]);
72 vA[1] = _simd_sub_ps(vY[1], vY[2]);
73 vA[2] = _simd_sub_ps(vY[2], vY[0]);
74
75 vB[0] = _simd_sub_ps(vX[1], vX[0]);
76 vB[1] = _simd_sub_ps(vX[2], vX[1]);
77 vB[2] = _simd_sub_ps(vX[0], vX[2]);
78 }
79
80 INLINE
triangleSetupABInt(const __m128i vX,const __m128i vY,__m128i & vA,__m128i & vB)81 void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB)
82 {
83 // generate edge equations
84 // A = y0 - y1
85 // B = x1 - x0
86 // C = x0y1 - x1y0
87 __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
88 vA = _mm_sub_epi32(vY, vYsub);
89
90 __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
91 vB = _mm_sub_epi32(vXsub, vX);
92 }
93
94 INLINE
triangleSetupABIntVertical(const simdscalari vX[3],const simdscalari vY[3],simdscalari (& vA)[3],simdscalari (& vB)[3])95 void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3])
96 {
97 // A = y0 - y1
98 // B = x1 - x0
99 vA[0] = _simd_sub_epi32(vY[0], vY[1]);
100 vA[1] = _simd_sub_epi32(vY[1], vY[2]);
101 vA[2] = _simd_sub_epi32(vY[2], vY[0]);
102
103 vB[0] = _simd_sub_epi32(vX[1], vX[0]);
104 vB[1] = _simd_sub_epi32(vX[2], vX[1]);
105 vB[2] = _simd_sub_epi32(vX[0], vX[2]);
106 }
107 // Calculate the determinant of the triangle
108 // 2 vectors between the 3 points: P, Q
109 // Px = x0-x2, Py = y0-y2
110 // Qx = x1-x2, Qy = y1-y2
111 // |Px Qx|
112 // det = | | = PxQy - PyQx
113 // |Py Qy|
114 // simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
115 // try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
116 // : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
117 // : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
118 // : B[2]*A[1] - A[2]*B[1]
119 INLINE
calcDeterminantInt(const __m128i vA,const __m128i vB)120 float calcDeterminantInt(const __m128i vA, const __m128i vB)
121 {
122 // vAShuf = [A1, A0, A2, A0]
123 __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
124 // vBShuf = [B2, B0, B1, B0]
125 __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
126 // vMul = [A1*B2, B1*A2]
127 __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf);
128
129 // shuffle upper to lower
130 // vMul2 = [B1*A2, B1*A2]
131 __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
132 //vMul = [A1*B2 - B1*A2]
133 vMul = _mm_sub_epi64(vMul, vMul2);
134
135 int64_t result;
136 _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));
137
138 double dResult = (double)result;
139 dResult = dResult * (1.0 / FIXED_POINT16_SCALE);
140
141 return (float)dResult;
142 }
143
144 INLINE
calcDeterminantIntVertical(const simdscalari vA[3],const simdscalari vB[3],simdscalari * pvDet)145 void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet)
146 {
147 // refer to calcDeterminantInt comment for calculation explanation
148 // A1*B2
149 simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5
150 simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7
151
152 simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
153 simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);
154
155 simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5
156 simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7
157
158 // B1*A2
159 simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
160 simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);
161
162 simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
163 simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);
164
165 simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
166 simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);
167
168 // A1*B2 - A2*B1
169 simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
170 simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
171
172 // shuffle 0 1 4 5 -> 0 1 2 3
173 simdscalari vResultLo = _mm256_permute2f128_si256(detLo, detHi, 0x20);
174 simdscalari vResultHi = _mm256_permute2f128_si256(detLo, detHi, 0x31);
175
176 pvDet[0] = vResultLo;
177 pvDet[1] = vResultHi;
178 }
179
180 INLINE
triangleSetupC(const __m128 vX,const __m128 vY,const __m128 vA,const __m128 & vB,__m128 & vC)181 void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
182 {
183 // C = -Ax - By
184 vC = _mm_mul_ps(vA, vX);
185 __m128 vCy = _mm_mul_ps(vB, vY);
186 vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
187 vC = _mm_sub_ps(vC, vCy);
188 }
189
190 INLINE
viewportTransform(__m128 & vX,__m128 & vY,__m128 & vZ,const SWR_VIEWPORT_MATRIX & vpMatrix)191 void viewportTransform(__m128 &vX, __m128 &vY, __m128 &vZ, const SWR_VIEWPORT_MATRIX &vpMatrix)
192 {
193 vX = _mm_mul_ps(vX, _mm_set1_ps(vpMatrix.m00));
194 vX = _mm_add_ps(vX, _mm_set1_ps(vpMatrix.m30));
195
196 vY = _mm_mul_ps(vY, _mm_set1_ps(vpMatrix.m11));
197 vY = _mm_add_ps(vY, _mm_set1_ps(vpMatrix.m31));
198
199 vZ = _mm_mul_ps(vZ, _mm_set1_ps(vpMatrix.m22));
200 vZ = _mm_add_ps(vZ, _mm_set1_ps(vpMatrix.m32));
201 }
202
203 template<uint32_t NumVerts>
204 INLINE
viewportTransform(simdvector * v,const SWR_VIEWPORT_MATRICES & vpMatrices)205 void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
206 {
207 simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]);
208 simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]);
209 simdscalar m11 = _simd_load1_ps(&vpMatrices.m11[0]);
210 simdscalar m31 = _simd_load1_ps(&vpMatrices.m31[0]);
211 simdscalar m22 = _simd_load1_ps(&vpMatrices.m22[0]);
212 simdscalar m32 = _simd_load1_ps(&vpMatrices.m32[0]);
213
214 for (uint32_t i = 0; i < NumVerts; ++i)
215 {
216 v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
217 v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
218 v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
219 }
220 }
221
222 template<uint32_t NumVerts>
223 INLINE
viewportTransform(simdvector * v,const SWR_VIEWPORT_MATRICES & vpMatrices,simdscalari vViewportIdx)224 void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari vViewportIdx)
225 {
226 // perform a gather of each matrix element based on the viewport array indexes
227 simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
228 simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
229 simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
230 simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
231 simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
232 simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);
233
234 for (uint32_t i = 0; i < NumVerts; ++i)
235 {
236 v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
237 v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
238 v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
239 }
240 }
241
242 INLINE
calcBoundingBoxInt(const __m128i & vX,const __m128i & vY,SWR_RECT & bbox)243 void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox)
244 {
245 // Need horizontal fp min here
246 __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1));
247 __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2));
248
249 __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1));
250 __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2));
251
252
253 __m128i vMinX = _mm_min_epi32(vX, vX1);
254 vMinX = _mm_min_epi32(vMinX, vX2);
255
256 __m128i vMaxX = _mm_max_epi32(vX, vX1);
257 vMaxX = _mm_max_epi32(vMaxX, vX2);
258
259 __m128i vMinY = _mm_min_epi32(vY, vY1);
260 vMinY = _mm_min_epi32(vMinY, vY2);
261
262 __m128i vMaxY = _mm_max_epi32(vY, vY1);
263 vMaxY = _mm_max_epi32(vMaxY, vY2);
264
265 bbox.xmin = _mm_extract_epi32(vMinX, 0);
266 bbox.xmax = _mm_extract_epi32(vMaxX, 0);
267 bbox.ymin = _mm_extract_epi32(vMinY, 0);
268 bbox.ymax = _mm_extract_epi32(vMaxY, 0);
269 }
270
271 INLINE
CanUseSimplePoints(DRAW_CONTEXT * pDC)272 bool CanUseSimplePoints(DRAW_CONTEXT *pDC)
273 {
274 const API_STATE& state = GetApiState(pDC);
275
276 return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X &&
277 state.rastState.pointSize == 1.0f &&
278 !state.rastState.pointParam &&
279 !state.rastState.pointSpriteEnable);
280 }
281
282 INLINE
vHasNaN(const __m128 & vec)283 bool vHasNaN(const __m128& vec)
284 {
285 const __m128 result = _mm_cmpunord_ps(vec, vec);
286 const int32_t mask = _mm_movemask_ps(result);
287 return (mask != 0);
288 }
289
290 uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
291 uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
292
293
294 // ProcessDraw front-end function. All combinations of parameter values are available
295 PFN_FE_WORK_FUNC GetProcessDrawFunc(
296 bool IsIndexed,
297 bool IsCutIndexEnabled,
298 bool HasTessellation,
299 bool HasGeometryShader,
300 bool HasStreamOut,
301 bool HasRasterization);
302
303 void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
304 void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
305 void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
306 void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
307 void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
308
309 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
310
311 struct PA_STATE_BASE; // forward decl
312 void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
313 void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
314
315