1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa_avx.cpp
24 *
25 * @brief AVX implementation for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #include "context.h"
32 #include "pa.h"
33 #include "frontend.h"
34
35 #if (KNOB_SIMD_WIDTH == 8)
36
// Forward declarations for the primitive assembly (PA) state functions defined
// later in this file. The functions taking 'simdvector verts[]' are the SIMD
// states of each topology; their bool result is false when a state only
// advances the state machine and true when it has filled 'verts'. The
// *Single0 functions fill '__m128 verts[]' for one primitive index.

// Triangle list.
bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);

// Triangle strip.
bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);

// Triangle fan.
bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);

// Quad list (each quad is emitted as two triangles).
bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);

// Line loop (its single-primitive function is defined before first use below).
bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);

// Line list.
bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t index, __m128 verts[]);

// Line strip.
bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 lineverts[]);

// Points.
bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);

// Rect list.
bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
73 template <uint32_t TotalControlPoints>
PaPatchListSingle(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])74 void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
75 {
76 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
77 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
78 // Each attribute has 4 components.
79
80 /// @todo Optimize this
81
82 float* pOutVec = (float*)verts;
83
84 for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
85 {
86 uint32_t input_cp = primIndex * TotalControlPoints + cp;
87 uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
88 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
89
90 // Loop over all components of the attribute
91 for (uint32_t i = 0; i < 4; ++i)
92 {
93 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
94 pOutVec[cp * 4 + i] = pInputVec[input_lane];
95 }
96 }
97 }
98
99 template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
PaPatchList(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])100 static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
101 {
102 SetNextPaState(
103 pa,
104 PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
105 PaPatchListSingle<TotalControlPoints>);
106
107 return false;
108 }
109
110 template<uint32_t TotalControlPoints>
PaPatchListTerm(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])111 static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
112 {
113 // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
114 // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
115 // Each attribute has 4 components.
116
117 /// @todo Optimize this
118
119 // Loop over all components of the attribute
120 for (uint32_t i = 0; i < 4; ++i)
121 {
122 for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
123 {
124 float vec[KNOB_SIMD_WIDTH];
125 for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
126 {
127 uint32_t input_cp = lane * TotalControlPoints + cp;
128 uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
129 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
130
131 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
132 vec[lane] = pInputVec[input_lane];
133 }
134 verts[cp][i] = _simd_loadu_ps(vec);
135 }
136 }
137
138 SetNextPaState(
139 pa,
140 PaPatchList<TotalControlPoints>,
141 PaPatchListSingle<TotalControlPoints>,
142 0,
143 KNOB_SIMD_WIDTH,
144 true);
145
146 return true;
147 }
148
// Explicit specializations of PaPatchList<N, N> for N = 1..32. The primary
// PaPatchList<N, C> template only advances to PaPatchList<N, C + 1>; each
// specialization below ends that chain by forwarding to PaPatchListTerm<N>,
// which performs the actual assembly.
#define PA_PATCH_LIST_TERMINATOR(N) \
template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\
{ return PaPatchListTerm<N>(pa, slot, verts); }
PA_PATCH_LIST_TERMINATOR(1)
PA_PATCH_LIST_TERMINATOR(2)
PA_PATCH_LIST_TERMINATOR(3)
PA_PATCH_LIST_TERMINATOR(4)
PA_PATCH_LIST_TERMINATOR(5)
PA_PATCH_LIST_TERMINATOR(6)
PA_PATCH_LIST_TERMINATOR(7)
PA_PATCH_LIST_TERMINATOR(8)
PA_PATCH_LIST_TERMINATOR(9)
PA_PATCH_LIST_TERMINATOR(10)
PA_PATCH_LIST_TERMINATOR(11)
PA_PATCH_LIST_TERMINATOR(12)
PA_PATCH_LIST_TERMINATOR(13)
PA_PATCH_LIST_TERMINATOR(14)
PA_PATCH_LIST_TERMINATOR(15)
PA_PATCH_LIST_TERMINATOR(16)
PA_PATCH_LIST_TERMINATOR(17)
PA_PATCH_LIST_TERMINATOR(18)
PA_PATCH_LIST_TERMINATOR(19)
PA_PATCH_LIST_TERMINATOR(20)
PA_PATCH_LIST_TERMINATOR(21)
PA_PATCH_LIST_TERMINATOR(22)
PA_PATCH_LIST_TERMINATOR(23)
PA_PATCH_LIST_TERMINATOR(24)
PA_PATCH_LIST_TERMINATOR(25)
PA_PATCH_LIST_TERMINATOR(26)
PA_PATCH_LIST_TERMINATOR(27)
PA_PATCH_LIST_TERMINATOR(28)
PA_PATCH_LIST_TERMINATOR(29)
PA_PATCH_LIST_TERMINATOR(30)
PA_PATCH_LIST_TERMINATOR(31)
PA_PATCH_LIST_TERMINATOR(32)
// The macro is only needed for the list above.
#undef PA_PATCH_LIST_TERMINATOR
185
PaTriList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])186 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
187 {
188 SetNextPaState(pa, PaTriList1, PaTriListSingle0);
189 return false; // Not enough vertices to assemble 4 or 8 triangles.
190 }
191
PaTriList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])192 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
193 {
194 SetNextPaState(pa, PaTriList2, PaTriListSingle0);
195 return false; // Not enough vertices to assemble 8 triangles.
196 }
197
PaTriList2(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])198 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
199 {
200 #if KNOB_ARCH == KNOB_ARCH_AVX
201
202 simdvector& a = PaGetSimdVector(pa, 0, slot);
203 simdvector& b = PaGetSimdVector(pa, 1, slot);
204 simdvector& c = PaGetSimdVector(pa, 2, slot);
205 simdscalar s;
206
207 // Tri Pattern - provoking vertex is always v0
208 // v0 -> 0 3 6 9 12 15 18 21
209 // v1 -> 1 4 7 10 13 16 19 22
210 // v2 -> 2 5 8 11 14 17 20 23
211
212 for (int i = 0; i < 4; ++i)
213 {
214 simdvector& v0 = verts[0];
215 v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
216 v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
217 v0[i] = _mm256_permute_ps(v0[i], 0x6C);
218 s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21);
219 v0[i] = _simd_blend_ps(v0[i], s, 0x44);
220
221 simdvector& v1 = verts[1];
222 v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
223 v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
224 v1[i] = _mm256_permute_ps(v1[i], 0xB1);
225 s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21);
226 v1[i] = _simd_blend_ps(v1[i], s, 0x66);
227
228 simdvector& v2 = verts[2];
229 v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
230 v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
231 v2[i] = _mm256_permute_ps(v2[i], 0xC6);
232 s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21);
233 v2[i] = _simd_blend_ps(v2[i], s, 0x22);
234 }
235
236 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
237
238 simdvector &a = PaGetSimdVector(pa, 0, slot);
239 simdvector &b = PaGetSimdVector(pa, 1, slot);
240 simdvector &c = PaGetSimdVector(pa, 2, slot);
241
242 // v0 -> a0 a3 a6 b1 b4 b7 c2 c5
243 // v1 -> a1 a4 a7 b2 b5 c0 c3 c6
244 // v2 -> a2 a5 b0 b3 b6 c1 c4 c7
245
246 const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
247 const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
248 const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
249
250 simdvector &v0 = verts[0];
251 simdvector &v1 = verts[1];
252 simdvector &v2 = verts[2];
253
254 for (int i = 0; i < 4; ++i)
255 {
256 v0[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24);
257 v0[i] = _mm256_permutevar8x32_ps(v0[i], perm0);
258
259 v1[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49);
260 v1[i] = _mm256_permutevar8x32_ps(v1[i], perm1);
261
262 v2[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92);
263 v2[i] = _mm256_permutevar8x32_ps(v2[i], perm2);
264 }
265
266 #endif
267
268 SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD_WIDTH, true);
269 return true;
270 }
271
PaTriListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])272 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
273 {
274 // We have 12 simdscalars contained within 3 simdvectors which
275 // hold at least 8 triangles worth of data. We want to assemble a single
276 // triangle with data in horizontal form.
277 simdvector& a = PaGetSimdVector(pa, 0, slot);
278 simdvector& b = PaGetSimdVector(pa, 1, slot);
279 simdvector& c = PaGetSimdVector(pa, 2, slot);
280
281 // Convert from vertical to horizontal.
282 // Tri Pattern - provoking vertex is always v0
283 // v0 -> 0 3 6 9 12 15 18 21
284 // v1 -> 1 4 7 10 13 16 19 22
285 // v2 -> 2 5 8 11 14 17 20 23
286 switch(primIndex)
287 {
288 case 0:
289 verts[0] = swizzleLane0(a);
290 verts[1] = swizzleLane1(a);
291 verts[2] = swizzleLane2(a);
292 break;
293 case 1:
294 verts[0] = swizzleLane3(a);
295 verts[1] = swizzleLane4(a);
296 verts[2] = swizzleLane5(a);
297 break;
298 case 2:
299 verts[0] = swizzleLane6(a);
300 verts[1] = swizzleLane7(a);
301 verts[2] = swizzleLane0(b);
302 break;
303 case 3:
304 verts[0] = swizzleLane1(b);
305 verts[1] = swizzleLane2(b);
306 verts[2] = swizzleLane3(b);
307 break;
308 case 4:
309 verts[0] = swizzleLane4(b);
310 verts[1] = swizzleLane5(b);
311 verts[2] = swizzleLane6(b);
312 break;
313 case 5:
314 verts[0] = swizzleLane7(b);
315 verts[1] = swizzleLane0(c);
316 verts[2] = swizzleLane1(c);
317 break;
318 case 6:
319 verts[0] = swizzleLane2(c);
320 verts[1] = swizzleLane3(c);
321 verts[2] = swizzleLane4(c);
322 break;
323 case 7:
324 verts[0] = swizzleLane5(c);
325 verts[1] = swizzleLane6(c);
326 verts[2] = swizzleLane7(c);
327 break;
328 };
329 }
330
PaTriStrip0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])331 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
332 {
333 SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
334 return false; // Not enough vertices to assemble 8 triangles.
335 }
336
PaTriStrip1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])337 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
338 {
339 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
340 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
341 simdscalar s;
342
343 for(int i = 0; i < 4; ++i)
344 {
345 simdscalar a0 = a[i];
346 simdscalar b0 = b[i];
347
348 // Tri Pattern - provoking vertex is always v0
349 // v0 -> 01234567
350 // v1 -> 13355779
351 // v2 -> 22446688
352 simdvector& v0 = verts[0];
353 v0[i] = a0;
354
355 // s -> 4567891011
356 s = _mm256_permute2f128_ps(a0, b0, 0x21);
357 // s -> 23456789
358 s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
359
360 simdvector& v1 = verts[1];
361 // v1 -> 13355779
362 v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1));
363
364 simdvector& v2 = verts[2];
365 // v2 -> 22446688
366 v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2));
367 }
368
369 SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD_WIDTH);
370 return true;
371 }
372
PaTriStripSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])373 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
374 {
375 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
376 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
377
378 // Convert from vertical to horizontal.
379 // Tri Pattern - provoking vertex is always v0
380 // v0 -> 01234567
381 // v1 -> 13355779
382 // v2 -> 22446688
383 switch(primIndex)
384 {
385 case 0:
386 verts[0] = swizzleLane0(a);
387 verts[1] = swizzleLane1(a);
388 verts[2] = swizzleLane2(a);
389 break;
390 case 1:
391 verts[0] = swizzleLane1(a);
392 verts[1] = swizzleLane3(a);
393 verts[2] = swizzleLane2(a);
394 break;
395 case 2:
396 verts[0] = swizzleLane2(a);
397 verts[1] = swizzleLane3(a);
398 verts[2] = swizzleLane4(a);
399 break;
400 case 3:
401 verts[0] = swizzleLane3(a);
402 verts[1] = swizzleLane5(a);
403 verts[2] = swizzleLane4(a);
404 break;
405 case 4:
406 verts[0] = swizzleLane4(a);
407 verts[1] = swizzleLane5(a);
408 verts[2] = swizzleLane6(a);
409 break;
410 case 5:
411 verts[0] = swizzleLane5(a);
412 verts[1] = swizzleLane7(a);
413 verts[2] = swizzleLane6(a);
414 break;
415 case 6:
416 verts[0] = swizzleLane6(a);
417 verts[1] = swizzleLane7(a);
418 verts[2] = swizzleLane0(b);
419 break;
420 case 7:
421 verts[0] = swizzleLane7(a);
422 verts[1] = swizzleLane1(b);
423 verts[2] = swizzleLane0(b);
424 break;
425 };
426 }
427
PaTriFan0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])428 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
429 {
430 simdvector& a = PaGetSimdVector(pa, pa.cur, slot);
431
432 // Extract vertex 0 to every lane of first vector
433 for(int i = 0; i < 4; ++i)
434 {
435 __m256 a0 = a[i];
436 simdvector& v0 = verts[0];
437 v0[i] = _simd_shuffle_ps(a0, a0, _MM_SHUFFLE(0, 0, 0, 0));
438 v0[i] = _mm256_permute2f128_ps(v0[i], a0, 0x00);
439 }
440
441 // store off leading vertex for attributes
442 simdvertex* pVertex = (simdvertex*)pa.pStreamBase;
443 pa.leadingVertex = pVertex[pa.cur];
444
445 SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
446 return false; // Not enough vertices to assemble 8 triangles.
447 }
448
PaTriFan1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])449 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
450 {
451 simdvector& leadVert = pa.leadingVertex.attrib[slot];
452 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
453 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
454 simdscalar s;
455
456 // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
457 for(int i = 0; i < 4; ++i)
458 {
459 simdscalar a0 = a[i];
460 simdscalar b0 = b[i];
461
462 __m256 comp = leadVert[i];
463 simdvector& v0 = verts[0];
464 v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
465 v0[i] = _mm256_permute2f128_ps(v0[i], comp, 0x00);
466
467 simdvector& v2 = verts[2];
468 s = _mm256_permute2f128_ps(a0, b0, 0x21);
469 v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
470
471 simdvector& v1 = verts[1];
472 v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
473 }
474
475 SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD_WIDTH);
476 return true;
477 }
478
PaTriFanSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])479 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
480 {
481 // vert 0 from leading vertex
482 simdvector& lead = pa.leadingVertex.attrib[slot];
483 verts[0] = swizzleLane0(lead);
484
485 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
486 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
487
488 // vert 1
489 if (primIndex < 7)
490 {
491 verts[1] = swizzleLaneN(a, primIndex + 1);
492 }
493 else
494 {
495 verts[1] = swizzleLane0(b);
496 }
497
498 // vert 2
499 if (primIndex < 6)
500 {
501 verts[2] = swizzleLaneN(a, primIndex + 2);
502 }
503 else
504 {
505 verts[2] = swizzleLaneN(b, primIndex - 6);
506 }
507 }
508
PaQuadList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])509 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
510 {
511 SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
512 return false; // Not enough vertices to assemble 8 triangles.
513 }
514
PaQuadList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])515 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
516 {
517 simdvector& a = PaGetSimdVector(pa, 0, slot);
518 simdvector& b = PaGetSimdVector(pa, 1, slot);
519 simdscalar s1, s2;
520
521 for(int i = 0; i < 4; ++i)
522 {
523 simdscalar a0 = a[i];
524 simdscalar b0 = b[i];
525
526 s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
527 s2 = _mm256_permute2f128_ps(a0, b0, 0x31);
528
529 simdvector& v0 = verts[0];
530 v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
531
532 simdvector& v1 = verts[1];
533 v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
534
535 simdvector& v2 = verts[2];
536 v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
537 }
538
539 SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD_WIDTH, true);
540 return true;
541 }
542
PaQuadListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])543 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
544 {
545 simdvector& a = PaGetSimdVector(pa, 0, slot);
546 simdvector& b = PaGetSimdVector(pa, 1, slot);
547
548 switch (primIndex)
549 {
550 case 0:
551 // triangle 0 - 0 1 2
552 verts[0] = swizzleLane0(a);
553 verts[1] = swizzleLane1(a);
554 verts[2] = swizzleLane2(a);
555 break;
556
557 case 1:
558 // triangle 1 - 0 2 3
559 verts[0] = swizzleLane0(a);
560 verts[1] = swizzleLane2(a);
561 verts[2] = swizzleLane3(a);
562 break;
563
564 case 2:
565 // triangle 2 - 4 5 6
566 verts[0] = swizzleLane4(a);
567 verts[1] = swizzleLane5(a);
568 verts[2] = swizzleLane6(a);
569 break;
570
571 case 3:
572 // triangle 3 - 4 6 7
573 verts[0] = swizzleLane4(a);
574 verts[1] = swizzleLane6(a);
575 verts[2] = swizzleLane7(a);
576 break;
577
578 case 4:
579 // triangle 4 - 8 9 10 (0 1 2)
580 verts[0] = swizzleLane0(b);
581 verts[1] = swizzleLane1(b);
582 verts[2] = swizzleLane2(b);
583 break;
584
585 case 5:
586 // triangle 1 - 0 2 3
587 verts[0] = swizzleLane0(b);
588 verts[1] = swizzleLane2(b);
589 verts[2] = swizzleLane3(b);
590 break;
591
592 case 6:
593 // triangle 2 - 4 5 6
594 verts[0] = swizzleLane4(b);
595 verts[1] = swizzleLane5(b);
596 verts[2] = swizzleLane6(b);
597 break;
598
599 case 7:
600 // triangle 3 - 4 6 7
601 verts[0] = swizzleLane4(b);
602 verts[1] = swizzleLane6(b);
603 verts[2] = swizzleLane7(b);
604 break;
605 }
606 }
607
PaLineLoopSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t lineIndex,__m128 verts[])608 void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
609 {
610 PaLineStripSingle0(pa, slot, lineIndex, verts);
611
612 if (pa.numPrimsComplete + lineIndex == pa.numPrims - 1) {
613 simdvector &start = PaGetSimdVector(pa, pa.first, slot);
614 verts[1] = swizzleLane0(start);
615 }
616 }
617
PaLineLoop0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])618 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
619 {
620 SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0);
621 return false;
622 }
623
PaLineLoop1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])624 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
625 {
626 PaLineStrip1(pa, slot, verts);
627
628 if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1) {
629 // loop reconnect now
630 int lane = pa.numPrims - pa.numPrimsComplete - 1;
631 simdvector &start = PaGetSimdVector(pa, pa.first, slot);
632 for (int i = 0; i < 4; i++) {
633 float *startVtx = (float *)&(start[i]);
634 float *targetVtx = (float *)&(verts[1][i]);
635 targetVtx[lane] = startVtx[0];
636 }
637 }
638
639 SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD_WIDTH);
640 return true;
641 }
642
643
PaLineList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])644 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
645 {
646 SetNextPaState(pa, PaLineList1, PaLineListSingle0);
647 return false; // Not enough vertices to assemble 8 lines
648 }
649
PaLineList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])650 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
651 {
652 simdvector& a = PaGetSimdVector(pa, 0, slot);
653 simdvector& b = PaGetSimdVector(pa, 1, slot);
654 /// @todo: verify provoking vertex is correct
655 // Line list 0 1 2 3 4 5 6 7
656 // 8 9 10 11 12 13 14 15
657
658 // shuffle:
659 // 0 2 4 6 8 10 12 14
660 // 1 3 5 7 9 11 13 15
661
662 for (uint32_t i = 0; i < 4; ++i)
663 {
664 // 0 1 2 3 8 9 10 11
665 __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20);
666 // 4 5 6 7 12 13 14 15
667 __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31);
668
669 // 0 2 4 6 8 10 12 14
670 verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0));
671 // 1 3 5 7 9 11 13 15
672 verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1));
673 }
674
675 SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, KNOB_SIMD_WIDTH, true);
676 return true;
677 }
678
PaLineListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])679 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
680 {
681 simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
682 simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
683
684 switch (primIndex)
685 {
686 case 0:
687 verts[0] = swizzleLane0(a);
688 verts[1] = swizzleLane1(a);
689 break;
690 case 1:
691 verts[0] = swizzleLane2(a);
692 verts[1] = swizzleLane3(a);
693 break;
694 case 2:
695 verts[0] = swizzleLane4(a);
696 verts[1] = swizzleLane5(a);
697 break;
698 case 3:
699 verts[0] = swizzleLane6(a);
700 verts[1] = swizzleLane7(a);
701 break;
702 case 4:
703 verts[0] = swizzleLane0(b);
704 verts[1] = swizzleLane1(b);
705 break;
706 case 5:
707 verts[0] = swizzleLane2(b);
708 verts[1] = swizzleLane3(b);
709 break;
710 case 6:
711 verts[0] = swizzleLane4(b);
712 verts[1] = swizzleLane5(b);
713 break;
714 case 7:
715 verts[0] = swizzleLane6(b);
716 verts[1] = swizzleLane7(b);
717 break;
718 }
719 }
720
PaLineStrip0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])721 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
722 {
723 SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0);
724 return false; // Not enough vertices to assemble 8 lines
725 }
726
PaLineStrip1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])727 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
728 {
729 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
730 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
731
732 /// @todo: verify provoking vertex is correct
733 // Line list 0 1 2 3 4 5 6 7
734 // 8 9 10 11 12 13 14 15
735
736 // shuffle:
737 // 0 1 2 3 4 5 6 7
738 // 1 2 3 4 5 6 7 8
739
740 verts[0] = a;
741
742 for(uint32_t i = 0; i < 4; ++i)
743 {
744 // 1 2 3 x 5 6 7 x
745 __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
746 // 4 5 6 7 8 9 10 11
747 __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21);
748
749 // x x x 4 x x x 8
750 __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low (0 0 0 0)
751
752 verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88);
753 }
754
755 SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, KNOB_SIMD_WIDTH);
756 return true;
757 }
758
PaLineStripSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t lineIndex,__m128 verts[])759 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
760 {
761 simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
762 simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
763
764 switch (lineIndex)
765 {
766 case 0:
767 verts[0] = swizzleLane0(a);
768 verts[1] = swizzleLane1(a);
769 break;
770 case 1:
771 verts[0] = swizzleLane1(a);
772 verts[1] = swizzleLane2(a);
773 break;
774 case 2:
775 verts[0] = swizzleLane2(a);
776 verts[1] = swizzleLane3(a);
777 break;
778 case 3:
779 verts[0] = swizzleLane3(a);
780 verts[1] = swizzleLane4(a);
781 break;
782 case 4:
783 verts[0] = swizzleLane4(a);
784 verts[1] = swizzleLane5(a);
785 break;
786 case 5:
787 verts[0] = swizzleLane5(a);
788 verts[1] = swizzleLane6(a);
789 break;
790 case 6:
791 verts[0] = swizzleLane6(a);
792 verts[1] = swizzleLane7(a);
793 break;
794 case 7:
795 verts[0] = swizzleLane7(a);
796 verts[1] = swizzleLane0(b);
797 break;
798 }
799 }
800
PaPoints0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])801 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
802 {
803 simdvector& a = PaGetSimdVector(pa, pa.cur, slot);
804
805 verts[0] = a; // points only have 1 vertex.
806
807 SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, KNOB_SIMD_WIDTH, true);
808 return true;
809 }
810
PaPointsSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])811 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
812 {
813 simdvector &a = PaGetSimdVector(pa, pa.cur, slot);
814 switch(primIndex)
815 {
816 case 0:
817 verts[0] = swizzleLane0(a);
818 break;
819 case 1:
820 verts[0] = swizzleLane1(a);
821 break;
822 case 2:
823 verts[0] = swizzleLane2(a);
824 break;
825 case 3:
826 verts[0] = swizzleLane3(a);
827 break;
828 case 4:
829 verts[0] = swizzleLane4(a);
830 break;
831 case 5:
832 verts[0] = swizzleLane5(a);
833 break;
834 case 6:
835 verts[0] = swizzleLane6(a);
836 break;
837 case 7:
838 verts[0] = swizzleLane7(a);
839 break;
840 }
841 }
842
843 //////////////////////////////////////////////////////////////////////////
844 /// @brief State 1 for RECT_LIST topology.
845 /// There is not enough to assemble 8 triangles.
PaRectList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])846 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
847 {
848 SetNextPaState(pa, PaRectList1, PaRectListSingle0);
849 return false;
850 }
851
852 //////////////////////////////////////////////////////////////////////////
853 /// @brief State 1 for RECT_LIST topology.
854 /// Rect lists has the following format.
855 /// w x y z
856 /// v2 o---o v5 o---o v8 o---o v11 o---o
857 /// | \ | | \ | | \ | | \ |
858 /// v1 o---o v4 o---o v7 o---o v10 o---o
859 /// v0 v3 v6 v9
860 ///
861 /// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
862 ///
863 /// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
864 /// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
865 /// etc.
866 ///
867 /// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
868 /// where v0 contains all the first vertices for 8 triangles.
869 ///
870 /// Result:
871 /// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
872 /// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
873 /// verts[2] = { v2, w, v5, x, v8, y, v11, z }
874 ///
875 /// @param pa - State for PA state machine.
876 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
877 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
PaRectList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])878 bool PaRectList1(
879 PA_STATE_OPT& pa,
880 uint32_t slot,
881 simdvector verts[])
882 {
883 // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
884 simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 }
885 simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
886
887 __m256 tmp0, tmp1, tmp2;
888
889 // Loop over each component in the simdvector.
890 for(int i = 0; i < 4; ++i)
891 {
892 simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
893 tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
894 v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
895 tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
896 v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
897 v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
898
899 /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
900 /// AVX2 should make this much cheaper.
901 simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
902 v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
903 tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
904 tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
905 tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * }
906 v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
907 v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
908 v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
909
910 // verts[2] = { v2, w, v5, x, v8, y, v11, z }
911 simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
912 v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
913 tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
914 v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0);
915
916 // Need to compute 4th implied vertex for the rectangle.
917 tmp2 = _mm256_sub_ps(v0[i], v1[i]);
918 tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * }
919 tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
920 v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
921 }
922
923 SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
924 return true;
925 }
926
927 //////////////////////////////////////////////////////////////////////////
928 /// @brief State 2 for RECT_LIST topology.
929 /// Not implemented unless there is a use case for more then 8 rects.
930 /// @param pa - State for PA state machine.
931 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
932 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
PaRectList2(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])933 bool PaRectList2(
934 PA_STATE_OPT& pa,
935 uint32_t slot,
936 simdvector verts[])
937 {
938 SWR_ASSERT(0); // Is rect list used for anything other then clears?
939 SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
940 return true;
941 }
942
943 //////////////////////////////////////////////////////////////////////////
944 /// @brief This procedure is called by the Binner to assemble the attributes.
945 /// Unlike position, which is stored vertically, the attributes are
946 /// stored horizontally. The outputs from the VS, labeled as 'a' and
947 /// 'b' are vertical. This function needs to transpose the lanes
948 /// containing the vertical attribute data into horizontal form.
949 /// @param pa - State for PA state machine.
950 /// @param slot - Index into VS output for a given attribute.
951 /// @param primIndex - Binner processes each triangle individually.
952 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
PaRectListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])953 void PaRectListSingle0(
954 PA_STATE_OPT& pa,
955 uint32_t slot,
956 uint32_t primIndex,
957 __m128 verts[])
958 {
959 // We have 12 simdscalars contained within 3 simdvectors which
960 // hold at least 8 triangles worth of data. We want to assemble a single
961 // triangle with data in horizontal form.
962 simdvector& a = PaGetSimdVector(pa, 0, slot);
963
964 // Convert from vertical to horizontal.
965 switch(primIndex)
966 {
967 case 0:
968 verts[0] = swizzleLane0(a);
969 verts[1] = swizzleLane1(a);
970 verts[2] = swizzleLane2(a);
971 break;
972 case 1:
973 verts[0] = swizzleLane0(a);
974 verts[1] = swizzleLane2(a);
975 verts[2] = _mm_blend_ps(verts[0], verts[1], 0x2);
976 break;
977 case 2:
978 case 3:
979 case 4:
980 case 5:
981 case 6:
982 case 7:
983 SWR_ASSERT(0);
984 break;
985 };
986 }
987
PA_STATE_OPT(DRAW_CONTEXT * in_pDC,uint32_t in_numPrims,uint8_t * pStream,uint32_t in_streamSizeInVerts,bool in_isStreaming,PRIMITIVE_TOPOLOGY topo)988 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts,
989 bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : PA_STATE(in_pDC, pStream, in_streamSizeInVerts), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0),
990 cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
991 {
992 const API_STATE& state = GetApiState(pDC);
993
994 this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo;
995
996 switch (this->binTopology)
997 {
998 case TOP_TRIANGLE_LIST:
999 this->pfnPaFunc = PaTriList0;
1000 break;
1001 case TOP_TRIANGLE_STRIP:
1002 this->pfnPaFunc = PaTriStrip0;
1003 break;
1004 case TOP_TRIANGLE_FAN:
1005 this->pfnPaFunc = PaTriFan0;
1006 break;
1007 case TOP_QUAD_LIST:
1008 this->pfnPaFunc = PaQuadList0;
1009 this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
1010 break;
1011 case TOP_QUAD_STRIP:
1012 // quad strip pattern when decomposed into triangles is the same as verts strips
1013 this->pfnPaFunc = PaTriStrip0;
1014 this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
1015 break;
1016 case TOP_LINE_LIST:
1017 this->pfnPaFunc = PaLineList0;
1018 this->numPrims = in_numPrims;
1019 break;
1020 case TOP_LINE_STRIP:
1021 this->pfnPaFunc = PaLineStrip0;
1022 this->numPrims = in_numPrims;
1023 break;
1024 case TOP_LINE_LOOP:
1025 this->pfnPaFunc = PaLineLoop0;
1026 this->numPrims = in_numPrims;
1027 break;
1028 case TOP_POINT_LIST:
1029 // use point binner and rasterizer if supported
1030 this->pfnPaFunc = PaPoints0;
1031 this->numPrims = in_numPrims;
1032 break;
1033 case TOP_RECT_LIST:
1034 this->pfnPaFunc = PaRectList0;
1035 this->numPrims = in_numPrims * 2;
1036 break;
1037
1038 case TOP_PATCHLIST_1:
1039 this->pfnPaFunc = PaPatchList<1>;
1040 break;
1041 case TOP_PATCHLIST_2:
1042 this->pfnPaFunc = PaPatchList<2>;
1043 break;
1044 case TOP_PATCHLIST_3:
1045 this->pfnPaFunc = PaPatchList<3>;
1046 break;
1047 case TOP_PATCHLIST_4:
1048 this->pfnPaFunc = PaPatchList<4>;
1049 break;
1050 case TOP_PATCHLIST_5:
1051 this->pfnPaFunc = PaPatchList<5>;
1052 break;
1053 case TOP_PATCHLIST_6:
1054 this->pfnPaFunc = PaPatchList<6>;
1055 break;
1056 case TOP_PATCHLIST_7:
1057 this->pfnPaFunc = PaPatchList<7>;
1058 break;
1059 case TOP_PATCHLIST_8:
1060 this->pfnPaFunc = PaPatchList<8>;
1061 break;
1062 case TOP_PATCHLIST_9:
1063 this->pfnPaFunc = PaPatchList<9>;
1064 break;
1065 case TOP_PATCHLIST_10:
1066 this->pfnPaFunc = PaPatchList<10>;
1067 break;
1068 case TOP_PATCHLIST_11:
1069 this->pfnPaFunc = PaPatchList<11>;
1070 break;
1071 case TOP_PATCHLIST_12:
1072 this->pfnPaFunc = PaPatchList<12>;
1073 break;
1074 case TOP_PATCHLIST_13:
1075 this->pfnPaFunc = PaPatchList<13>;
1076 break;
1077 case TOP_PATCHLIST_14:
1078 this->pfnPaFunc = PaPatchList<14>;
1079 break;
1080 case TOP_PATCHLIST_15:
1081 this->pfnPaFunc = PaPatchList<15>;
1082 break;
1083 case TOP_PATCHLIST_16:
1084 this->pfnPaFunc = PaPatchList<16>;
1085 break;
1086 case TOP_PATCHLIST_17:
1087 this->pfnPaFunc = PaPatchList<17>;
1088 break;
1089 case TOP_PATCHLIST_18:
1090 this->pfnPaFunc = PaPatchList<18>;
1091 break;
1092 case TOP_PATCHLIST_19:
1093 this->pfnPaFunc = PaPatchList<19>;
1094 break;
1095 case TOP_PATCHLIST_20:
1096 this->pfnPaFunc = PaPatchList<20>;
1097 break;
1098 case TOP_PATCHLIST_21:
1099 this->pfnPaFunc = PaPatchList<21>;
1100 break;
1101 case TOP_PATCHLIST_22:
1102 this->pfnPaFunc = PaPatchList<22>;
1103 break;
1104 case TOP_PATCHLIST_23:
1105 this->pfnPaFunc = PaPatchList<23>;
1106 break;
1107 case TOP_PATCHLIST_24:
1108 this->pfnPaFunc = PaPatchList<24>;
1109 break;
1110 case TOP_PATCHLIST_25:
1111 this->pfnPaFunc = PaPatchList<25>;
1112 break;
1113 case TOP_PATCHLIST_26:
1114 this->pfnPaFunc = PaPatchList<26>;
1115 break;
1116 case TOP_PATCHLIST_27:
1117 this->pfnPaFunc = PaPatchList<27>;
1118 break;
1119 case TOP_PATCHLIST_28:
1120 this->pfnPaFunc = PaPatchList<28>;
1121 break;
1122 case TOP_PATCHLIST_29:
1123 this->pfnPaFunc = PaPatchList<29>;
1124 break;
1125 case TOP_PATCHLIST_30:
1126 this->pfnPaFunc = PaPatchList<30>;
1127 break;
1128 case TOP_PATCHLIST_31:
1129 this->pfnPaFunc = PaPatchList<31>;
1130 break;
1131 case TOP_PATCHLIST_32:
1132 this->pfnPaFunc = PaPatchList<32>;
1133 break;
1134
1135 default:
1136 SWR_ASSERT(0);
1137 break;
1138 };
1139
1140 this->pfnPaFuncReset = this->pfnPaFunc;
1141
1142 // simdscalari id8 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
1143 // simdscalari id4 = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3);
1144 simdscalari id8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1145 simdscalari id4 = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
1146
1147 switch(this->binTopology)
1148 {
1149 case TOP_TRIANGLE_LIST:
1150 case TOP_TRIANGLE_STRIP:
1151 case TOP_TRIANGLE_FAN:
1152 case TOP_LINE_STRIP:
1153 case TOP_LINE_LIST:
1154 case TOP_LINE_LOOP:
1155 this->primIDIncr = 8;
1156 this->primID = id8;
1157 break;
1158 case TOP_QUAD_LIST:
1159 case TOP_QUAD_STRIP:
1160 case TOP_RECT_LIST:
1161 this->primIDIncr = 4;
1162 this->primID = id4;
1163 break;
1164 case TOP_POINT_LIST:
1165 this->primIDIncr = 8;
1166 this->primID = id8;
1167 break;
1168 case TOP_PATCHLIST_1:
1169 case TOP_PATCHLIST_2:
1170 case TOP_PATCHLIST_3:
1171 case TOP_PATCHLIST_4:
1172 case TOP_PATCHLIST_5:
1173 case TOP_PATCHLIST_6:
1174 case TOP_PATCHLIST_7:
1175 case TOP_PATCHLIST_8:
1176 case TOP_PATCHLIST_9:
1177 case TOP_PATCHLIST_10:
1178 case TOP_PATCHLIST_11:
1179 case TOP_PATCHLIST_12:
1180 case TOP_PATCHLIST_13:
1181 case TOP_PATCHLIST_14:
1182 case TOP_PATCHLIST_15:
1183 case TOP_PATCHLIST_16:
1184 case TOP_PATCHLIST_17:
1185 case TOP_PATCHLIST_18:
1186 case TOP_PATCHLIST_19:
1187 case TOP_PATCHLIST_20:
1188 case TOP_PATCHLIST_21:
1189 case TOP_PATCHLIST_22:
1190 case TOP_PATCHLIST_23:
1191 case TOP_PATCHLIST_24:
1192 case TOP_PATCHLIST_25:
1193 case TOP_PATCHLIST_26:
1194 case TOP_PATCHLIST_27:
1195 case TOP_PATCHLIST_28:
1196 case TOP_PATCHLIST_29:
1197 case TOP_PATCHLIST_30:
1198 case TOP_PATCHLIST_31:
1199 case TOP_PATCHLIST_32:
1200 // Always run KNOB_SIMD_WIDTH number of patches at a time.
1201 this->primIDIncr = 8;
1202 this->primID = id8;
1203 break;
1204
1205 default:
1206 SWR_ASSERT(0);
1207 break;
1208 };
1209
1210 }
1211 #endif
1212