• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa_avx.cpp
24 *
25 * @brief AVX implementation for primitive assembly.
26 *        N primitives are assembled at a time, where N is the SIMD width.
27 *        A state machine, that is specific for a given topology, drives the
28 *        assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #include "context.h"
32 #include "pa.h"
33 #include "frontend.h"
34 
35 #if (KNOB_SIMD_WIDTH == 8)
36 
37 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
38 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
39 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
40 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
41 
42 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
43 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
44 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
45 
46 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
47 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
48 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
49 
50 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
51 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
52 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
53 
54 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
55 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
56 
57 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
58 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
59 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t index, __m128 verts[]);
60 
61 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
62 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
63 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 lineverts[]);
64 
65 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
66 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
67 
68 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
69 bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
70 bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
71 void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
72 
73 template <uint32_t TotalControlPoints>
PaPatchListSingle(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])74 void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
75 {
76     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
77     // KNOB_SIMD_WIDTH * 1 patch.  This function is called once per attribute.
78     // Each attribute has 4 components.
79 
80     /// @todo Optimize this
81 
82     float* pOutVec = (float*)verts;
83 
84     for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
85     {
86         uint32_t input_cp = primIndex * TotalControlPoints + cp;
87         uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
88         uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
89 
90         // Loop over all components of the attribute
91         for (uint32_t i = 0; i < 4; ++i)
92         {
93             const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
94             pOutVec[cp * 4 + i] = pInputVec[input_lane];
95         }
96     }
97 }
98 
99 template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
PaPatchList(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])100 static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
101 {
102     SetNextPaState(
103         pa,
104         PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
105         PaPatchListSingle<TotalControlPoints>);
106 
107     return false;
108 }
109 
110 template<uint32_t TotalControlPoints>
PaPatchListTerm(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])111 static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
112 {
113     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
114     // KNOB_SIMD_WIDTH * 1 patch.  This function is called once per attribute.
115     // Each attribute has 4 components.
116 
117     /// @todo Optimize this
118 
119     // Loop over all components of the attribute
120     for (uint32_t i = 0; i < 4; ++i)
121     {
122         for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
123         {
124             float vec[KNOB_SIMD_WIDTH];
125             for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
126             {
127                 uint32_t input_cp = lane * TotalControlPoints + cp;
128                 uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
129                 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
130 
131                 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
132                 vec[lane] = pInputVec[input_lane];
133             }
134             verts[cp][i] = _simd_loadu_ps(vec);
135         }
136     }
137 
138     SetNextPaState(
139         pa,
140         PaPatchList<TotalControlPoints>,
141         PaPatchListSingle<TotalControlPoints>,
142         0,
143         KNOB_SIMD_WIDTH,
144         true);
145 
146     return true;
147 }
148 
149 #define PA_PATCH_LIST_TERMINATOR(N) \
150     template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\
151                            { return PaPatchListTerm<N>(pa, slot, verts); }
152 PA_PATCH_LIST_TERMINATOR(1)
153 PA_PATCH_LIST_TERMINATOR(2)
154 PA_PATCH_LIST_TERMINATOR(3)
155 PA_PATCH_LIST_TERMINATOR(4)
156 PA_PATCH_LIST_TERMINATOR(5)
157 PA_PATCH_LIST_TERMINATOR(6)
158 PA_PATCH_LIST_TERMINATOR(7)
159 PA_PATCH_LIST_TERMINATOR(8)
160 PA_PATCH_LIST_TERMINATOR(9)
161 PA_PATCH_LIST_TERMINATOR(10)
162 PA_PATCH_LIST_TERMINATOR(11)
163 PA_PATCH_LIST_TERMINATOR(12)
164 PA_PATCH_LIST_TERMINATOR(13)
165 PA_PATCH_LIST_TERMINATOR(14)
166 PA_PATCH_LIST_TERMINATOR(15)
167 PA_PATCH_LIST_TERMINATOR(16)
168 PA_PATCH_LIST_TERMINATOR(17)
169 PA_PATCH_LIST_TERMINATOR(18)
170 PA_PATCH_LIST_TERMINATOR(19)
171 PA_PATCH_LIST_TERMINATOR(20)
172 PA_PATCH_LIST_TERMINATOR(21)
173 PA_PATCH_LIST_TERMINATOR(22)
174 PA_PATCH_LIST_TERMINATOR(23)
175 PA_PATCH_LIST_TERMINATOR(24)
176 PA_PATCH_LIST_TERMINATOR(25)
177 PA_PATCH_LIST_TERMINATOR(26)
178 PA_PATCH_LIST_TERMINATOR(27)
179 PA_PATCH_LIST_TERMINATOR(28)
180 PA_PATCH_LIST_TERMINATOR(29)
181 PA_PATCH_LIST_TERMINATOR(30)
182 PA_PATCH_LIST_TERMINATOR(31)
183 PA_PATCH_LIST_TERMINATOR(32)
184 #undef PA_PATCH_LIST_TERMINATOR
185 
PaTriList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])186 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
187 {
188     SetNextPaState(pa, PaTriList1, PaTriListSingle0);
189     return false;    // Not enough vertices to assemble 4 or 8 triangles.
190 }
191 
PaTriList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])192 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
193 {
194     SetNextPaState(pa, PaTriList2, PaTriListSingle0);
195     return false;    // Not enough vertices to assemble 8 triangles.
196 }
197 
PaTriList2(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])198 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
199 {
200 #if KNOB_ARCH == KNOB_ARCH_AVX
201 
202     simdvector& a = PaGetSimdVector(pa, 0, slot);
203     simdvector& b = PaGetSimdVector(pa, 1, slot);
204     simdvector& c = PaGetSimdVector(pa, 2, slot);
205     simdscalar    s;
206 
207     // Tri Pattern - provoking vertex is always v0
208     //  v0 -> 0 3 6 9  12 15 18 21
209     //  v1 -> 1 4 7 10 13 16 19 22
210     //  v2 -> 2 5 8 11 14 17 20 23
211 
212     for (int i = 0; i < 4; ++i)
213     {
214         simdvector& v0 = verts[0];
215         v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
216         v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
217         v0[i] = _mm256_permute_ps(v0[i], 0x6C);
218         s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21);
219         v0[i] = _simd_blend_ps(v0[i], s, 0x44);
220 
221         simdvector& v1 = verts[1];
222         v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
223         v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
224         v1[i] = _mm256_permute_ps(v1[i], 0xB1);
225         s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21);
226         v1[i] = _simd_blend_ps(v1[i], s, 0x66);
227 
228         simdvector& v2 = verts[2];
229         v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
230         v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
231         v2[i] = _mm256_permute_ps(v2[i], 0xC6);
232         s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21);
233         v2[i] = _simd_blend_ps(v2[i], s, 0x22);
234     }
235 
236 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
237 
238     simdvector &a = PaGetSimdVector(pa, 0, slot);
239     simdvector &b = PaGetSimdVector(pa, 1, slot);
240     simdvector &c = PaGetSimdVector(pa, 2, slot);
241 
242     //  v0 -> a0 a3 a6 b1 b4 b7 c2 c5
243     //  v1 -> a1 a4 a7 b2 b5 c0 c3 c6
244     //  v2 -> a2 a5 b0 b3 b6 c1 c4 c7
245 
246     const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
247     const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
248     const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
249 
250     simdvector &v0 = verts[0];
251     simdvector &v1 = verts[1];
252     simdvector &v2 = verts[2];
253 
254     for (int i = 0; i < 4; ++i)
255     {
256         v0[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24);
257         v0[i] = _mm256_permutevar8x32_ps(v0[i], perm0);
258 
259         v1[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49);
260         v1[i] = _mm256_permutevar8x32_ps(v1[i], perm1);
261 
262         v2[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92);
263         v2[i] = _mm256_permutevar8x32_ps(v2[i], perm2);
264     }
265 
266 #endif
267 
268     SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD_WIDTH, true);
269     return true;
270 }
271 
PaTriListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])272 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
273 {
274     // We have 12 simdscalars contained within 3 simdvectors which
275     // hold at least 8 triangles worth of data. We want to assemble a single
276     // triangle with data in horizontal form.
277     simdvector& a = PaGetSimdVector(pa, 0, slot);
278     simdvector& b = PaGetSimdVector(pa, 1, slot);
279     simdvector& c = PaGetSimdVector(pa, 2, slot);
280 
281     // Convert from vertical to horizontal.
282     // Tri Pattern - provoking vertex is always v0
283     //  v0 -> 0 3 6 9  12 15 18 21
284     //  v1 -> 1 4 7 10 13 16 19 22
285     //  v2 -> 2 5 8 11 14 17 20 23
286     switch(primIndex)
287     {
288     case 0:
289         verts[0] = swizzleLane0(a);
290         verts[1] = swizzleLane1(a);
291         verts[2] = swizzleLane2(a);
292         break;
293     case 1:
294         verts[0] = swizzleLane3(a);
295         verts[1] = swizzleLane4(a);
296         verts[2] = swizzleLane5(a);
297         break;
298     case 2:
299         verts[0] = swizzleLane6(a);
300         verts[1] = swizzleLane7(a);
301         verts[2] = swizzleLane0(b);
302         break;
303     case 3:
304         verts[0] = swizzleLane1(b);
305         verts[1] = swizzleLane2(b);
306         verts[2] = swizzleLane3(b);
307         break;
308     case 4:
309         verts[0] = swizzleLane4(b);
310         verts[1] = swizzleLane5(b);
311         verts[2] = swizzleLane6(b);
312         break;
313     case 5:
314         verts[0] = swizzleLane7(b);
315         verts[1] = swizzleLane0(c);
316         verts[2] = swizzleLane1(c);
317         break;
318     case 6:
319         verts[0] = swizzleLane2(c);
320         verts[1] = swizzleLane3(c);
321         verts[2] = swizzleLane4(c);
322         break;
323     case 7:
324         verts[0] = swizzleLane5(c);
325         verts[1] = swizzleLane6(c);
326         verts[2] = swizzleLane7(c);
327         break;
328     };
329 }
330 
PaTriStrip0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])331 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
332 {
333     SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
334     return false;    // Not enough vertices to assemble 8 triangles.
335 }
336 
PaTriStrip1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])337 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
338 {
339     simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
340     simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
341     simdscalar  s;
342 
343     for(int i = 0; i < 4; ++i)
344     {
345         simdscalar a0 = a[i];
346         simdscalar b0 = b[i];
347 
348         // Tri Pattern - provoking vertex is always v0
349         //  v0 -> 01234567
350         //  v1 -> 13355779
351         //  v2 -> 22446688
352         simdvector& v0 = verts[0];
353         v0[i] = a0;
354 
355         //  s -> 4567891011
356         s = _mm256_permute2f128_ps(a0, b0, 0x21);
357         //  s -> 23456789
358         s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
359 
360         simdvector& v1 = verts[1];
361         //  v1 -> 13355779
362         v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1));
363 
364         simdvector& v2 = verts[2];
365         //  v2 -> 22446688
366         v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2));
367     }
368 
369     SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD_WIDTH);
370     return true;
371 }
372 
PaTriStripSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])373 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
374 {
375     simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
376     simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
377 
378     // Convert from vertical to horizontal.
379     // Tri Pattern - provoking vertex is always v0
380     //  v0 -> 01234567
381     //  v1 -> 13355779
382     //  v2 -> 22446688
383     switch(primIndex)
384     {
385     case 0:
386         verts[0] = swizzleLane0(a);
387         verts[1] = swizzleLane1(a);
388         verts[2] = swizzleLane2(a);
389         break;
390     case 1:
391         verts[0] = swizzleLane1(a);
392         verts[1] = swizzleLane3(a);
393         verts[2] = swizzleLane2(a);
394         break;
395     case 2:
396         verts[0] = swizzleLane2(a);
397         verts[1] = swizzleLane3(a);
398         verts[2] = swizzleLane4(a);
399         break;
400     case 3:
401         verts[0] = swizzleLane3(a);
402         verts[1] = swizzleLane5(a);
403         verts[2] = swizzleLane4(a);
404         break;
405     case 4:
406         verts[0] = swizzleLane4(a);
407         verts[1] = swizzleLane5(a);
408         verts[2] = swizzleLane6(a);
409         break;
410     case 5:
411         verts[0] = swizzleLane5(a);
412         verts[1] = swizzleLane7(a);
413         verts[2] = swizzleLane6(a);
414         break;
415     case 6:
416         verts[0] = swizzleLane6(a);
417         verts[1] = swizzleLane7(a);
418         verts[2] = swizzleLane0(b);
419         break;
420     case 7:
421         verts[0] = swizzleLane7(a);
422         verts[1] = swizzleLane1(b);
423         verts[2] = swizzleLane0(b);
424         break;
425     };
426 }
427 
PaTriFan0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])428 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
429 {
430     simdvector& a = PaGetSimdVector(pa, pa.cur, slot);
431 
432     // Extract vertex 0 to every lane of first vector
433     for(int i = 0; i < 4; ++i)
434     {
435         __m256 a0 = a[i];
436         simdvector& v0 = verts[0];
437         v0[i] = _simd_shuffle_ps(a0, a0, _MM_SHUFFLE(0, 0, 0, 0));
438         v0[i] = _mm256_permute2f128_ps(v0[i], a0, 0x00);
439     }
440 
441     // store off leading vertex for attributes
442     simdvertex* pVertex = (simdvertex*)pa.pStreamBase;
443     pa.leadingVertex = pVertex[pa.cur];
444 
445     SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
446     return false;    // Not enough vertices to assemble 8 triangles.
447 }
448 
PaTriFan1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])449 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
450 {
451     simdvector& leadVert = pa.leadingVertex.attrib[slot];
452     simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
453     simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
454     simdscalar    s;
455 
456     // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
457     for(int i = 0; i < 4; ++i)
458     {
459         simdscalar a0 = a[i];
460         simdscalar b0 = b[i];
461 
462         __m256 comp = leadVert[i];
463         simdvector& v0 = verts[0];
464         v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
465         v0[i] = _mm256_permute2f128_ps(v0[i], comp, 0x00);
466 
467         simdvector& v2 = verts[2];
468         s = _mm256_permute2f128_ps(a0, b0, 0x21);
469         v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
470 
471         simdvector& v1 = verts[1];
472         v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
473     }
474 
475     SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD_WIDTH);
476     return true;
477 }
478 
PaTriFanSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])479 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
480 {
481     // vert 0 from leading vertex
482     simdvector& lead = pa.leadingVertex.attrib[slot];
483     verts[0] = swizzleLane0(lead);
484 
485     simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
486     simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
487 
488     // vert 1
489     if (primIndex < 7)
490     {
491         verts[1] = swizzleLaneN(a, primIndex + 1);
492     }
493     else
494     {
495         verts[1] = swizzleLane0(b);
496     }
497 
498     // vert 2
499     if (primIndex < 6)
500     {
501         verts[2] = swizzleLaneN(a, primIndex + 2);
502     }
503     else
504     {
505         verts[2] = swizzleLaneN(b, primIndex - 6);
506     }
507 }
508 
PaQuadList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])509 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
510 {
511     SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
512     return false;    // Not enough vertices to assemble 8 triangles.
513 }
514 
PaQuadList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])515 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
516 {
517     simdvector& a = PaGetSimdVector(pa, 0, slot);
518     simdvector& b = PaGetSimdVector(pa, 1, slot);
519     simdscalar    s1, s2;
520 
521     for(int i = 0; i < 4; ++i)
522     {
523         simdscalar a0 = a[i];
524         simdscalar b0 = b[i];
525 
526         s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
527         s2 = _mm256_permute2f128_ps(a0, b0, 0x31);
528 
529         simdvector& v0 = verts[0];
530         v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
531 
532         simdvector& v1 = verts[1];
533         v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
534 
535         simdvector& v2 = verts[2];
536         v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
537     }
538 
539     SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD_WIDTH, true);
540     return true;
541 }
542 
PaQuadListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])543 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
544 {
545     simdvector& a = PaGetSimdVector(pa, 0, slot);
546     simdvector& b = PaGetSimdVector(pa, 1, slot);
547 
548     switch (primIndex)
549     {
550     case 0:
551         // triangle 0 - 0 1 2
552         verts[0] = swizzleLane0(a);
553         verts[1] = swizzleLane1(a);
554         verts[2] = swizzleLane2(a);
555         break;
556 
557     case 1:
558         // triangle 1 - 0 2 3
559         verts[0] = swizzleLane0(a);
560         verts[1] = swizzleLane2(a);
561         verts[2] = swizzleLane3(a);
562         break;
563 
564     case 2:
565         // triangle 2 - 4 5 6
566         verts[0] = swizzleLane4(a);
567         verts[1] = swizzleLane5(a);
568         verts[2] = swizzleLane6(a);
569         break;
570 
571     case 3:
572         // triangle 3 - 4 6 7
573         verts[0] = swizzleLane4(a);
574         verts[1] = swizzleLane6(a);
575         verts[2] = swizzleLane7(a);
576         break;
577 
578     case 4:
579         // triangle 4 - 8 9 10 (0 1 2)
580         verts[0] = swizzleLane0(b);
581         verts[1] = swizzleLane1(b);
582         verts[2] = swizzleLane2(b);
583         break;
584 
585     case 5:
586         // triangle 1 - 0 2 3
587         verts[0] = swizzleLane0(b);
588         verts[1] = swizzleLane2(b);
589         verts[2] = swizzleLane3(b);
590         break;
591 
592     case 6:
593         // triangle 2 - 4 5 6
594         verts[0] = swizzleLane4(b);
595         verts[1] = swizzleLane5(b);
596         verts[2] = swizzleLane6(b);
597         break;
598 
599     case 7:
600         // triangle 3 - 4 6 7
601         verts[0] = swizzleLane4(b);
602         verts[1] = swizzleLane6(b);
603         verts[2] = swizzleLane7(b);
604         break;
605     }
606 }
607 
PaLineLoopSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t lineIndex,__m128 verts[])608 void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
609 {
610     PaLineStripSingle0(pa, slot, lineIndex, verts);
611 
612     if (pa.numPrimsComplete + lineIndex == pa.numPrims - 1) {
613         simdvector &start = PaGetSimdVector(pa, pa.first, slot);
614         verts[1] = swizzleLane0(start);
615     }
616 }
617 
PaLineLoop0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])618 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
619 {
620     SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0);
621     return false;
622 }
623 
PaLineLoop1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])624 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
625 {
626     PaLineStrip1(pa, slot, verts);
627 
628     if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1) {
629         // loop reconnect now
630         int lane = pa.numPrims - pa.numPrimsComplete - 1;
631         simdvector &start = PaGetSimdVector(pa, pa.first, slot);
632         for (int i = 0; i < 4; i++) {
633             float *startVtx = (float *)&(start[i]);
634             float *targetVtx = (float *)&(verts[1][i]);
635             targetVtx[lane] = startVtx[0];
636         }
637     }
638 
639     SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD_WIDTH);
640     return true;
641 }
642 
643 
PaLineList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])644 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
645 {
646     SetNextPaState(pa, PaLineList1, PaLineListSingle0);
647     return false;    // Not enough vertices to assemble 8 lines
648 }
649 
PaLineList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])650 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
651 {
652     simdvector& a = PaGetSimdVector(pa, 0, slot);
653     simdvector& b = PaGetSimdVector(pa, 1, slot);
654     /// @todo: verify provoking vertex is correct
655     // Line list 0  1  2  3  4  5  6  7
656     //           8  9 10 11 12 13 14 15
657 
658     // shuffle:
659     //           0 2 4 6 8 10 12 14
660     //           1 3 5 7 9 11 13 15
661 
662     for (uint32_t i = 0; i < 4; ++i)
663     {
664         // 0 1 2 3 8 9 10 11
665         __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20);
666         // 4 5 6 7 12 13 14 15
667         __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31);
668 
669         // 0 2 4 6 8 10 12 14
670         verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0));
671         // 1 3 5 7 9 11 13 15
672         verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1));
673     }
674 
675     SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, KNOB_SIMD_WIDTH, true);
676     return true;
677 }
678 
PaLineListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])679 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
680 {
681     simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
682     simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
683 
684     switch (primIndex)
685     {
686     case 0:
687         verts[0] = swizzleLane0(a);
688         verts[1] = swizzleLane1(a);
689         break;
690     case 1:
691         verts[0] = swizzleLane2(a);
692         verts[1] = swizzleLane3(a);
693         break;
694     case 2:
695         verts[0] = swizzleLane4(a);
696         verts[1] = swizzleLane5(a);
697         break;
698     case 3:
699         verts[0] = swizzleLane6(a);
700         verts[1] = swizzleLane7(a);
701         break;
702     case 4:
703         verts[0] = swizzleLane0(b);
704         verts[1] = swizzleLane1(b);
705         break;
706     case 5:
707         verts[0] = swizzleLane2(b);
708         verts[1] = swizzleLane3(b);
709         break;
710     case 6:
711         verts[0] = swizzleLane4(b);
712         verts[1] = swizzleLane5(b);
713         break;
714     case 7:
715         verts[0] = swizzleLane6(b);
716         verts[1] = swizzleLane7(b);
717         break;
718     }
719 }
720 
PaLineStrip0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])721 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
722 {
723     SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0);
724     return false;    // Not enough vertices to assemble 8 lines
725 }
726 
PaLineStrip1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])727 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
728 {
729     simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
730     simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
731 
732     /// @todo: verify provoking vertex is correct
733     // Line list 0  1  2  3  4  5  6  7
734     //           8  9 10 11 12 13 14 15
735 
736     // shuffle:
737     //           0  1  2  3  4  5  6  7
738     //           1  2  3  4  5  6  7  8
739 
740     verts[0] = a;
741 
742     for(uint32_t i = 0; i < 4; ++i)
743     {
744         // 1 2 3 x 5 6 7 x
745         __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
746         // 4 5 6 7 8 9 10 11
747         __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21);
748 
749         // x x x 4 x x x 8
750         __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low  (0 0 0 0)
751 
752         verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88);
753     }
754 
755     SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, KNOB_SIMD_WIDTH);
756     return true;
757 }
758 
PaLineStripSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t lineIndex,__m128 verts[])759 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
760 {
761     simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
762     simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
763 
764     switch (lineIndex)
765     {
766     case 0:
767         verts[0] = swizzleLane0(a);
768         verts[1] = swizzleLane1(a);
769         break;
770     case 1:
771         verts[0] = swizzleLane1(a);
772         verts[1] = swizzleLane2(a);
773         break;
774     case 2:
775         verts[0] = swizzleLane2(a);
776         verts[1] = swizzleLane3(a);
777         break;
778     case 3:
779         verts[0] = swizzleLane3(a);
780         verts[1] = swizzleLane4(a);
781         break;
782     case 4:
783         verts[0] = swizzleLane4(a);
784         verts[1] = swizzleLane5(a);
785         break;
786     case 5:
787         verts[0] = swizzleLane5(a);
788         verts[1] = swizzleLane6(a);
789         break;
790     case 6:
791         verts[0] = swizzleLane6(a);
792         verts[1] = swizzleLane7(a);
793         break;
794     case 7:
795         verts[0] = swizzleLane7(a);
796         verts[1] = swizzleLane0(b);
797         break;
798     }
799 }
800 
PaPoints0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])801 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
802 {
803     simdvector& a = PaGetSimdVector(pa, pa.cur, slot);
804 
805     verts[0] = a;  // points only have 1 vertex.
806 
807     SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, KNOB_SIMD_WIDTH, true);
808     return true;
809 }
810 
PaPointsSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])811 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
812 {
813     simdvector &a = PaGetSimdVector(pa, pa.cur, slot);
814     switch(primIndex)
815     {
816     case 0:
817         verts[0] = swizzleLane0(a);
818         break;
819     case 1:
820         verts[0] = swizzleLane1(a);
821         break;
822     case 2:
823         verts[0] = swizzleLane2(a);
824         break;
825     case 3:
826         verts[0] = swizzleLane3(a);
827         break;
828     case 4:
829         verts[0] = swizzleLane4(a);
830         break;
831     case 5:
832         verts[0] = swizzleLane5(a);
833         break;
834     case 6:
835         verts[0] = swizzleLane6(a);
836         break;
837     case 7:
838         verts[0] = swizzleLane7(a);
839         break;
840     }
841 }
842 
843 //////////////////////////////////////////////////////////////////////////
844 /// @brief State 1 for RECT_LIST topology.
845 ///        There is not enough to assemble 8 triangles.
PaRectList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])846 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
847 {
848     SetNextPaState(pa, PaRectList1, PaRectListSingle0);
849     return false;
850 }
851 
852 //////////////////////////////////////////////////////////////////////////
853 /// @brief State 1 for RECT_LIST topology.
854 ///   Rect lists has the following format.
855 ///             w          x          y           z
856 ///      v2 o---o   v5 o---o   v8 o---o   v11 o---o
857 ///         | \ |      | \ |      | \ |       | \ |
858 ///      v1 o---o   v4 o---o   v7 o---o   v10 o---o
859 ///            v0         v3         v6          v9
860 ///
861 ///   Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
862 ///
863 ///   tri0 = { v0, v1, v2 }  tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
864 ///   tri2 = { v3, v4, v5 }  tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
865 ///   etc.
866 ///
867 ///   PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
868 ///   where v0 contains all the first vertices for 8 triangles.
869 ///
870 ///     Result:
871 ///      verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
872 ///      verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
873 ///      verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
874 ///
875 /// @param pa - State for PA state machine.
876 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
877 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
PaRectList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])878 bool PaRectList1(
879     PA_STATE_OPT& pa,
880     uint32_t slot,
881     simdvector verts[])
882 {
883     // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
884     simdvector& a = PaGetSimdVector(pa, 0, slot);   // a[] = { v0, v1,  v2,  v3,  v4,  v5,  v6,  v7 }
885     simdvector& b = PaGetSimdVector(pa, 1, slot);   // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
886 
887     __m256 tmp0, tmp1, tmp2;
888 
889     // Loop over each component in the simdvector.
890     for(int i = 0; i < 4; ++i)
891     {
892         simdvector& v0 = verts[0];                          // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
893         tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01);  // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
894         v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20);        //   v0 = {  v0,   *,   *,  v3,  *, v9,  v6,  * } where * is don't care.
895         tmp1  = _mm256_permute_ps(v0[i], 0xF0);           // tmp1 = {  v0,  v0,  v3,  v3,  *,  *,  *,  * }
896         v0[i] = _mm256_permute_ps(v0[i], 0x5A);           //   v0 = {   *,   *,   *,   *,  v6, v6, v9, v9 }
897         v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0);       //   v0 = {  v0,  v0,  v3,  v3,  v6, v6, v9, v9 }
898 
899         /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
900         ///      AVX2 should make this much cheaper.
901         simdvector& v1 = verts[1];                          // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
902         v1[i] = _mm256_permute_ps(a[i], 0x09);            //   v1 = { v1, v2,  *,  *,  *, *,  *, * }
903         tmp1  = _mm256_permute_ps(a[i], 0x43);            // tmp1 = {  *,  *,  *,  *, v7, *, v4, v5 }
904         tmp2  = _mm256_blend_ps(v1[i], tmp1, 0xF0);       // tmp2 = { v1, v2,  *,  *, v7, *, v4, v5 }
905         tmp1  = _mm256_permute2f128_ps(tmp2, tmp2, 0x1);  // tmp1 = { v7,  *, v4,  v5, *  *,  *,  * }
906         v1[i] = _mm256_permute_ps(tmp0, 0xE0);            //   v1 = {  *,  *,  *,  *,  *, v8, v10, v11 }
907         v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0);       //   v1 = { v1, v2,  *,  *, v7, v8, v10, v11 }
908         v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C);       //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
909 
910         // verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
911         simdvector& v2 = verts[2];                          // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
912         v2[i] = _mm256_permute_ps(tmp0, 0x30);            //   v2 = { *, *, *, *, v8, *, v11, * }
913         tmp1  = _mm256_permute_ps(tmp2, 0x31);            // tmp1 = { v2, *, v5, *, *, *, *, * }
914         v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0);
915 
916         // Need to compute 4th implied vertex for the rectangle.
917         tmp2  = _mm256_sub_ps(v0[i], v1[i]);
918         tmp2  = _mm256_add_ps(tmp2, v2[i]);               // tmp2 = {  w,  *,  x, *, y,  *,  z,  * }
919         tmp2  = _mm256_permute_ps(tmp2, 0xA0);            // tmp2 = {  *,  w,  *, x, *,   y,  *,  z }
920         v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA);       //   v2 = { v2,  w, v5, x, v8,  y, v11, z }
921     }
922 
923     SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
924     return true;
925 }
926 
927 //////////////////////////////////////////////////////////////////////////
928 /// @brief State 2 for RECT_LIST topology.
929 ///        Not implemented unless there is a use case for more then 8 rects.
930 /// @param pa - State for PA state machine.
931 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
932 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
PaRectList2(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])933 bool PaRectList2(
934     PA_STATE_OPT& pa,
935     uint32_t slot,
936     simdvector verts[])
937 {
938     SWR_ASSERT(0); // Is rect list used for anything other then clears?
939     SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
940     return true;
941 }
942 
943 //////////////////////////////////////////////////////////////////////////
944 /// @brief This procedure is called by the Binner to assemble the attributes.
945 ///        Unlike position, which is stored vertically, the attributes are
946 ///        stored horizontally. The outputs from the VS, labeled as 'a' and
947 ///        'b' are vertical. This function needs to transpose the lanes
948 ///        containing the vertical attribute data into horizontal form.
949 /// @param pa - State for PA state machine.
950 /// @param slot - Index into VS output for a given attribute.
951 /// @param primIndex - Binner processes each triangle individually.
952 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
PaRectListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,__m128 verts[])953 void PaRectListSingle0(
954     PA_STATE_OPT& pa,
955     uint32_t slot,
956     uint32_t primIndex,
957     __m128 verts[])
958 {
959     // We have 12 simdscalars contained within 3 simdvectors which
960     // hold at least 8 triangles worth of data. We want to assemble a single
961     // triangle with data in horizontal form.
962     simdvector& a = PaGetSimdVector(pa, 0, slot);
963 
964     // Convert from vertical to horizontal.
965     switch(primIndex)
966     {
967     case 0:
968         verts[0] = swizzleLane0(a);
969         verts[1] = swizzleLane1(a);
970         verts[2] = swizzleLane2(a);
971         break;
972     case 1:
973         verts[0] = swizzleLane0(a);
974         verts[1] = swizzleLane2(a);
975         verts[2] = _mm_blend_ps(verts[0], verts[1], 0x2);
976         break;
977     case 2:
978     case 3:
979     case 4:
980     case 5:
981     case 6:
982     case 7:
983         SWR_ASSERT(0);
984         break;
985     };
986 }
987 
PA_STATE_OPT(DRAW_CONTEXT * in_pDC,uint32_t in_numPrims,uint8_t * pStream,uint32_t in_streamSizeInVerts,bool in_isStreaming,PRIMITIVE_TOPOLOGY topo)988 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts,
989     bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : PA_STATE(in_pDC, pStream, in_streamSizeInVerts), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0),
990     cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
991 {
992     const API_STATE& state = GetApiState(pDC);
993 
994     this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo;
995 
996     switch (this->binTopology)
997     {
998         case TOP_TRIANGLE_LIST:
999             this->pfnPaFunc = PaTriList0;
1000             break;
1001         case TOP_TRIANGLE_STRIP:
1002             this->pfnPaFunc = PaTriStrip0;
1003             break;
1004         case TOP_TRIANGLE_FAN:
1005             this->pfnPaFunc = PaTriFan0;
1006             break;
1007         case TOP_QUAD_LIST:
1008             this->pfnPaFunc = PaQuadList0;
1009             this->numPrims = in_numPrims * 2;    // Convert quad primitives into triangles
1010             break;
1011         case TOP_QUAD_STRIP:
1012             // quad strip pattern when decomposed into triangles is the same as verts strips
1013             this->pfnPaFunc = PaTriStrip0;
1014             this->numPrims = in_numPrims * 2;    // Convert quad primitives into triangles
1015             break;
1016         case TOP_LINE_LIST:
1017             this->pfnPaFunc = PaLineList0;
1018             this->numPrims = in_numPrims;
1019             break;
1020         case TOP_LINE_STRIP:
1021             this->pfnPaFunc = PaLineStrip0;
1022             this->numPrims = in_numPrims;
1023             break;
1024         case TOP_LINE_LOOP:
1025             this->pfnPaFunc = PaLineLoop0;
1026             this->numPrims = in_numPrims;
1027             break;
1028         case TOP_POINT_LIST:
1029             // use point binner and rasterizer if supported
1030             this->pfnPaFunc = PaPoints0;
1031             this->numPrims = in_numPrims;
1032             break;
1033         case TOP_RECT_LIST:
1034             this->pfnPaFunc = PaRectList0;
1035             this->numPrims = in_numPrims * 2;
1036             break;
1037 
1038         case TOP_PATCHLIST_1:
1039             this->pfnPaFunc = PaPatchList<1>;
1040             break;
1041         case TOP_PATCHLIST_2:
1042             this->pfnPaFunc = PaPatchList<2>;
1043             break;
1044         case TOP_PATCHLIST_3:
1045             this->pfnPaFunc = PaPatchList<3>;
1046             break;
1047         case TOP_PATCHLIST_4:
1048             this->pfnPaFunc = PaPatchList<4>;
1049             break;
1050         case TOP_PATCHLIST_5:
1051             this->pfnPaFunc = PaPatchList<5>;
1052             break;
1053         case TOP_PATCHLIST_6:
1054             this->pfnPaFunc = PaPatchList<6>;
1055             break;
1056         case TOP_PATCHLIST_7:
1057             this->pfnPaFunc = PaPatchList<7>;
1058             break;
1059         case TOP_PATCHLIST_8:
1060             this->pfnPaFunc = PaPatchList<8>;
1061             break;
1062         case TOP_PATCHLIST_9:
1063             this->pfnPaFunc = PaPatchList<9>;
1064             break;
1065         case TOP_PATCHLIST_10:
1066             this->pfnPaFunc = PaPatchList<10>;
1067             break;
1068         case TOP_PATCHLIST_11:
1069             this->pfnPaFunc = PaPatchList<11>;
1070             break;
1071         case TOP_PATCHLIST_12:
1072             this->pfnPaFunc = PaPatchList<12>;
1073             break;
1074         case TOP_PATCHLIST_13:
1075             this->pfnPaFunc = PaPatchList<13>;
1076             break;
1077         case TOP_PATCHLIST_14:
1078             this->pfnPaFunc = PaPatchList<14>;
1079             break;
1080         case TOP_PATCHLIST_15:
1081             this->pfnPaFunc = PaPatchList<15>;
1082             break;
1083         case TOP_PATCHLIST_16:
1084             this->pfnPaFunc = PaPatchList<16>;
1085             break;
1086         case TOP_PATCHLIST_17:
1087             this->pfnPaFunc = PaPatchList<17>;
1088             break;
1089         case TOP_PATCHLIST_18:
1090             this->pfnPaFunc = PaPatchList<18>;
1091             break;
1092         case TOP_PATCHLIST_19:
1093             this->pfnPaFunc = PaPatchList<19>;
1094             break;
1095         case TOP_PATCHLIST_20:
1096             this->pfnPaFunc = PaPatchList<20>;
1097             break;
1098         case TOP_PATCHLIST_21:
1099             this->pfnPaFunc = PaPatchList<21>;
1100             break;
1101         case TOP_PATCHLIST_22:
1102             this->pfnPaFunc = PaPatchList<22>;
1103             break;
1104         case TOP_PATCHLIST_23:
1105             this->pfnPaFunc = PaPatchList<23>;
1106             break;
1107         case TOP_PATCHLIST_24:
1108             this->pfnPaFunc = PaPatchList<24>;
1109             break;
1110         case TOP_PATCHLIST_25:
1111             this->pfnPaFunc = PaPatchList<25>;
1112             break;
1113         case TOP_PATCHLIST_26:
1114             this->pfnPaFunc = PaPatchList<26>;
1115             break;
1116         case TOP_PATCHLIST_27:
1117             this->pfnPaFunc = PaPatchList<27>;
1118             break;
1119         case TOP_PATCHLIST_28:
1120             this->pfnPaFunc = PaPatchList<28>;
1121             break;
1122         case TOP_PATCHLIST_29:
1123             this->pfnPaFunc = PaPatchList<29>;
1124             break;
1125         case TOP_PATCHLIST_30:
1126             this->pfnPaFunc = PaPatchList<30>;
1127             break;
1128         case TOP_PATCHLIST_31:
1129             this->pfnPaFunc = PaPatchList<31>;
1130             break;
1131         case TOP_PATCHLIST_32:
1132             this->pfnPaFunc = PaPatchList<32>;
1133             break;
1134 
1135         default:
1136             SWR_ASSERT(0);
1137             break;
1138     };
1139 
1140     this->pfnPaFuncReset = this->pfnPaFunc;
1141 
1142     //    simdscalari id8 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
1143     //    simdscalari id4 = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3);
1144     simdscalari id8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1145     simdscalari id4 = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
1146 
1147     switch(this->binTopology)
1148     {
1149         case TOP_TRIANGLE_LIST:
1150         case TOP_TRIANGLE_STRIP:
1151         case TOP_TRIANGLE_FAN:
1152         case TOP_LINE_STRIP:
1153         case TOP_LINE_LIST:
1154         case TOP_LINE_LOOP:
1155             this->primIDIncr = 8;
1156             this->primID = id8;
1157             break;
1158         case TOP_QUAD_LIST:
1159         case TOP_QUAD_STRIP:
1160         case TOP_RECT_LIST:
1161             this->primIDIncr = 4;
1162             this->primID = id4;
1163             break;
1164         case TOP_POINT_LIST:
1165             this->primIDIncr = 8;
1166             this->primID = id8;
1167             break;
1168         case TOP_PATCHLIST_1:
1169         case TOP_PATCHLIST_2:
1170         case TOP_PATCHLIST_3:
1171         case TOP_PATCHLIST_4:
1172         case TOP_PATCHLIST_5:
1173         case TOP_PATCHLIST_6:
1174         case TOP_PATCHLIST_7:
1175         case TOP_PATCHLIST_8:
1176         case TOP_PATCHLIST_9:
1177         case TOP_PATCHLIST_10:
1178         case TOP_PATCHLIST_11:
1179         case TOP_PATCHLIST_12:
1180         case TOP_PATCHLIST_13:
1181         case TOP_PATCHLIST_14:
1182         case TOP_PATCHLIST_15:
1183         case TOP_PATCHLIST_16:
1184         case TOP_PATCHLIST_17:
1185         case TOP_PATCHLIST_18:
1186         case TOP_PATCHLIST_19:
1187         case TOP_PATCHLIST_20:
1188         case TOP_PATCHLIST_21:
1189         case TOP_PATCHLIST_22:
1190         case TOP_PATCHLIST_23:
1191         case TOP_PATCHLIST_24:
1192         case TOP_PATCHLIST_25:
1193         case TOP_PATCHLIST_26:
1194         case TOP_PATCHLIST_27:
1195         case TOP_PATCHLIST_28:
1196         case TOP_PATCHLIST_29:
1197         case TOP_PATCHLIST_30:
1198         case TOP_PATCHLIST_31:
1199         case TOP_PATCHLIST_32:
1200             // Always run KNOB_SIMD_WIDTH number of patches at a time.
1201             this->primIDIncr = 8;
1202             this->primID = id8;
1203             break;
1204 
1205         default:
1206             SWR_ASSERT(0);
1207             break;
1208     };
1209 
1210 }
1211 #endif
1212