• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * Copyright 2023 Alyssa Rosenzweig
3 * Copyright 2023 Valve Corporation
4 * SPDX-License-Identifier: MIT
5 */
6
7#include "geometry.h"
8
/* Round x up to a multiple of y using a bit mask. The mask form only gives a
 * true round-up when y is a power of two.
 */
static uint
align(uint x, uint y)
{
   uint mask = y - 1;

   return (x + mask) & ~mask;
}
14
15/* Compatible with util/u_math.h */
16static inline uint
17util_logbase2_ceil(uint n)
18{
19   if (n <= 1)
20      return 0;
21   else
22      return 32 - clz(n - 1);
23}
24
/* Map a vertex slot within triangle `prim` of a strip so that every triangle
 * has a consistent winding order: in odd triangles the two non-provoking
 * vertices are swapped, while the provoking vertex stays where it is.
 *
 * For fixed prim and flatshade_first, the map : [0, 1, 2] -> [0, 1, 2] is its
 * own inverse. This lets us reuse it for both vertex fetch and transform
 * feedback.
 */
uint
libagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
{
   unsigned provoking_vtx = flatshade_first ? 0 : 2;
   bool odd = (prim & 1) != 0;

   /* Even triangles and the provoking vertex pass through untouched */
   if (!odd || vert == provoking_vtx)
      return vert;

   /* The two non-provoking slots sum to (3 - pv), so this exchanges them */
   return (3 - provoking_vtx) - vert;
}
41
42uint64_t
43libagx_xfb_vertex_address(global struct agx_geometry_params *p, uint base_index,
44                          uint vert, uint buffer, uint stride,
45                          uint output_offset)
46{
47   uint index = base_index + vert;
48   uint xfb_offset = (index * stride) + output_offset;
49
50   return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset;
51}
52
/* Vertex ID for vertex `vert` (0 or 1) of line `prim` in a line loop with
 * num_prims lines. Lines are (0, 1), (1, 2), ..., and the final line closes
 * the loop by ending at vertex 0, e.g. (2, 0) for three lines.
 */
uint
libagx_vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
{
   uint closing_prim = num_prims - 1;

   /* Only the second vertex of the closing line wraps around to the start */
   return (prim == closing_prim && vert == 1) ? 0 : (prim + vert);
}
62
63uint
64libagx_vertex_id_for_line_class(enum mesa_prim mode, uint prim, uint vert,
65                                uint num_prims)
66{
67   /* Line list, line strip, or line loop */
68   if (mode == MESA_PRIM_LINE_LOOP && prim == (num_prims - 1) && vert == 1)
69      return 0;
70
71   if (mode == MESA_PRIM_LINES)
72      prim *= 2;
73
74   return prim + vert;
75}
76
/* Vertex ID for vertex `vert` (0, 1 or 2) of triangle `prim` in a triangle
 * fan.
 *
 * Returns (0, prim + 1, prim + 2) when the last vertex is provoking, and the
 * rotation (prim + 1, prim + 2, 0) when flatshade_first is set.
 */
uint
libagx_vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
{
   /* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking
    * first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last.
    * Piglit clipflat expects us to switch between these orders depending on
    * provoking vertex, to avoid trivializing the fan.
    *
    * Rotate accordingly: slot 0 -> 1, 1 -> 2, 2 -> 0. The wrap must happen at
    * 3 (one past the last slot); wrapping at 2 would send slot 1 to the hub
    * and slot 2 out of range.
    */
   if (flatshade_first) {
      vert = vert + 1;
      vert = (vert == 3) ? 0 : vert;
   }

   /* The simpler form assuming last is provoking. */
   return (vert == 0) ? 0 : prim + vert;
}
95
96uint
97libagx_vertex_id_for_tri_class(enum mesa_prim mode, uint prim, uint vert,
98                               bool flatshade_first)
99{
100   if (flatshade_first && mode == MESA_PRIM_TRIANGLE_FAN) {
101      vert = vert + 1;
102      vert = (vert == 3) ? 0 : vert;
103   }
104
105   if (mode == MESA_PRIM_TRIANGLE_FAN && vert == 0)
106      return 0;
107
108   if (mode == MESA_PRIM_TRIANGLES)
109      prim *= 3;
110
111   /* Triangle list, triangle strip, or triangle fan */
112   if (mode == MESA_PRIM_TRIANGLE_STRIP) {
113      unsigned pv = flatshade_first ? 0 : 2;
114
115      bool even = (prim & 1) == 0;
116      bool provoking = vert == pv;
117
118      vert = ((provoking || even) ? vert : ((3 - pv) - vert));
119   }
120
121   return prim + vert;
122}
123
124uint
125libagx_vertex_id_for_line_adj_class(enum mesa_prim mode, uint prim, uint vert)
126{
127   /* Line list adj or line strip adj */
128   if (mode == MESA_PRIM_LINES_ADJACENCY)
129      prim *= 4;
130
131   return prim + vert;
132}
133
/* Vertex ID for vertex `vert` (0..5) of triangle `prim` in a triangle strip
 * with adjacency containing num_prims triangles.
 */
uint
libagx_vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
                                   bool flatshade_first)
{
   /* See Vulkan spec section 20.1.11 "Triangle Strips With Adjacency".
    *
    * The spec distinguishes first/middle/last/only primitives as well as
    * odd/even primitives. Classify this primitive up front.
    */
   bool is_first = (prim == 0);
   bool is_last = (prim == (num_prims - 1));
   bool is_even = ((prim & 1) == 0);
   bool is_even_or_first = is_even || is_first;

   /* With a provoking last vertex, odd primitives are rotated. This seems
    * required for OpenGL.
    */
   if (!(flatshade_first || is_even_or_first))
      vert = (vert + 4u) % 6u;

   /* Per-slot offsets from the spec, relative to the base of the primitive
    * and assuming the first vertex is provoking (the Vulkan default). The
    * spec's 6 cases x 6 offsets collapse into these few selects. Negative
    * entries wrap in unsigned arithmetic and are undone by the final add.
    */
   uint slot_offsets[6] = {
      0,
      is_first ? 1 : (is_even ? -2 : 3),
      is_even_or_first ? 2 : 4,
      is_last ? 5 : 6,
      is_even_or_first ? 4 : 2,
      is_even_or_first ? 3 : -2,
   };

   /* Select with a constant-index loop rather than slot_offsets[vert], so NIR
    * can see thru the local array. Slot 0 (and any out-of-range slot) keeps
    * the default of 0.
    */
   uint selected = 0;
   for (uint slot = 1; slot < 6; ++slot) {
      if (slot == vert)
         selected = slot_offsets[slot];
   }

   /* Each primitive advances the strip by two vertices */
   return (prim * 2) + selected;
}
179
180uint
181libagx_vertex_id_for_tri_adj_class(enum mesa_prim mode, uint prim, uint vert,
182                                   uint nr, bool flatshade_first)
183{
184   /* Tri adj list or tri adj strip */
185   if (mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
186      return libagx_vertex_id_for_tri_strip_adj(prim, vert, nr,
187                                                flatshade_first);
188   } else {
189      return (6 * prim) + vert;
190   }
191}
192
193uint
194libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
195                              uint prim, uint vert, uint num_prims)
196{
197   switch (mode) {
198   case MESA_PRIM_POINTS:
199   case MESA_PRIM_LINES:
200   case MESA_PRIM_TRIANGLES:
201   case MESA_PRIM_LINES_ADJACENCY:
202   case MESA_PRIM_TRIANGLES_ADJACENCY:
203      /* Regular primitive: every N vertices defines a primitive */
204      return (prim * mesa_vertices_per_prim(mode)) + vert;
205
206   case MESA_PRIM_LINE_LOOP:
207      return libagx_vertex_id_for_line_loop(prim, vert, num_prims);
208
209   case MESA_PRIM_LINE_STRIP:
210   case MESA_PRIM_LINE_STRIP_ADJACENCY:
211      /* (i, i + 1) or (i, ..., i + 3) */
212      return prim + vert;
213
214   case MESA_PRIM_TRIANGLE_STRIP: {
215      /* Order depends on the provoking vert.
216       *
217       * First: (0, 1, 2), (1, 3, 2), (2, 3, 4).
218       * Last:  (0, 1, 2), (2, 1, 3), (2, 3, 4).
219       *
220       * Pull the (maybe swapped) vert from the corresponding primitive
221       */
222      return prim + libagx_map_vertex_in_tri_strip(prim, vert, flatshade_first);
223   }
224
225   case MESA_PRIM_TRIANGLE_FAN:
226      return libagx_vertex_id_for_tri_fan(prim, vert, flatshade_first);
227
228   case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
229      return libagx_vertex_id_for_tri_strip_adj(prim, vert, num_prims,
230                                                flatshade_first);
231
232   default:
233      return 0;
234   }
235}
236
237/*
238 * When unrolling the index buffer for a draw, we translate the old indirect
239 * draws to new indirect draws. This routine allocates the new index buffer and
240 * sets up most of the new draw descriptor.
241 */
242static global void *
243setup_unroll_for_draw(global struct agx_ia_state *ia, constant uint *in_draw,
244                      uint draw, enum mesa_prim mode, uint index_size_B)
245{
246   /* Determine an upper bound on the memory required for the index buffer.
247    * Restarts only decrease the unrolled index buffer size, so the maximum size
248    * is the unrolled size when the input has no restarts.
249    */
250   uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
251   uint max_verts = max_prims * mesa_vertices_per_prim(mode);
252   uint alloc_size = max_verts * index_size_B;
253
254   /* Allocate memory from the heap for the unrolled index buffer. Use an atomic
255    * since multiple threads may be running to handle multidraw in parallel.
256    */
257   global struct agx_geometry_state *heap = ia->heap;
258   uint old_heap_bottom = atomic_fetch_add(
259      (volatile atomic_uint *)(&heap->heap_bottom), align(alloc_size, 4));
260
261   /* Regardless of the input stride, we use tightly packed output draws */
262   global uint *out = &ia->out_draws[5 * draw];
263
264   /* Setup most of the descriptor. Count will be determined after unroll. */
265   out[1] = in_draw[1];                     /* instance count */
266   out[2] = old_heap_bottom / index_size_B; /* index offset */
267   out[3] = in_draw[3];                     /* index bias */
268   out[4] = in_draw[4];                     /* base instance */
269
270   /* Return the index buffer we allocated */
271   return (global uchar *)heap->heap + (old_heap_bottom * index_size_B);
272}
273
/*
 * UNROLL(INDEX, suffix) defines libagx_unroll_restart_<suffix>, which rewrites
 * draw number `draw` of an indexed draw with primitive restart into a plain
 * list of primitives that needs no restart. INDEX is the index type.
 *
 * The generated function:
 *  - returns early if ia->count is non-NULL and draw >= *ia->count;
 *  - reads the input descriptor at ia->draws + (draw * ia->draw_stride) and
 *    offsets the input index buffer by its third word (in elements);
 *  - obtains the output buffer and most of the output descriptor from
 *    setup_unroll_for_draw();
 *  - walks the input indices, splitting at each occurrence of
 *    ia->restart_index, and for every run emits each decomposed primitive with
 *    mesa_vertices_per_prim(mode) indices, ordered by
 *    libagx_vertex_id_for_topology();
 *  - writes the number of emitted indices to word 0 of the output descriptor.
 *
 * Indices fetched while emitting are replaced by 0 when their position is not
 * below ia->index_buffer_size_B / sizeof(INDEX).
 *
 * NOTE(review): that bound is compared against the position relative to the
 * already-offset `in` pointer, and the restart search reads in[next_restart]
 * without the same check -- confirm callers guarantee the range is valid.
 */
#define UNROLL(INDEX, suffix)                                                  \
   void libagx_unroll_restart_##suffix(global struct agx_ia_state *ia,         \
                                       enum mesa_prim mode, uint draw)         \
   {                                                                           \
      /* For an indirect multidraw, we are dispatched maxDraws times and       \
       * terminate trailing invocations.                                       \
       */                                                                      \
      if (ia->count && draw >= *(ia->count))                                   \
         return;                                                               \
                                                                               \
      constant uint *in_draw =                                                 \
         (constant uint *)(ia->draws + (draw * ia->draw_stride));              \
                                                                               \
      uint count = in_draw[0];                                                 \
      constant INDEX *in = (constant INDEX *)ia->index_buffer;                 \
      in += in_draw[2];                                                        \
                                                                               \
      global INDEX *out =                                                      \
         setup_unroll_for_draw(ia, in_draw, draw, mode, sizeof(INDEX));        \
                                                                               \
      uint out_prims = 0;                                                      \
      INDEX restart_idx = ia->restart_index;                                   \
      bool flatshade_first = ia->flatshade_first;                              \
      uint in_size_el = ia->index_buffer_size_B / sizeof(INDEX);               \
                                                                               \
      uint needle = 0;                                                         \
      uint per_prim = mesa_vertices_per_prim(mode);                            \
      while (needle < count) {                                                 \
         /* Search for next restart or the end */                              \
         uint next_restart = needle;                                           \
         while ((next_restart < count) && in[next_restart] != restart_idx)     \
            ++next_restart;                                                    \
                                                                               \
         /* Emit up to the next restart */                                     \
         uint subcount = next_restart - needle;                                \
         uint subprims = u_decomposed_prims_for_vertices(mode, subcount);      \
         for (uint i = 0; i < subprims; ++i) {                                 \
            for (uint vtx = 0; vtx < per_prim; ++vtx) {                        \
               uint id = libagx_vertex_id_for_topology(mode, flatshade_first,  \
                                                       i, vtx, subprims);      \
               uint offset = needle + id;                                      \
                                                                               \
               out[(out_prims * per_prim) + vtx] =                             \
                  offset < in_size_el ? in[offset] : 0;                        \
            }                                                                  \
                                                                               \
            out_prims++;                                                       \
         }                                                                     \
                                                                               \
         needle = next_restart + 1;                                            \
      }                                                                        \
                                                                               \
      ia->out_draws[(5 * draw) + 0] = out_prims * per_prim;                    \
   }

/* Instantiate the unroll routine for 8-, 16- and 32-bit indices */
UNROLL(uchar, u8)
UNROLL(ushort, u16)
UNROLL(uint, u32)
332
333uintptr_t
334libagx_index_buffer(constant struct agx_ia_state *p, uint id,
335                    uint index_size)
336{
337   return (uintptr_t)&p->index_buffer[id * index_size];
338}
339
340uint
341libagx_setup_xfb_buffer(global struct agx_geometry_params *p, uint i)
342{
343   global uint *off_ptr = p->xfb_offs_ptrs[i];
344   if (!off_ptr)
345      return 0;
346
347   uint off = *off_ptr;
348   p->xfb_base[i] = p->xfb_base_original[i] + off;
349   return off;
350}
351
/*
 * Translate EndPrimitive for LINE_STRIP or TRIANGLE_STRIP output prims into
 * writes into the 32-bit output index buffer. We write the sequence (b, b + 1,
 * b + 2, ..., b + n - 1, -1), where b (base) is the first vertex in the prim, n
 * (count) is the number of verts in the prim, and -1 is the prim restart index
 * used to signal the end of the prim.
 *
 * For points, we write index buffers without restart, just as a sideband to
 * pass data into the vertex shader.
 *
 * total_verts and total_prims are inclusive running totals for this
 * invocation (they already count the primitive being ended). The
 * invocation_*_base values count what earlier invocations emitted.
 */
void
libagx_end_primitive(global int *index_buffer, uint total_verts,
                     uint verts_in_prim, uint total_prims,
                     uint invocation_vertex_base, uint invocation_prim_base,
                     uint geometry_base, bool restart)
{
   /* Vertices emitted earlier in this invocation: the inclusive total minus
    * the vertices of the primitive being ended.
    */
   uint earlier_verts_in_invoc = total_verts - verts_in_prim;

   /* The index buffer holds one index per vertex, plus one restart index per
    * primitive when restart is in use. Skip everything written by earlier
    * invocations and by earlier primitives of this invocation.
    */
   uint first_slot = invocation_vertex_base + earlier_verts_in_invoc;
   if (restart)
      first_slot += invocation_prim_base + (total_prims - 1);

   /* The indices are encoded as: (unrolled ID * output vertices) + vertex. */
   uint first_index = geometry_base + earlier_verts_in_invoc;

   global int *dst = index_buffer + first_slot;

   /* Consecutive indices for the strip... */
   for (uint v = 0; v < verts_in_prim; ++v)
      dst[v] = first_index + v;

   /* ...followed by the restart index, if we are using one */
   if (restart)
      dst[verts_in_prim] = -1;
}
390
391void
392libagx_build_gs_draw(global struct agx_geometry_params *p, bool indexed,
393                     uint vertices, uint primitives)
394{
395   global uint *descriptor = p->indirect_desc;
396   global struct agx_geometry_state *state = p->state;
397
398   /* Setup the indirect draw descriptor */
399   if (indexed) {
400      uint indices = vertices + primitives; /* includes restart indices */
401
402      /* Allocate the index buffer */
403      uint index_buffer_offset_B = state->heap_bottom;
404      p->output_index_buffer =
405         (global uint *)(state->heap + index_buffer_offset_B);
406      state->heap_bottom += (indices * 4);
407
408      descriptor[0] = indices;                   /* count */
409      descriptor[1] = 1;                         /* instance count */
410      descriptor[2] = index_buffer_offset_B / 4; /* start */
411      descriptor[3] = 0;                         /* index bias */
412      descriptor[4] = 0;                         /* start instance */
413   } else {
414      descriptor[0] = vertices; /* count */
415      descriptor[1] = 1;        /* instance count */
416      descriptor[2] = 0;        /* start */
417      descriptor[3] = 0;        /* start instance */
418   }
419
420   if (state->heap_bottom > 1024 * 1024 * 128) {
421      global uint *foo = (global uint *)(uintptr_t)0xdeadbeef;
422      *foo = 0x1234;
423   }
424}
425
426void
427libagx_gs_setup_indirect(global struct agx_geometry_params *p,
428                         global struct agx_ia_state *ia, enum mesa_prim mode,
429                         uint local_id)
430{
431   global uint *in_draw = (global uint *)ia->draws;
432
433   /* Determine the (primitives, instances) grid size. */
434   uint vertex_count = in_draw[0];
435   uint instance_count = in_draw[1];
436
437   /* Calculate number of primitives input into the GS */
438   uint prim_per_instance = u_decomposed_prims_for_vertices(mode, vertex_count);
439   p->input_primitives = prim_per_instance * instance_count;
440   p->input_vertices = vertex_count;
441
442   /* Invoke VS as (vertices, instances, 1); GS as (primitives, instances, 1) */
443   p->vs_grid[0] = vertex_count;
444   p->vs_grid[1] = instance_count;
445   p->vs_grid[2] = 1;
446
447   p->gs_grid[0] = prim_per_instance;
448   p->gs_grid[1] = instance_count;
449   p->gs_grid[2] = 1;
450
451   p->primitives_log2 = util_logbase2_ceil(prim_per_instance);
452
453   /* If indexing is enabled, the third word is the offset into the index buffer
454    * in elements. Apply that offset now that we have it. For a hardware
455    * indirect draw, the hardware would do this for us, but for software input
456    * assembly we need to do it ourselves.
457    */
458   if (ia->index_buffer) {
459      ia->index_buffer += ((constant uint *)ia->draws)[2] * ia->index_size_B;
460   }
461
462   /* We may need to allocate VS and GS count buffers, do so now */
463   global struct agx_geometry_state *state = p->state;
464
465   uint vertex_buffer_size =
466      libagx_tcs_in_size(vertex_count * instance_count, p->vs_outputs);
467
468   p->count_buffer = (global uint *)(state->heap + state->heap_bottom);
469   state->heap_bottom +=
470      align(p->input_primitives * p->count_buffer_stride, 16);
471
472   p->vertex_buffer = (global uint *)(state->heap + state->heap_bottom);
473   state->heap_bottom += align(vertex_buffer_size, 4);
474}
475
/* In-place inclusive prefix sum over one column of `buffer`.
 *
 * The column consists of `len` values at buffer[(i * words) + local_id.y] for
 * i in [0, len): `words` is the stride between consecutive values and
 * local_id.y selects the column. Each value is replaced by the sum of itself
 * and all earlier values in the column.
 *
 * NOTE(review): the code hard-codes a step of 32 and broadcasts from lane 31,
 * so it presumably expects local_id.x to be the lane index within a 32-wide
 * subgroup -- confirm against the dispatch.
 */
void
libagx_prefix_sum(global uint *buffer, uint len, uint words, uint2 local_id)
{
   /* Main loop: complete subgroups processing 32 values at once
    *
    * TODO: Don't do a serial bottleneck! This is bad!
    */
   uint i, count = 0;
   uint len_remainder = len % 32;
   uint len_rounded_down = len - len_remainder;

   for (i = local_id.x; i < len_rounded_down; i += 32) {
      global uint *ptr = &buffer[(i * words) + local_id.y];
      uint value = *ptr;

      /* Exclusive scan plus our own value gives the inclusive scan.
       * TODO: use inclusive once that's wired up
       */
      uint value_prefix_sum = sub_group_scan_exclusive_add(value) + value;
      *ptr = count + value_prefix_sum;

      /* Advance count by the reduction sum of all processed values. We already
       * have that sum calculated in the last lane. We know that lane is active,
       * since all control flow is uniform except in the last iteration.
       */
      count += sub_group_broadcast(value_prefix_sum, 31);
   }

   /* The last iteration is special since we won't have a full subgroup unless
    * the length is divisible by the subgroup size, and we don't advance count.
    * After the loop, i is the index this lane would have processed next.
    */
   if (local_id.x < len_remainder) {
      global uint *ptr = &buffer[(i * words) + local_id.y];
      uint value = *ptr;

      /* TODO: use inclusive once that's wired up */
      *ptr = count + sub_group_scan_exclusive_add(value) + value;
   }
}
513
514bool
515libagx_is_provoking_last(global struct agx_ia_state *ia)
516{
517   return !ia->flatshade_first;
518}
519
520uintptr_t
521libagx_vertex_output_address(constant struct agx_geometry_params *p, uint vtx,
522                             gl_varying_slot location, uint64_t vs_outputs)
523{
524   return (uintptr_t)p->vertex_buffer +
525          libagx_tcs_in_offs(vtx, location, vs_outputs);
526}
527