1 /*
2  * Copyright 2023 Alyssa Rosenzweig
3  * Copyright 2023 Valve Corporation
4  * SPDX-License-Identifier: MIT
5  */
6 
#include "compiler/shader_enums.h"
#include "libagx.h"

/* This header is compiled both as host C (by the CPU driver) and as OpenCL C
 * (for the libagx GPU kernels). Paper over the differences: on the host, a
 * device-constant pointer is just a 64-bit address and popcount comes from
 * util/bitscan.h; in OpenCL we use real `constant` pointers and the popcount
 * built-in.
 */
#ifndef __OPENCL_VERSION__
#include "util/bitscan.h"
#define CONST(type_)       uint64_t
#define libagx_popcount(x) util_bitcount64(x)
#else
#define CONST(type_)       constant type_ *
#define libagx_popcount(x) popcount(x)
#endif

#ifndef LIBAGX_GEOMETRY_H
#define LIBAGX_GEOMETRY_H

#define MAX_SO_BUFFERS     4
#define MAX_VERTEX_STREAMS 4
24 
/* Packed geometry state buffer. Shared between the CPU driver and GPU
 * kernels; the static assert below pins the packed layout (24 bytes) so both
 * sides agree.
 */
struct agx_geometry_state {
   /* Heap to allocate from, in either direction. By convention, the top is used
    * for intra-draw allocations and the bottom is used for full-batch
    * allocations. In the future we could use kernel support to improve this.
    */
   GLOBAL(uchar) heap;

   /* Allocation watermarks and total capacity. Presumably byte offsets into
    * the heap above — TODO(review): confirm units against the allocators.
    */
   uint32_t heap_bottom, heap_top, heap_size, padding;
} PACKED;
AGX_STATIC_ASSERT(sizeof(struct agx_geometry_state) == 6 * 4);
35 
/* Input-assembly state ("ia" — presumably input assembly; naming convention).
 * Shared between the CPU driver and GPU kernels; the static assert below pins
 * the packed layout (72 bytes).
 */
struct agx_ia_state {
   /* Heap to allocate from across draws */
   GLOBAL(struct agx_geometry_state) heap;

   /* Input: index buffer if present. */
   CONST(uchar) index_buffer;

   /* Input: draw count */
   CONST(uint) count;

   /* Input: indirect draw descriptor. Raw pointer since it's strided. */
   uint64_t draws;

   /* For the geom/tess path, this is the temporary prefix sum buffer.
    * Caller-allocated. For regular MDI, this is ok since the CPU knows the
    * worst-case draw count.
    */
   GLOBAL(uint) prefix_sums;

   /* When unrolling primitive restart, output draw descriptors */
   GLOBAL(uint) out_draws;

   /* Input: maximum draw count, count is clamped to this */
   uint32_t max_draws;

   /* Primitive restart index, if unrolling */
   uint32_t restart_index;

   /* Input index buffer size in bytes, if unrolling */
   uint32_t index_buffer_size_B;

   /* Stride for the draw descriptor array */
   uint32_t draw_stride;

   /* When unrolling primitive restart, use first vertex as the provoking vertex
    * for flat shading. We could stick this in the key, but meh, you're already
    * hosed for perf on the unroll path.
    */
   uint32_t flatshade_first;

   /* The index size (1, 2, 4) or 0 if drawing without an index buffer. */
   uint32_t index_size_B;
} PACKED;
AGX_STATIC_ASSERT(sizeof(struct agx_ia_state) == 18 * 4);
80 
/* Parameters for geometry shader and transform feedback processing. Shared
 * between the CPU driver and GPU kernels; the static assert below pins the
 * packed layout (332 bytes).
 */
struct agx_geometry_params {
   /* Persistent (cross-draw) geometry state */
   GLOBAL(struct agx_geometry_state) state;

   /* Address of associated indirect draw buffer */
   GLOBAL(uint) indirect_desc;

   /* Address of vertex shader output buffer */
   GLOBAL(uchar) vertex_buffer;

   /* Address of count buffer. For an indirect draw, this will be written by the
    * indirect setup kernel.
    */
   GLOBAL(uint) count_buffer;

   /* Address of the primitives generated counters */
   GLOBAL(uint) prims_generated_counter[MAX_VERTEX_STREAMS];
   GLOBAL(uint) xfb_prims_generated_counter[MAX_VERTEX_STREAMS];
   GLOBAL(uint) xfb_overflow[MAX_VERTEX_STREAMS];
   GLOBAL(uint) xfb_any_overflow;

   /* Pointers to transform feedback buffer offsets in bytes */
   GLOBAL(uint) xfb_offs_ptrs[MAX_SO_BUFFERS];

   /* Output index buffer, allocated by pre-GS. */
   GLOBAL(uint) output_index_buffer;

   /* Address of transform feedback buffer in general, supplied by the CPU. */
   GLOBAL(uchar) xfb_base_original[MAX_SO_BUFFERS];

   /* Address of transform feedback for the current primitive. Written by pre-GS
    * program.
    */
   GLOBAL(uchar) xfb_base[MAX_SO_BUFFERS];

   /* Bitfield of VS outputs. TODO: Optimize linked shaders. */
   uint64_t vs_outputs;

   /* Location-indexed mask of flat outputs, used for lowering GL edge flags. */
   uint64_t flat_outputs;

   /* Size in bytes of each transform feedback buffer */
   uint32_t xfb_size[MAX_SO_BUFFERS];

   /* Number of primitives emitted by transform feedback per stream. Written by
    * the pre-GS program.
    */
   uint32_t xfb_prims[MAX_VERTEX_STREAMS];

   /* Within an indirect GS draw, the grids used to dispatch the VS/GS written
    * out by the GS indirect setup kernel. Unused for direct GS draws.
    */
   uint32_t vs_grid[3];
   uint32_t gs_grid[3];

   /* Number of input vertices, part of the stride for the vertex buffer */
   uint32_t input_vertices;

   /* Number of input primitives across all instances, calculated by the CPU for
    * a direct draw or the GS indirect setup kernel for an indirect draw.
    */
   uint32_t input_primitives;

   /* Number of input primitives per instance, rounded up to a power-of-two and
    * with the base-2 log taken. This is used to partition the output vertex IDs
    * efficiently.
    */
   uint32_t primitives_log2;

   /* Number of bytes output by the GS count shader per input primitive (may be
    * 0), written by CPU and consumed by indirect draw setup shader for
    * allocating counts.
    */
   uint32_t count_buffer_stride;

   /* Dynamic input topology. Must be compatible with the geometry shader's
    * layout() declared input class.
    */
   uint32_t input_topology;
} PACKED;
AGX_STATIC_ASSERT(sizeof(struct agx_geometry_params) == 83 * 4);
161 
/* Parameters for tessellation processing. Shared between the CPU driver and
 * GPU kernels; the static assert below pins the packed layout (88 bytes).
 */
struct agx_tess_params {
   /* Persistent (cross-draw) geometry state */
   GLOBAL(struct agx_geometry_state) state;

   /* Patch coordinate offsets in patch_coord_buffer, indexed by patch ID. */
   GLOBAL(uint) patch_coord_offs;

   /* Patch coordinate buffer, indexed as:
    *
    *    patch_coord_offs[patch_ID] + vertex_in_patch
    *
    * Currently float2s, but we might be able to compact later?
    */
   GLOBAL(float2) patch_coord_buffer;

   /* Tessellation control shader output buffer, indexed by patch ID. */
   GLOBAL(uchar) tcs_buffer;

   /* Bitfield of TCS per-vertex outputs */
   uint64_t tcs_per_vertex_outputs;

   /* Default tess levels used in OpenGL when there is no TCS in the pipeline.
    * Unused in Vulkan and OpenGL ES.
    */
   float tess_level_outer_default[4];
   float tess_level_inner_default[4];

   /* Number of vertices in the input patch */
   uint input_patch_size;

   /* Number of vertices in the TCS output patch */
   uint output_patch_size;

   /* Number of patch constants written by TCS */
   uint tcs_patch_constants;

   /* Number of input patches per instance of the VS/TCS */
   uint patches_per_instance;
} PACKED;
AGX_STATIC_ASSERT(sizeof(struct agx_tess_params) == 22 * 4);
202 
203 /* TCS shared memory layout:
204  *
205  *    vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
206  *
207  * TODO: compact.
208  */
209 static inline uint
libagx_tcs_in_offs(uint vtx,gl_varying_slot location,uint64_t crosslane_vs_out_mask)210 libagx_tcs_in_offs(uint vtx, gl_varying_slot location,
211                    uint64_t crosslane_vs_out_mask)
212 {
213    uint base = vtx * libagx_popcount(crosslane_vs_out_mask);
214    uint offs = libagx_popcount(crosslane_vs_out_mask &
215                                (((uint64_t)(1) << location) - 1));
216 
217    return (base + offs) * 16;
218 }
219 
220 static inline uint
libagx_tcs_in_size(uint32_t vertices_in_patch,uint64_t crosslane_vs_out_mask)221 libagx_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask)
222 {
223    return vertices_in_patch * libagx_popcount(crosslane_vs_out_mask) * 16;
224 }
225 
226 /*
227  * TCS out buffer layout, per-patch:
228  *
229  *    float tess_level_outer[4];
230  *    float tess_level_inner[2];
231  *    vec4 patch_out[MAX_PATCH_OUTPUTS];
232  *    vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS];
233  *
234  * Vertex out are compacted based on the mask of written out. Patch
235  * out are used as-is.
236  *
237  * Bounding boxes are ignored.
238  */
239 static inline uint
libagx_tcs_out_offs(uint vtx_id,gl_varying_slot location,uint nr_patch_out,uint out_patch_size,uint64_t vtx_out_mask)240 libagx_tcs_out_offs(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
241                     uint out_patch_size, uint64_t vtx_out_mask)
242 {
243    uint off = 0;
244    if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
245       return off;
246 
247    off += 4 * sizeof(float);
248    if (location == VARYING_SLOT_TESS_LEVEL_INNER)
249       return off;
250 
251    off += 2 * sizeof(float);
252    if (location >= VARYING_SLOT_PATCH0)
253       return off + (16 * (location - VARYING_SLOT_PATCH0));
254 
255    /* Anything else is a per-vtx output */
256    off += 16 * nr_patch_out;
257    off += 16 * vtx_id * libagx_popcount(vtx_out_mask);
258 
259    uint idx = libagx_popcount(vtx_out_mask & (((uint64_t)(1) << location) - 1));
260    return off + (16 * idx);
261 }
262 
263 static inline uint
libagx_tcs_out_stride(uint nr_patch_out,uint out_patch_size,uint64_t vtx_out_mask)264 libagx_tcs_out_stride(uint nr_patch_out, uint out_patch_size,
265                       uint64_t vtx_out_mask)
266 {
267    return libagx_tcs_out_offs(out_patch_size, VARYING_SLOT_VAR0, nr_patch_out,
268                               out_patch_size, vtx_out_mask);
269 }
270 
/* In a tess eval shader, stride for hw vertex ID */
#define LIBAGX_TES_PATCH_ID_STRIDE 8192

#endif /* LIBAGX_GEOMETRY_H */
275