1 /*
2 * Copyright 2023 Alyssa Rosenzweig
3 * Copyright 2023 Valve Corporation
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "compiler/shader_enums.h"
8 #include "libagx.h"
9
10 #ifndef __OPENCL_VERSION__
11 #include "util/bitscan.h"
12 #define CONST(type_) uint64_t
13 #define libagx_popcount(x) util_bitcount64(x)
14 #else
15 #define CONST(type_) constant type_ *
16 #define libagx_popcount(x) popcount(x)
17 #endif
18
19 #ifndef LIBAGX_GEOMETRY_H
20 #define LIBAGX_GEOMETRY_H
21
22 #define MAX_SO_BUFFERS 4
23 #define MAX_VERTEX_STREAMS 4
24
25 /* Packed geometry state buffer */
26 struct agx_geometry_state {
27 /* Heap to allocate from, in either direction. By convention, the top is used
28 * for intra-draw allocations and the bottom is used for full-batch
29 * allocations. In the future we could use kernel support to improve this.
30 */
31 GLOBAL(uchar) heap;
32 uint32_t heap_bottom, heap_top, heap_size, padding;
33 } PACKED;
34 AGX_STATIC_ASSERT(sizeof(struct agx_geometry_state) == 6 * 4);
35
/* Input assembly state: index buffer, (indirect) draw descriptors, and the
 * parameters needed to unroll primitive restart on the GPU. Shared CPU/GPU
 * layout (PACKED + size assert below).
 */
struct agx_ia_state {
   /* Heap to allocate from across draws */
   GLOBAL(struct agx_geometry_state) heap;

   /* Input: index buffer if present. */
   CONST(uchar) index_buffer;

   /* Input: draw count */
   CONST(uint) count;

   /* Input: indirect draw descriptor. Raw pointer since it's strided. */
   uint64_t draws;

   /* For the geom/tess path, this is the temporary prefix sum buffer.
    * Caller-allocated. For regular MDI, this is ok since the CPU knows the
    * worst-case draw count.
    */
   GLOBAL(uint) prefix_sums;

   /* When unrolling primitive restart, output draw descriptors */
   GLOBAL(uint) out_draws;

   /* Input: maximum draw count, count is clamped to this */
   uint32_t max_draws;

   /* Primitive restart index, if unrolling */
   uint32_t restart_index;

   /* Input index buffer size in bytes, if unrolling */
   uint32_t index_buffer_size_B;

   /* Stride for the draw descriptor array */
   uint32_t draw_stride;

   /* When unrolling primitive restart, use first vertex as the provoking vertex
    * for flat shading. We could stick this in the key, but meh, you're already
    * hosed for perf on the unroll path.
    */
   uint32_t flatshade_first;

   /* The index size (1, 2, 4) or 0 if drawing without an index buffer. */
   uint32_t index_size_B;
} PACKED;
AGX_STATIC_ASSERT(sizeof(struct agx_ia_state) == 18 * 4);
80
/* Per-draw geometry shader parameters, filled out partly by the CPU and partly
 * by the indirect setup / pre-GS kernels as documented per-field. Shared
 * CPU/GPU layout (PACKED + size assert below).
 */
struct agx_geometry_params {
   /* Persistent (cross-draw) geometry state */
   GLOBAL(struct agx_geometry_state) state;

   /* Address of associated indirect draw buffer */
   GLOBAL(uint) indirect_desc;

   /* Address of vertex shader output buffer */
   GLOBAL(uchar) vertex_buffer;

   /* Address of count buffer. For an indirect draw, this will be written by the
    * indirect setup kernel.
    */
   GLOBAL(uint) count_buffer;

   /* Address of the primitives generated counters */
   GLOBAL(uint) prims_generated_counter[MAX_VERTEX_STREAMS];
   GLOBAL(uint) xfb_prims_generated_counter[MAX_VERTEX_STREAMS];

   /* Per-stream transform feedback overflow flags, plus a single any-stream
    * flag — presumably for overflow queries; confirm against the users.
    */
   GLOBAL(uint) xfb_overflow[MAX_VERTEX_STREAMS];
   GLOBAL(uint) xfb_any_overflow;

   /* Pointers to transform feedback buffer offsets in bytes */
   GLOBAL(uint) xfb_offs_ptrs[MAX_SO_BUFFERS];

   /* Output index buffer, allocated by pre-GS. */
   GLOBAL(uint) output_index_buffer;

   /* Address of transform feedback buffer in general, supplied by the CPU. */
   GLOBAL(uchar) xfb_base_original[MAX_SO_BUFFERS];

   /* Address of transform feedback for the current primitive. Written by pre-GS
    * program.
    */
   GLOBAL(uchar) xfb_base[MAX_SO_BUFFERS];

   /* Bitfield of VS outputs. TODO: Optimize linked shaders. */
   uint64_t vs_outputs;

   /* Location-indexed mask of flat outputs, used for lowering GL edge flags. */
   uint64_t flat_outputs;

   /* Per-buffer transform feedback size — presumably bytes available for
    * writing; TODO confirm against the pre-GS program.
    */
   uint32_t xfb_size[MAX_SO_BUFFERS];

   /* Number of primitives emitted by transform feedback per stream. Written by
    * the pre-GS program.
    */
   uint32_t xfb_prims[MAX_VERTEX_STREAMS];

   /* Within an indirect GS draw, the grids used to dispatch the VS/GS written
    * out by the GS indirect setup kernel. Unused for direct GS draws.
    */
   uint32_t vs_grid[3];
   uint32_t gs_grid[3];

   /* Number of input vertices, part of the stride for the vertex buffer */
   uint32_t input_vertices;

   /* Number of input primitives across all instances, calculated by the CPU for
    * a direct draw or the GS indirect setup kernel for an indirect draw.
    */
   uint32_t input_primitives;

   /* Number of input primitives per instance, rounded up to a power-of-two and
    * with the base-2 log taken. This is used to partition the output vertex IDs
    * efficiently.
    */
   uint32_t primitives_log2;

   /* Number of bytes output by the GS count shader per input primitive (may be
    * 0), written by CPU and consumed by indirect draw setup shader for
    * allocating counts.
    */
   uint32_t count_buffer_stride;

   /* Dynamic input topology. Must be compatible with the geometry shader's
    * layout() declared input class.
    */
   uint32_t input_topology;
} PACKED;
AGX_STATIC_ASSERT(sizeof(struct agx_geometry_params) == 83 * 4);
161
/* Per-draw tessellation parameters consumed by the VS/TCS/TES pipeline.
 * Shared CPU/GPU layout (PACKED + size assert below).
 */
struct agx_tess_params {
   /* Persistent (cross-draw) geometry state */
   GLOBAL(struct agx_geometry_state) state;

   /* Patch coordinate offsets in patch_coord_buffer, indexed by patch ID. */
   GLOBAL(uint) patch_coord_offs;

   /* Patch coordinate buffer, indexed as:
    *
    *    patch_coord_offs[patch_ID] + vertex_in_patch
    *
    * Currently float2s, but we might be able to compact later?
    */
   GLOBAL(float2) patch_coord_buffer;

   /* Tessellation control shader output buffer, indexed by patch ID. */
   GLOBAL(uchar) tcs_buffer;

   /* Bitfield of TCS per-vertex outputs */
   uint64_t tcs_per_vertex_outputs;

   /* Default tess levels used in OpenGL when there is no TCS in the pipeline.
    * Unused in Vulkan and OpenGL ES.
    */
   float tess_level_outer_default[4];
   float tess_level_inner_default[4];

   /* Number of vertices in the input patch */
   uint input_patch_size;

   /* Number of vertices in the TCS output patch */
   uint output_patch_size;

   /* Number of patch constants written by TCS */
   uint tcs_patch_constants;

   /* Number of input patches per instance of the VS/TCS */
   uint patches_per_instance;
} PACKED;
AGX_STATIC_ASSERT(sizeof(struct agx_tess_params) == 22 * 4);
202
203 /* TCS shared memory layout:
204 *
205 * vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
206 *
207 * TODO: compact.
208 */
209 static inline uint
libagx_tcs_in_offs(uint vtx,gl_varying_slot location,uint64_t crosslane_vs_out_mask)210 libagx_tcs_in_offs(uint vtx, gl_varying_slot location,
211 uint64_t crosslane_vs_out_mask)
212 {
213 uint base = vtx * libagx_popcount(crosslane_vs_out_mask);
214 uint offs = libagx_popcount(crosslane_vs_out_mask &
215 (((uint64_t)(1) << location) - 1));
216
217 return (base + offs) * 16;
218 }
219
220 static inline uint
libagx_tcs_in_size(uint32_t vertices_in_patch,uint64_t crosslane_vs_out_mask)221 libagx_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask)
222 {
223 return vertices_in_patch * libagx_popcount(crosslane_vs_out_mask) * 16;
224 }
225
226 /*
227 * TCS out buffer layout, per-patch:
228 *
229 * float tess_level_outer[4];
230 * float tess_level_inner[2];
231 * vec4 patch_out[MAX_PATCH_OUTPUTS];
232 * vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS];
233 *
 * Per-vertex outputs are compacted according to the mask of outputs actually
 * written. Patch outputs are used as-is.
236 *
237 * Bounding boxes are ignored.
238 */
239 static inline uint
libagx_tcs_out_offs(uint vtx_id,gl_varying_slot location,uint nr_patch_out,uint out_patch_size,uint64_t vtx_out_mask)240 libagx_tcs_out_offs(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
241 uint out_patch_size, uint64_t vtx_out_mask)
242 {
243 uint off = 0;
244 if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
245 return off;
246
247 off += 4 * sizeof(float);
248 if (location == VARYING_SLOT_TESS_LEVEL_INNER)
249 return off;
250
251 off += 2 * sizeof(float);
252 if (location >= VARYING_SLOT_PATCH0)
253 return off + (16 * (location - VARYING_SLOT_PATCH0));
254
255 /* Anything else is a per-vtx output */
256 off += 16 * nr_patch_out;
257 off += 16 * vtx_id * libagx_popcount(vtx_out_mask);
258
259 uint idx = libagx_popcount(vtx_out_mask & (((uint64_t)(1) << location) - 1));
260 return off + (16 * idx);
261 }
262
263 static inline uint
libagx_tcs_out_stride(uint nr_patch_out,uint out_patch_size,uint64_t vtx_out_mask)264 libagx_tcs_out_stride(uint nr_patch_out, uint out_patch_size,
265 uint64_t vtx_out_mask)
266 {
267 return libagx_tcs_out_offs(out_patch_size, VARYING_SLOT_VAR0, nr_patch_out,
268 out_patch_size, vtx_out_mask);
269 }
270
271 /* In a tess eval shader, stride for hw vertex ID */
272 #define LIBAGX_TES_PATCH_ID_STRIDE 8192
273
274 #endif
275