1 /*
2 * Copyright 2023 Alyssa Rosenzweig
3 * Copyright 2023 Valve Corporation
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "compiler/libcl/libcl.h"
8 #include "compiler/shader_enums.h"
9
10 #ifndef __OPENCL_VERSION__
11 #include "util/bitscan.h"
12 #define libagx_popcount(x) util_bitcount64(x)
/* Host fallback for OpenCL's sub_sat(): yields x - y, or 0 when y > x. This
 * only mirrors sub_sat() for unsigned operands (signed saturation is not
 * modelled). Both arguments are fully parenthesized so that compound
 * expressions expand with the intended precedence. Each argument is still
 * evaluated twice, so do not pass expressions with side effects.
 */
#define libagx_sub_sat(x, y) (((x) >= (y)) ? ((x) - (y)) : 0)
14 #else
15 #define libagx_popcount(x) popcount(x)
16 #define libagx_sub_sat(x, y) sub_sat(x, y)
17 #endif
18
19 #ifndef LIBAGX_GEOMETRY_H
20 #define LIBAGX_GEOMETRY_H
21
22 #define MAX_SO_BUFFERS 4
23 #define MAX_VERTEX_STREAMS 4
24
25 /* Packed geometry state buffer */
26 struct agx_geometry_state {
27 /* Heap to allocate from. */
28 DEVICE(uchar) heap;
29 uint32_t heap_bottom, heap_size;
30 } PACKED;
31 static_assert(sizeof(struct agx_geometry_state) == 4 * 4);
32
33 #ifdef __OPENCL_VERSION__
34 static inline global void *
agx_heap_alloc_nonatomic(global struct agx_geometry_state * heap,size_t size)35 agx_heap_alloc_nonatomic(global struct agx_geometry_state *heap, size_t size)
36 {
37 global void *out = heap->heap + heap->heap_bottom;
38 heap->heap_bottom += size;
39 return out;
40 }
41 #endif
42
/* Index buffer binding and per-instance vertex count (NOTE(review): "ia" is
 * presumably "input assembly" -- confirm). Packed; the size is pinned to four
 * 32-bit words by the static_assert below.
 */
struct agx_ia_state {
   /* Index buffer if present. */
   uint64_t index_buffer;

   /* Size of the bound index buffer for bounds checking. NOTE(review): the
    * _el suffix suggests this is counted in elements, not bytes -- confirm.
    */
   uint32_t index_buffer_range_el;

   /* Number of vertices per instance. Written by CPU for direct draw, indirect
    * setup kernel for indirect. This is used for VS->GS and VS->TCS indexing.
    */
   uint32_t verts_per_instance;
} PACKED;
/* One 64-bit field plus two 32-bit fields. */
static_assert(sizeof(struct agx_ia_state) == 4 * 4);
56
/*
 * Compute the address of element offset_el in an index buffer that holds
 * size_el elements of elsize_B bytes each.
 *
 * If offset_el is out of bounds (offset_el >= size_el), the caller-provided
 * zero_sink address is returned instead of an address past the buffer.
 *
 * The byte offset is computed in 64-bit. offset_el * elsize_B can exceed 32
 * bits, and a 32-bit multiply would silently wrap before being added to the
 * 64-bit base address.
 */
static inline uint64_t
libagx_index_buffer(uint64_t index_buffer, uint size_el, uint offset_el,
                    uint elsize_B, uint64_t zero_sink)
{
   if (offset_el >= size_el)
      return zero_sink;

   return index_buffer + ((uint64_t)offset_el * elsize_B);
}
66
67 static inline uint
libagx_index_buffer_range_el(uint size_el,uint offset_el)68 libagx_index_buffer_range_el(uint size_el, uint offset_el)
69 {
70 return libagx_sub_sat(size_el, offset_el);
71 }
72
/* Geometry parameters block. Per the field comments, some members are written
 * by the CPU and others by GPU kernels (indirect setup, pre-GS). The struct is
 * PACKED and its size is pinned by the static_assert below, so fields must not
 * be added, removed or reordered without updating that assertion.
 */
struct agx_geometry_params {
   /* Persistent (cross-draw) geometry state */
   DEVICE(struct agx_geometry_state) state;

   /* Address of associated indirect draw buffer */
   DEVICE(uint) indirect_desc;

   /* Address of count buffer. For an indirect draw, this will be written by the
    * indirect setup kernel.
    */
   DEVICE(uint) count_buffer;

   /* Address of the primitives generated counters */
   DEVICE(uint) prims_generated_counter[MAX_VERTEX_STREAMS];
   DEVICE(uint) xfb_prims_generated_counter[MAX_VERTEX_STREAMS];

   /* NOTE(review): presumably addresses of the transform feedback overflow
    * results, per stream and combined across streams -- confirm against users.
    */
   DEVICE(uint) xfb_overflow[MAX_VERTEX_STREAMS];
   DEVICE(uint) xfb_any_overflow;

   /* Pointers to transform feedback buffer offsets in bytes */
   DEVICE(uint) xfb_offs_ptrs[MAX_SO_BUFFERS];

   /* Output index buffer, allocated by pre-GS. */
   DEVICE(uint) output_index_buffer;

   /* Address of transform feedback buffer in general, supplied by the CPU. */
   DEVICE(uchar) xfb_base_original[MAX_SO_BUFFERS];

   /* Address of transform feedback for the current primitive. Written by pre-GS
    * program.
    */
   DEVICE(uchar) xfb_base[MAX_SO_BUFFERS];

   /* Address and present mask for the input to the geometry shader. These will
    * reflect the vertex shader for VS->GS or instead the tessellation
    * evaluation shader for TES->GS.
    */
   uint64_t input_buffer;
   uint64_t input_mask;

   /* Location-indexed mask of flat outputs, used for lowering GL edge flags. */
   uint64_t flat_outputs;

   /* NOTE(review): undocumented; presumably the size of each transform
    * feedback buffer -- confirm meaning and units.
    */
   uint32_t xfb_size[MAX_SO_BUFFERS];

   /* Number of primitives emitted by transform feedback per stream. Written by
    * the pre-GS program.
    */
   uint32_t xfb_prims[MAX_VERTEX_STREAMS];

   /* Within an indirect GS draw, the grids used to dispatch the VS/GS written
    * out by the GS indirect setup kernel or the CPU for a direct draw.
    */
   uint32_t vs_grid[3];
   uint32_t gs_grid[3];

   /* Number of input primitives across all instances, calculated by the CPU for
    * a direct draw or the GS indirect setup kernel for an indirect draw.
    */
   uint32_t input_primitives;

   /* Number of input primitives per instance, rounded up to a power-of-two and
    * with the base-2 log taken. This is used to partition the output vertex IDs
    * efficiently.
    */
   uint32_t primitives_log2;

   /* Number of bytes output by the GS count shader per input primitive (may be
    * 0), written by CPU and consumed by indirect draw setup shader for
    * allocating counts.
    */
   uint32_t count_buffer_stride;

   /* Dynamic input topology. Must be compatible with the geometry shader's
    * layout() declared input class.
    */
   uint32_t input_topology;
} PACKED;
/* Total size in 32-bit words; keep in sync with the fields above. */
static_assert(sizeof(struct agx_geometry_params) == 82 * 4);
151
152 /* TCS shared memory layout:
153 *
154 * vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
155 *
156 * TODO: compact.
157 */
158 static inline uint
libagx_tcs_in_offs_el(uint vtx,gl_varying_slot location,uint64_t crosslane_vs_out_mask)159 libagx_tcs_in_offs_el(uint vtx, gl_varying_slot location,
160 uint64_t crosslane_vs_out_mask)
161 {
162 uint base = vtx * libagx_popcount(crosslane_vs_out_mask);
163 uint offs = libagx_popcount(crosslane_vs_out_mask &
164 (((uint64_t)(1) << location) - 1));
165
166 return base + offs;
167 }
168
169 static inline uint
libagx_tcs_in_offs(uint vtx,gl_varying_slot location,uint64_t crosslane_vs_out_mask)170 libagx_tcs_in_offs(uint vtx, gl_varying_slot location,
171 uint64_t crosslane_vs_out_mask)
172 {
173 return libagx_tcs_in_offs_el(vtx, location, crosslane_vs_out_mask) * 16;
174 }
175
176 static inline uint
libagx_tcs_in_size(uint32_t vertices_in_patch,uint64_t crosslane_vs_out_mask)177 libagx_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask)
178 {
179 return vertices_in_patch * libagx_popcount(crosslane_vs_out_mask) * 16;
180 }
181
182 /*
183 * TCS out buffer layout, per-patch:
184 *
185 * float tess_level_outer[4];
186 * float tess_level_inner[2];
187 * vec4 patch_out[MAX_PATCH_OUTPUTS];
188 * vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS];
189 *
 * Vertex outputs are compacted based on the mask of written outputs. Patch
 * outputs are used as-is.
192 *
193 * Bounding boxes are ignored.
194 */
195 static inline uint
libagx_tcs_out_offs_el(uint vtx_id,gl_varying_slot location,uint nr_patch_out,uint64_t vtx_out_mask)196 libagx_tcs_out_offs_el(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
197 uint64_t vtx_out_mask)
198 {
199 uint off = 0;
200 if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
201 return off;
202
203 off += 4;
204 if (location == VARYING_SLOT_TESS_LEVEL_INNER)
205 return off;
206
207 off += 2;
208 if (location >= VARYING_SLOT_PATCH0)
209 return off + (4 * (location - VARYING_SLOT_PATCH0));
210
211 /* Anything else is a per-vtx output */
212 off += 4 * nr_patch_out;
213 off += 4 * vtx_id * libagx_popcount(vtx_out_mask);
214
215 uint idx = libagx_popcount(vtx_out_mask & (((uint64_t)(1) << location) - 1));
216 return off + (4 * idx);
217 }
218
219 static inline uint
libagx_tcs_out_offs(uint vtx_id,gl_varying_slot location,uint nr_patch_out,uint64_t vtx_out_mask)220 libagx_tcs_out_offs(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
221 uint64_t vtx_out_mask)
222 {
223 return libagx_tcs_out_offs_el(vtx_id, location, nr_patch_out, vtx_out_mask) *
224 4;
225 }
226
227 static inline uint
libagx_tcs_out_stride_el(uint nr_patch_out,uint out_patch_size,uint64_t vtx_out_mask)228 libagx_tcs_out_stride_el(uint nr_patch_out, uint out_patch_size,
229 uint64_t vtx_out_mask)
230 {
231 return libagx_tcs_out_offs_el(out_patch_size, 0, nr_patch_out, vtx_out_mask);
232 }
233
234 static inline uint
libagx_tcs_out_stride(uint nr_patch_out,uint out_patch_size,uint64_t vtx_out_mask)235 libagx_tcs_out_stride(uint nr_patch_out, uint out_patch_size,
236 uint64_t vtx_out_mask)
237 {
238 return libagx_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask) *
239 4;
240 }
241
242 /* In a tess eval shader, stride for hw vertex ID */
243 #define LIBAGX_TES_PATCH_ID_STRIDE 8192
244
245 static uint
libagx_compact_prim(enum mesa_prim prim)246 libagx_compact_prim(enum mesa_prim prim)
247 {
248 static_assert(MESA_PRIM_QUAD_STRIP == MESA_PRIM_QUADS + 1);
249 static_assert(MESA_PRIM_POLYGON == MESA_PRIM_QUADS + 2);
250
251 #ifndef __OPENCL_VERSION__
252 assert(prim != MESA_PRIM_QUADS);
253 assert(prim != MESA_PRIM_QUAD_STRIP);
254 assert(prim != MESA_PRIM_POLYGON);
255 assert(prim != MESA_PRIM_PATCHES);
256 #endif
257
258 return (prim >= MESA_PRIM_QUADS) ? (prim - 3) : prim;
259 }
260
261 static enum mesa_prim
libagx_uncompact_prim(uint packed)262 libagx_uncompact_prim(uint packed)
263 {
264 return (packed >= MESA_PRIM_QUADS) ? (packed + 3) : packed;
265 }
266
267 #endif
268