/*
 * Copyright 2018-2021 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include "compiler/nir/nir.h"
#include "util/u_dynarray.h"
#include "util/u_tristate.h"
#include "shader_enums.h"

struct agx_cf_binding {
   /* Base coefficient register */
   uint8_t cf_base;

   /* Slot being bound */
   gl_varying_slot slot : 8;

   /* First component bound.
    *
    * Must be 2 (Z) or 3 (W) if slot == VARYING_SLOT_POS.
    */
   unsigned offset : 2;

   /* Number of components bound */
   unsigned count : 3;

   /* Is smooth shading enabled? If false, flat shading is used */
   bool smooth : 1;

   /* Perspective correct interpolation */
   bool perspective : 1;

   uint8_t pad;
};

/* Conservative bound, * 4 due to offsets (TODO: maybe worth eliminating
 * coefficient register aliasing?)
 */
#define AGX_MAX_CF_BINDINGS (VARYING_SLOT_MAX * 4)

struct agx_varyings_fs {
   /* Number of coefficient registers used */
   unsigned nr_cf;

   /* Number of coefficient register bindings */
   unsigned nr_bindings;

   /* Whether gl_FragCoord.z is read */
   bool reads_z;

   /* Coefficient register bindings */
   struct agx_cf_binding bindings[AGX_MAX_CF_BINDINGS];
};

union agx_varyings {
   struct agx_varyings_fs fs;
};

struct agx_interp_info {
   /* Bit masks indexed by I/O location of flat and linear varyings */
   uint64_t flat;
   uint64_t linear;
};
static_assert(sizeof(struct agx_interp_info) == 16, "packed");
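
/* Illustrative sketch (not part of the original interface): the masks above
 * are indexed by gl_varying_slot, so a driver could query the interpolation
 * qualifier of a single input as shown below. The helper name is hypothetical.
 */
static inline bool
agx_interp_info_is_flat(struct agx_interp_info info, gl_varying_slot slot)
{
   return (info.flat >> slot) & 1;
}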

struct agx_rodata {
   /* Offset in the binary */
   uint32_t offset;

   /* Base uniform to map constants */
   uint16_t base_uniform;

   /* Number of 16-bit constants to map contiguously there */
   uint16_t size_16;
};
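
/* Illustrative sketch of how a driver might honour this mapping. The helper
 * and the assumption that the push array is indexed in 16-bit uniform
 * registers are hypothetical, not part of the original interface.
 */
static inline void
agx_map_rodata_example(const struct agx_rodata *ro, const void *binary,
                       uint16_t *push)
{
   /* Constants live inside the compiled binary at the given byte offset */
   const uint16_t *consts =
      (const uint16_t *)((const uint8_t *)binary + ro->offset);

   /* Copy them contiguously starting at the base uniform register */
   for (uint16_t i = 0; i < ro->size_16; ++i)
      push[ro->base_uniform + i] = consts[i];
}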

struct agx_shader_info {
   enum pipe_shader_type stage;
   uint32_t binary_size;

   union agx_varyings varyings;

   /* Number of uniforms */
   uint16_t push_count;

   /* Local memory allocation in bytes */
   uint16_t local_size;

   /* Local imageblock allocation in bytes per thread */
   uint16_t imageblock_stride;

   /* Scratch memory allocation in bytes for main/preamble respectively */
   unsigned scratch_size, preamble_scratch_size;

   /* Size in bytes of the main shader */
   unsigned main_size;

   /* Does the shader have a preamble? If so, it is at offset preamble_offset.
    * The main shader is at offset main_offset. The preamble is executed first.
    */
   bool has_preamble;
   unsigned preamble_offset, main_offset;

   /* Does the shader read the tilebuffer? */
   bool reads_tib;

   /* Does the shader require early fragment tests? */
   bool early_fragment_tests;

   /* Does the shader potentially draw to a nonzero viewport? */
   bool nonzero_viewport;

   /* Does the shader write layer and/or viewport index? Written together */
   bool writes_layer_viewport;

   /* Does the shader control the sample mask? */
   bool writes_sample_mask;

   /* Depth layout, never equal to NONE */
   enum gl_frag_depth_layout depth_layout;

   /* Based only on the compiled shader, should tag writes be disabled? This is
    * set based on what is output. Note that if rasterizer discard is used, tag
    * writes must be disabled regardless of this flag.
    */
   bool tag_write_disable;

   /* Shader is incompatible with triangle merging */
   bool disable_tri_merging;

   /* Reads draw ID system value */
   bool uses_draw_id;

   /* Reads base vertex/instance */
   bool uses_base_param;

   /* Uses txf and hence needs a txf sampler mapped */
   bool uses_txf;

   /* Number of 16-bit registers used by the main shader and preamble
    * respectively.
    */
   uint16_t nr_gprs, nr_preamble_gprs;

   /* Output mask set during driver lowering */
   uint64_t outputs;

   /* Workgroup size */
   uint16_t workgroup_size[3];

   /* There may be constants in the binary. The driver must map these to
    * uniform registers as specified here.
    */
   struct agx_rodata rodata;
};

struct agx_precompiled_kernel_info {
   uint32_t preamble_offset, main_offset;
   uint32_t main_size, binary_size;
   struct agx_rodata rodata;
   uint16_t nr_gprs, nr_preamble_gprs;
   uint16_t push_count;
   uint16_t workgroup_size[3];
   uint16_t local_size;
   uint16_t imageblock_stride;
   bool uses_txf;
};

static inline struct agx_precompiled_kernel_info
agx_compact_kernel_info(struct agx_shader_info *info)
{
   assert(info->has_preamble == (info->nr_preamble_gprs > 0));

   return (struct agx_precompiled_kernel_info){
      .preamble_offset = info->preamble_offset,
      .main_offset = info->main_offset,
      .main_size = info->main_size,
      .binary_size = info->binary_size,
      .rodata = info->rodata,
      .nr_gprs = info->nr_gprs,
      .nr_preamble_gprs = info->nr_preamble_gprs,
      .push_count = info->push_count,
      .workgroup_size = {info->workgroup_size[0], info->workgroup_size[1],
                         info->workgroup_size[2]},
      .local_size = info->local_size,
      .imageblock_stride = info->imageblock_stride,
      .uses_txf = info->uses_txf,
   };
}

struct agx_shader_part {
   struct agx_shader_info info;
   void *binary;
};

#define AGX_MAX_RTS (8)

enum agx_format {
   AGX_FORMAT_I8 = 0,
   AGX_FORMAT_I16 = 1,
   AGX_FORMAT_I32 = 2,
   AGX_FORMAT_F16 = 3,
   AGX_FORMAT_U8NORM = 4,
   AGX_FORMAT_S8NORM = 5,
   AGX_FORMAT_U16NORM = 6,
   AGX_FORMAT_S16NORM = 7,
   AGX_FORMAT_RGB10A2 = 8,
   AGX_FORMAT_SRGBA8 = 10,
   AGX_FORMAT_RG11B10F = 12,
   AGX_FORMAT_RGB9E5 = 13,

   /* Keep last */
   AGX_NUM_FORMATS,
};

struct agx_fs_shader_key {
   /* Normally, access to the tilebuffer must be guarded by appropriate fencing
    * instructions to ensure correct results in the presence of out-of-order
    * hardware optimizations. However, specially dispatched clear shaders are
    * not subject to these conditions and can omit the wait instructions.
    *
    * Must (only) be set for special clear shaders.
    *
    * Must not be used with sample mask writes (including discards) or
    * tilebuffer loads (including blending).
    */
   bool ignore_tib_dependencies;

   /* When dynamic sample shading is used, the fragment shader is wrapped in a
    * loop external to the API shader. This bit indicates that we are compiling
    * inside the sample loop, meaning the execution nesting counter is already
    * zero and must be preserved.
    */
   bool inside_sample_loop;

   /* Base coefficient register. 0 for API shaders but nonzero for FS prolog */
   uint8_t cf_base;
};

struct agx_device_key {
   /* Does the target GPU need explicit cluster coherency for atomics?
    * Only used on G13X.
    */
   enum u_tristate needs_g13x_coherency;

   /* Is soft fault enabled? This is technically system-wide policy set by the
    * kernel, but that's functionally a hardware feature.
    */
   bool soft_fault;
};

struct agx_shader_key {
   /* Device info */
   struct agx_device_key dev;

   /* Number of reserved preamble slots at the start */
   unsigned reserved_preamble;

   /* Library routines to link against */
   const nir_shader *libagx;

   /* Whether scratch memory is available in the given shader stage */
   bool has_scratch;

   /* Whether we're compiling the helper program used for scratch allocation.
    * This has special register allocation requirements.
    */
   bool is_helper;

   /* Whether the driver supports uploading constants for this shader. If
    * false, constants will not be promoted to uniforms.
    */
   bool promote_constants;

   /* Set if this is a non-monolithic shader that must be linked with
    * additional shader parts before the program can be used. This suppresses
    * emission of `stop` instructions, which the linker must insert instead.
    */
   bool no_stop;

   /* Set if this is a secondary shader part (prolog or epilog). This prevents
    * the compiler from allocating uniform registers. For example, this turns
    * off preambles.
    */
   bool secondary;

   union {
      struct agx_fs_shader_key fs;
   };
};

struct agx_interp_info agx_gather_interp_info(nir_shader *nir);
uint64_t agx_gather_texcoords(nir_shader *nir);

void agx_link_libagx(nir_shader *nir, const nir_shader *libagx);
void agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx);
bool agx_nir_lower_discard_zs_emit(nir_shader *s);
bool agx_nir_lower_sample_mask(nir_shader *s);
bool agx_nir_lower_interpolation(nir_shader *s);

bool agx_nir_lower_cull_distance_vs(struct nir_shader *s);
bool agx_nir_lower_cull_distance_fs(struct nir_shader *s,
                                    unsigned nr_distances);
bool agx_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
                          unsigned bit_size, unsigned num_components,
                          int64_t hole_size, nir_intrinsic_instr *low,
                          nir_intrinsic_instr *high, void *data);

void agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
                            struct util_debug_callback *debug,
                            struct agx_shader_part *out);
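
/* Usage sketch (illustrative; a real driver fills in more key fields and does
 * error handling). nir, libagx and debug are assumed to be provided by the
 * caller:
 *
 *    struct agx_shader_key key = {
 *       .libagx = libagx,
 *       .promote_constants = true,
 *    };
 *    struct agx_shader_part part = {0};
 *
 *    agx_preprocess_nir(nir, libagx);
 *    agx_compile_shader_nir(nir, &key, debug, &part);
 *
 *    // part.info describes the compiled program, part.binary holds the code
 */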

struct agx_occupancy {
   unsigned max_registers;
   unsigned max_threads;
};

struct agx_occupancy agx_occupancy_for_register_count(unsigned halfregs);
unsigned agx_max_registers_for_occupancy(unsigned occupancy);
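
/* Example (illustrative): given a compiled shader's demand in 16-bit
 * registers (nr_gprs), look up how many threads can be resident:
 *
 *    struct agx_occupancy occ =
 *       agx_occupancy_for_register_count(info->nr_gprs);
 *    unsigned threads = occ.max_threads;
 */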

static const nir_shader_compiler_options agx_nir_options = {
   .lower_fdiv = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .lower_flrp16 = true,
   .lower_flrp32 = true,
   .lower_fpow = true,
   .lower_fmod = true,
   .lower_bitfield_insert = true,
   .lower_ifind_msb = true,
   .lower_find_lsb = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_fisnormal = true,
   .lower_scmp = true,
   .lower_isign = true,
   .lower_fsign = true,
   .lower_iabs = true,
   .lower_fminmax_signed_zero = true,
   .lower_fdph = true,
   .lower_ffract = true,
   .lower_ldexp = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_unorm_2x16 = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_pack_64_2x32 = true,
   .lower_unpack_half_2x16 = true,
   .lower_unpack_unorm_2x16 = true,
   .lower_unpack_snorm_2x16 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_extract_byte = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .has_cs_global_id = true,
   .lower_device_index_to_zero = true,
   .lower_hadd = true,
   .vectorize_io = true,
   .has_amul = true,
   .has_isub = true,
   .support_16bit_alu = true,
   .max_unroll_iterations = 32,
   .lower_uniforms_to_ubo = true,
   .late_lower_int64 = true,
   .lower_int64_options =
      (nir_lower_int64_options) ~(nir_lower_iadd64 | nir_lower_imul_2x32_64),
   .lower_doubles_options = (nir_lower_doubles_options)(~0),
   .support_indirect_inputs = (uint8_t)BITFIELD_MASK(PIPE_SHADER_TYPES),
   .support_indirect_outputs = (uint8_t)BITFIELD_MASK(PIPE_SHADER_TYPES),
   .lower_fquantize2f16 = true,
   .compact_arrays = true,
   .discard_is_demote = true,
   .scalarize_ddx = true,
   .io_options = nir_io_always_interpolate_convergent_fs_inputs,
};