/*
 * Copyright 2022 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "compiler/nir/nir_builder.h"
#include "util/bitset.h"
#include "util/u_dynarray.h"
#include "agx_state.h"
#include "nir.h"
#include "nir_builder_opcodes.h"
#include "nir_intrinsics.h"
#include "nir_intrinsics_indices.h"
#include "shader_enums.h"

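/* Byte stride between consecutive texture descriptors; used to turn a texture
 * index into a byte offset in load_texture_handle.
 */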
#define AGX_TEXTURE_DESC_STRIDE 24

/*
 * Lower all system values to uniform loads. This pass tries to compact ranges
 * of contiguous uploaded uniforms to reduce the draw-time overhead of
 * uploading many tiny ranges. To do so, it works in 4 steps:
 *
 * 1. Lower NIR sysvals to loads from the system value buffers.
 * 2. Walk the NIR, recording loads from system value buffers.
 * 3. Walk the ranges of uniforms needed, compacting into contiguous ranges.
 * 4. Fill in the load_preamble instructions with the real uniforms.
 */

#define MAX_TABLE_SIZE sizeof(struct agx_stage_uniforms)
static_assert(sizeof(struct agx_draw_uniforms) <= MAX_TABLE_SIZE, "packed");

struct table_state {
   /* Bitset of 16-bit uniforms pushed */
   BITSET_DECLARE(pushed, MAX_TABLE_SIZE / 2);

   /* Element size in 16-bit units, so we may split ranges of different sizes
    * to guarantee natural alignment.
    */
   uint8_t element_size[MAX_TABLE_SIZE / 2];
};

struct state {
   /* Array of nir_intrinsic_instr's to fix up at the end */
   struct util_dynarray loads;

   struct table_state tables[AGX_NUM_SYSVAL_TABLES];
};

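/*
 * Load a system value at a given (table, offset). This emits an intermediate
 * load_sysval_agx intrinsic; lay_out_uniforms later rewrites it to a
 * load_preamble from the real uniform once ranges have been compacted.
 */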
static nir_def *
load_sysval(nir_builder *b, unsigned dim, unsigned bitsize, uint8_t table,
            uint16_t offset)
{
   return nir_load_sysval_agx(b, dim, bitsize, .desc_set = table,
                              .binding = offset);
}

static nir_def *
load_sysval_root(nir_builder *b, unsigned dim, unsigned bitsize, void *ptr)
{
   return load_sysval(b, dim, bitsize, AGX_SYSVAL_TABLE_ROOT, (uintptr_t)ptr);
}

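/*
 * Load an element of an array within a system value table. If the index is
 * constant, load the sysval directly. Otherwise, fetch the table's GPU address
 * from the root table and do an indirect global load.
 */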
static nir_def *
load_sysval_indirect(nir_builder *b, unsigned dim, unsigned bitsize,
                     uint8_t table, void *base, nir_def *offset_el)
{
   nir_scalar scalar = {offset_el, 0};
   unsigned stride = (dim * bitsize) / 8;

   if (nir_scalar_is_const(scalar)) {
      /* Load the sysval directly */
      return load_sysval(
         b, dim, bitsize, table,
         (uintptr_t)base + (nir_scalar_as_uint(scalar) * stride));
   } else {
      /* Load the base address of the table */
      struct agx_draw_uniforms *u = NULL;
      nir_def *table_base = load_sysval_root(b, 1, 64, &u->tables[table]);

      /* Load address of the array in the table */
      nir_def *array_base = nir_iadd_imm(b, table_base, (uintptr_t)base);

      /* Index into the table and load */
      nir_def *address = nir_iadd(
         b, array_base, nir_u2u64(b, nir_imul_imm(b, offset_el, stride)));
      return nir_load_global_constant(b, address, bitsize / 8, dim, bitsize);
   }
}

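/* Map the shader to its per-stage sysval table. A vertex shader with
 * vs.tes_agx set uses the tessellation evaluation table.
 */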
static unsigned
stage_table(nir_builder *b)
{
   gl_shader_stage stage = b->shader->info.stage;
   if (stage == MESA_SHADER_VERTEX && b->shader->info.vs.tes_agx)
      stage = MESA_SHADER_TESS_EVAL;

   assert(stage < PIPE_SHADER_TYPES);
   return AGX_SYSVAL_STAGE(stage);
}

static nir_def *
load_ubo(nir_builder *b, nir_intrinsic_instr *intr, void *bases)
{
   nir_def *base =
      load_sysval_indirect(b, 1, 64, stage_table(b), bases, intr->src[0].ssa);

   nir_def *address = nir_iadd(b, base, nir_u2u64(b, intr->src[1].ssa));

   return nir_load_global_constant(b, address, nir_intrinsic_align(intr),
                                   intr->num_components, intr->def.bit_size);
}

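/*
 * Lower a texture handle to a vec2 of (texture base uniform, byte offset of
 * the descriptor). Setting .flags asks lay_out_uniforms for the uniform's
 * location itself rather than a load of its value.
 */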
static nir_def *
load_texture_handle(nir_builder *b, nir_intrinsic_instr *intr, void *base)
{
   nir_def *uniform =
      nir_load_sysval_agx(b, 1, 64, .desc_set = stage_table(b),
                          .binding = (uintptr_t)base, .flags = ~0);

   return nir_vec2(
      b, nir_u2u32(b, uniform),
      nir_imul_imm(b, nir_u2u32(b, intr->src[0].ssa), AGX_TEXTURE_DESC_STRIDE));
}

static nir_def *
lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
                bool lower_draw_params)
{
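   /* NULL pointers serve as offset bases: taking the address of a member
    * yields its byte offset within the uniform struct (an offsetof-style
    * trick), which the load_sysval helpers cast back into a table offset.
    */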
   struct agx_draw_uniforms *u = NULL;
   struct agx_stage_uniforms *s = NULL;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_ubo:
      return load_ubo(b, intr, s->ubo_base);
   case nir_intrinsic_load_texture_handle_agx:
      return load_texture_handle(b, intr, &s->texture_base);
   case nir_intrinsic_load_sampler_handle_agx:
      return load_sysval_indirect(b, 1, 16, stage_table(b), &s->sampler_handle,
                                  intr->src[0].ssa);
   case nir_intrinsic_load_vbo_base_agx:
      return load_sysval_indirect(b, 1, 64, AGX_SYSVAL_TABLE_ROOT,
                                  &u->attrib_base, intr->src[0].ssa);
   case nir_intrinsic_load_attrib_clamp_agx:
      return load_sysval_indirect(b, 1, 32, AGX_SYSVAL_TABLE_ROOT,
                                  &u->attrib_clamp, intr->src[0].ssa);
   case nir_intrinsic_load_blend_const_color_r_float:
      return load_sysval_root(b, 1, 32, &u->blend_constant[0]);
   case nir_intrinsic_load_blend_const_color_g_float:
      return load_sysval_root(b, 1, 32, &u->blend_constant[1]);
   case nir_intrinsic_load_blend_const_color_b_float:
      return load_sysval_root(b, 1, 32, &u->blend_constant[2]);
   case nir_intrinsic_load_blend_const_color_a_float:
      return load_sysval_root(b, 1, 32, &u->blend_constant[3]);
   case nir_intrinsic_load_api_sample_mask_agx:
      return load_sysval_root(b, 1, 16, &u->sample_mask);
   case nir_intrinsic_load_sample_positions_agx:
      return load_sysval_root(b, 1, 32, &u->ppp_multisamplectl);
   case nir_intrinsic_load_stat_query_address_agx:
      return load_sysval_root(
         b, 1, 64, &u->pipeline_statistics[nir_intrinsic_base(intr)]);
   case nir_intrinsic_load_ssbo_address:
      return load_sysval_indirect(b, 1, 64, stage_table(b), &s->ssbo_base,
                                  intr->src[0].ssa);
   case nir_intrinsic_get_ubo_size:
      return load_sysval_indirect(b, 1, 32, stage_table(b), &s->ubo_size,
                                  intr->src[0].ssa);
   case nir_intrinsic_get_ssbo_size:
      return load_sysval_indirect(b, 1, 32, stage_table(b), &s->ssbo_size,
                                  intr->src[0].ssa);
   case nir_intrinsic_load_layer_id_written_agx:
      return load_sysval_root(b, 1, 16, &u->layer_id_written);
   case nir_intrinsic_load_input_assembly_buffer_agx:
      return load_sysval_root(b, 1, 64, &u->input_assembly);
   case nir_intrinsic_load_geometry_param_buffer_agx:
      return load_sysval_root(b, 1, 64, &u->geometry_params);
   case nir_intrinsic_load_tess_param_buffer_agx:
      return load_sysval_root(b, 1, 64, &u->tess_params);
   case nir_intrinsic_load_fixed_point_size_agx:
      return load_sysval_root(b, 1, 32, &u->fixed_point_size);
   case nir_intrinsic_load_tex_sprite_mask_agx:
      return load_sysval_root(b, 1, 16, &u->sprite_mask);
   case nir_intrinsic_load_clip_z_coeff_agx:
      return nir_f2f32(b, load_sysval_root(b, 1, 16, &u->clip_z_coeff));
   case nir_intrinsic_load_polygon_stipple_agx: {
      nir_def *base = load_sysval_root(b, 1, 64, &u->polygon_stipple);
      nir_def *row = intr->src[0].ssa;
      nir_def *addr = nir_iadd(b, base, nir_u2u64(b, nir_imul_imm(b, row, 4)));

      return nir_load_global_constant(b, addr, 4, 1, 32);
   }

   default:
      break;
   }

   if (!lower_draw_params)
      return NULL;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_num_workgroups:
      return load_sysval(b, 3, 32, AGX_SYSVAL_TABLE_GRID, 0);
   case nir_intrinsic_load_first_vertex:
      return load_sysval(b, 1, 32, AGX_SYSVAL_TABLE_PARAMS, 0);
   case nir_intrinsic_load_base_instance:
      return load_sysval(b, 1, 32, AGX_SYSVAL_TABLE_PARAMS, 4);
   case nir_intrinsic_load_base_vertex:
      /* first vertex if indexed, 0 otherwise. More efficient for our hw than
       * the lowering in NIR.
       */
      return nir_bcsel(
         b, nir_i2b(b, load_sysval_root(b, 1, 16, &u->is_indexed_draw)),
         load_sysval(b, 1, 32, AGX_SYSVAL_TABLE_PARAMS, 0), nir_imm_int(b, 0));
   case nir_intrinsic_load_draw_id:
      return load_sysval_root(b, 1, 32, &u->draw_id);
   default:
      return NULL;
   }
}

/* Step 1: Lower NIR sysvals */
static bool
lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
{
   bool *lower_draw_params = data;
   b->cursor = nir_before_instr(instr);
   nir_def *old;
   nir_def *replacement = NULL;

   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      old = &intr->def;
      replacement = lower_intrinsic(b, intr, *lower_draw_params);
   } else if (instr->type == nir_instr_type_tex) {
      nir_tex_instr *tex = nir_instr_as_tex(instr);
      old = &tex->def;

      if (tex->op != nir_texop_lod_bias_agx)
         return false;

      struct agx_stage_uniforms *s = NULL;

      int src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
      if (src_idx >= 0) {
         replacement = load_sysval_indirect(
            b, 1, 16, stage_table(b), s->lod_bias, tex->src[src_idx].src.ssa);
      } else {
         replacement = load_sysval(b, 1, 16, stage_table(b),
                                   (uintptr_t)&s->lod_bias[tex->sampler_index]);
      }
   }

   if (replacement != NULL) {
      nir_def_rewrite_uses(old, replacement);
      return true;
   } else {
      return false;
   }
}

/* Step 2: Record system value loads */
static bool
record_loads(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   if (intr->intrinsic != nir_intrinsic_load_sysval_agx)
      return false;

   assert(intr->def.bit_size >= 16 && "no 8-bit sysvals");
   unsigned dim = intr->def.num_components;
   unsigned element_size = intr->def.bit_size / 16;
   unsigned length = dim * element_size;

   struct state *state = data;
   struct table_state *table = &state->tables[nir_intrinsic_desc_set(intr)];
   unsigned offset = nir_intrinsic_binding(intr);
   assert((offset % 2) == 0 && "all entries are aligned by ABI");

   BITSET_SET_RANGE(table->pushed, (offset / 2), (offset / 2) + length - 1);

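   /* Record the element size of each pushed half-word. Overlapping pushes of
    * the same slot must agree on size so lay_out_table can keep ranges
    * naturally aligned.
    */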
   for (unsigned i = 0; i < length; ++i) {
      if (table->element_size[(offset / 2) + i])
         assert((table->element_size[(offset / 2) + i]) == element_size);
      else
         table->element_size[(offset / 2) + i] = element_size;
   }

   util_dynarray_append(&state->loads, nir_intrinsic_instr *, intr);
   return false;
}

/* Step 3: Decide where to push the system values */
static struct agx_push_range *
find_push_range_containing(struct agx_compiled_shader *shader, uint8_t table,
                           uint16_t offset)
{
   for (unsigned i = 0; i < shader->push_range_count; ++i) {
      struct agx_push_range *range = &shader->push[i];

      if (range->table != table)
         continue;

      /* range->length is in 16-bit words and offset is in bytes, so convert */
      uint16_t length_B = range->length * 2;

      if (range->offset <= offset && offset < (range->offset + length_B))
         return range;
   }

   unreachable("no containing range");
}

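/*
 * Lay out a single table: walk its pushed ranges, split wherever the element
 * size changes (or after 64 halfs), and record an agx_push_range for each
 * piece. Returns the first free uniform after the table.
 */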
static unsigned
lay_out_table(struct agx_compiled_shader *shader, struct table_state *state,
              unsigned table_index, unsigned uniform)
{
   unsigned start, end;
   BITSET_FOREACH_RANGE(start, end, state->pushed, sizeof(state->pushed) * 8) {
      unsigned range_start = start;

      do {
         uint8_t size = state->element_size[range_start];

         /* Find a range of constant element size. [range_start, range_end).
          * Ranges may be at most 64 halfs.
          */
         unsigned range_end;
         for (range_end = range_start + 1;
              range_end < end && state->element_size[range_end] == size &&
              range_end < range_start + 64;
              ++range_end)
            ;

         /* Now make the range with the given size (naturally aligned) */
         uniform = ALIGN_POT(uniform, size);

         assert((shader->push_range_count < ARRAY_SIZE(shader->push)) &&
                "AGX_MAX_PUSH_RANGES must be an upper bound");

         /* Offsets must be aligned to 4 bytes; this may require pushing a
          * little more than intended (otherwise we would need extra copies).
          */
         range_start = ROUND_DOWN_TO(range_start, 4 / 2);

         shader->push[shader->push_range_count++] = (struct agx_push_range){
            .uniform = uniform,
            .table = table_index,
            .offset = range_start * 2 /* bytes, not elements */,
            .length = (range_end - range_start),
         };

         uniform += (range_end - range_start);
         range_start = range_end;
      } while (range_start < end);
   }

   return uniform;
}

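/* Steps 3 and 4: lay out each table and then rewrite the recorded loads */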
static unsigned
lay_out_uniforms(struct agx_compiled_shader *shader, struct state *state)
{
   unsigned uniform = 0;

   /* Lay out each system value table. We do this backwards to ensure the first
    * uniform goes to the bindless texture base.
    */
   for (int t = AGX_NUM_SYSVAL_TABLES - 1; t >= 0; --t)
      uniform = lay_out_table(shader, &state->tables[t], t, uniform);

   /* Step 4: Fill in the loads */
   util_dynarray_foreach(&state->loads, nir_intrinsic_instr *, intr_) {
      nir_intrinsic_instr *intr = *intr_;
      uint8_t table = nir_intrinsic_desc_set(intr);
      uint16_t offset = nir_intrinsic_binding(intr);
      bool load_uniform_location = nir_intrinsic_flags(intr);

      struct agx_push_range *range =
         find_push_range_containing(shader, table, offset);
      unsigned base = range->uniform + ((offset - range->offset) / 2);

      nir_builder b = nir_builder_at(nir_instr_remove(&(intr->instr)));
      nir_def *repl;

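      /* If the flags index is set, the shader wants the uniform's location as
       * an immediate rather than a load of its value (see load_texture_handle).
       */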
      if (load_uniform_location) {
         repl = nir_imm_int(&b, base);
      } else {
         repl = nir_load_preamble(&b, intr->def.num_components,
                                  intr->def.bit_size, .base = base);
      }

      nir_def_rewrite_uses(&intr->def, repl);
   }

   return uniform;
}

bool
agx_nir_lower_sysvals(nir_shader *shader, enum pipe_shader_type desc_stage,
                      bool lower_draw_params)
{
   /* Override the stage for the duration of the pass. XXX: should refactor,
    * but it's annoying!
    */
   enum pipe_shader_type phys_stage = shader->info.stage;
   shader->info.stage = desc_stage;

   bool progress = nir_shader_instructions_pass(
      shader, lower_sysvals, nir_metadata_block_index | nir_metadata_dominance,
      &lower_draw_params);

   shader->info.stage = phys_stage;
   return progress;
}


bool
agx_nir_layout_uniforms(nir_shader *shader,
                        struct agx_compiled_shader *compiled,
                        unsigned *push_size)
{
   struct state state = {0};
   nir_shader_intrinsics_pass(shader, record_loads,
                              nir_metadata_block_index | nir_metadata_dominance,
                              &state);

   *push_size = lay_out_uniforms(compiled, &state);

   util_dynarray_fini(&state.loads);

   /* Make sure texture handles have constants associated */
   nir_opt_constant_folding(shader);

   return true;
}