/*
 * Copyright 2022 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "compiler/nir/nir_builder.h"
#include "util/bitset.h"
#include "util/u_dynarray.h"
#include "agx_state.h"
#include "nir.h"
#include "nir_builder_opcodes.h"
#include "nir_intrinsics.h"
#include "nir_intrinsics_indices.h"
#include "shader_enums.h"

#define AGX_TEXTURE_DESC_STRIDE 24

/*
 * Lower all system values to uniform loads. This pass tries to compact ranges
 * of contiguous uploaded uniforms to reduce the draw-time overhead of uploading
 * many tiny ranges. To do so, it works in 4 steps:
 *
 * 1. Lower NIR sysvals to loads from the system value buffers.
 * 2. Walk the NIR, recording loads from system value buffers.
 * 3. Walk the ranges of uniforms needed, compacting into contiguous ranges.
 * 4. Fill in the load_preamble instructions with the real uniforms.
 */

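/*
 * Illustrative end-to-end example (the exact uniform index is chosen at
 * layout time, so treat the numbers below as hypothetical):
 *
 *    load_first_vertex                                           original NIR
 * -> load_sysval_agx .desc_set=AGX_SYSVAL_TABLE_PARAMS .binding=0      step 1
 * -> load_preamble .base=N                                          steps 2-4
 *
 * where N is the 16-bit uniform register index assigned by lay_out_uniforms()
 * once all recorded ranges have been compacted.
 */
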
#define MAX_TABLE_SIZE sizeof(struct agx_stage_uniforms)
static_assert(sizeof(struct agx_draw_uniforms) <= MAX_TABLE_SIZE, "packed");

struct table_state {
   /* Bitset of 16-bit uniforms pushed */
   BITSET_DECLARE(pushed, MAX_TABLE_SIZE / 2);

   /* Element size in 16-bit units, so we may split ranges of different sizes
    * to guarantee natural alignment.
    */
   uint8_t element_size[MAX_TABLE_SIZE / 2];
};

struct state {
   /* Array of nir_intrinsic_instr's to fix up at the end */
   struct util_dynarray loads;

   struct table_state tables[AGX_NUM_SYSVAL_TABLES];
};

static nir_def *
load_sysval(nir_builder *b, unsigned dim, unsigned bitsize, uint8_t table,
            uint16_t offset)
{
   return nir_load_sysval_agx(b, dim, bitsize, .desc_set = table,
                              .binding = offset);
}

static nir_def *
load_sysval_root(nir_builder *b, unsigned dim, unsigned bitsize, void *ptr)
{
   return load_sysval(b, dim, bitsize, AGX_SYSVAL_TABLE_ROOT, (uintptr_t)ptr);
}

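/*
 * Callers of load_sysval_root() build the "pointer" from a NULL
 * struct agx_draw_uniforms pointer (e.g. &u->draw_id with u == NULL), so the
 * (uintptr_t) cast above is effectively an offsetof() into the root table.
 * A rough equivalent, for illustration:
 *
 *    struct agx_draw_uniforms *u = NULL;
 *    load_sysval_root(b, 1, 32, &u->draw_id)
 *       == load_sysval(b, 1, 32, AGX_SYSVAL_TABLE_ROOT,
 *                      offsetof(struct agx_draw_uniforms, draw_id));
 */
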
static nir_def *
load_sysval_indirect(nir_builder *b, unsigned dim, unsigned bitsize,
                     uint8_t table, void *base, nir_def *offset_el)
{
   nir_scalar scalar = {offset_el, 0};
   unsigned stride = (dim * bitsize) / 8;

   if (nir_scalar_is_const(scalar)) {
      /* Load the sysval directly */
      return load_sysval(
         b, dim, bitsize, table,
         (uintptr_t)base + (nir_scalar_as_uint(scalar) * stride));
   } else {
      /* Load the base address of the table */
      struct agx_draw_uniforms *u = NULL;
      nir_def *table_base = load_sysval_root(b, 1, 64, &u->tables[table]);

      /* Load address of the array in the table */
      nir_def *array_base = nir_iadd_imm(b, table_base, (uintptr_t)base);

      /* Index into the table and load */
      nir_def *address = nir_iadd(
         b, array_base, nir_u2u64(b, nir_imul_imm(b, offset_el, stride)));
      return nir_load_global_constant(b, address, bitsize / 8, dim, bitsize);
   }
}

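/*
 * Worked example for load_sysval_indirect(): load_vbo_base_agx is 1 x 64-bit,
 * so stride = 8 bytes. With a non-constant index i, the else-branch above
 * builds roughly
 *
 *    addr = root.tables[AGX_SYSVAL_TABLE_ROOT]
 *         + offsetof(struct agx_draw_uniforms, attrib_base) + i * 8;
 *    load_global_constant(addr)
 *
 * while a constant index folds into a plain load_sysval() at the fixed offset.
 */
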
static unsigned
stage_table(nir_builder *b)
{
   gl_shader_stage stage = b->shader->info.stage;
   if (stage == MESA_SHADER_VERTEX && b->shader->info.vs.tes_agx)
      stage = MESA_SHADER_TESS_EVAL;

   assert(stage < PIPE_SHADER_TYPES);
   return AGX_SYSVAL_STAGE(stage);
}

static nir_def *
load_ubo(nir_builder *b, nir_intrinsic_instr *intr, void *bases)
{
   nir_def *base =
      load_sysval_indirect(b, 1, 64, stage_table(b), bases, intr->src[0].ssa);

   nir_def *address = nir_iadd(b, base, nir_u2u64(b, intr->src[1].ssa));

   return nir_load_global_constant(b, address, nir_intrinsic_align(intr),
                                   intr->num_components, intr->def.bit_size);
}

static nir_def *
load_texture_handle(nir_builder *b, nir_intrinsic_instr *intr, void *base)
{
   nir_def *uniform =
      nir_load_sysval_agx(b, 1, 64, .desc_set = stage_table(b),
                          .binding = (uintptr_t)base, .flags = ~0);

   return nir_vec2(
      b, nir_u2u32(b, uniform),
      nir_imul_imm(b, nir_u2u32(b, intr->src[0].ssa), AGX_TEXTURE_DESC_STRIDE));
}

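/*
 * Note that .flags = ~0 above asks step 4 (lay_out_uniforms) for the uniform
 * *location* of the texture descriptor base rather than its value. The handle
 * is therefore roughly
 *
 *    vec2(texture_base_uniform, texture_index * AGX_TEXTURE_DESC_STRIDE)
 *
 * i.e. a (uniform index, byte offset into the descriptor array) pair.
 */
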
static nir_def *
lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
                bool lower_draw_params)
{
   struct agx_draw_uniforms *u = NULL;
   struct agx_stage_uniforms *s = NULL;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_ubo:
      return load_ubo(b, intr, s->ubo_base);
   case nir_intrinsic_load_texture_handle_agx:
      return load_texture_handle(b, intr, &s->texture_base);
   case nir_intrinsic_load_sampler_handle_agx:
      return load_sysval_indirect(b, 1, 16, stage_table(b), &s->sampler_handle,
                                  intr->src[0].ssa);
   case nir_intrinsic_load_vbo_base_agx:
      return load_sysval_indirect(b, 1, 64, AGX_SYSVAL_TABLE_ROOT,
                                  &u->attrib_base, intr->src[0].ssa);
   case nir_intrinsic_load_attrib_clamp_agx:
      return load_sysval_indirect(b, 1, 32, AGX_SYSVAL_TABLE_ROOT,
                                  &u->attrib_clamp, intr->src[0].ssa);
   case nir_intrinsic_load_blend_const_color_r_float:
      return load_sysval_root(b, 1, 32, &u->blend_constant[0]);
   case nir_intrinsic_load_blend_const_color_g_float:
      return load_sysval_root(b, 1, 32, &u->blend_constant[1]);
   case nir_intrinsic_load_blend_const_color_b_float:
      return load_sysval_root(b, 1, 32, &u->blend_constant[2]);
   case nir_intrinsic_load_blend_const_color_a_float:
      return load_sysval_root(b, 1, 32, &u->blend_constant[3]);
   case nir_intrinsic_load_api_sample_mask_agx:
      return load_sysval_root(b, 1, 16, &u->sample_mask);
   case nir_intrinsic_load_sample_positions_agx:
      return load_sysval_root(b, 1, 32, &u->ppp_multisamplectl);
   case nir_intrinsic_load_stat_query_address_agx:
      return load_sysval_root(
         b, 1, 64, &u->pipeline_statistics[nir_intrinsic_base(intr)]);
   case nir_intrinsic_load_ssbo_address:
      return load_sysval_indirect(b, 1, 64, stage_table(b), &s->ssbo_base,
                                  intr->src[0].ssa);
   case nir_intrinsic_get_ubo_size:
      return load_sysval_indirect(b, 1, 32, stage_table(b), &s->ubo_size,
                                  intr->src[0].ssa);
   case nir_intrinsic_get_ssbo_size:
      return load_sysval_indirect(b, 1, 32, stage_table(b), &s->ssbo_size,
                                  intr->src[0].ssa);
   case nir_intrinsic_load_layer_id_written_agx:
      return load_sysval_root(b, 1, 16, &u->layer_id_written);
   case nir_intrinsic_load_input_assembly_buffer_agx:
      return load_sysval_root(b, 1, 64, &u->input_assembly);
   case nir_intrinsic_load_geometry_param_buffer_agx:
      return load_sysval_root(b, 1, 64, &u->geometry_params);
   case nir_intrinsic_load_tess_param_buffer_agx:
      return load_sysval_root(b, 1, 64, &u->tess_params);
   case nir_intrinsic_load_fixed_point_size_agx:
      return load_sysval_root(b, 1, 32, &u->fixed_point_size);
   case nir_intrinsic_load_tex_sprite_mask_agx:
      return load_sysval_root(b, 1, 16, &u->sprite_mask);
   case nir_intrinsic_load_clip_z_coeff_agx:
      return nir_f2f32(b, load_sysval_root(b, 1, 16, &u->clip_z_coeff));
   case nir_intrinsic_load_polygon_stipple_agx: {
      nir_def *base = load_sysval_root(b, 1, 64, &u->polygon_stipple);
      nir_def *row = intr->src[0].ssa;
      nir_def *addr = nir_iadd(b, base, nir_u2u64(b, nir_imul_imm(b, row, 4)));

      return nir_load_global_constant(b, addr, 4, 1, 32);
   }

   default:
      break;
   }

   if (!lower_draw_params)
      return NULL;

   switch (intr->intrinsic) {
   case nir_intrinsic_load_num_workgroups:
      return load_sysval(b, 3, 32, AGX_SYSVAL_TABLE_GRID, 0);
   case nir_intrinsic_load_first_vertex:
      return load_sysval(b, 1, 32, AGX_SYSVAL_TABLE_PARAMS, 0);
   case nir_intrinsic_load_base_instance:
      return load_sysval(b, 1, 32, AGX_SYSVAL_TABLE_PARAMS, 4);
   case nir_intrinsic_load_base_vertex:
      /* first vertex if indexed, 0 otherwise. More efficient for our hw than
       * the lowering in NIR.
       */
      return nir_bcsel(
         b, nir_i2b(b, load_sysval_root(b, 1, 16, &u->is_indexed_draw)),
         load_sysval(b, 1, 32, AGX_SYSVAL_TABLE_PARAMS, 0), nir_imm_int(b, 0));
   case nir_intrinsic_load_draw_id:
      return load_sysval_root(b, 1, 32, &u->draw_id);
   default:
      return NULL;
   }
}

/* Step 1. Lower NIR sysvals */
static bool
lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
{
   bool *lower_draw_params = data;
   b->cursor = nir_before_instr(instr);
   nir_def *old;
   nir_def *replacement = NULL;

   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      old = &intr->def;
      replacement = lower_intrinsic(b, intr, *lower_draw_params);
   } else if (instr->type == nir_instr_type_tex) {
      nir_tex_instr *tex = nir_instr_as_tex(instr);
      old = &tex->def;

      if (tex->op != nir_texop_lod_bias_agx)
         return false;

      struct agx_stage_uniforms *s = NULL;

      int src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
      if (src_idx >= 0) {
         replacement = load_sysval_indirect(
            b, 1, 16, stage_table(b), s->lod_bias, tex->src[src_idx].src.ssa);
      } else {
         replacement = load_sysval(b, 1, 16, stage_table(b),
                                   (uintptr_t)&s->lod_bias[tex->sampler_index]);
      }
   }

   if (replacement != NULL) {
      nir_def_rewrite_uses(old, replacement);
      return true;
   } else {
      return false;
   }
}

/* Step 2: Record system value loads */
static bool
record_loads(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   if (intr->intrinsic != nir_intrinsic_load_sysval_agx)
      return false;

   assert(intr->def.bit_size >= 16 && "no 8-bit sysvals");
   unsigned dim = intr->def.num_components;
   unsigned element_size = intr->def.bit_size / 16;
   unsigned length = dim * element_size;

   struct state *state = data;
   struct table_state *table = &state->tables[nir_intrinsic_desc_set(intr)];
   unsigned offset = nir_intrinsic_binding(intr);
   assert((offset % 2) == 0 && "all entries are aligned by ABI");

   BITSET_SET_RANGE(table->pushed, (offset / 2), (offset / 2) + length - 1);

   for (unsigned i = 0; i < length; ++i) {
      if (table->element_size[(offset / 2) + i])
         assert((table->element_size[(offset / 2) + i]) == element_size);
      else
         table->element_size[(offset / 2) + i] = element_size;
   }

   util_dynarray_append(&state->loads, nir_intrinsic_instr *, intr);
   return false;
}

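/*
 * Bookkeeping example (illustrative numbers): a 4 x 32-bit sysval at byte
 * offset 8 in its table covers length = 4 * 2 = 8 halves, so it sets bits
 * 4..11 of table->pushed and stamps element_size[4..11] = 2. A 16-bit load
 * overlapping those halves would trip the assert above, since each half is
 * expected to always be loaded with the same element size.
 */
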
/* Step 3: Decide where to push the system values */
static struct agx_push_range *
find_push_range_containing(struct agx_compiled_shader *shader, uint8_t table,
                           uint16_t offset)
{
   for (unsigned i = 0; i < shader->push_range_count; ++i) {
      struct agx_push_range *range = &shader->push[i];

      if (range->table != table)
         continue;

      /* range->length is in 16-bit words but offset is in bytes, so convert */
      uint16_t length_B = range->length * 2;

      if (range->offset <= offset && offset < (range->offset + length_B))
         return range;
   }

   unreachable("no containing range");
}

static unsigned
lay_out_table(struct agx_compiled_shader *shader, struct table_state *state,
              unsigned table_index, unsigned uniform)
{
   unsigned start, end;
   BITSET_FOREACH_RANGE(start, end, state->pushed, sizeof(state->pushed) * 8) {
      unsigned range_start = start;

      do {
         uint8_t size = state->element_size[range_start];

         /* Find a range of constant element size. [range_start, range_end).
          * Ranges may be at most 64 halves.
          */
         unsigned range_end;
         for (range_end = range_start + 1;
              range_end < end && state->element_size[range_end] == size &&
              range_end < range_start + 64;
              ++range_end)
            ;

         /* Now make the range with the given size (naturally aligned) */
         uniform = ALIGN_POT(uniform, size);

         assert((shader->push_range_count < ARRAY_SIZE(shader->push)) &&
                "AGX_MAX_PUSH_RANGES must be an upper bound");

         /* Offsets must be aligned to 4 bytes; this may require pushing a
          * little more than intended (otherwise we would need extra copies).
          */
         range_start = ROUND_DOWN_TO(range_start, 4 / 2);

         shader->push[shader->push_range_count++] = (struct agx_push_range){
            .uniform = uniform,
            .table = table_index,
            .offset = range_start * 2 /* bytes, not elements */,
            .length = (range_end - range_start),
         };

         uniform += (range_end - range_start);
         range_start = range_end;
      } while (range_start < end);
   }

   return uniform;
}

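/*
 * Illustrative layout: if halves 0..3 of a table were recorded with
 * element_size = 2 (32-bit values) and halves 8..9 with element_size = 1
 * (16-bit values), the loop above emits two push ranges along the lines of
 *
 *    { .uniform = 0, .offset = 0,  .length = 4 }
 *    { .uniform = 4, .offset = 16, .length = 2 }
 *
 * with the running uniform counter aligned to the element size before each
 * range and advanced by its length afterwards.
 */
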
static unsigned
lay_out_uniforms(struct agx_compiled_shader *shader, struct state *state)
{
   unsigned uniform = 0;

   /* Lay out each system value table. We do this backwards to ensure the first
    * uniform goes to the bindless texture base.
    */
   for (int t = AGX_NUM_SYSVAL_TABLES - 1; t >= 0; --t)
      uniform = lay_out_table(shader, &state->tables[t], t, uniform);

   /* Step 4: Fill in the loads */
   util_dynarray_foreach(&state->loads, nir_intrinsic_instr *, intr_) {
      nir_intrinsic_instr *intr = *intr_;
      uint8_t table = nir_intrinsic_desc_set(intr);
      uint16_t offset = nir_intrinsic_binding(intr);
      bool load_uniform_location = nir_intrinsic_flags(intr);

      struct agx_push_range *range =
         find_push_range_containing(shader, table, offset);
      unsigned base = range->uniform + ((offset - range->offset) / 2);

      nir_builder b = nir_builder_at(nir_instr_remove(&(intr->instr)));
      nir_def *repl;

      if (load_uniform_location) {
         repl = nir_imm_int(&b, base);
      } else {
         repl = nir_load_preamble(&b, intr->def.num_components,
                                  intr->def.bit_size, .base = base);
      }

      nir_def_rewrite_uses(&intr->def, repl);
   }

   return uniform;
}

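/*
 * Continuing the illustrative layout above: a recorded load with
 * .binding = 18 (bytes) lands in the second range ({ .uniform = 4,
 * .offset = 16 }), so it is rewritten to load_preamble with
 * .base = 4 + (18 - 16) / 2 = 5, or to the immediate 5 if the load asked for
 * its uniform location via the flags bit.
 */
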
bool
agx_nir_lower_sysvals(nir_shader *shader, enum pipe_shader_type desc_stage,
                      bool lower_draw_params)
{
   /* Override the stage for the duration of the pass. XXX: should refactor,
    * but it's annoying!
    */
   enum pipe_shader_type phys_stage = shader->info.stage;
   shader->info.stage = desc_stage;

   bool progress = nir_shader_instructions_pass(
      shader, lower_sysvals, nir_metadata_block_index | nir_metadata_dominance,
      &lower_draw_params);

   shader->info.stage = phys_stage;
   return progress;
}

bool
agx_nir_layout_uniforms(nir_shader *shader,
                        struct agx_compiled_shader *compiled,
                        unsigned *push_size)
{
   struct state state = {0};
   nir_shader_intrinsics_pass(shader, record_loads,
                              nir_metadata_block_index | nir_metadata_dominance,
                              &state);

   *push_size = lay_out_uniforms(compiled, &state);

   util_dynarray_fini(&state.loads);

   /* Make sure texture handles have constants associated */
   nir_opt_constant_folding(shader);

   return true;
}
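
/*
 * Hypothetical driver-side usage, for illustration only (the real call sites
 * live elsewhere in the driver):
 *
 *    agx_nir_lower_sysvals(nir, sw_stage, true);            // step 1
 *    ... other NIR optimization passes ...
 *    unsigned push_size;
 *    agx_nir_layout_uniforms(nir, compiled, &push_size);    // steps 2-4
 *
 * push_size is then the number of 16-bit uniform slots the layout consumed,
 * which the draw-time upload code can use to size its uploads.
 */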