/*
 * Copyright © 2023 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_gpu_info.h"
#include "ac_nir.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_deref.h"
#include "radv_constants.h"
#include "radv_nir.h"
#include "radv_shader.h"
#include "radv_shader_args.h"

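/* Per-shader state passed to the VS input lowering callbacks below. */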
typedef struct {
   const struct radv_shader_args *args;
   const struct radv_shader_info *info;
   const struct radv_graphics_state_key *gfx_state;
   const struct radeon_info *gpu_info;
} lower_vs_inputs_state;

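/* Lower a VS input load to reads of the shader arguments that the VS prolog
 * filled with the fetched attribute values (used when vertex inputs are dynamic).
 */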
static nir_def *
lower_load_vs_input_from_prolog(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs_state *s)
{
   nir_src *offset_src = nir_get_io_offset_src(intrin);
   assert(nir_src_is_const(*offset_src));

   const nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
   const unsigned base_offset = nir_src_as_uint(*offset_src);
   const unsigned location = io_sem.location + base_offset - VERT_ATTRIB_GENERIC0;
   const unsigned component = nir_intrinsic_component(intrin);
   const unsigned bit_size = intrin->def.bit_size;
   const unsigned num_components = intrin->def.num_components;

   /* 64-bit inputs: they occupy twice as many 32-bit components.
    * 16-bit inputs: they occupy a 32-bit component (not packed).
    */
   const unsigned arg_bit_size = MAX2(bit_size, 32);

   unsigned num_input_args = 1;
   nir_def *input_args[2] = {ac_nir_load_arg(b, &s->args->ac, s->args->vs_inputs[location]), NULL};
   if (component * 32 + arg_bit_size * num_components > 128) {
      assert(bit_size == 64);

      num_input_args++;
      input_args[1] = ac_nir_load_arg(b, &s->args->ac, s->args->vs_inputs[location + 1]);
   }

   nir_def *extracted = nir_extract_bits(b, input_args, num_input_args, component * 32, num_components, arg_bit_size);

   if (bit_size < arg_bit_size) {
      assert(bit_size == 16);

      if (nir_alu_type_get_base_type(nir_intrinsic_dest_type(intrin)) == nir_type_float)
         return nir_f2f16(b, extracted);
      else
         return nir_u2u16(b, extracted);
   }

   return extracted;
}

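/* Compute the vertex buffer index for a per-instance attribute:
 * base_instance + instance_id / divisor (or just base_instance when the divisor is 0).
 */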
static nir_def *
calc_vs_input_index_instance_rate(nir_builder *b, unsigned location, lower_vs_inputs_state *s)
{
   const uint32_t divisor = s->gfx_state->vi.instance_rate_divisors[location];
   nir_def *start_instance = nir_load_base_instance(b);

   if (divisor == 0)
      return start_instance;

   nir_def *instance_id = nir_udiv_imm(b, nir_load_instance_id(b), divisor);
   return nir_iadd(b, start_instance, instance_id);
}

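/* Compute the index used to address the vertex buffer for this attribute,
 * either per-instance or per-vertex (first_vertex + vertex_id).
 */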
static nir_def *
calc_vs_input_index(nir_builder *b, unsigned location, lower_vs_inputs_state *s)
{
   if (s->gfx_state->vi.instance_rate_inputs & BITFIELD_BIT(location))
      return calc_vs_input_index_instance_rate(b, location, s);

   return nir_iadd(b, nir_load_first_vertex(b), nir_load_vertex_id_zero_base(b));
}

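/* Untyped buffer loads only work when every channel has the same size and type
 * and is at least dword-sized, i.e. for array formats with 32-bit or 64-bit channels.
 */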
static bool
can_use_untyped_load(const struct util_format_description *f, const unsigned bit_size)
{
   /* All components must have same size and type. */
   if (!f->is_array)
      return false;

   const struct util_format_channel_description *c = &f->channel[0];
   return c->size == bit_size && bit_size >= 32;
}

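/* Value returned for a channel that the vertex format doesn't provide:
 * 0 for the RGB channels, 1 for alpha (the Vulkan default attribute values),
 * and undef for 64-bit types which have no defaults.
 */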
static nir_def *
oob_input_load_value(nir_builder *b, const unsigned channel_idx, const unsigned bit_size, const bool is_float)
{
   /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
    * For 64-bit data types, no default attribute values are provided. Input variables
    * must not use more components than provided by the attribute.
    */
   if (bit_size == 64)
      return nir_undef(b, 1, bit_size);

   if (channel_idx == 3) {
      if (is_float)
         return nir_imm_floatN_t(b, 1.0, bit_size);
      else
         return nir_imm_intN_t(b, 1, bit_size);
   }

   return nir_imm_intN_t(b, 0, bit_size);
}

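/* Size in bytes of the given consecutive range of format channels. */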
static unsigned
count_format_bytes(const struct util_format_description *f, const unsigned first_channel, const unsigned num_channels)
{
   if (!num_channels)
      return 0;

   const unsigned last_channel = first_channel + num_channels - 1;
   assert(last_channel < f->nr_channels);
   unsigned bits = 0;
   for (unsigned i = first_channel; i <= last_channel; ++i) {
      bits += f->channel[i].size;
   }

   assert(bits % 8 == 0);
   return bits / 8;
}

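/* Whether the format swizzle is anything other than the identity (X, Y, Z, W). */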
static bool
format_needs_swizzle(const struct util_format_description *f)
{
   for (unsigned i = 0; i < f->nr_channels; ++i) {
      if (f->swizzle[i] != PIPE_SWIZZLE_X + i)
         return true;
   }

   return false;
}

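/* Lowest (or, with backwards = true, highest) format channel read by the given
 * component mask after applying the format swizzle.
 */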
static unsigned
first_used_swizzled_channel(const struct util_format_description *f, const unsigned mask, const bool backwards)
{
   unsigned first_used = backwards ? 0 : f->nr_channels;
   const unsigned it_mask = mask & BITFIELD_MASK(f->nr_channels);

   u_foreach_bit (b, it_mask) {
      assert(f->swizzle[b] != PIPE_SWIZZLE_0 && f->swizzle[b] != PIPE_SWIZZLE_1);
      const unsigned c = f->swizzle[b] - PIPE_SWIZZLE_X;
      first_used = backwards ? MAX2(first_used, c) : MIN2(first_used, c);
   }

   return first_used;
}

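/* Apply the driver-side alpha-channel fixup (AC_ALPHA_ADJUST_*): sign-extend the
 * 2-bit alpha value and convert it back to the type expected by the format.
 */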
static nir_def *
adjust_vertex_fetch_alpha(nir_builder *b, enum ac_vs_input_alpha_adjust alpha_adjust, nir_def *alpha)
{
   if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
      alpha = nir_f2u32(b, alpha);

   /* For the integer-like cases, do a natural sign extension.
    *
    * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 and happen to contain 0, 1, 2, 3 as
    * the two LSBs of the exponent.
    */
   unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;

   alpha = nir_ibfe_imm(b, alpha, offset, 2u);

   /* Convert back to the right type. */
   if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
      alpha = nir_i2f32(b, alpha);
      alpha = nir_fmax(b, alpha, nir_imm_float(b, -1.0f));
   } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
      alpha = nir_i2f32(b, alpha);
   }

   return alpha;
}

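/* Lower a VS input load to buffer loads from the vertex buffer descriptor,
 * handling format swizzles, alpha adjustment and components not covered by the
 * format entirely in the shader.
 */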
static nir_def *
lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs_state *s)
{
   nir_src *offset_src = nir_get_io_offset_src(intrin);
   assert(nir_src_is_const(*offset_src));

   const nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
   const unsigned base_offset = nir_src_as_uint(*offset_src);
   const unsigned location = io_sem.location + base_offset - VERT_ATTRIB_GENERIC0;
   const unsigned bit_size = intrin->def.bit_size;
   const unsigned dest_num_components = intrin->def.num_components;

   /* Convert the component offset to bit_size units.
    * (Intrinsic component offset is in 32-bit units.)
    *
    * Small bitsize inputs consume the same space as 32-bit inputs,
    * but 64-bit inputs consume twice as many.
    * 64-bit variables must not have a component of 1 or 3.
    * (See VK spec 15.1.5 "Component Assignment")
    */
   const unsigned component = nir_intrinsic_component(intrin) / (MAX2(32, bit_size) / 32);

   /* Bitmask of components in bit_size units
    * of the current input load that are actually used.
    */
   const unsigned dest_use_mask = nir_def_components_read(&intrin->def) << component;

   /* If the input is entirely unused, just replace it with undef.
    * This is just in case we debug this pass without running DCE first.
    */
   if (!dest_use_mask)
      return nir_undef(b, dest_num_components, bit_size);

   const uint32_t attrib_binding = s->gfx_state->vi.vertex_attribute_bindings[location];
   const uint32_t attrib_offset = s->gfx_state->vi.vertex_attribute_offsets[location];
   const uint32_t attrib_stride = s->gfx_state->vi.vertex_attribute_strides[location];
   const enum pipe_format attrib_format = s->gfx_state->vi.vertex_attribute_formats[location];
   const struct util_format_description *f = util_format_description(attrib_format);
   const struct ac_vtx_format_info *vtx_info =
      ac_get_vtx_format_info(s->gpu_info->gfx_level, s->gpu_info->family, attrib_format);
   const unsigned binding_index = s->info->vs.use_per_attribute_vb_descs ? location : attrib_binding;
   const unsigned desc_index = util_bitcount(s->info->vs.vb_desc_usage_mask & u_bit_consecutive(0, binding_index));

   nir_def *vertex_buffers_arg = ac_nir_load_arg(b, &s->args->ac, s->args->ac.vertex_buffers);
   nir_def *vertex_buffers = nir_pack_64_2x32_split(b, vertex_buffers_arg, nir_imm_int(b, s->gpu_info->address32_hi));
   nir_def *descriptor = nir_load_smem_amd(b, 4, vertex_buffers, nir_imm_int(b, desc_index * 16));
   nir_def *base_index = calc_vs_input_index(b, location, s);
   nir_def *zero = nir_imm_int(b, 0);

   /* We currently implement swizzling for all formats in shaders.
    * Note, it is possible to specify swizzling in the DST_SEL fields of descriptors,
    * but we don't use that because typed loads using the MTBUF instruction format
    * don't support DST_SEL, so it's simpler to just handle it all in shaders.
    */
   const bool needs_swizzle = format_needs_swizzle(f);

   /* We need to adjust the alpha channel as loaded by the HW,
    * for example sign extension and normalization may be necessary.
    */
   const enum ac_vs_input_alpha_adjust alpha_adjust = vtx_info->alpha_adjust;

   /* Try to shrink the load format by skipping unused components from the start.
    * Beneficial because the backend may be able to emit fewer HW instructions.
    * Only possible with array formats.
    */
   const unsigned first_used_channel = first_used_swizzled_channel(f, dest_use_mask, false);
   const unsigned skipped_start = f->is_array ? first_used_channel : 0;

   /* Number of channels we actually use and load.
    * Don't shrink the format here because this might allow the backend to
    * emit fewer (but larger than needed) HW instructions.
    */
   const unsigned first_trailing_unused_channel = first_used_swizzled_channel(f, dest_use_mask, true) + 1;
   const unsigned max_loaded_channels = MIN2(first_trailing_unused_channel, f->nr_channels);
   const unsigned fetch_num_channels =
      first_used_channel >= max_loaded_channels ? 0 : max_loaded_channels - skipped_start;

   /* Load VS inputs from VRAM.
    *
    * For the vast majority of cases this will only create 1x load_(typed)_buffer_amd
    * intrinsic and the backend is responsible for further splitting that
    * to as many HW instructions as needed based on alignment.
    *
    * Take care to prevent loaded components from failing the range check,
    * by emitting several load intrinsics with different index sources.
    * This is necessary because the backend can't further roll the const offset
    * into the index source of MUBUF / MTBUF instructions.
    */
   nir_def *loads[NIR_MAX_VEC_COMPONENTS] = {0};
   unsigned num_loads = 0;
   for (unsigned x = 0, channels; x < fetch_num_channels; x += channels) {
      channels = fetch_num_channels - x;
      const unsigned start = skipped_start + x;
      enum pipe_format fetch_format = attrib_format;
      nir_def *index = base_index;

      /* Add excess constant offset to the index. */
      unsigned const_off = attrib_offset + count_format_bytes(f, 0, start);
      if (attrib_stride && const_off >= attrib_stride) {
         index = nir_iadd_imm(b, base_index, const_off / attrib_stride);
         const_off %= attrib_stride;
      }

      /* Reduce the number of loaded channels until we can pass the range check.
       * Only for array formats. VK spec mandates proper alignment for packed formats.
       * Note, NONE seems to occur in real use and is considered an array format.
       */
      if (f->is_array && fetch_format != PIPE_FORMAT_NONE) {
         while (channels > 1 && attrib_stride && (const_off + count_format_bytes(f, start, channels)) > attrib_stride) {
            channels--;
         }

         /* Keep the fetch format as large as possible to let the backend emit
          * larger load instructions when it deems them beneficial.
          */
         fetch_format = util_format_get_array(f->channel[0].type, f->channel[0].size, f->nr_channels - start,
                                              f->is_unorm || f->is_snorm, f->channel[0].pure_integer);
      }

      assert(f->is_array || channels == fetch_num_channels);

      /* Prefer using untyped buffer loads if possible, to avoid potential alignment issues.
       * Typed loads can cause GPU hangs when used with improper alignment.
       */
      if (can_use_untyped_load(f, bit_size)) {
         loads[num_loads++] = nir_load_buffer_amd(b, channels, bit_size, descriptor, zero, zero, index,
                                                  .base = const_off, .memory_modes = nir_var_shader_in);
      } else {
         const unsigned align_mul = MAX2(1, s->gfx_state->vi.vertex_binding_align[attrib_binding]);
         const unsigned align_offset = const_off % align_mul;

         loads[num_loads++] = nir_load_typed_buffer_amd(
            b, channels, bit_size, descriptor, zero, zero, index, .base = const_off, .format = fetch_format,
            .align_mul = align_mul, .align_offset = align_offset, .memory_modes = nir_var_shader_in);
      }
   }

   nir_def *load = loads[0];

   /* Extract the channels we actually need when we couldn't skip starting
    * components or had to emit more than one load intrinsic.
    */
   if (num_loads > 0 && (first_used_channel > skipped_start || num_loads != 1))
      load = nir_extract_bits(b, loads, num_loads, (first_used_channel - skipped_start) * bit_size,
                              max_loaded_channels - first_used_channel, bit_size);

   /* Return early if possible to avoid generating unnecessary IR. */
   if (num_loads > 0 && first_used_channel == component && load->num_components == dest_num_components &&
       !needs_swizzle && alpha_adjust == AC_ALPHA_ADJUST_NONE)
      return load;

   /* Fill unused and OOB components.
    * Apply swizzle and alpha adjust according to the format.
    */
   const nir_alu_type dst_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(intrin));
   nir_def *channels[NIR_MAX_VEC_COMPONENTS] = {0};
   for (unsigned i = 0; i < dest_num_components; ++i) {
      const unsigned c = i + component;

      if (!(dest_use_mask & BITFIELD_BIT(c))) {
         /* Fill unused channels with zero. */
         channels[i] = nir_imm_zero(b, 1, bit_size);
         continue;
      }

      const unsigned sw = f->swizzle[c];
      assert(sw >= first_used_channel);
      const unsigned loaded_channel = sw - first_used_channel;

      if (load && loaded_channel < load->num_components) {
         /* Use channels that were loaded from VRAM. */
         channels[i] = nir_channel(b, load, loaded_channel);

         if (alpha_adjust != AC_ALPHA_ADJUST_NONE && c == 3)
            channels[i] = adjust_vertex_fetch_alpha(b, alpha_adjust, channels[i]);
      } else {
         /* Handle input loads that are larger than their format. */
         channels[i] = oob_input_load_value(b, c, bit_size, dst_type == nir_type_float);
      }
   }

   return nir_vec(b, channels, dest_num_components);
}

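/* Callback for nir_shader_intrinsics_pass: replace each load_input with either
 * the prolog path (dynamic vertex inputs) or direct vertex buffer loads.
 */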
static bool
lower_vs_input_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
{
   if (intrin->intrinsic != nir_intrinsic_load_input)
      return false;

   lower_vs_inputs_state *s = (lower_vs_inputs_state *)state;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *replacement = NULL;

   if (s->info->vs.dynamic_inputs) {
      replacement = lower_load_vs_input_from_prolog(b, intrin, s);
   } else {
      replacement = lower_load_vs_input(b, intrin, s);
   }

   nir_def_replace(&intrin->def, replacement);
   nir_instr_free(&intrin->instr);

   return true;
}

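/* Lower all VS input loads in the shader to either shader-argument reads or
 * vertex buffer loads, depending on whether dynamic vertex inputs are used.
 */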
bool
radv_nir_lower_vs_inputs(nir_shader *shader, const struct radv_shader_stage *vs_stage,
                         const struct radv_graphics_state_key *gfx_state, const struct radeon_info *gpu_info)
{
   assert(shader->info.stage == MESA_SHADER_VERTEX);

   lower_vs_inputs_state state = {
      .info = &vs_stage->info,
      .args = &vs_stage->args,
      .gfx_state = gfx_state,
      .gpu_info = gpu_info,
   };

   return nir_shader_intrinsics_pass(shader, lower_vs_input_instr, nir_metadata_control_flow, &state);
}

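/* Attribute slot counting used by nir_deref_instr_get_const_offset() below. */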
static void
type_size_vec4(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   *size = glsl_count_attribute_slots(type, false);
   *align = 1;
}

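/* Replace channels of VS input loads that the attribute format doesn't provide
 * with their constant default values (runs on load_deref, i.e. before input
 * variables are lowered away).
 */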
static bool
opt_vs_input_to_const(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
{
   const struct radv_graphics_state_key *gfx_state = state;

   if (intrin->intrinsic != nir_intrinsic_load_deref)
      return false;

   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
   if (!nir_deref_mode_is(deref, nir_var_shader_in) || nir_deref_instr_has_indirect(deref))
      return false;

   const nir_variable *var = nir_deref_instr_get_variable(deref);
   const unsigned location =
      var->data.location + nir_deref_instr_get_const_offset(deref, &type_size_vec4) - VERT_ATTRIB_GENERIC0;
   const bool is_integer = glsl_base_type_is_integer(glsl_get_base_type(deref->type));
   const unsigned bit_size = intrin->def.bit_size;
   const unsigned component = var->data.location_frac >> (bit_size == 64 ? 1 : 0);

   const enum pipe_format attrib_format = gfx_state->vi.vertex_attribute_formats[location];
   const struct util_format_description *f = util_format_description(attrib_format);

   b->cursor = nir_after_instr(&intrin->instr);

   nir_def *res = &intrin->def;
   for (unsigned i = 0; i < intrin->def.num_components; i++) {
      const unsigned c = i + component;
      if (f->swizzle[c] >= f->nr_channels) {
         /* Handle input loads that are larger than their format. */
         nir_def *channel = oob_input_load_value(b, c, bit_size, !is_integer);
         res = nir_vector_insert_imm(b, res, channel, i);
      }
   }

   if (res != &intrin->def) {
      nir_def_rewrite_uses_after(&intrin->def, res, res->parent_instr);
      return true;
   } else {
      return false;
   }
}

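/* Fold channels not provided by the vertex attribute formats into their
 * constant default values across the whole vertex shader.
 */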
bool
radv_nir_optimize_vs_inputs_to_const(nir_shader *shader, const struct radv_graphics_state_key *gfx_state)
{
   assert(shader->info.stage == MESA_SHADER_VERTEX);
   return nir_shader_intrinsics_pass(shader, opt_vs_input_to_const, nir_metadata_control_flow, (void *)gfx_state);
}