/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "nir_builder.h"
#include "nir_xfb_info.h"
#include "si_pipe.h"
#include "ac_nir.h"
#include "aco_interface.h"

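/* Filter for nir_lower_alu_to_scalar: return false (keep vectorized) for 2-component
 * 16-bit ALU instructions that can stay packed as 16-bit math; with ACO, packed-math
 * support is checked per opcode. Everything else is scalarized.
 */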
bool si_alu_to_scalar_packed_math_filter(const nir_instr *instr, const void *data)
{
   if (instr->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      bool use_aco = (bool)data;

      if (alu->def.bit_size == 16 && alu->def.num_components == 2 &&
          (!use_aco || aco_nir_op_supports_packed_math_16bit(alu)))
         return false;
   }

   return true;
}

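/* Width callback for nir_opt_vectorize: report 2 for 16-bit ALU instructions that can
 * form packed 16-bit pairs and 1 for those that must stay scalar; non-ALU instructions
 * return 0. The LLVM path keeps the pack/unpack and byte/word extract/insert ops scalar.
 */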
static uint8_t si_vectorize_callback(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   if (alu->def.bit_size != 16)
      return 1;

   bool use_aco = (bool)data;

   if (use_aco) {
      return aco_nir_op_supports_packed_math_16bit(alu) ? 2 : 1;
   } else {
      switch (alu->op) {
      case nir_op_unpack_32_2x16_split_x:
      case nir_op_unpack_32_2x16_split_y:
      case nir_op_extract_i8:
      case nir_op_extract_u8:
      case nir_op_extract_i16:
      case nir_op_extract_u16:
      case nir_op_insert_u8:
      case nir_op_insert_u16:
         return 1;
      default:
         return 2;
      }
   }
}

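/* Run the common NIR optimization pipeline until it reaches a fixed point
 * (no pass reports progress).
 */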
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
{
   bool use_aco = sscreen->use_aco || nir->info.use_aco_amd;
   bool progress;

   do {
      progress = false;
      bool lower_alu_to_scalar = false;
      bool lower_phis_to_scalar = false;

      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
      NIR_PASS(progress, nir, nir_lower_alu_to_scalar,
               nir->options->lower_to_scalar_filter, (void *)use_aco);
      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);

      if (first) {
         NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
         NIR_PASS(lower_alu_to_scalar, nir, nir_shrink_vec_array_vars, nir_var_function_temp);
         NIR_PASS(progress, nir, nir_opt_find_array_copies);
      }
      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
      NIR_PASS(progress, nir, nir_opt_dead_write_vars);

      NIR_PASS(lower_alu_to_scalar, nir, nir_opt_loop);
      /* (Constant) copy propagation is needed for txf with offsets. */
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      /* nir_opt_if_optimize_phi_true_false is disabled on LLVM14 (#6976) */
      NIR_PASS(lower_phis_to_scalar, nir, nir_opt_if,
               nir_opt_if_optimize_phi_true_false);
      NIR_PASS(progress, nir, nir_opt_dead_cf);

      if (lower_alu_to_scalar) {
         NIR_PASS_V(nir, nir_lower_alu_to_scalar,
                    nir->options->lower_to_scalar_filter, (void *)use_aco);
      }
      if (lower_phis_to_scalar)
         NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
      progress |= lower_alu_to_scalar | lower_phis_to_scalar;

      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);

      /* Needed for algebraic lowering */
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_generate_bfi);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      if (!nir->info.flrp_lowered) {
         unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) |
                               (nir->options->lower_flrp32 ? 32 : 0) |
                               (nir->options->lower_flrp64 ? 64 : 0);
         assert(lower_flrp);
         bool lower_flrp_progress = false;

         NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */);
         if (lower_flrp_progress) {
            NIR_PASS(progress, nir, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         nir->info.flrp_lowered = true;
      }

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_opt_conditional_discard);
      if (nir->options->max_unroll_iterations) {
         NIR_PASS(progress, nir, nir_opt_loop_unroll);
      }

      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_opt_move_discards_to_top);

      if (sscreen->info.has_packed_math_16bit)
         NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback, (void *)use_aco);
   } while (progress);

   NIR_PASS_V(nir, nir_lower_var_copies);
}

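/* Late optimizations: repeat late algebraic lowering together with constant folding,
 * copy propagation, DCE and CSE until nir_opt_algebraic_late stops making progress.
 */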
void si_nir_late_opts(nir_shader *nir)
{
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = false;
      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
      NIR_PASS_V(nir, nir_opt_constant_folding);

      /* We should run this after constant folding for stages that support indirect
       * inputs/outputs.
       */
      if (nir->options->support_indirect_inputs & BITFIELD_BIT(nir->info.stage) ||
          nir->options->support_indirect_outputs & BITFIELD_BIT(nir->info.stage))
         NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);

      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_cse);
   }
}

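/* Narrow texture/image sample sources and destinations to 16 bits where the a16/g16
 * encodings allow it, and re-run the optimization loops if anything changed.
 */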
static void si_late_optimize_16bit_samplers(struct si_screen *sscreen, nir_shader *nir)
{
   /* Optimize types of image_sample sources and destinations.
    *
    * The image_sample sources bit sizes are:
    *   nir_tex_src_coord:       a16 ? 16 : 32
    *   nir_tex_src_comparator:  32
    *   nir_tex_src_offset:      32
    *   nir_tex_src_bias:        a16 ? 16 : 32
    *   nir_tex_src_lod:         a16 ? 16 : 32
    *   nir_tex_src_min_lod:     a16 ? 16 : 32
    *   nir_tex_src_ms_index:    a16 ? 16 : 32
    *   nir_tex_src_ddx:         has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *   nir_tex_src_ddy:         has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *
    * We only use a16/g16 if all of the affected sources are 16bit.
    */
   bool has_g16 = sscreen->info.gfx_level >= GFX10;
   struct nir_opt_tex_srcs_options opt_srcs_options[] = {
      {
         .sampler_dims =
            ~(BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) | BITFIELD_BIT(GLSL_SAMPLER_DIM_BUF)),
         .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) |
                      (1 << nir_tex_src_bias) | (1 << nir_tex_src_min_lod) |
                      (1 << nir_tex_src_ms_index) |
                      (has_g16 ? 0 : (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy)),
      },
      {
         .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
         .src_types = (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy),
      },
   };
   struct nir_opt_16bit_tex_image_options opt_16bit_options = {
      .rounding_mode = nir_rounding_mode_undef,
      .opt_tex_dest_types = nir_type_float | nir_type_int | nir_type_uint,
      .opt_image_dest_types = nir_type_float | nir_type_int | nir_type_uint,
      .integer_dest_saturates = true,
      .opt_image_store_data = true,
      .opt_image_srcs = true,
      .opt_srcs_options_count = has_g16 ? 2 : 1,
      .opt_srcs_options = opt_srcs_options,
   };
   bool changed = false;
   NIR_PASS(changed, nir, nir_opt_16bit_tex_image, &opt_16bit_options);

   if (changed) {
      si_nir_opts(sscreen, nir, false);
      si_nir_late_opts(nir);
   }
}

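/* Lower sparse-residency intrinsics to plain ALU; a residency code of 0 means all
 * texels are resident.
 */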
static bool
lower_intrinsic_filter(const nir_instr *instr, const void *dummy)
{
   return instr->type == nir_instr_type_intrinsic;
}

static nir_def *
lower_intrinsic_instr(nir_builder *b, nir_instr *instr, void *dummy)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_is_sparse_texels_resident:
      /* code==0 means sparse texels are resident */
      return nir_ieq_imm(b, intrin->src[0].ssa, 0);
   case nir_intrinsic_sparse_residency_code_and:
      return nir_ior(b, intrin->src[0].ssa, intrin->src[1].ssa);
   default:
      return NULL;
   }
}

static bool si_lower_intrinsics(nir_shader *nir)
{
   return nir_shader_lower_instructions(nir,
                                        lower_intrinsic_filter,
                                        lower_intrinsic_instr,
                                        NULL);
}

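/* Demote mediump IO to 16 bits: outputs in all stages, and inputs in all stages except
 * the vertex shader (see the TODO below). The slot mask presumably restricts this to
 * PNTC and the generic varyings VAR0..VAR31.
 */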
void si_lower_mediump_io(nir_shader *nir)
{
   NIR_PASS_V(nir, nir_lower_mediump_io,
              /* TODO: LLVM fails to compile this test if VS inputs are 16-bit:
               * dEQP-GLES31.functional.shaders.builtin_functions.integer.bitfieldinsert.uvec3_lowp_geometry
               */
              (nir->info.stage != MESA_SHADER_VERTEX ? nir_var_shader_in : 0) | nir_var_shader_out,
              BITFIELD64_BIT(VARYING_SLOT_PNTC) | BITFIELD64_RANGE(VARYING_SLOT_VAR0, 32),
              true);
}

/**
 * Perform "lowering" operations on the NIR that are run once when the shader
 * selector is created.
 */
static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
{
   /* Perform lowerings (and optimizations) of code.
    *
    * Performance considerations aside, we must:
    * - lower certain ALU operations
    * - ensure constant offsets for texture instructions are folded
    *   and copy-propagated
    */
   const struct nir_lower_tex_options lower_tex_options = {
      .lower_txp = ~0u,
      .lower_txf_offset = true,
      .lower_txs_cube_array = true,
      .lower_invalid_implicit_lod = true,
      .lower_tg4_offsets = true,
      .lower_to_fragment_fetch_amd = sscreen->info.gfx_level < GFX11,
      .lower_1d = sscreen->info.gfx_level == GFX9,
   };
   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);

   const struct nir_lower_image_options lower_image_options = {
      .lower_cube_size = true,
      .lower_to_fragment_mask_load_amd = sscreen->info.gfx_level < GFX11 &&
                                         !(sscreen->debug_flags & DBG(NO_FMASK)),
   };
   NIR_PASS_V(nir, nir_lower_image, &lower_image_options);

   NIR_PASS_V(nir, si_lower_intrinsics);

   NIR_PASS_V(nir, ac_nir_lower_sin_cos);

   /* Lower load constants to scalar and then clean up the mess */
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
   NIR_PASS_V(nir, nir_lower_var_copies);
   NIR_PASS_V(nir, nir_opt_intrinsics);
   NIR_PASS_V(nir, nir_lower_system_values);

   /* si_nir_kill_outputs and ac_nir_optimize_outputs require outputs to be scalar. */
   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL ||
       nir->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);

   if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      unsigned flags = nir_lower_gs_intrinsics_per_stream;
      if (sscreen->use_ngg) {
         flags |= nir_lower_gs_intrinsics_count_primitives |
            nir_lower_gs_intrinsics_count_vertices_per_primitive |
            nir_lower_gs_intrinsics_overwrite_incomplete;
      }

      NIR_PASS_V(nir, nir_lower_gs_intrinsics, flags);
   }

   if (gl_shader_stage_is_compute(nir->info.stage)) {
      nir_lower_compute_system_values_options options = {0};

      /* gl_LocalInvocationIndex must be derived from gl_LocalInvocationID.xyz to make it correct
       * with quad derivatives. Using gl_SubgroupID for that (which is what we do by default) is
       * incorrect with a non-linear thread order.
       *
       * On Gfx12, we always use a non-linear thread order if the workgroup X and Y sizes are
       * divisible by 2.
       */
      options.lower_local_invocation_index =
         nir->info.derivative_group == DERIVATIVE_GROUP_QUADS ||
         (sscreen->info.gfx_level >= GFX12 &&
          nir->info.derivative_group == DERIVATIVE_GROUP_NONE &&
          (nir->info.workgroup_size_variable ||
           (nir->info.workgroup_size[0] % 2 == 0 && nir->info.workgroup_size[1] % 2 == 0)));
      NIR_PASS_V(nir, nir_lower_compute_system_values, &options);

      /* Gfx12 supports this in hw. */
      if (sscreen->info.gfx_level < GFX12 &&
          nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
         nir_opt_cse(nir); /* CSE load_local_invocation_id */
         memset(&options, 0, sizeof(options));
         options.shuffle_local_ids_for_quad_derivatives = true;
         NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
      }
   }

   si_nir_opts(sscreen, nir, true);
   /* Run late optimizations to fuse ffma and eliminate 16-bit conversions. */
   si_nir_late_opts(nir);

   if (sscreen->info.gfx_level >= GFX9)
      si_late_optimize_16bit_samplers(sscreen, nir);

   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);

   NIR_PASS_V(nir, nir_lower_fp16_casts, nir_lower_fp16_split_fp64);
}

static bool si_mark_divergent_texture_non_uniform(struct nir_shader *nir)
{
   assert(nir->info.divergence_analysis_run);

   /* sampler_non_uniform and texture_non_uniform are always false in GLSL,
    * but this can lead to unexpected behavior if the texture/sampler index
    * comes from a vertex attribute.
    *
    * For instance, 2 consecutive draws using 2 different index values
    * could be squashed together by the hw, producing a single draw with
    * a non-dynamically-uniform index.
    *
    * To avoid this, detect divergent indexing and mark it as non-uniform,
    * so that a waterfall loop can be applied to these indices later (either
    * by the llvm backend or by nir_lower_non_uniform_access).
    *
    * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/2253
    */

   bool divergence_changed = false;

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_tex)
            continue;

         nir_tex_instr *tex = nir_instr_as_tex(instr);
         for (int i = 0; i < tex->num_srcs; i++) {
            bool divergent = nir_src_is_divergent(&tex->src[i].src);

            switch (tex->src[i].src_type) {
            case nir_tex_src_texture_deref:
            case nir_tex_src_texture_handle:
               tex->texture_non_uniform |= divergent;
               break;
            case nir_tex_src_sampler_deref:
            case nir_tex_src_sampler_handle:
               tex->sampler_non_uniform |= divergent;
               break;
            default:
               break;
            }
         }

         /* If dest is already divergent, divergence won't change. */
         divergence_changed |= !tex->def.divergent &&
            (tex->texture_non_uniform || tex->sampler_non_uniform);
      }
   }

   nir_metadata_preserve(impl, nir_metadata_all);
   return divergence_changed;
}

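/* Presumably the pipe_screen::finalize_nir hook (judging by the signature): one-time
 * lowering and optimization of a shader before it is used to create the shader
 * selector. Always returns NULL, i.e. no error string.
 */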
char *si_finalize_nir(struct pipe_screen *screen, struct nir_shader *nir)
{
   struct si_screen *sscreen = (struct si_screen *)screen;

   nir_lower_io_passes(nir, false);
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_in | nir_var_shader_out, NULL);

   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      NIR_PASS_V(nir, nir_lower_color_inputs);

   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset);

   /* Remove dead derefs, so that we can remove uniforms. */
   NIR_PASS_V(nir, nir_opt_dce);

   /* Remove uniforms because those should have been lowered to UBOs already. */
   nir_foreach_variable_with_modes_safe(var, nir, nir_var_uniform) {
      if (!glsl_type_get_image_count(var->type) &&
          !glsl_type_get_texture_count(var->type) &&
          !glsl_type_get_sampler_count(var->type))
         exec_node_remove(&var->node);
   }

   si_lower_nir(sscreen, nir);
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Update xfb info after mediump IO lowering. */
   if (nir->xfb_info && nir->info.outputs_written_16bit)
      nir_gather_xfb_info_from_intrinsics(nir);

   if (sscreen->options.inline_uniforms)
      nir_find_inlinable_uniforms(nir);

   /* Lower large variables that are always constant with load_constant intrinsics, which
    * get turned into PC-relative loads from a data section next to the shader.
    *
    * Run this once before lcssa because the added phis may prevent this
    * pass from operating correctly.
    *
    * nir_opt_large_constants may use op_amul (see nir_build_deref_offset),
    * or may create unneeded code, so run si_nir_opts if needed.
    */
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   bool progress = false;
   NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
   if (progress)
      si_nir_opts(sscreen, nir, false);

   NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */

   /* Must be after divergence analysis. */
   bool divergence_changed = false;
   NIR_PASS(divergence_changed, nir, si_mark_divergent_texture_non_uniform);
   /* Re-analyze the whole shader if texture instruction divergence changed. */
   if (divergence_changed)
      NIR_PASS_V(nir, nir_divergence_analysis);

   return NULL;
}