/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "nir_builder.h"
#include "nir_xfb_info.h"
#include "si_pipe.h"
#include "ac_nir.h"
#include "aco_interface.h"

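/* Filter callback for nir_lower_alu_to_scalar. Return false to keep 16-bit
 * 2-component ALU instructions vectorized so they can use packed (2x16) math;
 * return true to scalarize everything else. "data" is the use_aco flag.
 */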
bool si_alu_to_scalar_packed_math_filter(const nir_instr *instr, const void *data)
{
   if (instr->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      bool use_aco = (bool)data;

      if (alu->def.bit_size == 16 && alu->def.num_components == 2 &&
          (!use_aco || aco_nir_op_supports_packed_math_16bit(alu)))
         return false;
   }

   return true;
}

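/* Width callback for nir_opt_vectorize: 16-bit ALU instructions that can use
 * packed math are vectorized to vec2; everything else is left scalar.
 * "data" is the use_aco flag.
 */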
static uint8_t si_vectorize_callback(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   if (alu->def.bit_size != 16)
      return 1;

   bool use_aco = (bool)data;

   if (use_aco) {
      return aco_nir_op_supports_packed_math_16bit(alu) ? 2 : 1;
   } else {
      switch (alu->op) {
      case nir_op_unpack_32_2x16_split_x:
      case nir_op_unpack_32_2x16_split_y:
         return 1;
      default:
         return 2;
      }
   }
}

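/* Callback for nir_lower_bit_size: lower sub-32-bit imul_high/umul_high to
 * 32 bits; return 0 (no lowering) for everything else.
 */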
static unsigned si_lower_bit_size_callback(const nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *alu = nir_instr_as_alu(instr);

   switch (alu->op) {
   case nir_op_imul_high:
   case nir_op_umul_high:
      if (alu->def.bit_size < 32)
         return 32;
      break;
   default:
      break;
   }

   return 0;
}

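/* The main NIR optimization loop: run the passes below repeatedly until none
 * of them makes progress. "first" enables passes that only need to run the
 * first time this is called on a shader.
 */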
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
{
   bool progress;

   do {
      progress = false;
      bool lower_alu_to_scalar = false;
      bool lower_phis_to_scalar = false;

      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
      NIR_PASS(progress, nir, nir_lower_alu_to_scalar,
               nir->options->lower_to_scalar_filter, (void *)sscreen->use_aco);
      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);

      if (first) {
         NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
         NIR_PASS(lower_alu_to_scalar, nir, nir_shrink_vec_array_vars, nir_var_function_temp);
         NIR_PASS(progress, nir, nir_opt_find_array_copies);
      }
      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
      NIR_PASS(progress, nir, nir_opt_dead_write_vars);

      NIR_PASS(lower_alu_to_scalar, nir, nir_opt_loop);
      /* (Constant) copy propagation is needed for txf with offsets. */
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      /* nir_opt_if_optimize_phi_true_false is disabled on LLVM14 (#6976) */
      NIR_PASS(lower_phis_to_scalar, nir, nir_opt_if,
               nir_opt_if_optimize_phi_true_false);
      NIR_PASS(progress, nir, nir_opt_dead_cf);

      if (lower_alu_to_scalar) {
         NIR_PASS_V(nir, nir_lower_alu_to_scalar,
                    nir->options->lower_to_scalar_filter, (void *)sscreen->use_aco);
      }
      if (lower_phis_to_scalar)
         NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
      progress |= lower_alu_to_scalar | lower_phis_to_scalar;

      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);

      /* Needed for algebraic lowering */
      NIR_PASS(progress, nir, nir_lower_bit_size, si_lower_bit_size_callback, NULL);
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      if (!nir->info.flrp_lowered) {
         unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) |
                               (nir->options->lower_flrp32 ? 32 : 0) |
                               (nir->options->lower_flrp64 ? 64 : 0);
         assert(lower_flrp);
         bool lower_flrp_progress = false;

         NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */);
         if (lower_flrp_progress) {
            NIR_PASS(progress, nir, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         nir->info.flrp_lowered = true;
      }

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_opt_conditional_discard);
      if (nir->options->max_unroll_iterations) {
         NIR_PASS(progress, nir, nir_opt_loop_unroll);
      }

      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_opt_move_discards_to_top);

      if (sscreen->info.has_packed_math_16bit) {
         NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback,
                  (void *)sscreen->use_aco);
      }
   } while (progress);

   NIR_PASS_V(nir, nir_lower_var_copies);
}

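/* Late optimizations: run nir_opt_algebraic_late plus cleanup passes in a
 * loop until it stops making progress, e.g. to fuse ffma.
 */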
void si_nir_late_opts(nir_shader *nir)
{
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = false;
      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
      NIR_PASS_V(nir, nir_opt_constant_folding);

      /* We should run this after constant folding for stages that support indirect
       * inputs/outputs.
       */
      if (nir->options->support_indirect_inputs & BITFIELD_BIT(nir->info.stage) ||
          nir->options->support_indirect_outputs & BITFIELD_BIT(nir->info.stage))
         NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);

      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_cse);
   }
}

static void si_late_optimize_16bit_samplers(struct si_screen *sscreen, nir_shader *nir)
{
   /* Optimize types of image_sample sources and destinations.
    *
    * The image_sample sources bit sizes are:
    *   nir_tex_src_coord:       a16 ? 16 : 32
    *   nir_tex_src_comparator:  32
    *   nir_tex_src_offset:      32
    *   nir_tex_src_bias:        a16 ? 16 : 32
    *   nir_tex_src_lod:         a16 ? 16 : 32
    *   nir_tex_src_min_lod:     a16 ? 16 : 32
    *   nir_tex_src_ms_index:    a16 ? 16 : 32
    *   nir_tex_src_ddx:         has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *   nir_tex_src_ddy:         has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *
    * We only use a16/g16 if all of the affected sources are 16bit.
    */
   bool has_g16 = sscreen->info.gfx_level >= GFX10;
   struct nir_fold_tex_srcs_options fold_srcs_options[] = {
      {
         .sampler_dims =
            ~(BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) | BITFIELD_BIT(GLSL_SAMPLER_DIM_BUF)),
         .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) |
                      (1 << nir_tex_src_bias) | (1 << nir_tex_src_min_lod) |
                      (1 << nir_tex_src_ms_index) |
                      (has_g16 ? 0 : (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy)),
      },
      {
         .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
         .src_types = (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy),
      },
   };
   struct nir_fold_16bit_tex_image_options fold_16bit_options = {
      .rounding_mode = nir_rounding_mode_rtz,
      .fold_tex_dest_types = nir_type_float,
      .fold_image_dest_types = nir_type_float,
      .fold_image_store_data = true,
      .fold_srcs_options_count = has_g16 ? 2 : 1,
      .fold_srcs_options = fold_srcs_options,
   };
   bool changed = false;
   NIR_PASS(changed, nir, nir_fold_16bit_tex_image, &fold_16bit_options);

   if (changed) {
      si_nir_opts(sscreen, nir, false);
      si_nir_late_opts(nir);
   }
}

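/* Lowering of sparse-residency intrinsics to plain ALU: the residency code is
 * 0 when all texels are resident, so "is resident" becomes a compare with 0,
 * and ANDing two residency codes becomes a bitwise OR.
 */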
static bool
lower_intrinsic_filter(const nir_instr *instr, const void *dummy)
{
   return instr->type == nir_instr_type_intrinsic;
}

static nir_def *
lower_intrinsic_instr(nir_builder *b, nir_instr *instr, void *dummy)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_is_sparse_texels_resident:
      /* code==0 means sparse texels are resident */
      return nir_ieq_imm(b, intrin->src[0].ssa, 0);
   case nir_intrinsic_sparse_residency_code_and:
      return nir_ior(b, intrin->src[0].ssa, intrin->src[1].ssa);
   default:
      return NULL;
   }
}

static bool si_lower_intrinsics(nir_shader *nir)
{
   return nir_shader_lower_instructions(nir,
                                        lower_intrinsic_filter,
                                        lower_intrinsic_instr,
                                        NULL);
}

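/* Options for nir_lower_subgroups (used in si_lower_nir): Wave64 subgroup
 * size with 64-bit, single-component ballots.
 */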
const nir_lower_subgroups_options si_nir_subgroups_options = {
   .subgroup_size = 64,
   .ballot_bit_size = 64,
   .ballot_components = 1,
   .lower_to_scalar = true,
   .lower_subgroup_masks = true,
   .lower_relative_shuffle = true,
   .lower_shuffle_to_32bit = true,
   .lower_vote_trivial = false,
   .lower_vote_eq = true,
   .lower_vote_bool_eq = true,
   .lower_inverse_ballot = true,
   .lower_boolean_reduce = true,
   .lower_boolean_shuffle = true,
};

/**
 * Perform "lowering" operations on the NIR that are run once when the shader
 * selector is created.
 */
static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
{
   /* Perform lowerings (and optimizations) of code.
    *
    * Performance considerations aside, we must:
    * - lower certain ALU operations
    * - ensure constant offsets for texture instructions are folded
    *   and copy-propagated
    */
   NIR_PASS_V(nir, nir_lower_int64);

   const struct nir_lower_tex_options lower_tex_options = {
      .lower_txp = ~0u,
      .lower_txf_offset = true,
      .lower_txs_cube_array = true,
      .lower_invalid_implicit_lod = true,
      .lower_tg4_offsets = true,
      .lower_to_fragment_fetch_amd = sscreen->info.gfx_level < GFX11,
      .lower_1d = sscreen->info.gfx_level == GFX9,
   };
   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);

   const struct nir_lower_image_options lower_image_options = {
      .lower_cube_size = true,
      .lower_to_fragment_mask_load_amd = sscreen->info.gfx_level < GFX11,
   };
   NIR_PASS_V(nir, nir_lower_image, &lower_image_options);

   NIR_PASS_V(nir, si_lower_intrinsics);

   NIR_PASS_V(nir, ac_nir_lower_sin_cos);

   NIR_PASS_V(nir, nir_lower_subgroups, &si_nir_subgroups_options);

   NIR_PASS_V(nir, nir_lower_discard_or_demote, true);

   /* Lower load constants to scalar and then clean up the mess */
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
   NIR_PASS_V(nir, nir_lower_var_copies);
   NIR_PASS_V(nir, nir_opt_intrinsics);
   NIR_PASS_V(nir, nir_lower_system_values);

   /* si_nir_kill_outputs and ac_nir_optimize_outputs require outputs to be scalar. */
   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL ||
       nir->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);

   if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      unsigned flags = nir_lower_gs_intrinsics_per_stream;
      if (sscreen->use_ngg) {
         flags |= nir_lower_gs_intrinsics_count_primitives |
            nir_lower_gs_intrinsics_count_vertices_per_primitive |
            nir_lower_gs_intrinsics_overwrite_incomplete;
      }

      NIR_PASS_V(nir, nir_lower_gs_intrinsics, flags);
   }

   if (nir->info.stage == MESA_SHADER_COMPUTE) {
      nir_lower_compute_system_values_options options = {0};

      /* gl_LocalInvocationIndex must be derived from gl_LocalInvocationID.xyz to make it correct
       * with quad derivatives. Using gl_SubgroupID for that (which is what we do by default) is
       * incorrect with a non-linear thread order.
       */
      options.lower_local_invocation_index =
         nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS;
      NIR_PASS_V(nir, nir_lower_compute_system_values, &options);

      if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
         nir_opt_cse(nir); /* CSE load_local_invocation_id */
         memset(&options, 0, sizeof(options));
         options.shuffle_local_ids_for_quad_derivatives = true;
         NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
      }
   }

   if (sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16)) {
      NIR_PASS_V(nir, nir_lower_mediump_io,
                 /* TODO: LLVM fails to compile this test if VS inputs are 16-bit:
                  * dEQP-GLES31.functional.shaders.builtin_functions.integer.bitfieldinsert.uvec3_lowp_geometry
                  */
                 (nir->info.stage != MESA_SHADER_VERTEX ? nir_var_shader_in : 0) | nir_var_shader_out,
                 BITFIELD64_BIT(VARYING_SLOT_PNTC) | BITFIELD64_RANGE(VARYING_SLOT_VAR0, 32),
                 true);
   }

   si_nir_opts(sscreen, nir, true);
   /* Run late optimizations to fuse ffma and eliminate 16-bit conversions. */
   si_nir_late_opts(nir);

   if (sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16))
      si_late_optimize_16bit_samplers(sscreen, nir);

   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);

   NIR_PASS_V(nir, nir_lower_fp16_casts, nir_lower_fp16_split_fp64);
}

static bool si_mark_divergent_texture_non_uniform(struct nir_shader *nir)
{
   assert(nir->info.divergence_analysis_run);

   /* sampler_non_uniform and texture_non_uniform are always false in GLSL,
    * but this can lead to unexpected behavior if the texture/sampler index
    * comes from a vertex attribute.
    *
    * For instance, two consecutive draws using two different index values
    * could be squashed together by the hardware, producing a single draw
    * with a non-dynamically-uniform index.
    *
    * To avoid this, detect divergent indexing and mark it as non-uniform,
    * so that a waterfall loop can be applied to these indices later (either
    * by the LLVM backend or by nir_lower_non_uniform_access).
    *
    * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/2253
    */

   bool divergence_changed = false;

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_tex)
            continue;

         nir_tex_instr *tex = nir_instr_as_tex(instr);
         for (int i = 0; i < tex->num_srcs; i++) {
            bool divergent = tex->src[i].src.ssa->divergent;

            switch (tex->src[i].src_type) {
            case nir_tex_src_texture_deref:
            case nir_tex_src_texture_handle:
               tex->texture_non_uniform |= divergent;
               break;
            case nir_tex_src_sampler_deref:
            case nir_tex_src_sampler_handle:
               tex->sampler_non_uniform |= divergent;
               break;
            default:
               break;
            }
         }

         /* If dest is already divergent, divergence won't change. */
         divergence_changed |= !tex->def.divergent &&
            (tex->texture_non_uniform || tex->sampler_non_uniform);
      }
   }

   nir_metadata_preserve(impl, nir_metadata_all);
   return divergence_changed;
}

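/* Gallium finalize_nir screen hook: lowers I/O, removes dead variables and
 * leftover uniforms, runs si_lower_nir and the common optimizations, and
 * performs divergence analysis. Returns NULL (no error string).
 */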
char *si_finalize_nir(struct pipe_screen *screen, void *nirptr)
{
   struct si_screen *sscreen = (struct si_screen *)screen;
   struct nir_shader *nir = (struct nir_shader *)nirptr;

   nir_lower_io_passes(nir, false);
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_in | nir_var_shader_out, NULL);

   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      NIR_PASS_V(nir, nir_lower_color_inputs);

   NIR_PASS_V(nir, ac_nir_lower_subdword_loads,
              (ac_nir_lower_subdword_options) {
                 .modes_1_comp = nir_var_mem_ubo,
                 .modes_N_comps = nir_var_mem_ubo | nir_var_mem_ssbo
              });
   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset);

   /* Remove dead derefs, so that we can remove uniforms. */
   NIR_PASS_V(nir, nir_opt_dce);

   /* Remove uniforms because those should have been lowered to UBOs already. */
   nir_foreach_variable_with_modes_safe(var, nir, nir_var_uniform) {
      if (!glsl_type_get_image_count(var->type) &&
          !glsl_type_get_texture_count(var->type) &&
          !glsl_type_get_sampler_count(var->type))
         exec_node_remove(&var->node);
   }

   si_lower_nir(sscreen, nir);
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Update xfb info after the mediump I/O lowering. */
   if (nir->xfb_info && nir->info.outputs_written_16bit)
      nir_gather_xfb_info_from_intrinsics(nir);

   if (sscreen->options.inline_uniforms)
      nir_find_inlinable_uniforms(nir);

   /* Lower large variables that are always constant with load_constant intrinsics, which
    * get turned into PC-relative loads from a data section next to the shader.
    *
    * Run this once before lcssa because the added phis may prevent this
    * pass from operating correctly.
    *
    * nir_opt_large_constants may use op_amul (see nir_build_deref_offset),
    * or may create unneeded code, so run si_nir_opts if needed.
    */
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   bool progress = false;
   NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
   if (progress)
      si_nir_opts(sscreen, nir, false);

   NIR_PASS_V(nir, nir_convert_to_lcssa, true, true); /* required by divergence analysis */
   NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */

   /* Must be after divergence analysis. */
   bool divergence_changed = false;
   NIR_PASS(divergence_changed, nir, si_mark_divergent_texture_non_uniform);
   /* Re-analyze the whole shader if texture instruction divergence changed. */
   if (divergence_changed)
      NIR_PASS_V(nir, nir_divergence_analysis);

   return NULL;
}