/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "nir_builder.h"
#include "si_pipe.h"

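/* Scalarization filter: keep 16-bit vec2 ALU results vectorized on chips with
 * packed 16-bit math, so they can map to the packed (v_pk_*) instructions;
 * scalarize everything else.
 */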
static bool si_alu_to_scalar_filter(const nir_instr *instr, const void *data)
{
   struct si_screen *sscreen = (struct si_screen *)data;

   if (sscreen->info.has_packed_math_16bit && instr->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(instr);

      if (alu->dest.dest.is_ssa &&
          alu->dest.dest.ssa.bit_size == 16 &&
          alu->dest.dest.ssa.num_components == 2)
         return false;
   }

   return true;
}

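/* Width callback for nir_opt_vectorize: re-vectorize 16-bit ALU ops into vec2
 * so they can use packed 16-bit math; keep everything else scalar.
 */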
static uint8_t si_vectorize_callback(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   if (nir_dest_bit_size(alu->dest.dest) == 16)
      return 2;

   return 1;
}

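/* Main NIR optimization loop, repeated until no pass reports progress.
 * "first" enables passes that only need to run on the first optimization of a
 * freshly created shader (array var splitting/shrinking, array copy detection).
 */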
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
{
   bool progress;

   do {
      progress = false;
      bool lower_alu_to_scalar = false;
      bool lower_phis_to_scalar = false;

      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
      NIR_PASS(progress, nir, nir_lower_alu_to_scalar, si_alu_to_scalar_filter, sscreen);
      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);

      if (first) {
         NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
         NIR_PASS(lower_alu_to_scalar, nir, nir_shrink_vec_array_vars, nir_var_function_temp);
         NIR_PASS(progress, nir, nir_opt_find_array_copies);
      }
      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
      NIR_PASS(progress, nir, nir_opt_dead_write_vars);

      NIR_PASS(lower_alu_to_scalar, nir, nir_opt_trivial_continues);
      /* (Constant) copy propagation is needed for txf with offsets. */
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      /* nir_opt_if_optimize_phi_true_false is disabled on LLVM14 (#6976) */
      NIR_PASS(lower_phis_to_scalar, nir, nir_opt_if,
         nir_opt_if_aggressive_last_continue |
            (LLVM_VERSION_MAJOR == 14 ? 0 : nir_opt_if_optimize_phi_true_false));
      NIR_PASS(progress, nir, nir_opt_dead_cf);

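      /* The passes above may have exposed vector ALU ops or vector phis;
       * scalarize them now and count that as progress for the loop.
       */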
      if (lower_alu_to_scalar)
         NIR_PASS_V(nir, nir_lower_alu_to_scalar, si_alu_to_scalar_filter, sscreen);
      if (lower_phis_to_scalar)
         NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
      progress |= lower_alu_to_scalar | lower_phis_to_scalar;

      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);

      /* Needed for algebraic lowering */
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      if (!nir->info.flrp_lowered) {
         unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) |
                               (nir->options->lower_flrp32 ? 32 : 0) |
                               (nir->options->lower_flrp64 ? 64 : 0);
         assert(lower_flrp);
         bool lower_flrp_progress = false;

         NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */);
         if (lower_flrp_progress) {
            NIR_PASS(progress, nir, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         nir->info.flrp_lowered = true;
      }

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_opt_conditional_discard);
      if (nir->options->max_unroll_iterations) {
         NIR_PASS(progress, nir, nir_opt_loop_unroll);
      }

      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_opt_move_discards_to_top);

      if (sscreen->info.has_packed_math_16bit)
         NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback, NULL);
   } while (progress);

   NIR_PASS_V(nir, nir_lower_var_copies);
}

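/* Late optimizations run after the main loop has converged: apply late
 * algebraic rules (e.g. ffma fusing) followed by cleanup, repeated until
 * nir_opt_algebraic_late stops making progress.
 */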
void si_nir_late_opts(nir_shader *nir)
{
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = false;
      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
      NIR_PASS_V(nir, nir_opt_constant_folding);

      /* We should run this after constant folding for stages that support indirect
       * inputs/outputs.
       */
      if (nir->options->support_indirect_inputs & BITFIELD_BIT(nir->info.stage) ||
          nir->options->support_indirect_outputs & BITFIELD_BIT(nir->info.stage))
         NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);

      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_cse);
   }
}

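/* Fold 16-bit conversions into texture/image instructions where the hardware
 * can consume or produce 16-bit values directly (A16/G16 sources, 16-bit
 * destinations and image data), then re-run the optimizers if anything changed.
 */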
static void si_late_optimize_16bit_samplers(struct si_screen *sscreen, nir_shader *nir)
{
   /* Optimize types of image_sample sources and destinations.
    *
    * The image_sample sources bit sizes are:
    *   nir_tex_src_coord:       a16 ? 16 : 32
    *   nir_tex_src_comparator:  32
    *   nir_tex_src_offset:      32
    *   nir_tex_src_bias:        a16 ? 16 : 32
    *   nir_tex_src_lod:         a16 ? 16 : 32
    *   nir_tex_src_min_lod:     a16 ? 16 : 32
    *   nir_tex_src_ms_index:    a16 ? 16 : 32
    *   nir_tex_src_ddx:         has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *   nir_tex_src_ddy:         has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *
    * We only use a16/g16 if all of the affected sources are 16bit.
    */
   bool has_g16 = sscreen->info.gfx_level >= GFX10 && LLVM_VERSION_MAJOR >= 12;
   struct nir_fold_tex_srcs_options fold_srcs_options[] = {
      {
         .sampler_dims =
            ~(BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) | BITFIELD_BIT(GLSL_SAMPLER_DIM_BUF)),
         .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) |
                      (1 << nir_tex_src_bias) | (1 << nir_tex_src_min_lod) |
                      (1 << nir_tex_src_ms_index) |
                      (has_g16 ? 0 : (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy)),
      },
      {
         .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
         .src_types = (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy),
      },
   };
   struct nir_fold_16bit_tex_image_options fold_16bit_options = {
      .rounding_mode = nir_rounding_mode_rtne,
      .fold_tex_dest = true,
      .fold_image_load_store_data = true,
      .fold_srcs_options_count = has_g16 ? 2 : 1,
      .fold_srcs_options = fold_srcs_options,
   };
   bool changed = false;
   NIR_PASS(changed, nir, nir_fold_16bit_tex_image, &fold_16bit_options);

   if (changed) {
      si_nir_opts(sscreen, nir, false);
      si_nir_late_opts(nir);
   }
}

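/* Lower sparse-residency intrinsics to plain ALU on the residency code:
 * a code of 0 means all accessed texels are resident, and combining codes
 * is just a bitwise OR.
 */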
static bool
lower_intrinsic_filter(const nir_instr *instr, const void *dummy)
{
   return instr->type == nir_instr_type_intrinsic;
}

static nir_ssa_def *
lower_intrinsic_instr(nir_builder *b, nir_instr *instr, void *dummy)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_is_sparse_texels_resident:
      /* code==0 means sparse texels are resident */
      return nir_ieq_imm(b, intrin->src[0].ssa, 0);
   case nir_intrinsic_sparse_residency_code_and:
      return nir_ior(b, intrin->src[0].ssa, intrin->src[1].ssa);
   default:
      return NULL;
   }
}

static bool si_lower_intrinsics(nir_shader *nir)
{
   return nir_shader_lower_instructions(nir,
                                        lower_intrinsic_filter,
                                        lower_intrinsic_instr,
                                        NULL);
}

/**
 * Perform "lowering" operations on the NIR that are run once when the shader
 * selector is created.
 */
static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
{
   /* Perform lowerings (and optimizations) of code.
    *
    * Performance considerations aside, we must:
    * - lower certain ALU operations
    * - ensure constant offsets for texture instructions are folded
    *   and copy-propagated
    */

   static const struct nir_lower_tex_options lower_tex_options = {
      .lower_txp = ~0u,
      .lower_txs_cube_array = true,
      .lower_invalid_implicit_lod = true,
      .lower_tg4_offsets = true,
   };
   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);

   static const struct nir_lower_image_options lower_image_options = {
      .lower_cube_size = true,
   };
   NIR_PASS_V(nir, nir_lower_image, &lower_image_options);

   NIR_PASS_V(nir, si_lower_intrinsics);

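   /* Lower subgroup operations (ballots, votes, subgroup masks) to scalar
    * operations with 64-bit ballots.
    */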
   const nir_lower_subgroups_options subgroups_options = {
      .subgroup_size = 64,
      .ballot_bit_size = 64,
      .ballot_components = 1,
      .lower_to_scalar = true,
      .lower_subgroup_masks = true,
      .lower_vote_trivial = false,
      .lower_vote_eq = true,
   };
   NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options);

   NIR_PASS_V(nir, nir_lower_discard_or_demote,
              (sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) ||
               nir->info.use_legacy_math_rules);

   /* Lower load constants to scalar and then clean up the mess */
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
   NIR_PASS_V(nir, nir_lower_var_copies);
   NIR_PASS_V(nir, nir_opt_intrinsics);
   NIR_PASS_V(nir, nir_lower_system_values);
   NIR_PASS_V(nir, nir_lower_compute_system_values, NULL);

   /* si_nir_kill_outputs and ac_nir_optimize_outputs require outputs to be scalar. */
   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL ||
       nir->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);

   if (nir->info.stage == MESA_SHADER_COMPUTE) {
      if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
         /* If we are shuffling local_invocation_id for quad derivatives, we
          * need to derive local_invocation_index from local_invocation_id
          * first, so that the value corresponds to the shuffled
          * local_invocation_id.
          */
         nir_lower_compute_system_values_options options = {0};
         options.lower_local_invocation_index = true;
         NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
      }

      nir_opt_cse(nir); /* CSE load_local_invocation_id */
      nir_lower_compute_system_values_options options = {0};
      options.shuffle_local_ids_for_quad_derivatives = true;
      NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
   }

   if (sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16)) {
      NIR_PASS_V(nir, nir_lower_mediump_io,
                 /* TODO: LLVM fails to compile this test if VS inputs are 16-bit:
                  * dEQP-GLES31.functional.shaders.builtin_functions.integer.bitfieldinsert.uvec3_lowp_geometry
                  */
                 (nir->info.stage != MESA_SHADER_VERTEX ? nir_var_shader_in : 0) | nir_var_shader_out,
                 BITFIELD64_BIT(VARYING_SLOT_PNTC) | BITFIELD64_RANGE(VARYING_SLOT_VAR0, 32),
                 true);
   }

   si_nir_opts(sscreen, nir, true);
   /* Run late optimizations to fuse ffma and eliminate 16-bit conversions. */
   si_nir_late_opts(nir);

   if (sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16))
      si_late_optimize_16bit_samplers(sscreen, nir);

   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
}

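/* pipe_screen::finalize_nir hook: lowers I/O to intrinsics, strips uniforms
 * that were already lowered to UBOs, and runs the one-time lowering and
 * optimization passes above before the shader is stored in the selector.
 */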
char *si_finalize_nir(struct pipe_screen *screen, void *nirptr)
{
   struct si_screen *sscreen = (struct si_screen *)screen;
   struct nir_shader *nir = (struct nir_shader *)nirptr;

   nir_lower_io_passes(nir);

   /* Remove dead derefs, so that we can remove uniforms. */
   NIR_PASS_V(nir, nir_opt_dce);

   /* Remove uniforms because those should have been lowered to UBOs already. */
   nir_foreach_variable_with_modes_safe(var, nir, nir_var_uniform) {
      if (!glsl_type_get_image_count(var->type) &&
          !glsl_type_get_sampler_count(var->type))
         exec_node_remove(&var->node);
   }

   si_lower_nir(sscreen, nir);
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   if (sscreen->options.inline_uniforms)
      nir_find_inlinable_uniforms(nir);

   /* Lower large variables that are always constant with load_constant intrinsics, which
    * get turned into PC-relative loads from a data section next to the shader.
    *
    * Run this once before lcssa because the added phis may prevent this
    * pass from operating correctly.
    *
    * nir_opt_large_constants may use op_amul (see nir_build_deref_offset),
    * or may create unneeded code, so run si_nir_opts if needed.
    */
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   bool progress = false;
   NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
   if (progress)
      si_nir_opts(sscreen, nir, false);

   NIR_PASS_V(nir, nir_convert_to_lcssa, true, true); /* required by divergence analysis */
   NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */

   return NULL;
}