/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "nir_builder.h"
#include "nir_xfb_info.h"
#include "si_pipe.h"
#include "ac_nir.h"
#include "aco_interface.h"

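/* Filter for nir_lower_alu_to_scalar: keep 16-bit 2-component ALU ops vectorized
 * so they can later use packed 16-bit math. With ACO, only ops that ACO can emit
 * as packed 16-bit instructions are kept vectorized.
 */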
bool si_alu_to_scalar_packed_math_filter(const nir_instr *instr, const void *data)
{
   if (instr->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      bool use_aco = (bool)data;

      if (alu->def.bit_size == 16 && alu->def.num_components == 2 &&
          (!use_aco || aco_nir_op_supports_packed_math_16bit(alu)))
         return false;
   }

   return true;
}

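/* Callback for nir_opt_vectorize: return the desired vector width for an ALU
 * instruction. 16-bit ALU ops that can use packed math are vectorized to
 * 2 components; everything else is kept scalar.
 */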
static uint8_t si_vectorize_callback(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   if (alu->def.bit_size != 16)
      return 1;

   bool use_aco = (bool)data;

   if (use_aco) {
      return aco_nir_op_supports_packed_math_16bit(alu) ? 2 : 1;
   } else {
      switch (alu->op) {
      case nir_op_unpack_32_2x16_split_x:
      case nir_op_unpack_32_2x16_split_y:
         return 1;
      default:
         return 2;
      }
   }
}

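/* Callback for nir_lower_bit_size: imul_high/umul_high narrower than 32 bits
 * are widened to 32 bits.
 */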
static unsigned si_lower_bit_size_callback(const nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *alu = nir_instr_as_alu(instr);

   switch (alu->op) {
   case nir_op_imul_high:
   case nir_op_umul_high:
      if (alu->def.bit_size < 32)
         return 32;
      break;
   default:
      break;
   }

   return 0;
}

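/* Run the main NIR optimization loop, repeating the passes until none of them
 * makes progress. "first" enables passes that only need to run once
 * (array splitting/shrinking and array copy detection).
 */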
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
{
   bool progress;

   do {
      progress = false;
      bool lower_alu_to_scalar = false;
      bool lower_phis_to_scalar = false;

      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
      NIR_PASS(progress, nir, nir_lower_alu_to_scalar,
               nir->options->lower_to_scalar_filter, (void *)sscreen->use_aco);
      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);

      if (first) {
         NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
         NIR_PASS(lower_alu_to_scalar, nir, nir_shrink_vec_array_vars, nir_var_function_temp);
         NIR_PASS(progress, nir, nir_opt_find_array_copies);
      }
      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
      NIR_PASS(progress, nir, nir_opt_dead_write_vars);

      NIR_PASS(lower_alu_to_scalar, nir, nir_opt_loop);
      /* (Constant) copy propagation is needed for txf with offsets. */
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      /* nir_opt_if_optimize_phi_true_false is disabled on LLVM14 (#6976) */
      NIR_PASS(lower_phis_to_scalar, nir, nir_opt_if,
               nir_opt_if_optimize_phi_true_false);
      NIR_PASS(progress, nir, nir_opt_dead_cf);

      if (lower_alu_to_scalar) {
         NIR_PASS_V(nir, nir_lower_alu_to_scalar,
                    nir->options->lower_to_scalar_filter, (void *)sscreen->use_aco);
      }
      if (lower_phis_to_scalar)
         NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
      progress |= lower_alu_to_scalar | lower_phis_to_scalar;

      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);

      /* Needed for algebraic lowering */
      NIR_PASS(progress, nir, nir_lower_bit_size, si_lower_bit_size_callback, NULL);
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      if (!nir->info.flrp_lowered) {
         unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) |
                               (nir->options->lower_flrp32 ? 32 : 0) |
                               (nir->options->lower_flrp64 ? 64 : 0);
         assert(lower_flrp);
         bool lower_flrp_progress = false;

         NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */);
         if (lower_flrp_progress) {
            NIR_PASS(progress, nir, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         nir->info.flrp_lowered = true;
      }

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_opt_conditional_discard);
      if (nir->options->max_unroll_iterations) {
         NIR_PASS(progress, nir, nir_opt_loop_unroll);
      }

      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_opt_move_discards_to_top);

      if (sscreen->info.has_packed_math_16bit) {
         NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback,
                  (void *)sscreen->use_aco);
      }
   } while (progress);

   NIR_PASS_V(nir, nir_lower_var_copies);
}

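/* Late optimizations: repeat late algebraic lowering plus cleanup passes until
 * nir_opt_algebraic_late stops making progress, and fold constant offsets into
 * IO base indices for stages that support indirect inputs/outputs.
 */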
void si_nir_late_opts(nir_shader *nir)
{
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = false;
      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
      NIR_PASS_V(nir, nir_opt_constant_folding);

      /* We should run this after constant folding for stages that support indirect
       * inputs/outputs.
       */
      if (nir->options->support_indirect_inputs & BITFIELD_BIT(nir->info.stage) ||
          nir->options->support_indirect_outputs & BITFIELD_BIT(nir->info.stage))
         NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);

      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_cse);
   }
}

static void si_late_optimize_16bit_samplers(struct si_screen *sscreen, nir_shader *nir)
{
   /* Optimize types of image_sample sources and destinations.
    *
    * The image_sample sources bit sizes are:
    *   nir_tex_src_coord:       a16 ? 16 : 32
    *   nir_tex_src_comparator:  32
    *   nir_tex_src_offset:      32
    *   nir_tex_src_bias:        a16 ? 16 : 32
    *   nir_tex_src_lod:         a16 ? 16 : 32
    *   nir_tex_src_min_lod:     a16 ? 16 : 32
    *   nir_tex_src_ms_index:    a16 ? 16 : 32
    *   nir_tex_src_ddx:         has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *   nir_tex_src_ddy:         has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *
    * We only use a16/g16 if all of the affected sources are 16bit.
    */
   bool has_g16 = sscreen->info.gfx_level >= GFX10;
   struct nir_fold_tex_srcs_options fold_srcs_options[] = {
      {
         .sampler_dims =
            ~(BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) | BITFIELD_BIT(GLSL_SAMPLER_DIM_BUF)),
         .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) |
                      (1 << nir_tex_src_bias) | (1 << nir_tex_src_min_lod) |
                      (1 << nir_tex_src_ms_index) |
                      (has_g16 ? 0 : (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy)),
      },
      {
         .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
         .src_types = (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy),
      },
   };
   struct nir_fold_16bit_tex_image_options fold_16bit_options = {
      .rounding_mode = nir_rounding_mode_rtz,
      .fold_tex_dest_types = nir_type_float,
      .fold_image_dest_types = nir_type_float,
      .fold_image_store_data = true,
      .fold_srcs_options_count = has_g16 ? 2 : 1,
      .fold_srcs_options = fold_srcs_options,
   };
   bool changed = false;
   NIR_PASS(changed, nir, nir_fold_16bit_tex_image, &fold_16bit_options);

   if (changed) {
      si_nir_opts(sscreen, nir, false);
      si_nir_late_opts(nir);
   }
}

static bool
lower_intrinsic_filter(const nir_instr *instr, const void *dummy)
{
   return instr->type == nir_instr_type_intrinsic;
}

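/* Lower sparse-residency intrinsics to plain ALU: is_sparse_texels_resident
 * becomes a compare against 0 (residency code 0 means resident), and
 * sparse_residency_code_and becomes a bitwise OR of the two codes.
 */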
static nir_def *
lower_intrinsic_instr(nir_builder *b, nir_instr *instr, void *dummy)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_is_sparse_texels_resident:
      /* code==0 means sparse texels are resident */
      return nir_ieq_imm(b, intrin->src[0].ssa, 0);
   case nir_intrinsic_sparse_residency_code_and:
      return nir_ior(b, intrin->src[0].ssa, intrin->src[1].ssa);
   default:
      return NULL;
   }
}

static bool si_lower_intrinsics(nir_shader *nir)
{
   return nir_shader_lower_instructions(nir,
                                        lower_intrinsic_filter,
                                        lower_intrinsic_instr,
                                        NULL);
}

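/* Options for nir_lower_subgroups: Wave64 subgroups with 64-bit, single-component
 * ballots; most subgroup operations are lowered to scalar or simpler forms.
 */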
const nir_lower_subgroups_options si_nir_subgroups_options = {
   .subgroup_size = 64,
   .ballot_bit_size = 64,
   .ballot_components = 1,
   .lower_to_scalar = true,
   .lower_subgroup_masks = true,
   .lower_relative_shuffle = true,
   .lower_shuffle_to_32bit = true,
   .lower_vote_trivial = false,
   .lower_vote_eq = true,
   .lower_vote_bool_eq = true,
   .lower_inverse_ballot = true,
   .lower_boolean_reduce = true,
   .lower_boolean_shuffle = true,
};

/**
 * Perform "lowering" operations on the NIR that are run once when the shader
 * selector is created.
 */
static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
{
   /* Perform lowerings (and optimizations) of code.
    *
    * Performance considerations aside, we must:
    * - lower certain ALU operations
    * - ensure constant offsets for texture instructions are folded
    *   and copy-propagated
    */
   NIR_PASS_V(nir, nir_lower_int64);

   const struct nir_lower_tex_options lower_tex_options = {
      .lower_txp = ~0u,
      .lower_txf_offset = true,
      .lower_txs_cube_array = true,
      .lower_invalid_implicit_lod = true,
      .lower_tg4_offsets = true,
      .lower_to_fragment_fetch_amd = sscreen->info.gfx_level < GFX11,
      .lower_1d = sscreen->info.gfx_level == GFX9,
   };
   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);

   const struct nir_lower_image_options lower_image_options = {
      .lower_cube_size = true,
      .lower_to_fragment_mask_load_amd = sscreen->info.gfx_level < GFX11,
   };
   NIR_PASS_V(nir, nir_lower_image, &lower_image_options);

   NIR_PASS_V(nir, si_lower_intrinsics);

   NIR_PASS_V(nir, ac_nir_lower_sin_cos);

   NIR_PASS_V(nir, nir_lower_subgroups, &si_nir_subgroups_options);

   NIR_PASS_V(nir, nir_lower_discard_or_demote, true);

   /* Lower load constants to scalar and then clean up the mess */
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
   NIR_PASS_V(nir, nir_lower_var_copies);
   NIR_PASS_V(nir, nir_opt_intrinsics);
   NIR_PASS_V(nir, nir_lower_system_values);

   /* si_nir_kill_outputs and ac_nir_optimize_outputs require outputs to be scalar. */
   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL ||
       nir->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);

   if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      unsigned flags = nir_lower_gs_intrinsics_per_stream;
      if (sscreen->use_ngg) {
         flags |= nir_lower_gs_intrinsics_count_primitives |
                  nir_lower_gs_intrinsics_count_vertices_per_primitive |
                  nir_lower_gs_intrinsics_overwrite_incomplete;
      }

      NIR_PASS_V(nir, nir_lower_gs_intrinsics, flags);
   }

   if (nir->info.stage == MESA_SHADER_COMPUTE) {
      nir_lower_compute_system_values_options options = {0};

      /* gl_LocalInvocationIndex must be derived from gl_LocalInvocationID.xyz to make it correct
       * with quad derivatives. Using gl_SubgroupID for that (which is what we do by default) is
       * incorrect with a non-linear thread order.
       */
      options.lower_local_invocation_index =
         nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS;
      NIR_PASS_V(nir, nir_lower_compute_system_values, &options);

      if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
         nir_opt_cse(nir); /* CSE load_local_invocation_id */
         memset(&options, 0, sizeof(options));
         options.shuffle_local_ids_for_quad_derivatives = true;
         NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
      }
   }

   if (sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16)) {
      NIR_PASS_V(nir, nir_lower_mediump_io,
                 /* TODO: LLVM fails to compile this test if VS inputs are 16-bit:
                  * dEQP-GLES31.functional.shaders.builtin_functions.integer.bitfieldinsert.uvec3_lowp_geometry
                  */
                 (nir->info.stage != MESA_SHADER_VERTEX ? nir_var_shader_in : 0) | nir_var_shader_out,
                 BITFIELD64_BIT(VARYING_SLOT_PNTC) | BITFIELD64_RANGE(VARYING_SLOT_VAR0, 32),
                 true);
   }

   si_nir_opts(sscreen, nir, true);
   /* Run late optimizations to fuse ffma and eliminate 16-bit conversions. */
   si_nir_late_opts(nir);

   if (sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16))
      si_late_optimize_16bit_samplers(sscreen, nir);

   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);

   NIR_PASS_V(nir, nir_lower_fp16_casts, nir_lower_fp16_split_fp64);
}

static bool si_mark_divergent_texture_non_uniform(struct nir_shader *nir)
{
   assert(nir->info.divergence_analysis_run);

   /* sampler_non_uniform and texture_non_uniform are always false in GLSL,
    * but this can lead to unexpected behavior if the texture/sampler index comes
    * from a vertex attribute.
    *
    * For instance, 2 consecutive draws using 2 different index values
    * could be squashed together by the hw, producing a single draw with a
    * non-dynamically-uniform index.
    *
    * To avoid this, detect divergent indexing and mark it as non-uniform, so that
    * we can apply a waterfall loop to these indices later (either in the LLVM
    * backend or in nir_lower_non_uniform_access).
    *
    * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/2253
    */

   bool divergence_changed = false;

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_tex)
            continue;

         nir_tex_instr *tex = nir_instr_as_tex(instr);
         for (int i = 0; i < tex->num_srcs; i++) {
            bool divergent = tex->src[i].src.ssa->divergent;

            switch (tex->src[i].src_type) {
            case nir_tex_src_texture_deref:
            case nir_tex_src_texture_handle:
               tex->texture_non_uniform |= divergent;
               break;
            case nir_tex_src_sampler_deref:
            case nir_tex_src_sampler_handle:
               tex->sampler_non_uniform |= divergent;
               break;
            default:
               break;
            }
         }

         /* If dest is already divergent, divergence won't change. */
         divergence_changed |= !tex->def.divergent &&
                               (tex->texture_non_uniform || tex->sampler_non_uniform);
      }
   }

   nir_metadata_preserve(impl, nir_metadata_all);
   return divergence_changed;
}

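/* Implements pipe_screen::finalize_nir: lower IO, drop dead variables and
 * already-lowered uniforms, run si_lower_nir, and run divergence analysis.
 * Always returns NULL (no error string).
 */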
char *si_finalize_nir(struct pipe_screen *screen, void *nirptr)
{
   struct si_screen *sscreen = (struct si_screen *)screen;
   struct nir_shader *nir = (struct nir_shader *)nirptr;

   nir_lower_io_passes(nir, false);
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_in | nir_var_shader_out, NULL);

   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      NIR_PASS_V(nir, nir_lower_color_inputs);

   NIR_PASS_V(nir, ac_nir_lower_subdword_loads,
              (ac_nir_lower_subdword_options) {
                 .modes_1_comp = nir_var_mem_ubo,
                 .modes_N_comps = nir_var_mem_ubo | nir_var_mem_ssbo
              });
   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset);

   /* Remove dead derefs, so that we can remove uniforms. */
   NIR_PASS_V(nir, nir_opt_dce);

   /* Remove uniforms because those should have been lowered to UBOs already. */
   nir_foreach_variable_with_modes_safe(var, nir, nir_var_uniform) {
      if (!glsl_type_get_image_count(var->type) &&
          !glsl_type_get_texture_count(var->type) &&
          !glsl_type_get_sampler_count(var->type))
         exec_node_remove(&var->node);
   }

   si_lower_nir(sscreen, nir);
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Update xfb info after mediump IO lowering. */
   if (nir->xfb_info && nir->info.outputs_written_16bit)
      nir_gather_xfb_info_from_intrinsics(nir);

   if (sscreen->options.inline_uniforms)
      nir_find_inlinable_uniforms(nir);

   /* Lower large variables that are always constant with load_constant intrinsics, which
    * get turned into PC-relative loads from a data section next to the shader.
    *
    * Run this once before lcssa because the added phis may prevent this
    * pass from operating correctly.
    *
    * nir_opt_large_constants may use op_amul (see nir_build_deref_offset),
    * or may create unneeded code, so run si_nir_opts if needed.
    */
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   bool progress = false;
   NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
   if (progress)
      si_nir_opts(sscreen, nir, false);

   NIR_PASS_V(nir, nir_convert_to_lcssa, true, true); /* required by divergence analysis */
   NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */

   /* Must be after divergence analysis. */
   bool divergence_changed = false;
   NIR_PASS(divergence_changed, nir, si_mark_divergent_texture_non_uniform);
   /* Re-analyze the whole shader if texture instruction divergence changed. */
   if (divergence_changed)
      NIR_PASS_V(nir, nir_divergence_analysis);

   return NULL;
}