/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "nir_builder.h"
#include "nir_xfb_info.h"
#include "si_pipe.h"
#include "ac_nir.h"
#include "aco_interface.h"

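/* Filter for nir_lower_alu_to_scalar: keep 16-bit vec2 ALU instructions vectorized
 * so they can use packed (2x16-bit) math. With ACO, only ops that ACO can emit as
 * packed 16-bit instructions are kept vectorized.
 */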
bool si_alu_to_scalar_packed_math_filter(const nir_instr *instr, const void *data)
{
   if (instr->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      bool use_aco = (bool)data;

      if (alu->def.bit_size == 16 && alu->def.num_components == 2 &&
          (!use_aco || aco_nir_op_supports_packed_math_16bit(alu)))
         return false;
   }

   return true;
}

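/* Callback for nir_opt_vectorize: return the target vector width for an ALU
 * instruction — 2 for 16-bit ops that can use packed math, 1 (scalar) otherwise.
 */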
static uint8_t si_vectorize_callback(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   if (alu->def.bit_size != 16)
      return 1;

   bool use_aco = (bool)data;

   if (use_aco) {
      return aco_nir_op_supports_packed_math_16bit(alu) ? 2 : 1;
   } else {
      switch (alu->op) {
      case nir_op_unpack_32_2x16_split_x:
      case nir_op_unpack_32_2x16_split_y:
      case nir_op_extract_i8:
      case nir_op_extract_u8:
      case nir_op_extract_i16:
      case nir_op_extract_u16:
      case nir_op_insert_u8:
      case nir_op_insert_u16:
         return 1;
      default:
         return 2;
      }
   }
}

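/* Main NIR optimization loop: repeat the pass list until no pass reports progress.
 * "first" enables passes that only need to run on the initial invocation
 * (array splitting/shrinking, array copy detection).
 */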
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
{
   bool use_aco = sscreen->use_aco || nir->info.use_aco_amd;
   bool progress;

   do {
      progress = false;
      bool lower_alu_to_scalar = false;
      bool lower_phis_to_scalar = false;

      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
      NIR_PASS(progress, nir, nir_lower_alu_to_scalar,
               nir->options->lower_to_scalar_filter, (void *)use_aco);
      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);

      if (first) {
         NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
         NIR_PASS(lower_alu_to_scalar, nir, nir_shrink_vec_array_vars, nir_var_function_temp);
         NIR_PASS(progress, nir, nir_opt_find_array_copies);
      }
      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
      NIR_PASS(progress, nir, nir_opt_dead_write_vars);

      NIR_PASS(lower_alu_to_scalar, nir, nir_opt_loop);
      /* (Constant) copy propagation is needed for txf with offsets. */
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      /* nir_opt_if_optimize_phi_true_false is disabled on LLVM14 (#6976) */
      NIR_PASS(lower_phis_to_scalar, nir, nir_opt_if,
               nir_opt_if_optimize_phi_true_false);
      NIR_PASS(progress, nir, nir_opt_dead_cf);

      if (lower_alu_to_scalar) {
         NIR_PASS_V(nir, nir_lower_alu_to_scalar,
                    nir->options->lower_to_scalar_filter, (void *)use_aco);
      }
      if (lower_phis_to_scalar)
         NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
      progress |= lower_alu_to_scalar | lower_phis_to_scalar;

      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);

      /* Needed for algebraic lowering */
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_generate_bfi);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      if (!nir->info.flrp_lowered) {
         unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) |
                               (nir->options->lower_flrp32 ? 32 : 0) |
                               (nir->options->lower_flrp64 ? 64 : 0);
         assert(lower_flrp);
         bool lower_flrp_progress = false;

         NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */);
         if (lower_flrp_progress) {
            NIR_PASS(progress, nir, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         nir->info.flrp_lowered = true;
      }

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_opt_conditional_discard);
      if (nir->options->max_unroll_iterations) {
         NIR_PASS(progress, nir, nir_opt_loop_unroll);
      }

      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_opt_move_discards_to_top);

      if (sscreen->info.has_packed_math_16bit)
         NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback, (void *)use_aco);
   } while (progress);

   NIR_PASS_V(nir, nir_lower_var_copies);
}

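/* Late optimizations: keep applying nir_opt_algebraic_late and cleaning up the
 * results (constant folding, copy propagation, DCE, CSE) until nothing changes.
 */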
void si_nir_late_opts(nir_shader *nir)
{
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = false;
      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
      NIR_PASS_V(nir, nir_opt_constant_folding);

      /* We should run this after constant folding for stages that support indirect
       * inputs/outputs.
       */
      if (nir->options->support_indirect_inputs & BITFIELD_BIT(nir->info.stage) ||
          nir->options->support_indirect_outputs & BITFIELD_BIT(nir->info.stage))
         NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);

      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_cse);
   }
}

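/* Try to shrink texture/image sources and destinations to 16 bits (a16/g16)
 * where the hardware supports it, then re-run the optimization loops if
 * anything changed.
 */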
static void si_late_optimize_16bit_samplers(struct si_screen *sscreen, nir_shader *nir)
{
   /* Optimize types of image_sample sources and destinations.
    *
    * The image_sample sources bit sizes are:
    *   nir_tex_src_coord: a16 ? 16 : 32
    *   nir_tex_src_comparator: 32
    *   nir_tex_src_offset: 32
    *   nir_tex_src_bias: a16 ? 16 : 32
    *   nir_tex_src_lod: a16 ? 16 : 32
    *   nir_tex_src_min_lod: a16 ? 16 : 32
    *   nir_tex_src_ms_index: a16 ? 16 : 32
    *   nir_tex_src_ddx: has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *   nir_tex_src_ddy: has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *
    * We only use a16/g16 if all of the affected sources are 16bit.
    */
   bool has_g16 = sscreen->info.gfx_level >= GFX10;
   struct nir_opt_tex_srcs_options opt_srcs_options[] = {
      {
         .sampler_dims =
            ~(BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) | BITFIELD_BIT(GLSL_SAMPLER_DIM_BUF)),
         .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) |
                      (1 << nir_tex_src_bias) | (1 << nir_tex_src_min_lod) |
                      (1 << nir_tex_src_ms_index) |
                      (has_g16 ? 0 : (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy)),
      },
      {
         .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
         .src_types = (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy),
      },
   };
   struct nir_opt_16bit_tex_image_options opt_16bit_options = {
      .rounding_mode = nir_rounding_mode_undef,
      .opt_tex_dest_types = nir_type_float | nir_type_int | nir_type_uint,
      .opt_image_dest_types = nir_type_float | nir_type_int | nir_type_uint,
      .integer_dest_saturates = true,
      .opt_image_store_data = true,
      .opt_image_srcs = true,
      .opt_srcs_options_count = has_g16 ? 2 : 1,
      .opt_srcs_options = opt_srcs_options,
   };
   bool changed = false;
   NIR_PASS(changed, nir, nir_opt_16bit_tex_image, &opt_16bit_options);

   if (changed) {
      si_nir_opts(sscreen, nir, false);
      si_nir_late_opts(nir);
   }
}

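/* Lower sparse-residency helper intrinsics to plain ALU: a residency code of 0
 * means the texels are resident, and residency codes are combined with OR.
 */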
static bool
lower_intrinsic_filter(const nir_instr *instr, const void *dummy)
{
   return instr->type == nir_instr_type_intrinsic;
}

static nir_def *
lower_intrinsic_instr(nir_builder *b, nir_instr *instr, void *dummy)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_is_sparse_texels_resident:
      /* code==0 means sparse texels are resident */
      return nir_ieq_imm(b, intrin->src[0].ssa, 0);
   case nir_intrinsic_sparse_residency_code_and:
      return nir_ior(b, intrin->src[0].ssa, intrin->src[1].ssa);
   default:
      return NULL;
   }
}

static bool si_lower_intrinsics(nir_shader *nir)
{
   return nir_shader_lower_instructions(nir,
                                        lower_intrinsic_filter,
                                        lower_intrinsic_instr,
                                        NULL);
}

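/* Lower mediump IO (PNTC and the generic varyings) to 16 bits. VS inputs are
 * kept at 32 bits; see the TODO below.
 */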
void si_lower_mediump_io(nir_shader *nir)
{
   NIR_PASS_V(nir, nir_lower_mediump_io,
              /* TODO: LLVM fails to compile this test if VS inputs are 16-bit:
               * dEQP-GLES31.functional.shaders.builtin_functions.integer.bitfieldinsert.uvec3_lowp_geometry
               */
              (nir->info.stage != MESA_SHADER_VERTEX ? nir_var_shader_in : 0) | nir_var_shader_out,
              BITFIELD64_BIT(VARYING_SLOT_PNTC) | BITFIELD64_RANGE(VARYING_SLOT_VAR0, 32),
              true);
}

/**
 * Perform "lowering" operations on the NIR that are run once when the shader
 * selector is created.
 */
static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
{
   /* Perform lowerings (and optimizations) of code.
    *
    * Performance considerations aside, we must:
    * - lower certain ALU operations
    * - ensure constant offsets for texture instructions are folded
    *   and copy-propagated
    */
   const struct nir_lower_tex_options lower_tex_options = {
      .lower_txp = ~0u,
      .lower_txf_offset = true,
      .lower_txs_cube_array = true,
      .lower_invalid_implicit_lod = true,
      .lower_tg4_offsets = true,
      .lower_to_fragment_fetch_amd = sscreen->info.gfx_level < GFX11,
      .lower_1d = sscreen->info.gfx_level == GFX9,
   };
   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);

   const struct nir_lower_image_options lower_image_options = {
      .lower_cube_size = true,
      .lower_to_fragment_mask_load_amd = sscreen->info.gfx_level < GFX11 &&
                                         !(sscreen->debug_flags & DBG(NO_FMASK)),
   };
   NIR_PASS_V(nir, nir_lower_image, &lower_image_options);

   NIR_PASS_V(nir, si_lower_intrinsics);

   NIR_PASS_V(nir, ac_nir_lower_sin_cos);

   /* Lower load constants to scalar and then clean up the mess */
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
   NIR_PASS_V(nir, nir_lower_var_copies);
   NIR_PASS_V(nir, nir_opt_intrinsics);
   NIR_PASS_V(nir, nir_lower_system_values);

   /* si_nir_kill_outputs and ac_nir_optimize_outputs require outputs to be scalar. */
   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL ||
       nir->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);

   if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      unsigned flags = nir_lower_gs_intrinsics_per_stream;
      if (sscreen->use_ngg) {
         flags |= nir_lower_gs_intrinsics_count_primitives |
                  nir_lower_gs_intrinsics_count_vertices_per_primitive |
                  nir_lower_gs_intrinsics_overwrite_incomplete;
      }

      NIR_PASS_V(nir, nir_lower_gs_intrinsics, flags);
   }

   if (gl_shader_stage_is_compute(nir->info.stage)) {
      nir_lower_compute_system_values_options options = {0};

      /* gl_LocalInvocationIndex must be derived from gl_LocalInvocationID.xyz to make it correct
       * with quad derivatives. Using gl_SubgroupID for that (which is what we do by default) is
       * incorrect with a non-linear thread order.
       *
       * On Gfx12, we always use a non-linear thread order if the workgroup X and Y sizes are
       * divisible by 2.
       */
      options.lower_local_invocation_index =
         nir->info.derivative_group == DERIVATIVE_GROUP_QUADS ||
         (sscreen->info.gfx_level >= GFX12 &&
          nir->info.derivative_group == DERIVATIVE_GROUP_NONE &&
          (nir->info.workgroup_size_variable ||
           (nir->info.workgroup_size[0] % 2 == 0 && nir->info.workgroup_size[1] % 2 == 0)));
      NIR_PASS_V(nir, nir_lower_compute_system_values, &options);

      /* Gfx12 supports this in hw. */
      if (sscreen->info.gfx_level < GFX12 &&
          nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
         nir_opt_cse(nir); /* CSE load_local_invocation_id */
         memset(&options, 0, sizeof(options));
         options.shuffle_local_ids_for_quad_derivatives = true;
         NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
      }
   }

   si_nir_opts(sscreen, nir, true);
   /* Run late optimizations to fuse ffma and eliminate 16-bit conversions. */
   si_nir_late_opts(nir);

   if (sscreen->info.gfx_level >= GFX9)
      si_late_optimize_16bit_samplers(sscreen, nir);

   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);

   NIR_PASS_V(nir, nir_lower_fp16_casts, nir_lower_fp16_split_fp64);
}

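/* Mark texture/sampler indices that divergence analysis found to be divergent
 * as non-uniform, so a waterfall loop is applied to them later.
 */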
static bool si_mark_divergent_texture_non_uniform(struct nir_shader *nir)
{
   assert(nir->info.divergence_analysis_run);

   /* sampler_non_uniform and texture_non_uniform are always false in GLSL,
    * but this can lead to unexpected behavior if the texture/sampler index
    * comes from a vertex attribute.
    *
    * For instance, 2 consecutive draws using 2 different index values
    * could be squashed together by the hw, producing a single draw with
    * a non-dynamically-uniform index.
    *
    * To avoid this, detect divergent indexing and mark it as non-uniform,
    * so that a waterfall loop can be applied to these indices later (either
    * by the LLVM backend or by nir_lower_non_uniform_access).
    *
    * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/2253
    */

   bool divergence_changed = false;

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_tex)
            continue;

         nir_tex_instr *tex = nir_instr_as_tex(instr);
         for (int i = 0; i < tex->num_srcs; i++) {
            bool divergent = nir_src_is_divergent(&tex->src[i].src);

            switch (tex->src[i].src_type) {
            case nir_tex_src_texture_deref:
            case nir_tex_src_texture_handle:
               tex->texture_non_uniform |= divergent;
               break;
            case nir_tex_src_sampler_deref:
            case nir_tex_src_sampler_handle:
               tex->sampler_non_uniform |= divergent;
               break;
            default:
               break;
            }
         }

         /* If dest is already divergent, divergence won't change. */
         divergence_changed |= !tex->def.divergent &&
                               (tex->texture_non_uniform || tex->sampler_non_uniform);
      }
   }

   nir_metadata_preserve(impl, nir_metadata_all);
   return divergence_changed;
}

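/* pipe_screen::finalize_nir hook: lowers IO, strips dead variables and lowered
 * uniforms, runs the radeonsi lowering/optimization pipeline, and performs
 * divergence analysis on the final NIR.
 */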
char *si_finalize_nir(struct pipe_screen *screen, struct nir_shader *nir)
{
   struct si_screen *sscreen = (struct si_screen *)screen;

   nir_lower_io_passes(nir, false);
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_in | nir_var_shader_out, NULL);

   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      NIR_PASS_V(nir, nir_lower_color_inputs);

   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset);

   /* Remove dead derefs, so that we can remove uniforms. */
   NIR_PASS_V(nir, nir_opt_dce);

   /* Remove uniforms because those should have been lowered to UBOs already. */
   nir_foreach_variable_with_modes_safe(var, nir, nir_var_uniform) {
      if (!glsl_type_get_image_count(var->type) &&
          !glsl_type_get_texture_count(var->type) &&
          !glsl_type_get_sampler_count(var->type))
         exec_node_remove(&var->node);
   }

   si_lower_nir(sscreen, nir);
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Update xfb info after mediump IO lowering. */
   if (nir->xfb_info && nir->info.outputs_written_16bit)
      nir_gather_xfb_info_from_intrinsics(nir);

   if (sscreen->options.inline_uniforms)
      nir_find_inlinable_uniforms(nir);

   /* Lower large variables that are always constant to load_constant intrinsics,
    * which get turned into PC-relative loads from a data section next to the shader.
    *
    * Run this once before lcssa because the added phis may prevent this
    * pass from operating correctly.
    *
    * nir_opt_large_constants may use op_amul (see nir_build_deref_offset),
    * or may create unneeded code, so run si_nir_opts if needed.
    */
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   bool progress = false;
   NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
   if (progress)
      si_nir_opts(sscreen, nir, false);

   NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */

   /* Must be after divergence analysis. */
   bool divergence_changed = false;
   NIR_PASS(divergence_changed, nir, si_mark_divergent_texture_non_uniform);
   /* Re-analyze the whole shader if texture instruction divergence changed. */
   if (divergence_changed)
      NIR_PASS_V(nir, nir_divergence_analysis);

   return NULL;
}