/*
 * Copyright 2023 Pavel Ondračka <pavel.ondracka@gmail.com>
 * SPDX-License-Identifier: MIT
 */

#include "r300_nir.h"

#include "compiler/nir/nir_builder.h"
#include "r300_screen.h"

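/* Check whether the result of this ALU instruction is only ever consumed as
 * a float. Uses through movs, vecN and bcsels are followed recursively; a use
 * as an if-condition or as an integer/boolean-typed ALU source makes this
 * return false.
 */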
bool
r300_is_only_used_as_float(const nir_alu_instr *instr)
{
   nir_foreach_use (src, &instr->def) {
      if (nir_src_is_if(src))
         return false;

      nir_instr *user_instr = nir_src_parent_instr(src);
      if (user_instr->type == nir_instr_type_alu) {
         nir_alu_instr *alu = nir_instr_as_alu(user_instr);
         switch (alu->op) {
         case nir_op_mov:
         case nir_op_vec2:
         case nir_op_vec3:
         case nir_op_vec4:
         case nir_op_bcsel:
         case nir_op_b32csel:
            if (!r300_is_only_used_as_float(alu))
               return false;
            break;
         default:
            break;
         }

         const nir_op_info *info = &nir_op_infos[alu->op];
         nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src);
         int src_idx = alu_src - &alu->src[0];
         if ((info->input_types[src_idx] & nir_type_int) ||
             (info->input_types[src_idx] & nir_type_bool))
            return false;
      }
   }
   return true;
}

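/* Callback for nir_opt_vectorize: returns the allowed vectorization width
 * for an instruction (0 to skip it). The data pointer carries the
 * have_too_many_ubos() result computed by the caller.
 */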
static unsigned char
r300_should_vectorize_instr(const nir_instr *instr, const void *data)
{
   bool *too_many_ubos = (bool *)data;

   if (instr->type != nir_instr_type_alu)
      return 0;

   /* Vectorization can make the constant layout worse and increase
    * the constant register usage. The worst scenario is vectorization
    * of lowered indirect register access, where we access the i-th element
    * and later access i-1 or i+1 (most notably glamor and gsk shaders).
    * In this case we have already added the constants 1..n, where n is the
    * array size, and we can reuse them unless the lowered ladder gets
    * vectorized later.
    *
    * Thus prevent vectorization of the specific patterns from lowered
    * indirect access.
    *
    * This is quite a heavy hammer: we could in theory estimate how many
    * slots the current UBOs and constants will need and only disable
    * vectorization when we are close to the limit. However, this would
    * likely need a global shader analysis each time r300_should_vectorize_instr
    * is called, which we want to avoid.
    *
    * So for now just don't vectorize anything that loads constants.
    */
   if (*too_many_ubos) {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
      for (unsigned i = 0; i < num_srcs; i++) {
         if (nir_src_is_const(alu->src[i].src)) {
            return 0;
         }
      }
   }

   return 4;
}

/* R300 and R400 have just 32 vec4 constant register slots in the FS.
 * Therefore, while it's possible we will be able to compact some of
 * the constants later, we need to be extra careful with adding
 * new constants anyway.
 */
static bool
have_too_many_ubos(nir_shader *s, bool is_r500)
{
   if (s->info.stage != MESA_SHADER_FRAGMENT)
      return false;

   if (is_r500)
      return false;

   nir_foreach_variable_with_modes (var, s, nir_var_mem_ubo) {
      int ubo = var->data.driver_location;
      assert(ubo == 0);

      unsigned size = glsl_get_explicit_size(var->interface_type, false);
      if (DIV_ROUND_UP(size, 16) > 32)
         return true;
   }

   return false;
}

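/* Mark UBO vec4 loads as speculatable so that later passes (in particular
 * nir_opt_peephole_select) are allowed to hoist them out of conditional
 * control flow.
 */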
static bool
set_speculate(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_)
{
   if (intr->intrinsic == nir_intrinsic_load_ubo_vec4) {
      nir_intrinsic_set_access(intr, nir_intrinsic_access(intr) | ACCESS_CAN_SPECULATE);
      return true;
   }
   return false;
}

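/* Remove the gl_ClipVertex output: drops every instruction that uses the
 * variable's deref (typically the stores to it) together with the deref
 * itself. The caller is responsible for compacting the remaining output
 * driver_locations and removing the now-dead variable.
 */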
static bool
remove_clip_vertex(nir_builder *b, nir_instr *instr, UNUSED void *_)
{
   if (instr->type != nir_instr_type_deref)
      return false;
   nir_deref_instr *deref = nir_instr_as_deref(instr);
   if (deref->deref_type == nir_deref_type_var &&
       deref->var->data.mode == nir_var_shader_out &&
       deref->var->data.location == VARYING_SLOT_CLIP_VERTEX) {
      nir_foreach_use_safe(src, &deref->def) {
         nir_instr_remove(nir_src_parent_instr(src));
      }
      nir_instr_remove(instr);
      return true;
   }
   return false;
}

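/* Main NIR optimization loop for r300: runs the common NIR cleanup passes
 * together with the r300-specific lowerings until no pass reports progress,
 * then lowers the remaining variable copies.
 */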
static void
r300_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
{
   bool is_r500 = r300_screen(screen)->caps.is_r500;

   bool progress;
   if (s->info.stage == MESA_SHADER_VERTEX && r300_screen(screen)->caps.has_tcl) {
      /* There is no HW support for gl_ClipVertex, so we just remove it early. */
      if (nir_shader_instructions_pass(s, remove_clip_vertex,
                                       nir_metadata_control_flow, NULL)) {
         unsigned clip_vertex_location = 0;
         nir_foreach_variable_with_modes(var, s, nir_var_shader_out) {
            if (var->data.location == VARYING_SLOT_CLIP_VERTEX) {
               clip_vertex_location = var->data.driver_location;
            }
         }
         nir_foreach_variable_with_modes(var, s, nir_var_shader_out) {
            if (var->data.driver_location > clip_vertex_location) {
               var->data.driver_location--;
            }
         }
         NIR_PASS_V(s, nir_remove_dead_variables, nir_var_shader_out, NULL);
         fprintf(stderr, "r300: no HW support for clip vertex, expect misrendering.\n");
         fprintf(stderr, "r300: software emulation can be enabled with RADEON_DEBUG=notcl.\n");
      }
   }

   do {
      progress = false;

      NIR_PASS_V(s, nir_lower_vars_to_ssa);

      NIR_PASS(progress, s, nir_copy_prop);
      NIR_PASS(progress, s, r300_nir_lower_flrp);
      NIR_PASS(progress, s, nir_opt_algebraic);
      if (s->info.stage == MESA_SHADER_VERTEX) {
         if (!is_r500)
            NIR_PASS(progress, s, r300_nir_lower_bool_to_float);
         NIR_PASS(progress, s, r300_nir_fuse_fround_d3d9);
      }
      NIR_PASS(progress, s, nir_opt_constant_folding);
      NIR_PASS(progress, s, nir_opt_remove_phis);
      NIR_PASS(progress, s, nir_opt_conditional_discard);
      NIR_PASS(progress, s, nir_opt_dce);
      NIR_PASS(progress, s, nir_opt_dead_cf);
      NIR_PASS(progress, s, nir_opt_cse);
      NIR_PASS(progress, s, nir_opt_find_array_copies);
      NIR_PASS(progress, s, nir_opt_copy_prop_vars);
      NIR_PASS(progress, s, nir_opt_dead_write_vars);

      NIR_PASS(progress, s, nir_opt_if, nir_opt_if_optimize_phi_true_false);
      if (is_r500)
         nir_shader_intrinsics_pass(s, set_speculate, nir_metadata_control_flow, NULL);
      NIR_PASS(progress, s, nir_opt_peephole_select, is_r500 ? 8 : ~0, true, true);
      if (s->info.stage == MESA_SHADER_FRAGMENT) {
         NIR_PASS(progress, s, r300_nir_lower_bool_to_float_fs);
      }
      NIR_PASS(progress, s, nir_opt_algebraic);
      NIR_PASS(progress, s, nir_opt_constant_folding);
      NIR_PASS(progress, s, nir_opt_shrink_stores, true);
      NIR_PASS(progress, s, nir_opt_shrink_vectors, false);
      NIR_PASS(progress, s, nir_opt_loop);

      bool too_many_ubos = have_too_many_ubos(s, is_r500);
      NIR_PASS(progress, s, nir_opt_vectorize, r300_should_vectorize_instr, &too_many_ubos);
      NIR_PASS(progress, s, nir_opt_undef);
      if (!progress)
         NIR_PASS(progress, s, nir_lower_undef_to_zero);
      NIR_PASS(progress, s, nir_opt_loop_unroll);

      /* Try to fold addressing math into ubo_vec4's base to avoid load_consts
       * and ALU ops for it.
       */
      nir_opt_offsets_options offset_options = {
         .ubo_vec4_max = 255,

         /* No const offset in TGSI for shared accesses. */
         .shared_max = 0,

         /* unused intrinsics */
         .uniform_max = 0,
         .buffer_max = 0,
      };

      NIR_PASS(progress, s, nir_opt_offsets, &offset_options);
   } while (progress);

   NIR_PASS_V(s, nir_lower_var_copies);
   NIR_PASS(progress, s, nir_remove_dead_variables, nir_var_function_temp, NULL);
}

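/* R300/R400 shaders cannot execute control flow, so after the optimization
 * loop the entrypoint must consist of a single block. Returns a human
 * readable error message if any control flow remains, NULL otherwise.
 */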
static char *
r300_check_control_flow(nir_shader *s)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(s);
   nir_block *first = nir_start_block(impl);
   nir_cf_node *next = nir_cf_node_next(&first->cf_node);

   if (next) {
      switch (next->type) {
      case nir_cf_node_if:
         return "If/then statements not supported by R300/R400 shaders, should have been "
                "flattened by peephole_select.";
      case nir_cf_node_loop:
         return "Looping not supported by R300/R400 shaders, all loops must be statically "
                "unrollable.";
      default:
         return "Unknown control flow type";
      }
   }

   return NULL;
}

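/* Final NIR preparation for the r300 backend: runs the optimization loop,
 * trims dead uniform storage for the state tracker and reports an error
 * string for shaders that still contain control flow the HW cannot execute.
 */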
char *
r300_finalize_nir(struct pipe_screen *pscreen, struct nir_shader *s)
{
   r300_optimize_nir(s, pscreen);

   /* st_program.c's parameter list optimization requires that future nir
    * variants don't reallocate the uniform storage, so we have to remove
    * uniforms that occupy storage. But we don't want to remove samplers,
    * because they're needed for YUV variant lowering.
    */
   nir_remove_dead_derefs(s);
   nir_foreach_uniform_variable_safe (var, s) {
      if (var->data.mode == nir_var_uniform &&
          (glsl_type_get_image_count(var->type) || glsl_type_get_sampler_count(var->type)))
         continue;

      exec_node_remove(&var->node);
   }
   nir_validate_shader(s, "after uniform var removal");

   nir_sweep(s);

   if (!r300_screen(pscreen)->caps.is_r500 &&
       (r300_screen(pscreen)->caps.has_tcl || s->info.stage == MESA_SHADER_FRAGMENT)) {
      char *msg = r300_check_control_flow(s);
      if (msg)
         return strdup(msg);
   }

   return NULL;
}