/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "ir3_compiler.h"
#include "ir3_nir.h"

/* Preamble optimization happens in two parts: first we generate the preamble
 * using the generic NIR pass, then we set up the preamble sequence and inline
 * the preamble into the main shader if one was created. The first part should
 * happen before UBO lowering, because we want to prefer more complex
 * expressions over UBO loads, but the second part has to happen after UBO
 * lowering because it may add copy instructions to the preamble.
 */
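
/* A sketch of the intended call order (the exact sequence lives in
 * ir3_nir.c and may differ slightly):
 *
 *    ir3_nir_opt_preamble(nir, v);    // part 1: before UBO lowering
 *    ir3_nir_lower_ubo_loads(nir, v); // may insert copies into the preamble
 *    ir3_nir_lower_preamble(nir, v);  // part 2: after UBO lowering
 */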

static void
def_size(nir_ssa_def *def, unsigned *size, unsigned *align)
{
   unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
   /* Due to the implicit const file promotion we want to expand 16-bit values
    * to 32-bit so that the truncation in the main shader can hopefully be
    * folded into the use.
    */
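   /* For example, under this model a 16-bit vec2 costs the same two slots as
    * a 32-bit vec2: DIV_ROUND_UP(16, 32) * 2 == DIV_ROUND_UP(32, 32) * 2.
    */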
   *size = DIV_ROUND_UP(bit_size, 32) * def->num_components;
   *align = 1;
}

static bool
all_uses_float(nir_ssa_def *def, bool allow_src2)
{
   /* A use as an if condition is not a float ALU source, so bail. */
   nir_foreach_if_use (use, def) {
      return false;
   }

   nir_foreach_use (use, def) {
      nir_instr *use_instr = use->parent_instr;
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);
      unsigned src_index = ~0;
      for (unsigned i = 0; i < nir_op_infos[use_alu->op].num_inputs; i++) {
         if (&use_alu->src[i].src == use) {
            src_index = i;
            break;
         }
      }

      assert(src_index != ~0);
      nir_alu_type src_type =
         nir_alu_type_get_base_type(nir_op_infos[use_alu->op].input_types[src_index]);

      if (src_type != nir_type_float || (src_index == 2 && !allow_src2))
         return false;
   }

   return true;
}

static bool
all_uses_bit(nir_ssa_def *def)
{
   /* A use as an if condition can't fold into a bitwise src modifier. */
   nir_foreach_if_use (use, def) {
      return false;
   }

   nir_foreach_use (use, def) {
      nir_instr *use_instr = use->parent_instr;
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);

      /* See ir3_cat2_absneg() */
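      /* For example (assuming the cat2 "not" src modifier behaves as on
       * other bitwise ops), iand(a, inot(b)) can be emitted as
       * "and.b dst, a, ~b", making the inot itself free.
       */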
      switch (use_alu->op) {
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_inot:
      case nir_op_ixor:
      case nir_op_bitfield_reverse:
      case nir_op_ufind_msb:
      case nir_op_ifind_msb:
      case nir_op_find_lsb:
      case nir_op_ishl:
      case nir_op_ushr:
      case nir_op_ishr:
      case nir_op_bit_count:
         continue;
      default:
         return false;
      }
   }

   return true;
}

static float
instr_cost(nir_instr *instr, const void *data)
{
   /* We'll assume wave64 here for simplicity and assume normal cat1-cat3 ops
    * take 1 (normalized) cycle.
    *
    * See https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A6xx-SP
    *
    * TODO: assume wave128 on fragment/compute shaders?
    */
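   /* e.g. in this model a vec4 fadd costs 4, while a vec4 frcp (cat4,
    * modeled at 4x that rate below) costs 16.
    */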

   switch (instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      unsigned components = alu->dest.dest.ssa.num_components;
      switch (alu->op) {
      /* cat4 */
      case nir_op_frcp:
      case nir_op_fsqrt:
      case nir_op_frsq:
      case nir_op_flog2:
      case nir_op_fexp2:
      case nir_op_fsin:
      case nir_op_fcos:
         return 4 * components;

      /* Instructions that become src modifiers. Note for conversions this is
       * really an approximation.
       *
       * This prevents silly things like lifting a negate that would become a
       * modifier.
       */
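      /* For example, fadd(y, fneg(x)) can usually be emitted as
       * "add.f dst, y, -x", so the fneg is free whenever every use can
       * absorb a float src modifier (the all_uses_float() check).
       */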
      case nir_op_f2f32:
      case nir_op_f2f16:
      case nir_op_f2fmp:
      case nir_op_fneg:
         return all_uses_float(&alu->dest.dest.ssa, true) ? 0 : 1 * components;

      case nir_op_fabs:
         return all_uses_float(&alu->dest.dest.ssa, false) ? 0 : 1 * components;

      case nir_op_inot:
         return all_uses_bit(&alu->dest.dest.ssa) ? 0 : 1 * components;

      /* Instructions that become vector split/collect */
      case nir_op_vec2:
      case nir_op_vec3:
      case nir_op_vec4:
      case nir_op_mov:
         return 0;

      /* cat1-cat3 */
      default:
         return 1 * components;
      }
      break;
   }

   case nir_instr_type_tex:
      /* cat5 */
      return 8;

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ubo: {
         /* If the UBO and offset are constant, then UBO lowering should do a
          * better job trying to lower this, and opt_preamble shouldn't try to
          * duplicate it. However if it has a non-constant offset then we can
          * avoid setting up a0.x etc. in the main shader and potentially have
          * to push less.
          */
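         /* e.g. a load_ubo with both srcs constant is likely to end up as a
          * pushed const anyway, whereas one indexed by a loop counter would
          * otherwise need a0.x (relative addressing) setup in the main
          * shader.
          */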
         bool const_ubo = nir_src_is_const(intrin->src[0]);
         if (!const_ubo) {
            nir_intrinsic_instr *rsrc = ir3_bindless_resource(intrin->src[0]);
            if (rsrc)
               const_ubo = nir_src_is_const(rsrc->src[0]);
         }

         if (const_ubo && nir_src_is_const(intrin->src[1]))
            return 0;

         /* TODO: get actual numbers for ldc */
         return 8;
      }

      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_ssbo_ir3:
      case nir_intrinsic_get_ssbo_size:
      case nir_intrinsic_image_load:
      case nir_intrinsic_bindless_image_load:
         /* cat5/isam */
         return 8;

      /* By default assume it's a sysval or something */
      default:
         return 0;
      }
   }

   default:
      return 0;
   }
}

static float
rewrite_cost(nir_ssa_def *def, const void *data)
{
   /* We always have to expand booleans */
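   /* (1-bit booleans live in the const file as 32-bit 0/1 values, and each
    * load re-materializes the boolean with a conversion; see
    * ir3_nir_lower_preamble below.)
    */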
   if (def->bit_size == 1)
      return def->num_components;

   bool mov_needed = false;
   nir_foreach_use (use, def) {
      nir_instr *parent_instr = use->parent_instr;
      if (parent_instr->type != nir_instr_type_alu) {
         mov_needed = true;
         break;
      } else {
         nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
         if (alu->op == nir_op_vec2 ||
             alu->op == nir_op_vec3 ||
             alu->op == nir_op_vec4 ||
             alu->op == nir_op_mov) {
            mov_needed = true;
            break;
         } else {
            /* Assume for non-moves that the const is folded into the src */
         }
      }
   }

   return mov_needed ? def->num_components : 0;
}

static bool
avoid_instr(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   return intrin->intrinsic == nir_intrinsic_bindless_resource_ir3;
}

bool
ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   struct ir3_const_state *const_state = ir3_const_state(v);

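   /* Sizes here are in units of 32-bit components: const_state sizes are in
    * vec4s, hence the * 4 here and the DIV_ROUND_UP(size, 4) below.
    */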
   unsigned max_size;
   if (v->binning_pass) {
      max_size = const_state->preamble_size * 4;
   } else {
      struct ir3_const_state worst_case_const_state = {};
      ir3_setup_const_state(nir, v, &worst_case_const_state);
      max_size = (ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 4;
   }

   if (max_size == 0)
      return false;

   nir_opt_preamble_options options = {
      .drawid_uniform = true,
      .subgroup_size_uniform = true,
      .def_size = def_size,
      .preamble_storage_size = max_size,
      .instr_cost_cb = instr_cost,
      .avoid_instr_cb = avoid_instr,
      .rewrite_cost_cb = rewrite_cost,
   };

   unsigned size;
   bool progress = nir_opt_preamble(nir, &options, &size);

   if (!v->binning_pass)
      const_state->preamble_size = DIV_ROUND_UP(size, 4);

   return progress;
}

bool
ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   nir_function_impl *main = nir_shader_get_entrypoint(nir);

   if (!main->preamble)
      return false;

   nir_function_impl *preamble = main->preamble->impl;

   /* First, lower load/store_preamble. */
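   /* load/store_preamble carry a base in 32-bit units relative to the start
    * of the preamble's const space; we rewrite them to const-file accesses,
    * roughly:
    *
    *    ssa_1 = load_preamble (base=N)
    * becomes
    *    ssa_1 = load_uniform (0) (base=preamble_base + N)
    *
    * where preamble_base (also in 32-bit units) skips past the reserved user
    * consts and the pushed-UBO range.
    */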
   const struct ir3_const_state *const_state = ir3_const_state(v);
   unsigned preamble_base = v->num_reserved_user_consts * 4 +
      const_state->ubo_state.size / 4;
   unsigned preamble_size = const_state->preamble_size * 4;

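   /* Track which 16-bit slots get widened as floats (f2f32 on store, f2f16
    * on load) rather than as integers, so that the pair of conversions can
    * fold into float uses; see all_uses_float() above.
    */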
   BITSET_DECLARE(promoted_to_float, preamble_size);
   memset(promoted_to_float, 0, sizeof(promoted_to_float));

   nir_builder _b;
   nir_builder *b = &_b;
   nir_builder_init(b, main);

   nir_foreach_block (block, main) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_load_preamble)
            continue;

         nir_ssa_def *dest = &intrin->dest.ssa;

         unsigned offset = preamble_base + nir_intrinsic_base(intrin);
         b->cursor = nir_before_instr(instr);

         nir_ssa_def *new_dest =
            nir_load_uniform(b, dest->num_components, 32, nir_imm_int(b, 0),
                             .base = offset);

         if (dest->bit_size == 1) {
            new_dest = nir_i2b1(b, new_dest);
         } else if (dest->bit_size != 32) {
            assert(dest->bit_size == 16);
            if (all_uses_float(dest, true)) {
               new_dest = nir_f2f16(b, new_dest);
               BITSET_SET(promoted_to_float, nir_intrinsic_base(intrin));
            } else {
               new_dest = nir_u2u16(b, new_dest);
            }
         }

         nir_ssa_def_rewrite_uses(dest, new_dest);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

   nir_builder_init(b, preamble);

   nir_foreach_block (block, preamble) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_preamble)
            continue;

         nir_ssa_def *src = intrin->src[0].ssa;
         unsigned offset = preamble_base + nir_intrinsic_base(intrin);

         b->cursor = nir_before_instr(instr);

         if (src->bit_size == 1)
            src = nir_b2i32(b, src);
         if (src->bit_size != 32) {
            assert(src->bit_size == 16);
            if (BITSET_TEST(promoted_to_float, nir_intrinsic_base(intrin))) {
               src = nir_f2f32(b, src);
            } else {
               src = nir_u2u32(b, src);
            }
         }

         nir_store_uniform_ir3(b, src, .base = offset);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

   /* Now, create the preamble sequence and move the preamble into the main
    * shader:
    *
    *    if (preamble_start_ir3()) {
    *       if (subgroupElect()) {
    *          preamble();
    *          preamble_end_ir3();
    *       }
    *    }
    *    ...
    */

   b->cursor = nir_before_cf_list(&main->body);

   nir_if *outer_if = nir_push_if(b, nir_preamble_start_ir3(b, 1));
   {
      nir_if *inner_if = nir_push_if(b, nir_elect(b, 1));
      {
         nir_call_instr *call = nir_call_instr_create(nir, main->preamble);
         nir_builder_instr_insert(b, &call->instr);
         nir_preamble_end_ir3(b);
      }
      nir_pop_if(b, inner_if);
   }
   nir_pop_if(b, outer_if);

   nir_inline_functions(nir);
   exec_node_remove(&main->preamble->node);
   main->preamble = NULL;

   nir_metadata_preserve(main, nir_metadata_none);
   return true;
}