/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "ir3_compiler.h"
#include "ir3_nir.h"

/* Preamble optimization happens in two parts: first we generate the preamble
 * using the generic NIR pass, then we set up the preamble sequence and inline
 * the preamble into the main shader if there was a preamble. The first part
 * should happen before UBO lowering, because we want to prefer more complex
 * expressions over UBO loads, but the second part has to happen after UBO
 * lowering because it may add copy instructions to the preamble.
 */
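
/* Illustrative sketch (not taken from a real shader): given GLSL along the
 * lines of
 *
 *    vec3 c = cos(ubo.angles) * ubo.scale;
 *
 * where the whole expression is uniform across the draw, nir_opt_preamble can
 * hoist the cos and multiply into the preamble, and the main shader then just
 * reads the result back from the const file instead of recomputing it per
 * fiber.
 */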

static void
def_size(nir_def *def, unsigned *size, unsigned *align)
{
   unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
   /* Due to the implicit const file promotion we want to expand 16-bit values
    * to 32-bit so that the truncation in the main shader can hopefully be
    * folded into the use.
    */
   *size = DIV_ROUND_UP(bit_size, 32) * def->num_components;
   *align = 1;
}

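/* Check whether every use of def is a floating-point ALU source, so that an
 * instruction that would become a source modifier (neg/abs) or a conversion
 * can be folded into its uses. allow_src2=false additionally rejects the
 * third source, which (for cat3 ops like mad) cannot take an abs modifier.
 */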
static bool
all_uses_float(nir_def *def, bool allow_src2)
{
   nir_foreach_use_including_if (use, def) {
      if (nir_src_is_if(use))
         return false;

      nir_instr *use_instr = nir_src_parent_instr(use);
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);
      unsigned src_index = ~0;
      for (unsigned i = 0; i < nir_op_infos[use_alu->op].num_inputs; i++) {
         if (&use_alu->src[i].src == use) {
            src_index = i;
            break;
         }
      }

      assert(src_index != ~0);
      nir_alu_type src_type =
         nir_alu_type_get_base_type(nir_op_infos[use_alu->op].input_types[src_index]);

      if (src_type != nir_type_float || (src_index == 2 && !allow_src2))
         return false;
   }

   return true;
}

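/* Like all_uses_float(), but check that every use is a bitwise/shift op, so
 * that an inot could be folded into its uses as a (not) source modifier.
 */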
static bool
all_uses_bit(nir_def *def)
{
   nir_foreach_use_including_if (use, def) {
      if (nir_src_is_if(use))
         return false;

      nir_instr *use_instr = nir_src_parent_instr(use);
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);

      /* See ir3_cat2_absneg() */
      switch (use_alu->op) {
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_inot:
      case nir_op_ixor:
      case nir_op_bitfield_reverse:
      case nir_op_ufind_msb:
      case nir_op_ifind_msb:
      case nir_op_find_lsb:
      case nir_op_ishl:
      case nir_op_ushr:
      case nir_op_ishr:
      case nir_op_bit_count:
         continue;
      default:
         return false;
      }
   }

   return true;
}

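/* Estimated cost, in normalized cycles, of executing an instruction in the
 * main shader. Passed to nir_opt_preamble as instr_cost_cb so it can weigh
 * the benefit of hoisting an instruction into the preamble.
 */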
static float
instr_cost(nir_instr *instr, const void *data)
{
   /* We'll assume wave64 here for simplicity and assume normal cat1-cat3 ops
    * take 1 (normalized) cycle.
    *
    * See https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A6xx-SP
    *
    * TODO: assume wave128 on fragment/compute shaders?
    */

   switch (instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      unsigned components = alu->def.num_components;
      switch (alu->op) {
      /* cat4 */
      case nir_op_frcp:
      case nir_op_fsqrt:
      case nir_op_frsq:
      case nir_op_flog2:
      case nir_op_fexp2:
      case nir_op_fsin:
      case nir_op_fcos:
         return 4 * components;

      /* Instructions that become src modifiers. Note for conversions this is
       * really an approximation.
       *
       * This prevents silly things like lifting a negate that would become a
       * modifier.
       */
      case nir_op_f2f32:
      case nir_op_f2f16:
      case nir_op_f2fmp:
      case nir_op_fneg:
         return all_uses_float(&alu->def, true) ? 0 : 1 * components;

      case nir_op_fabs:
         return all_uses_float(&alu->def, false) ? 0 : 1 * components;

      case nir_op_inot:
         return all_uses_bit(&alu->def) ? 0 : 1 * components;

      /* Instructions that become vector split/collect */
      case nir_op_vec2:
      case nir_op_vec3:
      case nir_op_vec4:
      case nir_op_mov:
         return 0;

      /* cat1-cat3 */
      default:
         return 1 * components;
      }
      break;
   }

   case nir_instr_type_tex:
      /* cat5 */
      return 8;

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ubo: {
         /* If the UBO and offset are constant, then UBO lowering should do a
          * better job of lowering this, and opt_preamble shouldn't try to
          * duplicate it. However, if it has a non-constant offset then
          * hoisting it lets us avoid setting up a0.x etc. in the main shader
          * and potentially push fewer consts.
          */
         bool const_ubo = nir_src_is_const(intrin->src[0]);
         if (!const_ubo) {
            nir_intrinsic_instr *rsrc = ir3_bindless_resource(intrin->src[0]);
            if (rsrc)
               const_ubo = nir_src_is_const(rsrc->src[0]);
         }

         if (const_ubo && nir_src_is_const(intrin->src[1]))
            return 0;

         /* TODO: get actual numbers for ldc */
         return 8;
      }

      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_ssbo_ir3:
      case nir_intrinsic_get_ssbo_size:
      case nir_intrinsic_image_load:
      case nir_intrinsic_bindless_image_load:
         /* cat5/isam */
         return 8;

      /* By default assume it's a sysval or something */
      default:
         return 0;
      }
   }

   case nir_instr_type_phi:
      /* Although we can often coalesce phis, the cost of a phi is a proxy for
       * the cost of the if-else statement... if all phis are moved, then the
       * branches move too. So this needs to have a nonzero cost, even if we're
       * optimistic about coalescing.
       *
       * Value chosen empirically. On Rob's shader-db, a cost of 2 performs
       * better across the board than a cost of 1. Values greater than 2 don't
       * seem to make any difference, so stick with 2.
       */
      return 2;

   default:
      return 0;
   }
}

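/* Estimated cost of rewriting uses of def to read the value back from the
 * const file. Passed to nir_opt_preamble as rewrite_cost_cb; it is nonzero
 * when an extra mov (or boolean expansion) would be needed in the main
 * shader.
 */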
static float
rewrite_cost(nir_def *def, const void *data)
{
   /* We always have to expand booleans */
   if (def->bit_size == 1)
      return def->num_components;

   bool mov_needed = false;
   nir_foreach_use (use, def) {
      nir_instr *parent_instr = nir_src_parent_instr(use);
      if (parent_instr->type != nir_instr_type_alu) {
         mov_needed = true;
         break;
      } else {
         nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
         if (alu->op == nir_op_vec2 ||
             alu->op == nir_op_vec3 ||
             alu->op == nir_op_vec4 ||
             alu->op == nir_op_mov) {
            mov_needed = true;
            break;
         } else {
            /* Assume for non-moves that the const is folded into the src */
         }
      }
   }

   return mov_needed ? def->num_components : 0;
}

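/* Don't let opt_preamble hoist bindless resource handles by themselves (see
 * avoid_instr_cb); presumably they are only worth moving together with the
 * access that consumes them.
 */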
static bool
avoid_instr(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   return intrin->intrinsic == nir_intrinsic_bindless_resource_ir3;
}

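/* Mark bounds-checked descriptor loads as ACCESS_CAN_SPECULATE so that
 * nir_opt_preamble is allowed to hoist them out of control flow.
 */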
static bool
set_speculate(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_)
{
   switch (intr->intrinsic) {
   /* These instructions go through bounds-checked hardware descriptors so
    * should be safe to speculate.
    *
    * TODO: This isn't necessarily true in Vulkan, where descriptors don't need
    * to be filled out and bindless descriptor offsets aren't bounds checked.
    * We may need to plumb this information through from turnip for correctness
    * to avoid regressing freedreno codegen.
    */
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_samples_identical:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ssbo_ir3:
      nir_intrinsic_set_access(intr, nir_intrinsic_access(intr) |
                                        ACCESS_CAN_SPECULATE);
      return true;

   default:
      return false;
   }
}

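/* First half of the optimization: run the generic nir_opt_preamble pass with
 * the ir3-specific callbacks above, giving it however much const space is
 * expected to be left over after the other const uploads.
 */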
bool
ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   struct ir3_const_state *const_state = ir3_const_state(v);

   unsigned max_size;
   if (v->binning_pass) {
      max_size = const_state->preamble_size * 4;
   } else {
      struct ir3_const_state worst_case_const_state = {};
      ir3_setup_const_state(nir, v, &worst_case_const_state);
      max_size = (ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 4;
   }
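
   /* Note: max_size and the size returned by nir_opt_preamble below are in
    * units of 32-bit components, matching def_size(); preamble_size is stored
    * in vec4s, hence the *4 above and the DIV_ROUND_UP(size, 4) below.
    */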

   if (max_size == 0)
      return false;

   bool progress = nir_shader_intrinsics_pass(nir, set_speculate,
                                              nir_metadata_block_index |
                                              nir_metadata_dominance, NULL);

   nir_opt_preamble_options options = {
      .drawid_uniform = true,
      .subgroup_size_uniform = true,
      .load_workgroup_size_allowed = true,
      .def_size = def_size,
      .preamble_storage_size = max_size,
      .instr_cost_cb = instr_cost,
      .avoid_instr_cb = avoid_instr,
      .rewrite_cost_cb = rewrite_cost,
   };

   unsigned size = 0;
   progress |= nir_opt_preamble(nir, &options, &size);

   if (!v->binning_pass)
      const_state->preamble_size = DIV_ROUND_UP(size, 4);

   return progress;
}

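/* Second half of the optimization: rewrite load/store_preamble to real const
 * file accesses at the variant's final offsets, then wrap the preamble in the
 * start/elect sequence below and inline it into the main shader.
 */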
bool
ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   nir_function_impl *main = nir_shader_get_entrypoint(nir);

   if (!main->preamble)
      return false;

   nir_function_impl *preamble = main->preamble->impl;

   /* First, lower load/store_preamble. */
   const struct ir3_const_state *const_state = ir3_const_state(v);
   unsigned preamble_base = v->shader_options.num_reserved_user_consts * 4 +
      const_state->ubo_state.size / 4 + const_state->global_size * 4;
   unsigned preamble_size = const_state->preamble_size * 4;

   BITSET_DECLARE(promoted_to_float, preamble_size);
   memset(promoted_to_float, 0, sizeof(promoted_to_float));

   nir_builder builder_main = nir_builder_create(main);
   nir_builder *b = &builder_main;

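   /* Replace each load_preamble in the main shader with a load from the const
    * file at its final offset. Values were stored expanded to 32 bits, so
    * narrow 16-bit and boolean destinations back down here.
    */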
   nir_foreach_block (block, main) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_load_preamble)
            continue;

         nir_def *dest = &intrin->def;

         unsigned offset = preamble_base + nir_intrinsic_base(intrin);
         b->cursor = nir_before_instr(instr);

         nir_def *new_dest =
            nir_load_uniform(b, dest->num_components, 32, nir_imm_int(b, 0),
                             .base = offset);

         if (dest->bit_size == 1) {
            new_dest = nir_i2b(b, new_dest);
         } else if (dest->bit_size != 32) {
            assert(dest->bit_size == 16);
            if (all_uses_float(dest, true)) {
               new_dest = nir_f2f16(b, new_dest);
               BITSET_SET(promoted_to_float, nir_intrinsic_base(intrin));
            } else {
               new_dest = nir_u2u16(b, new_dest);
            }
         }

         nir_def_rewrite_uses(dest, new_dest);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

   nir_builder builder_preamble = nir_builder_create(preamble);
   b = &builder_preamble;

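   /* In the preamble itself, lower each store_preamble to a const file store.
    * 16-bit values are widened to 32 bits, as floats when every use in the
    * main shader is a float source (so the f2f16 above can be folded into the
    * use), otherwise with a zero-extend.
    */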
   nir_foreach_block (block, preamble) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_preamble)
            continue;

         nir_def *src = intrin->src[0].ssa;
         unsigned offset = preamble_base + nir_intrinsic_base(intrin);

         b->cursor = nir_before_instr(instr);

         if (src->bit_size == 1)
            src = nir_b2i32(b, src);
         if (src->bit_size != 32) {
            assert(src->bit_size == 16);
            if (BITSET_TEST(promoted_to_float, nir_intrinsic_base(intrin))) {
               src = nir_f2f32(b, src);
            } else {
               src = nir_u2u32(b, src);
            }
         }

         nir_store_uniform_ir3(b, src, .base = offset);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

   /* Now, create the preamble sequence and move the preamble into the main
    * shader:
    *
    *    if (preamble_start_ir3()) {
    *       if (subgroupElect()) {
    *          preamble();
    *          preamble_end_ir3();
    *       }
    *    }
    *    ...
    */

   /* @decl_regs need to stay in the first block. */
   b->cursor = nir_after_reg_decls(main);

   nir_if *outer_if = nir_push_if(b, nir_preamble_start_ir3(b, 1));
   {
      nir_if *inner_if = nir_push_if(b, nir_elect(b, 1));
      {
         nir_call_instr *call = nir_call_instr_create(nir, main->preamble);
         nir_builder_instr_insert(b, &call->instr);
         nir_preamble_end_ir3(b);
      }
      nir_pop_if(b, inner_if);
   }
   nir_pop_if(b, outer_if);

   nir_inline_functions(nir);
   exec_node_remove(&main->preamble->node);
   main->preamble = NULL;

   nir_metadata_preserve(main, nir_metadata_none);
   return true;
}