/*
 * Copyright © 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "ir3_compiler.h"
#include "ir3_nir.h"
#include "nir_instr_set.h"

/* Preamble optimization happens in two parts: first we generate the preamble
 * using the generic NIR pass, then we set up the preamble sequence and inline
 * the preamble into the main shader if there was a preamble. The first part
 * should happen before UBO lowering, because we want to prefer more complex
 * expressions over UBO loads, but the second part has to happen after UBO
 * lowering because it may add copy instructions to the preamble.
 */
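/* A rough sketch of the intended pass ordering (the callsites shown are
 * illustrative, not the exact driver code):
 *
 *    ir3_nir_opt_preamble(nir, v);     // part 1: generate the preamble
 *    ... UBO lowering ...              // may add copies to the preamble
 *    ir3_nir_lower_preamble(nir, v);   // part 2: emit the preamble sequence
 *                                      //         and inline it into main
 */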

static void
def_size(nir_def *def, unsigned *size, unsigned *align)
{
   unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
   /* Due to the implicit const file promotion we want to expand 16-bit values
    * to 32-bit so that the truncation in the main shader can hopefully be
    * folded into the use.
    */
   *size = DIV_ROUND_UP(bit_size, 32) * def->num_components;
   *align = 1;
}
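/* Example def_size() results, in 32-bit units: a 32-bit vec2 and a 16-bit vec2
 * both count as 2 (16-bit values are expanded as described above), a 64-bit
 * vec2 counts as 4, and a 1-bit boolean counts as a full 32-bit slot.
 */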

static bool
all_uses_float(nir_def *def, bool allow_src2)
{
   nir_foreach_use_including_if (use, def) {
      if (nir_src_is_if(use))
         return false;

      nir_instr *use_instr = nir_src_parent_instr(use);
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);
      unsigned src_index = ~0;
      for (unsigned i = 0; i < nir_op_infos[use_alu->op].num_inputs; i++) {
         if (&use_alu->src[i].src == use) {
            src_index = i;
            break;
         }
      }

      assert(src_index != ~0);
      nir_alu_type src_type =
         nir_alu_type_get_base_type(nir_op_infos[use_alu->op].input_types[src_index]);

      if (src_type != nir_type_float || (src_index == 2 && !allow_src2))
         return false;
   }

   return true;
}
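/* Example: an fneg whose only use is a source of an fmul folds into a (neg)
 * source modifier and is effectively free, while an fneg feeding an integer op
 * such as iadd still needs a real instruction; instr_cost() below uses this to
 * decide whether hoisting the fneg actually saves anything.
 */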

static bool
all_uses_bit(nir_def *def)
{
   nir_foreach_use_including_if (use, def) {
      if (nir_src_is_if(use))
         return false;

      nir_instr *use_instr = nir_src_parent_instr(use);
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);

      /* See ir3_cat2_absneg() */
      switch (use_alu->op) {
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_inot:
      case nir_op_ixor:
      case nir_op_bitfield_reverse:
      case nir_op_ufind_msb:
      case nir_op_ifind_msb:
      case nir_op_find_lsb:
      case nir_op_ishl:
      case nir_op_ushr:
      case nir_op_ishr:
      case nir_op_bit_count:
         continue;
      default:
         return false;
      }
   }

   return true;
}
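/* Analogous to all_uses_float(): an inot whose uses are all in this list can
 * fold into a (not) source modifier on the cat2 op (see ir3_cat2_absneg()), so
 * instr_cost() treats it as free.
 */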

static float
instr_cost(nir_instr *instr, const void *data)
{
   /* We'll assume wave64 here for simplicity and assume normal cat1-cat3 ops
    * take 1 (normalized) cycle.
    *
    * See https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A6xx-SP
    *
    * TODO: assume wave128 on fragment/compute shaders?
    */
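   /* Rough summary of the weights below: plain cat1-cat3 ALU ops cost 1 per
    * component, cat4 transcendentals cost 4 per component, texture fetches and
    * SSBO/image/non-constant UBO loads cost 8, and phis cost 2. These are
    * relative weights rather than exact cycle counts.
    */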

   switch (instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      unsigned components = alu->def.num_components;
      switch (alu->op) {
      /* cat4 */
      case nir_op_frcp:
      case nir_op_fsqrt:
      case nir_op_frsq:
      case nir_op_flog2:
      case nir_op_fexp2:
      case nir_op_fsin:
      case nir_op_fcos:
         return 4 * components;

      /* Instructions that become src modifiers. Note for conversions this is
       * really an approximation.
       *
       * This prevents silly things like lifting a negate that would become a
       * modifier.
       */
      case nir_op_f2f32:
      case nir_op_f2f16:
      case nir_op_f2fmp:
      case nir_op_fneg:
         return all_uses_float(&alu->def, true) ? 0 : 1 * components;

      case nir_op_fabs:
         return all_uses_float(&alu->def, false) ? 0 : 1 * components;

      case nir_op_inot:
         return all_uses_bit(&alu->def) ? 0 : 1 * components;

      /* Instructions that become vector split/collect */
      case nir_op_vec2:
      case nir_op_vec3:
      case nir_op_vec4:
      case nir_op_mov:
         return 0;

      /* cat1-cat3 */
      default:
         return 1 * components;
      }
      break;
   }

   case nir_instr_type_tex:
      /* cat5 */
      return 8;

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ubo: {
         /* If the UBO and offset are constant, then UBO lowering should do a
          * better job trying to lower this, and opt_preamble shouldn't try to
          * duplicate it. However if it has a non-constant offset then we can
          * avoid setting up a0.x etc. in the main shader and potentially have
          * to push less.
          */
         bool const_ubo = nir_src_is_const(intrin->src[0]);
         if (!const_ubo) {
            nir_intrinsic_instr *rsrc = ir3_bindless_resource(intrin->src[0]);
            if (rsrc)
               const_ubo = nir_src_is_const(rsrc->src[0]);
         }

         if (const_ubo && nir_src_is_const(intrin->src[1]))
            return 0;

         /* TODO: get actual numbers for ldc */
         return 8;
      }

      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_ssbo_ir3:
      case nir_intrinsic_get_ssbo_size:
      case nir_intrinsic_image_load:
      case nir_intrinsic_bindless_image_load:
         /* cat5/isam */
         return 8;

      /* By default assume it's a sysval or something */
      default:
         return 0;
      }
   }

   case nir_instr_type_phi:
      /* Although we can often coalesce phis, the cost of a phi is a proxy for
       * the cost of the if-else statement... If all phis are moved, then the
       * branches move too. So this needs to have a nonzero cost, even if we're
       * optimistic about coalescing.
       *
       * Value chosen empirically. On Rob's shader-db, a cost of 2 performs
       * better across the board than a cost of 1. Values greater than 2 do not
       * seem to make any difference, so we stick with 2.
       */
      return 2;

   default:
      return 0;
   }
}

static float
rewrite_cost(nir_def *def, const void *data)
{
   /* We always have to expand booleans */
   if (def->bit_size == 1)
      return def->num_components;

   bool mov_needed = false;
   nir_foreach_use (use, def) {
      nir_instr *parent_instr = nir_src_parent_instr(use);
      if (parent_instr->type != nir_instr_type_alu) {
         mov_needed = true;
         break;
      } else {
         nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
         if (alu->op == nir_op_vec2 ||
             alu->op == nir_op_vec3 ||
             alu->op == nir_op_vec4 ||
             alu->op == nir_op_mov) {
            mov_needed = true;
            break;
         } else {
            /* Assume for non-moves that the const is folded into the src */
         }
      }
   }

   return mov_needed ? def->num_components : 0;
}
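/* For example, a hoisted value whose only uses are ordinary ALU sources has a
 * rewrite cost of 0, since the resulting const-file access can be folded
 * directly into those sources, while a value feeding a vecN/mov or a non-ALU
 * instruction needs an extra mov per component.
 */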

static bool
avoid_instr(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   return intrin->intrinsic == nir_intrinsic_bindless_resource_ir3;
}

static bool
set_speculate(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_)
{
   switch (intr->intrinsic) {
   /* These instructions go through bounds-checked hardware descriptors so
    * should be safe to speculate.
    *
    * TODO: This isn't necessarily true in Vulkan, where descriptors don't need
    * to be filled out and bindless descriptor offsets aren't bounds checked.
    * We may need to plumb this information through from turnip for correctness
    * to avoid regressing freedreno codegen.
    */
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_samples_identical:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ssbo_ir3:
      nir_intrinsic_set_access(intr, nir_intrinsic_access(intr) |
                                     ACCESS_CAN_SPECULATE);
      return true;

   default:
      return false;
   }
}

bool
ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   unsigned max_size;
   if (v->binning_pass) {
      const struct ir3_const_state *const_state = ir3_const_state(v);
      max_size =
         const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4;
   } else {
      const struct ir3_const_state *const_state = ir3_const_state(v);
      max_size = ir3_const_state_get_free_space(
                    v, const_state, v->compiler->const_upload_unit) * 4;
   }

   if (max_size == 0)
      return false;

   bool progress = nir_shader_intrinsics_pass(nir, set_speculate,
                                              nir_metadata_control_flow, NULL);

   nir_opt_preamble_options options = {
      .drawid_uniform = true,
      .subgroup_size_uniform = true,
      .load_workgroup_size_allowed = true,
      .def_size = def_size,
      .preamble_storage_size = max_size,
      .instr_cost_cb = instr_cost,
      .avoid_instr_cb = avoid_instr,
      .rewrite_cost_cb = rewrite_cost,
   };

   unsigned size = 0;
   progress |= nir_opt_preamble(nir, &options, &size);

   if (!v->binning_pass) {
      uint32_t preamble_size_vec4 =
         align(DIV_ROUND_UP(size, 4), v->compiler->const_upload_unit);
      ir3_const_alloc(&ir3_const_state_mut(v)->allocs, IR3_CONST_ALLOC_PREAMBLE,
                      preamble_size_vec4, v->compiler->const_upload_unit);
   }

   return progress;
}
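/* After nir_opt_preamble(), hoisted values are written in the preamble
 * function with store_preamble and read back in the main shader with
 * load_preamble, roughly:
 *
 *    preamble:  ssa_1 = ...; store_preamble ssa_1, base=N
 *    main:      ssa_2 = load_preamble base=N
 *
 * where the bases are dword offsets within the preamble const allocation.
 * They are turned into real const-file offsets in ir3_nir_lower_preamble()
 * below.
 */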

/* This isn't nearly as comprehensive as what's done in nir_opt_preamble, but in
 * various use-cases we need to hoist definitions into preambles outside of
 * opt_preamble. Currently we only handle a few uncomplicated intrinsics.
 */
bool
ir3_def_is_rematerializable_for_preamble(nir_def *def,
                                         nir_def **preamble_defs)
{
   switch (def->parent_instr->type) {
   case nir_instr_type_load_const:
      return true;
   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ubo:
         return ir3_def_is_rematerializable_for_preamble(intrin->src[0].ssa,
                                                         preamble_defs) &&
                ir3_def_is_rematerializable_for_preamble(intrin->src[1].ssa,
                                                         preamble_defs) &&
                (def->parent_instr->block->cf_node.parent->type ==
                 nir_cf_node_function ||
                 (nir_intrinsic_access(intrin) & ACCESS_CAN_SPECULATE));
      case nir_intrinsic_bindless_resource_ir3:
         return ir3_def_is_rematerializable_for_preamble(intrin->src[0].ssa,
                                                         preamble_defs);
      case nir_intrinsic_load_preamble:
         return !!preamble_defs;
      default:
         return false;
      }
   }
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(def->parent_instr);
      for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
         if (!ir3_def_is_rematerializable_for_preamble(alu->src[i].src.ssa,
                                                       preamble_defs))
            return false;
      }
      return true;
   }
   default:
      return false;
   }
}

struct find_insert_block_state {
   nir_block *insert_block;
};

static bool
find_dominated_src(nir_src *src, void *data)
{
   struct find_insert_block_state *state = data;
   nir_block *src_block = src->ssa->parent_instr->block;

   if (!state->insert_block) {
      state->insert_block = src_block;
      return true;
   } else if (nir_block_dominates(state->insert_block, src_block)) {
      state->insert_block = src_block;
      return true;
   } else if (nir_block_dominates(src_block, state->insert_block)) {
      return true;
   } else {
      state->insert_block = NULL;
      return false;
   }
}

/* Find the block where instr can be inserted. This is the block that is
 * dominated by all its sources. If instr doesn't have any sources, return dflt.
 */
static nir_block *
find_insert_block(nir_instr *instr, nir_block *dflt)
{
   struct find_insert_block_state state = {
      .insert_block = NULL,
   };

   if (nir_foreach_src(instr, find_dominated_src, &state)) {
      return state.insert_block ? state.insert_block : dflt;
   }

   return NULL;
}

static bool
dominates(const nir_instr *old_instr, const nir_instr *new_instr)
{
   return nir_block_dominates(old_instr->block, new_instr->block);
}

static nir_def *
_rematerialize_def(nir_builder *b, struct hash_table *remap_ht,
                   struct set *instr_set, nir_def **preamble_defs,
                   nir_def *def)
{
   if (_mesa_hash_table_search(remap_ht, def->parent_instr))
      return NULL;

   switch (def->parent_instr->type) {
   case nir_instr_type_load_const:
      break;
   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
      if (intrin->intrinsic == nir_intrinsic_load_preamble) {
         _mesa_hash_table_insert(remap_ht, def,
                                 preamble_defs[nir_intrinsic_base(intrin)]);
         return preamble_defs[nir_intrinsic_base(intrin)];
      } else {
         for (unsigned i = 0; i < nir_intrinsic_infos[intrin->intrinsic].num_srcs;
              i++)
            _rematerialize_def(b, remap_ht, instr_set, preamble_defs,
                               intrin->src[i].ssa);
      }
      break;
   }
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(def->parent_instr);
      for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
         _rematerialize_def(b, remap_ht, instr_set, preamble_defs,
                            alu->src[i].src.ssa);
      break;
   }
   default:
      unreachable("should not get here");
   }

   nir_instr *instr = nir_instr_clone_deep(b->shader, def->parent_instr,
                                           remap_ht);

   /* Find a legal place to insert the new instruction. We cannot simply put it
    * at the end of the preamble since the original instruction and its sources
    * may be defined inside control flow.
    */
   nir_metadata_require(b->impl, nir_metadata_dominance);
   nir_block *insert_block =
      find_insert_block(instr, nir_cursor_current_block(b->cursor));

   /* Since the preamble control flow was reconstructed from the original one,
    * we must be able to find a legal place to insert the instruction.
    */
   assert(insert_block);
   b->cursor = nir_after_block(insert_block);
   nir_builder_instr_insert(b, instr);

   if (instr_set) {
      nir_instr *other_instr =
         nir_instr_set_add_or_rewrite(instr_set, instr, dominates);
      if (other_instr) {
         instr = other_instr;
         _mesa_hash_table_insert(remap_ht, def, nir_instr_def(other_instr));
      }
   }

   return nir_instr_def(instr);
}

/* Hoist a given definition into the preamble. If "instr_set" is non-NULL,
 * de-duplicate the hoisted definitions, and if "preamble_defs" is non-NULL then
 * it is used to remap load_preamble instructions back to the original
 * definition in the preamble, if the definition uses load_preamble
 * instructions.
 */

nir_def *
ir3_rematerialize_def_for_preamble(nir_builder *b, nir_def *def,
                                   struct set *instr_set,
                                   nir_def **preamble_defs)
{
   struct hash_table *remap_ht = _mesa_pointer_hash_table_create(NULL);

   nir_def *new_def =
      _rematerialize_def(b, remap_ht, instr_set, preamble_defs, def);

   _mesa_hash_table_destroy(remap_ht, NULL);

   return new_def;
}
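/* For a usage example, see ir3_nir_opt_prefetch_descriptors() below: it passes
 * an instr_set so that repeatedly rematerialized descriptors are de-duplicated,
 * and preamble_defs so that descriptor calculations already hoisted by
 * opt_preamble are reused rather than cloned again.
 */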


static void
get_descriptors(nir_instr *instr, nir_def **descs)
{
   if (instr->type == nir_instr_type_tex) {
      nir_tex_instr *tex = nir_instr_as_tex(instr);
      /* TODO: handle non-bindless tex instructions. These are more complicated,
       * because of the implicit addition in the instruction.
       */
      int texture_index =
         nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
      int sampler_index =
         nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
      if (texture_index >= 0)
         descs[0] = tex->src[texture_index].src.ssa;
      if (sampler_index >= 0)
         descs[1] = tex->src[sampler_index].src.ssa;
   } else if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_ssbo_atomic:
      case nir_intrinsic_ssbo_atomic_swap:
      case nir_intrinsic_get_ssbo_size:
      case nir_intrinsic_image_load:
      case nir_intrinsic_bindless_image_load:
      case nir_intrinsic_image_store:
      case nir_intrinsic_bindless_image_store:
      case nir_intrinsic_image_atomic:
      case nir_intrinsic_bindless_image_atomic:
      case nir_intrinsic_image_size:
      case nir_intrinsic_bindless_image_size:
         descs[0] = intrin->src[0].ssa;
         break;
      case nir_intrinsic_store_ssbo:
         descs[0] = intrin->src[1].ssa;
         break;
      default:
         break;
      }
   }
}

#define MAX_PREFETCHES 32

struct prefetches {
   nir_def *prefetches[MAX_PREFETCHES];
   unsigned num_prefetches;
};

static bool
is_already_prefetched(struct prefetches *prefetches, nir_def *def)
{
   for (unsigned i = 0; i < prefetches->num_prefetches; i++) {
      if (prefetches->prefetches[i] == def)
         return true;
   }

   return false;
}

static void
add_prefetch(struct prefetches *prefetches, nir_def *def)
{
   assert(prefetches->num_prefetches < MAX_PREFETCHES);
   prefetches->prefetches[prefetches->num_prefetches++] = def;
}

struct prefetch_state {
   struct prefetches tex, sampler;
};

static bool
emit_descriptor_prefetch(nir_builder *b, nir_instr *instr, nir_def **descs,
                         struct prefetch_state *state)
{
   if (instr->type == nir_instr_type_tex) {
      nir_tex_instr *tex = nir_instr_as_tex(instr);
      int sampler_index =
         nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
      int texture_index =
         nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);

      /* For texture instructions, prefetch if at least one source hasn't been
       * prefetched already. For example, the same sampler may be used with
       * different textures, and we still want to prefetch the texture
       * descriptor if we've already prefetched the sampler descriptor.
       */

      bool tex_already_prefetched = is_already_prefetched(&state->tex, descs[0]);

      if (!tex_already_prefetched &&
          state->tex.num_prefetches == MAX_PREFETCHES)
         return false;

      assert(texture_index >= 0);
      if (sampler_index >= 0) {
         bool sampler_already_prefetched =
            is_already_prefetched(&state->sampler, descs[1]);

         if (!sampler_already_prefetched &&
             state->sampler.num_prefetches == MAX_PREFETCHES)
            return false;

         if (tex_already_prefetched && sampler_already_prefetched)
            return false;

         if (!tex_already_prefetched)
            add_prefetch(&state->tex, descs[0]);
         if (!sampler_already_prefetched)
            add_prefetch(&state->sampler, descs[1]);

         nir_prefetch_sam_ir3(b, descs[0], descs[1]);
      } else {
         if (tex_already_prefetched)
            return false;

         add_prefetch(&state->tex, descs[0]);
         nir_prefetch_tex_ir3(b, descs[0]);
      }
   } else {
      assert(instr->type == nir_instr_type_intrinsic);

      if (state->tex.num_prefetches == MAX_PREFETCHES)
         return false;

      if (is_already_prefetched(&state->tex, descs[0]))
         return false;

      add_prefetch(&state->tex, descs[0]);

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      if (intrin->intrinsic == nir_intrinsic_load_ubo)
         nir_prefetch_ubo_ir3(b, descs[0]);
      else
         nir_prefetch_tex_ir3(b, descs[0]);
   }

   return true;
}

static unsigned
get_preamble_offset(nir_def *def)
{
   return nir_intrinsic_base(nir_instr_as_intrinsic(def->parent_instr));
}

/* Prefetch descriptors in the preamble. This is an optimization introduced on
 * a7xx, mainly useful when the preamble is an early preamble, and replaces the
 * use of CP_LOAD_STATE on a6xx to prefetch descriptors in HLSQ.
 */
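/* Illustrative result (not exact NIR): for a bindless texture sample and a
 * bindless SSBO load in the main shader, the preamble ends up containing
 * something like
 *
 *    ssa_1 = bindless_resource_ir3 ...   // rematerialized tex descriptor
 *    ssa_2 = bindless_resource_ir3 ...   // rematerialized sampler descriptor
 *    prefetch_sam_ir3 ssa_1, ssa_2
 *    ssa_3 = bindless_resource_ir3 ...   // rematerialized SSBO descriptor
 *    prefetch_tex_ir3 ssa_3
 *
 * with prefetch_ubo_ir3 used instead for UBO descriptors.
 */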

bool
ir3_nir_opt_prefetch_descriptors(nir_shader *nir, struct ir3_shader_variant *v)
{
   const struct ir3_const_state *const_state = ir3_const_state(v);

   nir_function_impl *main = nir_shader_get_entrypoint(nir);
   struct set *instr_set = nir_instr_set_create(NULL);
   nir_function_impl *preamble = main->preamble ? main->preamble->impl : NULL;
   nir_builder b;
   bool progress = false;
   struct prefetch_state state = {};

   nir_def **preamble_defs =
      calloc(const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4,
             sizeof(nir_def *));

   /* Collect preamble defs. This is useful if the computation of the offset has
    * already been hoisted to the preamble.
    */
   if (preamble) {
      nir_foreach_block (block, preamble) {
         nir_foreach_instr (instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

            if (intrin->intrinsic != nir_intrinsic_store_preamble)
               continue;

            assert(
               nir_intrinsic_base(intrin) <
               const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4);
            preamble_defs[nir_intrinsic_base(intrin)] = intrin->src[0].ssa;
         }
      }
   }

   nir_foreach_block (block, main) {
      nir_foreach_instr (instr, block) {
         nir_def *descs[2] = { NULL, NULL };
         nir_def *preamble_descs[2] = { NULL, NULL };
         get_descriptors(instr, descs);

         /* We must have found at least one descriptor */
         if (!descs[0] && !descs[1])
            continue;

         /* The instruction itself must be hoistable.
          * TODO: If the descriptor is statically referenced and in-bounds, then
          * we should be able to hoist the descriptor load even if the
          * descriptor contents aren't guaranteed. This would require more
          * plumbing.
          * TODO: Textures. This is broken in nir_opt_preamble at the moment and
          * handling them would also require more plumbing.
          */
         if (instr->type == nir_instr_type_intrinsic &&
             nir_intrinsic_has_access(nir_instr_as_intrinsic(instr)) &&
             !(nir_intrinsic_access(nir_instr_as_intrinsic(instr)) &
               ACCESS_CAN_SPECULATE) &&
             block->cf_node.parent->type != nir_cf_node_function)
            continue;

         /* Each descriptor must be rematerializable */
         if (descs[0] &&
             !ir3_def_is_rematerializable_for_preamble(descs[0], preamble_defs))
            continue;
         if (descs[1] &&
             !ir3_def_is_rematerializable_for_preamble(descs[1], preamble_defs))
            continue;

         /* If the preamble hasn't been created then this descriptor isn't a
          * duplicate and we will definitely insert an instruction, so create
          * the preamble if it hasn't already been created.
          */
         if (!preamble) {
            preamble = nir_shader_get_preamble(nir);
         }

         b = nir_builder_at(nir_after_impl(preamble));

         /* Materialize descriptors for the prefetch. Note that we deduplicate
          * descriptors so that we don't blow our budget when repeatedly loading
          * from the same descriptor, even if the calculation of the descriptor
          * offset hasn't been CSE'd because the accesses are in different
          * blocks. This is common because we emit the bindless_resource_ir3
          * intrinsic right before the access.
          */
         for (unsigned i = 0; i < 2; i++) {
            if (!descs[i])
               continue;

            preamble_descs[i] =
               ir3_rematerialize_def_for_preamble(&b, descs[i], instr_set,
                                                  preamble_defs);
         }

         /* ir3_rematerialize_def_for_preamble may have moved the cursor. */
         b.cursor = nir_after_impl(preamble);
         progress |= emit_descriptor_prefetch(&b, instr, preamble_descs, &state);

         if (state.sampler.num_prefetches == MAX_PREFETCHES &&
             state.tex.num_prefetches == MAX_PREFETCHES)
            goto finished;
      }
   }

finished:
   nir_metadata_preserve(main, nir_metadata_all);
   if (preamble) {
      nir_metadata_preserve(preamble,
                            nir_metadata_block_index |
                            nir_metadata_dominance);
   }
   nir_instr_set_destroy(instr_set);
   free(preamble_defs);
   return progress;
}

bool
ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   nir_function_impl *main = nir_shader_get_entrypoint(nir);

   if (!main->preamble)
      return false;

   nir_function_impl *preamble = main->preamble->impl;

   /* First, lower load/store_preamble. */
   const struct ir3_const_state *const_state = ir3_const_state(v);
   unsigned preamble_base =
      const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].offset_vec4 * 4;
   unsigned preamble_size =
      const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4;
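   /* Note that preamble_base, preamble_size and the load/store_preamble bases
    * below are all in dwords, while the const allocation is tracked in vec4
    * units, hence the "* 4". For example, if the preamble allocation starts at
    * vec4 offset 2, a load_preamble with base=3 reads const dword
    * 2 * 4 + 3 = 11.
    */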

   BITSET_DECLARE(promoted_to_float, preamble_size);
   memset(promoted_to_float, 0, sizeof(promoted_to_float));

   nir_builder builder_main = nir_builder_create(main);
   nir_builder *b = &builder_main;

   nir_foreach_block (block, main) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_load_preamble)
            continue;

         nir_def *dest = &intrin->def;

         unsigned offset = preamble_base + nir_intrinsic_base(intrin);
         b->cursor = nir_before_instr(instr);

         nir_def *new_dest = nir_load_const_ir3(
            b, dest->num_components, 32, nir_imm_int(b, 0), .base = offset);

         if (dest->bit_size == 1) {
            new_dest = nir_i2b(b, new_dest);
         } else if (dest->bit_size != 32) {
            if (all_uses_float(dest, true)) {
               assert(dest->bit_size == 16);
               new_dest = nir_f2f16(b, new_dest);
               BITSET_SET(promoted_to_float, nir_intrinsic_base(intrin));
            } else {
               new_dest = nir_u2uN(b, new_dest, dest->bit_size);
            }
         }

         nir_def_rewrite_uses(dest, new_dest);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

   nir_builder builder_preamble = nir_builder_create(preamble);
   b = &builder_preamble;

   nir_foreach_block (block, preamble) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_preamble)
            continue;

         nir_def *src = intrin->src[0].ssa;
         unsigned offset = preamble_base + nir_intrinsic_base(intrin);

         b->cursor = nir_before_instr(instr);

         if (src->bit_size == 1)
            src = nir_b2i32(b, src);
         if (src->bit_size != 32) {
            if (BITSET_TEST(promoted_to_float, nir_intrinsic_base(intrin))) {
               assert(src->bit_size == 16);
               src = nir_f2f32(b, src);
            } else {
               src = nir_u2u32(b, src);
            }
         }

         nir_store_const_ir3(b, src, .base = offset);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

   /* Now, create the preamble sequence and move the preamble into the main
    * shader:
    *
    *    if (preamble_start_ir3()) {
    *       if (subgroupElect()) {
    *          preamble();
    *          preamble_end_ir3();
    *       }
    *    }
    *    ...
    */

   /* @decl_regs need to stay in the first block. */
   b->cursor = nir_after_reg_decls(main);

   nir_if *outer_if = nir_push_if(b, nir_preamble_start_ir3(b, 1));
   {
      nir_if *inner_if = nir_push_if(b, nir_elect_any_ir3(b, 1));
      {
         nir_call_instr *call = nir_call_instr_create(nir, main->preamble);
         nir_builder_instr_insert(b, &call->instr);
         nir_preamble_end_ir3(b);
      }
      nir_pop_if(b, inner_if);
   }
   nir_pop_if(b, outer_if);

   nir_inline_functions(nir);
   exec_node_remove(&main->preamble->node);
   main->preamble = NULL;

   nir_metadata_preserve(main, nir_metadata_none);
   return true;
}