/*
 * Copyright © 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "ir3_compiler.h"
#include "ir3_nir.h"
#include "nir_instr_set.h"

/* Preamble optimization happens in two parts: first we generate the preamble
 * using the generic NIR pass, then we set up the preamble sequence and inline
 * the preamble into the main shader if there was a preamble. The first part
 * should happen before UBO lowering, because we want to prefer more complex
 * expressions over UBO loads, but the second part has to happen after UBO
 * lowering because it may add copy instructions to the preamble.
 */

static void
def_size(nir_def *def, unsigned *size, unsigned *align)
{
   unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
   /* Due to the implicit const file promotion we want to expand 16-bit values
    * to 32-bit so that the truncation in the main shader can hopefully be
    * folded into the use.
    */
   *size = DIV_ROUND_UP(bit_size, 32) * def->num_components;
   *align = 1;
}

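/* Returns true if every use of def is a float-typed ALU source (and, when
 * allow_src2 is false, never in source slot 2), in which case an
 * fneg/fabs/conversion feeding it can become a free source modifier.
 */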
static bool
all_uses_float(nir_def *def, bool allow_src2)
{
   nir_foreach_use_including_if (use, def) {
      if (nir_src_is_if(use))
         return false;

      nir_instr *use_instr = nir_src_parent_instr(use);
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);
      unsigned src_index = ~0;
      for (unsigned i = 0; i < nir_op_infos[use_alu->op].num_inputs; i++) {
         if (&use_alu->src[i].src == use) {
            src_index = i;
            break;
         }
      }

      assert(src_index != ~0);
      nir_alu_type src_type =
         nir_alu_type_get_base_type(nir_op_infos[use_alu->op].input_types[src_index]);

      if (src_type != nir_type_float || (src_index == 2 && !allow_src2))
         return false;
   }

   return true;
}

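/* Returns true if every use of def is a bitwise ALU op that can absorb a
 * "not" source modifier (see ir3_cat2_absneg()), so an inot feeding it is
 * free in the main shader.
 */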
static bool
all_uses_bit(nir_def *def)
{
   nir_foreach_use_including_if (use, def) {
      if (nir_src_is_if(use))
         return false;

      nir_instr *use_instr = nir_src_parent_instr(use);
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);

      /* See ir3_cat2_absneg() */
      switch (use_alu->op) {
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_inot:
      case nir_op_ixor:
      case nir_op_bitfield_reverse:
      case nir_op_ufind_msb:
      case nir_op_ifind_msb:
      case nir_op_find_lsb:
      case nir_op_ishl:
      case nir_op_ushr:
      case nir_op_ishr:
      case nir_op_bit_count:
         continue;
      default:
         return false;
      }
   }

   return true;
}

static float
instr_cost(nir_instr *instr, const void *data)
{
   /* We'll assume wave64 here for simplicity and assume normal cat1-cat3 ops
    * take 1 (normalized) cycle.
    *
    * See https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A6xx-SP
    *
    * TODO: assume wave128 on fragment/compute shaders?
    */

   switch (instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      unsigned components = alu->def.num_components;
      switch (alu->op) {
      /* cat4 */
      case nir_op_frcp:
      case nir_op_fsqrt:
      case nir_op_frsq:
      case nir_op_flog2:
      case nir_op_fexp2:
      case nir_op_fsin:
      case nir_op_fcos:
         return 4 * components;

      /* Instructions that become src modifiers. Note for conversions this is
       * really an approximation.
       *
       * This prevents silly things like lifting a negate that would become a
       * modifier.
       */
      case nir_op_f2f32:
      case nir_op_f2f16:
      case nir_op_f2fmp:
      case nir_op_fneg:
         return all_uses_float(&alu->def, true) ? 0 : 1 * components;

      case nir_op_fabs:
         return all_uses_float(&alu->def, false) ? 0 : 1 * components;

      case nir_op_inot:
         return all_uses_bit(&alu->def) ? 0 : 1 * components;

      /* Instructions that become vector split/collect */
      case nir_op_vec2:
      case nir_op_vec3:
      case nir_op_vec4:
      case nir_op_mov:
         return 0;

      /* cat1-cat3 */
      default:
         return 1 * components;
      }
      break;
   }

   case nir_instr_type_tex:
      /* cat5 */
      return 8;

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ubo: {
         /* If the UBO and offset are constant, then UBO lowering should do a
          * better job trying to lower this, and opt_preamble shouldn't try to
          * duplicate it. However, if it has a non-constant offset then hoisting
          * it lets us avoid setting up a0.x etc. in the main shader and
          * potentially push less.
          */
         bool const_ubo = nir_src_is_const(intrin->src[0]);
         if (!const_ubo) {
            nir_intrinsic_instr *rsrc = ir3_bindless_resource(intrin->src[0]);
            if (rsrc)
               const_ubo = nir_src_is_const(rsrc->src[0]);
         }

         if (const_ubo && nir_src_is_const(intrin->src[1]))
            return 0;

         /* TODO: get actual numbers for ldc */
         return 8;
      }

      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_ssbo_ir3:
      case nir_intrinsic_get_ssbo_size:
      case nir_intrinsic_image_load:
      case nir_intrinsic_bindless_image_load:
         /* cat5/isam */
         return 8;

      /* By default assume it's a sysval or something */
      default:
         return 0;
      }
   }

   case nir_instr_type_phi:
      /* Although we can often coalesce phis, the cost of a phi is a proxy for
       * the cost of the if-else statement... If all phis are moved, then the
       * branches move too. So this needs to have a nonzero cost, even if we're
       * optimistic about coalescing.
       *
       * Value chosen empirically. On Rob's shader-db, a cost of 2 performs
       * better across the board than a cost of 1. Values greater than 2 do not
       * seem to make any difference, so sticking with 2.
       */
      return 2;

   default:
      return 0;
   }
}

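/* Estimate the cost of rewriting the uses of def to read from the const file:
 * booleans always need to be re-expanded, and otherwise a mov is only needed
 * when some use cannot fold the const file access directly into its source.
 */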
static float
rewrite_cost(nir_def *def, const void *data)
{
   /* We always have to expand booleans */
   if (def->bit_size == 1)
      return def->num_components;

   bool mov_needed = false;
   nir_foreach_use (use, def) {
      nir_instr *parent_instr = nir_src_parent_instr(use);
      if (parent_instr->type != nir_instr_type_alu) {
         mov_needed = true;
         break;
      } else {
         nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
         if (alu->op == nir_op_vec2 ||
             alu->op == nir_op_vec3 ||
             alu->op == nir_op_vec4 ||
             alu->op == nir_op_mov) {
            mov_needed = true;
            break;
         } else {
            /* Assume for non-moves that the const is folded into the src */
         }
      }
   }

   return mov_needed ? def->num_components : 0;
}

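/* Tell nir_opt_preamble not to hoist bindless_resource_ir3 intrinsics by
 * themselves, since they are expected to be emitted right before the access
 * that uses them.
 */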
static bool
avoid_instr(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   return intrin->intrinsic == nir_intrinsic_bindless_resource_ir3;
}

static bool
set_speculate(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_)
{
   switch (intr->intrinsic) {
   /* These instructions go through bounds-checked hardware descriptors so
    * should be safe to speculate.
    *
    * TODO: This isn't necessarily true in Vulkan, where descriptors don't need
    * to be filled out and bindless descriptor offsets aren't bounds checked.
    * We may need to plumb this information through from turnip for correctness
    * to avoid regressing freedreno codegen.
    */
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_samples_identical:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ssbo_ir3:
      nir_intrinsic_set_access(intr, nir_intrinsic_access(intr) |
                                     ACCESS_CAN_SPECULATE);
      return true;

   default:
      return false;
   }
}

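/* Run nir_opt_preamble with ir3-specific size/cost callbacks and reserve
 * const file space for the preamble results. The binning pass reuses the
 * allocation already made for the main variant instead of allocating again.
 */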
bool
ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   unsigned max_size;
   if (v->binning_pass) {
      const struct ir3_const_state *const_state = ir3_const_state(v);
      max_size =
         const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4;
   } else {
      const struct ir3_const_state *const_state = ir3_const_state(v);
      max_size = ir3_const_state_get_free_space(
                    v, const_state, v->compiler->const_upload_unit) * 4;
   }

   if (max_size == 0)
      return false;

   bool progress = nir_shader_intrinsics_pass(nir, set_speculate,
                                              nir_metadata_control_flow, NULL);

   nir_opt_preamble_options options = {
      .drawid_uniform = true,
      .subgroup_size_uniform = true,
      .load_workgroup_size_allowed = true,
      .def_size = def_size,
      .preamble_storage_size = max_size,
      .instr_cost_cb = instr_cost,
      .avoid_instr_cb = avoid_instr,
      .rewrite_cost_cb = rewrite_cost,
   };

   unsigned size = 0;
   progress |= nir_opt_preamble(nir, &options, &size);

   if (!v->binning_pass) {
      uint32_t preamble_size_vec4 =
         align(DIV_ROUND_UP(size, 4), v->compiler->const_upload_unit);
      ir3_const_alloc(&ir3_const_state_mut(v)->allocs, IR3_CONST_ALLOC_PREAMBLE,
                      preamble_size_vec4, v->compiler->const_upload_unit);
   }

   return progress;
}

/* This isn't nearly as comprehensive as what's done in nir_opt_preamble, but in
 * various use-cases we need to hoist definitions into preambles outside of
 * opt_preamble. Currently we only handle a few uncomplicated intrinsics.
 */
bool
ir3_def_is_rematerializable_for_preamble(nir_def *def,
                                         nir_def **preamble_defs)
{
   switch (def->parent_instr->type) {
   case nir_instr_type_load_const:
      return true;
   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ubo:
         return ir3_def_is_rematerializable_for_preamble(intrin->src[0].ssa,
                                                         preamble_defs) &&
            ir3_def_is_rematerializable_for_preamble(intrin->src[1].ssa,
                                                     preamble_defs) &&
            (def->parent_instr->block->cf_node.parent->type ==
             nir_cf_node_function ||
             (nir_intrinsic_access(intrin) & ACCESS_CAN_SPECULATE));
      case nir_intrinsic_bindless_resource_ir3:
         return ir3_def_is_rematerializable_for_preamble(intrin->src[0].ssa,
                                                         preamble_defs);
      case nir_intrinsic_load_preamble:
         return !!preamble_defs;
      default:
         return false;
      }
   }
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(def->parent_instr);
      for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
         if (!ir3_def_is_rematerializable_for_preamble(alu->src[i].src.ssa,
                                                       preamble_defs))
            return false;
      }
      return true;
   }
   default:
      return false;
   }
}

struct find_insert_block_state {
   nir_block *insert_block;
};

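/* nir_foreach_src() callback: track the most deeply nested source block, i.e.
 * the one dominated by the blocks of all sources seen so far. Returns false
 * when two source blocks don't dominate each other.
 */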
static bool
find_dominated_src(nir_src *src, void *data)
{
   struct find_insert_block_state *state = data;
   nir_block *src_block = src->ssa->parent_instr->block;

   if (!state->insert_block) {
      state->insert_block = src_block;
      return true;
   } else if (nir_block_dominates(state->insert_block, src_block)) {
      state->insert_block = src_block;
      return true;
   } else if (nir_block_dominates(src_block, state->insert_block)) {
      return true;
   } else {
      state->insert_block = NULL;
      return false;
   }
}

/* Find the block where instr can be inserted. This is the source block that is
 * dominated by the blocks of all of instr's sources. If instr doesn't have any
 * sources, return dflt.
 */
static nir_block *
find_insert_block(nir_instr *instr, nir_block *dflt)
{
   struct find_insert_block_state state = {
      .insert_block = NULL,
   };

   if (nir_foreach_src(instr, find_dominated_src, &state)) {
      return state.insert_block ? state.insert_block : dflt;
   }

   return NULL;
}

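/* Callback for nir_instr_set_add_or_rewrite(): an existing equivalent
 * instruction may only replace the new one if it dominates it.
 */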
static bool
dominates(const nir_instr *old_instr, const nir_instr *new_instr)
{
   return nir_block_dominates(old_instr->block, new_instr->block);
}

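/* Recursively clone the instructions needed to recompute def into the preamble
 * at the builder's cursor, remapping load_preamble results through
 * preamble_defs and de-duplicating the clones via instr_set when provided.
 */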
static nir_def *
_rematerialize_def(nir_builder *b, struct hash_table *remap_ht,
                   struct set *instr_set, nir_def **preamble_defs,
                   nir_def *def)
{
   if (_mesa_hash_table_search(remap_ht, def->parent_instr))
      return NULL;

   switch (def->parent_instr->type) {
   case nir_instr_type_load_const:
      break;
   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
      if (intrin->intrinsic == nir_intrinsic_load_preamble) {
         _mesa_hash_table_insert(remap_ht, def,
                                 preamble_defs[nir_intrinsic_base(intrin)]);
         return preamble_defs[nir_intrinsic_base(intrin)];
      } else {
         for (unsigned i = 0; i < nir_intrinsic_infos[intrin->intrinsic].num_srcs;
              i++)
            _rematerialize_def(b, remap_ht, instr_set, preamble_defs,
                               intrin->src[i].ssa);
      }
      break;
   }
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(def->parent_instr);
      for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
         _rematerialize_def(b, remap_ht, instr_set, preamble_defs,
                            alu->src[i].src.ssa);
      break;
   }
   default:
      unreachable("should not get here");
   }

   nir_instr *instr = nir_instr_clone_deep(b->shader, def->parent_instr,
                                           remap_ht);

   /* Find a legal place to insert the new instruction. We cannot simply put it
    * at the end of the preamble since the original instruction and its sources
    * may be defined inside control flow.
    */
   nir_metadata_require(b->impl, nir_metadata_dominance);
   nir_block *insert_block =
      find_insert_block(instr, nir_cursor_current_block(b->cursor));

   /* Since the preamble control flow was reconstructed from the original one,
    * we must be able to find a legal place to insert the instruction.
    */
   assert(insert_block);
   b->cursor = nir_after_block(insert_block);
   nir_builder_instr_insert(b, instr);

   if (instr_set) {
      nir_instr *other_instr =
         nir_instr_set_add_or_rewrite(instr_set, instr, dominates);
      if (other_instr) {
         instr = other_instr;
         _mesa_hash_table_insert(remap_ht, def, nir_instr_def(other_instr));
      }
   }

   return nir_instr_def(instr);
}

/* Hoist a given definition into the preamble. If "instr_set" is non-NULL,
 * de-duplicate the hoisted definitions, and if "preamble_defs" is non-NULL then
 * it is used to remap load_preamble instructions back to the original
 * definition in the preamble, if the definition uses load_preamble
 * instructions.
 */

nir_def *
ir3_rematerialize_def_for_preamble(nir_builder *b, nir_def *def,
                                   struct set *instr_set,
                                   nir_def **preamble_defs)
{
   struct hash_table *remap_ht = _mesa_pointer_hash_table_create(NULL);

   nir_def *new_def =
      _rematerialize_def(b, remap_ht, instr_set, preamble_defs, def);

   _mesa_hash_table_destroy(remap_ht, NULL);

   return new_def;
}


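/* Collect the bindless descriptor sources of a texture or buffer/image access:
 * descs[0] gets the texture/UBO/SSBO/image descriptor and descs[1] the sampler
 * descriptor, when present.
 */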
static void
get_descriptors(nir_instr *instr, nir_def **descs)
{
   if (instr->type == nir_instr_type_tex) {
      nir_tex_instr *tex = nir_instr_as_tex(instr);
      /* TODO: handle non-bindless tex instructions. These are more complicated,
       * because of the implicit addition in the instruction.
       */
      int texture_index =
         nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
      int sampler_index =
         nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
      if (texture_index >= 0)
         descs[0] = tex->src[texture_index].src.ssa;
      if (sampler_index >= 0)
         descs[1] = tex->src[sampler_index].src.ssa;
   } else if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_ssbo_atomic:
      case nir_intrinsic_ssbo_atomic_swap:
      case nir_intrinsic_get_ssbo_size:
      case nir_intrinsic_image_load:
      case nir_intrinsic_bindless_image_load:
      case nir_intrinsic_image_store:
      case nir_intrinsic_bindless_image_store:
      case nir_intrinsic_image_atomic:
      case nir_intrinsic_bindless_image_atomic:
      case nir_intrinsic_image_size:
      case nir_intrinsic_bindless_image_size:
         descs[0] = intrin->src[0].ssa;
         break;
      case nir_intrinsic_store_ssbo:
         descs[0] = intrin->src[1].ssa;
         break;
      default:
         break;
      }
   }
}

#define MAX_PREFETCHES 32

struct prefetches {
   nir_def *prefetches[MAX_PREFETCHES];
   unsigned num_prefetches;
};

static bool
is_already_prefetched(struct prefetches *prefetches, nir_def *def)
{
   for (unsigned i = 0; i < prefetches->num_prefetches; i++) {
      if (prefetches->prefetches[i] == def)
         return true;
   }

   return false;
}

static void
add_prefetch(struct prefetches *prefetches, nir_def *def)
{
   assert(prefetches->num_prefetches < MAX_PREFETCHES);
   prefetches->prefetches[prefetches->num_prefetches++] = def;
}

struct prefetch_state {
   struct prefetches tex, sampler;
};

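/* Emit a descriptor prefetch in the preamble for the given instruction,
 * staying within the MAX_PREFETCHES budget for texture and sampler
 * descriptors. Returns true if a prefetch was emitted.
 */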
static bool
emit_descriptor_prefetch(nir_builder *b, nir_instr *instr, nir_def **descs,
                         struct prefetch_state *state)
{
   if (instr->type == nir_instr_type_tex) {
      nir_tex_instr *tex = nir_instr_as_tex(instr);
      int sampler_index =
         nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
      int texture_index =
         nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);

      /* For texture instructions, prefetch if at least one source hasn't been
       * prefetched already. For example, the same sampler may be used with
       * different textures, and we still want to prefetch the texture
       * descriptor if we've already prefetched the sampler descriptor.
       */

      bool tex_already_prefetched = is_already_prefetched(&state->tex, descs[0]);

      if (!tex_already_prefetched &&
          state->tex.num_prefetches == MAX_PREFETCHES)
         return false;

      assert(texture_index >= 0);
      if (sampler_index >= 0) {
         bool sampler_already_prefetched =
            is_already_prefetched(&state->sampler, descs[1]);

         if (!sampler_already_prefetched &&
             state->sampler.num_prefetches == MAX_PREFETCHES)
            return false;

         if (tex_already_prefetched && sampler_already_prefetched)
            return false;

         if (!tex_already_prefetched)
            add_prefetch(&state->tex, descs[0]);
         if (!sampler_already_prefetched)
            add_prefetch(&state->sampler, descs[1]);

         nir_prefetch_sam_ir3(b, descs[0], descs[1]);
      } else {
         if (tex_already_prefetched)
            return false;

         add_prefetch(&state->tex, descs[0]);
         nir_prefetch_tex_ir3(b, descs[0]);
      }
   } else {
      assert(instr->type == nir_instr_type_intrinsic);

      if (state->tex.num_prefetches == MAX_PREFETCHES)
         return false;

      if (is_already_prefetched(&state->tex, descs[0]))
         return false;

      add_prefetch(&state->tex, descs[0]);

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      if (intrin->intrinsic == nir_intrinsic_load_ubo)
         nir_prefetch_ubo_ir3(b, descs[0]);
      else
         nir_prefetch_tex_ir3(b, descs[0]);
   }

   return true;
}

static unsigned
get_preamble_offset(nir_def *def)
{
   return nir_intrinsic_base(nir_instr_as_intrinsic(def->parent_instr));
}

/* Prefetch descriptors in the preamble. This is an optimization introduced on
 * a7xx, mainly useful when the preamble is an early preamble, and replaces the
 * use of CP_LOAD_STATE on a6xx to prefetch descriptors in HLSQ.
 */

bool
ir3_nir_opt_prefetch_descriptors(nir_shader *nir, struct ir3_shader_variant *v)
{
   const struct ir3_const_state *const_state = ir3_const_state(v);

   nir_function_impl *main = nir_shader_get_entrypoint(nir);
   struct set *instr_set = nir_instr_set_create(NULL);
   nir_function_impl *preamble = main->preamble ? main->preamble->impl : NULL;
   nir_builder b;
   bool progress = false;
   struct prefetch_state state = {};

   nir_def **preamble_defs =
      calloc(const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4,
             sizeof(nir_def *));

   /* Collect preamble defs. This is useful if the computation of the offset has
    * already been hoisted to the preamble.
    */
   if (preamble) {
      nir_foreach_block (block, preamble) {
         nir_foreach_instr (instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

            if (intrin->intrinsic != nir_intrinsic_store_preamble)
               continue;

            assert(
               nir_intrinsic_base(intrin) <
               const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4);
            preamble_defs[nir_intrinsic_base(intrin)] = intrin->src[0].ssa;
         }
      }
   }

   nir_foreach_block (block, main) {
      nir_foreach_instr (instr, block) {
         nir_def *descs[2] = { NULL, NULL };
         nir_def *preamble_descs[2] = { NULL, NULL };
         get_descriptors(instr, descs);

         /* Skip instructions where we didn't find any descriptor. */
         if (!descs[0] && !descs[1])
            continue;

         /* The instruction itself must be hoistable.
          * TODO: If the descriptor is statically referenced and in-bounds, then
          * we should be able to hoist the descriptor load even if the
          * descriptor contents aren't guaranteed. This would require more
          * plumbing.
          * TODO: Textures. This is broken in nir_opt_preamble at the moment and
          * handling them would also require more plumbing.
          */
         if (instr->type == nir_instr_type_intrinsic &&
             nir_intrinsic_has_access(nir_instr_as_intrinsic(instr)) &&
             !(nir_intrinsic_access(nir_instr_as_intrinsic(instr)) &
               ACCESS_CAN_SPECULATE) &&
             block->cf_node.parent->type != nir_cf_node_function)
            continue;

         /* Each descriptor must be rematerializable */
         if (descs[0] &&
             !ir3_def_is_rematerializable_for_preamble(descs[0], preamble_defs))
            continue;
         if (descs[1] &&
             !ir3_def_is_rematerializable_for_preamble(descs[1], preamble_defs))
            continue;

         /* If the preamble hasn't been created yet then this descriptor isn't a
          * duplicate and we will definitely insert an instruction, so create
          * the preamble now.
          */
         if (!preamble) {
            preamble = nir_shader_get_preamble(nir);
         }

         b = nir_builder_at(nir_after_impl(preamble));

         /* Materialize descriptors for the prefetch. Note that we deduplicate
          * descriptors so that we don't blow our budget when repeatedly loading
          * from the same descriptor, even if the calculation of the descriptor
          * offset hasn't been CSE'd because the accesses are in different
          * blocks. This is common because we emit the bindless_resource_ir3
          * intrinsic right before the access.
          */
         for (unsigned i = 0; i < 2; i++) {
            if (!descs[i])
               continue;

            preamble_descs[i] =
               ir3_rematerialize_def_for_preamble(&b, descs[i], instr_set,
                                                  preamble_defs);
         }

         /* ir3_rematerialize_def_for_preamble may have moved the cursor. */
         b.cursor = nir_after_impl(preamble);
         progress |= emit_descriptor_prefetch(&b, instr, preamble_descs, &state);

         if (state.sampler.num_prefetches == MAX_PREFETCHES &&
             state.tex.num_prefetches == MAX_PREFETCHES)
            goto finished;
      }
   }

finished:
   nir_metadata_preserve(main, nir_metadata_all);
   if (preamble) {
      nir_metadata_preserve(preamble,
                            nir_metadata_block_index |
                            nir_metadata_dominance);
   }
   nir_instr_set_destroy(instr_set);
   free(preamble_defs);
   return progress;
}

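/* Lower load/store_preamble to ir3 const-file accesses and inline the preamble
 * function into the main shader behind the preamble_start/elect sequence built
 * below.
 */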
bool
ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   nir_function_impl *main = nir_shader_get_entrypoint(nir);

   if (!main->preamble)
      return false;

   nir_function_impl *preamble = main->preamble->impl;

   /* First, lower load/store_preamble. */
   const struct ir3_const_state *const_state = ir3_const_state(v);
   unsigned preamble_base =
      const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].offset_vec4 * 4;
   unsigned preamble_size =
      const_state->allocs.consts[IR3_CONST_ALLOC_PREAMBLE].size_vec4 * 4;

   BITSET_DECLARE(promoted_to_float, preamble_size);
   memset(promoted_to_float, 0, sizeof(promoted_to_float));

   nir_builder builder_main = nir_builder_create(main);
   nir_builder *b = &builder_main;

   nir_foreach_block (block, main) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_load_preamble)
            continue;

         nir_def *dest = &intrin->def;

         unsigned offset = preamble_base + nir_intrinsic_base(intrin);
         b->cursor = nir_before_instr(instr);

         nir_def *new_dest = nir_load_const_ir3(
            b, dest->num_components, 32, nir_imm_int(b, 0), .base = offset);

         if (dest->bit_size == 1) {
            new_dest = nir_i2b(b, new_dest);
         } else if (dest->bit_size != 32) {
            if (all_uses_float(dest, true)) {
               assert(dest->bit_size == 16);
               new_dest = nir_f2f16(b, new_dest);
               BITSET_SET(promoted_to_float, nir_intrinsic_base(intrin));
            } else {
               new_dest = nir_u2uN(b, new_dest, dest->bit_size);
            }
         }

         nir_def_rewrite_uses(dest, new_dest);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

   nir_builder builder_preamble = nir_builder_create(preamble);
   b = &builder_preamble;

   nir_foreach_block (block, preamble) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_preamble)
            continue;

         nir_def *src = intrin->src[0].ssa;
         unsigned offset = preamble_base + nir_intrinsic_base(intrin);

         b->cursor = nir_before_instr(instr);

         if (src->bit_size == 1)
            src = nir_b2i32(b, src);
         if (src->bit_size != 32) {
            if (BITSET_TEST(promoted_to_float, nir_intrinsic_base(intrin))) {
               assert(src->bit_size == 16);
               src = nir_f2f32(b, src);
            } else {
               src = nir_u2u32(b, src);
            }
         }

         nir_store_const_ir3(b, src, .base = offset);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

   /* Now, create the preamble sequence and move the preamble into the main
    * shader:
    *
    * if (preamble_start_ir3()) {
    *    if (subgroupElect()) {
    *       preamble();
    *       preamble_end_ir3();
    *    }
    * }
    * ...
    */

   /* @decl_regs need to stay in the first block. */
   b->cursor = nir_after_reg_decls(main);

   nir_if *outer_if = nir_push_if(b, nir_preamble_start_ir3(b, 1));
   {
      nir_if *inner_if = nir_push_if(b, nir_elect_any_ir3(b, 1));
      {
         nir_call_instr *call = nir_call_instr_create(nir, main->preamble);
         nir_builder_instr_insert(b, &call->instr);
         nir_preamble_end_ir3(b);
      }
      nir_pop_if(b, inner_if);
   }
   nir_pop_if(b, outer_if);

   nir_inline_functions(nir);
   exec_node_remove(&main->preamble->node);
   main->preamble = NULL;

   nir_metadata_preserve(main, nir_metadata_none);
   return true;
}