/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "ir3_compiler.h"
#include "ir3_nir.h"

/* Preamble optimization happens in two parts: first we generate the preamble
 * using the generic NIR pass, then we set up the preamble sequence and inline
 * the preamble into the main shader if there was a preamble. The first part
 * should happen before UBO lowering, because we want to prefer more complex
 * expressions over UBO loads, but the second part has to happen after UBO
 * lowering because it may add copy instructions to the preamble.
 */
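
/* A minimal illustrative sketch (names are made up) of what the NIR pass
 * produces: an ALU expression of uniform values such as
 *
 *    x = fmul(uniform_a, uniform_b)
 *
 * is hoisted into the preamble function and passed through a preamble
 * storage slot:
 *
 *    preamble:    store_preamble(fmul(uniform_a, uniform_b), base=N)
 *    main shader: x = load_preamble(base=N)
 *
 * ir3_nir_lower_preamble() below then maps these slots onto the const file.
 */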

static void
def_size(nir_ssa_def *def, unsigned *size, unsigned *align)
{
   unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
   /* Due to the implicit const file promotion we want to expand 16-bit values
    * to 32-bit so that the truncation in the main shader can hopefully be
    * folded into the use.
    */
   *size = DIV_ROUND_UP(bit_size, 32) * def->num_components;
   *align = 1;
}

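
/* Check whether every use of "def" is a float-typed ALU source (optionally
 * rejecting uses in src2). This is used below to decide when a lifted
 * fneg/fabs/conversion would have been folded into its users as a free
 * source modifier anyway.
 */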
static bool
all_uses_float(nir_ssa_def *def, bool allow_src2)
{
   nir_foreach_if_use (use, def) {
      return false;
   }

   nir_foreach_use (use, def) {
      nir_instr *use_instr = use->parent_instr;
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);
      unsigned src_index = ~0;
      for (unsigned i = 0; i < nir_op_infos[use_alu->op].num_inputs; i++) {
         if (&use_alu->src[i].src == use) {
            src_index = i;
            break;
         }
      }

      assert(src_index != ~0);
      nir_alu_type src_type =
         nir_alu_type_get_base_type(nir_op_infos[use_alu->op].input_types[src_index]);

      if (src_type != nir_type_float || (src_index == 2 && !allow_src2))
         return false;
   }

   return true;
}

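
/* Check whether every use of "def" is a bitwise ALU op that (per
 * ir3_cat2_absneg()) can absorb a bitwise-not of its source, so that
 * lifting an inot would not actually save an instruction.
 */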
static bool
all_uses_bit(nir_ssa_def *def)
{
   nir_foreach_if_use (use, def) {
      return false;
   }

   nir_foreach_use (use, def) {
      nir_instr *use_instr = use->parent_instr;
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);

      /* See ir3_cat2_absneg() */
      switch (use_alu->op) {
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_inot:
      case nir_op_ixor:
      case nir_op_bitfield_reverse:
      case nir_op_ufind_msb:
      case nir_op_ifind_msb:
      case nir_op_find_lsb:
      case nir_op_ishl:
      case nir_op_ushr:
      case nir_op_ishr:
      case nir_op_bit_count:
         continue;
      default:
         return false;
      }
   }

   return true;
}

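
/* Estimated cost, in normalized cycles, of executing an instruction in the
 * main shader; nir_opt_preamble uses this to judge whether hoisting a value
 * into the preamble is a win.
 */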
static float
instr_cost(nir_instr *instr, const void *data)
{
   /* We'll assume wave64 here for simplicity and assume normal cat1-cat3 ops
    * take 1 (normalized) cycle.
    *
    * See https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A6xx-SP
    *
    * TODO: assume wave128 on fragment/compute shaders?
    */

   switch (instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      unsigned components = alu->dest.dest.ssa.num_components;
      switch (alu->op) {
      /* cat4 */
      case nir_op_frcp:
      case nir_op_fsqrt:
      case nir_op_frsq:
      case nir_op_flog2:
      case nir_op_fexp2:
      case nir_op_fsin:
      case nir_op_fcos:
         return 4 * components;

      /* Instructions that become src modifiers. Note for conversions this is
       * really an approximation.
       *
       * This prevents silly things like lifting a negate that would become a
       * modifier.
       */
      case nir_op_f2f32:
      case nir_op_f2f16:
      case nir_op_f2fmp:
      case nir_op_fneg:
         return all_uses_float(&alu->dest.dest.ssa, true) ? 0 : 1 * components;

      case nir_op_fabs:
         return all_uses_float(&alu->dest.dest.ssa, false) ? 0 : 1 * components;

      case nir_op_inot:
         return all_uses_bit(&alu->dest.dest.ssa) ? 0 : 1 * components;

      /* Instructions that become vector split/collect */
      case nir_op_vec2:
      case nir_op_vec3:
      case nir_op_vec4:
      case nir_op_mov:
         return 0;

      /* cat1-cat3 */
      default:
         return 1 * components;
      }
      break;
   }

   case nir_instr_type_tex:
      /* cat5 */
      return 8;

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ubo: {
         /* If the UBO and offset are constant, then UBO lowering should do a
          * better job trying to lower this, and opt_preamble shouldn't try to
          * duplicate it. However if it has a non-constant offset then we can
          * avoid setting up a0.x etc. in the main shader and potentially have
          * to push less.
          */
         bool const_ubo = nir_src_is_const(intrin->src[0]);
         if (!const_ubo) {
            nir_intrinsic_instr *rsrc = ir3_bindless_resource(intrin->src[0]);
            if (rsrc)
               const_ubo = nir_src_is_const(rsrc->src[0]);
         }

         if (const_ubo && nir_src_is_const(intrin->src[1]))
            return 0;

         /* TODO: get actual numbers for ldc */
         return 8;
      }

      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_ssbo_ir3:
      case nir_intrinsic_get_ssbo_size:
      case nir_intrinsic_image_load:
      case nir_intrinsic_bindless_image_load:
         /* cat5/isam */
         return 8;

      /* By default assume it's a sysval or something */
      default:
         return 0;
      }
   }

   default:
      return 0;
   }
}

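
/* Estimated cost of rewriting uses of "def" in the main shader to read the
 * value back out of the const file instead.
 */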
static float
rewrite_cost(nir_ssa_def *def, const void *data)
{
   /* We always have to expand booleans */
   if (def->bit_size == 1)
      return def->num_components;

   bool mov_needed = false;
   nir_foreach_use (use, def) {
      nir_instr *parent_instr = use->parent_instr;
      if (parent_instr->type != nir_instr_type_alu) {
         mov_needed = true;
         break;
      } else {
         nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
         if (alu->op == nir_op_vec2 ||
             alu->op == nir_op_vec3 ||
             alu->op == nir_op_vec4 ||
             alu->op == nir_op_mov) {
            mov_needed = true;
            break;
         } else {
            /* Assume for non-moves that the const is folded into the src */
         }
      }
   }

   return mov_needed ? def->num_components : 0;
}

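
/* Instructions that nir_opt_preamble should never hoist on their own; the
 * ir3 bindless resource intrinsic needs to stay with the access that
 * consumes it.
 */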
static bool
avoid_instr(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   return intrin->intrinsic == nir_intrinsic_bindless_resource_ir3;
}

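
/* Run nir_opt_preamble with the ir3 cost callbacks above. The storage
 * budget is the const file space left over in the worst case by everything
 * else; the binning variant reuses the size already chosen for the main
 * variant so that the two const layouts agree.
 */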
bool
ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   struct ir3_const_state *const_state = ir3_const_state(v);

   unsigned max_size;
   if (v->binning_pass) {
      max_size = const_state->preamble_size * 4;
   } else {
      struct ir3_const_state worst_case_const_state = {};
      ir3_setup_const_state(nir, v, &worst_case_const_state);
      max_size = (ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 4;
   }

   if (max_size == 0)
      return false;

   nir_opt_preamble_options options = {
      .drawid_uniform = true,
      .subgroup_size_uniform = true,
      .def_size = def_size,
      .preamble_storage_size = max_size,
      .instr_cost_cb = instr_cost,
      .avoid_instr_cb = avoid_instr,
      .rewrite_cost_cb = rewrite_cost,
   };

   unsigned size;
   bool progress = nir_opt_preamble(nir, &options, &size);

   if (!v->binning_pass)
      const_state->preamble_size = DIV_ROUND_UP(size, 4);

   return progress;
}

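
/* Lower load_preamble/store_preamble to ir3 const file accesses and splice
 * the generated preamble function into the top of the main shader.
 */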
bool
ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   nir_function_impl *main = nir_shader_get_entrypoint(nir);

   if (!main->preamble)
      return false;

   nir_function_impl *preamble = main->preamble->impl;

   /* First, lower load/store_preamble. */
   const struct ir3_const_state *const_state = ir3_const_state(v);
   unsigned preamble_base = v->num_reserved_user_consts * 4 +
      const_state->ubo_state.size / 4;
   unsigned preamble_size = const_state->preamble_size * 4;
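
   /* 16-bit values are stored expanded to 32 bits in the const file (see
    * def_size()). Remember which slots were expanded as floats so that the
    * store in the preamble and the load in the main shader agree on the
    * conversion.
    */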
   BITSET_DECLARE(promoted_to_float, preamble_size);
   memset(promoted_to_float, 0, sizeof(promoted_to_float));

   nir_builder _b;
   nir_builder *b = &_b;
   nir_builder_init(b, main);

   nir_foreach_block (block, main) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_load_preamble)
            continue;

         nir_ssa_def *dest = &intrin->dest.ssa;

         unsigned offset = preamble_base + nir_intrinsic_base(intrin);
         b->cursor = nir_before_instr(instr);

         nir_ssa_def *new_dest =
            nir_load_uniform(b, dest->num_components, 32, nir_imm_int(b, 0),
                             .base = offset);

         if (dest->bit_size == 1) {
            new_dest = nir_i2b1(b, new_dest);
         } else if (dest->bit_size != 32) {
            assert(dest->bit_size == 16);
            if (all_uses_float(dest, true)) {
               new_dest = nir_f2f16(b, new_dest);
               BITSET_SET(promoted_to_float, nir_intrinsic_base(intrin));
            } else {
               new_dest = nir_u2u16(b, new_dest);
            }
         }

         nir_ssa_def_rewrite_uses(dest, new_dest);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

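
   /* Then lower store_preamble in the preamble itself to const file stores,
    * applying the inverse of the conversions above.
    */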
   nir_builder_init(b, preamble);

   nir_foreach_block (block, preamble) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_preamble)
            continue;

         nir_ssa_def *src = intrin->src[0].ssa;
         unsigned offset = preamble_base + nir_intrinsic_base(intrin);

         b->cursor = nir_before_instr(instr);

         if (src->bit_size == 1)
            src = nir_b2i32(b, src);
         if (src->bit_size != 32) {
            assert(src->bit_size == 16);
            if (BITSET_TEST(promoted_to_float, nir_intrinsic_base(intrin))) {
               src = nir_f2f32(b, src);
            } else {
               src = nir_u2u32(b, src);
            }
         }

         nir_store_uniform_ir3(b, src, .base = offset);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

   /* Now, create the preamble sequence and move the preamble into the main
    * shader:
    *
    * if (preamble_start_ir3()) {
    *    if (subgroupElect()) {
    *       preamble();
    *       preamble_end_ir3();
    *    }
    * }
    * ...
    */
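
   /* preamble_start_ir3() is expected to evaluate to false when the consts
    * written by the preamble are already valid, and elect() picks a single
    * invocation to perform the const file writes.
    */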

   b->cursor = nir_before_cf_list(&main->body);

   nir_if *outer_if = nir_push_if(b, nir_preamble_start_ir3(b, 1));
   {
      nir_if *inner_if = nir_push_if(b, nir_elect(b, 1));
      {
         nir_call_instr *call = nir_call_instr_create(nir, main->preamble);
         nir_builder_instr_insert(b, &call->instr);
         nir_preamble_end_ir3(b);
      }
      nir_pop_if(b, inner_if);
   }
   nir_pop_if(b, outer_if);

   nir_inline_functions(nir);
   exec_node_remove(&main->preamble->node);
   main->preamble = NULL;

   nir_metadata_preserve(main, nir_metadata_none);
   return true;
}