/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "ir3_compiler.h"
#include "ir3_nir.h"

/* Preamble optimization happens in two parts: first we generate the preamble
 * using the generic NIR pass, then we set up the preamble sequence and inline
 * the preamble into the main shader, if one was created. The first part
 * should happen before UBO lowering, because we want to prefer more complex
 * expressions over UBO loads, but the second part has to happen after UBO
 * lowering because it may add copy instructions to the preamble.
 */

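/* A minimal sketch of the assumed ordering (the actual driver pipeline lives
 * elsewhere and may differ):
 *
 *    ir3_nir_opt_preamble(nir, v);    // part 1: form the preamble function
 *    ... UBO lowering runs here ...
 *    ir3_nir_lower_preamble(nir, v);  // part 2: lower + inline the preamble
 */
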
static void
def_size(nir_def *def, unsigned *size, unsigned *align)
{
   unsigned bit_size = def->bit_size == 1 ? 32 : def->bit_size;
   /* Due to the implicit const file promotion we want to expand 16-bit values
    * to 32-bit so that the truncation in the main shader can hopefully be
    * folded into the use.
    */
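   /* For example, a 16-bit vec4 counts as 4 32-bit slots after promotion,
    * and a 64-bit vec2 also counts as 4 (DIV_ROUND_UP(64, 32) = 2 slots per
    * component).
    */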
   *size = DIV_ROUND_UP(bit_size, 32) * def->num_components;
   *align = 1;
}

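/* Check whether every use of "def" is a float-typed ALU source, so that an
 * fneg/fabs producing it could instead be folded into its users as a source
 * modifier. allow_src2 is false for the modifier that cannot be applied to a
 * third source (see the fabs case in instr_cost()).
 */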
static bool
all_uses_float(nir_def *def, bool allow_src2)
{
   nir_foreach_use_including_if (use, def) {
      if (nir_src_is_if(use))
         return false;

      nir_instr *use_instr = nir_src_parent_instr(use);
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);
      unsigned src_index = ~0;
      for (unsigned i = 0; i < nir_op_infos[use_alu->op].num_inputs; i++) {
         if (&use_alu->src[i].src == use) {
            src_index = i;
            break;
         }
      }

      assert(src_index != ~0);
      nir_alu_type src_type =
         nir_alu_type_get_base_type(nir_op_infos[use_alu->op].input_types[src_index]);

      if (src_type != nir_type_float || (src_index == 2 && !allow_src2))
         return false;
   }

   return true;
}

static bool
all_uses_bit(nir_def *def)
{
   nir_foreach_use_including_if (use, def) {
      if (nir_src_is_if(use))
         return false;

      nir_instr *use_instr = nir_src_parent_instr(use);
      if (use_instr->type != nir_instr_type_alu)
         return false;
      nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);

      /* See ir3_cat2_absneg() */
      switch (use_alu->op) {
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_inot:
      case nir_op_ixor:
      case nir_op_bitfield_reverse:
      case nir_op_ufind_msb:
      case nir_op_ifind_msb:
      case nir_op_find_lsb:
      case nir_op_ishl:
      case nir_op_ushr:
      case nir_op_ishr:
      case nir_op_bit_count:
         continue;
      default:
         return false;
      }
   }

   return true;
}

static float
instr_cost(nir_instr *instr, const void *data)
{
   /* We'll assume wave64 here for simplicity and assume normal cat1-cat3 ops
    * take 1 (normalized) cycle.
    *
    * See https://gitlab.freedesktop.org/freedreno/freedreno/-/wikis/A6xx-SP
    *
    * TODO: assume wave128 on fragment/compute shaders?
    */
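   /* Under this model a scalar cat1-cat3 ALU op costs 1, a vec4 frcp costs
    * 4 * 4 = 16, and a texture fetch costs 8 (see below).
    */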

   switch (instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      unsigned components = alu->def.num_components;
      switch (alu->op) {
      /* cat4 */
      case nir_op_frcp:
      case nir_op_fsqrt:
      case nir_op_frsq:
      case nir_op_flog2:
      case nir_op_fexp2:
      case nir_op_fsin:
      case nir_op_fcos:
         return 4 * components;

      /* Instructions that become src modifiers. Note for conversions this is
       * really an approximation.
       *
       * This prevents silly things like lifting a negate that would become a
       * modifier.
       */
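      /* For example, an fneg whose only use is an fmul costs 0 here, since
       * it would be folded into the fmul as a source negate anyway.
       */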
      case nir_op_f2f32:
      case nir_op_f2f16:
      case nir_op_f2fmp:
      case nir_op_fneg:
         return all_uses_float(&alu->def, true) ? 0 : 1 * components;

      case nir_op_fabs:
         return all_uses_float(&alu->def, false) ? 0 : 1 * components;

      case nir_op_inot:
         return all_uses_bit(&alu->def) ? 0 : 1 * components;

      /* Instructions that become vector split/collect */
      case nir_op_vec2:
      case nir_op_vec3:
      case nir_op_vec4:
      case nir_op_mov:
         return 0;

      /* cat1-cat3 */
      default:
         return 1 * components;
      }
      break;
   }

   case nir_instr_type_tex:
      /* cat5 */
      return 8;

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_ubo: {
         /* If the UBO and offset are constant, then UBO lowering should do a
          * better job trying to lower this, and opt_preamble shouldn't try to
          * duplicate it. However if it has a non-constant offset then we can
          * avoid setting up a0.x etc. in the main shader and potentially have
          * to push less.
          */
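         /* E.g. a load_ubo with a constant block and constant offset costs 0
          * (UBO lowering can push it directly), while one with a dynamic
          * offset costs as much as a texture fetch.
          */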
         bool const_ubo = nir_src_is_const(intrin->src[0]);
         if (!const_ubo) {
            nir_intrinsic_instr *rsrc = ir3_bindless_resource(intrin->src[0]);
            if (rsrc)
               const_ubo = nir_src_is_const(rsrc->src[0]);
         }

         if (const_ubo && nir_src_is_const(intrin->src[1]))
            return 0;

         /* TODO: get actual numbers for ldc */
         return 8;
      }

      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_ssbo_ir3:
      case nir_intrinsic_get_ssbo_size:
      case nir_intrinsic_image_load:
      case nir_intrinsic_bindless_image_load:
         /* cat5/isam */
         return 8;

      /* By default assume it's a sysval or something */
      default:
         return 0;
      }
   }

   case nir_instr_type_phi:
      /* Although we can often coalesce phis, the cost of a phi is a proxy for
       * the cost of the if-else statement... If all phis are moved, then the
       * branches move too. So this needs to have a nonzero cost, even if we're
       * optimistic about coalescing.
       *
       * Value chosen empirically. On Rob's shader-db, a cost of 2 performs
       * better across the board than a cost of 1. Values greater than 2 do not
       * seem to make any difference, so sticking with 2.
       */
      return 2;

   default:
      return 0;
   }
}

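/* Estimate the number of extra movs needed in the main shader if "def" is
 * moved to the preamble and its uses are rewritten to read from the const
 * file: booleans always have to be re-expanded from the stored 32-bit value,
 * and non-ALU or vecN/mov users need a mov since the const is not simply
 * folded into a source.
 */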
static float
rewrite_cost(nir_def *def, const void *data)
{
   /* We always have to expand booleans */
   if (def->bit_size == 1)
      return def->num_components;

   bool mov_needed = false;
   nir_foreach_use (use, def) {
      nir_instr *parent_instr = nir_src_parent_instr(use);
      if (parent_instr->type != nir_instr_type_alu) {
         mov_needed = true;
         break;
      } else {
         nir_alu_instr *alu = nir_instr_as_alu(parent_instr);
         if (alu->op == nir_op_vec2 ||
             alu->op == nir_op_vec3 ||
             alu->op == nir_op_vec4 ||
             alu->op == nir_op_mov) {
            mov_needed = true;
            break;
         } else {
            /* Assume for non-moves that the const is folded into the src */
         }
      }
   }

   return mov_needed ? def->num_components : 0;
}

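/* Don't hoist a bindless_resource_ir3 by itself; it is expected to be folded
 * into the instruction that consumes it.
 */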
static bool
avoid_instr(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   return intrin->intrinsic == nir_intrinsic_bindless_resource_ir3;
}

static bool
set_speculate(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_)
{
   switch (intr->intrinsic) {
   /* These instructions go through bounds-checked hardware descriptors so
    * should be safe to speculate.
    *
    * TODO: This isn't necessarily true in Vulkan, where descriptors don't need
    * to be filled out and bindless descriptor offsets aren't bounds checked.
    * We may need to plumb this information through from turnip for correctness
    * to avoid regressing freedreno codegen.
    */
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_samples_identical:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ssbo_ir3:
      nir_intrinsic_set_access(intr, nir_intrinsic_access(intr) |
                                     ACCESS_CAN_SPECULATE);
      return true;

   default:
      return false;
   }
}

bool
ir3_nir_opt_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   struct ir3_const_state *const_state = ir3_const_state(v);

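   /* The binning variant must match the preamble layout already chosen for
    * the main variant, so reuse its size. Otherwise, bound the preamble by
    * the const file space left over in the worst case. Sizes here are in
    * 32-bit components; const_state sizes are in vec4s, hence the * 4.
    */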
   unsigned max_size;
   if (v->binning_pass) {
      max_size = const_state->preamble_size * 4;
   } else {
      struct ir3_const_state worst_case_const_state = {};
      ir3_setup_const_state(nir, v, &worst_case_const_state);
      max_size = (ir3_max_const(v) - worst_case_const_state.offsets.immediate) * 4;
   }

   if (max_size == 0)
      return false;

   bool progress = nir_shader_intrinsics_pass(nir, set_speculate,
                                              nir_metadata_block_index |
                                              nir_metadata_dominance, NULL);

   nir_opt_preamble_options options = {
      .drawid_uniform = true,
      .subgroup_size_uniform = true,
      .load_workgroup_size_allowed = true,
      .def_size = def_size,
      .preamble_storage_size = max_size,
      .instr_cost_cb = instr_cost,
      .avoid_instr_cb = avoid_instr,
      .rewrite_cost_cb = rewrite_cost,
   };

   unsigned size = 0;
   progress |= nir_opt_preamble(nir, &options, &size);

   if (!v->binning_pass)
      const_state->preamble_size = DIV_ROUND_UP(size, 4);

   return progress;
}

bool
ir3_nir_lower_preamble(nir_shader *nir, struct ir3_shader_variant *v)
{
   nir_function_impl *main = nir_shader_get_entrypoint(nir);

   if (!main->preamble)
      return false;

   nir_function_impl *preamble = main->preamble->impl;

   /* First, lower load/store_preamble. */
   const struct ir3_const_state *const_state = ir3_const_state(v);
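   /* The preamble's consts live after the reserved user consts, the lowered
    * UBO ranges, and the global consts (offsets counted in 32-bit
    * components):
    *
    *    [ user consts | UBO ranges | globals | preamble ]
    *                                          ^ preamble_base
    */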
   unsigned preamble_base = v->shader_options.num_reserved_user_consts * 4 +
      const_state->ubo_state.size / 4 + const_state->global_size * 4;
   unsigned preamble_size = const_state->preamble_size * 4;

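   /* 16-bit values are stored widened to 32 bits. Track which slots were
    * widened with a float conversion on the load side so that the matching
    * store below uses f2f32 rather than u2u32.
    */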
   BITSET_DECLARE(promoted_to_float, preamble_size);
   memset(promoted_to_float, 0, sizeof(promoted_to_float));

   nir_builder builder_main = nir_builder_create(main);
   nir_builder *b = &builder_main;

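   /* Rewrite each load_preamble in the main shader to a load_uniform from
    * the corresponding const file slot, narrowing the 32-bit value back down
    * for boolean and 16-bit destinations.
    */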
   nir_foreach_block (block, main) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_load_preamble)
            continue;

         nir_def *dest = &intrin->def;

         unsigned offset = preamble_base + nir_intrinsic_base(intrin);
         b->cursor = nir_before_instr(instr);

         nir_def *new_dest =
            nir_load_uniform(b, dest->num_components, 32, nir_imm_int(b, 0),
                             .base = offset);

         if (dest->bit_size == 1) {
            new_dest = nir_i2b(b, new_dest);
         } else if (dest->bit_size != 32) {
            assert(dest->bit_size == 16);
            if (all_uses_float(dest, true)) {
               new_dest = nir_f2f16(b, new_dest);
               BITSET_SET(promoted_to_float, nir_intrinsic_base(intrin));
            } else {
               new_dest = nir_u2u16(b, new_dest);
            }
         }

         nir_def_rewrite_uses(dest, new_dest);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

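   /* In the preamble itself, rewrite each store_preamble to a 32-bit
    * store_uniform_ir3 at the same slot, applying the matching widening
    * conversion.
    */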
   nir_builder builder_preamble = nir_builder_create(preamble);
   b = &builder_preamble;

   nir_foreach_block (block, preamble) {
      nir_foreach_instr_safe (instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         if (intrin->intrinsic != nir_intrinsic_store_preamble)
            continue;

         nir_def *src = intrin->src[0].ssa;
         unsigned offset = preamble_base + nir_intrinsic_base(intrin);

         b->cursor = nir_before_instr(instr);

         if (src->bit_size == 1)
            src = nir_b2i32(b, src);
         if (src->bit_size != 32) {
            assert(src->bit_size == 16);
            if (BITSET_TEST(promoted_to_float, nir_intrinsic_base(intrin))) {
               src = nir_f2f32(b, src);
            } else {
               src = nir_u2u32(b, src);
            }
         }

         nir_store_uniform_ir3(b, src, .base = offset);
         nir_instr_remove(instr);
         nir_instr_free(instr);
      }
   }

   /* Now, create the preamble sequence and move the preamble into the main
    * shader:
    *
    * if (preamble_start_ir3()) {
    *    if (subgroupElect()) {
    *       preamble();
    *       preamble_end_ir3();
    *    }
    * }
    * ...
    */

   /* @decl_regs need to stay in the first block. */
   b->cursor = nir_after_reg_decls(main);

   nir_if *outer_if = nir_push_if(b, nir_preamble_start_ir3(b, 1));
   {
      nir_if *inner_if = nir_push_if(b, nir_elect(b, 1));
      {
         nir_call_instr *call = nir_call_instr_create(nir, main->preamble);
         nir_builder_instr_insert(b, &call->instr);
         nir_preamble_end_ir3(b);
      }
      nir_pop_if(b, inner_if);
   }
   nir_pop_if(b, outer_if);

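   /* Inline the call into the sequence above and drop the now-unused
    * preamble function.
    */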
   nir_inline_functions(nir);
   exec_node_remove(&main->preamble->node);
   main->preamble = NULL;

   nir_metadata_preserve(main, nir_metadata_none);
   return true;
}