/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/set.h"
#include "nir.h"
#include "nir_builder.h"

/* This pass provides a way to move computations that are always the same for
 * an entire draw/compute dispatch into a "preamble" that runs before the main
 * entrypoint.
 *
 * We also expose a separate API to get or construct the preamble of a shader
 * in case backends want to insert their own code.
 */
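
/* Example (illustrative sketch, not part of this file): a backend typically
 * fills out nir_opt_preamble_options with the callbacks and fields used by
 * this pass (instr_cost_cb, rewrite_cost_cb, avoid_instr_cb, def_size,
 * preamble_storage_size, ...), runs the pass, and then lowers the
 * load_preamble/store_preamble intrinsics it emits. The exact struct layout
 * lives in nir.h; the snippet below only names things that appear in this
 * pass, and the backend_* callbacks are hypothetical driver hooks.
 *
 *    const nir_opt_preamble_options options = {
 *       .drawid_uniform = true,
 *       .subgroup_size_uniform = true,
 *       .def_size = backend_def_size,
 *       .instr_cost_cb = backend_instr_cost,
 *       .rewrite_cost_cb = backend_rewrite_cost,
 *       .avoid_instr_cb = backend_avoid_instr,
 *       .preamble_storage_size = 128,
 *    };
 *
 *    unsigned size = 0;
 *    nir_opt_preamble(shader, &options, &size);
 *    // ...then lower load_preamble/store_preamble to backend storage.
 */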

nir_function_impl *
nir_shader_get_preamble(nir_shader *shader)
{
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader);
   if (entrypoint->preamble) {
      return entrypoint->preamble->impl;
   } else {
      nir_function *preamble = nir_function_create(shader, "@preamble");
      preamble->is_preamble = true;
      nir_function_impl *impl = nir_function_impl_create(preamble);
      entrypoint->preamble = preamble;
      return impl;
   }
}
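
/* Example (sketch): a backend that wants to insert its own setup code can
 * grab the preamble directly and build into it, mirroring what
 * nir_opt_preamble does further down in this file. "some_uniform_value" and
 * "offset" are placeholders for backend-chosen values.
 *
 *    nir_function_impl *pre = nir_shader_get_preamble(shader);
 *    nir_builder b = nir_builder_at(nir_before_impl(pre));
 *    nir_store_preamble(&b, some_uniform_value, .base = offset);
 */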

typedef struct {
   bool can_move;
   bool candidate;
   bool must_stay;
   bool replace;

   unsigned can_move_users;

   unsigned size, align;

   unsigned offset;

   /* Average the cost of a value among its users, to try to account for
    * values that have multiple can_move uses.
    */
   float value;

   /* Overall benefit, i.e. the value minus any cost of inserting
    * load_preamble.
    */
   float benefit;
} def_state;

typedef struct {
   /* Per-definition array of states */
   def_state *states;

   /* Number of levels of non-uniform control flow we're in. We don't
    * reconstruct loops, so loops count as non-uniform conservatively. If-else
    * is counted if the condition is not marked can_move.
    */
   unsigned nonuniform_cf_nesting;

   /* Set of nir_if's that must be reconstructed in the preamble. Note that an
    * if may need reconstruction even when not entirely moved. This does not
    * account for nesting: the parent CF nodes of ifs in this set must be
    * reconstructed but may not be in this set, even if the parent is another
    * if.
    */
   struct set *reconstructed_ifs;

   /* Set of definitions that must be reconstructed in the preamble. This is a
    * subset of can_move instructions, determined after replacement.
    */
   BITSET_WORD *reconstructed_defs;

   nir_def *def;

   const nir_opt_preamble_options *options;
} opt_preamble_ctx;

static bool
instr_can_speculate(nir_instr *instr)
{
   /* Intrinsics with an ACCESS index can only be speculated if they are
    * explicitly CAN_SPECULATE.
    */
   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

      if (nir_intrinsic_has_access(intr))
         return nir_intrinsic_access(intr) & ACCESS_CAN_SPECULATE;
   }

   /* For now, everything else can be speculated. TODO: Bindless textures. */
   return true;
}

static float
get_instr_cost(nir_instr *instr, const nir_opt_preamble_options *options)
{
   /* No backend will want to hoist load_const or undef by itself, so handle
    * this for them.
    */
   if (instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_undef)
      return 0;

   return options->instr_cost_cb(instr, options->cb_data);
}

static bool
can_move_src(nir_src *src, void *state)
{
   opt_preamble_ctx *ctx = state;

   return ctx->states[src->ssa->index].can_move;
}

static bool
can_move_srcs(nir_instr *instr, opt_preamble_ctx *ctx)
{
   return nir_foreach_src(instr, can_move_src, ctx);
}

static bool
can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
{
   switch (instr->intrinsic) {
   /* Intrinsics which can always be moved */
   case nir_intrinsic_load_push_constant:
   case nir_intrinsic_load_work_dim:
   case nir_intrinsic_load_num_workgroups:
   case nir_intrinsic_load_ray_launch_size:
   case nir_intrinsic_load_sbt_base_amd:
   case nir_intrinsic_load_is_indexed_draw:
   case nir_intrinsic_load_viewport_scale:
   case nir_intrinsic_load_user_clip_plane:
   case nir_intrinsic_load_viewport_x_scale:
   case nir_intrinsic_load_viewport_y_scale:
   case nir_intrinsic_load_viewport_z_scale:
   case nir_intrinsic_load_viewport_offset:
   case nir_intrinsic_load_viewport_x_offset:
   case nir_intrinsic_load_viewport_y_offset:
   case nir_intrinsic_load_viewport_z_offset:
   case nir_intrinsic_load_blend_const_color_a_float:
   case nir_intrinsic_load_blend_const_color_b_float:
   case nir_intrinsic_load_blend_const_color_g_float:
   case nir_intrinsic_load_blend_const_color_r_float:
   case nir_intrinsic_load_blend_const_color_rgba:
   case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
   case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
   case nir_intrinsic_load_line_width:
   case nir_intrinsic_load_aa_line_width:
   case nir_intrinsic_load_fb_layers_v3d:
   case nir_intrinsic_load_tcs_num_patches_amd:
   case nir_intrinsic_load_sample_positions_pan:
   case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
   case nir_intrinsic_load_prim_gen_query_enabled_amd:
   case nir_intrinsic_load_prim_xfb_query_enabled_amd:
   case nir_intrinsic_load_clamp_vertex_color_amd:
   case nir_intrinsic_load_cull_front_face_enabled_amd:
   case nir_intrinsic_load_cull_back_face_enabled_amd:
   case nir_intrinsic_load_cull_ccw_amd:
   case nir_intrinsic_load_cull_small_primitives_enabled_amd:
   case nir_intrinsic_load_cull_any_enabled_amd:
   case nir_intrinsic_load_cull_small_prim_precision_amd:
   case nir_intrinsic_load_vbo_base_agx:
      return true;

   /* Intrinsics which can be moved depending on hardware */
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_first_vertex:
   case nir_intrinsic_load_draw_id:
      return ctx->options->drawid_uniform;

   case nir_intrinsic_load_subgroup_size:
   case nir_intrinsic_load_num_subgroups:
      return ctx->options->subgroup_size_uniform;

   case nir_intrinsic_load_workgroup_size:
      return ctx->options->load_workgroup_size_allowed;

   /* Intrinsics which can be moved if the sources can */
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_get_ubo_size:
   case nir_intrinsic_get_ssbo_size:
   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_load_deref:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_preamble:
   case nir_intrinsic_load_constant:
   case nir_intrinsic_load_sample_pos_from_id:
   case nir_intrinsic_load_kernel_input:
   case nir_intrinsic_load_buffer_amd:
   case nir_intrinsic_image_samples:
   case nir_intrinsic_image_deref_samples:
   case nir_intrinsic_bindless_image_samples:
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_deref_size:
   case nir_intrinsic_bindless_image_size:
   case nir_intrinsic_vulkan_resource_index:
   case nir_intrinsic_vulkan_resource_reindex:
   case nir_intrinsic_load_vulkan_descriptor:
   case nir_intrinsic_quad_swizzle_amd:
   case nir_intrinsic_masked_swizzle_amd:
   case nir_intrinsic_load_ssbo_address:
   case nir_intrinsic_bindless_resource_ir3:
   case nir_intrinsic_load_constant_agx:
      return can_move_srcs(&instr->instr, ctx);

   /* Image/SSBO loads can be moved if they are CAN_REORDER and their
    * sources can be moved.
    */
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_samples_identical:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ssbo_ir3:
      return (nir_intrinsic_access(instr) & ACCESS_CAN_REORDER) &&
             can_move_srcs(&instr->instr, ctx);

   default:
      return false;
   }
}

static bool
can_move_instr(nir_instr *instr, opt_preamble_ctx *ctx)
{
   /* If we are only contained within uniform control flow, no speculation is
    * needed since the control flow will be reconstructed in the preamble. But
    * if we are not, we must be able to speculate instructions to move them.
    */
   if (ctx->nonuniform_cf_nesting > 0 && !instr_can_speculate(instr))
      return false;

   switch (instr->type) {
   case nir_instr_type_tex: {
      nir_tex_instr *tex = nir_instr_as_tex(instr);
      /* See note below about derivatives. We have special code to convert tex
       * to txd, though, because it's a common case.
       */
      if (nir_tex_instr_has_implicit_derivative(tex) &&
          tex->op != nir_texop_tex) {
         return false;
      }
      return can_move_srcs(instr, ctx);
   }
   case nir_instr_type_alu: {
      /* The preamble is presumably run with only one thread, so we can't run
       * derivatives in it.
       * TODO: Replace derivatives with 0 instead, if real apps hit this.
       */
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      if (nir_op_is_derivative(alu->op))
         return false;
      else
         return can_move_srcs(instr, ctx);
   }
   case nir_instr_type_intrinsic:
      return can_move_intrinsic(nir_instr_as_intrinsic(instr), ctx);

   case nir_instr_type_load_const:
   case nir_instr_type_undef:
      return true;

   case nir_instr_type_deref: {
      nir_deref_instr *deref = nir_instr_as_deref(instr);
      if (deref->deref_type == nir_deref_type_var) {
         switch (deref->modes) {
         case nir_var_uniform:
         case nir_var_mem_ubo:
            return true;
         default:
            return false;
         }
      } else {
         return can_move_srcs(instr, ctx);
      }
   }

   /* We can only move phis if all of their sources are movable, and it is a phi
    * for an if-else that is itself movable.
    */
   case nir_instr_type_phi: {
      nir_cf_node *prev_node = nir_cf_node_prev(&instr->block->cf_node);
      if (!prev_node)
         return false;

      if (prev_node->type != nir_cf_node_if) {
         assert(prev_node->type == nir_cf_node_loop);
         return false;
      }

      nir_if *nif = nir_cf_node_as_if(prev_node);
      if (!can_move_src(&nif->condition, ctx))
         return false;

      return can_move_srcs(instr, ctx);
   }

   default:
      return false;
   }
}

/* True if we should avoid making this a candidate. This is only called on
 * instructions we already determined we can move; it just makes it so that
 * uses of this instruction cannot be rewritten. Typically this happens
 * because of static constraints on the IR, for example some deref chains
 * cannot be broken.
 */
static bool
avoid_instr(nir_instr *instr, const nir_opt_preamble_options *options)
{
   if (instr->type == nir_instr_type_deref)
      return true;

   return options->avoid_instr_cb(instr, options->cb_data);
}

static bool
update_src_value(nir_src *src, void *data)
{
   opt_preamble_ctx *ctx = data;

   def_state *state = &ctx->states[ctx->def->index];
   def_state *src_state = &ctx->states[src->ssa->index];

   assert(src_state->can_move);

   /* If an instruction has can_move and non-can_move users, it becomes a
    * candidate and its value shouldn't propagate downwards. For example,
    * imagine a chain like this:
    *
    *         -- F (cannot move)
    *        /
    *  A <-- B <-- C <-- D <-- E (cannot move)
    *
    * B and D are marked candidates. Picking B removes A and B, picking D
    * removes C and D, and picking both removes all 4. Therefore B and D are
    * independent and B's value shouldn't flow into D.
    *
    * A similar argument holds for must_stay values.
    */
   if (!src_state->must_stay && !src_state->candidate)
      state->value += src_state->value;
   return true;
}

static int
candidate_sort(const void *data1, const void *data2)
{
   const def_state *state1 = *(def_state **)data1;
   const def_state *state2 = *(def_state **)data2;

   float value1 = state1->value / state1->size;
   float value2 = state2->value / state2->size;
   if (value1 < value2)
      return 1;
   else if (value1 > value2)
      return -1;
   else
      return 0;
}

static bool
calculate_can_move_for_block(opt_preamble_ctx *ctx, nir_block *block)
{
   bool all_can_move = true;

   nir_foreach_instr(instr, block) {
      nir_def *def = nir_instr_def(instr);
      if (!def)
         continue;

      def_state *state = &ctx->states[def->index];
      state->can_move = can_move_instr(instr, ctx);
      all_can_move &= state->can_move;
   }

   return all_can_move;
}

static bool
calculate_can_move_for_cf_list(opt_preamble_ctx *ctx, struct exec_list *list)
{
   bool all_can_move = true;

   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block:
         all_can_move &=
            calculate_can_move_for_block(ctx, nir_cf_node_as_block(node));
         break;

      case nir_cf_node_if: {
         nir_if *nif = nir_cf_node_as_if(node);
         bool uniform = can_move_src(&nif->condition, ctx);

         if (!uniform)
            ctx->nonuniform_cf_nesting++;

         bool if_can_move = uniform;
         if_can_move &= calculate_can_move_for_cf_list(ctx, &nif->then_list);
         if_can_move &= calculate_can_move_for_cf_list(ctx, &nif->else_list);

         if (!uniform)
            ctx->nonuniform_cf_nesting--;

         all_can_move &= if_can_move;
         break;
      }

      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);

         /* Conservatively treat loops like conditional control flow, since an
          * instruction might be conditionally unreachable due to an earlier
          * break in a loop that executes only one iteration.
          */
         ctx->nonuniform_cf_nesting++;
         calculate_can_move_for_cf_list(ctx, &loop->body);
         ctx->nonuniform_cf_nesting--;
         all_can_move = false;
         break;
      }

      default:
         unreachable("Unexpected CF node type");
      }
   }

   return all_can_move;
}

static void
replace_for_block(nir_builder *b, opt_preamble_ctx *ctx,
                  struct hash_table *remap_table, nir_block *block)
{
   nir_foreach_instr(instr, block) {
      nir_def *def = nir_instr_def(instr);
      if (!def)
         continue;

      /* Only replace what we actually need. This is a micro-optimization for
       * compile-time performance of regular instructions, but it's required for
       * correctness with phi nodes, since we might not reconstruct the
       * corresponding if.
       */
      if (!BITSET_TEST(ctx->reconstructed_defs, def->index))
         continue;

      def_state *state = &ctx->states[def->index];
      assert(state->can_move && "reconstructed => can_move");

      nir_instr *clone;

      if (instr->type == nir_instr_type_phi) {
         /* Phis are special since they can't be cloned with nir_instr_clone */
         nir_phi_instr *phi = nir_instr_as_phi(instr);

         nir_cf_node *nif_cf = nir_cf_node_prev(&block->cf_node);
         assert(nif_cf->type == nir_cf_node_if && "only if's are moveable");
         nir_if *nif = nir_cf_node_as_if(nif_cf);

         nir_block *then_block = nir_if_last_then_block(nif);
         nir_block *else_block = nir_if_last_else_block(nif);

         nir_def *then_def = NULL, *else_def = NULL;

         nir_foreach_phi_src(phi_src, phi) {
            if (phi_src->pred == then_block) {
               assert(then_def == NULL);
               then_def = phi_src->src.ssa;
            } else if (phi_src->pred == else_block) {
               assert(else_def == NULL);
               else_def = phi_src->src.ssa;
            } else {
               unreachable("Invalid predecessor for phi of if");
            }
         }

         assert(exec_list_length(&phi->srcs) == 2 && "only if's are movable");
         assert(then_def && else_def && "all sources seen");

         /* Remap */
         then_def = _mesa_hash_table_search(remap_table, then_def)->data;
         else_def = _mesa_hash_table_search(remap_table, else_def)->data;

         b->cursor =
            nir_before_block_after_phis(nir_cursor_current_block(b->cursor));

         nir_def *repl = nir_if_phi(b, then_def, else_def);
         clone = repl->parent_instr;

         _mesa_hash_table_insert(remap_table, &phi->def, repl);
      } else {
         clone = nir_instr_clone_deep(b->shader, instr, remap_table);
         nir_builder_instr_insert(b, clone);
      }

      if (clone->type == nir_instr_type_tex) {
         nir_tex_instr *tex = nir_instr_as_tex(clone);
         if (tex->op == nir_texop_tex) {
            /* For maximum compatibility, replace normal textures with
             * textureGrad with a gradient of 0.
             * TODO: Handle txb somehow.
             */
            b->cursor = nir_before_instr(clone);

            nir_def *zero =
               nir_imm_zero(b, tex->coord_components - tex->is_array, 32);
            nir_tex_instr_add_src(tex, nir_tex_src_ddx, zero);
            nir_tex_instr_add_src(tex, nir_tex_src_ddy, zero);
            tex->op = nir_texop_txd;

            b->cursor = nir_after_instr(clone);
         }
      }

      if (state->replace) {
         nir_def *clone_def = nir_instr_def(clone);
         nir_store_preamble(b, clone_def, .base = state->offset);
      }
   }
}

static void
replace_for_cf_list(nir_builder *b, opt_preamble_ctx *ctx,
                    struct hash_table *remap_table, struct exec_list *list)
{
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block: {
         replace_for_block(b, ctx, remap_table, nir_cf_node_as_block(node));
         break;
      }

      case nir_cf_node_if: {
         nir_if *nif = nir_cf_node_as_if(node);

         /* If we moved something that requires reconstructing the if, do so */
         if (_mesa_set_search(ctx->reconstructed_ifs, nif)) {
            assert(can_move_src(&nif->condition, ctx));

            struct hash_entry *entry =
               _mesa_hash_table_search(remap_table, nif->condition.ssa);
            assert(entry != NULL && "can_move condition, def dominates use");
            nir_def *remap_cond = entry->data;

            nir_if *reconstructed_nif = NULL;
            reconstructed_nif = nir_push_if(b, remap_cond);

            b->cursor = nir_before_cf_list(&reconstructed_nif->then_list);
            replace_for_cf_list(b, ctx, remap_table, &nif->then_list);

            b->cursor = nir_before_cf_list(&reconstructed_nif->else_list);
            replace_for_cf_list(b, ctx, remap_table, &nif->else_list);

            nir_pop_if(b, reconstructed_nif);
            b->cursor = nir_after_cf_node(&reconstructed_nif->cf_node);
         } else {
            replace_for_cf_list(b, ctx, remap_table, &nif->then_list);
            replace_for_cf_list(b, ctx, remap_table, &nif->else_list);
         }

         break;
      }

      case nir_cf_node_loop: {
         /* We don't try to reconstruct loops */
         nir_loop *loop = nir_cf_node_as_loop(node);
         replace_for_cf_list(b, ctx, remap_table, &loop->body);
         break;
      }

      default:
         unreachable("Unexpected CF node type");
      }
   }
}

/*
 * If an if-statement contains an instruction that cannot be speculated, the
 * if-statement must be reconstructed so we avoid the speculation. This applies
 * even for nested if-statements. Determine which if-statements must be
 * reconstructed for this reason by walking the program forward and looking
 * inside uniform if's.
 *
 * Returns whether the CF list contains a reconstructed instruction that would
 * otherwise be speculated, updating the reconstructed_ifs set. This depends on
 * reconstructed_defs being correctly set by analyze_reconstructed.
 */
static bool
analyze_speculation_for_cf_list(opt_preamble_ctx *ctx, struct exec_list *list)
{
   bool reconstruct_cf_list = false;

   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_foreach_instr(instr, nir_cf_node_as_block(node)) {
            nir_def *def = nir_instr_def(instr);
            if (!def)
               continue;

            if (!BITSET_TEST(ctx->reconstructed_defs, def->index))
               continue;

            if (!instr_can_speculate(instr)) {
               reconstruct_cf_list = true;
               break;
            }
         }

         break;
      }

      case nir_cf_node_if: {
         nir_if *nif = nir_cf_node_as_if(node);

         /* If we can move the if, we might need to reconstruct */
         if (can_move_src(&nif->condition, ctx)) {
            bool any = false;
            any |= analyze_speculation_for_cf_list(ctx, &nif->then_list);
            any |= analyze_speculation_for_cf_list(ctx, &nif->else_list);

            if (any)
               _mesa_set_add(ctx->reconstructed_ifs, nif);

            reconstruct_cf_list |= any;
         }

         break;
      }

      /* We don't reconstruct loops */
      default:
         break;
      }
   }

   return reconstruct_cf_list;
}

static bool
mark_reconstructed(nir_src *src, void *state)
{
   BITSET_WORD *reconstructed_defs = state;
   BITSET_SET(reconstructed_defs, src->ssa->index);
   return true;
}

/*
 * If a phi is moved into the preamble, then the if it depends on must also be
 * moved. However, it is not necessary to consider any nested control flow. As
 * an example, if we have a shader:
 *
 *    if (not moveable condition) {
 *       if (moveable condition) {
 *          x = moveable
 *       }
 *       y = phi x, moveable
 *       z = floor y
 *    }
 *
 * Then if 'z' is in the replace set, we need to reconstruct the inner if, but
 * not the outer if, unless there's also speculation to worry about.
 *
 * We do this by marking defs that need to be reconstructed, with a backwards
 * sweep of the program (compatible with reverse dominance), and marking the
 * if's preceding reconstructed phis.
 */
static void
analyze_reconstructed(opt_preamble_ctx *ctx, nir_function_impl *impl)
{
   nir_foreach_block_reverse(block, impl) {
      /* If an if-statement is reconstructed, its condition must be as well */
      nir_if *nif = nir_block_get_following_if(block);
      if (nif && _mesa_set_search(ctx->reconstructed_ifs, nif))
         BITSET_SET(ctx->reconstructed_defs, nif->condition.ssa->index);

      nir_foreach_instr_reverse(instr, block) {
         nir_def *def = nir_instr_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx->states[def->index];

         /* Anything that's replaced must be reconstructed */
         if (state->replace)
            BITSET_SET(ctx->reconstructed_defs, def->index);
         else if (!BITSET_TEST(ctx->reconstructed_defs, def->index))
            continue;

         /* If it must be reconstructed, it better be moveable */
         assert(state->can_move);

         /* Anything that depends on something reconstructed is reconstructed */
         nir_foreach_src(instr, mark_reconstructed, ctx->reconstructed_defs);

         /* Reconstructed phis need their ifs reconstructed */
         if (instr->type == nir_instr_type_phi) {
            nir_cf_node *prev_node = nir_cf_node_prev(&instr->block->cf_node);

            /* Invariants guaranteed by can_move_instr */
            assert(prev_node != NULL);
            assert(prev_node->type == nir_cf_node_if);

            nir_if *nif = nir_cf_node_as_if(prev_node);
            assert(can_move_src(&nif->condition, ctx));

            /* Mark the if for reconstruction */
            _mesa_set_add(ctx->reconstructed_ifs, nif);
         }
      }
   }
}

bool
nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
                 unsigned *size)
{
   opt_preamble_ctx ctx = {
      .options = options,
   };

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   ctx.states = calloc(impl->ssa_alloc, sizeof(*ctx.states));

   /* Step 1: Calculate can_move */
   calculate_can_move_for_cf_list(&ctx, &impl->body);

   /* Step 2: Calculate is_candidate. This is complicated by the presence of
    * non-candidate instructions like derefs whose users cannot be rewritten.
    * If a deref chain is used at all by a non-can_move thing, then any offset
    * sources anywhere along the chain should be considered candidates because
    * the entire deref chain will never be deleted, but if it's only used by
    * can_move things then it becomes subsumed by its users and none of the
    * offset sources should be considered candidates as they will be removed
    * when the users of the deref chain are moved. We need to replace "are
    * there any non-can_move users" with "are there any non-can_move users,
    * *recursing through non-candidate users*". We do this by walking backward
    * and marking when a non-candidate instruction must stay in the final
    * program because it has a non-can_move user, including recursively.
    */
   unsigned num_candidates = 0;
   nir_foreach_block_reverse(block, impl) {
      nir_foreach_instr_reverse(instr, block) {
         nir_def *def = nir_instr_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move)
            continue;

         state->value = get_instr_cost(instr, options);
         bool is_candidate = !avoid_instr(instr, options);
         state->candidate = false;
         state->must_stay = false;
         nir_foreach_use_including_if(use, def) {
            bool is_can_move_user;

            if (nir_src_is_if(use)) {
               is_can_move_user = false;
            } else {
               nir_def *use_def = nir_instr_def(nir_src_parent_instr(use));
               is_can_move_user = use_def != NULL &&
                                  ctx.states[use_def->index].can_move &&
                                  !ctx.states[use_def->index].must_stay;
            }

            if (is_can_move_user) {
               state->can_move_users++;
            } else {
               if (is_candidate)
                  state->candidate = true;
               else
                  state->must_stay = true;
            }
         }

         if (state->candidate)
            num_candidates++;
      }
   }

   if (num_candidates == 0) {
      free(ctx.states);
      return false;
   }

   def_state **candidates = malloc(sizeof(*candidates) * num_candidates);
   unsigned candidate_idx = 0;
   unsigned total_size = 0;

   /* Step 3: Calculate value of candidates by propagating downwards. We try
    * to share the value amongst can_move uses, in case there are multiple.
    * This won't always find the optimal solution, but is hopefully a
    * good heuristic.
    *
    * Note that we use the can_move adjusted in the last pass, because if a
    * can_move instruction cannot be moved because it's not a candidate and it
    * has a non-can_move source then we don't want to count it as a use.
    *
    * While we're here, also collect an array of candidates.
    */
   nir_foreach_block(block, impl) {
      nir_foreach_instr(instr, block) {
         nir_def *def = nir_instr_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move || state->must_stay)
            continue;

         ctx.def = def;
         nir_foreach_src(instr, update_src_value, &ctx);

         /* If this instruction is a candidate, its value shouldn't be
          * propagated so we skip dividing it.
          *
          * Note: if it's can_move but not a candidate, then all its users
          * must be can_move, so if there are no users then it must be dead.
          */
         if (!state->candidate && !state->must_stay) {
            if (state->can_move_users > 0)
               state->value /= state->can_move_users;
            else
               state->value = 0;
         }

         if (state->candidate) {
            state->benefit = state->value -
                             options->rewrite_cost_cb(def, options->cb_data);

            if (state->benefit > 0) {
               options->def_size(def, &state->size, &state->align);
               total_size = ALIGN_POT(total_size, state->align);
               total_size += state->size;
               candidates[candidate_idx++] = state;
            }
         }
      }
   }

   assert(candidate_idx <= num_candidates);
   num_candidates = candidate_idx;

   if (num_candidates == 0) {
      free(ctx.states);
      free(candidates);
      return false;
   }

   /* Step 4: Figure out which candidates we're going to replace and assign an
    * offset. Assuming there is no expression sharing, this is similar to the
    * 0-1 knapsack problem, except when there is a gap introduced by
    * alignment. We use a well-known greedy approximation, sorting by value
    * divided by size.
    */
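   /* Illustrative example (hypothetical numbers): a candidate with value 6.0
    * and size 4 has density 1.5, while one with value 2.0 and size 1 has
    * density 2.0, so the smaller candidate is placed first even though its
    * absolute value is larger in the first case.
    */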

   if (((*size) + total_size) > options->preamble_storage_size) {
      qsort(candidates, num_candidates, sizeof(*candidates), candidate_sort);
   }

   unsigned offset = *size;
   for (unsigned i = 0; i < num_candidates; i++) {
      def_state *state = candidates[i];
      offset = ALIGN_POT(offset, state->align);

      if (offset + state->size > options->preamble_storage_size)
         break;

      state->replace = true;
      state->offset = offset;

      offset += state->size;
   }

   *size = offset;

   free(candidates);

   /* Determine which if's need to be reconstructed, based on the replacements
    * we did.
    */
   ctx.reconstructed_ifs = _mesa_pointer_set_create(NULL);
   ctx.reconstructed_defs = calloc(BITSET_WORDS(impl->ssa_alloc),
                                   sizeof(BITSET_WORD));
   analyze_reconstructed(&ctx, impl);

   /* If we make progress analyzing speculation, we need to re-analyze
    * reconstructed defs to get the if-conditions in there.
    */
   if (analyze_speculation_for_cf_list(&ctx, &impl->body))
      analyze_reconstructed(&ctx, impl);

   /* Step 5: Actually do the replacement. */
   struct hash_table *remap_table =
      _mesa_pointer_hash_table_create(NULL);
   nir_function_impl *preamble =
      nir_shader_get_preamble(impl->function->shader);
   nir_builder preamble_builder = nir_builder_at(nir_before_impl(preamble));
   nir_builder *b = &preamble_builder;

   replace_for_cf_list(b, &ctx, remap_table, &impl->body);

   nir_builder builder = nir_builder_create(impl);
   b = &builder;

   unsigned max_index = impl->ssa_alloc;
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         nir_def *def = nir_instr_def(instr);
         if (!def)
            continue;

         /* Ignore new load_preamble instructions */
         if (def->index >= max_index)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->replace)
            continue;

         b->cursor = nir_after_instr_and_phis(instr);

         nir_def *new_def =
            nir_load_preamble(b, def->num_components, def->bit_size,
                              .base = state->offset);

         nir_def_rewrite_uses(def, new_def);
         nir_instr_free_and_dce(instr);
      }
   }

   nir_metadata_preserve(impl,
                         nir_metadata_block_index |
                            nir_metadata_dominance);

   ralloc_free(remap_table);
   free(ctx.states);
   free(ctx.reconstructed_defs);
   _mesa_set_destroy(ctx.reconstructed_ifs, NULL);
   return true;
}