/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"

/* This pass provides a way to move computations that are always the same for
 * an entire draw/compute dispatch into a "preamble" that runs before the main
 * entrypoint.
 *
 * We also expose a separate API to get or construct the preamble of a shader
 * in case backends want to insert their own code.
 */
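
/* A rough usage sketch (the option values and the backend_* callbacks below
 * are hypothetical, not taken from any particular driver):
 *
 *    unsigned preamble_size = 0;
 *    const nir_opt_preamble_options opts = {
 *       .drawid_uniform = true,
 *       .subgroup_size_uniform = true,
 *       .def_size = backend_def_size,
 *       .preamble_storage_size = 128,
 *       .instr_cost_cb = backend_instr_cost,
 *       .avoid_instr_cb = backend_avoid_instr,
 *       .rewrite_cost_cb = backend_rewrite_cost,
 *       .cb_data = backend,
 *    };
 *    if (nir_opt_preamble(shader, &opts, &preamble_size)) {
 *       // Lower load_preamble/store_preamble intrinsics to the backend's
 *       // uniform registers, sized by preamble_size.
 *    }
 */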

nir_function_impl *
nir_shader_get_preamble(nir_shader *shader)
{
   nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader);
   if (entrypoint->preamble) {
      return entrypoint->preamble->impl;
   } else {
      nir_function *preamble = nir_function_create(shader, "@preamble");
      preamble->is_preamble = true;
      nir_function_impl *impl = nir_function_impl_create(preamble);
      entrypoint->preamble = preamble;
      return impl;
   }
}
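
/* For example, a backend that wants to emit its own setup code into the
 * preamble (a sketch, not a required pattern) might do:
 *
 *    nir_function_impl *pre = nir_shader_get_preamble(shader);
 *    nir_builder b;
 *    nir_builder_init(&b, pre);
 *    b.cursor = nir_after_cf_list(&pre->body);
 *    ...emit instructions with &b...
 *
 * Calling nir_shader_get_preamble() again returns the same impl, so callers
 * can append to a shared preamble.
 */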

typedef struct {
   bool can_move;
   bool candidate;
   bool must_stay;
   bool replace;

   unsigned can_move_users;

   unsigned size, align;

   unsigned offset;

   /* Average the cost of a value among its users, to try to account for
    * values that have multiple can_move uses.
    */
   float value;

   /* Overall benefit, i.e. the value minus any cost of inserting
    * load_preamble.
    */
   float benefit;
} def_state;

typedef struct {
   /* Per-definition array of states */
   def_state *states;

   /* The def whose sources are currently being visited by
    * update_src_value().
    */
   nir_ssa_def *def;

   const nir_opt_preamble_options *options;
} opt_preamble_ctx;

static float
get_instr_cost(nir_instr *instr, const nir_opt_preamble_options *options)
{
   /* No backend will want to hoist load_const or undef by itself, so handle
    * this for them.
    */
   if (instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_ssa_undef)
      return 0;

   return options->instr_cost_cb(instr, options->cb_data);
}

static bool
can_move_src(nir_src *src, void *state)
{
   opt_preamble_ctx *ctx = state;

   assert(src->is_ssa);
   return ctx->states[src->ssa->index].can_move;
}

static bool
can_move_srcs(nir_instr *instr, opt_preamble_ctx *ctx)
{
   return nir_foreach_src(instr, can_move_src, ctx);
}

static bool
can_move_intrinsic(nir_intrinsic_instr *instr, opt_preamble_ctx *ctx)
{
   switch (instr->intrinsic) {
   /* Intrinsics which can always be moved */
   case nir_intrinsic_load_push_constant:
   case nir_intrinsic_load_work_dim:
   case nir_intrinsic_load_num_workgroups:
   case nir_intrinsic_load_workgroup_size:
   case nir_intrinsic_load_ray_launch_size:
   case nir_intrinsic_load_ray_launch_size_addr_amd:
   case nir_intrinsic_load_sbt_base_amd:
   case nir_intrinsic_load_is_indexed_draw:
   case nir_intrinsic_load_viewport_scale:
   case nir_intrinsic_load_user_clip_plane:
   case nir_intrinsic_load_viewport_x_scale:
   case nir_intrinsic_load_viewport_y_scale:
   case nir_intrinsic_load_viewport_z_scale:
   case nir_intrinsic_load_viewport_offset:
   case nir_intrinsic_load_viewport_x_offset:
   case nir_intrinsic_load_viewport_y_offset:
   case nir_intrinsic_load_viewport_z_offset:
   case nir_intrinsic_load_blend_const_color_a_float:
   case nir_intrinsic_load_blend_const_color_b_float:
   case nir_intrinsic_load_blend_const_color_g_float:
   case nir_intrinsic_load_blend_const_color_r_float:
   case nir_intrinsic_load_blend_const_color_rgba:
   case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
   case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
   case nir_intrinsic_load_line_width:
   case nir_intrinsic_load_aa_line_width:
   case nir_intrinsic_load_fb_layers_v3d:
   case nir_intrinsic_load_tcs_num_patches_amd:
   case nir_intrinsic_load_sample_positions_pan:
   case nir_intrinsic_load_shader_query_enabled_amd:
   case nir_intrinsic_load_cull_front_face_enabled_amd:
   case nir_intrinsic_load_cull_back_face_enabled_amd:
   case nir_intrinsic_load_cull_ccw_amd:
   case nir_intrinsic_load_cull_small_primitives_enabled_amd:
   case nir_intrinsic_load_cull_any_enabled_amd:
   case nir_intrinsic_load_cull_small_prim_precision_amd:
      return true;

   /* Intrinsics which can be moved depending on hardware */
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_first_vertex:
   case nir_intrinsic_load_draw_id:
      return ctx->options->drawid_uniform;

   case nir_intrinsic_load_subgroup_size:
   case nir_intrinsic_load_num_subgroups:
      return ctx->options->subgroup_size_uniform;

   /* Intrinsics which can be moved if the sources can */
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_get_ubo_size:
   case nir_intrinsic_get_ssbo_size:
   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_load_deref:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_constant:
   case nir_intrinsic_load_sample_pos_from_id:
   case nir_intrinsic_load_kernel_input:
   case nir_intrinsic_load_buffer_amd:
   case nir_intrinsic_image_samples:
   case nir_intrinsic_image_deref_samples:
   case nir_intrinsic_bindless_image_samples:
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_deref_size:
   case nir_intrinsic_bindless_image_size:
   case nir_intrinsic_vulkan_resource_index:
   case nir_intrinsic_vulkan_resource_reindex:
   case nir_intrinsic_load_vulkan_descriptor:
   case nir_intrinsic_quad_swizzle_amd:
   case nir_intrinsic_masked_swizzle_amd:
   case nir_intrinsic_load_ssbo_address:
   case nir_intrinsic_bindless_resource_ir3:
      return can_move_srcs(&instr->instr, ctx);

   /* Image/SSBO loads can be moved if they are CAN_REORDER and their
    * sources can be moved.
    */
   case nir_intrinsic_image_load:
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ssbo_ir3:
      return (nir_intrinsic_access(instr) & ACCESS_CAN_REORDER) &&
         can_move_srcs(&instr->instr, ctx);

   default:
      return false;
   }
}

static bool
can_move_instr(nir_instr *instr, opt_preamble_ctx *ctx)
{
   switch (instr->type) {
   case nir_instr_type_tex: {
      nir_tex_instr *tex = nir_instr_as_tex(instr);
      /* See the note on derivatives in the ALU case below. We have special
       * code to convert tex to txd, though, because it's a common case.
       */
      if (nir_tex_instr_has_implicit_derivative(tex) &&
          tex->op != nir_texop_tex) {
         return false;
      }
      return can_move_srcs(instr, ctx);
   }
   case nir_instr_type_alu: {
      /* The preamble is presumably run with only one thread, so we can't run
       * derivatives in it.
       * TODO: Replace derivatives with 0 instead, if real apps hit this.
       */
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      switch (alu->op) {
      case nir_op_fddx:
      case nir_op_fddy:
      case nir_op_fddx_fine:
      case nir_op_fddy_fine:
      case nir_op_fddx_coarse:
      case nir_op_fddy_coarse:
         return false;
      default:
         return can_move_srcs(instr, ctx);
      }
   }
   case nir_instr_type_intrinsic:
      return can_move_intrinsic(nir_instr_as_intrinsic(instr), ctx);

   case nir_instr_type_load_const:
   case nir_instr_type_ssa_undef:
      return true;

   case nir_instr_type_deref: {
      nir_deref_instr *deref = nir_instr_as_deref(instr);
      if (deref->deref_type == nir_deref_type_var) {
         switch (deref->modes) {
         case nir_var_uniform:
         case nir_var_mem_ubo:
            return true;
         default:
            return false;
         }
      } else {
         return can_move_srcs(instr, ctx);
      }
   }

   case nir_instr_type_phi:
      /* TODO: we could move an if-statement if everything inside it is
       * moveable.
       */
      return false;

   default:
      return false;
   }
}

/* True if we should avoid making this a candidate. This is only called on
 * instructions we have already determined we can move; it just makes it so
 * that uses of this instruction cannot be rewritten. Typically this happens
 * because of static constraints on the IR, for example some deref chains
 * cannot be broken.
 */
static bool
avoid_instr(nir_instr *instr, const nir_opt_preamble_options *options)
{
   if (instr->type == nir_instr_type_deref)
      return true;

   return options->avoid_instr_cb(instr, options->cb_data);
}

static bool
update_src_value(nir_src *src, void *data)
{
   opt_preamble_ctx *ctx = data;

   def_state *state = &ctx->states[ctx->def->index];
   def_state *src_state = &ctx->states[src->ssa->index];

   assert(src_state->can_move);

   /* If an instruction has can_move and non-can_move users, it becomes a
    * candidate and its value shouldn't propagate downwards. For example,
    * imagine a chain like this:
    *
    *         -- F (cannot move)
    *        /
    *  A <-- B <-- C <-- D <-- E (cannot move)
    *
    * B and D are marked candidates. Picking B removes A and B, picking D
    * removes C and D, and picking both removes all 4. Therefore B and D are
    * independent and B's value shouldn't flow into D.
    *
    * A similar argument holds for must_stay values.
    */
   if (!src_state->must_stay && !src_state->candidate)
      state->value += src_state->value;
   return true;
}

/* Sort candidates in descending order of value per unit of size, for the
 * greedy knapsack approximation in Step 4 below.
 */
static int
candidate_sort(const void *data1, const void *data2)
{
   const def_state *state1 = *(def_state **)data1;
   const def_state *state2 = *(def_state **)data2;

   float value1 = state1->value / state1->size;
   float value2 = state2->value / state2->size;
   if (value1 < value2)
      return 1;
   else if (value1 > value2)
      return -1;
   else
      return 0;
}

bool
nir_opt_preamble(nir_shader *shader, const nir_opt_preamble_options *options,
                 unsigned *size)
{
   opt_preamble_ctx ctx = {
      .options = options,
   };

   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   ctx.states = calloc(impl->ssa_alloc, sizeof(*ctx.states));

   /* Step 1: Calculate can_move */
   nir_foreach_block (block, impl) {
      nir_foreach_instr (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];

         state->can_move = can_move_instr(instr, &ctx);
      }
   }

   /* Step 2: Calculate is_candidate. This is complicated by the presence of
    * non-candidate instructions like derefs whose users cannot be rewritten.
    * If a deref chain is used at all by a non-can_move thing, then any offset
    * sources anywhere along the chain should be considered candidates,
    * because the entire deref chain will never be deleted. But if the chain
    * is only used by can_move things, then it is subsumed by its users and
    * none of the offset sources should be considered candidates, as they
    * will be removed when the users of the deref chain are moved. In other
    * words, the test "are there any non-can_move users" must become "are
    * there any non-can_move users, *recursing through non-candidate users*".
    * We implement this by walking backward and marking when a non-candidate
    * instruction must stay in the final program because it has a
    * non-can_move user, including recursively.
    */
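
   /* A concrete (made-up) example of the above: given a chain like
    *
    *    img  = deref_var(image)       <-- can_move, never a candidate
    *    elem = deref_array(img, i)    <-- can_move, never a candidate
    *    image_deref_store(elem, ...)  <-- cannot move
    *
    * the store forces elem, and then img, to be marked must_stay, which in
    * turn makes the index i a candidate: the chain stays in the program no
    * matter what, so hoisting i is still worthwhile. Without the store, the
    * chain's only users would be can_move and i would be subsumed by them.
    */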
   unsigned num_candidates = 0;
   nir_foreach_block_reverse (block, impl) {
      nir_foreach_instr_reverse (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move)
            continue;

         state->value = get_instr_cost(instr, options);
         bool is_candidate = !avoid_instr(instr, options);
         state->candidate = false;
         state->must_stay = false;
         nir_foreach_use (use, def) {
            nir_ssa_def *use_def = nir_instr_ssa_def(use->parent_instr);
            if (!use_def || !ctx.states[use_def->index].can_move ||
                ctx.states[use_def->index].must_stay) {
               if (is_candidate)
                  state->candidate = true;
               else
                  state->must_stay = true;
            } else {
               state->can_move_users++;
            }
         }

         nir_foreach_if_use (use, def) {
            if (is_candidate)
               state->candidate = true;
            else
               state->must_stay = true;
            break;
         }

         if (state->candidate)
            num_candidates++;
      }
   }

   if (num_candidates == 0) {
      *size = 0;
      free(ctx.states);
      return false;
   }

   def_state **candidates = malloc(sizeof(*candidates) * num_candidates);
   unsigned candidate_idx = 0;
   unsigned total_size = 0;

   /* Step 3: Calculate the value of each candidate by propagating downwards.
    * We try to share the value amongst can_move uses, in case there are
    * multiple. This won't always find the optimal solution, but is hopefully
    * a good heuristic.
    *
    * Note that we use the can_move_users count computed in the last pass:
    * if a can_move instruction must stay because it is not a candidate and
    * has a non-can_move user, we don't want to count it as a use.
    *
    * While we're here, also collect an array of candidates.
    */
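
   /* For example (invented numbers): if a non-candidate def has accumulated
    * a value of 6.0 and has three can_move users, each user inherits 2.0
    * through update_src_value(), rather than all three being credited the
    * full 6.0.
    */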
   nir_foreach_block (block, impl) {
      nir_foreach_instr (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move || state->must_stay)
            continue;

         ctx.def = def;
         nir_foreach_src(instr, update_src_value, &ctx);

         /* If this instruction is a candidate, its value shouldn't be
          * propagated so we skip dividing it.
          *
          * Note: if it's can_move but not a candidate, then all its users
          * must be can_move, so if there are no users then it must be dead.
          */
         if (!state->candidate && !state->must_stay) {
            if (state->can_move_users > 0)
               state->value /= state->can_move_users;
            else
               state->value = 0;
         }

         if (state->candidate) {
            state->benefit = state->value -
               options->rewrite_cost_cb(def, options->cb_data);

            if (state->benefit > 0) {
               options->def_size(def, &state->size, &state->align);
               total_size = ALIGN_POT(total_size, state->align);
               total_size += state->size;
               candidates[candidate_idx++] = state;
            }
         }
      }
   }

   assert(candidate_idx <= num_candidates);
   num_candidates = candidate_idx;

   if (num_candidates == 0) {
      *size = 0;
      free(ctx.states);
      free(candidates);
      return false;
   }

   /* Step 4: Figure out which candidates we're going to replace and assign
    * an offset to each. Assuming there is no expression sharing, this is
    * similar to the 0-1 knapsack problem, except that alignment can
    * introduce gaps. We use a well-known greedy approximation: sort the
    * candidates by value divided by size.
    */
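
   /* A worked example (invented numbers, all with align 1): with 8 units of
    * storage and candidates A (value 10, size 4), B (value 6, size 2) and
    * C (value 4, size 4), sorting by value/size orders them B (3.0), A (2.5),
    * C (1.0). B and A are placed at offsets 0 and 2, filling 6 units, and C
    * is skipped because 6 + 4 exceeds the limit.
    */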

   if (total_size > options->preamble_storage_size) {
      qsort(candidates, num_candidates, sizeof(*candidates), candidate_sort);
   }

   unsigned offset = 0;
   for (unsigned i = 0; i < num_candidates; i++) {
      def_state *state = candidates[i];
      offset = ALIGN_POT(offset, state->align);

      if (offset + state->size > options->preamble_storage_size)
         break;

      state->replace = true;
      state->offset = offset;

      offset += state->size;
   }

   *size = offset;

   free(candidates);

   /* Step 5: Actually do the replacement. */
   struct hash_table *remap_table =
      _mesa_pointer_hash_table_create(NULL);
   nir_function_impl *preamble =
      nir_shader_get_preamble(impl->function->shader);
   nir_builder _b;
   nir_builder *b = &_b;
   nir_builder_init(b, preamble);
   b->cursor = nir_before_cf_list(&preamble->body);

   nir_foreach_block (block, impl) {
      nir_foreach_instr (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->can_move)
            continue;

         nir_instr *clone = nir_instr_clone_deep(impl->function->shader,
                                                 instr, remap_table);

         nir_builder_instr_insert(b, clone);

         if (clone->type == nir_instr_type_tex) {
            nir_tex_instr *tex = nir_instr_as_tex(clone);
            if (tex->op == nir_texop_tex) {
               /* For maximum compatibility, replace normal textures with
                * textureGrad with a gradient of 0.
                * TODO: Handle txb somehow.
                */
               b->cursor = nir_before_instr(clone);

               nir_ssa_def *zero =
                  nir_imm_zero(b, tex->coord_components - tex->is_array, 32);
               nir_tex_instr_add_src(tex, nir_tex_src_ddx, nir_src_for_ssa(zero));
               nir_tex_instr_add_src(tex, nir_tex_src_ddy, nir_src_for_ssa(zero));
               tex->op = nir_texop_txd;

               b->cursor = nir_after_instr(clone);
            }
         }

         if (state->replace) {
            nir_ssa_def *clone_def = nir_instr_ssa_def(clone);
            nir_store_preamble(b, clone_def, .base = state->offset);
         }
      }
   }

   nir_builder_init(b, impl);

   nir_foreach_block (block, impl) {
      nir_foreach_instr_safe (instr, block) {
         nir_ssa_def *def = nir_instr_ssa_def(instr);
         if (!def)
            continue;

         def_state *state = &ctx.states[def->index];
         if (!state->replace)
            continue;

         b->cursor = nir_before_instr(instr);

         nir_ssa_def *new_def =
            nir_load_preamble(b, def->num_components, def->bit_size,
                              .base = state->offset);

         nir_ssa_def_rewrite_uses(def, new_def);
         nir_instr_free_and_dce(instr);
      }
   }

   nir_metadata_preserve(impl,
                         nir_metadata_block_index |
                         nir_metadata_dominance);

   ralloc_free(remap_table);
   free(ctx.states);
   return true;
}
598