/*
 * Copyright © 2019 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"

/** @file
 *
 * Implements basic-block-level prepass instruction scheduling in NIR to
 * manage register pressure.
 *
 * This is based on the Goodman/Hsu paper (1988, cached copy at
 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf).  We
 * make up the DDG for NIR (which can be mostly done using the NIR def/use
 * chains for SSA instructions, plus some edges for ordering register writes
 * vs reads, and some more for ordering intrinsics).  Then we pick heads off
 * of the DDG using their heuristic to emit the NIR instructions back into the
 * block in their new order.
 *
 * The hard case for prepass scheduling on GPUs seems to always be consuming
 * texture/ubo results.  The register pressure heuristic doesn't want to pick
 * an instr that starts consuming texture results because it usually won't be
 * the only usage, so that instruction will increase pressure.
 *
 * If you try to force consumption of tex results always, then in a case where
 * a single sample is used for many outputs, you'll end up picking every other
 * user and expanding register pressure.  The partially_evaluated_path flag
 * helps tremendously, in that if you happen for whatever reason to pick a
 * texture sample's output, then you'll try to finish off that sample.  Future
 * work may include doing some local search before locking in a choice, to try
 * to more reliably find the case where just a few choices going against the
 * heuristic can manage to free the whole vector.
 */
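
/* Illustrative example (not part of the pass): suppose one vec4 tex result
 * feeds four separate ALU users.  Scheduling (bottom-up) any one of those
 * users marks the other three as partially_evaluated_path in
 * nir_schedule_mark_src_scheduled(), so the CSR heuristic will then prefer
 * to finish consuming that tex result before starting on another vector.
 */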

static bool debug;

/**
 * Represents a node in the DDG for a NIR instruction.
 */
typedef struct {
   struct dag_node dag; /* must be first for our u_dynarray_foreach */
   nir_instr *instr;
   bool partially_evaluated_path;

   /* Approximate estimate of the delay between starting this instruction and
    * its results being available.
    *
    * Accuracy is not too important, given that we're prepass scheduling here
    * and just trying to reduce excess dependencies introduced by a register
    * allocator by stretching out the live intervals of expensive
    * instructions.
    */
   uint32_t delay;

   /* Cost of the maximum-delay path from this node to the leaves. */
   uint32_t max_delay;

   /* scoreboard->time value when this instruction can be scheduled without
    * any stalls expected.
    */
   uint32_t ready_time;
} nir_schedule_node;

typedef struct {
   struct dag *dag;

   nir_shader *shader;

   /* Mapping from nir_register * or nir_ssa_def * to a struct set of
    * instructions remaining to be scheduled using the register.
    */
   struct hash_table *remaining_uses;

   /* Map from nir_instr to nir_schedule_node * */
   struct hash_table *instr_map;

   /* Set of nir_register * or nir_ssa_def * that have had any instruction
    * scheduled on them.
    */
   struct set *live_values;

   /* An abstract approximation of the number of nir_schedule_node->delay
    * units since the start of the shader.
    */
   uint32_t time;

   /* Number of channels currently used by the NIR instructions that have been
    * scheduled.
    */
   int pressure;

   /* Options specified by the backend */
   const nir_schedule_options *options;
} nir_schedule_scoreboard;

/* When walking the instructions in reverse, we use this flag to swap
 * before/after in add_dep().
 */
enum direction { F, R };

struct nir_schedule_class_dep {
   int klass;
   nir_schedule_node *node;
   struct nir_schedule_class_dep *next;
};

typedef struct {
   nir_schedule_scoreboard *scoreboard;

   /* Map from nir_register to nir_schedule_node * */
   struct hash_table *reg_map;

   /* Scheduler nodes for last instruction involved in some class of dependency.
    */
   nir_schedule_node *load_input;
   nir_schedule_node *store_shared;
   nir_schedule_node *unknown_intrinsic;
   nir_schedule_node *discard;
   nir_schedule_node *jump;

   struct nir_schedule_class_dep *class_deps;

   enum direction dir;
} nir_deps_state;

static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
   if (!entry)
      return NULL;
   return entry->data;
}

static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
   return _mesa_hash_table_search_data(instr_map, instr);
}

static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
   if (src->is_ssa) {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
   } else {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses,
                                          src->reg.reg);
   }
}

static int
nir_schedule_def_pressure(nir_ssa_def *def)
{
   return def->num_components;
}

static int
nir_schedule_src_pressure(nir_src *src)
{
   if (src->is_ssa)
      return nir_schedule_def_pressure(src->ssa);
   else
      return src->reg.reg->num_components;
}

static int
nir_schedule_dest_pressure(nir_dest *dest)
{
   if (dest->is_ssa)
      return nir_schedule_def_pressure(&dest->ssa);
   else
      return dest->reg.reg->num_components;
}

/**
 * Adds a dependency such that @after must appear in the final program after
 * @before.
 *
 * We add @before as a child of @after, so that DAG heads are the outputs of
 * the program and we make our scheduling decisions bottom to top.
 */
static void
add_dep(nir_deps_state *state,
        nir_schedule_node *before,
        nir_schedule_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->dir == F)
      dag_add_edge(&before->dag, &after->dag, 0);
   else
      dag_add_edge(&after->dag, &before->dag, 0);
}


static void
add_read_dep(nir_deps_state *state,
             nir_schedule_node *before,
             nir_schedule_node *after)
{
   add_dep(state, before, after);
}

static void
add_write_dep(nir_deps_state *state,
              nir_schedule_node **before,
              nir_schedule_node *after)
{
   add_dep(state, *before, after);
   *before = after;
}

static bool
nir_schedule_reg_src_deps(nir_src *src, void *in_state)
{
   nir_deps_state *state = in_state;

   if (src->is_ssa)
      return true;

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      src->reg.reg);
   if (!entry)
      return true;
   nir_schedule_node *dst_n = entry->data;

   nir_schedule_node *src_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            src->parent_instr);

   add_dep(state, dst_n, src_n);

   return true;
}

static bool
nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
{
   nir_deps_state *state = in_state;

   if (dest->is_ssa)
      return true;

   nir_schedule_node *dest_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            dest->reg.parent_instr);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      dest->reg.reg);
   if (!entry) {
      _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
      return true;
   }
   nir_schedule_node **before = (nir_schedule_node **)&entry->data;

   add_write_dep(state, before, dest_n);

   return true;
}

static bool
nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
{
   nir_deps_state *state = in_state;
   struct hash_table *instr_map = state->scoreboard->instr_map;
   nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);

   nir_foreach_use(src, def) {
      nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
                                                       src->parent_instr);

      add_read_dep(state, def_n, use_n);
   }

   return true;
}

static struct nir_schedule_class_dep *
nir_schedule_get_class_dep(nir_deps_state *state,
                           int klass)
{
   for (struct nir_schedule_class_dep *class_dep = state->class_deps;
        class_dep != NULL;
        class_dep = class_dep->next) {
      if (class_dep->klass == klass)
         return class_dep;
   }

   struct nir_schedule_class_dep *class_dep =
      ralloc(state->reg_map, struct nir_schedule_class_dep);

   class_dep->klass = klass;
   class_dep->node = NULL;
   class_dep->next = state->class_deps;

   state->class_deps = class_dep;

   return class_dep;
}

static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
                            nir_intrinsic_instr *instr)
{
   nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
                                                &instr->instr);
   const nir_schedule_options *options = state->scoreboard->options;
   nir_schedule_dependency dep;

   if (options->intrinsic_cb &&
       options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
      struct nir_schedule_class_dep *class_dep =
         nir_schedule_get_class_dep(state, dep.klass);

      switch (dep.type) {
      case NIR_SCHEDULE_READ_DEPENDENCY:
         add_read_dep(state, class_dep->node, n);
         break;
      case NIR_SCHEDULE_WRITE_DEPENDENCY:
         add_write_dep(state, &class_dep->node, n);
         break;
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_front_face:
      break;

   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if:
      /* We are adding two dependencies:
       *
       * * An individual one that we could use to add a read_dep while handling
       *   nir_instr_type_tex
       *
       * * Include it on the unknown intrinsic set, as we want discard to be
       *   serialized in the same order relative to intervening stores or
       *   atomic accesses to SSBOs and images
       */
      add_write_dep(state, &state->discard, n);
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_store_output:
      /* For some hardware and stages, output stores affect the same shared
       * memory as input loads.
       */
      if ((state->scoreboard->options->stages_with_shared_io_memory &
           (1 << state->scoreboard->shader->info.stage)))
         add_write_dep(state, &state->load_input, n);

      /* Make sure that preceding discards stay before the store_output */
      add_read_dep(state, state->discard, n);

      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      add_read_dep(state, state->load_input, n);
      break;

   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_shared2_amd:
      /* Don't move load_shared beyond a following store_shared, as that could
       * change the loaded value
       */
      add_read_dep(state, state->store_shared, n);
      break;

   case nir_intrinsic_store_shared:
   case nir_intrinsic_store_shared2_amd:
      add_write_dep(state, &state->store_shared, n);
      break;

   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier_shared:
   case nir_intrinsic_group_memory_barrier:
   /* A generic memory barrier can be emitted when multiple synchronization
    * semantics are involved, including shared memory.
    */
   case nir_intrinsic_memory_barrier:
      add_write_dep(state, &state->store_shared, n);

      /* Serialize against ssbos/atomics/etc. */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_scoped_barrier: {
      const nir_variable_mode modes = nir_intrinsic_memory_modes(instr);

      if (modes & nir_var_mem_shared)
         add_write_dep(state, &state->store_shared, n);

      /* Serialize against other categories. */
      add_write_dep(state, &state->unknown_intrinsic, n);

      break;
   }

   default:
      /* Attempt to handle other intrinsics that we haven't individually
       * categorized by serializing them in the same order relative to each
       * other.
       */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;
   }
}
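
/* A minimal sketch of a backend-supplied intrinsic_cb, assuming a driver
 * that wants SSBO loads and stores serialized against each other in their
 * own dependency class.  The function name and the class number are
 * hypothetical; only the nir_schedule_dependency fields used above (klass,
 * type) and the callback shape implied by the call site are taken from this
 * file:
 *
 *    static bool
 *    example_intrinsic_cb(nir_intrinsic_instr *intr,
 *                         nir_schedule_dependency *dep, void *data)
 *    {
 *       switch (intr->intrinsic) {
 *       case nir_intrinsic_load_ssbo:
 *          dep->klass = 0;
 *          dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
 *          return true;
 *       case nir_intrinsic_store_ssbo:
 *          dep->klass = 0;
 *          dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
 *          return true;
 *       default:
 *          return false;
 *       }
 *    }
 *
 * The backend would point options->intrinsic_cb at such a function and pass
 * any extra state through options->intrinsic_cb_data.
 */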

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
   nir_instr *instr = n->instr;

   /* For NIR SSA defs, we only need to do a single pass of making the uses
    * depend on the def.
    */
   if (state->dir == F)
      nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);

   /* For NIR regs, track the last writer in the scheduler state so that we
    * can keep the writes in order and let reads get reordered only between
    * each write.
    */
   nir_foreach_src(instr, nir_schedule_reg_src_deps, state);

   nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);

   /* Make sure any other instructions keep their positions relative to
    * jumps.
    */
   if (instr->type != nir_instr_type_jump)
      add_read_dep(state, state->jump, n);

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
      break;

   case nir_instr_type_tex:
      /* Don't move texture ops before a discard, as that could increase
       * memory bandwidth for reading the discarded samples.
       */
      add_read_dep(state, state->discard, n);
      break;

   case nir_instr_type_jump:
      add_write_dep(state, &state->jump, n);
      break;

   case nir_instr_type_call:
      unreachable("Calls should have been lowered");
      break;

   case nir_instr_type_parallel_copy:
      unreachable("Parallel copies should have been lowered");
      break;

   case nir_instr_type_phi:
      unreachable("nir_schedule() should be called after lowering from SSA");
      break;

   case nir_instr_type_intrinsic:
      nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
      break;
   }
}

static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = F,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = R,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr_reverse(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

typedef struct {
   nir_schedule_scoreboard *scoreboard;
   int regs_freed;
} nir_schedule_regs_freed_state;

static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, src->parent_instr)) {
      state->regs_freed += nir_schedule_src_pressure(src);
   }

   return true;
}

static bool
nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;

   state->regs_freed -= nir_schedule_def_pressure(def);

   return true;
}

static bool
nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;

   if (dest->is_ssa)
      return true;

   nir_register *reg = dest->reg.reg;

   /* Only the first def of a reg counts against register pressure. */
   if (!_mesa_set_search(scoreboard->live_values, reg))
      state->regs_freed -= nir_schedule_dest_pressure(dest);

   return true;
}

static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
   nir_schedule_regs_freed_state state = {
      .scoreboard = scoreboard,
   };

   nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);

   nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);

   nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);

   return state.regs_freed;
}

/**
 * Chooses an instruction that will minimize the register pressure as much as
 * possible. This should only be used as a fallback when the regular scheduling
 * generates a shader whose register allocation fails.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the minimum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready fallback):          ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the minimum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader fallback):         ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
 * Scheduling for Parallelism) heuristic.
 *
 * Picks an instruction on the critical path that's ready to execute without
 * stalls, if possible, otherwise picks the instruction on the critical path.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the maximum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready):          ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the maximum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader):         ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find a ready inst with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (freed+ready):    ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a leader with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs freed):     ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a partially evaluated path and try to finish it off */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->partially_evaluated_path &&
          (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (partial path):   ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Contra the paper, pick a leader with no effect on used regs.  This may
    * open up new opportunities, as otherwise a single-operand instr consuming
    * a value will tend to block us from finding the instruction that frees
    * that value.  This had a massive effect on reducing spilling on V3D.
    *
    * XXX: Should this prioritize ready?
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (nir_schedule_regs_freed(scoreboard, n) != 0)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs no-op):     ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining ready set. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready max delay):   ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }
      return chosen;
   }

   /* Pick the max delay of the remaining leaders. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (debug) {
      fprintf(stderr, "chose (max delay):         ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      fprintf(stderr, "maxdel %5d ", n->max_delay);
      nir_print_instr(n->instr, stderr);
      fprintf(stderr, "\n");

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         nir_schedule_node *child = (nir_schedule_node *)edge->child;

         fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
         nir_print_instr(child->instr, stderr);
         fprintf(stderr, "\n");
      }
   }
}

static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
                      void *reg_or_def,
                      nir_instr *reg_or_def_parent,
                      int pressure)
{
   /* Make the value live if it's the first time it's been used. */
   if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
      _mesa_set_add(scoreboard->live_values, reg_or_def);
      scoreboard->pressure += pressure;
   }

   /* Make the value dead if it's the last remaining use.  Be careful when one
    * instruction uses a value twice to not decrement pressure twice.
    */
   struct set *remaining_uses =
      _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
   struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
   if (entry) {
      _mesa_set_remove(remaining_uses, entry);

      if (remaining_uses->entries == 0)
         scoreboard->pressure -= pressure;
   }
}

static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   struct set_entry *entry = _mesa_set_search(remaining_uses,
                                              src->parent_instr);
   if (entry) {
      /* Once we've used an SSA value in one instruction, bump the priority of
       * the other uses so the SSA value can get fully consumed.
       *
       * We don't do this for registers, as it would be a hassle and it's
       * unclear if that would help or not.  Also, skip it for constants, as
       * they're often folded as immediates into backend instructions and have
       * many unrelated instructions all referencing the same value (0).
       */
      if (src->is_ssa &&
          src->ssa->parent_instr->type != nir_instr_type_load_const) {
         nir_foreach_use(other_src, src->ssa) {
            if (other_src->parent_instr == src->parent_instr)
               continue;

            nir_schedule_node *n =
               nir_schedule_get_node(scoreboard->instr_map,
                                     other_src->parent_instr);

            if (n && !n->partially_evaluated_path) {
               if (debug) {
                  fprintf(stderr, "  New partially evaluated path: ");
                  nir_print_instr(n->instr, stderr);
                  fprintf(stderr, "\n");
               }

               n->partially_evaluated_path = true;
            }
         }
      }
   }

   nir_schedule_mark_use(scoreboard,
                         src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
                         src->parent_instr,
                         nir_schedule_src_pressure(src));

   return true;
}

static bool
nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   nir_schedule_mark_use(scoreboard, def, def->parent_instr,
                         nir_schedule_def_pressure(def));

   return true;
}

static bool
nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   /* SSA defs were handled in nir_schedule_mark_def_scheduled()
    */
   if (dest->is_ssa)
      return true;

   /* XXX: This is not actually accurate for regs -- the last use of a reg may
    * have a live interval that extends across control flow.  We should
    * calculate the live ranges of regs, and have scheduler nodes for the CF
    * nodes that also "use" the reg.
    */
   nir_schedule_mark_use(scoreboard, dest->reg.reg,
                         dest->reg.parent_instr,
                         nir_schedule_dest_pressure(dest));

   return true;
}

static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
                                 nir_schedule_node *n)
{
   nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
   nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
   nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;

      child->ready_time = MAX2(child->ready_time,
                               scoreboard->time + n->delay);

      if (child->dag.parent_count == 1) {
         if (debug) {
            fprintf(stderr, "  New DAG head: ");
            nir_print_instr(child->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
   }

   dag_prune_head(scoreboard->dag, &n->dag);

   scoreboard->time = MAX2(n->ready_time, scoreboard->time);
   scoreboard->time++;
}

static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   while (!list_is_empty(&scoreboard->dag->heads)) {
      if (debug) {
         fprintf(stderr, "current list:\n");
         dump_state(scoreboard);
      }

      nir_schedule_node *chosen;
      if (scoreboard->options->fallback)
         chosen = nir_schedule_choose_instruction_fallback(scoreboard);
      else if (scoreboard->pressure < scoreboard->options->threshold)
         chosen = nir_schedule_choose_instruction_csp(scoreboard);
      else
         chosen = nir_schedule_choose_instruction_csr(scoreboard);

      /* Now that we've scheduled a new instruction, some of its children may
       * be promoted to the list of instructions ready to be scheduled.
       */
      nir_schedule_mark_node_scheduled(scoreboard, chosen);

      /* Move the instruction to the end (so our first chosen instructions are
       * the start of the program).
       */
      exec_node_remove(&chosen->instr->node);
      exec_list_push_tail(&block->instr_list, &chosen->instr->node);

      if (debug)
         fprintf(stderr, "\n");
   }
}

static uint32_t
nir_schedule_get_delay(nir_schedule_scoreboard *scoreboard, nir_instr *instr)
{
   if (scoreboard->options->instr_delay_cb) {
      void *cb_data = scoreboard->options->instr_delay_cb_data;
      return scoreboard->options->instr_delay_cb(instr, cb_data);
   }

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_jump:
   case nir_instr_type_parallel_copy:
   case nir_instr_type_call:
   case nir_instr_type_phi:
      return 1;

   case nir_instr_type_intrinsic:
      switch (nir_instr_as_intrinsic(instr)->intrinsic) {
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_scratch:
      case nir_intrinsic_load_shared:
      case nir_intrinsic_image_load:
         return 50;
      default:
         return 1;
      }
      break;

   case nir_instr_type_tex:
      /* Pick some large number to try to fetch textures early and sample them
       * late.
       */
      return 100;
   }

   return 0;
}
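
/* A hedged sketch of a backend delay callback overriding the defaults
 * above; the function name and the latency values are illustrative only,
 * and the exact callback typedef lives in nir_schedule.h.  The shape
 * matches the call through options->instr_delay_cb at the top of
 * nir_schedule_get_delay():
 *
 *    static uint32_t
 *    example_instr_delay_cb(nir_instr *instr, void *data)
 *    {
 *       if (instr->type == nir_instr_type_tex)
 *          return 200;
 *       return 1;
 *    }
 */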

static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
   nir_schedule_node *n = (nir_schedule_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

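   /* Worked example (illustrative): with the default delays above, a node
    * with delay 1 whose children have max_delay 100 and 50 ends up with
    * max_delay 101, so the texture-bearing path stays the critical path for
    * the CSP heuristic.
    */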
   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   void *mem_ctx = ralloc_context(NULL);
   scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);

   scoreboard->dag = dag_create(mem_ctx);

   nir_foreach_instr(instr, block) {
      nir_schedule_node *n =
         rzalloc(mem_ctx, nir_schedule_node);

      n->instr = instr;
      n->delay = nir_schedule_get_delay(scoreboard, instr);
      dag_init_node(scoreboard->dag, &n->dag);

      _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
   }

   calculate_forward_deps(scoreboard, block);
   calculate_reverse_deps(scoreboard, block);

   dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);

   nir_schedule_instructions(scoreboard, block);

   ralloc_free(mem_ctx);
   scoreboard->instr_map = NULL;
}

static bool
nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *def_uses = _mesa_pointer_set_create(scoreboard);

   _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);

   _mesa_set_add(def_uses, def->parent_instr);

   nir_foreach_use(src, def) {
      _mesa_set_add(def_uses, src->parent_instr);
   }

   /* XXX: Handle if uses */

   return true;
}

static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
                            const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);

   scoreboard->shader = shader;
   scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
   scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
   scoreboard->options = options;
   scoreboard->pressure = 0;

   nir_foreach_function(function, shader) {
      nir_foreach_register(reg, &function->impl->registers) {
         struct set *register_uses =
            _mesa_pointer_set_create(scoreboard);

         _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);

         nir_foreach_use(src, reg) {
            _mesa_set_add(register_uses, src->parent_instr);
         }

         /* XXX: Handle if uses */

         nir_foreach_def(dest, reg) {
            _mesa_set_add(register_uses, dest->reg.parent_instr);
         }
      }

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
                                scoreboard);
         }

         /* XXX: We're ignoring if uses, which may prioritize scheduling other
          * uses of the if src even when it doesn't help.  That's not many
          * values, though, so meh.
          */
      }
   }

   return scoreboard;
}

static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
   return;
#endif

   bool any_uses = false;

   hash_table_foreach(scoreboard->remaining_uses, entry) {
      struct set *remaining_uses = entry->data;

      set_foreach(remaining_uses, instr_entry) {
         if (!any_uses) {
            fprintf(stderr, "Tracked uses remain after scheduling.  "
                    "Affected instructions: \n");
            any_uses = true;
         }
         nir_print_instr(instr_entry->key, stderr);
         fprintf(stderr, "\n");
      }
   }

   assert(!any_uses);
}

/**
 * Schedules the NIR instructions to try to decrease stalls (for example,
 * delaying texture reads) while managing register pressure.
 *
 * The threshold represents "number of NIR register/SSA def channels live
 * before switching the scheduling heuristic to reduce register pressure",
 * since most of our GPU architectures are scalar (extending to vector with a
 * flag wouldn't be hard).  This number should be a bit below the number of
 * registers available (counting any that may be occupied by system value
 * payload values, for example), since the heuristic may not always be able to
 * free a register immediately.  The amount below the limit is up to you to
 * tune.
 */
void
nir_schedule(nir_shader *shader,
             const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
                                                                     options);

   if (debug) {
      fprintf(stderr, "NIR shader before scheduling:\n");
      nir_print_shader(shader, stderr);
   }

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_schedule_block(scoreboard, block);
      }
   }

   nir_schedule_validate_uses(scoreboard);

   ralloc_free(scoreboard);
}
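
/* A minimal usage sketch, assuming a backend whose register budget is about
 * 64 channels; the helper name and the threshold value are illustrative only
 * (real callers live in the individual drivers):
 *
 *    static void
 *    example_backend_schedule(nir_shader *s)
 *    {
 *       nir_schedule_options opts = {
 *          .threshold = 64,
 *       };
 *       nir_schedule(s, &opts);
 *    }
 *
 * If register allocation still fails afterwards, a caller can re-run the
 * pass with opts.fallback = true to use the pressure-only fallback
 * heuristic above.
 */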