/*
 * Copyright © 2019 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"

/** @file
 *
 * Implements basic-block-level prepass instruction scheduling in NIR to
 * manage register pressure.
 *
 * This is based on the Goodman/Hsu paper (1988, cached copy at
 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
 * build the DDG for NIR (which can be mostly done using the NIR def/use
 * chains for SSA instructions, plus some edges for ordering register writes
 * vs reads, and some more for ordering intrinsics). Then we pick heads off
 * of the DDG using their heuristic and emit the NIR instructions back into
 * the block in their new order.
 *
 * The hard case for prepass scheduling on GPUs seems to always be consuming
 * texture/ubo results. The register pressure heuristic doesn't want to pick
 * an instr that starts consuming texture results, because it usually won't
 * be the only use, so that instruction will increase pressure.
 *
 * If you always force consumption of tex results, then in a case where a
 * single sample is used for many outputs, you'll end up picking every other
 * user and expanding register pressure. The partially_evaluated_path flag
 * helps tremendously, in that if you happen for whatever reason to pick a
 * texture sample's output, then you'll try to finish off that sample. Future
 * work may include doing some local search before locking in a choice, to
 * try to more reliably find the case where just a few choices going against
 * the heuristic can manage to free the whole vector.
 */
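
/* Example of how a backend might invoke this pass (a sketch only; the option
 * values below are illustrative assumptions, not recommendations):
 *
 *    nir_schedule_options options = {
 *       .threshold = 64,
 *    };
 *    nir_schedule(shader, &options);
 *
 * Backends may also supply intrinsic_cb/instr_delay_cb hooks and
 * stages_with_shared_io_memory; see nir_schedule.h for the option struct.
 */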

static bool debug;

/**
 * Represents a node in the DDG for a NIR instruction.
 */
typedef struct {
   struct dag_node dag; /* must be first for our u_dynarray_foreach */
   nir_instr *instr;
   bool partially_evaluated_path;

   /* Approximate estimate of the delay between starting this instruction and
    * its results being available.
    *
    * Accuracy is not too important, given that we're prepass scheduling here
    * and just trying to reduce excess dependencies introduced by a register
    * allocator by stretching out the live intervals of expensive
    * instructions.
    */
   uint32_t delay;

   /* Cost of the maximum-delay path from this node to the leaves. */
   uint32_t max_delay;

   /* scoreboard->time value when this instruction can be scheduled without
    * any stalls expected.
    */
   uint32_t ready_time;
} nir_schedule_node;

typedef struct {
   struct dag *dag;

   nir_shader *shader;

   /* Mapping from nir_register * or nir_ssa_def * to the struct set of
    * instructions remaining to be scheduled that use the value.
    */
   struct hash_table *remaining_uses;

   /* Map from nir_instr to nir_schedule_node * */
   struct hash_table *instr_map;

   /* Set of nir_register * or nir_ssa_def * that have had any instruction
    * scheduled on them.
    */
   struct set *live_values;

   /* An abstract approximation of the number of nir_schedule_node->delay
    * units since the start of the shader.
    */
   uint32_t time;

   /* Number of channels currently used by the NIR instructions that have
    * been scheduled.
    */
   int pressure;

   /* Options specified by the backend */
   const nir_schedule_options *options;
} nir_schedule_scoreboard;

/* When walking the instructions in reverse, we use this flag to swap
 * before/after in add_dep().
 */
enum direction { F, R };

struct nir_schedule_class_dep {
   int klass;
   nir_schedule_node *node;
   struct nir_schedule_class_dep *next;
};

typedef struct {
   nir_schedule_scoreboard *scoreboard;

   /* Map from nir_register to nir_schedule_node * */
   struct hash_table *reg_map;

   /* Scheduler nodes for the last instruction involved in some class of
    * dependency.
    */
   nir_schedule_node *load_input;
   nir_schedule_node *store_shared;
   nir_schedule_node *unknown_intrinsic;
   nir_schedule_node *discard;
   nir_schedule_node *jump;

   struct nir_schedule_class_dep *class_deps;

   enum direction dir;
} nir_deps_state;

static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
   if (!entry)
      return NULL;
   return entry->data;
}

static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
   return _mesa_hash_table_search_data(instr_map, instr);
}

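/* Returns the set of not-yet-scheduled instructions that read the value
 * (SSA def or register) consumed by @src.
 */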
static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
   if (src->is_ssa) {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
   } else {
      return _mesa_hash_table_search_data(scoreboard->remaining_uses,
                                          src->reg.reg);
   }
}

static int
nir_schedule_def_pressure(nir_ssa_def *def)
{
   return def->num_components;
}

static int
nir_schedule_src_pressure(nir_src *src)
{
   if (src->is_ssa)
      return nir_schedule_def_pressure(src->ssa);
   else
      return src->reg.reg->num_components;
}

static int
nir_schedule_dest_pressure(nir_dest *dest)
{
   if (dest->is_ssa)
      return nir_schedule_def_pressure(&dest->ssa);
   else
      return dest->reg.reg->num_components;
}

/**
 * Adds a dependency such that @after must appear in the final program after
 * @before.
 *
 * We add @before as a child of @after, so that DAG heads are the outputs of
 * the program and we make our scheduling decisions bottom to top.
 */
static void
add_dep(nir_deps_state *state,
        nir_schedule_node *before,
        nir_schedule_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->dir == F)
      dag_add_edge(&before->dag, &after->dag, 0);
   else
      dag_add_edge(&after->dag, &before->dag, 0);
}


static void
add_read_dep(nir_deps_state *state,
             nir_schedule_node *before,
             nir_schedule_node *after)
{
   add_dep(state, before, after);
}

static void
add_write_dep(nir_deps_state *state,
              nir_schedule_node **before,
              nir_schedule_node *after)
{
   add_dep(state, *before, after);
   *before = after;
}

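/* For a non-SSA register source, make this read depend on the most recent
 * write of that register seen in this walk, so reads stay ordered against
 * the writes that surround them.
 */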
static bool
nir_schedule_reg_src_deps(nir_src *src, void *in_state)
{
   nir_deps_state *state = in_state;

   if (src->is_ssa)
      return true;

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      src->reg.reg);
   if (!entry)
      return true;
   nir_schedule_node *dst_n = entry->data;

   nir_schedule_node *src_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            src->parent_instr);

   add_dep(state, dst_n, src_n);

   return true;
}

static bool
nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
{
   nir_deps_state *state = in_state;

   if (dest->is_ssa)
      return true;

   nir_schedule_node *dest_n =
      nir_schedule_get_node(state->scoreboard->instr_map,
                            dest->reg.parent_instr);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
                                                      dest->reg.reg);
   if (!entry) {
      _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
      return true;
   }
   nir_schedule_node **before = (nir_schedule_node **)&entry->data;

   add_write_dep(state, before, dest_n);

   return true;
}

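/* Makes each use of an SSA def depend on the def; uses outside the block
 * being scheduled have no scheduler node and are skipped by add_dep().
 */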
static bool
nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
{
   nir_deps_state *state = in_state;
   struct hash_table *instr_map = state->scoreboard->instr_map;
   nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);

   nir_foreach_use(src, def) {
      nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
                                                       src->parent_instr);

      add_read_dep(state, def_n, use_n);
   }

   return true;
}

static struct nir_schedule_class_dep *
nir_schedule_get_class_dep(nir_deps_state *state,
                           int klass)
{
   for (struct nir_schedule_class_dep *class_dep = state->class_deps;
        class_dep != NULL;
        class_dep = class_dep->next) {
      if (class_dep->klass == klass)
         return class_dep;
   }

   struct nir_schedule_class_dep *class_dep =
      ralloc(state->reg_map, struct nir_schedule_class_dep);

   class_dep->klass = klass;
   class_dep->node = NULL;
   class_dep->next = state->class_deps;

   state->class_deps = class_dep;

   return class_dep;
}

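/* Adds ordering dependencies for an intrinsic: backend-defined dependency
 * classes via the intrinsic_cb option, plus built-in rules for discards,
 * shader I/O, shared memory, and barriers.
 */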
static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
                            nir_intrinsic_instr *instr)
{
   nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
                                                &instr->instr);
   const nir_schedule_options *options = state->scoreboard->options;
   nir_schedule_dependency dep;

   if (options->intrinsic_cb &&
       options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
      struct nir_schedule_class_dep *class_dep =
         nir_schedule_get_class_dep(state, dep.klass);

      switch (dep.type) {
      case NIR_SCHEDULE_READ_DEPENDENCY:
         add_read_dep(state, class_dep->node, n);
         break;
      case NIR_SCHEDULE_WRITE_DEPENDENCY:
         add_write_dep(state, &class_dep->node, n);
         break;
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_front_face:
      break;

   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if:
      /* We are adding two dependencies:
       *
       * * An individual one that we can use to add a read_dep while handling
       *   nir_instr_type_tex.
       *
       * * One on the unknown intrinsic set, as we want discards to be
       *   serialized in the same order relative to intervening stores or
       *   atomic accesses to SSBOs and images.
       */
      add_write_dep(state, &state->discard, n);
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_store_output:
      /* For some hardware and stages, output stores affect the same shared
       * memory as input loads.
       */
      if ((state->scoreboard->options->stages_with_shared_io_memory &
           (1 << state->scoreboard->shader->info.stage)))
         add_write_dep(state, &state->load_input, n);

      /* Make sure that preceding discards stay before the store_output */
      add_read_dep(state, state->discard, n);

      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      add_read_dep(state, state->load_input, n);
      break;

   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_shared2_amd:
      /* Don't move load_shared beyond a following store_shared, as that
       * could change its value.
       */
      add_read_dep(state, state->store_shared, n);
      break;

   case nir_intrinsic_store_shared:
   case nir_intrinsic_store_shared2_amd:
      add_write_dep(state, &state->store_shared, n);
      break;

   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier_shared:
   case nir_intrinsic_group_memory_barrier:
   /* A generic memory barrier can be emitted when multiple synchronization
    * semantics are involved, including shared memory.
    */
   case nir_intrinsic_memory_barrier:
      add_write_dep(state, &state->store_shared, n);

      /* Serialize against ssbos/atomics/etc. */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_scoped_barrier: {
      const nir_variable_mode modes = nir_intrinsic_memory_modes(instr);

      if (modes & nir_var_mem_shared)
         add_write_dep(state, &state->store_shared, n);

      /* Serialize against other categories. */
      add_write_dep(state, &state->unknown_intrinsic, n);

      break;
   }

   default:
      /* Attempt to handle other intrinsics that we haven't individually
       * categorized by serializing them in the same order relative to each
       * other.
       */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;
   }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
   nir_instr *instr = n->instr;

   /* For NIR SSA defs, we only need to do a single pass of making the uses
    * depend on the def.
    */
   if (state->dir == F)
      nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);

   /* For NIR regs, track the last writer in the scheduler state so that we
    * can keep the writes in order and let reads get reordered only between
    * each write.
    */
   nir_foreach_src(instr, nir_schedule_reg_src_deps, state);

   nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);

   /* Make sure any other instructions keep their positions relative to
    * jumps.
    */
   if (instr->type != nir_instr_type_jump)
      add_read_dep(state, state->jump, n);

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
      break;

   case nir_instr_type_tex:
      /* Don't move texture ops before a discard, as that could increase
       * memory bandwidth for reading the discarded samples.
       */
      add_read_dep(state, state->discard, n);
      break;

   case nir_instr_type_jump:
      add_write_dep(state, &state->jump, n);
      break;

   case nir_instr_type_call:
      unreachable("Calls should have been lowered");
      break;

   case nir_instr_type_parallel_copy:
      unreachable("Parallel copies should have been lowered");
      break;

   case nir_instr_type_phi:
      unreachable("nir_schedule() should be called after lowering from SSA");
      break;

   case nir_instr_type_intrinsic:
      nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
      break;
   }
}

static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = F,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = R,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr_reverse(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

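/* State for computing the net number of register channels that scheduling a
 * given instruction would free (positive) or newly make live (negative).
 */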
typedef struct {
   nir_schedule_scoreboard *scoreboard;
   int regs_freed;
} nir_schedule_regs_freed_state;

static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, src->parent_instr)) {
      state->regs_freed += nir_schedule_src_pressure(src);
   }

   return true;
}

static bool
nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;

   state->regs_freed -= nir_schedule_def_pressure(def);

   return true;
}

static bool
nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;

   if (dest->is_ssa)
      return true;

   nir_register *reg = dest->reg.reg;

   /* Only the first def of a reg counts against register pressure. */
   if (!_mesa_set_search(scoreboard->live_values, reg))
      state->regs_freed -= nir_schedule_dest_pressure(dest);

   return true;
}

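/* Returns how many register channels would be freed if @n were scheduled
 * now: channels released by last uses of values, minus channels made live
 * by its new defs.
 */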
static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
   nir_schedule_regs_freed_state state = {
      .scoreboard = scoreboard,
   };

   nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);

   nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);

   nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);

   return state.regs_freed;
}

/**
 * Chooses an instruction that will minimize register pressure as much as
 * possible. This should only be used as a fallback when the regular
 * scheduling generates a shader whose register allocation fails.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the minimum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready fallback): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the minimum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader fallback): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
 * Scheduling for Parallelism) heuristic.
 *
 * Picks an instruction on the critical path that's ready to execute without
 * stalls, if possible, otherwise picks the instruction on the critical path.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the maximum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the maximum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find a ready inst with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (freed+ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a leader with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs freed): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a partially evaluated path and try to finish it off */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->partially_evaluated_path &&
          (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (partial path): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Contra the paper, pick a leader with no effect on used regs. This may
    * open up new opportunities, as otherwise a single-operand instr consuming
    * a value will tend to block us from freeing that value. This had a
    * massive effect on reducing spilling on V3D.
    *
    * XXX: Should this prioritize ready?
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (nir_schedule_regs_freed(scoreboard, n) != 0)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs no-op): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining ready set. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready max delay): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }
      return chosen;
   }

   /* Pick the max delay of the remaining leaders. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (debug) {
      fprintf(stderr, "chose (max delay): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

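/* Debug helper: prints the current DAG heads and their outgoing edges. */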
static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      fprintf(stderr, "maxdel %5d ", n->max_delay);
      nir_print_instr(n->instr, stderr);
      fprintf(stderr, "\n");

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         nir_schedule_node *child = (nir_schedule_node *)edge->child;

         fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
         nir_print_instr(child->instr, stderr);
         fprintf(stderr, "\n");
      }
   }
}

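/* Updates pressure bookkeeping for one value (register or SSA def): it
 * becomes live the first time an instruction touching it is scheduled, and
 * its pressure is released once its remaining-uses set drains to empty.
 */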
static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
                      void *reg_or_def,
                      nir_instr *reg_or_def_parent,
                      int pressure)
{
   /* Make the value live if it's the first time it's been used. */
   if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
      _mesa_set_add(scoreboard->live_values, reg_or_def);
      scoreboard->pressure += pressure;
   }

   /* Make the value dead if it's the last remaining use. Be careful when one
    * instruction uses a value twice to not decrement pressure twice.
    */
   struct set *remaining_uses =
      _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
   struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
   if (entry) {
      _mesa_set_remove(remaining_uses, entry);

      if (remaining_uses->entries == 0)
         scoreboard->pressure -= pressure;
   }
}

static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   struct set_entry *entry = _mesa_set_search(remaining_uses,
                                              src->parent_instr);
   if (entry) {
      /* Once we've used an SSA value in one instruction, bump the priority of
       * the other uses so the SSA value can get fully consumed.
       *
       * We don't do this for registers, as it would be a hassle and it's
       * unclear whether it would help or not. Also, skip it for constants, as
       * they're often folded as immediates into backend instructions and have
       * many unrelated instructions all referencing the same value (0).
       */
      if (src->is_ssa &&
          src->ssa->parent_instr->type != nir_instr_type_load_const) {
         nir_foreach_use(other_src, src->ssa) {
            if (other_src->parent_instr == src->parent_instr)
               continue;

            nir_schedule_node *n =
               nir_schedule_get_node(scoreboard->instr_map,
                                     other_src->parent_instr);

            if (n && !n->partially_evaluated_path) {
               if (debug) {
                  fprintf(stderr, " New partially evaluated path: ");
                  nir_print_instr(n->instr, stderr);
                  fprintf(stderr, "\n");
               }

               n->partially_evaluated_path = true;
            }
         }
      }
   }

   nir_schedule_mark_use(scoreboard,
                         src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
                         src->parent_instr,
                         nir_schedule_src_pressure(src));

   return true;
}

static bool
nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   nir_schedule_mark_use(scoreboard, def, def->parent_instr,
                         nir_schedule_def_pressure(def));

   return true;
}

static bool
nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   /* SSA defs were handled in nir_schedule_mark_def_scheduled() */
   if (dest->is_ssa)
      return true;

   /* XXX: This is not actually accurate for regs -- the last use of a reg may
    * have a live interval that extends across control flow. We should
    * calculate the live ranges of regs, and have scheduler nodes for the CF
    * nodes that also "use" the reg.
    */
   nir_schedule_mark_use(scoreboard, dest->reg.reg,
                         dest->reg.parent_instr,
                         nir_schedule_dest_pressure(dest));

   return true;
}

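/* Bookkeeping once the heuristic has chosen @n: update register pressure,
 * propagate ready times to its children, prune it from the DAG so new heads
 * become schedulable, and advance the scoreboard clock.
 */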
static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
                                 nir_schedule_node *n)
{
   nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
   nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
   nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;

      child->ready_time = MAX2(child->ready_time,
                               scoreboard->time + n->delay);

      if (child->dag.parent_count == 1) {
         if (debug) {
            fprintf(stderr, " New DAG head: ");
            nir_print_instr(child->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
   }

   dag_prune_head(scoreboard->dag, &n->dag);

   scoreboard->time = MAX2(n->ready_time, scoreboard->time);
   scoreboard->time++;
}

static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   while (!list_is_empty(&scoreboard->dag->heads)) {
      if (debug) {
         fprintf(stderr, "current list:\n");
         dump_state(scoreboard);
      }

      nir_schedule_node *chosen;
      if (scoreboard->options->fallback)
         chosen = nir_schedule_choose_instruction_fallback(scoreboard);
      else if (scoreboard->pressure < scoreboard->options->threshold)
         chosen = nir_schedule_choose_instruction_csp(scoreboard);
      else
         chosen = nir_schedule_choose_instruction_csr(scoreboard);

      /* Now that we've scheduled a new instruction, some of its children may
       * be promoted to the list of instructions ready to be scheduled.
       */
      nir_schedule_mark_node_scheduled(scoreboard, chosen);

      /* Move the instruction to the end (so our first chosen instructions are
       * the start of the program).
       */
      exec_node_remove(&chosen->instr->node);
      exec_list_push_tail(&block->instr_list, &chosen->instr->node);

      if (debug)
         fprintf(stderr, "\n");
   }
}

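/* Latency estimate for an instruction, used to schedule long-latency
 * operations (memory loads, texture fetches) away from their consumers.
 * Backends can override the built-in guesses via the instr_delay_cb option.
 */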
static uint32_t
nir_schedule_get_delay(nir_schedule_scoreboard *scoreboard, nir_instr *instr)
{
   if (scoreboard->options->instr_delay_cb) {
      void *cb_data = scoreboard->options->instr_delay_cb_data;
      return scoreboard->options->instr_delay_cb(instr, cb_data);
   }

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_jump:
   case nir_instr_type_parallel_copy:
   case nir_instr_type_call:
   case nir_instr_type_phi:
      return 1;

   case nir_instr_type_intrinsic:
      switch (nir_instr_as_intrinsic(instr)->intrinsic) {
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_scratch:
      case nir_intrinsic_load_shared:
      case nir_intrinsic_image_load:
         return 50;
      default:
         return 1;
      }
      break;

   case nir_instr_type_tex:
      /* Pick some large number to try to fetch textures early and sample
       * them late.
       */
      return 100;
   }

   return 0;
}

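/* Bottom-up DAG traversal callback: a node's max_delay is its own delay plus
 * the largest max_delay among its children, i.e. the length of its critical
 * path to the end of the block.
 */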
static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
   nir_schedule_node *n = (nir_schedule_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

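/* Schedules a single basic block: creates a DAG node per instruction, adds
 * the forward and reverse dependencies, computes critical-path delays, and
 * then re-emits the instructions in their new order.
 */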
static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   void *mem_ctx = ralloc_context(NULL);
   scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);

   scoreboard->dag = dag_create(mem_ctx);

   nir_foreach_instr(instr, block) {
      nir_schedule_node *n =
         rzalloc(mem_ctx, nir_schedule_node);

      n->instr = instr;
      n->delay = nir_schedule_get_delay(scoreboard, instr);
      dag_init_node(scoreboard->dag, &n->dag);

      _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
   }

   calculate_forward_deps(scoreboard, block);
   calculate_reverse_deps(scoreboard, block);

   dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);

   nir_schedule_instructions(scoreboard, block);

   ralloc_free(mem_ctx);
   scoreboard->instr_map = NULL;
}

static bool
nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *def_uses = _mesa_pointer_set_create(scoreboard);

   _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);

   _mesa_set_add(def_uses, def->parent_instr);

   nir_foreach_use(src, def) {
      _mesa_set_add(def_uses, src->parent_instr);
   }

   /* XXX: Handle if uses */

   return true;
}

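/* Allocates the scoreboard and seeds the remaining-uses sets for every
 * register and SSA def in the shader before any block is scheduled.
 */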
static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
                            const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);

   scoreboard->shader = shader;
   scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
   scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
   scoreboard->options = options;
   scoreboard->pressure = 0;

   nir_foreach_function(function, shader) {
      nir_foreach_register(reg, &function->impl->registers) {
         struct set *register_uses =
            _mesa_pointer_set_create(scoreboard);

         _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);

         nir_foreach_use(src, reg) {
            _mesa_set_add(register_uses, src->parent_instr);
         }

         /* XXX: Handle if uses */

         nir_foreach_def(dest, reg) {
            _mesa_set_add(register_uses, dest->reg.parent_instr);
         }
      }

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
                                scoreboard);
         }

         /* XXX: We're ignoring if uses, which may prioritize scheduling other
          * uses of the if src even when it doesn't help. That's not many
          * values, though, so meh.
          */
      }
   }

   return scoreboard;
}

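/* Debug-build sanity check: after scheduling, every tracked use should have
 * been consumed; anything left over indicates broken bookkeeping.
 */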
static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
   return;
#endif

   bool any_uses = false;

   hash_table_foreach(scoreboard->remaining_uses, entry) {
      struct set *remaining_uses = entry->data;

      set_foreach(remaining_uses, instr_entry) {
         if (!any_uses) {
            fprintf(stderr, "Tracked uses remain after scheduling. "
                    "Affected instructions: \n");
            any_uses = true;
         }
         nir_print_instr(instr_entry->key, stderr);
         fprintf(stderr, "\n");
      }
   }

   assert(!any_uses);
}

/**
 * Schedules the NIR instructions to try to decrease stalls (for example,
 * delaying texture reads) while managing register pressure.
 *
 * The threshold represents "number of NIR register/SSA def channels live
 * before switching the scheduling heuristic to reduce register pressure",
 * since most of our GPU architectures are scalar (extending to vector with a
 * flag wouldn't be hard). This number should be a bit below the number of
 * registers available (counting any that may be occupied by system value
 * payload values, for example), since the heuristic may not always be able
 * to free a register immediately. The amount below the limit is up to you
 * to tune.
 */
void
nir_schedule(nir_shader *shader,
             const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
                                                                     options);

   if (debug) {
      fprintf(stderr, "NIR shader before scheduling:\n");
      nir_print_shader(shader, stderr);
   }

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_schedule_block(scoreboard, block);
      }
   }

   nir_schedule_validate_uses(scoreboard);

   ralloc_free(scoreboard);
}