/*
 * Copyright © 2019 Google, Inc.
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#if MESA_DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...)                                                            \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         mesa_logi("PSCHED: " fmt, ##__VA_ARGS__);                             \
      }                                                                        \
   } while (0)

#define di(instr, fmt, ...)                                                    \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         struct log_stream *stream = mesa_log_streami();                       \
         mesa_log_stream_printf(stream, "PSCHED: " fmt ": ", ##__VA_ARGS__);   \
         ir3_print_instr_stream(stream, instr);                                \
         mesa_log_stream_destroy(stream);                                      \
      }                                                                        \
   } while (0)

#define SCHED_DEBUG_DUMP_DEPTH 1

/*
 * Post RA Instruction Scheduling
 */
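
/*
 * Rough overview (as read from the code below): each block's instructions
 * are moved onto an unscheduled list and a DAG of register (and false)
 * dependencies is built over them.  Instructions are then re-emitted one at
 * a time by greedily picking a "ready" DAG head, preferring instructions on
 * the critical path (max_delay) and trying to cover (ss)/(sy) producer
 * latencies with useful work instead of nops.
 */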

struct ir3_postsched_ctx {
   struct ir3 *ir;

   struct ir3_shader_variant *v;

   void *mem_ctx;
   struct ir3_block *block; /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */

   unsigned ip;

   int ss_delay;
   int sy_delay;
};

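/*
 * Per-instruction scheduling state.  Roughly, as used in this pass:
 * earliest_ip is the earliest ip the instruction can be emitted at without
 * extra nops (based on already-scheduled parents plus edge delay),
 * has_sy_src/has_ss_src record whether any source comes from a (sy)/(ss)
 * producer, and max_delay is the accumulated delay along the longest path to
 * a DAG leaf, ie. a critical-path priority.
 */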
struct ir3_postsched_node {
   struct dag_node dag; /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

   unsigned earliest_ip;

   bool has_sy_src, has_ss_src;

   unsigned max_delay;
};

#define foreach_sched_node(__n, __list)                                        \
   list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)

static bool
has_sy_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_sy_src;
}

static bool
has_ss_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_ss_src;
}

#ifndef NDEBUG
static void
sched_dag_validate_cb(const struct dag_node *node, void *data)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;

   ir3_print_instr(n->instr);
}
#endif

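/*
 * Emit an instruction into the block: advance ctx->ip past any nops implied
 * by earliest_ip plus the instruction's own cycles, propagate earliest_ip to
 * its DAG children, and update the ss/sy soft-delay counters.  As an
 * illustration of the delay_cycles formula below (not exact hw timing), an
 * alu instruction with (rpt1) advances ip by 2, while a meta instruction
 * advances it by 0.
 */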
static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   assert(ctx->block == instr->block);

   /* remove from unscheduled_list:
    */
   list_delinit(&instr->node);

   di(instr, "schedule");

   bool counts_for_delay = is_alu(instr) || is_flow(instr);

   unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0;

   struct ir3_postsched_node *n = instr->data;

   /* We insert any nop's needed to get to earliest_ip, then advance
    * delay_cycles by scheduling the instruction.
    */
   ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      unsigned delay = (unsigned)(uintptr_t)edge->data;
      struct ir3_postsched_node *child =
         container_of(edge->child, struct ir3_postsched_node, dag);
      child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay);
   }

   list_addtail(&instr->node, &instr->block->instr_list);

   dag_prune_head(ctx->dag, &n->dag);

   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
      return;

   if (is_ss_producer(instr)) {
      ctx->ss_delay = soft_ss_delay(instr);
   } else if (has_ss_src(instr)) {
      ctx->ss_delay = 0;
   } else if (ctx->ss_delay > 0) {
      ctx->ss_delay--;
   }

   if (is_sy_producer(instr)) {
      ctx->sy_delay = soft_sy_delay(instr, ctx->block->shader);
   } else if (has_sy_src(instr)) {
      ctx->sy_delay = 0;
   } else if (ctx->sy_delay > 0) {
      ctx->sy_delay--;
   }
}

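/*
 * node_delay() is the number of nop cycles that would be needed to schedule
 * this node right now.  node_delay_soft() additionally folds in the pending
 * ss/sy soft delays, ie. an estimate of how long the node would stall on an
 * outstanding sfu/tex result even though a sync bit could cover the wait.
 */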
static unsigned
node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   return MAX2(n->earliest_ip, ctx->ip) - ctx->ip;
}

static unsigned
node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   unsigned delay = node_delay(ctx, n);

   /* This takes into account that when we schedule multiple tex or sfu
    * instructions, the first user has to wait for all of them to complete.
    */
   if (n->has_ss_src)
      delay = MAX2(delay, ctx->ss_delay);
   if (n->has_sy_src)
      delay = MAX2(delay, ctx->sy_delay);

   return delay;
}

static void
dump_node(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n,
          int level)
{
   if (level > SCHED_DEBUG_DUMP_DEPTH)
      return;

   di(n->instr, "%*s%smaxdel=%d, node_delay=%d, node_delay_soft=%d, %d parents ",
      level * 2, "", (level > 0 ? "-> " : ""), n->max_delay, node_delay(ctx, n),
      node_delay_soft(ctx, n), n->dag.parent_count);

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;

      dump_node(ctx, child, level + 1);
   }
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      dump_node(ctx, n, 0);
   }
}

/* find instruction to schedule: */
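/* The selection priority, in order: meta instructions, inputs (bary.f),
 * kills/demotes that are hard-ready, ss/sy producers that are soft-ready,
 * any soft-ready leader (allowing a few nops), and finally whichever leader
 * has the largest max_delay.
 */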
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay(ctx, n);

      if (d > 0)
         continue;

      if (!is_kill_or_demote(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 0)
         continue;

      if (!(is_ss_producer(n->instr) || is_sy_producer(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch which can be synchronized w/ the sync
    * bit, but we probably do want to schedule some other instructions while
    * we wait).  We also allow a small number of nops, to prefer now-nops
    * over future-nops up to a point, as that gives better results.
    */
   unsigned chosen_delay = 0;
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 3)
         continue;

      if (!chosen || d < chosen_delay) {
         chosen = n;
         chosen_delay = d;
         continue;
      }

      if (d > chosen_delay)
         continue;

      if (chosen->max_delay < n->max_delay) {
         chosen = n;
         chosen_delay = d;
      }
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Otherwise choose leader with maximum cost:
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}

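/*
 * State for the register-dependency walk.  The block is walked twice: once
 * forward (direction == F) to add true read-after-write dependencies with
 * their delays, and once in reverse (direction == R) to add write-after-read
 * and write-after-write ordering edges.
 */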
struct ir3_postsched_deps_state {
   struct ir3_postsched_ctx *ctx;

   enum { F, R } direction;

   bool merged;

   /* Track, for each register, the sched node (instruction) that last
    * wrote it (in whichever direction we are iterating the block).
    *
    * Note, this table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether the
    * half and full precision register files are "merged" (conflicting,
    * ie. a6xx+), in which case we use "regs" for both full precision and
    * half precision dependencies and consider each full precision dep as
    * two half-precision dependencies, vs older separate (non-conflicting)
    * files, in which case the separate "half_regs" table is used for
    * half-precision deps.  See ir3_reg_file_offset().
    */
   struct ir3_postsched_node *regs[2 * GPR_REG_SIZE];
   unsigned dst_n[2 * GPR_REG_SIZE];
   struct ir3_postsched_node *half_regs[GPR_REG_SIZE];
   unsigned half_dst_n[GPR_REG_SIZE];
   struct ir3_postsched_node *shared_regs[2 * SHARED_REG_SIZE];
   unsigned shared_dst_n[2 * SHARED_REG_SIZE];
   struct ir3_postsched_node *nongpr_regs[2 * NONGPR_REG_SIZE];
   unsigned nongpr_dst_n[2 * NONGPR_REG_SIZE];
};

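/*
 * Add a scheduling edge between two nodes.  In the forward direction the
 * edge carries the required delay (in cycles) from producer to consumer; in
 * the reverse direction the edge only enforces ordering, so it carries no
 * delay.
 */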
static void
add_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *before, struct ir3_postsched_node *after,
        unsigned d)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->direction == F) {
      dag_add_edge_max_data(&before->dag, &after->dag, (uintptr_t)d);
   } else {
      dag_add_edge_max_data(&after->dag, &before->dag, 0);
   }
}

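/*
 * Add the dependency for one (scalar) register slot against the last access
 * recorded in *dep_ptr.  In the forward direction this also computes the
 * delay to the producer and notes whether the source needs (ss)/(sy); a
 * write (src_n < 0) then records this node as the new last writer.
 */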
static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
                   struct ir3_postsched_node *node,
                   struct ir3_postsched_node **dep_ptr,
                   unsigned *dst_n_ptr, unsigned num, int src_n,
                   int dst_n)
{
   struct ir3_postsched_node *dep = *dep_ptr;

   unsigned d = 0;
   if (src_n >= 0 && dep && state->direction == F) {
      struct ir3_compiler *compiler = state->ctx->ir->compiler;
      /* get the dst_n that this corresponds to */
      unsigned dst_n = *dst_n_ptr;
      d = ir3_delayslots_with_repeat(compiler, dep->instr, node->instr, dst_n,
                                     src_n);
      if (is_sy_producer(dep->instr))
         node->has_sy_src = true;
      if (needs_ss(compiler, dep->instr, node->instr))
         node->has_ss_src = true;
   }

   if (src_n >= 0 && dep && state->direction == R) {
      /* If node generates a WAR hazard (because it doesn't consume its
       * sources immediately), dep needs (ss) to sync its dest.  Even though
       * this isn't an (ss) source (but rather a dest), the effect is exactly
       * the same so we model it as such.
       */
      if (is_war_hazard_producer(node->instr)) {
         dep->has_ss_src = true;
      }
   }

   add_dep(state, dep, node, d);
   if (src_n < 0) {
      *dep_ptr = node;
      *dst_n_ptr = dst_n;
   }
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 *
 * If src_n is non-negative, then this adds a dependency on a source register,
 * and src_n is the index passed into ir3_delayslots() for calculating the
 * delay: it corresponds to node->instr->srcs[src_n].  If src_n is negative,
 * then this is for the destination register corresponding to dst_n.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
            struct ir3_postsched_node *node, const struct ir3_register *reg,
            unsigned num, int src_n, int dst_n)
{
   struct ir3_postsched_node **regs;
   unsigned *dst_n_ptr;
   enum ir3_reg_file file;
   unsigned size = reg_elem_size(reg);
   unsigned offset = ir3_reg_file_offset(reg, num, state->merged, &file);
   switch (file) {
   case IR3_FILE_FULL:
      assert(offset + size <= ARRAY_SIZE(state->regs));
      regs = state->regs;
      dst_n_ptr = state->dst_n;
      break;
   case IR3_FILE_HALF:
      assert(offset + 1 <= ARRAY_SIZE(state->half_regs));
      regs = state->half_regs;
      dst_n_ptr = state->half_dst_n;
      break;
   case IR3_FILE_SHARED:
      assert(offset + size <= ARRAY_SIZE(state->shared_regs));
      regs = state->shared_regs;
      dst_n_ptr = state->shared_dst_n;
      break;
   case IR3_FILE_NONGPR:
      assert(offset + size <= ARRAY_SIZE(state->nongpr_regs));
      regs = state->nongpr_regs;
      dst_n_ptr = state->nongpr_dst_n;
      break;
   }

   for (unsigned i = 0; i < size; i++)
      add_single_reg_dep(state, node, &regs[offset + i], &dst_n_ptr[offset + i],
                         num, src_n, dst_n);
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
               struct ir3_postsched_node *node)
{
   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, i, -1);
         }
      } else {
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, i, -1);
         }
      }
   }

   /* And then update the state for what this instruction wrote:
    */
   foreach_dst_n (reg, i, node->instr) {
      if (reg->wrmask == 0)
         continue;
      if (reg->flags & IR3_REG_RT)
         continue;
      if (reg->flags & IR3_REG_RELATIV) {
         /* mark the entire array as written: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, -1, i);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, -1, i);
         }
      }
   }
}

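/*
 * The forward walk adds read-after-write dependencies (with real delays),
 * while the reverse walk below adds write-after-read/write-after-write
 * ordering edges so that later writes cannot be hoisted above earlier
 * readers or writers.
 */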
static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = F,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = R,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr_rev (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_postsched_node *n =
      rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}

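/*
 * Bottom-up DAG traversal callback: max_delay accumulates the largest
 * edge-plus-child delay along any path to a leaf (folding in soft sy/ss
 * latencies), which choose_instr() then uses as a critical-path priority.
 */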
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
   struct ir3_postsched_ctx *ctx = state;
   uint32_t max_delay = 0;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;
      unsigned delay = edge->data;
      unsigned sy_delay = 0;
      unsigned ss_delay = 0;

      if (child->has_sy_src && is_sy_producer(n->instr)) {
         sy_delay = soft_sy_delay(n->instr, ctx->block->shader);
      }

      if (child->has_ss_src &&
          needs_ss(ctx->v->compiler, n->instr, child->instr)) {
         ss_delay = soft_ss_delay(n->instr);
      }

      delay = MAX3(delay, sy_delay, ss_delay);
      max_delay = MAX2(child->max_delay + delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay);
}

static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
   ctx->mem_ctx = ralloc_context(NULL);

   ctx->dag = dag_create(ctx->mem_ctx);

   foreach_instr (instr, &ctx->unscheduled_list)
      sched_node_init(ctx, instr);

   calculate_forward_deps(ctx);
   calculate_reverse_deps(ctx);

   /*
    * To avoid expensive texture fetches, etc., being moved ahead of kills,
    * track the kills we've seen so far, so we can add an extra dependency
    * on them for tex/mem instructions.
    */
   struct util_dynarray kills;
   util_dynarray_init(&kills, ctx->mem_ctx);

   /* The last bary.f with the (ei) flag must be scheduled before any kills,
    * or the hw gets angry.  Keep track of inputs here so we can add the
    * false dep on the kill instruction.
    */
   struct util_dynarray inputs;
   util_dynarray_init(&inputs, ctx->mem_ctx);

   /*
    * Normal srcs won't be in SSA at this point, those are dealt with in
    * calculate_forward_deps() and calculate_reverse_deps().  But we still
    * have the false-dep information in SSA form, so go ahead and add
    * dependencies for that here:
    */
   foreach_instr (instr, &ctx->unscheduled_list) {
      struct ir3_postsched_node *n = instr->data;

      foreach_ssa_src_n (src, i, instr) {
         /* don't consider dependencies in other blocks: */
         if (src->block != instr->block)
            continue;

         /* we can end up with unused false-deps.. just skip them: */
         if (src->flags & IR3_INSTR_UNUSED)
            continue;

         struct ir3_postsched_node *sn = src->data;

         dag_add_edge_max_data(&sn->dag, &n->dag, 0);
      }

      if (is_input(instr)) {
         util_dynarray_append(&inputs, struct ir3_instruction *, instr);
      } else if (is_kill_or_demote(instr)) {
         util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
            struct ir3_instruction *input = *instrp;
            struct ir3_postsched_node *in = input->data;
            dag_add_edge_max_data(&in->dag, &n->dag, 0);
         }
         util_dynarray_append(&kills, struct ir3_instruction *, instr);
      } else if (is_tex(instr) || is_mem(instr)) {
         util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
            struct ir3_instruction *kill = *instrp;
            struct ir3_postsched_node *kn = kill->data;
            dag_add_edge_max_data(&kn->dag, &n->dag, 0);
         }
      }
   }

#ifndef NDEBUG
   dag_validate(ctx->dag, sched_dag_validate_cb, NULL);
#endif

   // TODO do we want to do this after reverse-dependencies?
   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, ctx);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
   ralloc_free(ctx->mem_ctx);
   ctx->mem_ctx = NULL;
   ctx->dag = NULL;
}

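/*
 * Schedule a single block: the terminator and any pre-existing nops are
 * stripped, the dependency DAG is built, meta inputs / tex prefetches /
 * push-const loads are emitted first, and then the remaining instructions
 * are chosen one at a time via choose_instr().
 */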
static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->sy_delay = 0;
   ctx->ss_delay = 0;

   /* The terminator has to stay at the end.  Instead of trying to set up
    * dependencies to achieve this, it's easier to just remove it now and add
    * it back after scheduling.
    */
   struct ir3_instruction *terminator = ir3_block_take_terminator(block);

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting).
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts to go
    * before any other instructions.  But in particular we
    * want inputs to come before prefetches.  This is because
    * an FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (but can be overwritten by a tex prefetch).
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr = choose_instr(ctx);

      unsigned delay = node_delay(ctx, instr->data);
      d("delay=%u", delay);

      assert(delay <= 6);

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);

   if (terminator)
      list_addtail(&terminator->node, &block->instr_list);
}

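/*
 * Detect a plain same-type mov whose src and dst are the same register (no
 * modifiers, no relative addressing, no rounding), ie. a no-op after RA.
 */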
static bool
is_self_mov(struct ir3_instruction *instr)
{
   if (!is_same_type_mov(instr))
      return false;

   if (instr->dsts[0]->num != instr->srcs[0]->num)
      return false;

   if (instr->dsts[0]->flags & IR3_REG_RELATIV)
      return false;

   if (instr->cat1.round != ROUND_ZERO)
      return false;

   if (instr->srcs[0]->flags &
       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG |
        IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
      return false;

   return true;
}

/* Sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y, as a
 * result of places where before RA we are not sure that it is safe to
 * eliminate them.  We could eliminate these earlier, but sometimes they are
 * tangled up in false-deps, etc, so it is easier just to let them exist
 * until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->deps_count; i++) {
            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
               instr->deps[i] = NULL;
            }
         }

         if (is_self_mov(instr))
            list_delinit(&instr->node);
      }
   }
}

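/*
 * Pass entry point: drop self-movs left over from RA, then schedule each
 * block independently.
 */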
bool
ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
   struct ir3_postsched_ctx ctx = {
      .ir = ir,
      .v = v,
   };

   cleanup_self_movs(ir);

   foreach_block (block, &ir->block_list) {
      sched_block(&ctx, block);
   }

   return true;
}