/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...)                                                            \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         mesa_logi("PSCHED: " fmt, ##__VA_ARGS__);                             \
      }                                                                        \
   } while (0)

#define di(instr, fmt, ...)                                                    \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         struct log_stream *stream = mesa_log_streami();                       \
         mesa_log_stream_printf(stream, "PSCHED: " fmt ": ", ##__VA_ARGS__);   \
         ir3_print_instr_stream(stream, instr);                                \
         mesa_log_stream_destroy(stream);                                      \
      }                                                                        \
   } while (0)

/*
 * Post RA Instruction Scheduling
 */

struct ir3_postsched_ctx {
   struct ir3 *ir;

   struct ir3_shader_variant *v;

   void *mem_ctx;
   struct ir3_block *block; /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */

   unsigned ip;

   int ss_delay;
   int sy_delay;
};

struct ir3_postsched_node {
   struct dag_node dag; /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

   unsigned earliest_ip;

   bool has_sy_src, has_ss_src;

   unsigned delay;
   unsigned max_delay;
};

#define foreach_sched_node(__n, __list)                                        \
   list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)

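/* Convenience accessors for the per-node source flags filled in during
 * dependency calculation:
 */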
static bool
has_sy_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_sy_src;
}

static bool
has_ss_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_ss_src;
}

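/* Callback passed to dag_validate() for printing instructions when the
 * scheduling DAG fails validation:
 */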
static void
sched_dag_validate_cb(const struct dag_node *node, void *data)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;

   ir3_print_instr(n->instr);
}

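/* Emit an instruction into the current block: remove it from the unscheduled
 * list, append it to the block, advance the scheduler's cycle count (ip),
 * propagate earliest_ip to the DAG children, and update the soft (ss)/(sy)
 * wait estimates.
 */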
static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   assert(ctx->block == instr->block);

   /* remove from unscheduled_list:
    */
   list_delinit(&instr->node);

   di(instr, "schedule");

   bool counts_for_delay = is_alu(instr) || is_flow(instr);

   unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0;

   struct ir3_postsched_node *n = instr->data;

   /* We insert any nop's needed to get to earliest_ip, then advance
    * delay_cycles by scheduling the instruction.
    */
   ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      unsigned delay = (unsigned)(uintptr_t)edge->data;
      struct ir3_postsched_node *child =
         container_of(edge->child, struct ir3_postsched_node, dag);
      child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay);
   }

   list_addtail(&instr->node, &instr->block->instr_list);

   dag_prune_head(ctx->dag, &n->dag);

   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
      return;

   if (is_ss_producer(instr)) {
      ctx->ss_delay = soft_ss_delay(instr);
   } else if (has_ss_src(instr)) {
      ctx->ss_delay = 0;
   } else if (ctx->ss_delay > 0) {
      ctx->ss_delay--;
   }

   if (is_sy_producer(instr)) {
      ctx->sy_delay = soft_sy_delay(instr, ctx->block->shader);
   } else if (has_sy_src(instr)) {
      ctx->sy_delay = 0;
   } else if (ctx->sy_delay > 0) {
      ctx->sy_delay--;
   }
}

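/* Debug helper: dump the current DAG heads (ready instructions) and their
 * outgoing edges.
 */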
static void
dump_state(struct ir3_postsched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      di(n->instr, "maxdel=%3d ", n->max_delay);

      util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
         struct ir3_postsched_node *child =
            (struct ir3_postsched_node *)edge->child;

         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
      }
   }
}

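/* Number of extra cycles (nops) needed before 'n' can be issued at the
 * current ip, based only on hard delay constraints:
 */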
static unsigned
node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   return MAX2(n->earliest_ip, ctx->ip) - ctx->ip;
}

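/* Like node_delay(), but also including the soft (ss)/(sy) wait estimates,
 * which we would rather hide behind other useful work than stall on:
 */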
static unsigned
node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   unsigned delay = node_delay(ctx, n);

   /* This takes into account that when we schedule multiple tex or sfu
    * instructions, the first user has to wait for all of them to complete.
    */
   if (n->has_ss_src)
      delay = MAX2(delay, ctx->ss_delay);
   if (n->has_sy_src)
      delay = MAX2(delay, ctx->sy_delay);

   return delay;
}

/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay(ctx, n);

      if (d > 0)
         continue;

      if (!is_kill_or_demote(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 0)
         continue;

      if (!(is_ss_producer(n->instr) || is_sy_producer(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch which can be synchronized w/ the sync
    * bit), although we probably do want to schedule some other instructions
    * while we wait.  We also allow a small number of nops, to prefer
    * now-nops over future-nops up to a point, as that gives better results.
    */
   unsigned chosen_delay = 0;
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 3)
         continue;

      if (!chosen || d < chosen_delay) {
         chosen = n;
         chosen_delay = d;
         continue;
      }

      if (d > chosen_delay)
         continue;

      if (chosen->max_delay < n->max_delay) {
         chosen = n;
         chosen_delay = d;
      }
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader that can be scheduled without nops,
    * which in the case of things that need (sy)/(ss) could result in
    * stalls.. but we've already decided there is not a better option.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay(ctx, n);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (hard ready)");
      return chosen->instr;
   }

   /* Otherwise choose leader with maximum cost:
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}

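/* State for a single (forward or reverse) dependency-calculation pass over a
 * block:
 */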
struct ir3_postsched_deps_state {
   struct ir3_postsched_ctx *ctx;

   enum { F, R } direction;

   bool merged;

   /* Track the sched node (instruction) that last wrote a given register
    * (in whichever direction we are iterating the block).
    *
    * Note, this table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether the
    * half and full precision register files are "merged" (ie. conflicting,
    * a6xx+), in which case we consider each full precision dep as two
    * half-precision dependencies, vs the older separate (non-conflicting)
    * register files, in which case the first half of the table is used for
    * full precision and the 2nd half for half-precision.
    */
   struct ir3_postsched_node *regs[2 * 256];
   unsigned dst_n[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx)                                                    \
   *({                                                                         \
      assert((idx) < ARRAY_SIZE((state)->regs));                               \
      &(state)->regs[(idx)];                                                   \
   })

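/* Add a DAG edge between two sched nodes.  In the forward direction the edge
 * carries the required delay; in the reverse direction (ordering-only deps)
 * no delay is needed.
 */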
static void
add_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *before, struct ir3_postsched_node *after,
        unsigned d)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->direction == F) {
      dag_add_edge_max_data(&before->dag, &after->dag, (uintptr_t)d);
   } else {
      dag_add_edge_max_data(&after->dag, &before->dag, 0);
   }
}

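/* Record a dependency for a single register slot: 'num' indexes the regs[]
 * table, and src_n/dst_n identify which source or destination of 'node' is
 * involved (src_n < 0 means this is a write, which also updates the table).
 */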
static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
                   struct ir3_postsched_node *node, unsigned num, int src_n,
                   int dst_n)
{
   struct ir3_postsched_node *dep = dep_reg(state, num);

   unsigned d = 0;
   if (src_n >= 0 && dep && state->direction == F) {
      /* get the dst_n this corresponds to */
      unsigned dst_n = state->dst_n[num];
      unsigned d_soft = ir3_delayslots(dep->instr, node->instr, src_n, true);
      d = ir3_delayslots_with_repeat(dep->instr, node->instr, dst_n, src_n);
      node->delay = MAX2(node->delay, d_soft);
      if (is_sy_producer(dep->instr))
         node->has_sy_src = true;
      if (is_ss_producer(dep->instr))
         node->has_ss_src = true;
   }

   add_dep(state, dep, node, d);
   if (src_n < 0) {
      dep_reg(state, num) = node;
      state->dst_n[num] = dst_n;
   }
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 *
 * If src_n is non-negative, then this adds a dependency on a source register,
 * and src_n is the index passed into ir3_delayslots() for calculating the
 * delay: it corresponds to node->instr->srcs[src_n].  If src_n is negative,
 * then this is for the destination register corresponding to dst_n.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
            struct ir3_postsched_node *node, const struct ir3_register *reg,
            unsigned num, int src_n, int dst_n)
{
   if (state->merged) {
      /* Make sure that special registers like a0.x that are written as
       * half-registers don't alias random full registers by pretending that
       * they're full registers:
       */
      if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) {
         /* single conflict in half-reg space: */
         add_single_reg_dep(state, node, num, src_n, dst_n);
      } else {
         /* two conflicts in half-reg space: */
         add_single_reg_dep(state, node, 2 * num + 0, src_n, dst_n);
         add_single_reg_dep(state, node, 2 * num + 1, src_n, dst_n);
      }
   } else {
      if (reg->flags & IR3_REG_HALF)
         num += ARRAY_SIZE(state->regs) / 2;
      add_single_reg_dep(state, node, num, src_n, dst_n);
   }
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
               struct ir3_postsched_node *node)
{
   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, i, -1);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, i, -1);
         }
      }
   }

   /* And then update the state for what this instruction wrote:
    */
   foreach_dst_n (reg, i, node->instr) {
      if (reg->wrmask == 0)
         continue;
      if (reg->flags & IR3_REG_RELATIV) {
         /* mark the entire array as written: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, -1, i);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, -1, i);
         }
      }
   }
}

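/* Forward pass over the block: adds read-after-write (and write-after-write)
 * dependencies, along with the associated delays.
 */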
static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = F,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

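/* Reverse pass over the block: adds the ordering-only dependencies that keep
 * reads ahead of later writes (write-after-read).
 */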
static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = R,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr_rev (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

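/* Allocate a sched node for an instruction and link the two together: */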
static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_postsched_node *n =
      rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}

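/* Bottom-up DAG traversal callback: max_delay accumulates the delay along the
 * longest path from this node to the end of the block, and is used as the
 * priority ("cost") when choosing between ready instructions.
 */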
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

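/* Build the scheduling DAG for the current block: create a node per
 * instruction, compute register dependencies in both directions, add the
 * extra ordering edges (false deps, inputs vs kills, kills vs tex/mem), and
 * finally compute per-node max_delay.
 */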
static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
   ctx->mem_ctx = ralloc_context(NULL);

   ctx->dag = dag_create(ctx->mem_ctx);

   foreach_instr (instr, &ctx->unscheduled_list)
      sched_node_init(ctx, instr);

   calculate_forward_deps(ctx);
   calculate_reverse_deps(ctx);

   /*
    * To avoid expensive texture fetches, etc, from being moved ahead
    * of kills, track the kills we've seen so far, so we can add an
    * extra dependency on them for tex/mem instructions.
    */
   struct util_dynarray kills;
   util_dynarray_init(&kills, ctx->mem_ctx);

   /* The last bary.f with the (ei) flag must be scheduled before any kills,
    * or the hw gets angry.  Keep track of inputs here so we can add the
    * false dep on the kill instruction.
    */
   struct util_dynarray inputs;
   util_dynarray_init(&inputs, ctx->mem_ctx);

   /*
    * Normal srcs won't be in SSA at this point, those are dealt with in
    * calculate_forward_deps() and calculate_reverse_deps().  But we still
    * have the false-dep information in SSA form, so go ahead and add
    * dependencies for that here:
    */
   foreach_instr (instr, &ctx->unscheduled_list) {
      struct ir3_postsched_node *n = instr->data;

      foreach_ssa_src_n (src, i, instr) {
         /* don't consider dependencies in other blocks: */
         if (src->block != instr->block)
            continue;

         /* we can end up with unused false-deps.. just skip them: */
         if (src->flags & IR3_INSTR_UNUSED)
            continue;

         struct ir3_postsched_node *sn = src->data;

         dag_add_edge_max_data(&sn->dag, &n->dag, 0);
      }

      if (is_input(instr)) {
         util_dynarray_append(&inputs, struct ir3_instruction *, instr);
      } else if (is_kill_or_demote(instr)) {
         util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
            struct ir3_instruction *input = *instrp;
            struct ir3_postsched_node *in = input->data;
            dag_add_edge_max_data(&in->dag, &n->dag, 0);
         }
         util_dynarray_append(&kills, struct ir3_instruction *, instr);
      } else if (is_tex(instr) || is_mem(instr)) {
         util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
            struct ir3_instruction *kill = *instrp;
            struct ir3_postsched_node *kn = kill->data;
            dag_add_edge_max_data(&kn->dag, &n->dag, 0);
         }
      }
   }

   dag_validate(ctx->dag, sched_dag_validate_cb, NULL);

   // TODO do we want to do this after reverse-dependencies?
   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
   ralloc_free(ctx->mem_ctx);
   ctx->mem_ctx = NULL;
   ctx->dag = NULL;
}

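/* Schedule a single block: build the DAG, then repeatedly pick the next
 * instruction with choose_instr() until the unscheduled list is empty.
 */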
static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->sy_delay = 0;
   ctx->ss_delay = 0;

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting).
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
      case OPC_B:
      case OPC_JUMP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts to go
    * before any other instructions.  But in particular we
    * want inputs to come before prefetches.  This is because
    * a FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (but can be overwritten by a tex prefetch).
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr = choose_instr(ctx);

      unsigned delay = node_delay(ctx, instr->data);
      d("delay=%u", delay);

      assert(delay <= 6);

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);
}

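/* Detect a same-type mov whose src and dst are the same register, with no
 * relative addressing, rounding, or src modifiers, ie. a true no-op:
 */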
static bool
is_self_mov(struct ir3_instruction *instr)
{
   if (!is_same_type_mov(instr))
      return false;

   if (instr->dsts[0]->num != instr->srcs[0]->num)
      return false;

   if (instr->dsts[0]->flags & IR3_REG_RELATIV)
      return false;

   if (instr->cat1.round != ROUND_ZERO)
      return false;

   if (instr->srcs[0]->flags &
       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG |
        IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
      return false;

   return true;
}

/* Sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y, as a
 * result of places where before RA we are not sure that it is safe to
 * eliminate them.  We could eliminate these earlier, but sometimes they are
 * tangled up in false-deps, etc, so it is easier just to let them exist
 * until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->deps_count; i++) {
            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
               instr->deps[i] = NULL;
            }
         }

         if (is_self_mov(instr))
            list_delinit(&instr->node);
      }
   }
}

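/* Post-RA scheduling entry point: */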
bool
ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
   struct ir3_postsched_ctx ctx = {
      .ir = ir,
      .v = v,
   };

   cleanup_self_movs(ir);

   foreach_block (block, &ir->block_list) {
      sched_block(&ctx, block);
   }

   return true;
}