/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...)                                                            \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         mesa_logi("PSCHED: " fmt, ##__VA_ARGS__);                             \
      }                                                                        \
   } while (0)

#define di(instr, fmt, ...)                                                    \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         struct log_stream *stream = mesa_log_streami();                       \
         mesa_log_stream_printf(stream, "PSCHED: " fmt ": ", ##__VA_ARGS__);   \
         ir3_print_instr_stream(stream, instr);                                \
         mesa_log_stream_destroy(stream);                                      \
      }                                                                        \
   } while (0)

/*
 * Post RA Instruction Scheduling
 */
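
/* A sketch of how this pass operates (summarized from the code below, not a
 * strict contract): for each block we build a DAG whose edges are the post-RA
 * register dependencies (plus a few false deps, e.g. kills vs. tex/mem), then
 * greedily pick from the ready "heads" of the DAG, preferring choices that
 * avoid nop's and (ss)/(sy) syncs, and otherwise preferring the head with the
 * longest remaining dependency chain (max_delay).
 */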

struct ir3_postsched_ctx {
   struct ir3 *ir;

   struct ir3_shader_variant *v;

   void *mem_ctx;
   struct ir3_block *block; /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */

   int sfu_delay;
   int tex_delay;
};
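
/* sfu_delay/tex_delay above are rough countdowns of how costly it would still
 * be to sync against the most recently scheduled SFU/tex instruction; see
 * schedule() and would_sync() for how they are set, cleared and decremented.
 */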

struct ir3_postsched_node {
   struct dag_node dag; /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

   bool has_tex_src, has_sfu_src;

   unsigned delay;
   unsigned max_delay;
};
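
/* For each node, 'delay' is the number of cycles it must wait on its own
 * sources (the max ir3_delayslots() over its register deps, filled in by
 * add_single_reg_dep()), while 'max_delay' accumulates delay along the
 * longest chain of dependent instructions below it (sched_dag_max_delay_cb()).
 * choose_instr() uses max_delay as its tie-breaker, so critical-path
 * instructions tend to get scheduled first.
 */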

#define foreach_sched_node(__n, __list)                                        \
   list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)

static bool
has_tex_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_tex_src;
}

static bool
has_sfu_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_sfu_src;
}

static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   debug_assert(ctx->block == instr->block);

   /* remove from unscheduled_list:
    */
   list_delinit(&instr->node);

   di(instr, "schedule");

   list_addtail(&instr->node, &instr->block->instr_list);

   struct ir3_postsched_node *n = instr->data;
   dag_prune_head(ctx->dag, &n->dag);

   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
      return;

   if (is_sfu(instr)) {
      ctx->sfu_delay = 8;
   } else if (has_sfu_src(instr)) {
      ctx->sfu_delay = 0;
   } else if (ctx->sfu_delay > 0) {
      ctx->sfu_delay--;
   }

   if (is_tex_or_prefetch(instr)) {
      ctx->tex_delay = 10;
   } else if (has_tex_src(instr)) {
      ctx->tex_delay = 0;
   } else if (ctx->tex_delay > 0) {
      ctx->tex_delay--;
   }
}
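
/* A worked example of the counters above (a sketch; the constants are just
 * the 8/10 used in schedule()): scheduling an SFU sets sfu_delay=8; each
 * following unrelated instruction decrements it; if an instruction consuming
 * the SFU result is scheduled, sfu_delay is reset to 0 since the (ss) has
 * now been paid for.  While sfu_delay is still non-zero, would_sync()
 * reports that scheduling a consumer would force a costly sync, and
 * choose_instr() tries to find something else (possibly even a nop) instead.
 * tex_delay works the same way with a start value of 10 and the (sy) flag.
 */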

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      di(n->instr, "maxdel=%3d ", n->max_delay);

      util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
         struct ir3_postsched_node *child =
            (struct ir3_postsched_node *)edge->child;

         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
      }
   }
}

/* Determine if this is an instruction that we'd prefer not to schedule
 * yet, in order to avoid an (ss)/(sy) sync.  This is limited by the
 * sfu_delay/tex_delay counters, ie. the more cycles it has been since
 * the last SFU/tex, the less costly a sync would be.
 */
static bool
would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   if (ctx->sfu_delay) {
      if (has_sfu_src(instr))
         return true;
   }

   if (ctx->tex_delay) {
      if (has_tex_src(instr))
         return true;
   }

   return false;
}
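
/* Terminology used below: a node is "hard ready" when ir3_delay_calc_postra()
 * with its soft flag false (the third argument) returns 0, ie. the
 * instruction could be issued right now without inserting any nop's.  "Soft
 * ready" additionally counts estimated latency for results that would
 * otherwise just be covered by an (ss)/(sy) sync flag, so preferring
 * soft-ready instructions tends to hide those stalls behind useful work.
 */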

/* find instruction to schedule: */
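/* Roughly, the priority order used below (all candidates are DAG heads,
 * ie. all of their dependencies have already been scheduled):
 *  1. meta instructions
 *  2. inputs (the last bary.f releases varying storage for new warps)
 *  3. hard-ready kill/demote
 *  4. hard-ready SFU/tex, to get long-latency work started early
 *  5. a non-syncing instruction within a few cycles of ready, in
 *     preference to one that would force an (ss)/(sy) sync
 *  6. anything soft ready, then anything hard ready
 *  7. otherwise the head with the largest max_delay
 */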
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!is_kill_or_demote(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!(is_sfu(n->instr) || is_tex(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
      return chosen->instr;
   }

   /*
    * Sometimes it is better to take a nop, rather than scheduling an
    * instruction that would require an (ss) shortly after another
    * SFU.  Ie. if the last SFU was just one or two instructions ago,
    * and we could choose between taking a nop and then scheduling
    * something else, vs scheduling the immediately available instruction
    * that would require (ss), we are better off with the nop.
    */
   for (unsigned delay = 0; delay < 4; delay++) {
      foreach_sched_node (n, &ctx->dag->heads) {
         if (would_sync(ctx, n->instr))
            continue;

         unsigned d = ir3_delay_calc_postra(ctx->block, n->instr, true,
                                            ctx->v->mergedregs);

         if (d > delay)
            continue;

         if (!chosen || (chosen->max_delay < n->max_delay))
            chosen = n;
      }

      if (chosen) {
         di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
         return chosen->instr;
      }
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch which can be synchronized w/ a sync
    * bit, but we probably do want to schedule some other instructions
    * while we wait).
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader that can be scheduled without nop's,
    * which in the case of things that need (sy)/(ss) could result in
    * stalls.. but we've already decided there is not a better option.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (hard ready)");
      return chosen->instr;
   }

   /* Otherwise choose the leader with maximum cost:
    *
    * TODO should we try to balance cost and delays?  I guess it is
    * a balance between now-nop's and future-nop's?
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}

struct ir3_postsched_deps_state {
   struct ir3_postsched_ctx *ctx;

   enum { F, R } direction;

   bool merged;

   /* Track the mapping between sched node (instruction) that last
    * wrote a given register (in whichever direction we are iterating
    * the block)
    *
    * Note, this table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether
    * the half and full precision register files are "merged" (conflict,
    * ie. a6xx+) in which case we consider each full precision dep
    * as two half-precision dependencies, vs older separate (non-
    * conflicting) in which case the first half of the table is used
    * for full precision and 2nd half for half-precision.
    */
   struct ir3_postsched_node *regs[2 * 256];
};
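
/* The indexing scheme used by add_reg_dep()/add_single_reg_dep(), spelled
 * out with an example: with merged register files (a6xx+) everything is
 * tracked at half-reg granularity, so a full-precision access to register
 * number N occupies slots 2*N and 2*N+1 and therefore conflicts with the
 * two half regs that alias it, while a half-precision reg just uses slot N.
 * With separate register files, full regs use the first half of the table
 * and half regs are offset by ARRAY_SIZE(regs)/2 into the second half.
 */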

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx)                                                    \
   *({                                                                         \
      assert((idx) < ARRAY_SIZE((state)->regs));                               \
      &(state)->regs[(idx)];                                                   \
   })

static void
add_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *before, struct ir3_postsched_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->direction == F) {
      dag_add_edge(&before->dag, &after->dag, NULL);
   } else {
      dag_add_edge(&after->dag, &before->dag, NULL);
   }
}
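
/* Note: in the reverse (R) direction add_dep() swaps its arguments, so no
 * matter which way the block is walked, the resulting DAG edge always goes
 * from the instruction that must be scheduled first to the one that has to
 * wait for it.
 */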

static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
                   struct ir3_postsched_node *node, unsigned num, int src_n)
{
   struct ir3_postsched_node *dep = dep_reg(state, num);

   if (src_n >= 0 && dep && state->direction == F) {
      unsigned d = ir3_delayslots(dep->instr, node->instr, src_n, true);
      node->delay = MAX2(node->delay, d);
      if (is_tex_or_prefetch(dep->instr))
         node->has_tex_src = true;
      if (is_sfu(dep->instr))
         node->has_sfu_src = true;
   }

   add_dep(state, dep, node);
   if (src_n < 0) {
      dep_reg(state, num) = node;
   }
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 *
 * If src_n is non-negative, then this adds a dependency on a source
 * register, and src_n is the index passed into ir3_delayslots() for
 * calculating the delay; it corresponds to node->instr->srcs[src_n].
 * If negative, then this is for a destination register.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
            struct ir3_postsched_node *node, const struct ir3_register *reg,
            unsigned num, int src_n)
{
   if (state->merged) {
      /* Make sure that special registers like a0.x that are written as
       * half-registers don't alias random full registers by pretending that
       * they're full registers:
       */
      if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) {
         /* single conflict in half-reg space: */
         add_single_reg_dep(state, node, num, src_n);
      } else {
         /* two conflicts in half-reg space: */
         add_single_reg_dep(state, node, 2 * num + 0, src_n);
         add_single_reg_dep(state, node, 2 * num + 1, src_n);
      }
   } else {
      if (reg->flags & IR3_REG_HALF)
         num += ARRAY_SIZE(state->regs) / 2;
      add_single_reg_dep(state, node, num, src_n);
   }
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
               struct ir3_postsched_node *node)
{
   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, i);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, i);
         }
      }
   }

   /* And then update the state for what this instruction wrote:
    */
   foreach_dst (reg, node->instr) {
      if (reg->wrmask == 0)
         continue;
      if (reg->flags & IR3_REG_RELATIV) {
         /* mark the entire array as written: */
         for (unsigned i = 0; i < reg->size; i++) {
            add_reg_dep(state, node, reg, reg->array.base + i, -1);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, -1);
         }
      }
   }
}

static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = F,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = R,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr_rev (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}
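
/* Taken together: the forward pass adds read-after-write and
 * write-after-write dependencies (each access depends on the most recent
 * writer of the register), while the reverse pass, by walking the block
 * backwards, adds the write-after-read edges, so a write can't be hoisted
 * above an earlier read of the same register.  Post-RA all three hazard
 * kinds matter, since values live in fixed registers.
 */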

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_postsched_node *n =
      rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}

static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}
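
/* For instance (numbers purely illustrative): with a chain a -> b -> c where
 * b must wait 3 cycles on a, and c must wait 2 cycles on b, the bottom-up
 * traversal gives c a max_delay of 2 and both b and a a max_delay of 5, so
 * choose_instr() will prefer to start that chain over a head with nothing
 * expensive depending on it.
 */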

static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
   ctx->mem_ctx = ralloc_context(NULL);

   ctx->dag = dag_create(ctx->mem_ctx);

   foreach_instr (instr, &ctx->unscheduled_list)
      sched_node_init(ctx, instr);

   calculate_forward_deps(ctx);
   calculate_reverse_deps(ctx);

   /*
    * To avoid expensive texture fetches, etc., being moved ahead
    * of kills, track the kills we've seen so far, so we can add an
    * extra dependency on them for tex/mem instructions
    */
   struct util_dynarray kills;
   util_dynarray_init(&kills, ctx->mem_ctx);

   /* The last bary.f with the (ei) flag must be scheduled before any kills,
    * or the hw gets angry.  Keep track of inputs here so we can add the
    * false dep on the kill instruction.
    */
   struct util_dynarray inputs;
   util_dynarray_init(&inputs, ctx->mem_ctx);

   /*
    * Normal srcs won't be in SSA at this point, those are dealt with in
    * calculate_forward_deps() and calculate_reverse_deps().  But we still
    * have the false-dep information in SSA form, so go ahead and add
    * dependencies for that here:
    */
   foreach_instr (instr, &ctx->unscheduled_list) {
      struct ir3_postsched_node *n = instr->data;

      foreach_ssa_src_n (src, i, instr) {
         /* don't consider dependencies in other blocks: */
         if (src->block != instr->block)
            continue;

         /* we can end up with unused false-deps.. just skip them: */
         if (src->flags & IR3_INSTR_UNUSED)
            continue;

         struct ir3_postsched_node *sn = src->data;

         dag_add_edge(&sn->dag, &n->dag, NULL);
      }

      if (is_input(instr)) {
         util_dynarray_append(&inputs, struct ir3_instruction *, instr);
      } else if (is_kill_or_demote(instr)) {
         util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
            struct ir3_instruction *input = *instrp;
            struct ir3_postsched_node *in = input->data;
            dag_add_edge(&in->dag, &n->dag, NULL);
         }
         util_dynarray_append(&kills, struct ir3_instruction *, instr);
      } else if (is_tex(instr) || is_mem(instr)) {
         util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
            struct ir3_instruction *kill = *instrp;
            struct ir3_postsched_node *kn = kill->data;
            dag_add_edge(&kn->dag, &n->dag, NULL);
         }
      }
   }

   // TODO do we want to do this after reverse-dependencies?
   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
   ralloc_free(ctx->mem_ctx);
   ctx->mem_ctx = NULL;
   ctx->dag = NULL;
}

static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->tex_delay = 0;
   ctx->sfu_delay = 0;

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting).
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
      case OPC_B:
      case OPC_JUMP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts to go
    * before any other instructions.  But in particular we
    * want inputs to come before prefetches.  This is because
    * a FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (but can be overwritten by a tex prefetch)
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr = choose_instr(ctx);

      unsigned delay =
         ir3_delay_calc_postra(ctx->block, instr, false, ctx->v->mergedregs);
      d("delay=%u", delay);

      /* and if we run out of instructions that can be scheduled,
       * then it is time for nop's:
       */
      debug_assert(delay <= 6);
      while (delay > 0) {
         ir3_NOP(block);
         delay--;
      }

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);
}

static bool
is_self_mov(struct ir3_instruction *instr)
{
   if (!is_same_type_mov(instr))
      return false;

   if (instr->dsts[0]->num != instr->srcs[0]->num)
      return false;

   if (instr->dsts[0]->flags & IR3_REG_RELATIV)
      return false;

   if (instr->cat1.round != ROUND_ZERO)
      return false;

   if (instr->srcs[0]->flags &
       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG |
        IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
      return false;

   return true;
}

/* Sometimes we end up with in-place mov's, ie. mov.u32u32 r1.y, r1.y,
 * as a result of cases where before RA we are not sure that it is
 * safe to eliminate them.  We could eliminate these earlier, but
 * sometimes they are tangled up in false-dep's, etc, so it is easier
 * just to let them exist until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->deps_count; i++) {
            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
               instr->deps[i] = NULL;
            }
         }

         if (is_self_mov(instr))
            list_delinit(&instr->node);
      }
   }
}

bool
ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
   struct ir3_postsched_ctx ctx = {
      .ir = ir,
      .v = v,
   };

   ir3_remove_nops(ir);
   cleanup_self_movs(ir);

   foreach_block (block, &ir->block_list) {
      sched_block(&ctx, block);
   }

   return true;
}