/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
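
/* Debug logging helpers: d() prints a plain message, and di() prints a
 * message followed by the instruction in question; both are only active
 * when IR3_DBG_SCHEDMSGS is enabled.
 */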
#define d(fmt, ...)                                                            \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         mesa_logi("PSCHED: " fmt, ##__VA_ARGS__);                             \
      }                                                                        \
   } while (0)

#define di(instr, fmt, ...)                                                    \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         struct log_stream *stream = mesa_log_streami();                       \
         mesa_log_stream_printf(stream, "PSCHED: " fmt ": ", ##__VA_ARGS__);   \
         ir3_print_instr_stream(stream, instr);                                \
         mesa_log_stream_destroy(stream);                                      \
      }                                                                        \
   } while (0)

/*
 * Post RA Instruction Scheduling
 */
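/*
 * This pass runs after register allocation, so dependencies are tracked on
 * physical registers rather than SSA values.  For each block we build a DAG
 * of register dependencies (scanning the block in both the forward and
 * reverse direction), then list-schedule from the DAG heads, picking the
 * next instruction via the priority heuristics in choose_instr(), which try
 * to hide (ss)/(sy) latency while avoiding unnecessary nops.
 */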

struct ir3_postsched_ctx {
   struct ir3 *ir;

   struct ir3_shader_variant *v;

   void *mem_ctx;
   struct ir3_block *block; /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */

   unsigned ip;

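   /* Estimated cycles remaining until outstanding (ss)/(sy) producers
    * complete; used by node_delay_soft() to hold back their first consumer:
    */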
   int ss_delay;
   int sy_delay;
};

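/* Per-instruction scheduling state.  earliest_ip is the soonest ip at which
 * this instruction could be scheduled given its already-scheduled producers,
 * delay is the largest soft delay required by any of its sources, and
 * max_delay accumulates delay along the longest path to a DAG leaf (the
 * critical path), which is the main scheduling priority.
 */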
struct ir3_postsched_node {
   struct dag_node dag; /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

   unsigned earliest_ip;

   bool has_sy_src, has_ss_src;

   unsigned delay;
   unsigned max_delay;
};

#define foreach_sched_node(__n, __list)                                        \
   list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)

static bool
has_sy_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_sy_src;
}

static bool
has_ss_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_ss_src;
}

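/* Schedule a single instruction: move it from the unscheduled list to the
 * end of the block, advance the ip past any nops needed to reach its
 * earliest_ip, propagate the resulting ip (plus edge delay) to its children
 * in the DAG, and update the (ss)/(sy) soft-delay trackers.
 */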
static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   assert(ctx->block == instr->block);

   /* remove from unscheduled_list:
    */
   list_delinit(&instr->node);

   di(instr, "schedule");

   bool counts_for_delay = is_alu(instr) || is_flow(instr);

   unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0;

   struct ir3_postsched_node *n = instr->data;

   /* We insert any nops needed to get to earliest_ip, then advance the ip
    * by delay_cycles to account for scheduling this instruction.
    */
   ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      unsigned delay = (unsigned)(uintptr_t)edge->data;
      struct ir3_postsched_node *child =
         container_of(edge->child, struct ir3_postsched_node, dag);
      child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay);
   }

   list_addtail(&instr->node, &instr->block->instr_list);

   dag_prune_head(ctx->dag, &n->dag);

   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
      return;

   if (is_ss_producer(instr)) {
      ctx->ss_delay = soft_ss_delay(instr);
   } else if (has_ss_src(instr)) {
      ctx->ss_delay = 0;
   } else if (ctx->ss_delay > 0) {
      ctx->ss_delay--;
   }

   if (is_sy_producer(instr)) {
      ctx->sy_delay = soft_sy_delay(instr, ctx->block->shader);
   } else if (has_sy_src(instr)) {
      ctx->sy_delay = 0;
   } else if (ctx->sy_delay > 0) {
      ctx->sy_delay--;
   }
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      di(n->instr, "maxdel=%3d    ", n->max_delay);

      util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
         struct ir3_postsched_node *child =
            (struct ir3_postsched_node *)edge->child;

         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
      }
   }
}

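/* Number of nop cycles that would be needed before n could be scheduled at
 * the current ip, ie. the hard delay remaining from its already-scheduled
 * producers.
 */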
static unsigned
node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   return MAX2(n->earliest_ip, ctx->ip) - ctx->ip;
}

static unsigned
node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   unsigned delay = node_delay(ctx, n);

   /* This takes into account that when we schedule multiple tex or sfu
    * instructions, the first user has to wait for all of them to complete.
    */
   if (n->has_ss_src)
      delay = MAX2(delay, ctx->ss_delay);
   if (n->has_sy_src)
      delay = MAX2(delay, ctx->sy_delay);

   return delay;
}

/* find instruction to schedule: */
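/* Priority order, as implemented below: meta instructions first, then
 * inputs (so the last bary.f retires as early as possible), then kills that
 * are ready without nops, then (ss)/(sy) producers that are soft-ready,
 * then nodes within a few cycles of being soft-ready, then anything ready
 * without nops, and finally whichever remaining head has the largest
 * max_delay (longest critical path).
 */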
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay(ctx, n);

      if (d > 0)
         continue;

      if (!is_kill_or_demote(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 0)
         continue;

      if (!(is_ss_producer(n->instr) || is_sy_producer(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch, which can be synchronized w/ the sync
    * bit, but where we probably do want to schedule some other instructions
    * while we wait).  We also allow a small number of nops, to prefer
    * now-nops over future-nops up to a point, as that gives better results.
    */
   unsigned chosen_delay = 0;
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 3)
         continue;

      if (!chosen || d < chosen_delay) {
         chosen = n;
         chosen_delay = d;
         continue;
      }

      if (d > chosen_delay)
         continue;

      if (chosen->max_delay < n->max_delay) {
         chosen = n;
         chosen_delay = d;
      }
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader that can be scheduled without nop's,
    * which in the case of things that need (sy)/(ss) could result in
    * stalls.. but we've already decided there is not a better option.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay(ctx, n);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (hard ready)");
      return chosen->instr;
   }

   /* Otherwise choose leader with maximum cost:
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}

struct ir3_postsched_deps_state {
   struct ir3_postsched_ctx *ctx;

   enum { F, R } direction;

   bool merged;

   /* Tracks the sched node (instruction) that last wrote a given register
    * (in whichever direction we are iterating the block).
    *
    * Note, this table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether the
    * half and full precision register files are "merged" (conflicting,
    * ie. a6xx+), in which case we treat each full precision dep as two
    * half-precision dependencies, vs the older separate (non-conflicting)
    * files, in which case the first half of the table is used for full
    * precision and the 2nd half for half-precision.
    */
   struct ir3_postsched_node *regs[2 * 256];
   unsigned dst_n[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx)                                                    \
   *({                                                                         \
      assert((idx) < ARRAY_SIZE((state)->regs));                               \
      &(state)->regs[(idx)];                                                   \
   })

static void
add_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *before, struct ir3_postsched_node *after,
        unsigned d)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->direction == F) {
      dag_add_edge_max_data(&before->dag, &after->dag, (uintptr_t)d);
   } else {
      dag_add_edge_max_data(&after->dag, &before->dag, 0);
   }
}

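/* Record a dependency between 'node' and the last writer of a single
 * register slot ('num' indexes the regs[] table).  For source deps
 * (src_n >= 0) in the forward direction this also computes the required
 * delay and flags (ss)/(sy) sources; for dest deps (src_n < 0) it updates
 * the table so that later readers and writers depend on this node.
 */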
static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
                   struct ir3_postsched_node *node, unsigned num, int src_n,
                   int dst_n)
{
   struct ir3_postsched_node *dep = dep_reg(state, num);

   unsigned d = 0;
   if (src_n >= 0 && dep && state->direction == F) {
      /* get the dst_n this corresponds to */
      unsigned dst_n = state->dst_n[num];
      unsigned d_soft = ir3_delayslots(dep->instr, node->instr, src_n, true);
      d = ir3_delayslots_with_repeat(dep->instr, node->instr, dst_n, src_n);
      node->delay = MAX2(node->delay, d_soft);
      if (is_sy_producer(dep->instr))
         node->has_sy_src = true;
      if (is_ss_producer(dep->instr))
         node->has_ss_src = true;
   }

   add_dep(state, dep, node, d);
   if (src_n < 0) {
      dep_reg(state, num) = node;
      state->dst_n[num] = dst_n;
   }
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 *
 * If src_n is non-negative, then this adds a dependency on a source register,
 * and src_n is the index passed into ir3_delayslots() for calculating the
 * delay: it corresponds to node->instr->srcs[src_n]. If src_n is negative,
 * then this is for the destination register corresponding to dst_n.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
            struct ir3_postsched_node *node, const struct ir3_register *reg,
            unsigned num, int src_n, int dst_n)
{
   if (state->merged) {
      /* Make sure that special registers like a0.x that are written as
       * half-registers don't alias random full registers by pretending that
       * they're full registers:
       */
      if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) {
         /* single conflict in half-reg space: */
         add_single_reg_dep(state, node, num, src_n, dst_n);
      } else {
         /* two conflicts in half-reg space: */
         add_single_reg_dep(state, node, 2 * num + 0, src_n, dst_n);
         add_single_reg_dep(state, node, 2 * num + 1, src_n, dst_n);
      }
   } else {
      if (reg->flags & IR3_REG_HALF)
         num += ARRAY_SIZE(state->regs) / 2;
      add_single_reg_dep(state, node, num, src_n, dst_n);
   }
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
               struct ir3_postsched_node *node)
{
   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, i, -1);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, i, -1);
         }
      }
   }

   /* And then afterwards, update the state for what this instruction
    * wrote:
    */
   foreach_dst_n (reg, i, node->instr) {
      if (reg->wrmask == 0)
         continue;
      if (reg->flags & IR3_REG_RELATIV) {
         /* mark the entire array as written: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, -1, i);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, -1, i);
         }
      }
   }
}

static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = F,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = R,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr_rev (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_postsched_node *n =
      rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}

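/* Bottom-up DAG traversal callback: accumulate each node's max_delay as its
 * own delay plus the largest max_delay among its children, ie. the length of
 * the critical path from this node to a leaf of the DAG.
 */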
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
   ctx->mem_ctx = ralloc_context(NULL);

   ctx->dag = dag_create(ctx->mem_ctx);

   foreach_instr (instr, &ctx->unscheduled_list)
      sched_node_init(ctx, instr);

   calculate_forward_deps(ctx);
   calculate_reverse_deps(ctx);

   /*
    * To prevent expensive texture fetches, etc, from being moved ahead
    * of kills, track the kills we've seen so far, so we can add an
    * extra dependency on them for tex/mem instructions.
    */
   struct util_dynarray kills;
   util_dynarray_init(&kills, ctx->mem_ctx);

   /* The last bary.f with the (ei) flag must be scheduled before any kills,
    * or the hw gets angry. Keep track of inputs here so we can add the
    * false dep on the kill instruction.
    */
   struct util_dynarray inputs;
   util_dynarray_init(&inputs, ctx->mem_ctx);

   /*
    * Normal srcs won't be in SSA at this point, those are dealt with in
    * calculate_forward_deps() and calculate_reverse_deps().  But we still
    * have the false-dep information in SSA form, so go ahead and add
    * dependencies for that here:
    */
   foreach_instr (instr, &ctx->unscheduled_list) {
      struct ir3_postsched_node *n = instr->data;

      foreach_ssa_src_n (src, i, instr) {
         /* don't consider dependencies in other blocks: */
         if (src->block != instr->block)
            continue;

         /* we can end up with unused false-deps.. just skip them: */
         if (src->flags & IR3_INSTR_UNUSED)
            continue;

         struct ir3_postsched_node *sn = src->data;

         dag_add_edge_max_data(&sn->dag, &n->dag, 0);
      }

      if (is_input(instr)) {
         util_dynarray_append(&inputs, struct ir3_instruction *, instr);
      } else if (is_kill_or_demote(instr)) {
         util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
            struct ir3_instruction *input = *instrp;
            struct ir3_postsched_node *in = input->data;
            dag_add_edge_max_data(&in->dag, &n->dag, 0);
         }
         util_dynarray_append(&kills, struct ir3_instruction *, instr);
      } else if (is_tex(instr) || is_mem(instr)) {
         util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
            struct ir3_instruction *kill = *instrp;
            struct ir3_postsched_node *kn = kill->data;
            dag_add_edge_max_data(&kn->dag, &n->dag, 0);
         }
      }
   }

   // TODO do we want to do this after reverse-dependencies?
   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
   ralloc_free(ctx->mem_ctx);
   ctx->mem_ctx = NULL;
   ctx->dag = NULL;
}

static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->sy_delay = 0;
   ctx->ss_delay = 0;

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting).
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
      case OPC_B:
      case OPC_JUMP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts to come
    * before any other instructions, but in particular we
    * want inputs to come before prefetches.  This is because
    * a FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (though it can be overwritten by a tex prefetch).
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr = choose_instr(ctx);

      unsigned delay = node_delay(ctx, instr->data);
      d("delay=%u", delay);

      assert(delay <= 6);

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);
}

static bool
is_self_mov(struct ir3_instruction *instr)
{
   if (!is_same_type_mov(instr))
      return false;

   if (instr->dsts[0]->num != instr->srcs[0]->num)
      return false;

   if (instr->dsts[0]->flags & IR3_REG_RELATIV)
      return false;

   if (instr->cat1.round != ROUND_ZERO)
      return false;

   if (instr->srcs[0]->flags &
       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG |
        IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
      return false;

   return true;
}

/* Sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y,
 * as a result of places where before RA we were not sure that it was
 * safe to eliminate them.  We could eliminate these earlier, but sometimes
 * they are tangled up in false-dep's, etc, so it is easier just to
 * let them exist until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->deps_count; i++) {
            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
               instr->deps[i] = NULL;
            }
         }

         if (is_self_mov(instr))
            list_delinit(&instr->node);
      }
   }
}

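/* Pass entry point: drop leftover self-movs, then schedule each block in
 * turn.  Currently this always reports progress (returns true).
 */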
bool
ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
   struct ir3_postsched_ctx ctx = {
      .ir = ir,
      .v = v,
   };

   cleanup_self_movs(ir);

   foreach_block (block, &ir->block_list) {
      sched_block(&ctx, block);
   }

   return true;
}