/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...)                                                            \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         mesa_logi("PSCHED: " fmt, ##__VA_ARGS__);                             \
      }                                                                        \
   } while (0)

#define di(instr, fmt, ...)                                                    \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         struct log_stream *stream = mesa_log_streami();                       \
         mesa_log_stream_printf(stream, "PSCHED: " fmt ": ", ##__VA_ARGS__);   \
         ir3_print_instr_stream(stream, instr);                                \
         mesa_log_stream_destroy(stream);                                      \
      }                                                                        \
   } while (0)

/*
 * Post RA Instruction Scheduling
 */

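/*
 * This pass runs after register allocation: it reorders the instructions
 * within each basic block to hide (ss)/(sy) synchronization latencies and
 * to minimize the nop's needed for ALU latency, while preserving the
 * read/write ordering of the now-fixed physical registers.
 */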
struct ir3_postsched_ctx {
   struct ir3 *ir;

   struct ir3_shader_variant *v;

   void *mem_ctx;
   struct ir3_block *block; /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */

   unsigned ip;

   int ss_delay;
   int sy_delay;
};

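/* Per-instruction scheduling state.  'delay' is the worst-case (soft) delay
 * this instruction needs after its producers, 'max_delay' is the length of
 * the longest delay chain from here to the end of the block (used as a
 * critical-path priority), and 'earliest_ip' is the earliest ip at which
 * this instruction could issue without extra nop's, given what has already
 * been scheduled.
 */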
struct ir3_postsched_node {
   struct dag_node dag; /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

   unsigned earliest_ip;

   bool has_sy_src, has_ss_src;

   unsigned delay;
   unsigned max_delay;
};

#define foreach_sched_node(__n, __list)                                        \
   list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)

static bool
has_sy_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_sy_src;
}

static bool
has_ss_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_ss_src;
}

static void
sched_dag_validate_cb(const struct dag_node *node, void *data)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;

   ir3_print_instr(n->instr);
}

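/* Mark 'instr' as scheduled: remove it from the unscheduled list, append it
 * to the block, advance the scheduler's ip, propagate earliest_ip to the
 * DAG children, and update the running (ss)/(sy) delay estimates.
 */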
static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   assert(ctx->block == instr->block);

   /* remove from unscheduled_list:
    */
   list_delinit(&instr->node);

   di(instr, "schedule");

   bool counts_for_delay = is_alu(instr) || is_flow(instr);

   unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0;

   struct ir3_postsched_node *n = instr->data;

   /* We insert any nop's needed to get to earliest_ip, then advance
    * delay_cycles by scheduling the instruction.
    */
   ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      unsigned delay = (unsigned)(uintptr_t)edge->data;
      struct ir3_postsched_node *child =
         container_of(edge->child, struct ir3_postsched_node, dag);
      child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay);
   }

   list_addtail(&instr->node, &instr->block->instr_list);

   dag_prune_head(ctx->dag, &n->dag);

   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
      return;

   if (is_ss_producer(instr)) {
      ctx->ss_delay = soft_ss_delay(instr);
   } else if (has_ss_src(instr)) {
      ctx->ss_delay = 0;
   } else if (ctx->ss_delay > 0) {
      ctx->ss_delay--;
   }

   if (is_sy_producer(instr)) {
      ctx->sy_delay = soft_sy_delay(instr, ctx->block->shader);
   } else if (has_sy_src(instr)) {
      ctx->sy_delay = 0;
   } else if (ctx->sy_delay > 0) {
      ctx->sy_delay--;
   }
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      di(n->instr, "maxdel=%3d    ", n->max_delay);

      util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
         struct ir3_postsched_node *child =
            (struct ir3_postsched_node *)edge->child;

         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
      }
   }
}

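/* node_delay() is the number of nop's that would be needed if 'n' were
 * issued right now (ie. how far its earliest_ip is past the current ip).
 * node_delay_soft() additionally accounts for the estimated (ss)/(sy) wait
 * that the node's sources would incur.
 */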
static unsigned
node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   return MAX2(n->earliest_ip, ctx->ip) - ctx->ip;
}

static unsigned
node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   unsigned delay = node_delay(ctx, n);

   /* This takes into account that when we schedule multiple tex or sfu
    * instructions, the first user has to wait for all of them to complete.
    */
   if (n->has_ss_src)
      delay = MAX2(delay, ctx->ss_delay);
   if (n->has_sy_src)
      delay = MAX2(delay, ctx->sy_delay);

   return delay;
}

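/* Instruction selection is a fixed priority cascade: schedulable meta
 * instructions first, then inputs (so the last bary.f can release varying
 * storage early), then kills that are ready without nop's, then (ss)/(sy)
 * producers that are ready including their soft delay, then the candidate
 * needing the fewest nop's, and finally whichever remaining leader has the
 * largest max_delay.
 */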
/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay(ctx, n);

      if (d > 0)
         continue;

      if (!is_kill_or_demote(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 0)
         continue;

      if (!(is_ss_producer(n->instr) || is_sy_producer(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch which can be synchronized w/ the sync
    * bit, but where we probably do want to schedule some other instructions
    * while we wait).  We also allow a small number of nops, to prefer
    * now-nops over future-nops up to a point, as that gives better results.
    */
   unsigned chosen_delay = 0;
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 3)
         continue;

      if (!chosen || d < chosen_delay) {
         chosen = n;
         chosen_delay = d;
         continue;
      }

      if (d > chosen_delay)
         continue;

      if (chosen->max_delay < n->max_delay) {
         chosen = n;
         chosen_delay = d;
      }
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader that can be scheduled without nop's,
    * which in the case of things that need (sy)/(ss) could result in
    * stalls.. but we've already decided there is not a better option.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay(ctx, n);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (hard ready)");
      return chosen->instr;
   }

   /* Otherwise choose leader with maximum cost:
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}

struct ir3_postsched_deps_state {
   struct ir3_postsched_ctx *ctx;

   enum { F, R } direction;

   bool merged;

   /* Track the sched node (instruction) that last wrote a given register
    * (in whichever direction we are iterating the block).
    *
    * Note, this table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether
    * the half and full precision register files are "merged" (conflict,
    * ie. a6xx+), in which case we consider each full precision dep
    * as two half-precision dependencies, vs older separate (non-
    * conflicting) register files, in which case the first half of the
    * table is used for full precision and the 2nd half for half-precision.
    */
   struct ir3_postsched_node *regs[2 * 256];
   unsigned dst_n[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx)                                                    \
   *({                                                                         \
      assert((idx) < ARRAY_SIZE((state)->regs));                               \
      &(state)->regs[(idx)];                                                   \
   })

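/* Add an ordering edge between two nodes.  In the forward pass 'before' is
 * the producer and the edge carries the required delay in cycles; in the
 * reverse pass the edge is flipped and carries no delay, which turns it
 * into an anti-dependency (the read/write must issue before the following
 * write to the same register).
 */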
static void
add_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *before, struct ir3_postsched_node *after,
        unsigned d)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->direction == F) {
      dag_add_edge_max_data(&before->dag, &after->dag, (uintptr_t)d);
   } else {
      dag_add_edge_max_data(&after->dag, &before->dag, 0);
   }
}

static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
                   struct ir3_postsched_node *node, unsigned num, int src_n,
                   int dst_n)
{
   struct ir3_postsched_node *dep = dep_reg(state, num);

   unsigned d = 0;
   if (src_n >= 0 && dep && state->direction == F) {
      /* get the dst_n this corresponds to */
      unsigned dst_n = state->dst_n[num];
      unsigned d_soft = ir3_delayslots(dep->instr, node->instr, src_n, true);
      d = ir3_delayslots_with_repeat(dep->instr, node->instr, dst_n, src_n);
      node->delay = MAX2(node->delay, d_soft);
      if (is_sy_producer(dep->instr))
         node->has_sy_src = true;
      if (is_ss_producer(dep->instr))
         node->has_ss_src = true;
   }

   add_dep(state, dep, node, d);
   if (src_n < 0) {
      dep_reg(state, num) = node;
      state->dst_n[num] = dst_n;
   }
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 *
 * If src_n is non-negative, then this adds a dependency on a source register,
 * and src_n is the index passed into ir3_delayslots() for calculating the
 * delay: it corresponds to node->instr->srcs[src_n]. If src_n is negative,
 * then this is for the destination register corresponding to dst_n.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
            struct ir3_postsched_node *node, const struct ir3_register *reg,
            unsigned num, int src_n, int dst_n)
{
   if (state->merged) {
      /* Make sure that special registers like a0.x that are written as
       * half-registers don't alias random full registers by pretending that
       * they're full registers:
       */
      if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) {
         /* single conflict in half-reg space: */
         add_single_reg_dep(state, node, num, src_n, dst_n);
      } else {
         /* two conflicts in half-reg space: */
         add_single_reg_dep(state, node, 2 * num + 0, src_n, dst_n);
         add_single_reg_dep(state, node, 2 * num + 1, src_n, dst_n);
      }
   } else {
      if (reg->flags & IR3_REG_HALF)
         num += ARRAY_SIZE(state->regs) / 2;
      add_single_reg_dep(state, node, num, src_n, dst_n);
   }
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
               struct ir3_postsched_node *node)
{
   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, i, -1);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, i, -1);
         }
      }
   }

   /* And then update the state for what this instruction wrote:
    */
   foreach_dst_n (reg, i, node->instr) {
      if (reg->wrmask == 0)
         continue;
      if (reg->flags & IR3_REG_RELATIV) {
         /* mark the entire array as written: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, -1, i);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, -1, i);
         }
      }
   }
}

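/* Dependencies are computed in two passes over the block: the forward pass
 * adds producer -> consumer edges (RAW and WAW) annotated with the required
 * delay, and the reverse pass adds zero-delay edges ordering each read
 * before the following write of the same register (WAR).
 */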
static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = F,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = R,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr_rev (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_postsched_node *n =
      rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}

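/* Called in a bottom-up DAG traversal: a node's max_delay becomes its own
 * source delay plus the largest max_delay among its children, ie. the
 * worst-case delay chain below it.  choose_instr() uses this as a
 * critical-path priority.
 */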
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
   ctx->mem_ctx = ralloc_context(NULL);

   ctx->dag = dag_create(ctx->mem_ctx);

   foreach_instr (instr, &ctx->unscheduled_list)
      sched_node_init(ctx, instr);

   calculate_forward_deps(ctx);
   calculate_reverse_deps(ctx);

   /*
    * To prevent expensive texture fetches, etc, from being moved ahead
    * of kills, track the kills we've seen so far, so we can add an
    * extra dependency on them for tex/mem instructions
    */
   struct util_dynarray kills;
   util_dynarray_init(&kills, ctx->mem_ctx);

   /* The last bary.f with the (ei) flag must be scheduled before any kills,
    * or the hw gets angry. Keep track of inputs here so we can add the
    * false dep on the kill instruction.
    */
   struct util_dynarray inputs;
   util_dynarray_init(&inputs, ctx->mem_ctx);

   /*
    * Normal srcs won't be in SSA at this point, those are dealt with in
    * calculate_forward_deps() and calculate_reverse_deps().  But we still
    * have the false-dep information in SSA form, so go ahead and add
    * dependencies for that here:
    */
   foreach_instr (instr, &ctx->unscheduled_list) {
      struct ir3_postsched_node *n = instr->data;

      foreach_ssa_src_n (src, i, instr) {
         /* don't consider dependencies in other blocks: */
         if (src->block != instr->block)
            continue;

         /* we can end up with unused false-deps.. just skip them: */
         if (src->flags & IR3_INSTR_UNUSED)
            continue;

         struct ir3_postsched_node *sn = src->data;

         dag_add_edge_max_data(&sn->dag, &n->dag, 0);
      }

      if (is_input(instr)) {
         util_dynarray_append(&inputs, struct ir3_instruction *, instr);
      } else if (is_kill_or_demote(instr)) {
         util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
            struct ir3_instruction *input = *instrp;
            struct ir3_postsched_node *in = input->data;
            dag_add_edge_max_data(&in->dag, &n->dag, 0);
         }
         util_dynarray_append(&kills, struct ir3_instruction *, instr);
      } else if (is_tex(instr) || is_mem(instr)) {
         util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
            struct ir3_instruction *kill = *instrp;
            struct ir3_postsched_node *kn = kill->data;
            dag_add_edge_max_data(&kn->dag, &n->dag, 0);
         }
      }
   }

   dag_validate(ctx->dag, sched_dag_validate_cb, NULL);

   // TODO do we want to do this after reverse-dependencies?
   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
   ralloc_free(ctx->mem_ctx);
   ctx->mem_ctx = NULL;
   ctx->dag = NULL;
}

static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->sy_delay = 0;
   ctx->ss_delay = 0;

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting).
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
      case OPC_B:
      case OPC_JUMP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts to go
    * before any other instructions.  But in particular we
    * want inputs to come before prefetches.  This is because
    * a FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (but can be overwritten by a tex prefetch)
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr = choose_instr(ctx);

      unsigned delay = node_delay(ctx, instr->data);
      d("delay=%u", delay);

      assert(delay <= 6);

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);
}

static bool
is_self_mov(struct ir3_instruction *instr)
{
   if (!is_same_type_mov(instr))
      return false;

   if (instr->dsts[0]->num != instr->srcs[0]->num)
      return false;

   if (instr->dsts[0]->flags & IR3_REG_RELATIV)
      return false;

   if (instr->cat1.round != ROUND_ZERO)
      return false;

   if (instr->srcs[0]->flags &
       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG |
        IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
      return false;

   return true;
}

/* Sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y
 * as a result of places where before RA we are not sure that it is
 * safe to eliminate.  We could eliminate these earlier, but sometimes
 * they are tangled up in false-dep's, etc, so it is easier just to
 * let them exist until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->deps_count; i++) {
            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
               instr->deps[i] = NULL;
            }
         }

         if (is_self_mov(instr))
            list_delinit(&instr->node);
      }
   }
}

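/* Entry point: clean up leftover self-movs, then schedule each block of
 * the shader independently.
 */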
bool
ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
   struct ir3_postsched_ctx ctx = {
      .ir = ir,
      .v = v,
   };

   cleanup_self_movs(ir);

   foreach_block (block, &ir->block_list) {
      sched_block(&ctx, block);
   }

   return true;
}