/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014-2015 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file vc4_qir_schedule.c
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies from the bottom up, and make a list of the DAG
 * heads.  Heuristically pick a DAG head and schedule (remove) it, then put
 * all the parents that are now DAG heads into the list of things to
 * schedule.
 *
 * The goal of scheduling here, before register allocation and conversion to
 * QPU instructions, is to reduce register pressure by reordering instructions
 * to consume values when possible.
 */

#include "vc4_qir.h"
#include "util/dag.h"

static bool debug;

struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        struct qinst *inst;

        /* Length of the longest (latency) chain from a DAG head to this
         * instruction.
         */
        uint32_t delay;

        /* Longest time + latency_between(parent, this) of any parent of this
         * node.
         */
        uint32_t unblocked_time;
};

struct schedule_state {
        struct dag *dag;

        uint32_t time;

        /* Number of not-yet-scheduled writes remaining for each temp. */
        uint32_t *temp_writes;

        /* Temps read by an already-scheduled instruction that still have
         * unscheduled writes, i.e. temps live at the current (bottom-up)
         * scheduling point.
         */
        BITSET_WORD *temp_live;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

/**
 * Marks a dependency between two instructions: \p after must appear after
 * \p before.
 *
 * Our dependencies are tracked as a DAG.  Since we're scheduling bottom-up,
 * the latest instructions with nothing left to schedule are the DAG heads,
 * and their inputs are their children.
 */
static void
add_dep(enum direction dir,
        struct schedule_node *before,
        struct schedule_node *after)
{
        if (!before || !after)
                return;

        assert(before != after);

        if (dir == R) {
                struct schedule_node *t = before;
                before = after;
                after = t;
        }

        dag_add_edge(&after->dag, &before->dag, 0);
}

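/**
 * Like add_dep(), but also records \p after as the new "last writer" so that
 * later accesses tracked through the same pointer get serialized against it.
 */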
static void
add_write_dep(enum direction dir,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(dir, *before, after);
        *before = after;
}

struct schedule_setup_state {
        struct schedule_node **last_temp_write;
        struct schedule_node *last_sf;
        struct schedule_node *last_vary_read;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_vpm_write;
        struct schedule_node *last_tex_coord;
        struct schedule_node *last_tex_result;
        struct schedule_node *last_tlb;
        struct schedule_node *last_uniforms_reset;
        enum direction dir;

        /**
         * Texture FIFO tracking.  This is done top-to-bottom, and is used to
         * track the QOP_TEX_RESULTs and add dependencies on previous ones
         * when trying to submit texture coords with TFREQ full or new texture
         * fetches with TFRCV full.
         */
        struct {
                struct schedule_node *node;
                int coords;
        } tex_fifo[8];
        int tfreq_count; /**< Number of texture coords outstanding. */
        int tfrcv_count; /**< Number of texture results outstanding. */
        int tex_fifo_pos;
};

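/**
 * Adds a dependency on the oldest outstanding QOP_TEX_RESULT and pops its
 * entry from the texture FIFO tracking, freeing the TFREQ/TFRCV slots it was
 * occupying.
 */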
static void
block_until_tex_result(struct schedule_setup_state *state, struct schedule_node *n)
{
        add_dep(state->dir, state->tex_fifo[0].node, n);

        state->tfreq_count -= state->tex_fifo[0].coords;
        state->tfrcv_count--;

        memmove(&state->tex_fifo[0],
                &state->tex_fifo[1],
                state->tex_fifo_pos * sizeof(state->tex_fifo[0]));
        state->tex_fifo_pos--;
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all VPM reads have to happen in order."
 */
static void
calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
{
        struct qinst *inst = n->inst;
        enum direction dir = state->dir;

        /* Add deps for temp register and varying accesses.  Note that we
         * ignore uniform accesses, because qir_reorder_uniforms() happens
         * after this.
         */
        for (int i = 0; i < qir_get_nsrc(inst); i++) {
                switch (inst->src[i].file) {
                case QFILE_TEMP:
                        add_dep(dir,
                                state->last_temp_write[inst->src[i].index], n);
                        break;

                case QFILE_VARY:
                        add_write_dep(dir, &state->last_vary_read, n);
                        break;

                case QFILE_VPM:
                        add_write_dep(dir, &state->last_vpm_read, n);
                        break;

                default:
                        break;
                }
        }

        switch (inst->op) {
        case QOP_VARY_ADD_C:
                add_dep(dir, state->last_vary_read, n);
                break;

        case QOP_TEX_RESULT:
                /* Results have to be fetched in order. */
                add_write_dep(dir, &state->last_tex_result, n);
                break;

        case QOP_THRSW:
                /* After a new THRSW, one must collect all texture samples
                 * queued since the previous THRSW/program start.  For now, we
                 * have one THRSW in between each texture setup and its
                 * results collection as our input, and we just make sure that
                 * that ordering is maintained.
                 */
                add_write_dep(dir, &state->last_tex_coord, n);
                add_write_dep(dir, &state->last_tex_result, n);

                /* Accumulators and flags are lost across thread switches. */
                add_write_dep(dir, &state->last_sf, n);

                /* Setup, like the varyings, will need to be drained before we
                 * thread switch.
                 */
                add_write_dep(dir, &state->last_vary_read, n);

                /* The TLB-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(dir, &state->last_tlb, n);
                break;

        case QOP_TLB_COLOR_READ:
        case QOP_MS_MASK:
                add_write_dep(dir, &state->last_tlb, n);
                break;

        default:
                break;
        }

        switch (inst->dst.file) {
        case QFILE_VPM:
                add_write_dep(dir, &state->last_vpm_write, n);
                break;

        case QFILE_TEMP:
                add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
                break;

        case QFILE_TLB_COLOR_WRITE:
        case QFILE_TLB_COLOR_WRITE_MS:
        case QFILE_TLB_Z_WRITE:
        case QFILE_TLB_STENCIL_SETUP:
                add_write_dep(dir, &state->last_tlb, n);
                break;

        case QFILE_TEX_S_DIRECT:
        case QFILE_TEX_S:
        case QFILE_TEX_T:
        case QFILE_TEX_R:
        case QFILE_TEX_B:
                /* Texturing setup gets scheduled in order, because the
                 * uniforms it references have to land in a specific order.
                 */
                add_write_dep(dir, &state->last_tex_coord, n);
                break;

        default:
                break;
        }

        if (qir_depends_on_flags(inst))
                add_dep(dir, state->last_sf, n);

        if (inst->sf)
                add_write_dep(dir, &state->last_sf, n);
}

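/**
 * Walks the block top-to-bottom, adding the common dependencies plus the
 * forward-only ones: uniform reads ordered against QOP_UNIFORMS_RESET, and
 * the texture FIFO capacity limits described in the VC4 spec quotes below.
 */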
static void
calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
                       struct list_head *schedule_list)
{
        struct schedule_setup_state state;

        memset(&state, 0, sizeof(state));
        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
                                              c->num_temps);
        state.dir = F;

        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                struct qinst *inst = n->inst;

                calculate_deps(&state, n);

                for (int i = 0; i < qir_get_nsrc(inst); i++) {
                        switch (inst->src[i].file) {
                        case QFILE_UNIF:
                                add_dep(state.dir, state.last_uniforms_reset, n);
                                break;
                        default:
                                break;
                        }
                }

                switch (inst->dst.file) {
                case QFILE_TEX_S_DIRECT:
                case QFILE_TEX_S:
                case QFILE_TEX_T:
                case QFILE_TEX_R:
                case QFILE_TEX_B:
                        /* From the VC4 spec:
                         *
                         *     "The TFREQ input FIFO holds two full lots of s,
                         *      t, r, b data, plus associated setup data, per
                         *      QPU, that is, there are eight data slots. For
                         *      each texture request, slots are only consumed
                         *      for the components of s, t, r, and b actually
                         *      written. Thus the FIFO can hold four requests
                         *      of just (s, t) data, or eight requests of just
                         *      s data (for direct addressed data lookups).
                         *
                         *      Note that there is one FIFO per QPU, and the
                         *      FIFO has no concept of threads - that is,
                         *      multi-threaded shaders must be careful to use
                         *      only 1/2 the FIFO depth before reading
                         *      back. Multi-threaded programs must also
                         *      therefore always thread switch on texture
                         *      fetch as the other thread may have data
                         *      waiting in the FIFO."
                         *
                         * If the texture coordinate fifo is full, block this
                         * on the last QOP_TEX_RESULT.
                         */
                        if (state.tfreq_count == (c->fs_threaded ? 4 : 8)) {
                                block_until_tex_result(&state, n);
                        }

                        /* From the VC4 spec:
                         *
                         *     "Since the maximum number of texture requests
                         *      in the input (TFREQ) FIFO is four lots of (s,
                         *      t) data, the output (TFRCV) FIFO is sized to
                         *      holds four lots of max-size color data per
                         *      QPU. For non-float color, reads are packed
                         *      RGBA8888 data (one read per pixel). For 16-bit
                         *      float color, two reads are necessary per
                         *      pixel, with reads packed as RG1616 then
                         *      BA1616. So per QPU there are eight color slots
                         *      in the TFRCV FIFO."
                         *
                         * If the texture result fifo is full, block adding
                         * any more to it until the last QOP_TEX_RESULT.
                         */
                        if (inst->dst.file == QFILE_TEX_S ||
                            inst->dst.file == QFILE_TEX_S_DIRECT) {
                                if (state.tfrcv_count ==
                                    (c->fs_threaded ? 2 : 4))
                                        block_until_tex_result(&state, n);
                                state.tfrcv_count++;
                        }

                        state.tex_fifo[state.tex_fifo_pos].coords++;
                        state.tfreq_count++;
                        break;

                default:
                        break;
                }

                switch (inst->op) {
                case QOP_TEX_RESULT:
                        /* Results have to be fetched after the
                         * coordinate setup.  Note that we're assuming
                         * here that our input shader has the texture
                         * coord setup and result fetch in order,
                         * which is true initially but not of our
                         * instruction stream after this pass.
                         */
                        add_dep(state.dir, state.last_tex_coord, n);

                        state.tex_fifo[state.tex_fifo_pos].node = n;

                        state.tex_fifo_pos++;
                        memset(&state.tex_fifo[state.tex_fifo_pos], 0,
                               sizeof(state.tex_fifo[0]));
                        break;

                case QOP_UNIFORMS_RESET:
                        add_write_dep(state.dir, &state.last_uniforms_reset, n);
                        break;

                default:
                        break;
                }
        }
}

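/**
 * Walks the block bottom-to-top so that the common dependencies are also
 * added in the reverse direction, e.g. ordering a read of a temp or VPM
 * before the next write to it.
 */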
static void
calculate_reverse_deps(struct vc4_compile *c, void *mem_ctx,
                       struct list_head *schedule_list)
{
        struct schedule_setup_state state;

        memset(&state, 0, sizeof(state));
        state.dir = R;
        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
                                              c->num_temps);

        list_for_each_entry_rev(struct schedule_node, n, schedule_list, link) {
                calculate_deps(&state, n);
        }
}

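/**
 * Estimates how scheduling \p inst next (bottom-up) would change the number
 * of live temps: -1 if it is the last remaining write of its destination,
 * +1 for each distinct temp source that is not yet live.
 */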
static int
get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
{
        int cost = 0;

        if (inst->dst.file == QFILE_TEMP &&
            state->temp_writes[inst->dst.index] == 1)
                cost--;

        for (int i = 0; i < qir_get_nsrc(inst); i++) {
                if (inst->src[i].file != QFILE_TEMP ||
                    BITSET_TEST(state->temp_live, inst->src[i].index)) {
                        continue;
                }

                bool already_counted = false;
                for (int j = 0; j < i; j++) {
                        if (inst->src[i].file == inst->src[j].file &&
                            inst->src[i].index == inst->src[j].index) {
                                already_counted = true;
                        }
                }
                if (!already_counted)
                        cost++;
        }

        return cost;
}

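/**
 * Returns true for the TLB color and Z accesses, which implicitly take the
 * per-fragment scoreboard lock.
 */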
static bool
locks_scoreboard(struct qinst *inst)
{
        if (inst->op == QOP_TLB_COLOR_READ)
                return true;

        switch (inst->dst.file) {
        case QFILE_TLB_Z_WRITE:
        case QFILE_TLB_COLOR_WRITE:
        case QFILE_TLB_COLOR_WRITE_MS:
                return true;
        default:
                return false;
        }
}

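/**
 * Picks the next DAG head to schedule.  In decreasing priority: the block's
 * branch, scoreboard-locking instructions, instructions that stall less,
 * instructions that reduce register pressure, and the deepest delay chain.
 */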
static struct schedule_node *
choose_instruction(struct schedule_state *state)
{
        struct schedule_node *chosen = NULL;

        list_for_each_entry(struct schedule_node, n, &state->dag->heads,
                            dag.link) {
                /* The branches aren't being tracked as dependencies.  Make
                 * sure that they stay scheduled as the last instruction of
                 * the block, which is to say the first one we choose to
                 * schedule.
                 */
                if (n->inst->op == QOP_BRANCH)
                        return n;

                if (!chosen) {
                        chosen = n;
                        continue;
                }

                /* Prefer scheduling things that lock the scoreboard, so that
                 * they appear late in the program and we get more parallelism
                 * between shaders on multiple QPUs hitting the same fragment.
                 */
                if (locks_scoreboard(n->inst) &&
                    !locks_scoreboard(chosen->inst)) {
                        chosen = n;
                        continue;
                } else if (!locks_scoreboard(n->inst) &&
                           locks_scoreboard(chosen->inst)) {
                        continue;
                }

                /* If we would block on the previously chosen node, but would
                 * block less on this one, then prefer it.
                 */
                if (chosen->unblocked_time > state->time &&
                    n->unblocked_time < chosen->unblocked_time) {
                        chosen = n;
                        continue;
                } else if (n->unblocked_time > state->time &&
                           n->unblocked_time > chosen->unblocked_time) {
                        continue;
                }

                /* If we can definitely reduce register pressure, do so
                 * immediately.
                 */
                int register_pressure_cost =
                        get_register_pressure_cost(state, n->inst);
                int chosen_register_pressure_cost =
                        get_register_pressure_cost(state, chosen->inst);

                if (register_pressure_cost < chosen_register_pressure_cost) {
                        chosen = n;
                        continue;
                } else if (register_pressure_cost >
                           chosen_register_pressure_cost) {
                        continue;
                }

                /* Otherwise, prefer instructions with the deepest chain to
                 * the end of the program.  This avoids the problem of
                 * "everything generates a temp, nothing finishes freeing one,
                 * guess I'll just keep emitting varying mul/adds".
                 */
                if (n->delay > chosen->delay) {
                        chosen = n;
                        continue;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

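/** Debug dump of the current DAG heads, their costs, and their children. */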
static void
dump_state(struct vc4_compile *c, struct schedule_state *state)
{
        uint32_t i = 0;
        list_for_each_entry(struct schedule_node, n, &state->dag->heads,
                            dag.link) {
                fprintf(stderr, "%3d: ", i++);
                qir_dump_inst(c, n->inst);
                fprintf(stderr, " (%d cost)\n",
                        get_register_pressure_cost(state, n->inst));

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        fprintf(stderr, "   - ");
                        qir_dump_inst(c, child->inst);
                        fprintf(stderr, " (%d parents)\n",
                                child->dag.parent_count);
                }
        }
}

/* Estimate of how many instructions we should schedule between operations.
 *
 * These aren't in real cycle counts, because we're just estimating cycle
 * times anyway.  QIR instructions will get paired up when turned into QPU
 * instructions, or extra NOP delays will have to be added due to register
 * allocation choices.
 */
static uint32_t
latency_between(struct schedule_node *before, struct schedule_node *after)
{
        if ((before->inst->dst.file == QFILE_TEX_S ||
             before->inst->dst.file == QFILE_TEX_S_DIRECT) &&
            after->inst->op == QOP_TEX_RESULT)
                return 100;

        switch (before->inst->op) {
        case QOP_RCP:
        case QOP_RSQ:
        case QOP_EXP2:
        case QOP_LOG2:
                for (int i = 0; i < qir_get_nsrc(after->inst); i++) {
                        if (after->inst->src[i].file ==
                            before->inst->dst.file &&
                            after->inst->src[i].index ==
                            before->inst->dst.index) {
                                /* There are two QPU delay slots before we can
                                 * read a math result, which could be up to 4
                                 * QIR instructions if they packed well.
                                 */
                                return 4;
                        }
                }
                break;
        default:
                break;
        }

        return 1;
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct dag_node *node, void *state)
{
        struct schedule_node *n = (struct schedule_node *)node;

        /* The color read needs to be scheduled late, to avoid locking the
         * scoreboard early.  This is our best tool for encouraging that.  The
         * other scoreboard locking ops will have this happen by default,
         * since they are generally the DAG heads or close to them.
         */
        if (n->inst->op == QOP_TLB_COLOR_READ)
                n->delay = 1000;
        else
                n->delay = 1;

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                n->delay = MAX2(n->delay, (child->delay +
                                           latency_between(child, n)));
        }
}

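/**
 * Main scheduling loop: repeatedly picks a DAG head, prepends it to the
 * block (we emit bottom-up), updates its children's unblocked times, and
 * tracks register pressure.
 */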
static void
schedule_instructions(struct vc4_compile *c,
                      struct qblock *block, struct schedule_state *state)
{
        if (debug) {
                fprintf(stderr, "initial deps:\n");
                dump_state(c, state);
        }

        state->time = 0;
        while (!list_is_empty(&state->dag->heads)) {
                struct schedule_node *chosen = choose_instruction(state);
                struct qinst *inst = chosen->inst;

                if (debug) {
                        fprintf(stderr, "current list:\n");
                        dump_state(c, state);
                        fprintf(stderr, "chose: ");
                        qir_dump_inst(c, inst);
                        fprintf(stderr, " (%d cost)\n",
                                get_register_pressure_cost(state, inst));
                }

                state->time = MAX2(state->time, chosen->unblocked_time);

                /* Schedule this instruction back onto the QIR list. */
                list_add(&inst->link, &block->instructions);

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled.  Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                util_dynarray_foreach(&chosen->dag.edges,
                                      struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;

                        child->unblocked_time = MAX2(child->unblocked_time,
                                                     state->time +
                                                     latency_between(child,
                                                                     chosen));
                }
                dag_prune_head(state->dag, &chosen->dag);

                /* Update our tracking of register pressure. */
                for (int i = 0; i < qir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_TEMP)
                                BITSET_SET(state->temp_live, inst->src[i].index);
                }
                if (inst->dst.file == QFILE_TEMP) {
                        state->temp_writes[inst->dst.index]--;
                        if (state->temp_writes[inst->dst.index] == 0)
                                BITSET_CLEAR(state->temp_live, inst->dst.index);
                }

                state->time++;
        }
}

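/** Builds the dependency DAG for a block's instructions and schedules them. */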
static void
qir_schedule_instructions_block(struct vc4_compile *c,
                                struct qblock *block)
{
        struct schedule_state *state = rzalloc(NULL, struct schedule_state);

        state->temp_writes = rzalloc_array(state, uint32_t, c->num_temps);
        state->temp_live = rzalloc_array(state, BITSET_WORD,
                                         BITSET_WORDS(c->num_temps));
        state->dag = dag_create(state);

        struct list_head setup_list;
        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure. */
        qir_for_each_inst_safe(inst, block) {
                struct schedule_node *n = rzalloc(state, struct schedule_node);

                n->inst = inst;
                list_del(&inst->link);
                list_addtail(&n->link, &setup_list);
                dag_init_node(state->dag, &n->dag);

                if (inst->dst.file == QFILE_TEMP)
                        state->temp_writes[inst->dst.index]++;
        }

        /* Dependencies tracked top-to-bottom. */
        calculate_forward_deps(c, state, &setup_list);
        /* Dependencies tracked bottom-to-top. */
        calculate_reverse_deps(c, state, &setup_list);

        dag_traverse_bottom_up(state->dag, compute_delay, NULL);

        schedule_instructions(c, block, state);

        ralloc_free(state);
}

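/** Pass entry point: list-schedules each basic block independently. */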
void
qir_schedule_instructions(struct vc4_compile *c)
{
        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                qir_dump(c);
        }

        qir_for_each_block(block, c)
                qir_schedule_instructions_block(c, block);

        if (debug) {
                fprintf(stderr, "Post-schedule instructions\n");
                qir_dump(c);
        }
}