1 /*
2  * Copyright © 2010 Intel Corporation
3  * Copyright © 2014 Broadcom
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 /**
26  * @file vc4_qpu_schedule.c
27  *
28  * The basic model of the list scheduler is to take a basic block, compute a
29  * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
30  * pick a DAG head, then put all the children that are now DAG heads into the
31  * list of things to schedule.
32  *
33  * The goal of scheduling here is to pack pairs of operations together in a
34  * single QPU instruction.
35  */
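/* For example, the QPU issues one add-ALU op and one mul-ALU op per
 * instruction word, so two independent operations along the lines of
 * (illustrative mnemonics only):
 *
 *     fadd  ra0, ra1, rb2
 *     fmul  rb3, ra4, r2
 *
 * can be packed into a single instruction as long as their register file,
 * signal, and condition usage doesn't conflict (see qpu_merge_inst()).
 */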
36 
37 #include "vc4_qir.h"
38 #include "vc4_qpu.h"
39 #include "util/ralloc.h"
40 #include "util/dag.h"
41 
42 static bool debug;
43 
44 struct schedule_node_child;
45 
46 struct schedule_node {
47         struct dag_node dag;
48         struct list_head link;
49         struct queued_qpu_inst *inst;
50 
51         /* Latest (schedule time + instruction_latency()) of any parent of this node. */
52         uint32_t unblocked_time;
53 
54         /**
55          * Minimum number of cycles from scheduling this instruction until the
56          * end of the program, based on the slowest dependency chain through
57          * the children.
58          */
59         uint32_t delay;
60 
61         /**
62          * Cycles between this instruction being scheduled and when its result
63          * can be consumed.
64          */
65         uint32_t latency;
66 
67         /**
68          * Which uniform from uniform_data[] this instruction read, or -1 if
69          * not reading a uniform.
70          */
71         int uniform;
72 };
73 
74 /* When walking the instructions in reverse, we need to swap before/after in
75  * add_dep().
76  */
77 enum direction { F, R };
78 
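/* State for one dependency-calculation pass over a block.  The last_*
 * fields track the most recent instruction to have written each piece of
 * QPU state -- the accumulators r0-r5, the A and B register files, the
 * condition flags, the VPM, the TMUs, the TLB, and the uniforms stream --
 * so that calculate_deps() can add DAG edges against them.
 */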
79 struct schedule_state {
80         struct dag *dag;
81         struct schedule_node *last_r[6];
82         struct schedule_node *last_ra[32];
83         struct schedule_node *last_rb[32];
84         struct schedule_node *last_sf;
85         struct schedule_node *last_vpm_read;
86         struct schedule_node *last_tmu_write;
87         struct schedule_node *last_tlb;
88         struct schedule_node *last_vpm;
89         struct schedule_node *last_uniforms_reset;
90         enum direction dir;
91         /* Estimated cycle when the current instruction would start. */
92         uint32_t time;
93 };
94 
95 static void
96 add_dep(struct schedule_state *state,
97         struct schedule_node *before,
98         struct schedule_node *after,
99         bool write)
100 {
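        /* An edge is tagged as write-after-read when it only orders a write
         * after an earlier read of the same state.  pre_remove_head() drops
         * such edges as soon as the reading instruction is chosen, since the
         * overwriting instruction doesn't need to wait for a result, only
         * avoid clobbering the value before it is read.
         */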
101         bool write_after_read = !write && state->dir == R;
102         uintptr_t edge_data = write_after_read;
103 
104         if (!before || !after)
105                 return;
106 
107         assert(before != after);
108 
109         if (state->dir == F)
110                 dag_add_edge(&before->dag, &after->dag, edge_data);
111         else
112                 dag_add_edge(&after->dag, &before->dag, edge_data);
113 }
114 
115 static void
116 add_read_dep(struct schedule_state *state,
117               struct schedule_node *before,
118               struct schedule_node *after)
119 {
120         add_dep(state, before, after, false);
121 }
122 
123 static void
124 add_write_dep(struct schedule_state *state,
125               struct schedule_node **before,
126               struct schedule_node *after)
127 {
128         add_dep(state, *before, after, true);
129         *before = after;
130 }
131 
132 static bool
133 qpu_writes_r4(uint64_t inst)
134 {
135         uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
136 
137         switch (sig) {
138         case QPU_SIG_COLOR_LOAD:
139         case QPU_SIG_LOAD_TMU0:
140         case QPU_SIG_LOAD_TMU1:
141         case QPU_SIG_ALPHA_MASK_LOAD:
142                 return true;
143         default:
144                 return false;
145         }
146 }
147 
148 static void
149 process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
150                    uint32_t raddr, bool is_a)
151 {
152         switch (raddr) {
153         case QPU_R_VARY:
154                 add_write_dep(state, &state->last_r[5], n);
155                 break;
156 
157         case QPU_R_VPM:
158                 add_write_dep(state, &state->last_vpm_read, n);
159                 break;
160 
161         case QPU_R_UNIF:
162                 add_read_dep(state, state->last_uniforms_reset, n);
163                 break;
164 
165         case QPU_R_NOP:
166         case QPU_R_ELEM_QPU:
167         case QPU_R_XY_PIXEL_COORD:
168         case QPU_R_MS_REV_FLAGS:
169                 break;
170 
171         default:
172                 if (raddr < 32) {
173                         if (is_a)
174                                 add_read_dep(state, state->last_ra[raddr], n);
175                         else
176                                 add_read_dep(state, state->last_rb[raddr], n);
177                 } else {
178                         fprintf(stderr, "unknown raddr %d\n", raddr);
179                         abort();
180                 }
181                 break;
182         }
183 }
184 
185 static bool
186 is_tmu_write(uint32_t waddr)
187 {
188         switch (waddr) {
189         case QPU_W_TMU0_S:
190         case QPU_W_TMU0_T:
191         case QPU_W_TMU0_R:
192         case QPU_W_TMU0_B:
193         case QPU_W_TMU1_S:
194         case QPU_W_TMU1_T:
195         case QPU_W_TMU1_R:
196         case QPU_W_TMU1_B:
197                 return true;
198         default:
199                 return false;
200         }
201 }
202 
203 static bool
204 reads_uniform(uint64_t inst)
205 {
206         if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
207                 return false;
208 
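        /* Besides explicit raddr A/B uniform reads, TMU coordinate writes
         * also consume uniforms, since the texture unit fetches its sampler
         * configuration parameters from the uniform stream.
         */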
209         return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
210                 (QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF &&
211                  QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) ||
212                 is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_ADD)) ||
213                 is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
214 }
215 
216 static void
217 process_mux_deps(struct schedule_state *state, struct schedule_node *n,
218                  uint32_t mux)
219 {
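        /* Mux values other than QPU_MUX_A/QPU_MUX_B select the accumulators
         * r0-r5; register file reads are handled by process_raddr_deps()
         * instead.
         */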
220         if (mux != QPU_MUX_A && mux != QPU_MUX_B)
221                 add_read_dep(state, state->last_r[mux], n);
222 }
223 
224 
225 static void
226 process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
227                    uint32_t waddr, bool is_add)
228 {
229         uint64_t inst = n->inst->inst;
230         bool is_a = is_add ^ ((inst & QPU_WS) != 0);
231 
232         if (waddr < 32) {
233                 if (is_a) {
234                         add_write_dep(state, &state->last_ra[waddr], n);
235                 } else {
236                         add_write_dep(state, &state->last_rb[waddr], n);
237                 }
238         } else if (is_tmu_write(waddr)) {
239                 add_write_dep(state, &state->last_tmu_write, n);
240                 add_read_dep(state, state->last_uniforms_reset, n);
241         } else if (qpu_waddr_is_tlb(waddr) ||
242                    waddr == QPU_W_MS_FLAGS) {
243                 add_write_dep(state, &state->last_tlb, n);
244         } else {
245                 switch (waddr) {
246                 case QPU_W_ACC0:
247                 case QPU_W_ACC1:
248                 case QPU_W_ACC2:
249                 case QPU_W_ACC3:
250                 case QPU_W_ACC5:
251                         add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0],
252                                       n);
253                         break;
254 
255                 case QPU_W_VPM:
256                         add_write_dep(state, &state->last_vpm, n);
257                         break;
258 
259                 case QPU_W_VPMVCD_SETUP:
260                         if (is_a)
261                                 add_write_dep(state, &state->last_vpm_read, n);
262                         else
263                                 add_write_dep(state, &state->last_vpm, n);
264                         break;
265 
266                 case QPU_W_SFU_RECIP:
267                 case QPU_W_SFU_RECIPSQRT:
268                 case QPU_W_SFU_EXP:
269                 case QPU_W_SFU_LOG:
270                         add_write_dep(state, &state->last_r[4], n);
271                         break;
272 
273                 case QPU_W_TLB_STENCIL_SETUP:
274                         /* This isn't a TLB operation that does things like
275                          * implicitly lock the scoreboard, but it does have to
276                          * appear before TLB_Z, and the TLB_STENCIL writes
277                          * have to be scheduled in the same order relative
278                          * to each other.
279                          */
280                         add_write_dep(state, &state->last_tlb, n);
281                         break;
282 
283                 case QPU_W_MS_FLAGS:
284                         add_write_dep(state, &state->last_tlb, n);
285                         break;
286 
287                 case QPU_W_UNIFORMS_ADDRESS:
288                         add_write_dep(state, &state->last_uniforms_reset, n);
289                         break;
290 
291                 case QPU_W_NOP:
292                         break;
293 
294                 default:
295                         fprintf(stderr, "Unknown waddr %d\n", waddr);
296                         abort();
297                 }
298         }
299 }
300 
301 static void
302 process_cond_deps(struct schedule_state *state, struct schedule_node *n,
303                   uint32_t cond)
304 {
305         switch (cond) {
306         case QPU_COND_NEVER:
307         case QPU_COND_ALWAYS:
308                 break;
309         default:
310                 add_read_dep(state, state->last_sf, n);
311                 break;
312         }
313 }
314 
315 /**
316  * Common code for dependencies that need to be tracked both forward and
317  * backward.
318  *
319  * This is for things like "all reads of r4 have to happen between the r4
320  * writes that surround them".
321  */
322 static void
323 calculate_deps(struct schedule_state *state, struct schedule_node *n)
324 {
325         uint64_t inst = n->inst->inst;
326         uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD);
327         uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL);
328         uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
329         uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
330         uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
331         uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
332         uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
333         uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
334         uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A);
335         uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B);
336         uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
337 
338         if (sig != QPU_SIG_LOAD_IMM) {
339                 process_raddr_deps(state, n, raddr_a, true);
340                 if (sig != QPU_SIG_SMALL_IMM &&
341                     sig != QPU_SIG_BRANCH)
342                         process_raddr_deps(state, n, raddr_b, false);
343         }
344 
345         if (add_op != QPU_A_NOP) {
346                 process_mux_deps(state, n, add_a);
347                 process_mux_deps(state, n, add_b);
348         }
349         if (mul_op != QPU_M_NOP) {
350                 process_mux_deps(state, n, mul_a);
351                 process_mux_deps(state, n, mul_b);
352         }
353 
354         process_waddr_deps(state, n, waddr_add, true);
355         process_waddr_deps(state, n, waddr_mul, false);
356         if (qpu_writes_r4(inst))
357                 add_write_dep(state, &state->last_r[4], n);
358 
359         switch (sig) {
360         case QPU_SIG_SW_BREAKPOINT:
361         case QPU_SIG_NONE:
362         case QPU_SIG_SMALL_IMM:
363         case QPU_SIG_LOAD_IMM:
364                 break;
365 
366         case QPU_SIG_THREAD_SWITCH:
367         case QPU_SIG_LAST_THREAD_SWITCH:
368                 /* All accumulator contents and flags are undefined after the
369                  * switch.
370                  */
371                 for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
372                         add_write_dep(state, &state->last_r[i], n);
373                 add_write_dep(state, &state->last_sf, n);
374 
375                 /* Scoreboard-locking operations have to stay after the last
376                  * thread switch.
377                  */
378                 add_write_dep(state, &state->last_tlb, n);
379 
380                 add_write_dep(state, &state->last_tmu_write, n);
381                 break;
382 
383         case QPU_SIG_LOAD_TMU0:
384         case QPU_SIG_LOAD_TMU1:
385                 /* TMU load results come back through a FIFO, so ordering is important.
386                  */
387                 add_write_dep(state, &state->last_tmu_write, n);
388                 break;
389 
390         case QPU_SIG_COLOR_LOAD:
391                 add_read_dep(state, state->last_tlb, n);
392                 break;
393 
394         case QPU_SIG_BRANCH:
395                 add_read_dep(state, state->last_sf, n);
396                 break;
397 
398         case QPU_SIG_PROG_END:
399         case QPU_SIG_WAIT_FOR_SCOREBOARD:
400         case QPU_SIG_SCOREBOARD_UNLOCK:
401         case QPU_SIG_COVERAGE_LOAD:
402         case QPU_SIG_COLOR_LOAD_END:
403         case QPU_SIG_ALPHA_MASK_LOAD:
404                 fprintf(stderr, "Unhandled signal bits %d\n", sig);
405                 abort();
406         }
407 
408         process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
409         process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
410         if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH)
411                 add_write_dep(state, &state->last_sf, n);
412 }
413 
414 static void
415 calculate_forward_deps(struct vc4_compile *c, struct dag *dag,
416                        struct list_head *schedule_list)
417 {
418         struct schedule_state state;
419 
420         memset(&state, 0, sizeof(state));
421         state.dag = dag;
422         state.dir = F;
423 
424         list_for_each_entry(struct schedule_node, node, schedule_list, link)
425                 calculate_deps(&state, node);
426 }
427 
428 static void
429 calculate_reverse_deps(struct vc4_compile *c, struct dag *dag,
430                        struct list_head *schedule_list)
431 {
432         struct schedule_state state;
433 
434         memset(&state, 0, sizeof(state));
435         state.dag = dag;
436         state.dir = R;
437 
438         list_for_each_entry_rev(struct schedule_node, node, schedule_list,
439                                 link) {
440                 calculate_deps(&state, (struct schedule_node *)node);
441         }
442 }
443 
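/* Per-emission state used to respect hazards that depend on the final
 * instruction ordering: the "can't read a physical regfile location
 * written by the previous instruction" rule, the SFU-to-r4 result latency,
 * the delay after resetting the uniforms address, and whether the TLB
 * scoreboard has already been locked.
 */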
444 struct choose_scoreboard {
445         struct dag *dag;
446         int tick;
447         int last_sfu_write_tick;
448         int last_uniforms_reset_tick;
449         uint32_t last_waddr_a, last_waddr_b;
450         bool tlb_locked;
451 };
452 
453 static bool
454 reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
455 {
456         uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
457         uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
458         uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
459 
460         /* Full immediate loads don't read any registers. */
461         if (sig == QPU_SIG_LOAD_IMM)
462                 return false;
463 
464         uint32_t src_muxes[] = {
465                 QPU_GET_FIELD(inst, QPU_ADD_A),
466                 QPU_GET_FIELD(inst, QPU_ADD_B),
467                 QPU_GET_FIELD(inst, QPU_MUL_A),
468                 QPU_GET_FIELD(inst, QPU_MUL_B),
469         };
470         for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
471                 if ((src_muxes[i] == QPU_MUX_A &&
472                      raddr_a < 32 &&
473                      scoreboard->last_waddr_a == raddr_a) ||
474                     (src_muxes[i] == QPU_MUX_B &&
475                      sig != QPU_SIG_SMALL_IMM &&
476                      raddr_b < 32 &&
477                      scoreboard->last_waddr_b == raddr_b)) {
478                         return true;
479                 }
480 
481                 if (src_muxes[i] == QPU_MUX_R4) {
482                         if (scoreboard->tick -
483                             scoreboard->last_sfu_write_tick <= 2) {
484                                 return true;
485                         }
486                 }
487         }
488 
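        /* Full-vector rotate operations on the mul ALU source from the
         * accumulators, and must not read an accumulator written by the
         * previous instruction.
         */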
489         if (sig == QPU_SIG_SMALL_IMM &&
490             QPU_GET_FIELD(inst, QPU_SMALL_IMM) >= QPU_SMALL_IMM_MUL_ROT) {
491                 uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
492                 uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);
493 
494                 if (scoreboard->last_waddr_a == mux_a + QPU_W_ACC0 ||
495                     scoreboard->last_waddr_a == mux_b + QPU_W_ACC0 ||
496                     scoreboard->last_waddr_b == mux_a + QPU_W_ACC0 ||
497                     scoreboard->last_waddr_b == mux_b + QPU_W_ACC0) {
498                         return true;
499                 }
500         }
501 
502         if (reads_uniform(inst) &&
503             scoreboard->tick - scoreboard->last_uniforms_reset_tick <= 2) {
504                 return true;
505         }
506 
507         return false;
508 }
509 
510 static bool
511 pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst)
512 {
513         return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst));
514 }
515 
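/* Returns a scheduling priority for an instruction; higher values are
 * chosen first.  TMU coordinate writes (3) beat ordinary instructions (2),
 * which beat TMU result loads (1), which beat TLB accesses (0), so texture
 * fetches start early and scoreboard-locking work is deferred as long as
 * possible.
 */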
516 static int
517 get_instruction_priority(uint64_t inst)
518 {
519         uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
520         uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
521         uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
522         uint32_t baseline_score;
523         uint32_t next_score = 0;
524 
525         /* Schedule TLB operations as late as possible, to get more
526          * parallelism between shaders.
527          */
528         if (qpu_inst_is_tlb(inst))
529                 return next_score;
530         next_score++;
531 
532         /* Schedule texture read results collection late to hide latency. */
533         if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1)
534                 return next_score;
535         next_score++;
536 
537         /* Default score for things that aren't otherwise special. */
538         baseline_score = next_score;
539         next_score++;
540 
541         /* Schedule texture read setup early to hide their latency better. */
542         if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul))
543                 return next_score;
544         next_score++;
545 
546         return baseline_score;
547 }
548 
549 static struct schedule_node *
550 choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
551                                struct list_head *schedule_list,
552                                struct schedule_node *prev_inst)
553 {
554         struct schedule_node *chosen = NULL;
555         int chosen_prio = 0;
556 
557         /* Don't pair up anything with a thread switch signal -- emit_thrsw()
558          * will handle pairing it along with filling the delay slots.
559          */
560         if (prev_inst) {
561                 uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst,
562                                                   QPU_SIG);
563                 if (prev_sig == QPU_SIG_THREAD_SWITCH ||
564                     prev_sig == QPU_SIG_LAST_THREAD_SWITCH) {
565                         return NULL;
566                 }
567         }
568 
569         list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
570                             dag.link) {
571                 uint64_t inst = n->inst->inst;
572                 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
573 
574                 /* Don't choose the branch instruction until it's the last one
575                  * left.  XXX: We could potentially choose it before it's the
576                  * last one, if the remaining instructions fit in the delay
577                  * slots.
578                  */
579                 if (sig == QPU_SIG_BRANCH &&
580                     !list_is_singular(&scoreboard->dag->heads)) {
581                         continue;
582                 }
583 
584                 /* "An instruction must not read from a location in physical
585                  *  regfile A or B that was written to by the previous
586                  *  instruction."
587                  */
588                 if (reads_too_soon_after_write(scoreboard, inst))
589                         continue;
590 
591                 /* "A scoreboard wait must not occur in the first two
592                  *  instructions of a fragment shader. This is either the
593                  *  explicit Wait for Scoreboard signal or an implicit wait
594                  *  with the first tile-buffer read or write instruction."
595                  */
596                 if (pixel_scoreboard_too_soon(scoreboard, inst))
597                         continue;
598 
599                 /* If we're trying to pair with another instruction, check
600                  * that they're compatible.
601                  */
602                 if (prev_inst) {
603                         /* Don't pair up a thread switch signal -- we'll
604                          * handle pairing it when we pick it on its own.
605                          */
606                         if (sig == QPU_SIG_THREAD_SWITCH ||
607                             sig == QPU_SIG_LAST_THREAD_SWITCH) {
608                                 continue;
609                         }
610 
611                         if (prev_inst->uniform != -1 && n->uniform != -1)
612                                 continue;
613 
614                         /* Don't merge in something that will lock the TLB.
615                          * Hopefully what we have in inst will release some
616                          * other instructions, allowing us to delay the
617                          * TLB-locking instruction until later.
618                          */
619                         if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
620                                 continue;
621 
622                         inst = qpu_merge_inst(prev_inst->inst->inst, inst);
623                         if (!inst)
624                                 continue;
625                 }
626 
627                 int prio = get_instruction_priority(inst);
628 
629                 /* Found a valid instruction.  If nothing better comes along,
630                  * this one works.
631                  */
632                 if (!chosen) {
633                         chosen = n;
634                         chosen_prio = prio;
635                         continue;
636                 }
637 
638                 if (prio > chosen_prio) {
639                         chosen = n;
640                         chosen_prio = prio;
641                 } else if (prio < chosen_prio) {
642                         continue;
643                 }
644 
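                /* With equal priority, break ties by critical-path length
                 * (delay), so instructions on the longest dependency chain
                 * to the end of the program get scheduled first.
                 */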
645                 if (n->delay > chosen->delay) {
646                         chosen = n;
647                         chosen_prio = prio;
648                 } else if (n->delay < chosen->delay) {
649                         continue;
650                 }
651         }
652 
653         return chosen;
654 }
655 
656 static void
657 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
658                              uint64_t inst)
659 {
660         uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
661         uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
662 
663         if (!(inst & QPU_WS)) {
664                 scoreboard->last_waddr_a = waddr_add;
665                 scoreboard->last_waddr_b = waddr_mul;
666         } else {
667                 scoreboard->last_waddr_b = waddr_add;
668                 scoreboard->last_waddr_a = waddr_mul;
669         }
670 
671         if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) ||
672             (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
673                 scoreboard->last_sfu_write_tick = scoreboard->tick;
674         }
675 
676         if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
677             waddr_mul == QPU_W_UNIFORMS_ADDRESS) {
678                 scoreboard->last_uniforms_reset_tick = scoreboard->tick;
679         }
680 
681         if (qpu_inst_is_tlb(inst))
682                 scoreboard->tlb_locked = true;
683 }
684 
685 static void
686 dump_state(struct dag *dag)
687 {
688         list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
689                 fprintf(stderr, "         t=%4d: ", n->unblocked_time);
690                 vc4_qpu_disasm(&n->inst->inst, 1);
691                 fprintf(stderr, "\n");
692 
693                 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
694                         struct schedule_node *child =
695                                 (struct schedule_node *)edge->child;
696                         if (!child)
697                                 continue;
698 
699                         fprintf(stderr, "                 - ");
700                         vc4_qpu_disasm(&child->inst->inst, 1);
701                         fprintf(stderr, " (%d parents, %c)\n",
702                                 child->dag.parent_count,
703                                 edge->data ? 'w' : 'r');
704                 }
705         }
706 }
707 
708 static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
709 {
710         if (waddr < 32)
711                 return 2;
712 
713         /* Apply some huge latency between texture fetch requests and getting
714          * their results back.
715          *
716          * FIXME: This is actually pretty bogus.  If we do:
717          *
718          * mov tmu0_s, a
719          * <a bit of math>
720          * mov tmu0_s, b
721          * load_tmu0
722          * <more math>
723          * load_tmu0
724          *
725          * we count that as worse than
726          *
727          * mov tmu0_s, a
728          * mov tmu0_s, b
729          * <lots of math>
730          * load_tmu0
731          * <more math>
732          * load_tmu0
733          *
734          * because we associate the first load_tmu0 with the *second* tmu0_s.
735          */
736         if (waddr == QPU_W_TMU0_S) {
737                 if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
738                         return 100;
739         }
740         if (waddr == QPU_W_TMU1_S) {
741                 if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
742                         return 100;
743         }
744 
745         switch (waddr) {
746         case QPU_W_SFU_RECIP:
747         case QPU_W_SFU_RECIPSQRT:
748         case QPU_W_SFU_EXP:
749         case QPU_W_SFU_LOG:
750                 return 3;
751         default:
752                 return 1;
753         }
754 }
755 
756 static uint32_t
757 instruction_latency(struct schedule_node *before, struct schedule_node *after)
758 {
759         uint64_t before_inst = before->inst->inst;
760         uint64_t after_inst = after->inst->inst;
761 
762         return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
763                                   after_inst),
764                     waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
765                                   after_inst));
766 }
767 
768 /** Computation of the delay member of a node, done bottom-up through the DAG. */
769 static void
770 compute_delay(struct dag_node *node, void *state)
771 {
772         struct schedule_node *n = (struct schedule_node *)node;
773 
774         n->delay = 1;
775 
776         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
777                 struct schedule_node *child =
778                         (struct schedule_node *)edge->child;
779                 n->delay = MAX2(n->delay, (child->delay +
780                                            instruction_latency(n, child)));
781         }
782 }
783 
784 /* Removes a DAG head, but only removes its WAR edges.  (dag_prune_head()
785  * should be called on it later to finish pruning the other edges.)
786  */
787 static void
788 pre_remove_head(struct dag *dag, struct schedule_node *n)
789 {
790         list_delinit(&n->dag.link);
791 
792         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
793                 if (edge->data)
794                         dag_remove_edge(dag, edge);
795         }
796 }
797 
798 static void
799 mark_instruction_scheduled(struct dag *dag,
800                            uint32_t time,
801                            struct schedule_node *node)
802 {
803         if (!node)
804                 return;
805 
806         util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
807                 struct schedule_node *child =
808                         (struct schedule_node *)edge->child;
809 
810                 if (!child)
811                         continue;
812 
813                 uint32_t latency = instruction_latency(node, child);
814 
815                 child->unblocked_time = MAX2(child->unblocked_time,
816                                              time + latency);
817         }
818         dag_prune_head(dag, &node->dag);
819 }
820 
821 /**
822  * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
823  * with another instruction.
824  */
825 static void
826 emit_thrsw(struct vc4_compile *c,
827            struct choose_scoreboard *scoreboard,
828            uint64_t inst)
829 {
830         uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
831 
832         /* There should be nothing in a thrsw inst being scheduled other than
833          * the signal bits.
834          */
835         assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP);
836         assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP);
837 
838         /* Try to find an earlier scheduled instruction that we can merge the
839          * thrsw into.
840          */
841         int thrsw_ip = c->qpu_inst_count;
842         for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
843                 uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
844                 uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);
845 
846                 if (prev_sig == QPU_SIG_NONE)
847                         thrsw_ip = c->qpu_inst_count - i;
848         }
849 
850         if (thrsw_ip != c->qpu_inst_count) {
851                 /* Merge the thrsw into the existing instruction. */
852                 c->qpu_insts[thrsw_ip] =
853                         QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
854         } else {
855                 qpu_serialize_one_inst(c, inst);
856                 update_scoreboard_for_chosen(scoreboard, inst);
857         }
858 
859         /* Fill the delay slots. */
860         while (c->qpu_inst_count < thrsw_ip + 3) {
861                 update_scoreboard_for_chosen(scoreboard, qpu_NOP());
862                 qpu_serialize_one_inst(c, qpu_NOP());
863         }
864 }
865 
866 static uint32_t
867 schedule_instructions(struct vc4_compile *c,
868                       struct choose_scoreboard *scoreboard,
869                       struct qblock *block,
870                       struct list_head *schedule_list,
871                       enum quniform_contents *orig_uniform_contents,
872                       uint32_t *orig_uniform_data,
873                       uint32_t *next_uniform)
874 {
875         uint32_t time = 0;
876 
877         while (!list_is_empty(&scoreboard->dag->heads)) {
878                 struct schedule_node *chosen =
879                         choose_instruction_to_schedule(scoreboard,
880                                                        schedule_list,
881                                                        NULL);
882                 struct schedule_node *merge = NULL;
883 
884                 /* If there are no valid instructions to schedule, drop a NOP
885                  * in.
886                  */
887                 uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();
888 
889                 if (debug) {
890                         fprintf(stderr, "t=%4d: current list:\n",
891                                 time);
892                         dump_state(scoreboard->dag);
893                         fprintf(stderr, "t=%4d: chose: ", time);
894                         vc4_qpu_disasm(&inst, 1);
895                         fprintf(stderr, "\n");
896                 }
897 
898                 /* Schedule this instruction onto the QPU list. Also try to
899                  * find an instruction to pair with it.
900                  */
901                 if (chosen) {
902                         time = MAX2(chosen->unblocked_time, time);
903                         pre_remove_head(scoreboard->dag, chosen);
904                         if (chosen->uniform != -1) {
905                                 c->uniform_data[*next_uniform] =
906                                         orig_uniform_data[chosen->uniform];
907                                 c->uniform_contents[*next_uniform] =
908                                         orig_uniform_contents[chosen->uniform];
909                                 (*next_uniform)++;
910                         }
911 
912                         merge = choose_instruction_to_schedule(scoreboard,
913                                                                schedule_list,
914                                                                chosen);
915                         if (merge) {
916                                 time = MAX2(merge->unblocked_time, time);
917                                 inst = qpu_merge_inst(inst, merge->inst->inst);
918                                 assert(inst != 0);
919                                 if (merge->uniform != -1) {
920                                         c->uniform_data[*next_uniform] =
921                                                 orig_uniform_data[merge->uniform];
922                                         c->uniform_contents[*next_uniform] =
923                                                 orig_uniform_contents[merge->uniform];
924                                         (*next_uniform)++;
925                                 }
926 
927                                 if (debug) {
928                                         fprintf(stderr, "t=%4d: merging: ",
929                                                 time);
930                                         vc4_qpu_disasm(&merge->inst->inst, 1);
931                                         fprintf(stderr, "\n");
932                                         fprintf(stderr, "            resulting in: ");
933                                         vc4_qpu_disasm(&inst, 1);
934                                         fprintf(stderr, "\n");
935                                 }
936                         }
937                 }
938 
939                 if (debug) {
940                         fprintf(stderr, "\n");
941                 }
942 
943                 /* Now that we've scheduled a new instruction, some of its
944                  * children can be promoted to the list of instructions ready to
945                  * be scheduled.  Update the children's unblocked time for this
946                  * DAG edge as we do so.
947                  */
948                 mark_instruction_scheduled(scoreboard->dag, time, chosen);
949                 mark_instruction_scheduled(scoreboard->dag, time, merge);
950 
951                 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
952                     QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
953                         emit_thrsw(c, scoreboard, inst);
954                 } else {
955                         qpu_serialize_one_inst(c, inst);
956                         update_scoreboard_for_chosen(scoreboard, inst);
957                 }
958 
959                 scoreboard->tick++;
960                 time++;
961 
962                 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) {
963                         block->branch_qpu_ip = c->qpu_inst_count - 1;
964                         /* Fill the delay slots.
965                          *
966                          * We should fill these with actual instructions,
967                          * instead, but that will probably need to be done
968                          * after this, once we know what the leading
969                          * instructions of the successors are (so we can
970                          * handle A/B register file write latency)
971                          */
972                         inst = qpu_NOP();
973                         update_scoreboard_for_chosen(scoreboard, inst);
974                         qpu_serialize_one_inst(c, inst);
975                         qpu_serialize_one_inst(c, inst);
976                         qpu_serialize_one_inst(c, inst);
977                 }
978         }
979 
980         return time;
981 }
982 
983 static uint32_t
984 qpu_schedule_instructions_block(struct vc4_compile *c,
985                                 struct choose_scoreboard *scoreboard,
986                                 struct qblock *block,
987                                 enum quniform_contents *orig_uniform_contents,
988                                 uint32_t *orig_uniform_data,
989                                 uint32_t *next_uniform)
990 {
991         scoreboard->dag = dag_create(NULL);
992         struct list_head setup_list;
993 
994         list_inithead(&setup_list);
995 
996         /* Wrap each instruction in a scheduler structure. */
997         uint32_t next_sched_uniform = *next_uniform;
998         while (!list_is_empty(&block->qpu_inst_list)) {
999                 struct queued_qpu_inst *inst =
1000                         (struct queued_qpu_inst *)block->qpu_inst_list.next;
1001                 struct schedule_node *n = rzalloc(scoreboard->dag,
1002                                                   struct schedule_node);
1003 
1004                 dag_init_node(scoreboard->dag, &n->dag);
1005                 n->inst = inst;
1006 
1007                 if (reads_uniform(inst->inst)) {
1008                         n->uniform = next_sched_uniform++;
1009                 } else {
1010                         n->uniform = -1;
1011                 }
1012                 list_del(&inst->link);
1013                 list_addtail(&n->link, &setup_list);
1014         }
1015 
1016         calculate_forward_deps(c, scoreboard->dag, &setup_list);
1017         calculate_reverse_deps(c, scoreboard->dag, &setup_list);
1018 
1019         dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);
1020 
1021         uint32_t cycles = schedule_instructions(c, scoreboard, block,
1022                                                 &setup_list,
1023                                                 orig_uniform_contents,
1024                                                 orig_uniform_data,
1025                                                 next_uniform);
1026 
1027         ralloc_free(scoreboard->dag);
1028         scoreboard->dag = NULL;
1029 
1030         return cycles;
1031 }
1032 
1033 static void
1034 qpu_set_branch_targets(struct vc4_compile *c)
1035 {
1036         qir_for_each_block(block, c) {
1037                 /* The end block of the program has no branch. */
1038                 if (!block->successors[0])
1039                         continue;
1040 
1041                 /* If there was no branch instruction, then the successor
1042                  * block must follow immediately after this one.
1043                  */
1044                 if (block->branch_qpu_ip == ~0) {
1045                         assert(block->end_qpu_ip + 1 ==
1046                                block->successors[0]->start_qpu_ip);
1047                         continue;
1048                 }
1049 
1050                 /* Set the branch target for the block that doesn't follow
1051                  * immediately after ours.
1052                  */
1053                 uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip];
1054                 assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH);
1055                 assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0);
1056 
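                /* The branch target is encoded as a byte offset relative to
                 * the instruction after the branch's three delay slots
                 * (branch IP + 4), hence the subtraction below.
                 */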
1057                 uint32_t branch_target =
1058                         (block->successors[0]->start_qpu_ip -
1059                          (block->branch_qpu_ip + 4)) * sizeof(uint64_t);
1060                 *branch_inst = (*branch_inst |
1061                                 QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET));
1062 
1063                 /* Make sure that the if-we-don't-jump successor was scheduled
1064                  * just after the delay slots.
1065                  */
1066                 if (block->successors[1]) {
1067                         assert(block->successors[1]->start_qpu_ip ==
1068                                block->branch_qpu_ip + 4);
1069                 }
1070         }
1071 }
1072 
1073 uint32_t
1074 qpu_schedule_instructions(struct vc4_compile *c)
1075 {
1076         /* We reorder the uniforms as we schedule instructions, so save the
1077          * old data off and replace it.
1078          */
1079         uint32_t *uniform_data = c->uniform_data;
1080         enum quniform_contents *uniform_contents = c->uniform_contents;
1081         c->uniform_contents = ralloc_array(c, enum quniform_contents,
1082                                            c->num_uniforms);
1083         c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
1084         c->uniform_array_size = c->num_uniforms;
1085         uint32_t next_uniform = 0;
1086 
1087         struct choose_scoreboard scoreboard;
1088         memset(&scoreboard, 0, sizeof(scoreboard));
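        /* Initialize the hazard trackers to values that can't match or
         * constrain the first instructions of the program.
         */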
1089         scoreboard.last_waddr_a = ~0;
1090         scoreboard.last_waddr_b = ~0;
1091         scoreboard.last_sfu_write_tick = -10;
1092         scoreboard.last_uniforms_reset_tick = -10;
1093 
1094         if (debug) {
1095                 fprintf(stderr, "Pre-schedule instructions\n");
1096                 qir_for_each_block(block, c) {
1097                         fprintf(stderr, "BLOCK %d\n", block->index);
1098                         list_for_each_entry(struct queued_qpu_inst, q,
1099                                             &block->qpu_inst_list, link) {
1100                                 vc4_qpu_disasm(&q->inst, 1);
1101                                 fprintf(stderr, "\n");
1102                         }
1103                 }
1104                 fprintf(stderr, "\n");
1105         }
1106 
1107         uint32_t cycles = 0;
1108         qir_for_each_block(block, c) {
1109                 block->start_qpu_ip = c->qpu_inst_count;
1110                 block->branch_qpu_ip = ~0;
1111 
1112                 cycles += qpu_schedule_instructions_block(c,
1113                                                           &scoreboard,
1114                                                           block,
1115                                                           uniform_contents,
1116                                                           uniform_data,
1117                                                           &next_uniform);
1118 
1119                 block->end_qpu_ip = c->qpu_inst_count - 1;
1120         }
1121 
1122         qpu_set_branch_targets(c);
1123 
1124         assert(next_uniform == c->num_uniforms);
1125 
1126         if (debug) {
1127                 fprintf(stderr, "Post-schedule instructions\n");
1128                 vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count);
1129                 fprintf(stderr, "\n");
1130         }
1131 
1132         return cycles;
1133 }
1134