/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */
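
/* In pseudocode, the core scheduling loop looks roughly like this (a
 * simplified sketch; thrsw pairing, delay-slot filling, and branch handling
 * in the real code below are omitted):
 *
 *     calculate_forward_deps(c, dag, schedule_list);
 *     calculate_reverse_deps(c, dag, schedule_list);
 *     while (!list_is_empty(&scoreboard->dag->heads)) {
 *             n = choose_instruction_to_schedule(c, scoreboard, NULL);
 *             emit(n->inst);   // append to the emitted instruction stream
 *             update_scoreboard_for_chosen(scoreboard, &n->inst->qpu,
 *                                          devinfo);
 *             dag_prune_head(scoreboard->dag, &n->dag);
 *     }
 */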

#include "qpu/qpu_disasm.h"
#include "v3d_compiler.h"
#include "util/ralloc.h"
#include "util/dag.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        struct qinst *inst;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * Cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

struct schedule_state {
        const struct v3d_device_info *devinfo;
        struct dag *dag;
        struct schedule_node *last_r[6];
        struct schedule_node *last_rf[64];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tmu_config;
        struct schedule_node *last_tmu_read;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_unif;
        struct schedule_node *last_rtop;
        struct schedule_node *last_unifa;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};

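/* Adds a dependency edge from "before" to "after", flipping the edge
 * direction when we're walking the instruction list in reverse.
 */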
static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;
        void *edge_data = (void *)(uintptr_t)write_after_read;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == F)
                dag_add_edge(&before->dag, &after->dag, edge_data);
        else
                dag_add_edge(&after->dag, &before->dag, edge_data);
}

static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

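/* Returns true if the instruction accesses the TLB, either through a magic
 * waddr write or the ldtlb/ldtlbu load signals.
 */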
static bool
qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
{
        if (inst->sig.ldtlb || inst->sig.ldtlbu)
                return true;

        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (inst->alu.add.magic_write &&
            (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        if (inst->alu.mul.magic_write &&
            (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        return false;
}

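/* Adds a read dependency on the last writer of the register file location or
 * accumulator selected by the given input mux.
 */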
static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_A:
                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
                break;
        case V3D_QPU_MUX_B:
                if (!n->inst->qpu.sig.small_imm) {
                        add_read_dep(state,
                                     state->last_rf[n->inst->qpu.raddr_b], n);
                }
                break;
        default:
                add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
                break;
        }
}

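/* Returns true if a write to this magic TMU register terminates a TMU
 * request sequence (these are the lookup-triggering "s"/"a" registers).
 */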
static bool
tmu_write_is_sequence_terminator(uint32_t waddr)
{
        switch (waddr) {
        case V3D_QPU_WADDR_TMUS:
        case V3D_QPU_WADDR_TMUSCM:
        case V3D_QPU_WADDR_TMUSF:
        case V3D_QPU_WADDR_TMUSLOD:
        case V3D_QPU_WADDR_TMUA:
        case V3D_QPU_WADDR_TMUAU:
                return true;
        default:
                return false;
        }
}

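/* Returns true if this TMU parameter write can be reordered against other
 * TMU writes: on V3D 4.x, anything other than the data register (TMUD) and
 * the sequence terminators.
 */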
static bool
can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
{
        if (devinfo->ver < 40)
                return false;

        if (tmu_write_is_sequence_terminator(waddr))
                return false;

        if (waddr == V3D_QPU_WADDR_TMUD)
                return false;

        return true;
}

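/* Tracks the write dependencies implied by an instruction's write address,
 * covering both register file writes and magic peripheral registers.
 */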
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool magic)
{
        if (!magic) {
                add_write_dep(state, &state->last_rf[waddr], n);
        } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
                if (can_reorder_tmu_write(state->devinfo, waddr))
                        add_read_dep(state, state->last_tmu_write, n);
                else
                        add_write_dep(state, &state->last_tmu_write, n);

                if (tmu_write_is_sequence_terminator(waddr))
                        add_write_dep(state, &state->last_tmu_config, n);
        } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
                /* Handled by v3d_qpu_writes_r4() check. */
        } else {
                switch (waddr) {
                case V3D_QPU_WADDR_R0:
                case V3D_QPU_WADDR_R1:
                case V3D_QPU_WADDR_R2:
                        add_write_dep(state,
                                      &state->last_r[waddr - V3D_QPU_WADDR_R0],
                                      n);
                        break;
                case V3D_QPU_WADDR_R3:
                case V3D_QPU_WADDR_R4:
                case V3D_QPU_WADDR_R5:
                        /* Handled by v3d_qpu_writes_r*() checks below. */
                        break;

                case V3D_QPU_WADDR_VPM:
                case V3D_QPU_WADDR_VPMU:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case V3D_QPU_WADDR_TLB:
                case V3D_QPU_WADDR_TLBU:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case V3D_QPU_WADDR_SYNC:
                case V3D_QPU_WADDR_SYNCB:
                case V3D_QPU_WADDR_SYNCU:
                        /* For CS barrier(): Sync against any other memory
                         * accesses.  There doesn't appear to be any need for
                         * barriers to affect ALU operations.
                         */
                        add_write_dep(state, &state->last_tmu_write, n);
                        add_write_dep(state, &state->last_tmu_read, n);
                        break;

                case V3D_QPU_WADDR_UNIFA:
                        if (state->devinfo->ver >= 40)
                                add_write_dep(state, &state->last_unifa, n);
                        break;

                case V3D_QPU_WADDR_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        const struct v3d_device_info *devinfo = state->devinfo;
        struct qinst *qinst = n->inst;
        struct v3d_qpu_instr *inst = &qinst->qpu;
        /* If the input and output segments are shared, then all VPM reads to
         * a location need to happen before all writes.  We handle this by
         * serializing all VPM operations for now.
         */
        bool separate_vpm_segment = false;

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
                        add_read_dep(state, state->last_sf, n);

                /* XXX: BDI */
                /* XXX: BDU */
                /* XXX: ub */
                /* XXX: raddr_a */

                add_write_dep(state, &state->last_unif, n);
                return;
        }

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        /* XXX: LOAD_IMM */

        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
                process_mux_deps(state, n, inst->alu.add.a);
        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
                process_mux_deps(state, n, inst->alu.add.b);

        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
                process_mux_deps(state, n, inst->alu.mul.a);
        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
                process_mux_deps(state, n, inst->alu.mul.b);

        switch (inst->alu.add.op) {
        case V3D_QPU_A_VPMSETUP:
                /* Could distinguish read/write by unpacking the uniform. */
                add_write_dep(state, &state->last_vpm, n);
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case V3D_QPU_A_STVPMV:
        case V3D_QPU_A_STVPMD:
        case V3D_QPU_A_STVPMP:
                add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_LDVPMV_IN:
        case V3D_QPU_A_LDVPMD_IN:
        case V3D_QPU_A_LDVPMG_IN:
        case V3D_QPU_A_LDVPMP:
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_VPMWT:
                add_read_dep(state, state->last_vpm, n);
                break;

        case V3D_QPU_A_MSF:
                add_read_dep(state, state->last_tlb, n);
                break;

        case V3D_QPU_A_SETMSF:
        case V3D_QPU_A_SETREVF:
                add_write_dep(state, &state->last_tlb, n);
                break;

        default:
                break;
        }

        switch (inst->alu.mul.op) {
        case V3D_QPU_M_MULTOP:
        case V3D_QPU_M_UMUL24:
                /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
                 * resets it to 0.  We could possibly reorder umul24s relative
                 * to each other, but for now just keep all the MUL parts in
                 * order.
                 */
                add_write_dep(state, &state->last_rtop, n);
                break;
        default:
                break;
        }

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                process_waddr_deps(state, n, inst->alu.add.waddr,
                                   inst->alu.add.magic_write);
        }
        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                process_waddr_deps(state, n, inst->alu.mul.waddr,
                                   inst->alu.mul.magic_write);
        }
        if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
                process_waddr_deps(state, n, inst->sig_addr,
                                   inst->sig_magic);
        }

        if (v3d_qpu_writes_r3(devinfo, inst))
                add_write_dep(state, &state->last_r[3], n);
        if (v3d_qpu_writes_r4(devinfo, inst))
                add_write_dep(state, &state->last_r[4], n);
        if (v3d_qpu_writes_r5(devinfo, inst))
                add_write_dep(state, &state->last_r[5], n);

        /* If we add any more dependencies here we should consider whether we
         * also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
         */
        if (inst->sig.thrsw) {
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);
                add_write_dep(state, &state->last_rtop, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                add_write_dep(state, &state->last_tmu_config, n);
        }

        if (v3d_qpu_waits_on_tmu(inst)) {
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_read, n);
                /* Keep TMU loads after their TMU lookup terminator. */
                add_read_dep(state, state->last_tmu_config, n);
        }

        /* Allow wrtmuc to be reordered with other instructions in the
         * same TMU sequence by using a read dependency on the last TMU
         * sequence terminator.
         */
        if (inst->sig.wrtmuc)
                add_read_dep(state, state->last_tmu_config, n);

        if (inst->sig.ldtlb | inst->sig.ldtlbu)
                add_write_dep(state, &state->last_tlb, n);

        if (inst->sig.ldvpm) {
                add_write_dep(state, &state->last_vpm_read, n);

                /* At least for now, we're doing shared I/O segments, so queue
                 * all writes after all reads.
                 */
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
        }

        /* inst->sig.ldunif or sideband uniform read */
        if (vir_has_uniform(qinst))
                add_write_dep(state, &state->last_unif, n);

        /* Both unifa and ldunifa must preserve ordering. */
        if (inst->sig.ldunifa || inst->sig.ldunifarf)
                add_write_dep(state, &state->last_unifa, n);

        if (v3d_qpu_reads_flags(inst))
                add_read_dep(state, state->last_sf, n);
        if (v3d_qpu_writes_flags(inst))
                add_write_dep(state, &state->last_sf, n);
}

static void
calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.devinfo = c->devinfo;
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.devinfo = c->devinfo;
        state.dir = R;

        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
                                link) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

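/* State tracked across emitted instructions, recording on which tick recent
 * operations landed so later choices can honor instruction timing rules.
 */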
struct choose_scoreboard {
        struct dag *dag;
        int tick;
        int last_magic_sfu_write_tick;
        int last_stallable_sfu_reg;
        int last_stallable_sfu_tick;
        int last_ldvary_tick;
        int last_unifa_write_tick;
        int last_uniforms_reset_tick;
        int last_thrsw_tick;
        int last_branch_tick;
        int last_setmsf_tick;
        bool first_thrsw_emitted;
        bool last_thrsw_emitted;
        bool fixup_ldvary;
        int ldvary_count;
};

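/* Returns true if reading this mux now would consume an r4 or r5 value
 * before the SFU write or ldvary that produces it has landed.
 */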
static bool
mux_reads_too_soon(struct choose_scoreboard *scoreboard,
                   const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_R4:
                if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
                        return true;
                break;

        case V3D_QPU_MUX_R5:
                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
                        return true;
                break;
        default:
                break;
        }

        return false;
}

static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
                           struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* XXX: Branching off of raddr. */
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
                        return true;
                }
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
                        return true;
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
                        return true;
                }
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
                        return true;
                }
        }

        /* XXX: imm */

        return false;
}

static bool
writes_too_soon_after_write(const struct v3d_device_info *devinfo,
                            struct choose_scoreboard *scoreboard,
                            struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Don't schedule any other r4 write too soon after an SFU write.
         * This would normally be prevented by dependency tracking, but might
         * occur if a dead SFU computation makes it to scheduling.
         */
        if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
            v3d_qpu_writes_r4(devinfo, inst))
                return true;

        return false;
}

static bool
scoreboard_is_locked(struct choose_scoreboard *scoreboard,
                     bool lock_scoreboard_on_first_thrsw)
{
        if (lock_scoreboard_on_first_thrsw) {
                return scoreboard->first_thrsw_emitted &&
                       scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
        }

        return scoreboard->last_thrsw_emitted &&
               scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
}

static bool
pixel_scoreboard_too_soon(struct v3d_compile *c,
                          struct choose_scoreboard *scoreboard,
                          const struct v3d_qpu_instr *inst)
{
        return qpu_inst_is_tlb(inst) &&
               !scoreboard_is_locked(scoreboard,
                                     c->lock_scoreboard_on_first_thrsw);
}

static bool
qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
                        uint32_t waddr)
{
        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
            inst->raddr_a == waddr)
                return true;

        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
            !inst->sig.small_imm && (inst->raddr_b == waddr))
                return true;

        return false;
}

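/* Returns true if the instruction would read, on the very next tick, the
 * physical register written by a stallable SFU op, causing a QPU stall.
 */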
static bool
mux_read_stalls(struct choose_scoreboard *scoreboard,
                const struct v3d_qpu_instr *inst)
{
        return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
                qpu_instruction_uses_rf(inst,
                                        scoreboard->last_stallable_sfu_reg);
}

/* We define a max schedule priority to allow negative priorities as a result
 * of subtracting this max when an instruction stalls, so that instructions
 * that stall get lower priority than regular instructions.
 */
#define MAX_SCHEDULE_PRIORITY 16

static int
get_instruction_priority(const struct v3d_device_info *devinfo,
                         const struct v3d_qpu_instr *inst)
{
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Schedule texture read results collection late to hide latency. */
        if (v3d_qpu_waits_on_tmu(inst))
                return next_score;
        next_score++;

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

        /* Schedule texture read setup early to hide its latency better. */
        if (v3d_qpu_writes_tmu(devinfo, inst))
                return next_score;
        next_score++;

        /* We should increase the maximum if we assert here. */
        assert(next_score < MAX_SCHEDULE_PRIORITY);

        return baseline_score;
}

static bool
qpu_magic_waddr_is_periph(const struct v3d_device_info *devinfo,
                          enum v3d_qpu_waddr waddr)
{
        return (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) ||
                v3d_qpu_magic_waddr_is_sfu(waddr) ||
                v3d_qpu_magic_waddr_is_tlb(waddr) ||
                v3d_qpu_magic_waddr_is_vpm(waddr) ||
                v3d_qpu_magic_waddr_is_tsy(waddr));
}

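/* Returns true if the instruction accesses any peripheral unit (VPM, SFU,
 * TMU, TLB, TSY), which constrains what it may be paired with.
 */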
static bool
qpu_accesses_peripheral(const struct v3d_device_info *devinfo,
                        const struct v3d_qpu_instr *inst)
{
        if (v3d_qpu_uses_vpm(inst))
                return true;
        if (v3d_qpu_uses_sfu(inst))
                return true;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                if (inst->alu.add.op != V3D_QPU_A_NOP &&
                    inst->alu.add.magic_write &&
                    qpu_magic_waddr_is_periph(devinfo, inst->alu.add.waddr)) {
                        return true;
                }

                if (inst->alu.add.op == V3D_QPU_A_TMUWT)
                        return true;

                if (inst->alu.mul.op != V3D_QPU_M_NOP &&
                    inst->alu.mul.magic_write &&
                    qpu_magic_waddr_is_periph(devinfo, inst->alu.mul.waddr)) {
                        return true;
                }
        }

        return (inst->sig.ldvpm ||
                inst->sig.ldtmu ||
                inst->sig.ldtlb ||
                inst->sig.ldtlbu ||
                inst->sig.wrtmuc);
}

static bool
qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
                                 const struct v3d_qpu_instr *a,
                                 const struct v3d_qpu_instr *b)
{
        const bool a_uses_peripheral = qpu_accesses_peripheral(devinfo, a);
        const bool b_uses_peripheral = qpu_accesses_peripheral(devinfo, b);

        /* We can always do one peripheral access per instruction. */
        if (!a_uses_peripheral || !b_uses_peripheral)
                return true;

        if (devinfo->ver < 41)
                return false;

        /* V3D 4.1 and later allow TMU read along with a VPM read or write,
         * and WRTMUC with a TMU magic register write (other than tmuc).
         */
        if ((a->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(b)) ||
            (b->sig.ldtmu && v3d_qpu_reads_or_writes_vpm(a))) {
                return true;
        }

        if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
            (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(devinfo, a))) {
                return true;
        }

        return false;
}

/* Compute a bitmask of which rf registers are used between
 * the two instructions.
 */
static uint64_t
qpu_raddrs_used(const struct v3d_qpu_instr *a,
                const struct v3d_qpu_instr *b)
{
        assert(a->type == V3D_QPU_INSTR_TYPE_ALU);
        assert(b->type == V3D_QPU_INSTR_TYPE_ALU);

        uint64_t raddrs_used = 0;
        if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
                raddrs_used |= (1ll << a->raddr_a);
        if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
                raddrs_used |= (1ll << a->raddr_b);
        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
                raddrs_used |= (1ll << b->raddr_a);
        if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
                raddrs_used |= (1ll << b->raddr_b);

        return raddrs_used;
}

/* Take two instructions and attempt to merge their raddr fields
 * into one merged instruction.  Returns false if the two instructions
 * access more than two different rf registers between them, or more
 * than one rf register and one small immediate.
 */
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
                 const struct v3d_qpu_instr *add_instr,
                 const struct v3d_qpu_instr *mul_instr)
{
        uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
        int naddrs = util_bitcount64(raddrs_used);

        if (naddrs > 2)
                return false;

        if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
                if (naddrs > 1)
                        return false;

                if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
                        if (add_instr->raddr_b != mul_instr->raddr_b)
                                return false;

                result->sig.small_imm = true;
                result->raddr_b = add_instr->sig.small_imm ?
                        add_instr->raddr_b : mul_instr->raddr_b;
        }

        if (naddrs == 0)
                return true;

        int raddr_a = ffsll(raddrs_used) - 1;
        raddrs_used &= ~(1ll << raddr_a);
        result->raddr_a = raddr_a;

        if (!result->sig.small_imm) {
                if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
                    raddr_a == add_instr->raddr_b) {
                        if (add_instr->alu.add.a == V3D_QPU_MUX_B)
                                result->alu.add.a = V3D_QPU_MUX_A;
                        if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
                            v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
                                result->alu.add.b = V3D_QPU_MUX_A;
                        }
                }
                if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
                    raddr_a == mul_instr->raddr_b) {
                        if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
                                result->alu.mul.a = V3D_QPU_MUX_A;
                        if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
                            v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
                                result->alu.mul.b = V3D_QPU_MUX_A;
                        }
                }
        }
        if (!raddrs_used)
                return true;

        int raddr_b = ffsll(raddrs_used) - 1;
        result->raddr_b = raddr_b;
        if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
            raddr_b == add_instr->raddr_a) {
                if (add_instr->alu.add.a == V3D_QPU_MUX_A)
                        result->alu.add.a = V3D_QPU_MUX_B;
                if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
                    v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
                        result->alu.add.b = V3D_QPU_MUX_B;
                }
        }
        if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
            raddr_b == mul_instr->raddr_a) {
                if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
                        result->alu.mul.a = V3D_QPU_MUX_B;
                if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
                    v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
                        result->alu.mul.b = V3D_QPU_MUX_B;
                }
        }

        return true;
}

static bool
can_do_add_as_mul(enum v3d_qpu_add_op op)
{
        switch (op) {
        case V3D_QPU_A_ADD:
        case V3D_QPU_A_SUB:
                return true;
        default:
                return false;
        }
}

static enum v3d_qpu_mul_op
add_op_as_mul_op(enum v3d_qpu_add_op op)
{
        switch (op) {
        case V3D_QPU_A_ADD:
                return V3D_QPU_M_ADD;
        case V3D_QPU_A_SUB:
                return V3D_QPU_M_SUB;
        default:
                unreachable("unexpected add opcode");
        }
}

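/* Moves an instruction's add-ALU operation over to the mul ALU, carrying its
 * condition and flag state along and clearing the add side.
 */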
static void
qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
{
        STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
        assert(inst->alu.add.op != V3D_QPU_A_NOP);
        assert(inst->alu.mul.op == V3D_QPU_M_NOP);

        memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
        inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
        inst->alu.add.op = V3D_QPU_A_NOP;

        inst->flags.mc = inst->flags.ac;
        inst->flags.mpf = inst->flags.apf;
        inst->flags.muf = inst->flags.auf;
        inst->flags.ac = V3D_QPU_COND_NONE;
        inst->flags.apf = V3D_QPU_PF_NONE;
        inst->flags.auf = V3D_QPU_UF_NONE;
}

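/* Attempts to merge two ALU instructions into a single dual-issue QPU
 * instruction in *result, returning false if their ops, raddrs, signals, or
 * encoding conflict.
 */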
static bool
qpu_merge_inst(const struct v3d_device_info *devinfo,
               struct v3d_qpu_instr *result,
               const struct v3d_qpu_instr *a,
               const struct v3d_qpu_instr *b)
{
        if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
            b->type != V3D_QPU_INSTR_TYPE_ALU) {
                return false;
        }

        if (!qpu_compatible_peripheral_access(devinfo, a, b))
                return false;

        struct v3d_qpu_instr merge = *a;
        const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;

        struct v3d_qpu_instr mul_inst;
        if (b->alu.add.op != V3D_QPU_A_NOP) {
                if (a->alu.add.op == V3D_QPU_A_NOP) {
                        merge.alu.add = b->alu.add;

                        merge.flags.ac = b->flags.ac;
                        merge.flags.apf = b->flags.apf;
                        merge.flags.auf = b->flags.auf;

                        add_instr = b;
                        mul_instr = a;
                }
                /* If a's add op is used but its mul op is not, then see if we
                 * can convert either a's add op or b's add op to a mul op
                 * so we can merge.
                 */
                else if (a->alu.mul.op == V3D_QPU_M_NOP &&
                         can_do_add_as_mul(b->alu.add.op)) {
                        mul_inst = *b;
                        qpu_convert_add_to_mul(&mul_inst);

                        merge.alu.mul = mul_inst.alu.mul;

                        merge.flags.mc = b->flags.ac;
                        merge.flags.mpf = b->flags.apf;
                        merge.flags.muf = b->flags.auf;

                        add_instr = a;
                        mul_instr = &mul_inst;
                } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
                           can_do_add_as_mul(a->alu.add.op)) {
                        mul_inst = *a;
                        qpu_convert_add_to_mul(&mul_inst);

                        merge = mul_inst;
                        merge.alu.add = b->alu.add;

                        merge.flags.ac = b->flags.ac;
                        merge.flags.apf = b->flags.apf;
                        merge.flags.auf = b->flags.auf;

                        add_instr = b;
                        mul_instr = &mul_inst;
                } else {
                        return false;
                }
        }

        if (b->alu.mul.op != V3D_QPU_M_NOP) {
                if (a->alu.mul.op != V3D_QPU_M_NOP)
                        return false;
                merge.alu.mul = b->alu.mul;

                merge.flags.mc = b->flags.mc;
                merge.flags.mpf = b->flags.mpf;
                merge.flags.muf = b->flags.muf;

                mul_instr = b;
                add_instr = a;
        }

        if (add_instr && mul_instr &&
            !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
                return false;
        }

        merge.sig.thrsw |= b->sig.thrsw;
        merge.sig.ldunif |= b->sig.ldunif;
        merge.sig.ldunifrf |= b->sig.ldunifrf;
        merge.sig.ldunifa |= b->sig.ldunifa;
        merge.sig.ldunifarf |= b->sig.ldunifarf;
        merge.sig.ldtmu |= b->sig.ldtmu;
        merge.sig.ldvary |= b->sig.ldvary;
        merge.sig.ldvpm |= b->sig.ldvpm;
        merge.sig.small_imm |= b->sig.small_imm;
        merge.sig.ldtlb |= b->sig.ldtlb;
        merge.sig.ldtlbu |= b->sig.ldtlbu;
        merge.sig.ucb |= b->sig.ucb;
        merge.sig.rotate |= b->sig.rotate;
        merge.sig.wrtmuc |= b->sig.wrtmuc;

        if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
            v3d_qpu_sig_writes_address(devinfo, &b->sig))
                return false;
        merge.sig_addr |= b->sig_addr;
        merge.sig_magic |= b->sig_magic;

        uint64_t packed;
        bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);

        *result = merge;
        /* No modifying the real instructions on failure. */
        assert(ok || (a != result && b != result));

        return ok;
}

static inline bool
try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
{
        return inst->sig.ldunif || inst->sig.ldunifrf;
}

static bool
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                         struct choose_scoreboard *scoreboard,
                                         const struct qinst *qinst);

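/* Picks the next DAG head to schedule (or to pair with prev_inst, if given),
 * skipping candidates that would break instruction timing rules and then
 * preferring higher priority and longer critical-path delay.
 */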
static struct schedule_node *
choose_instruction_to_schedule(struct v3d_compile *c,
                               struct choose_scoreboard *scoreboard,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                if (prev_inst->inst->qpu.sig.thrsw)
                        return NULL;
        }

        bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
                                 scoreboard->ldvary_count < c->num_inputs;
        bool skipped_insts_for_ldvary_pipelining = false;
retry:
        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                            dag.link) {
                const struct v3d_qpu_instr *inst = &n->inst->qpu;

                if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
                        skipped_insts_for_ldvary_pipelining = true;
                        continue;
                }

                /* Don't choose the branch instruction until it's the last one
                 * left.  We'll move it up to fit its delay slots after we
                 * choose it.
                 */
                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
                    !list_is_singular(&scoreboard->dag->heads)) {
                        continue;
                }

                /* We need to have 3 delay slots between a write to unifa and
                 * a follow-up ldunifa.
                 */
                if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
                    scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
                        continue;

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, n->inst))
                        continue;

                if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
                        continue;

                /* "Before doing a TLB access a scoreboard wait must have been
                 *  done.  This happens either on the first or last thread
                 *  switch, depending on a setting (scb_wait_on_first_thrsw) in
                 *  the shader state."
                 */
                if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                        continue;

                /* ldunif and ldvary both write r5, but ldunif does so a tick
                 * sooner.  If the ldvary's r5 wasn't used, then ldunif might
                 * otherwise get scheduled so ldunif and ldvary try to update
                 * r5 in the same tick.
                 */
                if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                    scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
                        continue;
                }

                /* If we are in a thrsw delay slot check that this instruction
                 * is valid for that.
                 */
                if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
                    !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
                                                              n->inst)) {
                        continue;
                }

                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        /* Don't try to put a branch in the delay slots of another
                         * branch or a unifa write.
                         */
                        if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
                                continue;
                        if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
                                continue;

                        /* No branch with cond != 0,2,3 and msfign != 0 after
                         * setmsf.
                         */
                        if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
                            inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
                                continue;
                        }
                }

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (inst->sig.thrsw)
                                continue;

                        if (prev_inst->inst->uniform != -1 &&
                            n->inst->uniform != -1)
                                continue;

                        /* The simulator complains if we have two uniforms
                         * loaded in the same instruction, which could happen
                         * if we have a ldunif or sideband uniform and we pair
                         * that with ldunifa.
                         */
                        if (vir_has_uniform(prev_inst->inst) &&
                            (inst->sig.ldunifa || inst->sig.ldunifarf)) {
                                continue;
                        }

                        if ((prev_inst->inst->qpu.sig.ldunifa ||
                             prev_inst->inst->qpu.sig.ldunifarf) &&
                            vir_has_uniform(n->inst)) {
                                continue;
                        }

                        /* Don't merge TLB instructions before we have acquired
                         * the scoreboard lock.
                         */
                        if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                                continue;

                        /* When we successfully pair up an ldvary we then try
                         * to merge it into the previous instruction if
                         * possible to improve pipelining.  Don't pick up the
                         * ldvary now if the follow-up fixup would place
                         * it in the delay slots of a thrsw, which is not
                         * allowed and would prevent the fixup from being
                         * successful.
                         */
                        if (inst->sig.ldvary &&
                            scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
                                continue;
                        }

                        struct v3d_qpu_instr merged_inst;
                        if (!qpu_merge_inst(c->devinfo, &merged_inst,
                                            &prev_inst->inst->qpu, inst)) {
                                continue;
                        }
                }

                int prio = get_instruction_priority(c->devinfo, inst);

                if (mux_read_stalls(scoreboard, inst)) {
                        /* Don't merge an instruction that stalls. */
                        if (prev_inst)
                                continue;
                        else {
                                /* Any instruction that doesn't stall will
                                 * have higher scheduling priority.
                                 */
                                prio -= MAX_SCHEDULE_PRIORITY;
                                assert(prio < 0);
                        }
                }

                /* Found a valid instruction.  If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        /* If we did not find any instruction to schedule but we discarded
         * some of them to prioritize ldvary pipelining, try again.
         */
        if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
                skipped_insts_for_ldvary_pipelining = false;
                ldvary_pipelining = false;
                goto retry;
        }

        if (chosen && chosen->inst->qpu.sig.ldvary) {
                scoreboard->ldvary_count++;
                /* If we are pairing an ldvary, flag it so we can fix it up for
                 * optimal pipelining of ldvary sequences.
                 */
                if (prev_inst)
                        scoreboard->fixup_ldvary = true;
        }

        return chosen;
}

static void
update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
                                  enum v3d_qpu_waddr waddr,
                                  const struct v3d_device_info *devinfo)
{
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
        else if (devinfo->ver >= 40 && waddr == V3D_QPU_WADDR_UNIFA)
                scoreboard->last_unifa_write_tick = scoreboard->tick;
}

static void
update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
                                      const struct v3d_qpu_instr *inst)
{
        if (v3d_qpu_instr_is_sfu(inst)) {
                scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
                scoreboard->last_stallable_sfu_tick = scoreboard->tick;
        }
}

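/* Records the timing-relevant side effects of the instruction we just
 * scheduled, so later choices can respect them.
 */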
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             const struct v3d_qpu_instr *inst,
                             const struct v3d_device_info *devinfo)
{
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (inst->alu.add.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.add.waddr,
                                                          devinfo);
                } else {
                        update_scoreboard_for_sfu_stall_waddr(scoreboard,
                                                              inst);
                }

                if (inst->alu.add.op == V3D_QPU_A_SETMSF)
                        scoreboard->last_setmsf_tick = scoreboard->tick;
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (inst->alu.mul.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.mul.waddr,
                                                          devinfo);
                }
        }

        if (inst->sig.ldvary)
                scoreboard->last_ldvary_tick = scoreboard->tick;
}

static void
dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
{
        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
                fprintf(stderr, "         t=%4d: ", n->unblocked_time);
                v3d_qpu_dump(devinfo, &n->inst->qpu);
                fprintf(stderr, "\n");

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        if (!child)
                                continue;

                        fprintf(stderr, "                 - ");
                        v3d_qpu_dump(devinfo, &child->inst->qpu);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->dag.parent_count,
                                edge->data ? 'w' : 'r');
                }
        }
}

magic_waddr_latency(const struct v3d_device_info * devinfo,enum v3d_qpu_waddr waddr,const struct v3d_qpu_instr * after)1316 static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
1317                                     enum v3d_qpu_waddr waddr,
1318                                     const struct v3d_qpu_instr *after)
1319 {
1320         /* Apply some huge latency between texture fetch requests and getting
1321          * their results back.
1322          *
1323          * FIXME: This is actually pretty bogus.  If we do:
1324          *
1325          * mov tmu0_s, a
1326          * <a bit of math>
1327          * mov tmu0_s, b
1328          * load_tmu0
1329          * <more math>
1330          * load_tmu0
1331          *
1332          * we count that as worse than
1333          *
1334          * mov tmu0_s, a
1335          * mov tmu0_s, b
1336          * <lots of math>
1337          * load_tmu0
1338          * <more math>
1339          * load_tmu0
1340          *
1341          * because we associate the first load_tmu0 with the *second* tmu0_s.
1342          */
1343         if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
1344             v3d_qpu_waits_on_tmu(after)) {
1345                 return 100;
1346         }
1347 
1348         /* Assume that anything depending on us is consuming the SFU result. */
1349         if (v3d_qpu_magic_waddr_is_sfu(waddr))
1350                 return 3;
1351 
1352         return 1;
1353 }
1354 
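/**
 * Returns the minimum number of cycles between scheduling 'before' and
 * scheduling a dependent 'after'.  This feeds both the children's
 * unblocked_time updates and the critical-path delay heuristic.
 */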
1355 static uint32_t
1356 instruction_latency(const struct v3d_device_info *devinfo,
1357                     struct schedule_node *before, struct schedule_node *after)
1358 {
1359         const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
1360         const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
1361         uint32_t latency = 1;
1362 
1363         if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
1364             after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
1365                 return latency;
1366 
1367         if (before_inst->alu.add.magic_write) {
1368                 latency = MAX2(latency,
1369                                magic_waddr_latency(devinfo,
1370                                                    before_inst->alu.add.waddr,
1371                                                    after_inst));
1372         }
1373 
1374         if (before_inst->alu.mul.magic_write) {
1375                 latency = MAX2(latency,
1376                                magic_waddr_latency(devinfo,
1377                                                    before_inst->alu.mul.waddr,
1378                                                    after_inst));
1379         }
1380 
1381         if (v3d_qpu_instr_is_sfu(before_inst))
1382                 return 2;
1383 
1384         return latency;
1385 }
1386 
1387 /** Recursive computation of the delay member of a node. */
1388 static void
1389 compute_delay(struct dag_node *node, void *state)
1390 {
1391         struct schedule_node *n = (struct schedule_node *)node;
1392         struct v3d_compile *c = (struct v3d_compile *) state;
1393 
1394         n->delay = 1;
1395 
1396         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1397                 struct schedule_node *child =
1398                         (struct schedule_node *)edge->child;
1399 
1400                 n->delay = MAX2(n->delay, (child->delay +
1401                                            instruction_latency(c->devinfo, n,
1402                                                                child)));
1403         }
1404 }
1405 
1406 /* Removes a DAG head, pruning only its WAR edges. (dag_prune_head()
1407  * should be called on it later to finish pruning the other edges.)
1408  */
1409 static void
1410 pre_remove_head(struct dag *dag, struct schedule_node *n)
1411 {
1412         list_delinit(&n->dag.link);
1413 
1414         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1415                 if (edge->data)
1416                         dag_remove_edge(dag, edge);
1417         }
1418 }
1419 
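/**
 * Called once a node has been scheduled at the given time: pushes each
 * child's unblocked_time forward by the producer->consumer latency, then
 * prunes the node from the DAG, promoting now-unblocked children to heads.
 */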
1420 static void
1421 mark_instruction_scheduled(const struct v3d_device_info *devinfo,
1422                            struct dag *dag,
1423                            uint32_t time,
1424                            struct schedule_node *node)
1425 {
1426         if (!node)
1427                 return;
1428 
1429         util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
1430                 struct schedule_node *child =
1431                         (struct schedule_node *)edge->child;
1432 
1433                 if (!child)
1434                         continue;
1435 
1436                 uint32_t latency = instruction_latency(devinfo, node, child);
1437 
1438                 child->unblocked_time = MAX2(child->unblocked_time,
1439                                              time + latency);
1440         }
1441         dag_prune_head(dag, &node->dag);
1442 }
1443 
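/* Appends a scheduled instruction to the block, updating the scoreboard
 * state and advancing the tick so later hazard checks account for it.
 */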
1444 static void
1445 insert_scheduled_instruction(struct v3d_compile *c,
1446                              struct qblock *block,
1447                              struct choose_scoreboard *scoreboard,
1448                              struct qinst *inst)
1449 {
1450         list_addtail(&inst->link, &block->instructions);
1451 
1452         update_scoreboard_for_chosen(scoreboard, &inst->qpu, c->devinfo);
1453         c->qpu_inst_count++;
1454         scoreboard->tick++;
1455 }
1456 
1457 static struct qinst *
1458 vir_nop()
1459 {
1460         struct qreg undef = vir_nop_reg();
1461         struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
1462 
1463         return qinst;
1464 }
1465 
1466 static void
1467 emit_nop(struct v3d_compile *c, struct qblock *block,
1468          struct choose_scoreboard *scoreboard)
1469 {
1470         insert_scheduled_instruction(c, block, scoreboard, vir_nop());
1471 }
1472 
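/**
 * Returns whether the instruction may occupy the given slot (0..2) of the
 * three delay slots following a thread-end thrsw, encoding the hardware
 * restrictions checked below.
 */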
1473 static bool
1474 qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
1475                               const struct qinst *qinst, int slot)
1476 {
1477         const struct v3d_qpu_instr *inst = &qinst->qpu;
1478 
1479         /* Only TLB Z writes are prohibited in the last slot, but we don't
1480          * have those flagged so prohibit all TLB ops for now.
1481          */
1482         if (slot == 2 && qpu_inst_is_tlb(inst))
1483                 return false;
1484 
1485         if (slot > 0 && qinst->uniform != ~0)
1486                 return false;
1487 
1488         if (v3d_qpu_uses_vpm(inst))
1489                 return false;
1490 
1491         if (inst->sig.ldvary)
1492                 return false;
1493 
1494         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
1495                 /* GFXH-1625: TMUWT not allowed in the final instruction. */
1496                 if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
1497                         return false;
1498 
1499                 /* No writing physical registers at the end. */
1500                 if (!inst->alu.add.magic_write ||
1501                     !inst->alu.mul.magic_write) {
1502                         return false;
1503                 }
1504 
1505                 if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
1506                     !inst->sig_magic) {
1507                         return false;
1508                 }
1509 
1510                 if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
1511                         return false;
1512 
1513                 /* RF0-2 might be overwritten during the delay slots by
1514                  * fragment shader setup.
1515                  */
1516                 if (inst->raddr_a < 3 &&
1517                     (inst->alu.add.a == V3D_QPU_MUX_A ||
1518                      inst->alu.add.b == V3D_QPU_MUX_A ||
1519                      inst->alu.mul.a == V3D_QPU_MUX_A ||
1520                      inst->alu.mul.b == V3D_QPU_MUX_A)) {
1521                         return false;
1522                 }
1523 
1524                 if (inst->raddr_b < 3 &&
1525                     !inst->sig.small_imm &&
1526                     (inst->alu.add.a == V3D_QPU_MUX_B ||
1527                      inst->alu.add.b == V3D_QPU_MUX_B ||
1528                      inst->alu.mul.a == V3D_QPU_MUX_B ||
1529                      inst->alu.mul.b == V3D_QPU_MUX_B)) {
1530                         return false;
1531                 }
1532         }
1533 
1534         return true;
1535 }
1536 
1537 /**
1538  * This is called when trying to merge a thrsw back into the stream of
1539  * instructions that were scheduled *before* the thrsw signal, so that they
1540  * fill its delay slots. Because the actual execution of the thrsw happens after the
1541  * delay slots, it is usually safe to do this, but there are some cases that
1542  * need special care.
1543  */
1544 static bool
1545 qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
1546                                           const struct qinst *qinst,
1547                                           uint32_t slot)
1548 {
1549         /* No scheduling SFU when the result would land in the other
1550          * thread.  The simulator complains for safety, though it
1551          * would only occur for dead code in our case.
1552          */
1553         if (slot > 0 &&
1554             qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
1555             (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
1556              v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
1557                 return false;
1558         }
1559 
1560         if (slot > 0 && qinst->qpu.sig.ldvary)
1561                 return false;
1562 
1563         /* unifa and the following 3 instructions can't overlap a
1564          * thread switch/end. The docs further clarify that this means
1565          * the cycle at which the actual thread switch/end happens
1566          * and not when the thrsw instruction is processed, which would
1567          * be after the 2 delay slots following the thrsw instruction.
1568          * This means that we can move a thrsw up to the instruction
1569          * right after unifa:
1570          *
1571          * unifa, r5
1572          * thrsw
1573          * delay slot 1
1574          * delay slot 2
1575          * Thread switch happens here, 4 instructions away from unifa
1576          */
1577         if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
1578                 return false;
1579 
1580         return true;
1581 }
1582 
1583 /**
1584  * This is called for instructions scheduled *after* a thrsw signal that may
1585  * land in the delay slots of the thrsw. Because these instructions were
1586  * scheduled after the thrsw, we need to be careful when placing them into
1587  * the delay slots, since that means that we are moving them ahead of the
1588  * thread switch and we need to ensure that is not a problem.
1589  */
1590 static bool
1591 qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
1592                                          struct choose_scoreboard *scoreboard,
1593                                          const struct qinst *qinst)
1594 {
1595         const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
1596         assert(slot <= 2);
1597 
1598         /* We merge thrsw instructions back into the instruction stream
1599          * manually, so any instructions scheduled after a thrsw should be
1600          * in the actual delay slots and not in the same slot as the thrsw.
1601          */
1602         assert(slot >= 1);
1603 
1604         /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
1605         if (qinst->qpu.sig.thrsw)
1606                 return false;
1607 
1608         /* The restrictions for instructions scheduled before the thrsw
1609          * also apply to instructions scheduled after the thrsw that we want
1610          * to place in its delay slots.
1611          */
1612         if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
1613                 return false;
1614 
1615         /* TLB access is disallowed until scoreboard wait is executed, which
1616          * we do on the last thread switch.
1617          */
1618         if (qpu_inst_is_tlb(&qinst->qpu))
1619                 return false;
1620 
1621         /* Instruction sequence restrictions: Branch is not allowed in delay
1622          * slots of a thrsw.
1623          */
1624         if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
1625                 return false;
1626 
1627         /* Miscellaneous restrictions: At the point of a thrsw we need to have
1628          * at least one outstanding lookup or TSY wait.
1629          *
1630          * So avoid placing TMU instructions scheduled after the thrsw into
1631          * its delay slots or we may be compromising the integrity of our TMU
1632          * sequences. Also, notice that if we moved these instructions into
1633          * the delay slots of a previous thrsw we could overflow our TMU output
1634          * fifo, since we could be effectively pipelining a lookup scheduled
1635          * after the thrsw into the sequence before the thrsw.
1636          */
1637         if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
1638             qinst->qpu.sig.wrtmuc) {
1639                 return false;
1640         }
1641 
1642         /* Don't move instructions that wait on the TMU before the thread switch
1643          * happens since that would make the current thread stall before the
1644          * switch, which is exactly what we want to avoid with the thrsw
1645          * instruction.
1646          */
1647         if (v3d_qpu_waits_on_tmu(&qinst->qpu))
1648                 return false;
1649 
1650         /* A thread switch invalidates all accumulators, so don't place any
1651          * instructions that write accumulators into the delay slots.
1652          */
1653         if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
1654                 return false;
1655 
1656         /* Multop has an implicit write to the rtop register, which is a
1657          * specialized accumulator that is only used with this instruction.
1658          */
1659         if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
1660                 return false;
1661 
1662         /* Flags are invalidated across a thread switch, so don't place
1663          * instructions that write flags into delay slots.
1664          */
1665         if (v3d_qpu_writes_flags(&qinst->qpu))
1666                 return false;
1667 
1668         return true;
1669 }
1670 
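/* Checks whether a thrsw signal can be merged 'instructions_in_sequence'
 * instructions back: the previous thrsw must already have happened, and
 * every instruction in the sequence starting at 'qinst' must be valid
 * delay-slot (and, for thread-end, thrend-slot) content.
 */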
1671 static bool
1672 valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
1673                      struct qinst *qinst, int instructions_in_sequence,
1674                      bool is_thrend)
1675 {
1676         /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
1677         if (scoreboard->last_thrsw_tick + 3 >
1678             scoreboard->tick - instructions_in_sequence) {
1679                 return false;
1680         }
1681 
1682         for (int slot = 0; slot < instructions_in_sequence; slot++) {
1683                 if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
1684                         return false;
1685 
1686                 if (is_thrend &&
1687                     !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
1688                         return false;
1689                 }
1690 
1691                 /* Note that the list is circular, so we can only do this up
1692                  * to instructions_in_sequence.
1693                  */
1694                 qinst = (struct qinst *)qinst->link.next;
1695         }
1696 
1697         return true;
1698 }
1699 
1700 /**
1701  * Emits a THRSW signal in the stream, trying to move it up to pair with
1702  * another instruction.
1703  */
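/* For illustration only (a sketch, not from any particular shader): if the
 * last three scheduled instructions A, B and C form a valid thrsw sequence,
 * then instead of emitting
 *
 *   A ; B ; C ; thrsw(nop)
 *
 * we merge the signal back to get
 *
 *   A(thrsw) ; B ; C
 *
 * so B and C fill the two delay slots and the standalone nop is never
 * emitted.
 */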
1704 static int
1705 emit_thrsw(struct v3d_compile *c,
1706            struct qblock *block,
1707            struct choose_scoreboard *scoreboard,
1708            struct qinst *inst,
1709            bool is_thrend)
1710 {
1711         int time = 0;
1712 
1713         /* There should be nothing in a thrsw inst being scheduled other than
1714          * the signal bits.
1715          */
1716         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
1717         assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
1718         assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
1719 
1720         /* Don't try to emit a thrsw in the delay slots of a previous thrsw
1721          * or branch.
1722          */
1723         while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
1724                 emit_nop(c, block, scoreboard);
1725                 time++;
1726         }
1727         while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
1728                 emit_nop(c, block, scoreboard);
1729                 time++;
1730         }
1731 
1732         /* Find how far back into previous instructions we can put the THRSW. */
1733         int slots_filled = 0;
1734         struct qinst *merge_inst = NULL;
1735         vir_for_each_inst_rev(prev_inst, block) {
1736                 struct v3d_qpu_sig sig = prev_inst->qpu.sig;
1737                 sig.thrsw = true;
1738                 uint32_t packed_sig;
1739 
1740                 if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
1741                         break;
1742 
1743                 if (!valid_thrsw_sequence(c, scoreboard,
1744                                           prev_inst, slots_filled + 1,
1745                                           is_thrend)) {
1746                         break;
1747                 }
1748 
1749                 merge_inst = prev_inst;
1750                 if (++slots_filled == 3)
1751                         break;
1752         }
1753 
1754         bool needs_free = false;
1755         if (merge_inst) {
1756                 merge_inst->qpu.sig.thrsw = true;
1757                 needs_free = true;
1758                 scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
1759         } else {
1760                 scoreboard->last_thrsw_tick = scoreboard->tick;
1761                 insert_scheduled_instruction(c, block, scoreboard, inst);
1762                 time++;
1763                 slots_filled++;
1764                 merge_inst = inst;
1765         }
1766 
1767         scoreboard->first_thrsw_emitted = true;
1768 
1769         /* If we're emitting the last THRSW (other than program end), then
1770          * signal that to the HW by emitting two THRSWs in a row.
1771          */
1772         if (inst->is_last_thrsw) {
1773                 if (slots_filled <= 1) {
1774                         emit_nop(c, block, scoreboard);
1775                         time++;
1776                 }
1777                 struct qinst *second_inst =
1778                         (struct qinst *)merge_inst->link.next;
1779                 second_inst->qpu.sig.thrsw = true;
1780                 scoreboard->last_thrsw_emitted = true;
1781         }
1782 
1783         /* Make sure the thread end executes within the program lifespan */
1784         if (is_thrend) {
1785                 for (int i = 0; i < 3 - slots_filled; i++) {
1786                         emit_nop(c, block, scoreboard);
1787                         time++;
1788                 }
1789         }
1790 
1791         /* If we put our THRSW into another instruction, free up the
1792          * instruction that didn't end up scheduled into the list.
1793          */
1794         if (needs_free)
1795                 free(inst);
1796 
1797         return time;
1798 }
1799 
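/* Returns whether an instruction may live in a branch's delay slots:
 * branches, thrsws, unifa writes and instructions that consume a uniform
 * are all disallowed there.
 */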
1800 static bool
1801 qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
1802 {
1803         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
1804                 return false;
1805 
1806         if (inst->qpu.sig.thrsw)
1807                 return false;
1808 
1809         if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
1810                 return false;
1811 
1812         if (vir_has_uniform(inst))
1813                 return false;
1814 
1815         return true;
1816 }
1817 
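/**
 * Emits a branch instruction and then tries to move it back up to three
 * instructions into the already-scheduled stream, so those instructions
 * land in its delay slots.  Any slots we can't fill this way get NOPs.
 */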
1818 static void
1819 emit_branch(struct v3d_compile *c,
1820            struct qblock *block,
1821            struct choose_scoreboard *scoreboard,
1822            struct qinst *inst)
1823 {
1824         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
1825 
1826         /* We shouldn't have picked up a branch for the delay slots of a previous
1827          * thrsw, branch, or unifa write instruction.
1828          */
1829         int branch_tick = scoreboard->tick;
1830         assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
1831         assert(scoreboard->last_branch_tick + 3 < branch_tick);
1832         assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
1833 
1834         /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
1835          * setmsf.
1836          */
1837         bool is_safe_msf_branch =
1838                 inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
1839                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
1840                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
1841                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0;
1842         assert(scoreboard->last_setmsf_tick != branch_tick - 1 ||
1843                is_safe_msf_branch);
1844 
1845         /* Insert the branch instruction */
1846         insert_scheduled_instruction(c, block, scoreboard, inst);
1847 
1848         /* Now see if we can move the branch instruction back into the
1849          * instruction stream to fill its delay slots
1850          */
1851         int slots_filled = 0;
1852         while (slots_filled < 3 && block->instructions.next != &inst->link) {
1853                 struct qinst *prev_inst = (struct qinst *) inst->link.prev;
1854                 assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);
1855 
1856                 /* Can't move the branch instruction if that would place it
1857                  * in the delay slots of other instructions.
1858                  */
1859                 if (scoreboard->last_branch_tick + 3 >=
1860                     branch_tick - slots_filled - 1) {
1861                         break;
1862                 }
1863 
1864                 if (scoreboard->last_thrsw_tick + 2 >=
1865                     branch_tick - slots_filled - 1) {
1866                         break;
1867                 }
1868 
1869                 if (scoreboard->last_unifa_write_tick + 3 >=
1870                     branch_tick - slots_filled - 1) {
1871                         break;
1872                 }
1873 
1874                 /* Can't move a conditional branch before the instruction
1875                  * that writes the flags for its condition.
1876                  */
1877                 if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
1878                     inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
1879                         break;
1880                 }
1881 
1882                 if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
1883                         break;
1884 
1885                 if (!is_safe_msf_branch) {
1886                         struct qinst *prev_prev_inst =
1887                                 (struct qinst *) prev_inst->link.prev;
1888                         if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
1889                             prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
1890                                 break;
1891                         }
1892                 }
1893 
1894                 list_del(&prev_inst->link);
1895                 list_add(&prev_inst->link, &inst->link);
1896                 slots_filled++;
1897         }
1898 
1899         block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
1900         scoreboard->last_branch_tick = branch_tick - slots_filled;
1901 
1902         /* Fill any remaining delay slots.
1903          *
1904          * For unconditional branches we'll try to fill these with the
1905          * first instructions in the successor block after scheduling
1906          * all blocks when setting up branch targets.
1907          */
1908         for (int i = 0; i < 3 - slots_filled; i++)
1909                 emit_nop(c, block, scoreboard);
1910 }
1911 
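/* Returns whether the add (or mul) half of the instruction reads the given
 * register.  For accumulators (magic == true) the mux value itself names
 * the register; for the physical register file the A/B muxes are resolved
 * through raddr_a/raddr_b.
 */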
1912 static bool
1913 alu_reads_register(struct v3d_qpu_instr *inst,
1914                    bool add, bool magic, uint32_t index)
1915 {
1916         uint32_t num_src;
1917         enum v3d_qpu_mux mux_a, mux_b;
1918 
1919         if (add) {
1920                 num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
1921                 mux_a = inst->alu.add.a;
1922                 mux_b = inst->alu.add.b;
1923         } else {
1924                 num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
1925                 mux_a = inst->alu.mul.a;
1926                 mux_b = inst->alu.mul.b;
1927         }
1928 
1929         for (int i = 0; i < num_src; i++) {
1930                 if (magic) {
1931                         if (i == 0 && mux_a == index)
1932                                 return true;
1933                         if (i == 1 && mux_b == index)
1934                                 return true;
1935                 } else {
1936                         if (i == 0 && mux_a == V3D_QPU_MUX_A &&
1937                             inst->raddr_a == index) {
1938                                 return true;
1939                         }
1940                         if (i == 0 && mux_a == V3D_QPU_MUX_B &&
1941                             inst->raddr_b == index) {
1942                                 return true;
1943                         }
1944                         if (i == 1 && mux_b == V3D_QPU_MUX_A &&
1945                             inst->raddr_a == index) {
1946                                 return true;
1947                         }
1948                         if (i == 1 && mux_b == V3D_QPU_MUX_B &&
1949                             inst->raddr_b == index) {
1950                                 return true;
1951                         }
1952                 }
1953         }
1954 
1955         return false;
1956 }
1957 
1958 /**
1959  * This takes an ldvary signal merged into 'inst' and tries to move it up to
1960  * the previous instruction to get good pipelining of ldvary sequences,
1961  * transforming this:
1962  *
1963  * nop                  ; nop               ; ldvary.r4
1964  * nop                  ; fmul  r0, r4, rf0 ;
1965  * fadd  rf13, r0, r5   ; nop;              ; ldvary.r1  <-- inst
1966  *
1967  * into:
1968  *
1969  * nop                  ; nop               ; ldvary.r4
1970  * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
1971  * fadd  rf13, r0, r5   ; nop;              ;            <-- inst
1972  *
1973  * If we manage to do this successfully (we return true here), then flagging
1974  * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
1975  * we will be able to pick up to merge into 'inst', leading to code like this:
1976  *
1977  * nop                  ; nop               ; ldvary.r4
1978  * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
1979  * fadd  rf13, r0, r5   ; fmul  r2, r1, rf0 ;            <-- inst
1980  */
1981 static bool
1982 fixup_pipelined_ldvary(struct v3d_compile *c,
1983                        struct choose_scoreboard *scoreboard,
1984                        struct qblock *block,
1985                        struct v3d_qpu_instr *inst)
1986 {
1987         /* We only call this if we have successfully merged an ldvary into a
1988          * previous instruction.
1989          */
1990         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
1991         assert(inst->sig.ldvary);
1992         uint32_t ldvary_magic = inst->sig_magic;
1993         uint32_t ldvary_index = inst->sig_addr;
1994 
1995         /* The instruction in which we merged the ldvary cannot read
1996          * the ldvary destination; if it did, moving the ldvary before
1997          * it would overwrite that source.
1998          */
1999         if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
2000                 return false;
2001         if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
2002                 return false;
2003 
2004         /* The implicit ldvary destination may not be written to by a signal
2005          * in the instruction following ldvary. Since we are planning to move
2006          * ldvary to the previous instruction, this means we need to check if
2007          * the current instruction has any other signal that could create this
2008          * conflict. The only other signal that can write to the implicit
2009          * ldvary destination that is compatible with ldvary in the same
2010          * instruction is ldunif.
2011          */
2012         if (inst->sig.ldunif)
2013                 return false;
2014 
2015         /* The previous instruction can't write to the same destination as the
2016          * ldvary.
2017          */
2018         struct qinst *prev = (struct qinst *) block->instructions.prev;
2019         if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
2020                 return false;
2021 
2022         if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
2023                 if (prev->qpu.alu.add.magic_write == ldvary_magic &&
2024                     prev->qpu.alu.add.waddr == ldvary_index) {
2025                         return false;
2026                 }
2027         }
2028 
2029         if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
2030                 if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
2031                     prev->qpu.alu.mul.waddr == ldvary_index) {
2032                         return false;
2033                 }
2034         }
2035 
2036         /* The previous instruction cannot have a conflicting signal */
2037         if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
2038                 return false;
2039 
2040         /* The previous instruction cannot use flags since ldvary uses the
2041          * 'cond' instruction field to store the destination.
2042          */
2043         if (v3d_qpu_writes_flags(&prev->qpu))
2044                 return false;
2045         if (v3d_qpu_reads_flags(&prev->qpu))
2046                 return false;
2047 
2048         /* We can't put an ldvary in the delay slots of a thrsw. We should've
2049          * prevented this when pairing up the ldvary with another instruction
2050          * and flagging it for a fixup.
2051          */
2052         assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
2053 
2054         /* Move the ldvary to the previous instruction and remove it from the
2055          * current one.
2056          */
2057         prev->qpu.sig.ldvary = true;
2058         prev->qpu.sig_magic = ldvary_magic;
2059         prev->qpu.sig_addr = ldvary_index;
2060         scoreboard->last_ldvary_tick = scoreboard->tick - 1;
2061 
2062         inst->sig.ldvary = false;
2063         inst->sig_magic = false;
2064         inst->sig_addr = 0;
2065 
2066         /* By moving ldvary to the previous instruction we make it update
2067          * r5 in the current one, so nothing else in it should write r5.
2068          * This should've been prevented by our dependency tracking, which
2069          * would not allow ldvary to be paired up with an instruction that
2070          * writes r5 (since our dependency tracking doesn't know that the
2071          * ldvary write to r5 happens in the next instruction).
2072          */
2073         assert(!v3d_qpu_writes_r5(c->devinfo, inst));
2074 
2075         return true;
2076 }
2077 
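/**
 * The core list-scheduling loop for a block: repeatedly picks the best DAG
 * head (or a NOP if nothing is ready), tries to merge a second instruction
 * into the free ALU half and signal bits, applies the ldvary fixup when
 * requested, rewrites the uniform index into the reordered uniform stream,
 * and unblocks DAG children before emitting the result (with special
 * handling for thrsw and branch instructions).
 */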
2078 static uint32_t
2079 schedule_instructions(struct v3d_compile *c,
2080                       struct choose_scoreboard *scoreboard,
2081                       struct qblock *block,
2082                       enum quniform_contents *orig_uniform_contents,
2083                       uint32_t *orig_uniform_data,
2084                       uint32_t *next_uniform)
2085 {
2086         const struct v3d_device_info *devinfo = c->devinfo;
2087         uint32_t time = 0;
2088 
2089         while (!list_is_empty(&scoreboard->dag->heads)) {
2090                 struct schedule_node *chosen =
2091                         choose_instruction_to_schedule(c, scoreboard, NULL);
2092                 struct schedule_node *merge = NULL;
2093 
2094                 /* If there are no valid instructions to schedule, drop a NOP
2095                  * in.
2096                  */
2097                 struct qinst *qinst = chosen ? chosen->inst : vir_nop();
2098                 struct v3d_qpu_instr *inst = &qinst->qpu;
2099 
2100                 if (debug) {
2101                         fprintf(stderr, "t=%4d: current list:\n",
2102                                 time);
2103                         dump_state(devinfo, scoreboard->dag);
2104                         fprintf(stderr, "t=%4d: chose:   ", time);
2105                         v3d_qpu_dump(devinfo, inst);
2106                         fprintf(stderr, "\n");
2107                 }
2108 
2109                 /* We can't mark_instruction_scheduled() the chosen inst until
2110                  * we're done identifying instructions to merge, so put the
2111                  * merged instructions on a list for a moment.
2112                  */
2113                 struct list_head merged_list;
2114                 list_inithead(&merged_list);
2115 
2116                 /* Schedule this instruction onto the QPU list. Also try to
2117                  * find an instruction to pair with it.
2118                  */
2119                 if (chosen) {
2120                         time = MAX2(chosen->unblocked_time, time);
2121                         pre_remove_head(scoreboard->dag, chosen);
2122 
2123                         while ((merge =
2124                                 choose_instruction_to_schedule(c, scoreboard,
2125                                                                chosen))) {
2126                                 time = MAX2(merge->unblocked_time, time);
2127                                 pre_remove_head(scoreboard->dag, merge);
2128                                 list_addtail(&merge->link, &merged_list);
2129                                 (void)qpu_merge_inst(devinfo, inst,
2130                                                      inst, &merge->inst->qpu);
2131                                 if (merge->inst->uniform != -1) {
2132                                         chosen->inst->uniform =
2133                                                 merge->inst->uniform;
2134                                 }
2135 
2136                                 if (debug) {
2137                                         fprintf(stderr, "t=%4d: merging: ",
2138                                                 time);
2139                                         v3d_qpu_dump(devinfo, &merge->inst->qpu);
2140                                         fprintf(stderr, "\n");
2141                                         fprintf(stderr, "         result: ");
2142                                         v3d_qpu_dump(devinfo, inst);
2143                                         fprintf(stderr, "\n");
2144                                 }
2145 
2146                                 if (scoreboard->fixup_ldvary) {
2147                                         scoreboard->fixup_ldvary = false;
2148                                         if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
2149                                                 /* Flag the ldvary as scheduled
2150                                                  * now so we can try to merge the
2151                                                  * follow-up instruction in the
2152                                                  * ldvary sequence into the
2153                                                  * current instruction.
2154                                                  */
2155                                                 mark_instruction_scheduled(
2156                                                         devinfo, scoreboard->dag,
2157                                                         time, merge);
2158                                         }
2159                                 }
2160                         }
2161                         if (mux_read_stalls(scoreboard, inst))
2162                                 c->qpu_inst_stalled_count++;
2163                 }
2164 
2165                 /* Update the uniform index for the rewritten location --
2166                  * branch target updating will still need to change
2167                  * c->uniform_data[] using this index.
2168                  */
2169                 if (qinst->uniform != -1) {
2170                         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
2171                                 block->branch_uniform = *next_uniform;
2172 
2173                         c->uniform_data[*next_uniform] =
2174                                 orig_uniform_data[qinst->uniform];
2175                         c->uniform_contents[*next_uniform] =
2176                                 orig_uniform_contents[qinst->uniform];
2177                         qinst->uniform = *next_uniform;
2178                         (*next_uniform)++;
2179                 }
2180 
2181                 if (debug) {
2182                         fprintf(stderr, "\n");
2183                 }
2184 
2185                 /* Now that we've scheduled a new instruction, some of its
2186                  * children can be promoted to the list of instructions ready to
2187                  * be scheduled.  Update the children's unblocked time for this
2188                  * DAG edge as we do so.
2189                  */
2190                 mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
2191                 list_for_each_entry(struct schedule_node, merge, &merged_list,
2192                                     link) {
2193                         mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);
2194 
2195                         /* The merged VIR instruction doesn't get re-added to the
2196                          * block, so free it now.
2197                          */
2198                         free(merge->inst);
2199                 }
2200 
2201                 if (inst->sig.thrsw) {
2202                         time += emit_thrsw(c, block, scoreboard, qinst, false);
2203                 } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
2204                         emit_branch(c, block, scoreboard, qinst);
2205                 } else {
2206                         insert_scheduled_instruction(c, block,
2207                                                      scoreboard, qinst);
2208                 }
2209         }
2210 
2211         return time;
2212 }
2213 
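/* Schedules a single block: wraps each qinst in a schedule_node, builds
 * the dependency DAG in both directions, computes per-node critical-path
 * delays bottom-up, and then runs the list scheduler over the DAG.
 */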
2214 static uint32_t
2215 qpu_schedule_instructions_block(struct v3d_compile *c,
2216                                 struct choose_scoreboard *scoreboard,
2217                                 struct qblock *block,
2218                                 enum quniform_contents *orig_uniform_contents,
2219                                 uint32_t *orig_uniform_data,
2220                                 uint32_t *next_uniform)
2221 {
2222         void *mem_ctx = ralloc_context(NULL);
2223         scoreboard->dag = dag_create(mem_ctx);
2224         struct list_head setup_list;
2225 
2226         list_inithead(&setup_list);
2227 
2228         /* Wrap each instruction in a scheduler structure. */
2229         while (!list_is_empty(&block->instructions)) {
2230                 struct qinst *qinst = (struct qinst *)block->instructions.next;
2231                 struct schedule_node *n =
2232                         rzalloc(mem_ctx, struct schedule_node);
2233 
2234                 dag_init_node(scoreboard->dag, &n->dag);
2235                 n->inst = qinst;
2236 
2237                 list_del(&qinst->link);
2238                 list_addtail(&n->link, &setup_list);
2239         }
2240 
2241         calculate_forward_deps(c, scoreboard->dag, &setup_list);
2242         calculate_reverse_deps(c, scoreboard->dag, &setup_list);
2243 
2244         dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);
2245 
2246         uint32_t cycles = schedule_instructions(c, scoreboard, block,
2247                                                 orig_uniform_contents,
2248                                                 orig_uniform_data,
2249                                                 next_uniform);
2250 
2251         ralloc_free(mem_ctx);
2252         scoreboard->dag = NULL;
2253 
2254         return cycles;
2255 }
2256 
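/* Called after all blocks have been scheduled: locates each block's branch
 * within its final delay slots, patches the branch's instruction and
 * uniform-stream offsets relative to the successor block, and for
 * unconditional branches tries to copy the successor's first instructions
 * into any delay slots still holding NOPs.
 */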
2257 static void
2258 qpu_set_branch_targets(struct v3d_compile *c)
2259 {
2260         vir_for_each_block(block, c) {
2261                 /* The end block of the program has no branch. */
2262                 if (!block->successors[0])
2263                         continue;
2264 
2265                 /* If there was no branch instruction, then the successor
2266                  * block must follow immediately after this one.
2267                  */
2268                 if (block->branch_qpu_ip == ~0) {
2269                         assert(block->end_qpu_ip + 1 ==
2270                                block->successors[0]->start_qpu_ip);
2271                         continue;
2272                 }
2273 
2274                 /* Walk back through the delay slots to find the branch
2275                  * instr.
2276                  */
2277                 struct qinst *branch = NULL;
2278                 struct list_head *entry = block->instructions.prev;
2279                 int32_t delay_slot_count = -1;
2280                 struct qinst *delay_slots_start = NULL;
2281                 for (int i = 0; i < 3; i++) {
2282                         entry = entry->prev;
2283                         struct qinst *inst =
2284                                 container_of(entry, struct qinst, link);
2285 
2286                         if (delay_slot_count == -1) {
2287                                 if (!v3d_qpu_is_nop(&inst->qpu))
2288                                         delay_slot_count = i;
2289                                 else
2290                                         delay_slots_start = inst;
2291                         }
2292 
2293                         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
2294                                 branch = inst;
2295                                 break;
2296                         }
2297                 }
2298                 assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
2299                 assert(delay_slot_count >= 0 && delay_slot_count <= 3);
2300                 assert(delay_slot_count == 0 || delay_slots_start != NULL);
2301 
2302                 /* Make sure that the if-we-don't-jump
2303                  * successor was scheduled just after the
2304                  * delay slots.
2305                  */
2306                 assert(!block->successors[1] ||
2307                        block->successors[1]->start_qpu_ip ==
2308                        block->branch_qpu_ip + 4);
2309 
2310                 branch->qpu.branch.offset =
2311                         ((block->successors[0]->start_qpu_ip -
2312                           (block->branch_qpu_ip + 4)) *
2313                          sizeof(uint64_t));
2314 
2315                 /* Set up the relative offset to jump in the
2316                  * uniform stream.
2317                  *
2318                  * Use a temporary here, because
2319                  * uniform_data[inst->uniform] may be shared
2320                  * between multiple instructions.
2321                  */
2322                 assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
2323                 c->uniform_data[branch->uniform] =
2324                         (block->successors[0]->start_uniform -
2325                          (block->branch_uniform + 1)) * 4;
2326 
2327                 /* If this is an unconditional branch, try to fill any remaining
2328                  * delay slots with the initial instructions of the successor
2329                  * block.
2330                  *
2331                  * FIXME: we can do the same for conditional branches if we
2332                  * predicate the instructions to match the branch condition.
2333                  */
2334                 if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
2335                         struct list_head *successor_insts =
2336                                 &block->successors[0]->instructions;
2337                         delay_slot_count = MIN2(delay_slot_count,
2338                                                 list_length(successor_insts));
2339                         struct qinst *s_inst =
2340                                 (struct qinst *) successor_insts->next;
2341                         struct qinst *slot = delay_slots_start;
2342                         int slots_filled = 0;
2343                         while (slots_filled < delay_slot_count &&
2344                                qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
2345                                 memcpy(&slot->qpu, &s_inst->qpu,
2346                                        sizeof(slot->qpu));
2347                                 s_inst = (struct qinst *) s_inst->link.next;
2348                                 slot = (struct qinst *) slot->link.next;
2349                                 slots_filled++;
2350                         }
2351                         branch->qpu.branch.offset +=
2352                                 slots_filled * sizeof(uint64_t);
2353                 }
2354         }
2355 }
2356 
2357 uint32_t
2358 v3d_qpu_schedule_instructions(struct v3d_compile *c)
2359 {
2360         const struct v3d_device_info *devinfo = c->devinfo;
2361         struct qblock *end_block = list_last_entry(&c->blocks,
2362                                                    struct qblock, link);
2363 
2364         /* We reorder the uniforms as we schedule instructions, so save the
2365          * old data off and replace it.
2366          */
2367         uint32_t *uniform_data = c->uniform_data;
2368         enum quniform_contents *uniform_contents = c->uniform_contents;
2369         c->uniform_contents = ralloc_array(c, enum quniform_contents,
2370                                            c->num_uniforms);
2371         c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
2372         c->uniform_array_size = c->num_uniforms;
2373         uint32_t next_uniform = 0;
2374 
2375         struct choose_scoreboard scoreboard;
2376         memset(&scoreboard, 0, sizeof(scoreboard));
2377         scoreboard.last_ldvary_tick = -10;
2378         scoreboard.last_unifa_write_tick = -10;
2379         scoreboard.last_magic_sfu_write_tick = -10;
2380         scoreboard.last_uniforms_reset_tick = -10;
2381         scoreboard.last_thrsw_tick = -10;
2382         scoreboard.last_branch_tick = -10;
2383         scoreboard.last_setmsf_tick = -10;
2384         scoreboard.last_stallable_sfu_tick = -10;
2385 
2386         if (debug) {
2387                 fprintf(stderr, "Pre-schedule instructions\n");
2388                 vir_for_each_block(block, c) {
2389                         fprintf(stderr, "BLOCK %d\n", block->index);
2390                         list_for_each_entry(struct qinst, qinst,
2391                                             &block->instructions, link) {
2392                                 v3d_qpu_dump(devinfo, &qinst->qpu);
2393                                 fprintf(stderr, "\n");
2394                         }
2395                 }
2396                 fprintf(stderr, "\n");
2397         }
2398 
2399         uint32_t cycles = 0;
2400         vir_for_each_block(block, c) {
2401                 block->start_qpu_ip = c->qpu_inst_count;
2402                 block->branch_qpu_ip = ~0;
2403                 block->start_uniform = next_uniform;
2404 
2405                 cycles += qpu_schedule_instructions_block(c,
2406                                                           &scoreboard,
2407                                                           block,
2408                                                           uniform_contents,
2409                                                           uniform_data,
2410                                                           &next_uniform);
2411 
2412                 block->end_qpu_ip = c->qpu_inst_count - 1;
2413         }
2414 
2415         /* Emit the program-end THRSW instruction. */
2416         struct qinst *thrsw = vir_nop();
2417         thrsw->qpu.sig.thrsw = true;
2418         emit_thrsw(c, end_block, &scoreboard, thrsw, true);
2419 
2420         qpu_set_branch_targets(c);
2421 
2422         assert(next_uniform == c->num_uniforms);
2423 
2424         return cycles;
2425 }
2426