1 /*
2  * Copyright © 2010 Intel Corporation
3  * Copyright © 2014-2017 Broadcom
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 /**
26  * @file
27  *
28  * The basic model of the list scheduler is to take a basic block, compute a
29  * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
30  * pick a DAG head, then put all the children that are now DAG heads into the
31  * list of things to schedule.
32  *
33  * The goal of scheduling here is to pack pairs of operations together in a
34  * single QPU instruction.
35  */
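/* Illustrative sketch of that model (hypothetical helper names, not part of
 * this file):
 *
 *     build_dependency_dag(block);
 *     heads = nodes_with_no_unscheduled_parents();
 *     while (!list_is_empty(heads)) {
 *             n = pick_best_head(heads);       // heuristic choice
 *             emit_or_pair(n);                 // try to pair two ops into one
 *                                              // QPU instruction
 *             move_newly_unblocked_children(heads, n);
 *     }
 */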
36 
37 #include "qpu/qpu_disasm.h"
38 #include "v3d_compiler.h"
39 #include "util/ralloc.h"
40 #include "util/dag.h"
41 
42 static bool debug;
43 
44 struct schedule_node_child;
45 
46 struct schedule_node {
47         struct dag_node dag;
48         struct list_head link;
49         struct qinst *inst;
50 
51         /* Longest cycles + instruction_latency() of any parent of this node. */
52         uint32_t unblocked_time;
53 
54         /**
55          * Minimum number of cycles from scheduling this instruction until the
56          * end of the program, based on the slowest dependency chain through
57          * the children.
58          */
59         uint32_t delay;
60 
61         /**
62          * cycles between this instruction being scheduled and when its result
63          * can be consumed.
64          */
65         uint32_t latency;
66 };
67 
68 /* When walking the instructions in reverse, we need to swap before/after in
69  * add_dep().
70  */
71 enum direction { F, R };
72 
73 struct schedule_state {
74         const struct v3d_device_info *devinfo;
75         struct dag *dag;
76         struct schedule_node *last_r[6];
77         struct schedule_node *last_rf[64];
78         struct schedule_node *last_sf;
79         struct schedule_node *last_vpm_read;
80         struct schedule_node *last_tmu_write;
81         struct schedule_node *last_tmu_config;
82         struct schedule_node *last_tmu_read;
83         struct schedule_node *last_tlb;
84         struct schedule_node *last_vpm;
85         struct schedule_node *last_unif;
86         struct schedule_node *last_rtop;
87         struct schedule_node *last_unifa;
88         struct schedule_node *last_setmsf;
89         enum direction dir;
90         /* Estimated cycle when the current instruction would start. */
91         uint32_t time;
92 };
93 
94 static void
95 add_dep(struct schedule_state *state,
96         struct schedule_node *before,
97         struct schedule_node *after,
98         bool write)
99 {
100         bool write_after_read = !write && state->dir == R;
101         uintptr_t edge_data = write_after_read;
102 
103         if (!before || !after)
104                 return;
105 
106         assert(before != after);
107 
108         if (state->dir == F)
109                 dag_add_edge(&before->dag, &after->dag, edge_data);
110         else
111                 dag_add_edge(&after->dag, &before->dag, edge_data);
112 }
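/* Note on add_dep() above: the edge data is a single flag marking the
 * dependency as a write-after-read found during the reverse walk, presumably
 * so later consumers of the DAG can tell order-only dependencies apart from
 * true data dependencies.
 */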
113 
114 static void
115 add_read_dep(struct schedule_state *state,
116               struct schedule_node *before,
117               struct schedule_node *after)
118 {
119         add_dep(state, before, after, false);
120 }
121 
122 static void
123 add_write_dep(struct schedule_state *state,
124               struct schedule_node **before,
125               struct schedule_node *after)
126 {
127         add_dep(state, *before, after, true);
128         *before = after;
129 }
130 
131 static bool
132 qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
133 {
134         if (inst->sig.ldtlb || inst->sig.ldtlbu)
135                 return true;
136 
137         if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
138                 return false;
139 
140         if (inst->alu.add.op != V3D_QPU_A_NOP &&
141             inst->alu.add.magic_write &&
142             (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
143              inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
144                 return true;
145 
146         if (inst->alu.mul.op != V3D_QPU_M_NOP &&
147             inst->alu.mul.magic_write &&
148             (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
149              inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
150                 return true;
151 
152         return false;
153 }
154 
155 static void
156 process_mux_deps(struct schedule_state *state, struct schedule_node *n,
157                  enum v3d_qpu_mux mux)
158 {
159         assert(state->devinfo->ver < 71);
160         switch (mux) {
161         case V3D_QPU_MUX_A:
162                 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
163                 break;
164         case V3D_QPU_MUX_B:
165                 if (!n->inst->qpu.sig.small_imm_b) {
166                         add_read_dep(state,
167                                      state->last_rf[n->inst->qpu.raddr_b], n);
168                 }
169                 break;
170         default:
171                 add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
172                 break;
173         }
174 }
175 
176 
177 static void
178 process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
179                    uint8_t raddr, bool is_small_imm)
180 {
181         assert(state->devinfo->ver >= 71);
182 
183         if (!is_small_imm)
184                 add_read_dep(state, state->last_rf[raddr], n);
185 }
186 
187 static bool
188 tmu_write_is_sequence_terminator(uint32_t waddr)
189 {
190         switch (waddr) {
191         case V3D_QPU_WADDR_TMUS:
192         case V3D_QPU_WADDR_TMUSCM:
193         case V3D_QPU_WADDR_TMUSF:
194         case V3D_QPU_WADDR_TMUSLOD:
195         case V3D_QPU_WADDR_TMUA:
196         case V3D_QPU_WADDR_TMUAU:
197                 return true;
198         default:
199                 return false;
200         }
201 }
202 
203 static bool
204 can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
205 {
206         if (tmu_write_is_sequence_terminator(waddr))
207                 return false;
208 
209         if (waddr == V3D_QPU_WADDR_TMUD)
210                 return false;
211 
212         return true;
213 }
214 
215 static void
216 process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
217                    uint32_t waddr, bool magic)
218 {
219         if (!magic) {
220                 add_write_dep(state, &state->last_rf[waddr], n);
221         } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
222                 if (can_reorder_tmu_write(state->devinfo, waddr))
223                         add_read_dep(state, state->last_tmu_write, n);
224                 else
225                         add_write_dep(state, &state->last_tmu_write, n);
226 
227                 if (tmu_write_is_sequence_terminator(waddr))
228                         add_write_dep(state, &state->last_tmu_config, n);
229         } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
230                 /* Handled by v3d_qpu_writes_r4() check. */
231         } else {
232                 switch (waddr) {
233                 case V3D_QPU_WADDR_R0:
234                 case V3D_QPU_WADDR_R1:
235                 case V3D_QPU_WADDR_R2:
236                         add_write_dep(state,
237                                       &state->last_r[waddr - V3D_QPU_WADDR_R0],
238                                       n);
239                         break;
240                 case V3D_QPU_WADDR_R3:
241                 case V3D_QPU_WADDR_R4:
242                 case V3D_QPU_WADDR_R5:
243                         /* Handled by v3d_qpu_writes_r*() checks below. */
244                         break;
245 
246                 case V3D_QPU_WADDR_VPM:
247                 case V3D_QPU_WADDR_VPMU:
248                         add_write_dep(state, &state->last_vpm, n);
249                         break;
250 
251                 case V3D_QPU_WADDR_TLB:
252                 case V3D_QPU_WADDR_TLBU:
253                         add_write_dep(state, &state->last_tlb, n);
254                         break;
255 
256                 case V3D_QPU_WADDR_SYNC:
257                 case V3D_QPU_WADDR_SYNCB:
258                 case V3D_QPU_WADDR_SYNCU:
259                         /* For CS barrier(): Sync against any other memory
260                          * accesses.  There doesn't appear to be any need for
261                          * barriers to affect ALU operations.
262                          */
263                         add_write_dep(state, &state->last_tmu_write, n);
264                         add_write_dep(state, &state->last_tmu_read, n);
265                         break;
266 
267                 case V3D_QPU_WADDR_UNIFA:
268                         add_write_dep(state, &state->last_unifa, n);
269                         break;
270 
271                 case V3D_QPU_WADDR_NOP:
272                         break;
273 
274                 default:
275                         fprintf(stderr, "Unknown waddr %d\n", waddr);
276                         abort();
277                 }
278         }
279 }
280 
281 /**
282  * Common code for dependencies that need to be tracked both forward and
283  * backward.
284  *
285  * This is for things like "all reads of r4 have to happen between the r4
286  * writes that surround them".
287  */
288 static void
289 calculate_deps(struct schedule_state *state, struct schedule_node *n)
290 {
291         const struct v3d_device_info *devinfo = state->devinfo;
292         struct qinst *qinst = n->inst;
293         struct v3d_qpu_instr *inst = &qinst->qpu;
294         /* If the input and output segments are shared, then all VPM reads to
295          * a location need to happen before all writes.  We handle this by
296          * serializing all VPM operations for now.
297          *
298          * FIXME: we assume the segments are shared. That is correct right
299          * now because we only use shared segments, but separate segments
300          * can be chosen.
301          */
302         bool separate_vpm_segment = false;
303 
304         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
305                 if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
306                         add_read_dep(state, state->last_sf, n);
307 
308                 /* XXX: BDI */
309                 /* XXX: BDU */
310                 /* XXX: ub */
311                 /* XXX: raddr_a */
312 
313                 add_write_dep(state, &state->last_unif, n);
314                 return;
315         }
316 
317         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
318 
319         /* XXX: LOAD_IMM */
320 
321         if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
322                 if (devinfo->ver < 71) {
323                         process_mux_deps(state, n, inst->alu.add.a.mux);
324                 } else {
325                         process_raddr_deps(state, n, inst->alu.add.a.raddr,
326                                            inst->sig.small_imm_a);
327                 }
328         }
329         if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
330                 if (devinfo->ver < 71) {
331                         process_mux_deps(state, n, inst->alu.add.b.mux);
332                 } else {
333                         process_raddr_deps(state, n, inst->alu.add.b.raddr,
334                                            inst->sig.small_imm_b);
335                 }
336         }
337 
338         if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
339                 if (devinfo->ver < 71) {
340                         process_mux_deps(state, n, inst->alu.mul.a.mux);
341                 } else {
342                         process_raddr_deps(state, n, inst->alu.mul.a.raddr,
343                                            inst->sig.small_imm_c);
344                 }
345         }
346         if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
347                 if (devinfo->ver < 71) {
348                         process_mux_deps(state, n, inst->alu.mul.b.mux);
349                 } else {
350                         process_raddr_deps(state, n, inst->alu.mul.b.raddr,
351                                            inst->sig.small_imm_d);
352                 }
353         }
354 
355         switch (inst->alu.add.op) {
356         case V3D_QPU_A_VPMSETUP:
357                 /* Could distinguish read/write by unpacking the uniform. */
358                 add_write_dep(state, &state->last_vpm, n);
359                 add_write_dep(state, &state->last_vpm_read, n);
360                 break;
361 
362         case V3D_QPU_A_STVPMV:
363         case V3D_QPU_A_STVPMD:
364         case V3D_QPU_A_STVPMP:
365                 add_write_dep(state, &state->last_vpm, n);
366                 break;
367 
368         case V3D_QPU_A_LDVPMV_IN:
369         case V3D_QPU_A_LDVPMD_IN:
370         case V3D_QPU_A_LDVPMG_IN:
371         case V3D_QPU_A_LDVPMP:
372                 if (!separate_vpm_segment)
373                         add_write_dep(state, &state->last_vpm, n);
374                 break;
375 
376         case V3D_QPU_A_VPMWT:
377                 add_read_dep(state, state->last_vpm, n);
378                 break;
379 
380         case V3D_QPU_A_MSF:
381                 add_read_dep(state, state->last_tlb, n);
382                 add_read_dep(state, state->last_setmsf, n);
383                 break;
384 
385         case V3D_QPU_A_SETMSF:
386                 add_write_dep(state, &state->last_setmsf, n);
387                 add_write_dep(state, &state->last_tmu_write, n);
388                 FALLTHROUGH;
389         case V3D_QPU_A_SETREVF:
390                 add_write_dep(state, &state->last_tlb, n);
391                 break;
392 
393         case V3D_QPU_A_BALLOT:
394         case V3D_QPU_A_BCASTF:
395         case V3D_QPU_A_ALLEQ:
396         case V3D_QPU_A_ALLFEQ:
397                 add_read_dep(state, state->last_setmsf, n);
398                 break;
399 
400         default:
401                 break;
402         }
403 
404         switch (inst->alu.mul.op) {
405         case V3D_QPU_M_MULTOP:
406         case V3D_QPU_M_UMUL24:
407                 /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
408                  * resets it to 0.  We could possibly reorder umul24s relative
409                  * to each other, but for now just keep all the MUL parts in
410                  * order.
411                  */
412                 add_write_dep(state, &state->last_rtop, n);
413                 break;
414         default:
415                 break;
416         }
417 
418         if (inst->alu.add.op != V3D_QPU_A_NOP) {
419                 process_waddr_deps(state, n, inst->alu.add.waddr,
420                                    inst->alu.add.magic_write);
421         }
422         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
423                 process_waddr_deps(state, n, inst->alu.mul.waddr,
424                                    inst->alu.mul.magic_write);
425         }
426         if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
427                 process_waddr_deps(state, n, inst->sig_addr,
428                                    inst->sig_magic);
429         }
430 
431         if (v3d_qpu_writes_r3(devinfo, inst))
432                 add_write_dep(state, &state->last_r[3], n);
433         if (v3d_qpu_writes_r4(devinfo, inst))
434                 add_write_dep(state, &state->last_r[4], n);
435         if (v3d_qpu_writes_r5(devinfo, inst))
436                 add_write_dep(state, &state->last_r[5], n);
437         if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
438                 add_write_dep(state, &state->last_rf[0], n);
439 
440         /* If we add any more dependencies here we should consider whether we
441          * also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
442          */
443         if (inst->sig.thrsw) {
444                 /* All accumulator contents and flags are undefined after the
445                  * switch.
446                  */
447                 for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
448                         add_write_dep(state, &state->last_r[i], n);
449                 add_write_dep(state, &state->last_sf, n);
450                 add_write_dep(state, &state->last_rtop, n);
451 
452                 /* Scoreboard-locking operations have to stay after the last
453                  * thread switch.
454                  */
455                 add_write_dep(state, &state->last_tlb, n);
456 
457                 add_write_dep(state, &state->last_tmu_write, n);
458                 add_write_dep(state, &state->last_tmu_config, n);
459         }
460 
461         if (v3d_qpu_waits_on_tmu(inst)) {
462                 /* TMU loads are coming from a FIFO, so ordering is important.
463                  */
464                 add_write_dep(state, &state->last_tmu_read, n);
465                 /* Keep TMU loads after their TMU lookup terminator */
466                 add_read_dep(state, state->last_tmu_config, n);
467         }
468 
469         /* Allow wrtmuc to be reordered with other instructions in the
470          * same TMU sequence by using a read dependency on the last TMU
471          * sequence terminator.
472          */
473         if (inst->sig.wrtmuc)
474                 add_read_dep(state, state->last_tmu_config, n);
475 
476         if (inst->sig.ldtlb | inst->sig.ldtlbu)
477                 add_write_dep(state, &state->last_tlb, n);
478 
479         if (inst->sig.ldvpm) {
480                 add_write_dep(state, &state->last_vpm_read, n);
481 
482                 /* At least for now, we're doing shared I/O segments, so queue
483                  * all writes after all reads.
484                  */
485                 if (!separate_vpm_segment)
486                         add_write_dep(state, &state->last_vpm, n);
487         }
488 
489         /* inst->sig.ldunif or sideband uniform read */
490         if (vir_has_uniform(qinst))
491                 add_write_dep(state, &state->last_unif, n);
492 
493         /* Both unifa and ldunifa must preserve ordering */
494         if (inst->sig.ldunifa || inst->sig.ldunifarf)
495                 add_write_dep(state, &state->last_unifa, n);
496 
497         if (v3d_qpu_reads_flags(inst))
498                 add_read_dep(state, state->last_sf, n);
499         if (v3d_qpu_writes_flags(inst))
500                 add_write_dep(state, &state->last_sf, n);
501 }
502 
503 static void
504 calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
505                        struct list_head *schedule_list)
506 {
507         struct schedule_state state;
508 
509         memset(&state, 0, sizeof(state));
510         state.dag = dag;
511         state.devinfo = c->devinfo;
512         state.dir = F;
513 
514         list_for_each_entry(struct schedule_node, node, schedule_list, link)
515                 calculate_deps(&state, node);
516 }
517 
518 static void
519 calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
520                        struct list_head *schedule_list)
521 {
522         struct schedule_state state;
523 
524         memset(&state, 0, sizeof(state));
525         state.dag = dag;
526         state.devinfo = c->devinfo;
527         state.dir = R;
528 
529         list_for_each_entry_rev(struct schedule_node, node, schedule_list,
530                                 link) {
531                 calculate_deps(&state, (struct schedule_node *)node);
532         }
533 }
534 
535 struct choose_scoreboard {
536         struct dag *dag;
537         int tick;
538         int last_magic_sfu_write_tick;
539         int last_stallable_sfu_reg;
540         int last_stallable_sfu_tick;
541         int last_ldvary_tick;
542         int last_unifa_write_tick;
543         int last_uniforms_reset_tick;
544         int last_thrsw_tick;
545         int last_branch_tick;
546         int last_setmsf_tick;
547         bool first_thrsw_emitted;
548         bool last_thrsw_emitted;
549         bool fixup_ldvary;
550         int ldvary_count;
551         int pending_ldtmu_count;
552         bool first_ldtmu_after_thrsw;
553 
554         /* V3D 7.x */
555         int last_implicit_rf0_write_tick;
556         bool has_rf0_flops_conflict;
557 };
558 
559 static bool
560 mux_reads_too_soon(struct choose_scoreboard *scoreboard,
561                    const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
562 {
563         switch (mux) {
564         case V3D_QPU_MUX_R4:
565                 if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
566                         return true;
567                 break;
568 
569         case V3D_QPU_MUX_R5:
570                 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
571                         return true;
572                 break;
573         default:
574                 break;
575         }
576 
577         return false;
578 }
579 
580 static bool
581 reads_too_soon(struct choose_scoreboard *scoreboard,
582                const struct v3d_qpu_instr *inst, uint8_t raddr)
583 {
584         switch (raddr) {
585         case 0: /* ldvary delayed write of C coefficient to rf0 */
586                 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
587                         return true;
588                 break;
589         default:
590                 break;
591         }
592 
593         return false;
594 }
595 
596 static bool
597 reads_too_soon_after_write(const struct v3d_device_info *devinfo,
598                            struct choose_scoreboard *scoreboard,
599                            struct qinst *qinst)
600 {
601         const struct v3d_qpu_instr *inst = &qinst->qpu;
602 
603         /* XXX: Branching off of raddr. */
604         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
605                 return false;
606 
607         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
608 
609         if (inst->alu.add.op != V3D_QPU_A_NOP) {
610                 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
611                         if (devinfo->ver < 71) {
612                                 if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
613                                         return true;
614                         } else {
615                                 if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
616                                         return true;
617                         }
618                 }
619                 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
620                         if (devinfo->ver < 71) {
621                                 if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
622                                         return true;
623                         } else {
624                                 if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
625                                         return true;
626                         }
627                 }
628         }
629 
630         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
631                 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
632                         if (devinfo->ver < 71) {
633                                 if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
634                                         return true;
635                         } else {
636                                 if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr))
637                                         return true;
638                         }
639                 }
640                 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
641                         if (devinfo->ver < 71) {
642                                 if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
643                                         return true;
644                         } else {
645                                 if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
646                                         return true;
647                         }
648                 }
649         }
650 
651         /* XXX: imm */
652 
653         return false;
654 }
655 
656 static bool
657 writes_too_soon_after_write(const struct v3d_device_info *devinfo,
658                             struct choose_scoreboard *scoreboard,
659                             struct qinst *qinst)
660 {
661         const struct v3d_qpu_instr *inst = &qinst->qpu;
662 
663         /* Don't schedule any other r4 write too soon after an SFU write.
664          * This would normally be prevented by dependency tracking, but might
665          * occur if a dead SFU computation makes it to scheduling.
666          */
667         if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
668             v3d_qpu_writes_r4(devinfo, inst))
669                 return true;
670 
671         if (devinfo->ver == 42)
672            return false;
673 
674         /* Don't schedule anything that writes rf0 right after ldvary, since
675          * that would clash with the ldvary's delayed rf0 write (the exception
676          * is another ldvary, since its implicit rf0 write would also have
677          * one cycle of delay and would not clash).
678          */
679         if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
680             (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
681              (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
682               !inst->sig.ldvary))) {
683             return true;
684        }
685 
686         return false;
687 }
688 
689 static bool
690 scoreboard_is_locked(struct choose_scoreboard *scoreboard,
691                      bool lock_scoreboard_on_first_thrsw)
692 {
693         if (lock_scoreboard_on_first_thrsw) {
694                 return scoreboard->first_thrsw_emitted &&
695                        scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
696         }
697 
698         return scoreboard->last_thrsw_emitted &&
699                scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
700 }
701 
702 static bool
703 pixel_scoreboard_too_soon(struct v3d_compile *c,
704                           struct choose_scoreboard *scoreboard,
705                           const struct v3d_qpu_instr *inst)
706 {
707         return qpu_inst_is_tlb(inst) &&
708                !scoreboard_is_locked(scoreboard,
709                                      c->lock_scoreboard_on_first_thrsw);
710 }
711 
712 static bool
713 qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
714                         const struct v3d_qpu_instr *inst,
715                         uint32_t waddr) {
716 
717         if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
718            return false;
719 
720         if (devinfo->ver < 71) {
721                 if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
722                     inst->raddr_a == waddr)
723                         return true;
724 
725                 if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
726                     !inst->sig.small_imm_b && (inst->raddr_b == waddr))
727                         return true;
728         } else {
729                 if (v3d71_qpu_reads_raddr(inst, waddr))
730                         return true;
731         }
732 
733         return false;
734 }
735 
736 static bool
737 read_stalls(const struct v3d_device_info *devinfo,
738             struct choose_scoreboard *scoreboard,
739             const struct v3d_qpu_instr *inst)
740 {
741         return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
742                 qpu_instruction_uses_rf(devinfo, inst,
743                                         scoreboard->last_stallable_sfu_reg);
744 }
745 
746 /* We define a max schedule priority to allow negative priorities as a result of
747  * subtracting this max when an instruction stalls. So instructions that
748  * stall have lower priority than regular instructions. */
749 #define MAX_SCHEDULE_PRIORITY 16
750 
751 static int
752 get_instruction_priority(const struct v3d_device_info *devinfo,
753                          const struct v3d_qpu_instr *inst)
754 {
755         uint32_t baseline_score;
756         uint32_t next_score = 0;
757 
758         /* Schedule TLB operations as late as possible, to get more
759          * parallelism between shaders.
760          */
761         if (qpu_inst_is_tlb(inst))
762                 return next_score;
763         next_score++;
764 
765         /* Empirical testing shows that using priorities to hide latency of
766          * TMU operations when scheduling QPU leads to slightly worse
767          * performance, even at 2 threads. We think this is because thread
768          * switching is already quite effective at hiding latency, and NIR
769          * scheduling (and possibly TMU pipelining too) is sufficient to hide
770          * TMU latency, so piling up on that here doesn't provide any benefit
771          * and instead may cause us to postpone critical paths that depend on
772          * the TMU results.
773          */
774 #if 0
775         /* Schedule texture read results collection late to hide latency. */
776         if (v3d_qpu_waits_on_tmu(inst))
777                 return next_score;
778         next_score++;
779 #endif
780 
781         /* Default score for things that aren't otherwise special. */
782         baseline_score = next_score;
783         next_score++;
784 
785 #if 0
786         /* Schedule texture read setup early to hide their latency better. */
787         if (v3d_qpu_writes_tmu(devinfo, inst))
788                 return next_score;
789         next_score++;
790 #endif
791 
792         /* If this assertion fires, increase MAX_SCHEDULE_PRIORITY. */
793         assert(next_score < MAX_SCHEDULE_PRIORITY);
794 
795         return baseline_score;
796 }
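/* Worked example of the scoring above: a TLB instruction gets priority 0 and
 * everything else currently gets the baseline of 1 (the TMU cases are
 * compiled out), so TLB work sinks toward the end of the block.  The caller
 * is then assumed to subtract MAX_SCHEDULE_PRIORITY from candidates that
 * would stall, dropping them to -16 or -15 so they rank below any
 * non-stalling choice.
 */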
797 
798 enum {
799         V3D_PERIPHERAL_VPM_READ           = (1 << 0),
800         V3D_PERIPHERAL_VPM_WRITE          = (1 << 1),
801         V3D_PERIPHERAL_VPM_WAIT           = (1 << 2),
802         V3D_PERIPHERAL_SFU                = (1 << 3),
803         V3D_PERIPHERAL_TMU_WRITE          = (1 << 4),
804         V3D_PERIPHERAL_TMU_READ           = (1 << 5),
805         V3D_PERIPHERAL_TMU_WAIT           = (1 << 6),
806         V3D_PERIPHERAL_TMU_WRTMUC_SIG     = (1 << 7),
807         V3D_PERIPHERAL_TSY                = (1 << 8),
808         V3D_PERIPHERAL_TLB_READ           = (1 << 9),
809         V3D_PERIPHERAL_TLB_WRITE          = (1 << 10),
810 };
811 
812 static uint32_t
813 qpu_peripherals(const struct v3d_device_info *devinfo,
814                 const struct v3d_qpu_instr *inst)
815 {
816         uint32_t result = 0;
817         if (v3d_qpu_reads_vpm(inst))
818                 result |= V3D_PERIPHERAL_VPM_READ;
819         if (v3d_qpu_writes_vpm(inst))
820                 result |= V3D_PERIPHERAL_VPM_WRITE;
821         if (v3d_qpu_waits_vpm(inst))
822                 result |= V3D_PERIPHERAL_VPM_WAIT;
823 
824         if (v3d_qpu_writes_tmu(devinfo, inst))
825                 result |= V3D_PERIPHERAL_TMU_WRITE;
826         if (inst->sig.ldtmu)
827                 result |= V3D_PERIPHERAL_TMU_READ;
828         if (inst->sig.wrtmuc)
829                 result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG;
830 
831         if (v3d_qpu_uses_sfu(inst))
832                 result |= V3D_PERIPHERAL_SFU;
833 
834         if (v3d_qpu_reads_tlb(inst))
835                 result |= V3D_PERIPHERAL_TLB_READ;
836         if (v3d_qpu_writes_tlb(inst))
837                 result |= V3D_PERIPHERAL_TLB_WRITE;
838 
839         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
840                 if (inst->alu.add.op != V3D_QPU_A_NOP &&
841                     inst->alu.add.magic_write &&
842                     v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) {
843                         result |= V3D_PERIPHERAL_TSY;
844                 }
845 
846                 if (inst->alu.add.op == V3D_QPU_A_TMUWT)
847                         result |= V3D_PERIPHERAL_TMU_WAIT;
848         }
849 
850         return result;
851 }
852 
853 static bool
854 qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
855                                  const struct v3d_qpu_instr *a,
856                                  const struct v3d_qpu_instr *b)
857 {
858         const uint32_t a_peripherals = qpu_peripherals(devinfo, a);
859         const uint32_t b_peripherals = qpu_peripherals(devinfo, b);
860 
861         /* We can always do one peripheral access per instruction. */
862         if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1)
863                 return true;
864 
865         /* V3D 4.x can't do more than one peripheral access except in a
866          * few cases:
867          */
868         if (devinfo->ver == 42) {
869                 /* WRTMUC signal with TMU register write (other than tmuc). */
870                 if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
871                     b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
872                         return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
873                 }
874                 if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
875                     a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
876                         return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
877                 }
878 
879                 /* TMU read with VPM read/write. */
880                 if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
881                     (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
882                      b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
883                         return true;
884                 }
885                 if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
886                     (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
887                      a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
888                         return true;
889                 }
890 
891                 return false;
892         }
893 
894         /* V3D 7.x can't have more than one of these restricted peripherals */
895         const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
896                                     V3D_PERIPHERAL_TMU_WRTMUC_SIG |
897                                     V3D_PERIPHERAL_TSY |
898                                     V3D_PERIPHERAL_TLB_READ |
899                                     V3D_PERIPHERAL_SFU |
900                                     V3D_PERIPHERAL_VPM_READ |
901                                     V3D_PERIPHERAL_VPM_WRITE;
902 
903         const uint32_t a_restricted = a_peripherals & restricted;
904         const uint32_t b_restricted = b_peripherals & restricted;
905         if (a_restricted && b_restricted) {
906                 /* WRTMUC signal with TMU register write (other than tmuc) is
907                  * allowed though.
908                  */
909                 if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
910                        b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
911                        v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
912                       (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
913                        a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
914                        v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
915                         return false;
916                 }
917         }
918 
919         /* Only one TMU read per instruction */
920         if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
921             (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
922                 return false;
923         }
924 
925         /* Only one TLB access per instruction */
926         if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
927                               V3D_PERIPHERAL_TLB_READ)) &&
928             (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
929                               V3D_PERIPHERAL_TLB_READ))) {
930                 return false;
931         }
932 
933         return true;
934 }
935 
936 /* Compute a bitmask of which rf registers are used between
937  * the two instructions.
938  */
939 static uint64_t
940 qpu_raddrs_used(const struct v3d_qpu_instr *a,
941                 const struct v3d_qpu_instr *b)
942 {
943         assert(a->type == V3D_QPU_INSTR_TYPE_ALU);
944         assert(b->type == V3D_QPU_INSTR_TYPE_ALU);
945 
946         uint64_t raddrs_used = 0;
947         if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
948                 raddrs_used |= (1ll << a->raddr_a);
949         if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
950                 raddrs_used |= (1ll << a->raddr_b);
951         if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
952                 raddrs_used |= (1ll << b->raddr_a);
953         if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
954                 raddrs_used |= (1ll << b->raddr_b);
955 
956         return raddrs_used;
957 }
958 
959 /* Takes two instructions and attempts to merge their raddr fields (including
960  * small immediates) into one merged instruction. For V3D 4.x, returns false
961  * if the two instructions access more than two different rf registers between
962  * them, or more than one rf register and one small immediate. For 7.x returns
963  * false if both instructions use small immediates.
964  */
965 static bool
966 qpu_merge_raddrs(struct v3d_qpu_instr *result,
967                  const struct v3d_qpu_instr *add_instr,
968                  const struct v3d_qpu_instr *mul_instr,
969                  const struct v3d_device_info *devinfo)
970 {
971         if (devinfo->ver >= 71) {
972                 assert(add_instr->sig.small_imm_a +
973                        add_instr->sig.small_imm_b <= 1);
974                 assert(add_instr->sig.small_imm_c +
975                        add_instr->sig.small_imm_d == 0);
976                 assert(mul_instr->sig.small_imm_a +
977                        mul_instr->sig.small_imm_b == 0);
978                 assert(mul_instr->sig.small_imm_c +
979                        mul_instr->sig.small_imm_d <= 1);
980 
981                 result->sig.small_imm_a = add_instr->sig.small_imm_a;
982                 result->sig.small_imm_b = add_instr->sig.small_imm_b;
983                 result->sig.small_imm_c = mul_instr->sig.small_imm_c;
984                 result->sig.small_imm_d = mul_instr->sig.small_imm_d;
985 
986                 return (result->sig.small_imm_a +
987                         result->sig.small_imm_b +
988                         result->sig.small_imm_c +
989                         result->sig.small_imm_d) <= 1;
990         }
991 
992         assert(devinfo->ver == 42);
993 
994         uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
995         int naddrs = util_bitcount64(raddrs_used);
996 
997         if (naddrs > 2)
998                 return false;
999 
1000         if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
1001                 if (naddrs > 1)
1002                         return false;
1003 
1004                 if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
1005                         if (add_instr->raddr_b != mul_instr->raddr_b)
1006                                 return false;
1007 
1008                 result->sig.small_imm_b = true;
1009                 result->raddr_b = add_instr->sig.small_imm_b ?
1010                         add_instr->raddr_b : mul_instr->raddr_b;
1011         }
1012 
1013         if (naddrs == 0)
1014                 return true;
1015 
1016         int raddr_a = ffsll(raddrs_used) - 1;
1017         raddrs_used &= ~(1ll << raddr_a);
1018         result->raddr_a = raddr_a;
1019 
1020         if (!result->sig.small_imm_b) {
1021                 if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
1022                     raddr_a == add_instr->raddr_b) {
1023                         if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
1024                                 result->alu.add.a.mux = V3D_QPU_MUX_A;
1025                         if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
1026                             v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
1027                                 result->alu.add.b.mux = V3D_QPU_MUX_A;
1028                         }
1029                 }
1030                 if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
1031                     raddr_a == mul_instr->raddr_b) {
1032                         if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
1033                                 result->alu.mul.a.mux = V3D_QPU_MUX_A;
1034                         if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
1035                             v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
1036                                 result->alu.mul.b.mux = V3D_QPU_MUX_A;
1037                         }
1038                 }
1039         }
1040         if (!raddrs_used)
1041                 return true;
1042 
1043         int raddr_b = ffsll(raddrs_used) - 1;
1044         result->raddr_b = raddr_b;
1045         if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
1046             raddr_b == add_instr->raddr_a) {
1047                 if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
1048                         result->alu.add.a.mux = V3D_QPU_MUX_B;
1049                 if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
1050                     v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
1051                         result->alu.add.b.mux = V3D_QPU_MUX_B;
1052                 }
1053         }
1054         if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
1055             raddr_b == mul_instr->raddr_a) {
1056                 if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
1057                         result->alu.mul.a.mux = V3D_QPU_MUX_B;
1058                 if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
1059                     v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
1060                         result->alu.mul.b.mux = V3D_QPU_MUX_B;
1061                 }
1062         }
1063 
1064         return true;
1065 }
1066 
1067 static bool
1068 can_do_add_as_mul(enum v3d_qpu_add_op op)
1069 {
1070         switch (op) {
1071         case V3D_QPU_A_ADD:
1072         case V3D_QPU_A_SUB:
1073                 return true;
1074         default:
1075                 return false;
1076         }
1077 }
1078 
1079 static enum v3d_qpu_mul_op
1080 add_op_as_mul_op(enum v3d_qpu_add_op op)
1081 {
1082         switch (op) {
1083         case V3D_QPU_A_ADD:
1084                 return V3D_QPU_M_ADD;
1085         case V3D_QPU_A_SUB:
1086                 return V3D_QPU_M_SUB;
1087         default:
1088                 unreachable("unexpected add opcode");
1089         }
1090 }
1091 
1092 static void
1093 qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
1094                        struct v3d_qpu_instr *inst)
1095 {
1096         STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
1097         assert(inst->alu.add.op != V3D_QPU_A_NOP);
1098         assert(inst->alu.mul.op == V3D_QPU_M_NOP);
1099 
1100         memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
1101         inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
1102         inst->alu.add.op = V3D_QPU_A_NOP;
1103 
1104         inst->flags.mc = inst->flags.ac;
1105         inst->flags.mpf = inst->flags.apf;
1106         inst->flags.muf = inst->flags.auf;
1107         inst->flags.ac = V3D_QPU_COND_NONE;
1108         inst->flags.apf = V3D_QPU_PF_NONE;
1109         inst->flags.auf = V3D_QPU_UF_NONE;
1110 
1111         inst->alu.mul.output_pack = inst->alu.add.output_pack;
1112 
1113         inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
1114         inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
1115         inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
1116         inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
1117         inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
1118 
1119         if (devinfo->ver >= 71) {
1120                 assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
1121                 assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
1122                 if (inst->sig.small_imm_a) {
1123                         inst->sig.small_imm_c = true;
1124                         inst->sig.small_imm_a = false;
1125                 } else if (inst->sig.small_imm_b) {
1126                         inst->sig.small_imm_d = true;
1127                         inst->sig.small_imm_b = false;
1128                 }
1129         }
1130 }
1131 
1132 static bool
1133 can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
1134 {
1135         switch (op) {
1136         case V3D_QPU_M_MOV:
1137         case V3D_QPU_M_FMOV:
1138                 return devinfo->ver >= 71;
1139         default:
1140                 return false;
1141         }
1142 }
1143 
1144 static enum v3d_qpu_add_op
1145 mul_op_as_add_op(enum v3d_qpu_mul_op op)
1146 {
1147         switch (op) {
1148         case V3D_QPU_M_MOV:
1149                 return V3D_QPU_A_MOV;
1150         case V3D_QPU_M_FMOV:
1151                 return V3D_QPU_A_FMOV;
1152         default:
1153                 unreachable("unexpected mov opcode");
1154         }
1155 }
1156 
1157 static void
1158 qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
1159 {
1160         STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
1161         assert(inst->alu.mul.op != V3D_QPU_M_NOP);
1162         assert(inst->alu.add.op == V3D_QPU_A_NOP);
1163 
1164         memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
1165         inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
1166         inst->alu.mul.op = V3D_QPU_M_NOP;
1167 
1168         inst->flags.ac = inst->flags.mc;
1169         inst->flags.apf = inst->flags.mpf;
1170         inst->flags.auf = inst->flags.muf;
1171         inst->flags.mc = V3D_QPU_COND_NONE;
1172         inst->flags.mpf = V3D_QPU_PF_NONE;
1173         inst->flags.muf = V3D_QPU_UF_NONE;
1174 
1175         inst->alu.add.output_pack = inst->alu.mul.output_pack;
1176         inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
1177         inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
1178         inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
1179         inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
1180         inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
1181 
1182         assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
1183         assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
1184         if (inst->sig.small_imm_c) {
1185                 inst->sig.small_imm_a = true;
1186                 inst->sig.small_imm_c = false;
1187         } else if (inst->sig.small_imm_d) {
1188                 inst->sig.small_imm_b = true;
1189                 inst->sig.small_imm_d = false;
1190         }
1191 }
1192 
1193 static bool
1194 qpu_merge_inst(const struct v3d_device_info *devinfo,
1195                struct v3d_qpu_instr *result,
1196                const struct v3d_qpu_instr *a,
1197                const struct v3d_qpu_instr *b)
1198 {
1199         if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
1200             b->type != V3D_QPU_INSTR_TYPE_ALU) {
1201                 return false;
1202         }
1203 
1204         if (!qpu_compatible_peripheral_access(devinfo, a, b))
1205                 return false;
1206 
1207         struct v3d_qpu_instr merge = *a;
1208         const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;
1209 
1210         struct v3d_qpu_instr mul_inst;
1211         if (b->alu.add.op != V3D_QPU_A_NOP) {
1212                 if (a->alu.add.op == V3D_QPU_A_NOP) {
1213                         merge.alu.add = b->alu.add;
1214 
1215                         merge.flags.ac = b->flags.ac;
1216                         merge.flags.apf = b->flags.apf;
1217                         merge.flags.auf = b->flags.auf;
1218 
1219                         add_instr = b;
1220                         mul_instr = a;
1221                 }
1222                 /* If a's add op is used but its mul op is not, then see if we
1223                  * can convert either a's add op or b's add op to a mul op
1224                  * so we can merge.
1225                  */
1226                 else if (a->alu.mul.op == V3D_QPU_M_NOP &&
1227                          can_do_add_as_mul(b->alu.add.op)) {
1228                         mul_inst = *b;
1229                         qpu_convert_add_to_mul(devinfo, &mul_inst);
1230 
1231                         merge.alu.mul = mul_inst.alu.mul;
1232 
1233                         merge.flags.mc = mul_inst.flags.mc;
1234                         merge.flags.mpf = mul_inst.flags.mpf;
1235                         merge.flags.muf = mul_inst.flags.muf;
1236 
1237                         add_instr = a;
1238                         mul_instr = &mul_inst;
1239                 } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
1240                            can_do_add_as_mul(a->alu.add.op)) {
1241                         mul_inst = *a;
1242                         qpu_convert_add_to_mul(devinfo, &mul_inst);
1243 
1244                         merge = mul_inst;
1245                         merge.alu.add = b->alu.add;
1246 
1247                         merge.flags.ac = b->flags.ac;
1248                         merge.flags.apf = b->flags.apf;
1249                         merge.flags.auf = b->flags.auf;
1250 
1251                         add_instr = b;
1252                         mul_instr = &mul_inst;
1253                 } else {
1254                         return false;
1255                 }
1256         }
1257 
1258         struct v3d_qpu_instr add_inst;
1259         if (b->alu.mul.op != V3D_QPU_M_NOP) {
1260                 if (a->alu.mul.op == V3D_QPU_M_NOP) {
1261                         merge.alu.mul = b->alu.mul;
1262 
1263                         merge.flags.mc = b->flags.mc;
1264                         merge.flags.mpf = b->flags.mpf;
1265                         merge.flags.muf = b->flags.muf;
1266 
1267                         mul_instr = b;
1268                         add_instr = a;
1269                 }
1270                 /* If a's mul op is used but its add op is not, then see if we
1271                  * can convert either a's mul op or b's mul op to an add op
1272                  * so we can merge.
1273                  */
1274                 else if (a->alu.add.op == V3D_QPU_A_NOP &&
1275                          can_do_mul_as_add(devinfo, b->alu.mul.op)) {
1276                         add_inst = *b;
1277                         qpu_convert_mul_to_add(&add_inst);
1278 
1279                         merge.alu.add = add_inst.alu.add;
1280 
1281                         merge.flags.ac = add_inst.flags.ac;
1282                         merge.flags.apf = add_inst.flags.apf;
1283                         merge.flags.auf = add_inst.flags.auf;
1284 
1285                         mul_instr = a;
1286                         add_instr = &add_inst;
1287                 } else if (a->alu.add.op == V3D_QPU_A_NOP &&
1288                            can_do_mul_as_add(devinfo, a->alu.mul.op)) {
1289                         add_inst = *a;
1290                         qpu_convert_mul_to_add(&add_inst);
1291 
1292                         merge = add_inst;
1293                         merge.alu.mul = b->alu.mul;
1294 
1295                         merge.flags.mc = b->flags.mc;
1296                         merge.flags.mpf = b->flags.mpf;
1297                         merge.flags.muf = b->flags.muf;
1298 
1299                         mul_instr = b;
1300                         add_instr = &add_inst;
1301                 } else {
1302                         return false;
1303                 }
1304         }
1305 
1306         /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
1307          * they have restrictions on the number of raddrs that can be addressed
1308          * in a single instruction. In V3D 7.x, we don't have that restriction,
1309          * but we are still limited to a single small immediate per instruction.
1310          */
1311         if (add_instr && mul_instr &&
1312             !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
1313                 return false;
1314         }
1315 
1316         merge.sig.thrsw |= b->sig.thrsw;
1317         merge.sig.ldunif |= b->sig.ldunif;
1318         merge.sig.ldunifrf |= b->sig.ldunifrf;
1319         merge.sig.ldunifa |= b->sig.ldunifa;
1320         merge.sig.ldunifarf |= b->sig.ldunifarf;
1321         merge.sig.ldtmu |= b->sig.ldtmu;
1322         merge.sig.ldvary |= b->sig.ldvary;
1323         merge.sig.ldvpm |= b->sig.ldvpm;
1324         merge.sig.ldtlb |= b->sig.ldtlb;
1325         merge.sig.ldtlbu |= b->sig.ldtlbu;
1326         merge.sig.ucb |= b->sig.ucb;
1327         merge.sig.rotate |= b->sig.rotate;
1328         merge.sig.wrtmuc |= b->sig.wrtmuc;
1329 
1330         if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
1331             v3d_qpu_sig_writes_address(devinfo, &b->sig))
1332                 return false;
1333         merge.sig_addr |= b->sig_addr;
1334         merge.sig_magic |= b->sig_magic;
1335 
1336         uint64_t packed;
1337         bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
1338 
1339         *result = merge;
1340         /* No modifying the real instructions on failure. */
1341         assert(ok || (a != result && b != result));
1342 
1343         return ok;
1344 }
1345 
1346 static inline bool
1347 try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
1348 {
1349         return inst->sig.ldunif || inst->sig.ldunifrf;
1350 }
1351 
1352 static bool
1353 qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
1354                                          struct choose_scoreboard *scoreboard,
1355                                          const struct qinst *qinst);
1356 
1357 static struct schedule_node *
1358 choose_instruction_to_schedule(struct v3d_compile *c,
1359                                struct choose_scoreboard *scoreboard,
1360                                struct schedule_node *prev_inst)
1361 {
1362         struct schedule_node *chosen = NULL;
1363         int chosen_prio = 0;
1364 
1365         /* Don't pair up anything with a thread switch signal -- emit_thrsw()
1366          * will handle pairing it along with filling the delay slots.
1367          */
1368         if (prev_inst) {
1369                 if (prev_inst->inst->qpu.sig.thrsw)
1370                         return NULL;
1371         }
1372 
1373         bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
1374                                  scoreboard->ldvary_count < c->num_inputs;
1375         bool skipped_insts_for_ldvary_pipelining = false;
1376 retry:
1377         list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
1378                             dag.link) {
1379                 const struct v3d_qpu_instr *inst = &n->inst->qpu;
1380 
1381                 if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
1382                         skipped_insts_for_ldvary_pipelining = true;
1383                         continue;
1384                 }
1385 
1386                 /* Don't choose the branch instruction until it's the last one
1387                  * left.  We'll move it up to fit its delay slots after we
1388                  * choose it.
1389                  */
1390                 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
1391                     !list_is_singular(&scoreboard->dag->heads)) {
1392                         continue;
1393                 }
1394 
1395                 /* We need to have 3 delay slots between a write to unifa and
1396                  * a follow-up ldunifa.
1397                  */
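                /* A minimal sketch of the spacing this enforces (assumed
                 * instruction stream, not generated code):
                 *
                 *   mov  unifa, addr   <- last_unifa_write_tick
                 *   nop                <- delay slot 1
                 *   nop                <- delay slot 2
                 *   nop                <- delay slot 3
                 *   ... ; ldunifa      <- first tick where tick - write > 3
                 */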
1398                 if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
1399                     scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
1400                         continue;
1401 
1402                 /* "An instruction must not read from a location in physical
1403                  *  regfile A or B that was written to by the previous
1404                  *  instruction."
1405                  */
1406                 if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
1407                         continue;
1408 
1409                 if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
1410                         continue;
1411 
1412                 /* "Before doing a TLB access a scoreboard wait must have been
1413                  *  done. This happens either on the first or last thread
1414                  *  switch, depending on a setting (scb_wait_on_first_thrsw) in
1415                  *  the shader state."
1416                  */
1417                 if (pixel_scoreboard_too_soon(c, scoreboard, inst))
1418                         continue;
1419 
1420                 /* ldunif and ldvary both write the same register (r5 for v42
1421                  * and below, rf0 for v71), but ldunif does so a tick sooner.
1422                  * If the ldvary's register wasn't used, then ldunif might
1423                  * otherwise get scheduled so ldunif and ldvary try to update
1424                  * the register in the same tick.
1425                  */
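                /* Sketch of the tick arithmetic below (relative timing only,
                 * per the note above): if an ldvary was picked at tick T, an
                 * ldunif picked at tick T + 1 would land its register write
                 * on the same cycle as the ldvary's, since ldunif writes one
                 * tick sooner relative to its own issue.
                 */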
1426                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
1427                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
1428                         continue;
1429                 }
1430 
1431                 /* If we are in a thrsw delay slot check that this instruction
1432                  * is valid for that.
1433                  */
1434                 if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
1435                     !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
1436                                                               n->inst)) {
1437                         continue;
1438                 }
1439 
1440                 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
1441                         /* Don't try to put a branch in the delay slots of another
1442                          * branch or a unifa write.
1443                          */
1444                         if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
1445                                 continue;
1446                         if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
1447                                 continue;
1448 
1449                         /* No branch with cond != 0,2,3 and msfign != 0 after
1450                          * setmsf.
1451                          */
1452                         if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
1453                             inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
1454                             inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
1455                             inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
1456                             inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
1457                                 continue;
1458                         }
1459                 }
1460 
1461                 /* If we're trying to pair with another instruction, check
1462                  * that they're compatible.
1463                  */
1464                 if (prev_inst) {
1465                         /* Don't pair up a thread switch signal -- we'll
1466                          * handle pairing it when we pick it on its own.
1467                          */
1468                         if (inst->sig.thrsw)
1469                                 continue;
1470 
1471                         if (prev_inst->inst->uniform != -1 &&
1472                             n->inst->uniform != -1)
1473                                 continue;
1474 
1475                        /* Simulator complains if we have two uniforms loaded in
1476                         * the same instruction, which could happen if we
1477                         * have a ldunif or sideband uniform and we pair that
1478                         * with ldunifa.
1479                         */
1480                         if (vir_has_uniform(prev_inst->inst) &&
1481                             (inst->sig.ldunifa || inst->sig.ldunifarf)) {
1482                                 continue;
1483                         }
1484 
1485                         if ((prev_inst->inst->qpu.sig.ldunifa ||
1486                              prev_inst->inst->qpu.sig.ldunifarf) &&
1487                             vir_has_uniform(n->inst)) {
1488                                 continue;
1489                         }
1490 
1491                         /* Don't merge TLB instructions before we have acquired
1492                          * the scoreboard lock.
1493                          */
1494                         if (pixel_scoreboard_too_soon(c, scoreboard, inst))
1495                                 continue;
1496 
1497                         /* When we successfully pair up an ldvary we then try
1498                          * to merge it into the previous instruction if
1499                          * possible to improve pipelining. Don't pick up the
1500                          * ldvary now if the follow-up fixup would place
1501                          * it in the delay slots of a thrsw, which is not
1502                          * allowed and would prevent the fixup from being
1503                          * successful. In V3D 7.x we can allow this to happen
1504                          * as long as it is not the last delay slot.
1505                          */
1506                         if (inst->sig.ldvary) {
1507                                 if (c->devinfo->ver == 42 &&
1508                                     scoreboard->last_thrsw_tick + 2 >=
1509                                     scoreboard->tick - 1) {
1510                                         continue;
1511                                 }
1512                                 if (c->devinfo->ver >= 71 &&
1513                                     scoreboard->last_thrsw_tick + 2 ==
1514                                     scoreboard->tick - 1) {
1515                                         continue;
1516                                 }
1517                         }
1518 
1519                         /* We can emit a new tmu lookup with a previous ldtmu
1520                          * if doing this would free just enough space in the
1521                          * TMU output fifo so we don't overflow, however, this
1522                          * is only safe if the ldtmu cannot stall.
1523                          *
1524                          * A ldtmu can stall if it is not the first following a
1525                          * thread switch and corresponds to the first word of a
1526                          * read request.
1527                          *
1528                          * FIXME: For now we forbid pairing up a new lookup
1529                          * with a previous ldtmu that is not the first after a
1530                          * thrsw if that could overflow the TMU output fifo
1531                          * regardless of whether the ldtmu is reading the first
1532                          * word of a TMU result or not, since we don't track
1533                          * this aspect in the compiler yet.
1534                          */
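                        /* Worked example of the check below (assuming the
                         * 16-word TMU output fifo is split evenly between
                         * threads): with c->threads == 4 each thread gets
                         * 16 / 4 = 4 slots, so a node with ldtmu_count == 2
                         * can only be paired while pending_ldtmu_count <= 2.
                         */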
1535                         if (prev_inst->inst->qpu.sig.ldtmu &&
1536                             !scoreboard->first_ldtmu_after_thrsw &&
1537                             (scoreboard->pending_ldtmu_count +
1538                              n->inst->ldtmu_count > 16 / c->threads)) {
1539                                 continue;
1540                         }
1541 
1542                         struct v3d_qpu_instr merged_inst;
1543                         if (!qpu_merge_inst(c->devinfo, &merged_inst,
1544                                             &prev_inst->inst->qpu, inst)) {
1545                                 continue;
1546                         }
1547                 }
1548 
1549                 int prio = get_instruction_priority(c->devinfo, inst);
1550 
1551                 if (read_stalls(c->devinfo, scoreboard, inst)) {
1552                         /* Don't merge an instruction that stalls */
1553                         if (prev_inst)
1554                                 continue;
1555                         else {
1556                                 /* Any instruction that doesn't stall will have
1557                                  * a higher scheduling priority */
1558                                 prio -= MAX_SCHEDULE_PRIORITY;
1559                                 assert(prio < 0);
1560                         }
1561                 }
1562 
1563                 /* Found a valid instruction.  If nothing better comes along,
1564                  * this one works.
1565                  */
1566                 if (!chosen) {
1567                         chosen = n;
1568                         chosen_prio = prio;
1569                         continue;
1570                 }
1571 
1572                 if (prio > chosen_prio) {
1573                         chosen = n;
1574                         chosen_prio = prio;
1575                 } else if (prio < chosen_prio) {
1576                         continue;
1577                 }
1578 
1579                 if (n->delay > chosen->delay) {
1580                         chosen = n;
1581                         chosen_prio = prio;
1582                 } else if (n->delay < chosen->delay) {
1583                         continue;
1584                 }
1585         }
1586 
1587         /* If we did not find any instruction to schedule but we discarded
1588          * some of them to prioritize ldvary pipelining, try again.
1589          */
1590         if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
1591                 skipped_insts_for_ldvary_pipelining = false;
1592                 ldvary_pipelining = false;
1593                 goto retry;
1594         }
1595 
1596         if (chosen && chosen->inst->qpu.sig.ldvary) {
1597                 scoreboard->ldvary_count++;
1598                 /* If we are pairing an ldvary, flag it so we can fix it up for
1599                  * optimal pipelining of ldvary sequences.
1600                  */
1601                 if (prev_inst)
1602                         scoreboard->fixup_ldvary = true;
1603         }
1604 
1605         return chosen;
1606 }
1607 
1608 static void
1609 update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
1610                                   enum v3d_qpu_waddr waddr,
1611                                   const struct v3d_device_info *devinfo)
1612 {
1613         if (v3d_qpu_magic_waddr_is_sfu(waddr))
1614                 scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
1615         else if (waddr == V3D_QPU_WADDR_UNIFA)
1616                 scoreboard->last_unifa_write_tick = scoreboard->tick;
1617 }
1618 
1619 static void
1620 update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
1621                                       const struct v3d_qpu_instr *inst)
1622 {
1623         if (v3d_qpu_instr_is_sfu(inst)) {
1624                 scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
1625                 scoreboard->last_stallable_sfu_tick = scoreboard->tick;
1626         }
1627 }
1628 
1629 static void
1630 update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
1631                                const struct qinst *inst)
1632 {
1633         /* Track if we have seen any ldtmu after the last thread switch */
1634         if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
1635                 scoreboard->first_ldtmu_after_thrsw = true;
1636 
1637         /* Track the number of pending ldtmu instructions for outstanding
1638          * TMU lookups.
1639          */
1640         scoreboard->pending_ldtmu_count += inst->ldtmu_count;
1641         if (inst->qpu.sig.ldtmu) {
1642                 assert(scoreboard->pending_ldtmu_count > 0);
1643                 scoreboard->pending_ldtmu_count--;
1644                 scoreboard->first_ldtmu_after_thrsw = false;
1645         }
1646 }
1647 
1648 static void
1649 set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
1650                            const struct v3d_qpu_instr *inst,
1651                            const struct v3d_device_info *devinfo)
1652 {
1653         if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
1654             v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
1655             !inst->sig_magic) {
1656                 scoreboard->has_rf0_flops_conflict = true;
1657         }
1658 }
1659 
1660 static void
1661 update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
1662                                 const struct v3d_qpu_instr *inst,
1663                                 const struct v3d_device_info *devinfo)
1664 {
1665         if (devinfo->ver < 71)
1666                 return;
1667 
1668         /* Thread switch restrictions:
1669          *
1670          * At the point of a thread switch or thread end (when the actual
1671          * thread switch or thread end happens, not when the signalling
1672          * instruction is processed):
1673          *
1674          *    - If the most recent write to rf0 was from a ldunif, ldunifa, or
1675          *      ldvary instruction in which another signal also wrote to the
1676          *      register file, and the final instruction of the thread section
1677          *      contained a signal which wrote to the register file, then the
1678          *      value of rf0 is undefined at the start of the new section
1679          *
1680          * Here we use the scoreboard to track if our last rf0 implicit write
1681          * happens at the same time that another signal writes the register
1682          * file (has_rf0_flops_conflict). We will use that information when
1683          * scheduling thrsw instructions to avoid putting anything in their
1684          * last delay slot which has a signal that writes to the register file.
1685          */
1686 
1687         /* Reset tracking if we have an explicit rf0 write or we are starting
1688          * a new thread section.
1689          */
1690         if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
1691             scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
1692                 scoreboard->last_implicit_rf0_write_tick = -10;
1693                 scoreboard->has_rf0_flops_conflict = false;
1694         }
1695 
1696         if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
1697                 scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
1698                         scoreboard->tick + 1 : scoreboard->tick;
1699         }
1700 
1701         set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
1702 }
1703 
1704 static void
1705 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
1706                              const struct qinst *qinst,
1707                              const struct v3d_device_info *devinfo)
1708 {
1709         const struct v3d_qpu_instr *inst = &qinst->qpu;
1710 
1711         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
1712                 return;
1713 
1714         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
1715 
1716         if (inst->alu.add.op != V3D_QPU_A_NOP)  {
1717                 if (inst->alu.add.magic_write) {
1718                         update_scoreboard_for_magic_waddr(scoreboard,
1719                                                           inst->alu.add.waddr,
1720                                                           devinfo);
1721                 } else {
1722                         update_scoreboard_for_sfu_stall_waddr(scoreboard,
1723                                                               inst);
1724                 }
1725 
1726                 if (inst->alu.add.op == V3D_QPU_A_SETMSF)
1727                         scoreboard->last_setmsf_tick = scoreboard->tick;
1728         }
1729 
1730         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
1731                 if (inst->alu.mul.magic_write) {
1732                         update_scoreboard_for_magic_waddr(scoreboard,
1733                                                           inst->alu.mul.waddr,
1734                                                           devinfo);
1735                 }
1736         }
1737 
1738         if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && inst->sig_magic) {
1739                 update_scoreboard_for_magic_waddr(scoreboard,
1740                                                   inst->sig_addr,
1741                                                   devinfo);
1742         }
1743 
1744         if (inst->sig.ldvary)
1745                 scoreboard->last_ldvary_tick = scoreboard->tick;
1746 
1747         update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
1748 
1749         update_scoreboard_tmu_tracking(scoreboard, qinst);
1750 }
1751 
1752 static void
1753 dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
1754 {
1755         list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
1756                 fprintf(stderr, "         t=%4d: ", n->unblocked_time);
1757                 v3d_qpu_dump(devinfo, &n->inst->qpu);
1758                 fprintf(stderr, "\n");
1759 
1760                 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1761                         struct schedule_node *child =
1762                                 (struct schedule_node *)edge->child;
1763                         if (!child)
1764                                 continue;
1765 
1766                         fprintf(stderr, "                 - ");
1767                         v3d_qpu_dump(devinfo, &child->inst->qpu);
1768                         fprintf(stderr, " (%d parents, %c)\n",
1769                                 child->dag.parent_count,
1770                                 edge->data ? 'w' : 'r');
1771                 }
1772         }
1773 }
1774 
1775 static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
1776                                     enum v3d_qpu_waddr waddr,
1777                                     const struct v3d_qpu_instr *after)
1778 {
1779         /* Apply some huge latency between texture fetch requests and getting
1780          * their results back.
1781          *
1782          * FIXME: This is actually pretty bogus.  If we do:
1783          *
1784          * mov tmu0_s, a
1785          * <a bit of math>
1786          * mov tmu0_s, b
1787          * load_tmu0
1788          * <more math>
1789          * load_tmu0
1790          *
1791          * we count that as worse than
1792          *
1793          * mov tmu0_s, a
1794          * mov tmu0_s, b
1795          * <lots of math>
1796          * load_tmu0
1797          * <more math>
1798          * load_tmu0
1799          *
1800          * because we associate the first load_tmu0 with the *second* tmu0_s.
1801          */
1802         if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
1803             v3d_qpu_waits_on_tmu(after)) {
1804                 return 100;
1805         }
1806 
1807         /* Assume that anything depending on us is consuming the SFU result. */
1808         if (v3d_qpu_magic_waddr_is_sfu(waddr))
1809                 return 3;
1810 
1811         return 1;
1812 }
1813 
1814 static uint32_t
1815 instruction_latency(const struct v3d_device_info *devinfo,
1816                     struct schedule_node *before, struct schedule_node *after)
1817 {
1818         const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
1819         const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
1820         uint32_t latency = 1;
1821 
1822         if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
1823             after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
1824                 return latency;
1825 
1826         if (v3d_qpu_instr_is_sfu(before_inst))
1827                 return 2;
1828 
1829         if (before_inst->alu.add.op != V3D_QPU_A_NOP &&
1830             before_inst->alu.add.magic_write) {
1831                 latency = MAX2(latency,
1832                                magic_waddr_latency(devinfo,
1833                                                    before_inst->alu.add.waddr,
1834                                                    after_inst));
1835         }
1836 
1837         if (before_inst->alu.mul.op != V3D_QPU_M_NOP &&
1838             before_inst->alu.mul.magic_write) {
1839                 latency = MAX2(latency,
1840                                magic_waddr_latency(devinfo,
1841                                                    before_inst->alu.mul.waddr,
1842                                                    after_inst));
1843         }
1844 
1845         return latency;
1846 }
1847 
1848 /** Recursive computation of the delay member of a node. */
1849 static void
1850 compute_delay(struct dag_node *node, void *state)
1851 {
1852         struct schedule_node *n = (struct schedule_node *)node;
1853         struct v3d_compile *c = (struct v3d_compile *) state;
1854 
1855         n->delay = 1;
1856 
1857         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1858                 struct schedule_node *child =
1859                         (struct schedule_node *)edge->child;
1860 
1861                 n->delay = MAX2(n->delay, (child->delay +
1862                                            instruction_latency(c->devinfo, n,
1863                                                                child)));
1864         }
1865 }
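
/* A worked example of the recursion above (hypothetical numbers): if a node
 * has children B (delay 5, latency to B of 2) and C (delay 1, latency to C of
 * 100), its delay becomes MAX2(1, MAX2(5 + 2, 1 + 100)) = 101, i.e. the
 * length of the slowest dependency chain from here to the end of the block.
 */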
1866 
1867 /* Removes a DAG head, removing only the WAR edges. (dag_prune_head()
1868  * should be called on it later to finish pruning the other edges).
1869  */
1870 static void
1871 pre_remove_head(struct dag *dag, struct schedule_node *n)
1872 {
1873         list_delinit(&n->dag.link);
1874 
1875         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1876                 if (edge->data)
1877                         dag_remove_edge(dag, edge);
1878         }
1879 }
1880 
1881 static void
1882 mark_instruction_scheduled(const struct v3d_device_info *devinfo,
1883                            struct dag *dag,
1884                            uint32_t time,
1885                            struct schedule_node *node)
1886 {
1887         if (!node)
1888                 return;
1889 
1890         util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
1891                 struct schedule_node *child =
1892                         (struct schedule_node *)edge->child;
1893 
1894                 if (!child)
1895                         continue;
1896 
1897                 uint32_t latency = instruction_latency(devinfo, node, child);
1898 
1899                 child->unblocked_time = MAX2(child->unblocked_time,
1900                                              time + latency);
1901         }
1902         dag_prune_head(dag, &node->dag);
1903 }
1904 
1905 static void
1906 insert_scheduled_instruction(struct v3d_compile *c,
1907                              struct qblock *block,
1908                              struct choose_scoreboard *scoreboard,
1909                              struct qinst *inst)
1910 {
1911         list_addtail(&inst->link, &block->instructions);
1912 
1913         update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
1914         c->qpu_inst_count++;
1915         scoreboard->tick++;
1916 }
1917 
1918 static struct qinst *
1919 vir_nop()
1920 {
1921         struct qreg undef = vir_nop_reg();
1922         struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
1923 
1924         return qinst;
1925 }
1926 
1927 static void
1928 emit_nop(struct v3d_compile *c, struct qblock *block,
1929          struct choose_scoreboard *scoreboard)
1930 {
1931         insert_scheduled_instruction(c, block, scoreboard, vir_nop());
1932 }
1933 
1934 static bool
1935 qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
1936                               const struct qinst *qinst, int slot)
1937 {
1938         const struct v3d_qpu_instr *inst = &qinst->qpu;
1939 
1940         if (slot == 2 && qinst->is_tlb_z_write)
1941                 return false;
1942 
1943         if (slot > 0 && qinst->uniform != ~0)
1944                 return false;
1945 
1946         if (c->devinfo->ver == 42 && v3d_qpu_waits_vpm(inst))
1947                 return false;
1948 
1949         if (inst->sig.ldvary)
1950                 return false;
1951 
1952         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
1953                 /* GFXH-1625: TMUWT not allowed in the final instruction. */
1954                 if (c->devinfo->ver == 42 && slot == 2 &&
1955                     inst->alu.add.op == V3D_QPU_A_TMUWT) {
1956                         return false;
1957                 }
1958 
1959                 if (c->devinfo->ver == 42) {
1960                         /* No writing physical registers at the end. */
1961                         bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
1962                         bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
1963                         if ((!add_is_nop && !inst->alu.add.magic_write) ||
1964                             (!mul_is_nop && !inst->alu.mul.magic_write)) {
1965                                 return false;
1966                         }
1967 
1968                         if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
1969                             !inst->sig_magic) {
1970                                 return false;
1971                         }
1972                 }
1973 
1974                 if (c->devinfo->ver >= 71) {
1975                         /* The thread end instruction must not write to the
1976                          * register file via the add/mul ALUs.
1977                          */
1978                         if (slot == 0 &&
1979                             (!inst->alu.add.magic_write ||
1980                              !inst->alu.mul.magic_write)) {
1981                                 return false;
1982                         }
1983                 }
1984 
1985                 if (c->devinfo->ver == 42) {
1986                         /* RF0-2 might be overwritten during the delay slots by
1987                          * fragment shader setup.
1988                          */
1989                         if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
1990                                 return false;
1991 
1992                         if (inst->raddr_b < 3 &&
1993                             !inst->sig.small_imm_b &&
1994                             v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
1995                                 return false;
1996                         }
1997                 }
1998 
1999                 if (c->devinfo->ver >= 71) {
2000                         /* RF2-3 might be overwritten during the delay slots by
2001                          * fragment shader setup.
2002                          */
2003                         if (v3d71_qpu_reads_raddr(inst, 2) ||
2004                             v3d71_qpu_reads_raddr(inst, 3)) {
2005                                 return false;
2006                         }
2007 
2008                         if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
2009                             v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
2010                                 return false;
2011                         }
2012                 }
2013         }
2014 
2015         return true;
2016 }
2017 
2018 /**
2019  * This is called when trying to merge a thrsw back into the instruction stream
2020  * of instructions that were scheduled *before* the thrsw signal to fill its
2021  * delay slots. Because the actual execution of the thrsw happens after the
2022  * delay slots, it is usually safe to do this, but there are some cases that
2023  * need special care.
2024  */
2025 static bool
2026 qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
2027                                           struct choose_scoreboard *scoreboard,
2028                                           const struct qinst *qinst,
2029                                           uint32_t slot)
2030 {
2031         /* No scheduling SFU when the result would land in the other
2032          * thread.  The simulator complains for safety, though it
2033          * would only occur for dead code in our case.
2034          */
2035         if (slot > 0 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
2036                 return false;
2037 
2038         if (qinst->qpu.sig.ldvary) {
2039                 if (c->devinfo->ver == 42 && slot > 0)
2040                         return false;
2041                 if (c->devinfo->ver >= 71 && slot == 2)
2042                         return false;
2043         }
2044 
2045         /* unifa and the following 3 instructions can't overlap a
2046          * thread switch/end. The docs further clarify that this means
2047          * the cycle at which the actual thread switch/end happens
2048          * and not when the thrsw instruction is processed, which would
2049          * be after the 2 delay slots following the thrsw instruction.
2050          * This means that we can move a thrsw up to the instruction
2051          * right after unifa:
2052          *
2053          * unifa, r5
2054          * thrsw
2055          * delay slot 1
2056          * delay slot 2
2057          * Thread switch happens here, 4 instructions away from unifa
2058          */
2059         if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
2060                 return false;
2061 
2062         /* See comment when we set has_rf0_flops_conflict for details */
2063         if (c->devinfo->ver >= 71 &&
2064             slot == 2 &&
2065             v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
2066             !qinst->qpu.sig_magic) {
2067                 if (scoreboard->has_rf0_flops_conflict)
2068                         return false;
2069                 if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
2070                         return false;
2071         }
2072 
2073         return true;
2074 }
2075 
2076 /**
2077  * This is called for instructions scheduled *after* a thrsw signal that may
2078  * land in the delay slots of the thrsw. Because these instructions were
2079  * scheduled after the thrsw, we need to be careful when placing them into
2080  * the delay slots, since that means that we are moving them ahead of the
2081  * thread switch and we need to ensure that is not a problem.
2082  */
2083 static bool
2084 qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
2085                                          struct choose_scoreboard *scoreboard,
2086                                          const struct qinst *qinst)
2087 {
2088         const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
2089         assert(slot <= 2);
2090 
2091         /* We merge thrsw instructions back into the instruction stream
2092          * manually, so any instructions scheduled after a thrsw should be
2093          * in the actual delay slots and not in the same slot as the thrsw.
2094          */
2095         assert(slot >= 1);
2096 
2097         /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
2098         if (qinst->qpu.sig.thrsw)
2099                 return false;
2100 
2101         /* The restrictions for instructions scheduled before the thrsw
2102          * also apply to instructions scheduled after the thrsw that we want
2103          * to place in its delay slots.
2104          */
2105         if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
2106                 return false;
2107 
2108         /* TLB access is disallowed until scoreboard wait is executed, which
2109          * we do on the last thread switch.
2110          */
2111         if (qpu_inst_is_tlb(&qinst->qpu))
2112                 return false;
2113 
2114         /* Instruction sequence restrictions: Branch is not allowed in delay
2115          * slots of a thrsw.
2116          */
2117         if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
2118                 return false;
2119 
2120         /* Miscellaneous restrictions: At the point of a thrsw we need to have
2121          * at least one outstanding lookup or TSY wait.
2122          *
2123          * So avoid placing TMU instructions scheduled after the thrsw into
2124          * its delay slots or we may be compromising the integrity of our TMU
2125          * sequences. Also, notice that if we moved these instructions into
2126          * the delay slots of a previous thrsw we could overflow our TMU output
2127          * fifo, since we could be effectively pipelining a lookup scheduled
2128          * after the thrsw into the sequence before the thrsw.
2129          */
2130         if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
2131             qinst->qpu.sig.wrtmuc) {
2132                 return false;
2133         }
2134 
2135         /* Don't move instructions that wait on the TMU before the thread switch
2136          * happens since that would make the current thread stall before the
2137          * switch, which is exactly what we want to avoid with the thrsw
2138          * instruction.
2139          */
2140         if (v3d_qpu_waits_on_tmu(&qinst->qpu))
2141                 return false;
2142 
2143         /* A thread switch invalidates all accumulators, so don't place any
2144          * instructions that write accumulators into the delay slots.
2145          */
2146         if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
2147                 return false;
2148 
2149         /* Multop has an implicit write to the rtop register which is a
2150          * specialized accumulator that is only used with this instruction.
2151          */
2152         if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
2153                 return false;
2154 
2155         /* Flags are invalidated across a thread switch, so don't place
2156          * instructions that write flags into delay slots.
2157          */
2158         if (v3d_qpu_writes_flags(&qinst->qpu))
2159                 return false;
2160 
2161         /* TSY sync ops materialize at the point of the next thread switch,
2162          * therefore, if we have a TSY sync right after a thread switch, we
2163          * cannot place it in its delay slots, or we would be moving the sync
2164          * to the thrsw before it instead.
2165          */
2166         if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID)
2167                 return false;
2168 
2169         return true;
2170 }
2171 
2172 static bool
2173 valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
2174                      struct qinst *qinst, int instructions_in_sequence,
2175                      bool is_thrend)
2176 {
2177         for (int slot = 0; slot < instructions_in_sequence; slot++) {
2178                 if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
2179                                                                qinst, slot)) {
2180                         return false;
2181                 }
2182 
2183                 if (is_thrend &&
2184                     !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
2185                         return false;
2186                 }
2187 
2188                 /* Note that the list is circular, so we can only do this up
2189                  * to instructions_in_sequence.
2190                  */
2191                 qinst = (struct qinst *)qinst->link.next;
2192         }
2193 
2194         return true;
2195 }
2196 
2197 /**
2198  * Emits a THRSW signal in the stream, trying to move it up to pair with
2199  * another instruction.
2200  */
2201 static int
2202 emit_thrsw(struct v3d_compile *c,
2203            struct qblock *block,
2204            struct choose_scoreboard *scoreboard,
2205            struct qinst *inst,
2206            bool is_thrend)
2207 {
2208         int time = 0;
2209 
2210         /* There should be nothing in a thrsw inst being scheduled other than
2211          * the signal bits.
2212          */
2213         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
2214         assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
2215         assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
2216 
2217         /* Don't try to emit a thrsw in the delay slots of a previous thrsw
2218          * or branch.
2219          */
2220         while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
2221                 emit_nop(c, block, scoreboard);
2222                 time++;
2223         }
2224         while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
2225                 emit_nop(c, block, scoreboard);
2226                 time++;
2227         }
2228 
2229         /* Find how far back into previous instructions we can put the THRSW. */
2230         int slots_filled = 0;
2231         int invalid_sig_count = 0;
2232         int invalid_seq_count = 0;
2233         bool last_thrsw_after_invalid_ok = false;
2234         struct qinst *merge_inst = NULL;
2235         vir_for_each_inst_rev(prev_inst, block) {
2236                 /* No emitting our thrsw while the previous thrsw hasn't
2237                  * happened yet.
2238                  */
2239                 if (scoreboard->last_thrsw_tick + 3 >
2240                     scoreboard->tick - (slots_filled + 1)) {
2241                         break;
2242                 }
2243 
2244 
2245                 if (!valid_thrsw_sequence(c, scoreboard,
2246                                           prev_inst, slots_filled + 1,
2247                                           is_thrend)) {
2248                         /* Even if the current sequence isn't valid, we may
2249                          * be able to get a valid sequence by trying to move the
2250                          * thrsw earlier, so keep going.
2251                          */
2252                         invalid_seq_count++;
2253                         goto cont_block;
2254                 }
2255 
2256                 struct v3d_qpu_sig sig = prev_inst->qpu.sig;
2257                 sig.thrsw = true;
2258                 uint32_t packed_sig;
2259                 if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) {
2260                         /* If we can't merge the thrsw here because of signal
2261                          * incompatibility, keep going, we might be able to
2262                          * merge it in an earlier instruction.
2263                          */
2264                         invalid_sig_count++;
2265                         goto cont_block;
2266                 }
2267 
2268                 /* For last thrsw we need 2 consecutive slots that are
2269                  * thrsw compatible, so if we have previously jumped over
2270                  * an incompatible signal, flag that we have found the first
2271                  * valid slot here and keep going.
2272                  */
2273                 if (inst->is_last_thrsw && invalid_sig_count > 0 &&
2274                     !last_thrsw_after_invalid_ok) {
2275                         last_thrsw_after_invalid_ok = true;
2276                         invalid_sig_count++;
2277                         goto cont_block;
2278                 }
2279 
2280                 /* We can merge the thrsw in this instruction */
2281                 last_thrsw_after_invalid_ok = false;
2282                 invalid_sig_count = 0;
2283                 invalid_seq_count = 0;
2284                 merge_inst = prev_inst;
2285 
2286 cont_block:
2287                 if (++slots_filled == 3)
2288                         break;
2289         }
2290 
2291         /* If we jumped over a signal incompatibility and did not manage to
2292          * merge the thrsw in the end, we need to adjust slots filled to match
2293          * the last valid merge point.
2294          */
2295         assert((invalid_sig_count == 0 && invalid_seq_count == 0) ||
2296                 slots_filled >= invalid_sig_count + invalid_seq_count);
2297         if (invalid_sig_count > 0)
2298                 slots_filled -= invalid_sig_count;
2299         if (invalid_seq_count > 0)
2300                 slots_filled -= invalid_seq_count;
2301 
2302         bool needs_free = false;
2303         if (merge_inst) {
2304                 merge_inst->qpu.sig.thrsw = true;
2305                 needs_free = true;
2306                 scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
2307         } else {
2308                 scoreboard->last_thrsw_tick = scoreboard->tick;
2309                 insert_scheduled_instruction(c, block, scoreboard, inst);
2310                 time++;
2311                 slots_filled++;
2312                 merge_inst = inst;
2313         }
2314 
2315         scoreboard->first_thrsw_emitted = true;
2316 
2317         /* If we're emitting the last THRSW (other than program end), then
2318          * signal that to the HW by emitting two THRSWs in a row.
2319          */
2320         if (inst->is_last_thrsw) {
2321                 if (slots_filled <= 1) {
2322                         emit_nop(c, block, scoreboard);
2323                         time++;
2324                 }
2325                 struct qinst *second_inst =
2326                         (struct qinst *)merge_inst->link.next;
2327                 second_inst->qpu.sig.thrsw = true;
2328                 scoreboard->last_thrsw_emitted = true;
2329         }
2330 
2331         /* Make sure the thread end executes within the program lifespan */
2332         if (is_thrend) {
2333                 for (int i = 0; i < 3 - slots_filled; i++) {
2334                         emit_nop(c, block, scoreboard);
2335                         time++;
2336                 }
2337         }
2338 
2339         /* If we put our THRSW into another instruction, free up the
2340          * instruction that didn't end up scheduled into the list.
2341          */
2342         if (needs_free)
2343                 free(inst);
2344 
2345         return time;
2346 }
2347 
2348 static bool
2349 qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
2350 {
2351         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
2352                 return false;
2353 
2354         if (inst->qpu.sig.thrsw)
2355                 return false;
2356 
2357         if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
2358                 return false;
2359 
2360         if (vir_has_uniform(inst))
2361                 return false;
2362 
2363         return true;
2364 }
2365 
2366 static void
2367 emit_branch(struct v3d_compile *c,
2368            struct qblock *block,
2369            struct choose_scoreboard *scoreboard,
2370            struct qinst *inst)
2371 {
2372         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
2373 
2374         /* We should not have picked up a branch for the delay slots of a previous
2375          * thrsw, branch or unifa write instruction.
2376          */
2377         int branch_tick = scoreboard->tick;
2378         assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
2379         assert(scoreboard->last_branch_tick + 3 < branch_tick);
2380         assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
2381 
2382         /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
2383          * setmsf.
2384          */
2385         bool is_safe_msf_branch =
2386                 c->devinfo->ver >= 71 ||
2387                 inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
2388                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
2389                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
2390                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0;
2391         assert(scoreboard->last_setmsf_tick != branch_tick - 1 ||
2392                is_safe_msf_branch);
2393 
2394         /* Insert the branch instruction */
2395         insert_scheduled_instruction(c, block, scoreboard, inst);
2396 
2397         /* Now see if we can move the branch instruction back into the
2398          * instruction stream to fill its delay slots
2399          */
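        /* Illustrative sketch (hypothetical block contents): if the block
         * currently ends in
         *
         *   ... ; add ; mul ; xor ; branch
         *
         * and all three preceding instructions are valid delay-slot fillers,
         * the loop below moves the branch back so the block ends in
         *
         *   ... ; branch ; add ; mul ; xor
         *
         * leaving its three delay slots filled without nops.
         */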
2400         int slots_filled = 0;
2401         while (slots_filled < 3 && block->instructions.next != &inst->link) {
2402                 struct qinst *prev_inst = (struct qinst *) inst->link.prev;
2403                 assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);
2404 
2405                 /* Can't move the branch instruction if that would place it
2406                  * in the delay slots of other instructions.
2407                  */
2408                 if (scoreboard->last_branch_tick + 3 >=
2409                     branch_tick - slots_filled - 1) {
2410                         break;
2411                 }
2412 
2413                 if (scoreboard->last_thrsw_tick + 2 >=
2414                     branch_tick - slots_filled - 1) {
2415                         break;
2416                 }
2417 
2418                 if (scoreboard->last_unifa_write_tick + 3 >=
2419                     branch_tick - slots_filled - 1) {
2420                         break;
2421                 }
2422 
2423                 /* Do not move up a branch if it can disrupt an ldvary sequence
2424                  * as that can cause stomping of the r5 register.
2425                  */
2426                 if (scoreboard->last_ldvary_tick + 2 >=
2427                     branch_tick - slots_filled) {
2428                        break;
2429                 }
2430 
2431                 /* Can't move a conditional branch before the instruction
2432                  * that writes the flags for its condition.
2433                  */
2434                 if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
2435                     inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
2436                         break;
2437                 }
2438 
2439                 if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
2440                         break;
2441 
2442                 if (!is_safe_msf_branch) {
2443                         struct qinst *prev_prev_inst =
2444                                 (struct qinst *) prev_inst->link.prev;
2445                         if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
2446                             prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
2447                                 break;
2448                         }
2449                 }
2450 
2451                 list_del(&prev_inst->link);
2452                 list_add(&prev_inst->link, &inst->link);
2453                 slots_filled++;
2454         }
2455 
2456         block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
2457         scoreboard->last_branch_tick = branch_tick - slots_filled;
2458 
2459         /* Fill any remaining delay slots.
2460          *
2461          * For unconditional branches we'll try to fill these with the
2462          * first instructions in the successor block after scheduling
2463          * all blocks when setting up branch targets.
2464          */
2465         for (int i = 0; i < 3 - slots_filled; i++)
2466                 emit_nop(c, block, scoreboard);
2467 }
2468 
2469 static bool
2470 alu_reads_register(const struct v3d_device_info *devinfo,
2471                    struct v3d_qpu_instr *inst,
2472                    bool add, bool magic, uint32_t index)
2473 {
2474         uint32_t num_src;
2475         if (add)
2476                 num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
2477         else
2478                 num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
2479 
2480         if (devinfo->ver == 42) {
2481                 enum v3d_qpu_mux mux_a, mux_b;
2482                 if (add) {
2483                         mux_a = inst->alu.add.a.mux;
2484                         mux_b = inst->alu.add.b.mux;
2485                 } else {
2486                         mux_a = inst->alu.mul.a.mux;
2487                         mux_b = inst->alu.mul.b.mux;
2488                 }
2489 
2490                 for (int i = 0; i < num_src; i++) {
2491                         if (magic) {
2492                                 if (i == 0 && mux_a == index)
2493                                         return true;
2494                                 if (i == 1 && mux_b == index)
2495                                         return true;
2496                         } else {
2497                                 if (i == 0 && mux_a == V3D_QPU_MUX_A &&
2498                                     inst->raddr_a == index) {
2499                                         return true;
2500                                 }
2501                                 if (i == 0 && mux_a == V3D_QPU_MUX_B &&
2502                                     inst->raddr_b == index) {
2503                                         return true;
2504                                 }
2505                                 if (i == 1 && mux_b == V3D_QPU_MUX_A &&
2506                                     inst->raddr_a == index) {
2507                                         return true;
2508                                 }
2509                                 if (i == 1 && mux_b == V3D_QPU_MUX_B &&
2510                                     inst->raddr_b == index) {
2511                                         return true;
2512                                 }
2513                         }
2514                 }
2515 
2516                 return false;
2517         }
2518 
2519         assert(devinfo->ver >= 71);
2520         assert(!magic);
2521 
2522         uint32_t raddr_a, raddr_b;
2523         if (add) {
2524                 raddr_a = inst->alu.add.a.raddr;
2525                 raddr_b = inst->alu.add.b.raddr;
2526         } else {
2527                 raddr_a = inst->alu.mul.a.raddr;
2528                 raddr_b = inst->alu.mul.b.raddr;
2529         }
2530 
2531         for (int i = 0; i < num_src; i++) {
2532                 if (i == 0 && raddr_a == index)
2533                         return true;
2534                 if (i == 1 && raddr_b == index)
2535                         return true;
2536         }
2537 
2538         return false;
2539 }
2540 
2541 /**
2542  * This takes an ldvary signal merged into 'inst' and tries to move it up to
2543  * the previous instruction to get good pipelining of ldvary sequences,
2544  * transforming this:
2545  *
2546  * nop                  ; nop               ; ldvary.r4
2547  * nop                  ; fmul  r0, r4, rf0 ;
2548  * fadd  rf13, r0, r5   ; nop               ; ldvary.r1  <-- inst
2549  *
2550  * into:
2551  *
2552  * nop                  ; nop               ; ldvary.r4
2553  * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
2554  * fadd  rf13, r0, r5   ; nop               ;            <-- inst
2555  *
2556  * If we manage to do this successfully (we return true here), then flagging
2557  * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
2558  * we will be able to pick up to merge into 'inst', leading to code like this:
2559  *
2560  * nop                  ; nop               ; ldvary.r4
2561  * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
2562  * fadd  rf13, r0, r5   ; fmul  r2, r1, rf0 ;            <-- inst
2563  */
2564 static bool
2565 fixup_pipelined_ldvary(struct v3d_compile *c,
2566                        struct choose_scoreboard *scoreboard,
2567                        struct qblock *block,
2568                        struct v3d_qpu_instr *inst)
2569 {
2570         const struct v3d_device_info *devinfo = c->devinfo;
2571 
2572         /* We only call this if we have successfully merged an ldvary into a
2573          * previous instruction.
2574          */
2575         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
2576         assert(inst->sig.ldvary);
2577         uint32_t ldvary_magic = inst->sig_magic;
2578         uint32_t ldvary_index = inst->sig_addr;
2579 
2580         /* The instruction into which we merged the ldvary must not read
2581          * the ldvary destination: if it did, moving the ldvary to the
2582          * previous instruction would overwrite that source before it is read.
2583          */
2584         if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
2585                 return false;
2586         if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
2587                 return false;
2588 
2589         /* The implicit ldvary destination may not be written to by a signal
2590          * in the instruction following ldvary. Since we are planning to move
2591          * ldvary to the previous instruction, this means we need to check if
2592          * the current instruction has any other signal that could create this
2593          * conflict. The only other signal that can write to the implicit
2594          * ldvary destination that is compatible with ldvary in the same
2595          * instruction is ldunif.
2596          */
2597         if (inst->sig.ldunif)
2598                 return false;
2599 
2600         /* The previous instruction can't write to the same destination as the
2601          * ldvary.
2602          */
2603         struct qinst *prev = (struct qinst *) block->instructions.prev;
2604         if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
2605                 return false;
2606 
2607         if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
2608                 if (prev->qpu.alu.add.magic_write == ldvary_magic &&
2609                     prev->qpu.alu.add.waddr == ldvary_index) {
2610                         return false;
2611                 }
2612         }
2613 
2614         if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
2615                 if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
2616                     prev->qpu.alu.mul.waddr == ldvary_index) {
2617                         return false;
2618                 }
2619         }
2620 
2621         /* The previous instruction cannot have a conflicting signal */
2622         if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
2623                 return false;
2624 
2625         uint32_t sig;
2626         struct v3d_qpu_sig new_sig = prev->qpu.sig;
2627         new_sig.ldvary = true;
2628         if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
2629                 return false;
2630 
2631         /* The previous instruction cannot use flags since ldvary uses the
2632          * 'cond' instruction field to store the destination.
2633          */
2634         if (v3d_qpu_writes_flags(&prev->qpu))
2635                 return false;
2636         if (v3d_qpu_reads_flags(&prev->qpu))
2637                 return false;
2638 
2639         /* We can't put an ldvary in the delay slots of a thrsw. We should've
2640          * prevented this when pairing up the ldvary with another instruction
2641          * and flagging it for a fixup. In V3D 7.x this is limited only to the
2642          * second delay slot.
2643          */
2644         assert((devinfo->ver == 42 &&
2645                 scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
2646                (devinfo->ver >= 71 &&
2647                 scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
2648 
2649         /* Move the ldvary to the previous instruction and remove it from the
2650          * current one.
2651          */
2652         prev->qpu.sig.ldvary = true;
2653         prev->qpu.sig_magic = ldvary_magic;
2654         prev->qpu.sig_addr = ldvary_index;
2655         scoreboard->last_ldvary_tick = scoreboard->tick - 1;
2656 
2657         inst->sig.ldvary = false;
2658         inst->sig_magic = false;
2659         inst->sig_addr = 0;
2660 
2661         /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
2662         if (devinfo->ver >= 71) {
2663                 scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
2664                 set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
2665         }
2666 
2667         /* By moving ldvary to the previous instruction we make it update r5
2668          * (rf0 for ver >= 71) in the current one, so nothing else in it
2669          * should write this register.
2670          *
2671          * This should've been prevented by our dependency tracking, which
2672          * would not allow ldvary to be paired up with an instruction that
2673          * writes r5/rf0 (since our dependency tracking doesn't know that the
2674          * ldvary write to r5/rf0 happens in the next instruction).
2675          */
2676         assert(!v3d_qpu_writes_r5(devinfo, inst));
2677         assert(devinfo->ver == 42 ||
2678                (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
2679                 !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
2680 
2681         return true;
2682 }
2683 
2684 static uint32_t
2685 schedule_instructions(struct v3d_compile *c,
2686                       struct choose_scoreboard *scoreboard,
2687                       struct qblock *block,
2688                       enum quniform_contents *orig_uniform_contents,
2689                       uint32_t *orig_uniform_data,
2690                       uint32_t *next_uniform)
2691 {
2692         const struct v3d_device_info *devinfo = c->devinfo;
2693         uint32_t time = 0;
2694 
2695         while (!list_is_empty(&scoreboard->dag->heads)) {
2696                 struct schedule_node *chosen =
2697                         choose_instruction_to_schedule(c, scoreboard, NULL);
2698                 struct schedule_node *merge = NULL;
2699 
2700                 /* If there are no valid instructions to schedule, drop a NOP
2701                  * in.
2702                  */
2703                 struct qinst *qinst = chosen ? chosen->inst : vir_nop();
2704                 struct v3d_qpu_instr *inst = &qinst->qpu;
2705 
2706                 if (debug) {
2707                         fprintf(stderr, "t=%4d: current list:\n",
2708                                 time);
2709                         dump_state(devinfo, scoreboard->dag);
2710                         fprintf(stderr, "t=%4d: chose:   ", time);
2711                         v3d_qpu_dump(devinfo, inst);
2712                         fprintf(stderr, "\n");
2713                 }
2714 
2715                 /* We can't mark_instruction_scheduled() the chosen inst until
2716                  * we're done identifying instructions to merge, so put the
2717                  * merged instructions on a list for a moment.
2718                  */
2719                 struct list_head merged_list;
2720                 list_inithead(&merged_list);
2721 
2722                 /* Schedule this instruction onto the QPU list. Also try to
2723                  * find an instruction to pair with it.
2724                  */
2725                 if (chosen) {
2726                         time = MAX2(chosen->unblocked_time, time);
2727                         pre_remove_head(scoreboard->dag, chosen);
2728 
2729                         while ((merge =
2730                                 choose_instruction_to_schedule(c, scoreboard,
2731                                                                chosen))) {
2732                                 time = MAX2(merge->unblocked_time, time);
2733                                 pre_remove_head(scoreboard->dag, merge);
2734                                 list_addtail(&merge->link, &merged_list);
2735                                 (void)qpu_merge_inst(devinfo, inst,
2736                                                      inst, &merge->inst->qpu);
2737                                 if (merge->inst->uniform != -1) {
2738                                         chosen->inst->uniform =
2739                                                 merge->inst->uniform;
2740                                 }
2741 
2742                                 chosen->inst->ldtmu_count +=
2743                                         merge->inst->ldtmu_count;
2744 
2745                                 if (debug) {
2746                                         fprintf(stderr, "t=%4d: merging: ",
2747                                                 time);
2748                                         v3d_qpu_dump(devinfo, &merge->inst->qpu);
2749                                         fprintf(stderr, "\n");
2750                                         fprintf(stderr, "         result: ");
2751                                         v3d_qpu_dump(devinfo, inst);
2752                                         fprintf(stderr, "\n");
2753                                 }
2754 
2755                                 if (scoreboard->fixup_ldvary) {
2756                                         scoreboard->fixup_ldvary = false;
2757                                         if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
2758                                                 /* Flag the ldvary as scheduled
2759                                                  * now so we can try to merge the
2760                                                  * follow-up instruction in
2761                                                  * the ldvary sequence into the
2762                                                  * current instruction.
2763                                                  */
2764                                                 mark_instruction_scheduled(
2765                                                         devinfo, scoreboard->dag,
2766                                                         time, merge);
2767                                         }
2768                                 }
2769                         }
2770                         if (read_stalls(c->devinfo, scoreboard, inst))
2771                                 c->qpu_inst_stalled_count++;
2772                 }
2773 
2774                 /* Update the uniform index for the rewritten location --
2775                  * branch target updating will still need to change
2776                  * c->uniform_data[] using this index.
2777                  */
2778                 if (qinst->uniform != -1) {
2779                         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
2780                                 block->branch_uniform = *next_uniform;
2781 
2782                         c->uniform_data[*next_uniform] =
2783                                 orig_uniform_data[qinst->uniform];
2784                         c->uniform_contents[*next_uniform] =
2785                                 orig_uniform_contents[qinst->uniform];
2786                         qinst->uniform = *next_uniform;
2787                         (*next_uniform)++;
2788                 }
2789 
2790                 if (debug) {
2791                         fprintf(stderr, "\n");
2792                 }
2793 
2794                 /* Now that we've scheduled a new instruction, some of its
2795                  * children can be promoted to the list of instructions ready to
2796                  * be scheduled.  Update the children's unblocked time for this
2797                  * DAG edge as we do so.
2798                  */
2799                 mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
2800                 list_for_each_entry(struct schedule_node, merge, &merged_list,
2801                                     link) {
2802                         mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);
2803 
2804                         /* The merged VIR instruction doesn't get re-added to the
2805                          * block, so free it now.
2806                          */
2807                         free(merge->inst);
2808                 }
2809 
2810                 if (inst->sig.thrsw) {
2811                         time += emit_thrsw(c, block, scoreboard, qinst, false);
2812                 } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
2813                         emit_branch(c, block, scoreboard, qinst);
2814                 } else {
2815                         insert_scheduled_instruction(c, block,
2816                                                      scoreboard, qinst);
2817                 }
2818         }
2819 
2820         return time;
2821 }
2822 
2823 static uint32_t
2824 qpu_schedule_instructions_block(struct v3d_compile *c,
2825                                 struct choose_scoreboard *scoreboard,
2826                                 struct qblock *block,
2827                                 enum quniform_contents *orig_uniform_contents,
2828                                 uint32_t *orig_uniform_data,
2829                                 uint32_t *next_uniform)
2830 {
2831         void *mem_ctx = ralloc_context(NULL);
2832         scoreboard->dag = dag_create(mem_ctx);
2833         struct list_head setup_list;
2834 
2835         list_inithead(&setup_list);
2836 
2837         /* Wrap each instruction in a scheduler structure. */
2838         while (!list_is_empty(&block->instructions)) {
2839                 struct qinst *qinst = (struct qinst *)block->instructions.next;
2840                 struct schedule_node *n =
2841                         rzalloc(mem_ctx, struct schedule_node);
2842 
2843                 dag_init_node(scoreboard->dag, &n->dag);
2844                 n->inst = qinst;
2845 
2846                 list_del(&qinst->link);
2847                 list_addtail(&n->link, &setup_list);
2848         }
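        /* At this point the block's instruction list is empty; scheduled
         * instructions are re-added to it by schedule_instructions() below,
         * via insert_scheduled_instruction(), emit_thrsw() or emit_branch().
         */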
2849 
2850         calculate_forward_deps(c, scoreboard->dag, &setup_list);
2851         calculate_reverse_deps(c, scoreboard->dag, &setup_list);
2852 
2853         dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);
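        /* compute_delay runs bottom-up so that each node's delay (the length
         * of its slowest dependency chain to the end of the block) is filled
         * in before the scheduling heuristic consults it when picking among
         * ready instructions.
         */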
2854 
2855         uint32_t cycles = schedule_instructions(c, scoreboard, block,
2856                                                 orig_uniform_contents,
2857                                                 orig_uniform_data,
2858                                                 next_uniform);
2859 
2860         ralloc_free(mem_ctx);
2861         scoreboard->dag = NULL;
2862 
2863         return cycles;
2864 }
2865 
2866 static void
2867 qpu_set_branch_targets(struct v3d_compile *c)
2868 {
2869         vir_for_each_block(block, c) {
2870                 /* The end block of the program has no branch. */
2871                 if (!block->successors[0])
2872                         continue;
2873 
2874                 /* If there was no branch instruction, then the successor
2875                  * block must follow immediately after this one.
2876                  */
2877                 if (block->branch_qpu_ip == ~0) {
2878                         assert(block->end_qpu_ip + 1 ==
2879                                block->successors[0]->start_qpu_ip);
2880                         continue;
2881                 }
2882 
2883                 /* Walk back through the delay slots to find the branch
2884                  * instr.
2885                  */
2886                 struct qinst *branch = NULL;
2887                 struct list_head *entry = block->instructions.prev;
2888                 int32_t delay_slot_count = -1;
2889                 struct qinst *delay_slots_start = NULL;
2890                 for (int i = 0; i < 3; i++) {
2891                         entry = entry->prev;
2892                         struct qinst *inst =
2893                                 container_of(entry, struct qinst, link);
2894 
2895                         if (delay_slot_count == -1) {
2896                                 if (!v3d_qpu_is_nop(&inst->qpu))
2897                                         delay_slot_count = i;
2898                                 else
2899                                         delay_slots_start = inst;
2900                         }
2901 
2902                         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
2903                                 branch = inst;
2904                                 break;
2905                         }
2906                 }
2907                 assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
2908                 assert(delay_slot_count >= 0 && delay_slot_count <= 3);
2909                 assert(delay_slot_count == 0 || delay_slots_start != NULL);
2910 
2911                 /* Make sure that the if-we-don't-jump
2912                  * successor was scheduled just after the
2913                  * delay slots.
2914                  */
2915                 assert(!block->successors[1] ||
2916                        block->successors[1]->start_qpu_ip ==
2917                        block->branch_qpu_ip + 4);
2918 
2919                 branch->qpu.branch.offset =
2920                         ((block->successors[0]->start_qpu_ip -
2921                           (block->branch_qpu_ip + 4)) *
2922                          sizeof(uint64_t));
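                /* Illustrative arithmetic (made-up instruction indices): if
                 * the branch sits at instruction 10 and the successor block
                 * starts at instruction 20, the encoded offset is
                 * (20 - (10 + 4)) * 8 = 48 bytes, i.e. relative to the slot
                 * four instructions past the branch.
                 */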
2923 
2924                 /* Set up the relative offset to jump in the
2925                  * uniform stream.
2926                  *
2927                  * Use a temporary here, because
2928                  * uniform_data[inst->uniform] may be shared
2929                  * between multiple instructions.
2930                  */
2931                 assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
2932                 c->uniform_data[branch->uniform] =
2933                         (block->successors[0]->start_uniform -
2934                          (block->branch_uniform + 1)) * 4;
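                /* Illustrative arithmetic (made-up uniform indices): with the
                 * successor's uniforms starting at index 8 and the branch's
                 * uniform at index 5, the stream offset is (8 - (5 + 1)) * 4 =
                 * 8 bytes, skipping past the branch's own uniform slot.
                 */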
2935 
2936                 /* If this is an unconditional branch, try to fill any remaining
2937                  * delay slots with the initial instructions of the successor
2938                  * block.
2939                  *
2940                  * FIXME: we can do the same for conditional branches if we
2941                  * predicate the instructions to match the branch condition.
2942                  */
2943                 if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
2944                         struct list_head *successor_insts =
2945                                 &block->successors[0]->instructions;
2946                         delay_slot_count = MIN2(delay_slot_count,
2947                                                 list_length(successor_insts));
2948                         struct qinst *s_inst =
2949                                 (struct qinst *) successor_insts->next;
2950                         struct qinst *slot = delay_slots_start;
2951                         int slots_filled = 0;
2952                         while (slots_filled < delay_slot_count &&
2953                                qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
2954                                 memcpy(&slot->qpu, &s_inst->qpu,
2955                                        sizeof(slot->qpu));
2956                                 s_inst = (struct qinst *) s_inst->link.next;
2957                                 slot = (struct qinst *) slot->link.next;
2958                                 slots_filled++;
2959                         }
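                        /* The copied successor instructions now execute in
                         * the delay slots, so advance the branch target past
                         * them to avoid running them a second time.
                         */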
2960                         branch->qpu.branch.offset +=
2961                                 slots_filled * sizeof(uint64_t);
2962                 }
2963         }
2964 }
2965 
2966 uint32_t
2967 v3d_qpu_schedule_instructions(struct v3d_compile *c)
2968 {
2969         const struct v3d_device_info *devinfo = c->devinfo;
2970         struct qblock *end_block = list_last_entry(&c->blocks,
2971                                                    struct qblock, link);
2972 
2973         /* We reorder the uniforms as we schedule instructions, so save the
2974          * old data off and replace it.
2975          */
2976         uint32_t *uniform_data = c->uniform_data;
2977         enum quniform_contents *uniform_contents = c->uniform_contents;
2978         c->uniform_contents = ralloc_array(c, enum quniform_contents,
2979                                            c->num_uniforms);
2980         c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
2981         c->uniform_array_size = c->num_uniforms;
2982         uint32_t next_uniform = 0;
2983 
2984         struct choose_scoreboard scoreboard;
2985         memset(&scoreboard, 0, sizeof(scoreboard));
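        /* The "last tick" fields below start at -10, far enough in the past
         * that the hazard-distance checks don't fire spuriously on the first
         * instructions of the program.
         */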
2986         scoreboard.last_ldvary_tick = -10;
2987         scoreboard.last_unifa_write_tick = -10;
2988         scoreboard.last_magic_sfu_write_tick = -10;
2989         scoreboard.last_uniforms_reset_tick = -10;
2990         scoreboard.last_thrsw_tick = -10;
2991         scoreboard.last_branch_tick = -10;
2992         scoreboard.last_setmsf_tick = -10;
2993         scoreboard.last_stallable_sfu_tick = -10;
2994         scoreboard.first_ldtmu_after_thrsw = true;
2995         scoreboard.last_implicit_rf0_write_tick = -10;
2996 
2997         if (debug) {
2998                 fprintf(stderr, "Pre-schedule instructions\n");
2999                 vir_for_each_block(block, c) {
3000                         fprintf(stderr, "BLOCK %d\n", block->index);
3001                         list_for_each_entry(struct qinst, qinst,
3002                                             &block->instructions, link) {
3003                                 v3d_qpu_dump(devinfo, &qinst->qpu);
3004                                 fprintf(stderr, "\n");
3005                         }
3006                 }
3007                 fprintf(stderr, "\n");
3008         }
3009 
3010         uint32_t cycles = 0;
3011         vir_for_each_block(block, c) {
3012                 block->start_qpu_ip = c->qpu_inst_count;
3013                 block->branch_qpu_ip = ~0;
3014                 block->start_uniform = next_uniform;
3015 
3016                 cycles += qpu_schedule_instructions_block(c,
3017                                                           &scoreboard,
3018                                                           block,
3019                                                           uniform_contents,
3020                                                           uniform_data,
3021                                                           &next_uniform);
3022 
3023                 block->end_qpu_ip = c->qpu_inst_count - 1;
3024         }
3025 
3026         /* Emit the program-end THRSW instruction. */
3027         struct qinst *thrsw = vir_nop();
3028         thrsw->qpu.sig.thrsw = true;
3029         emit_thrsw(c, end_block, &scoreboard, thrsw, true);
3030 
3031         qpu_set_branch_targets(c);
3032 
3033         assert(next_uniform == c->num_uniforms);
3034 
3035         return cycles;
3036 }
3037